예제 #1
0
    def __init__(self, data_path, epsilon=1, seed=42):
        """
        Initialize the object dataset.

        The dataset holds state vectors for each object. Samples are
        yielded in batches of 1 as (object vectors, index) tuples, where
        index identifies the configuration used to compare two sets of
        objects (equal indices mean equal configurations). The
        configurations are stored as a list of (list of arrays, int)
        tuples, as returned by ut.from_file().

        Arguments:
            - data_path : path to the data file.
            - epsilon (float between 0 and 1) : proportion, for one
              configuration, of similar configurations in the dataset.
              This leads to an epsilon**2-to-one imbalance in the
              comparison dataset for the positive ('same') class; to
              compensate, negative examples are dropped with probability
              1 - epsilon**2.
            - seed (int) : value passed to np.random.seed so the
              undersampling is reproducible.
        """
        self.epsilon = epsilon
        self._seed = seed
        # Hard-coded for now: every configuration involves three objects.
        self._nb_objects = 3
        self._configs = ut.from_file(data_path)
        np.random.seed(self._seed)
예제 #2
0
 def word_use_by_year(self, word):
     data = utils.from_file('zipf_by_year.json')
     for x in data:
         dat = data[x]
         if dat['top_words'].count(word):
             index = str(dat['top_words'].index(word) + 1)
         else:
             index = 'No presente'
         print dat['name'] + '	' + index
예제 #3
0
    def new(self, input_file, number_of_blocks=None, block_size=None):
        """
        Load a bit string from *input_file*, run every configured test on
        it, and collect the resulting p-values per test name.

        Arguments:
            - input_file : path handed to utils.from_file().
            - number_of_blocks, block_size : forwarded unchanged to
              Tester.run_all_tests(); their semantics depend on the tests.
        """
        self.string_filename = input_file
        self.string = utils.from_file(self.string_filename)
        # NOTE(review): the original passed the bare name `testnames`,
        # which the loop below reads as self.testnames; unless a module
        # global of that name exists, the bare name is a NameError.
        # Passing the attribute keeps both uses consistent — TODO confirm.
        self.tester = Tester(self.testnames)
        self.tester.run_all_tests(self.string, number_of_blocks, block_size)

        for test in self.testnames:
            self.p_values[test] = [r.p for r in self.tester.results[test]]
예제 #4
0
	def word_use_by_year(self, word):
		data =utils.from_file('zipf_by_year.json')
		for x in data:
			dat = data[x]
			if dat['top_words'].count(word):
				index = str(dat['top_words'].index(word)+1)
			else:
				index = 'No presente'
			print dat['name']+'	'+index
			 	
		
		
			
예제 #5
0
File: p042.py  Project: willrogers/euler
"""
Find how many of the words in words.txt are triangle words.
All the words are upper case.
"""

from utils import from_file, is_triangle


values = {}
for i in range(26):
    # ascii values of A - Z 
    values[chr(i + 65)] = i + 1


content = from_file("data/words.txt")
content = content.strip()

words = [ word.strip("\"") for word in content.split(",")]

tris = 0
for word in words:
    score = 0
    for letter in word:
        score += values[letter]
    if is_triangle(score):
        tris += 1

print tris


예제 #6
0
### Test script to show off some of the functionality of codebase ###
from time import time
from utils import to_dic, from_file
from cdd_domains import domain_manager

if __name__ == '__main__':
    start = time()
    domains = from_file(domain_manager, 'FullSeaUrchin.txt'
                        )  #.15 seconds for this, about 6 seconds for full file
    domains = to_dic(domains)
    print 'It took str(time()-start) seconds to read in str(len(domains)) sea urchin domains and assign unique\
          dictionary keys'

    ### add some domains from scratch ###
    domains['fake entry'] = domain_manager.dict_make(Query='Fake Query')
    print domains['fake entry']
#    strictnamedtuple('hi')
#    d0=DomainCDD()
#    d1=DomainCDD(Query='steve', Start=20.0)
#    d2=DomainCDD(Query='bo', Start=8.0)
#    d3=DomainCDD(Query='gay', Start=50.0)
#    print dir(d1), d1
#    ds=(d1,d2,d3)
#    dic=ManagerCDD()
#    for d in ds:
#        dic[d.get_uniquekey()]=d
#    domains=from_file('TestSet.txt')
#    print len(domains)
#    dfile=to_dic(domains)
#    fd=formatted_domains(dfile)
#    a=network_diagram(fd, 'cl09099')
예제 #7
0
### Test script to show off some of the functionality of codebase ###
from time import time
from utils import to_dic, from_file
from cdd_domains import domain_manager

if __name__ == '__main__':	
    start=time()
    domains=from_file(domain_manager, 'FullSeaUrchin.txt')  #.15 seconds for this, about 6 seconds for full file
    domains=to_dic(domains)
    print 'It took str(time()-start) seconds to read in str(len(domains)) sea urchin domains and assign unique\
          dictionary keys'
    ### add some domains from scratch ###
    domains['fake entry']=domain_manager.dict_make(Query='Fake Query')
    print domains['fake entry']
#    strictnamedtuple('hi')
#    d0=DomainCDD()
#    d1=DomainCDD(Query='steve', Start=20.0)
#    d2=DomainCDD(Query='bo', Start=8.0)
#    d3=DomainCDD(Query='gay', Start=50.0)
#    print dir(d1), d1
#    ds=(d1,d2,d3)
#    dic=ManagerCDD()
#    for d in ds:
#        dic[d.get_uniquekey()]=d
#    domains=from_file('TestSet.txt')
#    print len(domains)
#    dfile=to_dic(domains)
#    fd=formatted_domains(dfile)
#    a=network_diagram(fd, 'cl09099')
#    network_outfile(a, 'junknetwork', summary=True, adjacency=True)
#    Pixel = namedtuple('DomainCDD', DomainCDD._fields + 'FagField')
예제 #8
0
        self.d2 = self.psi0 - 2*self.psi1 + self.psi2

        self.p1 = sp.gammaincc(2**(self.m-2), self.d1/2)
        self.p2 = sp.gammaincc(2**(self.m-3), self.d2/2)
        self.p = self.p1
        self.success = (self.p >= 0.01)
        self.test_run = True


    def psi_sq_mv1(self, m, padded_bits):
        """
        Compute the psi-squared statistic for block length *m*.

        Counts how often each of the 2**m possible m-bit patterns occurs
        among the self.n overlapping windows of *padded_bits*, then folds
        the squared counts into the statistic
        sum(c**2) * 2**m / n - n.
        """
        num_patterns = 2 ** m
        counts = [0] * num_patterns
        for start in range(self.n):
            window = padded_bits[start:start + m]
            # Interpret the m-bit window as a base-2 integer index.
            pattern = int(''.join(str(bit) for bit in window), 2)
            counts[pattern] += 1

        squared_total = sum(c * c for c in counts)
        return squared_total * num_patterns / self.n - self.n

if __name__ == '__main__':
    import sys
    sys.path.append('../')
    import utils

    # '../numbers/data.e' — presumably a bitstream derived from the
    # digits of e; only the first one million bits are used. TODO confirm.
    data_path = '../numbers/data.e'
    bits = utils.from_file(data_path)[:1000000]
    # bits = utils.from_string('0011011101')
    serial_test = Serial(bits)
    print(serial_test.p2) # p-value = 0.56195
예제 #9
0
if __name__ == '__main__':	
    strictnamedtuple('hi')
    d0=DomainCDD()
    d1=DomainCDD(Query='steve', Start=20.0)
    d2=DomainCDD(Query='bo', Start=8.0)
    d3=DomainCDD(Query='gay', Start=50.0)
    print dir(d1), d1
    ds=(d1,d2,d3)
    dic=ManagerCDD()
    for d in ds:
        dic[d.get_uniquekey()]=d
#    print dic

  #  print sortbyarg(dic, 'Query', 'Start')
#    print get_subset(dic, 'Query', 'Start', newkey='Start')
    domains=from_file('TestSet.txt')
    print len(domains)
    dfile=to_dic(domains)
    fd=formatted_domains(dfile)
    a=network_diagram(fd, 'cl09099')
    network_outfile(a, 'junknetwork', summary=True, adjacency=True)
    Pixel = namedtuple('DomainCDD', DomainCDD._fields + 'FagField')
    p=Pixel()
    print p    
    
    
##################
######SCRAP#######
##################

예제 #10
0
File: p022.py  Project: willrogers/euler
"""
For each item in the list of names, get an alphabetical score.  Then multiply
it by its position in the list to get an overall score.
"""

from utils import from_file

content  = from_file("data/p022_names.txt")

values = {}
for i in range(26):
    # ascii values of A - Z 
    values[chr(i + 65)] = i + 1

content = content.strip()
names = [ word.strip("\"") for word in content.split(",") ]

names = sorted(names)

total = 0

for i in range(len(names)):
    score = sum( [values[x] for x in names[i] ] )
    #score = reduce( lambda x, y: values[x] + values[y], names[i] ) 
    total += score * (i + 1)

print total