Example #1
    def check_domain(self, obj, experiment):
        """
        Either get or create the domain assignment for this object.
        """
        from django.db import IntegrityError
        from honest_ab.models import ExperimentDomainAllocation
        from honest_ab.models import ExperimentDomain
        try:
            object_name = '.'.join([str(obj.__class__.__module__), str(obj.__class__.__name__)])
            domain_allocation = ExperimentDomainAllocation.objects.get(
                model_pk=obj.pk,
                model=object_name
            )
        except ExperimentDomainAllocation.DoesNotExist:
            # Deterministically pick a domain by hashing the object's pk.
            # order_by() makes the slice below stable; an unordered queryset
            # has no guaranteed ordering.
            active_domains = ExperimentDomain.objects.filter(active=1).order_by('pk')
            order = murmur.string_hash(str(obj.pk), domain_hash_key) % active_domains.count()
            domain = active_domains[order]
            try:
                ExperimentDomainAllocation.objects.create(
                    experiment_domain=domain,
                    model=object_name,
                    model_pk=obj.pk,
                )
            except IntegrityError:
                # This can occur in high-traffic instances where two threads hit
                # this method at the same time. Both will miss the get and both
                # will try to create, but the hash is deterministic, so the row
                # the other thread created points at the same domain.
                pass
            return domain.pk == experiment.domain_id
        else:
            return domain_allocation.experiment_domain_id == experiment.domain_id
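Because the domain index comes from a keyed hash of the object's pk, both racing threads compute the same domain, which is why the IntegrityError can simply be swallowed. A minimal sketch of that invariant (the seed, pk, and domain count are illustrative):

import murmur

domain_hash_key = 1234   # stand-in for the module-level seed used above
num_domains = 4

def pick_domain(pk):
    return murmur.string_hash(str(pk), domain_hash_key) % num_domains

# Two racing callers for the same object resolve to the same slot, so the
# row already present after a lost race points at the same domain.
assert pick_domain(42) == pick_domain(42)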
Example #2
    def check_fetched(self, bitmap, url):
        # Single-hash bitmap membership test: a set bit means the URL was
        # (probably) seen before; hash collisions can yield false positives.
        mhash = murmur.string_hash(url)
        if self.r.getbit(bitmap, mhash):
            return True
        else:
            self.r.setbit(bitmap, mhash, 1)
            return False
Example #3
def check_fetched(url):
    r = redis.Redis(connection_pool=POOL)
    mhash = murmur.string_hash(url)
    if r.getbit(config.BITMAP, mhash):
        return True
    else:
        r.setbit(config.BITMAP, mhash, 1)
        return False
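Examples 2 and 3 are single-hash membership checks: one murmur hash per URL, one bit per hash, so colliding URLs can be reported as already fetched (a false positive) but never the reverse. A minimal standalone sketch of the same round trip; the pool settings and key name are assumptions, and murmur.string_hash's unsigned 32-bit range fits within Redis's maximum bitmap offset:

import redis
import murmur

# Assumed stand-ins for the POOL and config.BITMAP names used above.
POOL = redis.ConnectionPool(host="localhost", port=6379, db=0)
BITMAP = "crawler:fetched"

r = redis.Redis(connection_pool=POOL)
offset = murmur.string_hash("http://example.com/")

print r.getbit(BITMAP, offset)   # 0 the first time around
r.setbit(BITMAP, offset, 1)
print r.getbit(BITMAP, offset)   # 1 from now on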
Example #4
    def _make_decision(self, obj, experiment):
        """
        Use a simple hash on the string value of the object pk.
        """
        if random.random() * 100 > experiment.percentage_of_traffic:
            result = HONEST_AB_SKIP_TYPE
        else:
            result = str(murmur.string_hash(str(obj.pk), experiment.pk) % experiment.number_of_classes)

        return result
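Note the two sources of randomness above: the traffic gate re-rolls random.random() on every call, while the class bucket is a stable function of the object and experiment pks. A quick sketch of how the keyed hash spreads objects across classes (the pk values are illustrative):

import murmur
from collections import Counter

experiment_pk, number_of_classes = 7, 3
assignments = Counter(
    murmur.string_hash(str(pk), experiment_pk) % number_of_classes
    for pk in range(10000)
)
print assignments   # roughly uniform: about 3333 objects per class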
Example #5
def extract_vector(features, bits):
    vector = {}
    for (fset, vals) in features.items():
        for (f, v) in vals.items():
            try:
                h = murmur.string_hash("%s_%s" % (fset, f.encode("utf-8")))
                # Keep the low `bits` bits, then shift to 1-based indexing.
                h = h & (2 ** bits - 1)
                vector[h + 1] = v
            except UnicodeDecodeError, e:
                print >> sys.stderr, "%s: %s" % (e, (f, v))
                continue
    return vector
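A hypothetical call, assuming extract_vector and its murmur and sys imports are in scope; feature names are hashed into a 2**bits space, and the keys of the result are 1-based bucket indices:

features = {"colors": {"red": 1.0, "blue": 0.5},
            "shapes": {"circle": 2.0}}
vector = extract_vector(features, bits=18)
# vector maps bucket indices in 1..2**18 to the original values, e.g.
# {201560: 1.0, 18251: 0.5, 153425: 2.0} (bucket numbers illustrative).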
Example #7
def minhash(features):
    minhashes = []
    sketches = []

    # One minhash per seed: the smallest hash of any feature under that seed.
    for num_minhash in range(NUM_MINHASHES):
        minhashes.append(
            min([murmur.string_hash(repr(feature), num_minhash) for feature in features])
        )

    # Fold each run of SKETCH_RATIO consecutive minhashes into one sketch by
    # hashing the repr of the slice itself.
    for i in xrange(0, NUM_MINHASHES, SKETCH_RATIO):
        sketch = murmur.string_hash(repr(minhashes[i:i + SKETCH_RATIO]))
        sketches.append(sketch)

    return np.array(minhashes), sketches
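The signature matters because the probability that two feature sets agree on any one minhash equals their Jaccard similarity. A minimal sketch of the resulting estimator (function and argument names are illustrative):

def minhash_similarity(sig_a, sig_b):
    # The fraction of positions where two signatures agree estimates the
    # Jaccard similarity of the underlying feature sets.
    matches = sum(1 for a, b in zip(sig_a, sig_b) if a == b)
    return float(matches) / len(sig_a)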
Example #8
def extract_features(sha, path_to_files_dir,
                     hash_dim=1024, split_regex=r"\s+"):
    # first, read in the file as a big string:
    contents = read_file(sha=sha, dir=path_to_files_dir)
    # next, split the big string into a bunch of different tokens ("words"):
    tokens = re.split(pattern=split_regex, string=contents)
    # now take the modulo of the hash of each token, so that each token is
    # replaced by a bucket (category) in 1:hash_dim.
    token_hash_buckets = [
        (murmur.string_hash(w) % (hash_dim - 1) + 1) for w in tokens
    ]
    # Finally, we'll count how many hits each bucket got, so that our features
    # always have length hash_dim, regardless of the size of the HTML file:
    token_bucket_counts = np.zeros(hash_dim)
    # this returns the frequency counts for each unique value in
    # token_hash_buckets:
    buckets, counts = np.unique(token_hash_buckets, return_counts=True)
    # and now we insert these counts into our token_bucket_counts object:
    for bucket, count in zip(buckets, counts):
        token_bucket_counts[bucket] = count
    return token_bucket_counts
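The same pipeline on an inline string instead of a file on disk, as a self-contained sketch (the HTML snippet is illustrative):

import re
import numpy as np
import murmur

hash_dim = 1024
tokens = re.split(r"\s+", "<html> <body> hello hello </body> </html>")
buckets = [murmur.string_hash(w) % (hash_dim - 1) + 1 for w in tokens]
counts = np.zeros(hash_dim)
for b, c in zip(*np.unique(buckets, return_counts=True)):
    counts[b] = c
assert counts.sum() == len(tokens)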
Example #9
def haiku_nofriends(user):
    """haiku for someone with no friends."""
    out = []
    poem = Poem()
    poem.seed_user = user

    for i in [5, 7, 5]:
        ln = ' '.join(generate_syllable_chain(i, user.corpus))
        out.append(ln)

    idnumber = murmur.string_hash((u''.join(out)).encode('utf-8'))
    poem.lines = out
    poem.poem_id = idnumber

    percussion = [music.screen_to_track(user)]
    melody = [music.words_to_track(w) for w in out]
    composition = music.composificate(percussion, melody)
    filename = "audio/{0}.wav".format(str(idnumber))
    music.mp3ificate(composition, filename)
    
    poem.save()
    return poem
Example #11
def haiku(user):
    """Haiku that hops to a friend of the seed user for each new line."""
    # Can't do users until we're closed under corpus
    out = []
    poem = Poem()
    poem.seed_user = user
    users = [user]
    for i in [5, 7, 5]:
        ln = ' '.join(generate_syllable_chain(i, user.corpus))
        out.append(ln)
        user = get_friend(user)
        users.append(user)

    idnumber = murmur.string_hash((u''.join(out)).encode('utf-8'))
    poem.lines = out
    poem.poem_id = idnumber

    percussion = [music.screen_to_track(u) for u in users]
    melody = [music.words_to_track(w) for w in out]
    composition = music.composificate(percussion, melody)
    filename = "audio/{0}.wav".format(str(idnumber))
    music.mp3ificate(composition, filename)

    poem.save()
    return poem
Example #12
def stanza(user, max_length, lines):
    """Stanza of `lines` generated lines, hopping to a friend after each."""
    out = []
    poem = Poem()
    poem.seed_user = user

    for _ in range(lines):
        ln = ' '.join(
            generate_chain(length=max_length, markov=user.corpus,
                           strict=False))
        out.append(ln)
        user = get_friend(user)

    idnumber = murmur.string_hash((u''.join(out)).encode('utf-8'))
    poem.poem_id = idnumber
    poem.lines = out

    percussion = [music.screen_to_track(user)]
    melody = [music.words_to_track(w) for w in out]
    composition = music.composificate(percussion, melody)
    filename = "audio/{0}.wav".format(str(idnumber))
    music.mp3ificate(composition, filename)

    poem.save()
    return poem
Example #15
        continue

    net_start = dottedQuadToNum(ip)
    net_size = pow(2, 32 - ipfx)
    #print net_start, net_size

    for i in range(net_size):
        ip_num = net_start + i

        if not use_salt:
            b = str(ip_num)
        else:
            b = str(ip_num) + str(salt)

        if hash_function == Hashes.MURMUR:
            h = murmur.string_hash(b)

            if long(h) % mod == 0:
                ip_str = numToDottedQuad(ip_num)
                print ip_str
        elif hash_function == Hashes.SHA1:
            h = SHA.new(b)

            if number.bytes_to_long(h.digest()) % mod == 0:
                ip_str = numToDottedQuad(ip_num)
                print ip_str
        else:
            h = MD5.new(b)

            if number.bytes_to_long(h.digest()) % mod == 0:
                ip_str = numToDottedQuad(ip_num)
                print ip_str
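The divisibility test keeps roughly one address in mod, and because the hash is deterministic the same addresses come out on every run. A minimal sketch of the sampling rule (the mod value and input range are illustrative):

import murmur

mod = 16
# Keep a value only when its hash is divisible by mod: a deterministic
# sample of about 1/mod of the input space.
sampled = [n for n in range(1024) if murmur.string_hash(str(n)) % mod == 0]
print len(sampled)   # close to 1024 / 16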
Example #16
    def murmur2_hash_c(bytes, seed=0x9747b28c):
        """murmur2_hash_c

        Use the murmur c-extension's string_hash routine
        """
        return murmur.string_hash(str(bytes), seed)
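The default seed 0x9747b28c is the one Kafka's murmur2 partitioner uses, so a helper like this is natural for reproducing key-to-partition routing; whether this module's murmur2 matches Kafka's byte-level variant is an assumption here, and the sketch below is hypothetical:

import murmur

def partition_for(key, num_partitions, seed=0x9747b28c):
    # Mask to the low 31 bits (as Kafka does) before bucketing.
    return (murmur.string_hash(str(key), seed) & 0x7fffffff) % num_partitions

print partition_for("user-42", 12)   # stable partition for this key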
Example #18
    'f': 4226522672,
    'i': 3451942824,
    'h': 1069002520,
    'k': 3288208012,
    'j': 3131388162,
    'm': 3020367812,
    'l': 2169669117,
    'o': 1720432690,
    'n': 1785613168,
    'q': 2083633015,
    'p': 834694889,
    's': 389143345,
    'r': 744399309,
    'u': 1479000828,
    't': 2418444476,
    'w': 1340422676,
    'v': 3414904798,
    'y': 3657681515,
    'x': 372604132,
    'z': 2195360465
}

actual_dict = {
    letter: murmur.string_hash(letter)
    for letter in string.ascii_lowercase
}

unmatched_items = set(expected_dict.items()) ^ set(actual_dict.items())

assert len(unmatched_items) == 0
Example #19
    indexDir = "lucene.ukwac"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    queryparser = QueryParser(Version.LUCENE_30, "text", analyzer)
    searcher = IndexSearcher(dir)

    nonzeros = 0

    for i, l in enumerate(sys.stdin):
        if i % 100 == 0:
            print >> sys.stderr, "Read %d lines from sys.stdin (bloom filter has %s nonzeros)..." % (
                i, percent(nonzeros, BLOOM_FILTER_SIZE))
            print >> sys.stderr, stats()
        l = string.strip(l)

        added_this_sentence = 0
        for newl in retrieve(l, searcher, queryparser):
            # Iterate until we have added DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT documents
            if added_this_sentence >= DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT:
                break

            newl = string.strip(newl)

            # Hash the sentence
            idx = murmur.string_hash(newl.encode("utf-8")) % BLOOM_FILTER_SIZE
            # Don't use duplicate sentences
            if usedsentences[idx]: continue

            usedsentences[idx] = True
            nonzeros += 1
            added_this_sentence += 1
            print newl.encode("utf-8")
Example #20
def is_validation_example(e):
    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    examples_per_validation = int(1 / HYPERPARAMETERS["PERCENT_OF_TRAINING_EXAMPLES_FOR_VALIDATION"])
    # Deterministic split: an example is held out for validation iff the hash
    # of its repr falls in bucket zero.
    return murmur.string_hash(repr(e)) % examples_per_validation == 0
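The split needs no stored indices: membership is recomputed from the example's own bytes, so it is stable across runs and machines. A minimal sketch with the hyperparameter inlined (a value of 10 corresponds to a 10% validation fraction):

import murmur

examples_per_validation = 10   # stand-in for the configured value above

def held_out(example):
    # Same rule as is_validation_example, with the hyperparameter inlined.
    return murmur.string_hash(repr(example)) % examples_per_validation == 0

corpus = ["the cat sat", "on the mat", "a dog barked"]
print [held_out(s) for s in corpus]   # identical on every run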