def check_domain(self, obj, experiment):
    """
    Either get or create assignment.
    """
    from honest_ab.models import ExperimentDomainAllocation
    from honest_ab.models import ExperimentDomain

    try:
        object_name = '.'.join([str(obj.__class__.__module__), str(obj.__class__.__name__)])
        domain_allocation = ExperimentDomainAllocation.objects.get(
            model_pk=obj.pk,
            model=object_name
        )
    except ExperimentDomainAllocation.DoesNotExist:
        num_domains = ExperimentDomain.objects.filter(active=1).count()
        order = murmur.string_hash(str(obj.pk), domain_hash_key) % num_domains
        domain = ExperimentDomain.objects.filter(active=1)[order]
        try:
            ExperimentDomainAllocation.objects.create(
                experiment_domain=domain,
                model=object_name,
                model_pk=obj.pk,
            )
        except IntegrityError:
            # This can occur in high-traffic instances where two threads hit this
            # method at the same time. Both will fail the get and both will try to create.
            pass
        return domain.pk == experiment.domain_id
    else:
        return domain_allocation.experiment_domain_id == experiment.domain_id
def check_fetched(self, bitmap, url):
    mhash = murmur.string_hash(url)
    if self.r.getbit(bitmap, mhash):
        return True
    else:
        self.r.setbit(bitmap, mhash, 1)
        return False
def check_fetched(url):
    r = redis.Redis(connection_pool=POOL)
    mhash = murmur.string_hash(url)
    if r.getbit(config.BITMAP, mhash):
        return True
    else:
        r.setbit(config.BITMAP, mhash, 1)
        return False
def _make_decision(self, obj, experiment):
    """
    Use a simple hash on the string value of the object pk.
    """
    if random.random() * 100 > experiment.percentage_of_traffic:
        result = HONEST_AB_SKIP_TYPE
    else:
        result = str(
            murmur.string_hash(str(obj.pk), experiment.pk) % experiment.number_of_classes
        )
    return result
def extract_vector(features, bits):
    vector = {}
    for (fset, vals) in features.items():
        for (f, v) in vals.items():
            try:
                # Hash the "featureset_feature" name and mask it down to 2**bits buckets.
                h = murmur.string_hash("%s_%s" % (fset, f.encode("utf-8")))
                h = h & (2 ** bits - 1)
                vector[h + 1] = v
            except UnicodeDecodeError, e:
                print >> sys.stderr, "%s: %s" % (e, (f, v))
                continue
    return vector
def minhash(features):
    minhashes = []
    sketches = []
    # One minhash per seed: the smallest hash value seen over all features.
    for num_minhash in range(NUM_MINHASHES):
        minhashes.append(
            min([murmur.string_hash(`feature`, num_minhash) for feature in features])
        )
    # Group every SKETCH_RATIO consecutive minhashes and hash them into a single sketch value.
    for i in xrange(0, NUM_MINHASHES, SKETCH_RATIO):
        sketch = murmur.string_hash(`minhashes[i:i + SKETCH_RATIO]`)
        sketches.append(sketch)
    return np.array(minhashes), sketches
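A minimal usage sketch for the function above, with made-up feature lists; it assumes the surrounding NUM_MINHASHES/SKETCH_RATIO constants and the murmur module are in scope. The fraction of positions where two minhash vectors agree estimates the Jaccard similarity of the underlying feature sets.

import numpy as np

features_a = ["CreateFileA", "ReadFile", "WriteFile"]
features_b = ["CreateFileA", "ReadFile", "CloseHandle"]

minhashes_a, sketches_a = minhash(features_a)
minhashes_b, sketches_b = minhash(features_b)

# Proportion of matching minhash positions approximates Jaccard similarity.
estimated_jaccard = float((minhashes_a == minhashes_b).sum()) / len(minhashes_a)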
def extract_features(sha, path_to_files_dir, hash_dim=1024, split_regex=r"\s+"):
    # first, read in the file as a big string:
    file = read_file(sha=sha, dir=path_to_files_dir)
    # next, split the big string into a bunch of different tokens ("words"):
    tokens = re.split(pattern=split_regex, string=file)
    # now take the modulo of the hash of each token, so that each token is replaced
    # by a bucket (category) from 1:hash_dim.
    token_hash_buckets = [
        (murmur.string_hash(w) % (hash_dim - 1) + 1) for w in tokens
    ]
    # Finally, we'll count how many hits each bucket got, so that our features
    # always have length hash_dim, regardless of the size of the HTML file:
    token_bucket_counts = np.zeros(hash_dim)
    # this returns the frequency counts for each unique value in
    # token_hash_buckets:
    buckets, counts = np.unique(token_hash_buckets, return_counts=True)
    # and now we insert these counts into our token_bucket_counts object:
    for bucket, count in zip(buckets, counts):
        token_bucket_counts[bucket] = count
    return np.array(token_bucket_counts)
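A self-contained sketch of the same hashing trick on an in-memory string (the helper name is made up here, and no read_file is required): each token is hashed into one of a fixed number of buckets and the buckets are counted, so the feature vector has the same length no matter how large the input is.

import re
import numpy as np
import murmur

def hashed_token_counts(text, hash_dim=1024):
    # Map each whitespace-separated token into a bucket in 1..hash_dim-1 and count hits.
    counts = np.zeros(hash_dim)
    for token in re.split(r"\s+", text):
        counts[murmur.string_hash(token) % (hash_dim - 1) + 1] += 1
    return counts

# e.g. hashed_token_counts("<html><body>hello world hello</body></html>")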
def haiku_nofriends(user):
    """haiku for someone with no friends."""
    out = []
    poem = Poem()
    poem.seed_user = user
    for i in [5, 7, 5]:
        ln = ' '.join(generate_syllable_chain(i, user.corpus))
        out.append(ln)
    idnumber = murmur.string_hash((u''.join(out)).encode('utf-8'))
    poem.lines = out
    poem.poem_id = idnumber
    percussion = [music.screen_to_track(user)]
    melody = [music.words_to_track(w) for w in out]
    composition = music.composificate(percussion, melody)
    filename = "audio/{0}.wav".format(str(idnumber))
    music.mp3ificate(composition, filename)
    poem.save()
    return poem
def haiku(user):
    # Can't do users until we're closed under corpus
    out = []
    poem = Poem()
    poem.seed_user = user
    users = [user]
    for i in [5, 7, 5]:
        ln = ' '.join(generate_syllable_chain(i, user.corpus))
        out.append(ln)
        user = get_friend(user)
        users.append(user)
    idnumber = murmur.string_hash((u''.join(out)).encode('utf-8'))
    poem.lines = out
    poem.poem_id = idnumber
    percussion = [music.screen_to_track(u) for u in users]
    melody = [music.words_to_track(w) for w in out]
    composition = music.composificate(percussion, melody)
    filename = "audio/{0}.wav".format(str(idnumber))
    music.mp3ificate(composition, filename)
    poem.save()
    return poem
def stanza(user, max_length, lines):
    out = []
    poem = Poem()
    poem.seed_user = user
    for _ in range(lines):
        ln = ' '.join(generate_chain(length=max_length, markov=user.corpus, strict=False))
        out.append(ln)
        user = get_friend(user)
    idnumber = murmur.string_hash((u''.join(out)).encode('utf-8'))
    poem.poem_id = idnumber
    poem.lines = out
    percussion = [music.screen_to_track(user)]
    melody = [music.words_to_track(w) for w in out]
    composition = music.composificate(percussion, melody)
    filename = "audio/{0}.wav".format(str(idnumber))
    music.mp3ificate(composition, filename)
    poem.save()
    return poem
continue

net_start = dottedQuadToNum(ip)
net_size = pow(2, 32 - ipfx)
#print net_start, net_size
for i in range(net_size):
    ip_num = net_start + i
    if not use_salt:
        b = str(ip_num)
    else:
        b = str(ip_num) + str(salt)
    if hash_function == Hashes.MURMUR:
        h = murmur.string_hash(b)
        if long(h) % mod == 0:
            ip_str = numToDottedQuad(ip_num)
            print ip_str
    elif hash_function == Hashes.SHA1:
        h = SHA.new(b)
        if number.bytes_to_long(h.digest()) % mod == 0:
            ip_str = numToDottedQuad(ip_num)
            print ip_str
    else:
        h = MD5.new(b)
        if number.bytes_to_long(h.digest()) % mod == 0:
            ip_str = numToDottedQuad(ip_num)
def murmur2_hash_c(bytes, seed=0x9747b28c):
    """murmur2_hash_c

    Use the murmur c-extension's string_hash routine
    """
    return murmur.string_hash(str(bytes), seed)
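A small usage sketch with a made-up key and bucket count: because the hash is deterministic for a given input and seed, it can be used, for example, to route a key to a stable bucket.

key = "user-42"
assert murmur2_hash_c(key) == murmur2_hash_c(key)  # same input and seed -> same hash
bucket = murmur2_hash_c(key) % 8  # e.g. map the key to one of 8 buckets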
    'f': 4226522672,
    'i': 3451942824,
    'h': 1069002520,
    'k': 3288208012,
    'j': 3131388162,
    'm': 3020367812,
    'l': 2169669117,
    'o': 1720432690,
    'n': 1785613168,
    'q': 2083633015,
    'p': 834694889,
    's': 389143345,
    'r': 744399309,
    'u': 1479000828,
    't': 2418444476,
    'w': 1340422676,
    'v': 3414904798,
    'y': 3657681515,
    'x': 372604132,
    'z': 2195360465
}
actual_dict = {
    letter: murmur.string_hash(letter) for letter in string.ascii_lowercase
}
unmatched_items = set(expected_dict.items()) ^ set(actual_dict.items())
assert len(unmatched_items) == 0
def is_validation_example(e):
    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    examples_per_validation = int(1 / HYPERPARAMETERS["PERCENT_OF_TRAINING_EXAMPLES_FOR_VALIDATION"])
    return murmur.string_hash(`e`) % examples_per_validation == 0
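A minimal sketch of the same idea with made-up names, assuming a murmur module like the one used above: hashing each example's repr and taking it modulo N deterministically routes roughly 1/N of the examples to validation, without storing an explicit split.

import murmur

def split_examples(examples, every_nth=10):
    train, validation = [], []
    for e in examples:
        if murmur.string_hash(repr(e)) % every_nth == 0:
            validation.append(e)
        else:
            train.append(e)
    return train, validation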
indexDir = "lucene.ukwac"
dir = SimpleFSDirectory(File(indexDir))
analyzer = StandardAnalyzer(Version.LUCENE_30)
queryparser = QueryParser(Version.LUCENE_30, "text", analyzer)
searcher = IndexSearcher(dir)

nonzeros = 0
for i, l in enumerate(sys.stdin):
    if i % 100 == 0:
        print >> sys.stderr, "Read %d lines from sys.stdin (bloom filter has %s nonzeros)..." % (i, percent(nonzeros, BLOOM_FILTER_SIZE))
        print >> sys.stderr, stats()
    l = string.strip(l)
    added_this_sentence = 0
    for newl in retrieve(l, searcher, queryparser):
        # Iterate until we have added DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT documents
        if added_this_sentence >= DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT:
            break
        newl = string.strip(newl)
        # Hash the sentence
        idx = murmur.string_hash(newl.encode("utf-8")) % BLOOM_FILTER_SIZE
        # Don't use duplicate sentences
        if usedsentences[idx]:
            continue
        usedsentences[idx] = True
        nonzeros += 1
        added_this_sentence += 1
        print newl.encode("utf-8")