def lemmatizeText(text):
    processedWords = {}
    wordsHashMap = {}
    words = {}
    word = ""
    for c in text:
        if ('a' <= c <= 'z') or ('A' <= c <= 'Z') or (c == '\'') or (c == '-') or (c == '_'):
            word += c
        else:
            if word:
                word = word.lower()
                if word not in processedWords:
                    if word in exceptions:
                        if word in words:
                            words[word] += 1
                        else:
                            words[word] = 1
                        wordsHashMap[mmh3.hash64(word)[0]] = [word, words[word]]
                    else:
                        if len(word) > 3 and word not in stopWords:
                            tag = nltk.pos_tag([word])  # !!! WARNING : takes A LOT OF TIME !!!
                            if tag[0][1] in tags:
                                if word in words:
                                    words[word] += 1
                                else:
                                    words[word] = 1
                                wordsHashMap[mmh3.hash64(word)[0]] = [word, words[word]]
                                processedWords[word] = word
                else:
                    if word in words:
                        words[word] += 1
                        wordsHashMap[mmh3.hash64(word)[0]] = [word, words[word]]
                word = ""
    return wordsHashMap
def fit(self, filepath):
    print '* Language model fitting'
    generator = create_generator(filepath, skip_spam=True, return_set=False, STOP_CONDITION=6300)
    for doc, _ in generator:
        n_gram_cnt = len(doc) - self.N + 1
        for num in xrange(n_gram_cnt):
            n_gram_3 = mmh3.hash64(''.join(doc[num:num + self.N - 1]).encode('utf-8'))
            n_gram_4 = mmh3.hash64(''.join(doc[num:num + self.N]).encode('utf-8'))
            self.struct_3[n_gram_3] += 1
            self.struct_4[n_gram_4] += 1
        if n_gram_cnt > 0:
            n_gram_3 = mmh3.hash64(''.join(doc[1 - self.N:]).encode('utf-8'))
            self.struct_3[n_gram_3] += 1
    summary_cnt_3 = float(sum(self.struct_3.values()))
    summary_cnt_4 = float(sum(self.struct_4.values()))
    for key, _ in self.struct_3.iteritems():
        self.struct_3[key] /= summary_cnt_3
    for key, _ in self.struct_4.iteritems():
        self.struct_4[key] /= summary_cnt_4
    self.eps = min(self.struct_3.values() + self.struct_4.values()) * 5e-1
    print '* Language model successfully fitted'
def __init__(self): # Define Supported hashes hashes = dict() hashes['md2'] = lambda x: self._get_md2_hash(x) hashes['md4'] = lambda x: self._get_hashlib_hash('md4', x) hashes['md5'] = lambda x: hashlib.md5(x).hexdigest() hashes['sha'] = lambda x: self._get_hashlib_hash('sha', x) hashes['sha1'] = lambda x: hashlib.sha1(x).hexdigest() hashes['sha256'] = lambda x: hashlib.sha256(x).hexdigest() hashes['sha224'] = lambda x: hashlib.sha224(x).hexdigest() hashes['sha384'] = lambda x: hashlib.sha384(x).hexdigest() hashes['sha512'] = lambda x: hashlib.sha512(x).hexdigest() hashes['sha3_224'] = lambda x: sha3.sha3_224(x).hexdigest() hashes['sha3_256'] = lambda x: sha3.sha3_256(x).hexdigest() hashes['sha3_384'] = lambda x: sha3.sha3_384(x).hexdigest() hashes['sha3_512'] = lambda x: sha3.sha3_512(x).hexdigest() hashes['mmh2'] = lambda x: str(mmhash.get_hash(x)) hashes['mmh2_unsigned'] = lambda x: str(mmhash.get_unsigned_hash(x)) hashes['mmh3_32'] = lambda x: str(mmh3.hash(x)) hashes['mmh3_64_1'] = lambda x: str(mmh3.hash64(x)[0]) hashes['mmh3_64_2'] = lambda x: str(mmh3.hash64(x)[1]) hashes['mmh3_128'] = lambda x: str(mmh3.hash128(x)) hashes['ripemd160'] = lambda x: self._get_hashlib_hash('ripemd160', x) hashes['whirlpool'] = lambda x: self._get_hashlib_hash('whirlpool', x) hashes['blake2b'] = lambda x: pyblake2.blake2b(x).hexdigest() hashes['blake2s'] = lambda x: pyblake2.blake2s(x).hexdigest() hashes['crc32'] = lambda x: str(zlib.crc32(x)) hashes['adler32'] = lambda x: str(zlib.adler32(x)) self._hashes = hashes self.hashes_and_checksums = self._hashes.keys() self.supported_hashes = HASHES
def convert(cont_df):
    task_states = []
    prev_end_time = cont_df.loc[0, "start_time"] * 1000
    container_id = mmh3.hash64(cont_df.loc[0, "container_id"])[1]
    app_id = mmh3.hash64(cont_df.loc[0, "app_du"])[1]
    sorted_df = cont_df.sort_values("time_stamp")
    for index, row in sorted_df.iterrows():
        this_end_time = row["time_stamp"] * 1000
        this_task_state = TaskState(ts_start=prev_end_time,
                                    ts_end=this_end_time,
                                    workflow_id=app_id,
                                    task_id=container_id,
                                    resource_id=machine_id,
                                    cpu_rate=row["cpu_util_percent"],
                                    canonical_memory_usage=row["mem_util_percent"],
                                    maximum_disk_bandwidth=row["disk_io_percent"],
                                    network_in=row["net_in"],
                                    network_out=row["net_out"])
        prev_end_time = this_end_time
        parquet_dict = this_task_state.get_parquet_dict()
        task_states.append(parquet_dict)
        # Sanity check: assumes all parquet values are numeric.
        if None in parquet_dict.values() or np.isnan(list(parquet_dict.values())).any():
            print(parquet_dict)
            raise ArithmeticError(parquet_dict)
    return pd.DataFrame(task_states)
def assignObjectWallTime(self):
    """ Tries to add an object and returns the wall time. """
    t = time.time()
    inputStr = str(np.random.random())
    num = mmh3.hash64(inputStr)
    num1 = num[0] & 4294967295
    num2 = num[0] >> 32
    num3 = num[1] & 4294967295
    num4 = num[1] >> 32
    num1 = num1 >> 12
    num2 = num2 >> 12
    num3 = num3 >> 12
    num4 = num4 >> 12
    counter = 0
    while ((self.serversArray[num1] == -1 or self.fullFlag[self.serversArray[num1]]) and
           (self.serversArray[num2] == -1 or self.fullFlag[self.serversArray[num2]]) and
           (self.serversArray[num3] == -1 or self.fullFlag[self.serversArray[num3]]) and
           (self.serversArray[num4] == -1 or self.fullFlag[self.serversArray[num4]])):
        counter += 1
        num = mmh3.hash64(inputStr + hex(counter))
        num1 = num[0] & 1048575  # Always positive
        num2 = num[0] >> 44
        num3 = num[1] & 1048575  # Always positive
        num4 = num[1] >> 44
    return time.time() - t
def debug_execution(config, execs, qemu_verbose=False, notifiers=True):
    log_info("Starting...")
    zero_hash = mmh3.hash64(("\xFF" * config.config_values['BITMAP_SHM_SIZE']))
    q = qemu(1337, config, debug_mode=True, notifiers=notifiers)
    q.start(verbose=qemu_verbose)
    q.set_payload(open(config.argument_values["payload"][0]).read())
    start = time.time()
    for i in range(execs):
        print("+----------------------------------------------+")
        current_hash = mmh3.hash64(q.send_payload())
        if zero_hash == current_hash:
            print("Hash: " + str(current_hash) + common.color.WARNING +
                  " (WARNING: Zero hash found!)" + common.color.ENDC)
        else:
            print("Hash: " + str(current_hash))
    end = time.time()
    print("Performance: " + str(execs / (end - start)) + "t/s")
    q.__del__()
    try:
        for i in range(512):
            if os.path.exists("/tmp/kAFL_printf.txt." + str(i)):
                os.remove("/tmp/kAFL_printf.txt." + str(i))
            else:
                break
    except:
        pass
    os.system("stty sane")
    return 0
def assign_workflow_ids(v):
    arr = []
    for i in v.keys():
        if v[i]:
            arr.append(mmh3.hash64(v[i], signed=True)[0])
        else:
            # Assign a UUID, collision chance is negligible.
            arr.append(mmh3.hash64(uuid4().bytes, signed=True)[0])
    return pd.Series(arr)
def bloom_check(self, buffer):
    if self.ready == 0:
        print("Bloom filter was not created properly\n")
    else:
        hits = 0
        a = mmh3.hash64(buffer)[0]
        b = mmh3.hash64(buffer)[1]
        for i in range(self.hashes):
            x = (a + i * b) % (self.bits)
            if self.bf[x] == 1:
                hits += 1
        if hits == self.hashes:
            return 1  # might be there
        else:
            return 0  # not there for sure
def _multi_hash(self, document):
    """ Generates a text's minhash signature using the multi-hash method.

    Uses i random hashes for j permutations, selecting the minimum hash value
    each time to build each text's hash signature.

    Slower but more stable than the k smallest hash method.

    Args:
        document (list): List of document shingles.

    Returns:
        list: List of text signatures generated using the multi-hash method.
    """
    signature = []
    for seed in np.nditer(self._hash_seeds):
        self._min_value = None
        for shingle in document:
            if self.hash_bits == 64:
                hash_value = mmh3.hash64(shingle, int(seed))[0]
            elif self.hash_bits == 32:
                hash_value = mmh3.hash(shingle, int(seed))
            else:
                hash_value = mmh3.hash128(shingle, int(seed))
            if not self._min_value:
                self._min_value = hash_value
            elif self._min_value > hash_value:
                self._min_value = hash_value
        signature.append(self._min_value)
    return signature
def update(self, instance, y):
    for aggKey in self._keys:
        key_for_update = hash64(str(tuple([key + '_' + instance[key] for key in aggKey])))  # hash for memory issue
        temp_list = self._counter_map[key_for_update]
        if len(temp_list) == self.mem_len:
            temp_list.popleft()
        temp_list.append((self.time, y))
def compare_python_to_reference_murmur3_64(data: Any, seed: int = 0) -> None:
    """
    Checks the pure Python implementation of 64-bit murmur3 against the
    ``mmh3`` C-based module.

    Args:
        data: data to hash
        seed: seed

    Raises:
        AssertionError: if the two calculations don't match
    """
    assert mmh3, "Need mmh3 module"
    c_data = to_str(data)
    # noinspection PyUnresolvedReferences
    c_signed_low, c_signed_high = mmh3.hash64(c_data, seed=seed, x64arch=IS_64_BIT)
    py_data = to_bytes(c_data)
    py_signed_low, py_signed_high = pymmh3_hash64(py_data, seed=seed)
    preamble = (
        f"Hashing {data!r} with MurmurHash3/64-bit values from 128-bit "
        f"hash/seed={seed}"
    )
    if c_signed_low == py_signed_low and c_signed_high == py_signed_high:
        print(preamble + f" -> (low={c_signed_low}, high={c_signed_high}): OK")
    else:
        raise AssertionError(
            preamble +
            f"; mmh3 says {c_data!r} -> "
            f"(low={c_signed_low}, high={c_signed_high}), "
            f"Python version says {py_data!r} -> "
            f"(low={py_signed_low}, high={py_signed_high})")
def bloom_add(self, buffer):
    if self.ready == 0:
        print("Bloom filter was not created properly\n")
        return 1
    else:
        if self.no_of_elements < self.entries:
            a = mmh3.hash64(buffer)[0]
            b = mmh3.hash64(buffer)[1]
            for i in range(self.hashes):
                x = (a + i * b) % (self.bits)
                self.bf[x] = 1
            self.no_of_elements += 1
        else:
            print("Bloom filter capacity crossed\n")
            return -1
    return 0
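# The two Bloom-filter methods above derive all k probe positions from a single
# mmh3.hash64 call via double hashing, (a + i*b) % m. A minimal self-contained
# sketch of the same scheme outside a class follows; M_BITS and K_HASHES are
# hypothetical parameters, not values from the snippets above.
import mmh3

M_BITS = 1 << 20   # hypothetical filter size in bits
K_HASHES = 7       # hypothetical number of hash functions

def probe_positions(item, k=K_HASHES, m=M_BITS):
    """Derive k bit positions from one mmh3.hash64 call via double hashing."""
    a, b = mmh3.hash64(item)
    return [(a + i * b) % m for i in range(k)]

bits = bytearray(M_BITS)                     # toy bit array (one byte per bit for brevity)
for pos in probe_positions(b"hello"):
    bits[pos] = 1                            # add "hello"
maybe_present = all(bits[pos] for pos in probe_positions(b"hello"))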
def benchmark(config):
    log_info("Starting...")
    q = qemu(1337, config, debug_mode=False)
    q.start(verbose=False)
    q.set_payload(open(config.argument_values["payload"][0]).read())
    print(mmh3.hash64(q.send_payload()))
    try:
        while True:
            start = time.time()
            execs = 0
            while (time.time() - start < REFRESH):
                q.set_payload(open(config.argument_values["payload"][0]).read())
                q.send_payload()
                execs += 1
            end = time.time()
            stdout.write(common.color.FLUSH_LINE + "Performance: " +
                         str(execs / (end - start)) + "t/s")
            stdout.flush()
    except:
        print("\nExit")
        q.__del__()
    try:
        for i in range(512):
            if os.path.exists("/tmp/kAFL_printf.txt." + str(i)):
                os.remove("/tmp/kAFL_printf.txt." + str(i))
            else:
                break
    except:
        pass
    return 0
def _k_smallest_hash(self, document):
    """ Generates a text's minhash signature using the k smallest neighbours method.

    Uses a single random hash to simulate a shuffle of each text's shingles,
    then selects the i smallest minimum hash values for j permutations.

    Faster but less stable than the multi-hash method.

    Args:
        document (list): List of text shingles.

    Returns:
        list: List of text signatures generated using the k smallest
            neighbours method.
    """
    signature = []
    # Uses a heap to make calculating n smallest values more efficient.
    heapq.heapify(signature)
    if len(document) <= self.permutations:
        raise ValueError(
            'N permutations must not be >= n shingles for k_smallest_values method'
        )
    for shingle in document:
        if self.hash_bits == 64:
            hashed_shingle = mmh3.hash64(shingle, self._hash_seeds)[0]
        elif self.hash_bits == 32:
            hashed_shingle = mmh3.hash(shingle, self._hash_seeds)
        else:
            hashed_shingle = mmh3.hash128(shingle, self._hash_seeds)
        heapq.heappush(signature, hashed_shingle)
    return heapq.nsmallest(self.permutations, signature)
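# Signatures built with the multi-hash routine above can be compared position
# by position to estimate Jaccard similarity. A minimal sketch, assuming two
# equal-length signatures from the multi-hash variant (for k-smallest
# signatures one would compare set overlap instead):
def estimate_jaccard(sig_a, sig_b):
    """Fraction of positions where two minhash signatures agree."""
    assert len(sig_a) == len(sig_b)
    matches = sum(1 for a, b in zip(sig_a, sig_b) if a == b)
    return matches / len(sig_a)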
def get_indexes(self, key):
    """ Generates the indices corresponding to the given key """
    h1, h2 = mmh3.hash64(key)
    for i in xrange(self.num_hashes):
        yield (h1 + i * h2) % self.num_bytes
def insert(self, codelet):
    """
    Insert a codelet into the database.

    :param codelet: The codelet to insert.
    :type codelet: :py:class:`.Codelet`
    """
    query1 = """INSERT INTO code VALUES (?, ?, ?)
                ON DUPLICATE KEY UPDATE code_id=code_id"""
    query2 = """INSERT INTO codelets VALUES
                (DEFAULT, ?, ?, ?, ?, ?, ?, ?)"""
    query3 = "INSERT INTO authors VALUES (DEFAULT, ?, ?, ?)"

    hash_key = str(codelet.language) + ":" + codelet.code.encode("utf8")
    code_id = mmh3.hash64(hash_key)[0]

    with self._conn.cursor() as cursor:
        cursor.execute(query1, (code_id, codelet.language, codelet.code))
        if cursor.rowcount == 1:
            for sym_type, symbols in codelet.symbols.iteritems():
                self._insert_symbols(cursor, code_id, sym_type, symbols)
        origin, url = self._decompose_url(cursor, codelet.url)
        cursor.execute(query2, (codelet.name, code_id, origin, url,
                                codelet.rank, codelet.date_created,
                                codelet.date_modified))
        codelet_id = cursor.lastrowid
        authors = [(codelet_id, a[0], a[1]) for a in codelet.authors]
        cursor.executemany(query3, authors)
def run(encoding_method, files):
    path = './temp_idx/'
    if not os.path.exists(path):
        os.makedirs(path)
    index, url_list = {}, []
    current_partition_id = 0
    for doc_idx, doc in tqdm(files):
        doc_idx = int(doc_idx)
        text = get_doctext(doc_idx)
        url_list.append(doc['url'] + '\n')
        terms = set(extract_words(text))
        if len(terms) == 0:
            # print('Document {0} is empty'.format(doc_idx))
            continue
        for term in terms:
            key = mmh3.hash64(term)[0]
            if key in index:
                index[key].append(doc_idx)
            else:
                # Zero index is used for delta computation for first document in sequence
                index[key] = [0, doc_idx]
        if (doc_idx + 1) % index_partition_size == 0:
            filename = os.path.join(INDEX_PATH, 'part{0:03d}'.format(current_partition_id))
            write_index_partition(filename, index, encoding_method)
            current_partition_id += 1
    filename = os.path.join(INDEX_PATH, 'part{0:03d}'.format(current_partition_id))
    write_index_partition(filename, index, encoding_method)
    with open(os.path.join(INDEX_PATH, 'encoding.ini'), 'w') as config_file:
        config_file.write(encoding_method)
    with open(os.path.join(INDEX_PATH, 'url_list'), 'w') as f:
        f.writelines(url_list)
def hashing(item):
    h1 = murmur.hash64(item)
    hashes = []
    for i in range(1, k + 1):
        uniq = (h1[0] + i * h1[1]) % m
        hashes.append(uniq)
    return hashes
def might_contain(self, key, num_hash_functions, array):
    bit_size = len(array)
    if isinstance(key, int) and self.INT_MIN <= key <= self.INT_MAX:
        hash1, hash2 = mmh3.hash64(key.to_bytes(4, byteorder="little"))
    elif isinstance(key, int) and self.LONG_MIN <= key <= self.LONG_MAX:
        hash1, hash2 = mmh3.hash64(key.to_bytes(8, byteorder="little"))
    else:
        hash1, hash2 = mmh3.hash64(key)
    combined_hash = hash1
    for _ in range(num_hash_functions):
        index = (combined_hash & self.LONG_MAX) % bit_size
        if not array[index]:
            return False
        combined_hash += hash2
    return True
def update_min_hash_signature(word, min_hash_signature):
    root_hash = mmh3.hash64(pickle.dumps(word))[0]
    # XOR root hash with k randomly generated integers to simulate k hash functions
    word_hashes = np.bitwise_xor(masks, root_hash)
    min_hash_signature = np.minimum(min_hash_signature, word_hashes)
    return min_hash_signature
def process_file(tenant, file_path, pipeline, lines):
    line_counter = lines
    with open(file_path) as f:
        for line in f:
            uuid, tids = line.strip().split(LINE_DELIMITER)
            hashed_bytes = hash64(uuid)[0].to_bytes(HASH_BYTES, byteorder=BYTE_ORDER, signed=HASH_SIGNED)
            tenant_bytes = int(tenant).to_bytes(1, byteorder=BYTE_ORDER, signed=HASH_SIGNED)
            redis_key = tenant_bytes + hashed_bytes[:KEY_BYTES]
            redis_field = hashed_bytes[KEY_BYTES:KEY_BYTES + FIELD_BYTES]
            redis_value = bytearray()
            for tid in tids.split(TID_DELIMITER):
                redis_value.extend(int(tid).to_bytes(TID_BYTES, byteorder=BYTE_ORDER, signed=HASH_SIGNED))
            pipeline.hset(redis_key, redis_field, bytes(redis_value))
            line_counter += 1
            if line_counter % PIPELINE_SIZE == 0:
                pipeline.execute()
                logging.info(line_counter)
    pipeline.execute()
    return line_counter
def copy_bitmap(self, shm, num, size, bitmap, payload, payload_size, effector_mode=False):
    new_hash = mmh3.hash64(bitmap)
    if not (self.crashed or self.kasan or self.timeout):
        if new_hash in self.lookup.non_finding:
            if effector_mode:
                shm.seek(size * num)
                shm.write(bitmap)
                return True
            else:
                shm.seek((size * num) + len(bitmap))
                return False
    if not (self.crashed or self.kasan or self.timeout) and not self.check_for_unseen_bits(bitmap):
        self.lookup.non_finding[new_hash] = None
        return False
    if not (self.timeout):
        bitmap = self.verifiy_input(payload, bitmap, payload_size)
    shm.seek(size * num)
    shm.write(bitmap)
    self.lookup.non_finding[new_hash] = None
    return True
def debug_execution(config, execs, qemu_verbose=False, notifiers=True):
    log_debug("Starting debug execution...(%d traces)" % execs)
    payload_file = config.argument_values["input"]
    zero_hash = mmh3.hash64(("\x00" * config.config_values['BITMAP_SHM_SIZE']))
    q = qemu(1337, config, debug_mode=True, notifiers=notifiers)
    q.start()
    start = time.time()
    for i in range(execs):
        log_debug("Launching payload %d/%d.." % (i + 1, execs))
        if i % 3 == 0:
            q.set_payload(read_binary_file(payload_file))
        # time.sleep(0.01 * rand.int(0, 9))
        # a = str(q.send_payload())
        # hexdump(a)
        result = q.send_payload()
        current_hash = result.hash()
        if zero_hash == current_hash:
            log_debug("Feedback Hash: " + str(current_hash) + common.color.WARNING +
                      " (WARNING: Zero hash found!)" + common.color.ENDC)
        else:
            log_debug("Feedback Hash: " + str(current_hash))
        # log_debug("Full hexdump:\n" + hexdump(result.copy_to_array()))
        if result.is_crash():
            q.restart()
    q.shutdown()
    end = time.time()
    print("Performance: " + str(execs / (end - start)) + "t/s")
    return 0
def compute(filename, *, k=21, scaled=1000, output=None, **kwargs):
    import mmh3
    import screed

    # Compute the actual hashes to insert by breaking down the sequence
    # into k-mers and applying MurmurHash to each one; here, the only
    # interesting thing that is done by add() is to keep only the
    # hashes smaller than max_hash, where max_hash = 2^64 / scaled.
    mh = ScaledMinHash(k=k, scaled=scaled, filename=filename)
    name = None
    with screed.open(filename) as f:
        for record in f:
            if name is None:
                name = record.name
            for kmer in canonical_kmers(record.sequence, k):
                h = mmh3.hash64(kmer, seed=42)[0]
                # convert to unsigned int if negative
                if h < 0:
                    h += 2**64
                mh.add(h)
    mh.name = name
    if output is None:
        output = sys.stdout
    mh.save(output)
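# The helper canonical_kmers is assumed but not shown above. A plausible
# implementation (an assumption, not necessarily the author's code) yields the
# lexicographically smaller of each k-mer and its reverse complement, so both
# strands of a DNA sequence hash to the same value:
def canonical_kmers(sequence, k):
    """Yield the canonical form of every k-mer: min(kmer, reverse complement)."""
    complement = str.maketrans("ACGT", "TGCA")
    seq = sequence.upper()
    for i in range(len(seq) - k + 1):
        kmer = seq[i:i + k]
        rc = kmer.translate(complement)[::-1]
        yield min(kmer, rc)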
def copy_bitmap(self, shm, num, size, bitmap, payload, payload_size,
                effector_mode_hash=None, apply_patches=True):
    if self.crashed or self.kasan or self.timeout:
        shm.seek(size * num)
        shm.write(bitmap)
        return True
    new_hash = mmh3.hash64(bitmap)
    if effector_mode_hash and effector_mode_hash != new_hash:
        shm.seek(size * num)
        shm.write(bitmap)
        return True
    if self.lookup.check_value(new_hash):
        shm.seek((size * num) + len(bitmap))
        return False
    if not (self.timeout) and not self.check_for_unseen_bits(bitmap):
        self.lookup.set_value(new_hash)
        return False
    shm.seek(size * num)
    shm.write(bitmap)
    self.lookup.set_value(new_hash)
    return True
def find(self, item):
    key = mmh3.hash64(item)[0]
    bucket_idx = key % self.buckets_count
    target_bucket = self.buckets[bucket_idx]
    term_idx = TermDictionary.__binary_search(target_bucket[:, 0], key)
    offset, n_bytes = target_bucket[term_idx, 1], target_bucket[term_idx, 2]
    return offset, n_bytes
def get_hash(cls, item, case_sensitive):
    """
    Returns two hash values computed using a single hash function on item
    ...

    Parameters
    ----------
    item : object
        The input element to add or check
    case_sensitive : bool
        Flag indicating whether the item value is case sensitive

    Returns
    -------
    tuple
        Two hash values computed using the mmh3 hash function
    """
    try:
        # If case_sensitive is False, convert item to lowercase for the add and
        # exist methods, so differently cased values are treated as the same item.
        if not case_sensitive:
            item = item.lower()
        # Hashing is compute intensive, hence we compute it only once.
        # Using a non-cryptographic hash function is highly performance efficient
        # and serves the purpose of a bloom filter.
        # mmh3.hash64 returns two hash values for the input item.
        hash_vals = mmh3.hash64(item)
        return hash_vals
    except Exception as e:
        logger.log(str(e))
        sys.exit(1)
def exec_tree(query_tree, Term_dict, InvIndexEncoded, num_docID):
    if query_tree is None:
        return set()
    if query_tree.is_operator:
        S1 = exec_tree(query_tree.left, Term_dict, InvIndexEncoded, num_docID)
        S2 = exec_tree(query_tree.right, Term_dict, InvIndexEncoded, num_docID)
        op = query_tree.value
        if op == '!':
            S = set(range(num_docID))
            return S - S2
        elif op == '&':
            return S1 & S2
        elif op == '|':
            return S1 | S2
        else:
            print "ERROR"
            return None
    else:  # query_tree.is_term == True
        term_hash = mmh3.hash64(query_tree.value)[0]
        if term_hash in Term_dict:
            substr = InvIndexEncoded[Term_dict[term_hash][0]:
                                     Term_dict[term_hash][0] + Term_dict[term_hash][1]]
            return set(encoder.decode(substr))
        else:
            return set()
def readHash(self):
    hll = Hll(self.p)
    x = sys.stdin.readline().rstrip('\n')
    while x:
        hll.AddItem64(mmh3.hash64(str(x))[0])
        x = sys.stdin.readline().rstrip('\n')
    print hll.Count64()
def _indexes(self, key):
    """ Generates the indices corresponding to the given key """
    h1, h2 = mmh3.hash64(key)
    for i in xrange(self.num_hashes):
        yield (h1 + i * h2) % self.num_bytes
def client():
    nodes_ = []
    # Position servers on the hash ring.
    for i in range(len(servers)):
        nodes_.append(Node(servers[i], 123 * i, 100))
    # Now read the data in the csv and push it to the selected server.
    with open('causes-of-death.csv', mode='r') as csvfile:
        csvreader_ = csv.reader(csvfile, delimiter=',')
        row_number_ = 0
        for row_ in csvreader_:
            if row_number_ == 0:  # skip header
                row_number_ += 1
                continue
            # print(', '.join(row_))
            key_ = "%s:%s:%s" % (row_[0], row_[2], row_[3])
            hash_ = mmh3.hash64(key_)[0]
            node_ = determine_responsible_node(nodes_, key_)
            # print("{} - {} - {}".format(hash_, str(node_), ','.join(row_)))
            payload_ = {'{}'.format(hash_): ','.join(row_)}
            # print(payload_)
            # Prepare http request and post payload.
            req_ = urllib.request.Request("{}/api/v1/entries".format(node_.name_))
            req_.add_header('Content-Type', 'application/json; charset=utf-8')
            json_data_ = json.dumps(payload_)
            json_data_bytes_ = json_data_.encode('utf-8')
            req_.add_header('Content-Length', len(json_data_bytes_))
            resp_ = urllib.request.urlopen(req_, json_data_bytes_)
            print("{} - {}".format(resp_.reason, payload_))
def murmur3_64bit(obj):
    """
    Use murmur3_64bit for 64 bit hash by passing this method:
    hasher=DeepHash.murmur3_64bit
    """
    obj = obj.encode('utf-8')
    # This version of murmur3 returns two 64-bit integers; keep the first.
    return mmh3.hash64(obj, MURMUR_SEED)[0]
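# A quick usage note for the helper above: it takes a text value, encodes it as
# UTF-8, and keeps only the first signed 64-bit half. MURMUR_SEED is assumed to
# be defined elsewhere in the enclosing module.
h = murmur3_64bit("café")        # non-ASCII input is fine, it is UTF-8 encoded first
assert isinstance(h, int)
assert -2**63 <= h < 2**63       # signed 64-bit half returned by mmh3.hash64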
def find_lsh_buckets(hash_signature, lsh_band_width, lsh_num_buckets):
    bands = [tuple(hash_signature[i:i + lsh_band_width])
             for i in range(0, len(hash_signature), lsh_band_width)]
    lsh_hashes = [(mmh3.hash64(pickle.dumps(row))[0] % lsh_num_buckets) for row in bands]
    return lsh_hashes
def update_min_hash_signature(self, word, min_hash_signature):
    root_hash = mmh3.hash64(word.encode("ascii", "ignore"))[0]
    # root_hash = mmh3.hash64(pickle.dumps(word))[0]  # For MinHashing shingles
    # XOR root hash with k randomly generated integers to simulate k hash
    # functions; could add a bit-roll if there's time.
    word_hashes = np.bitwise_xor(self._masks, root_hash)
    min_hash_signature = np.minimum(min_hash_signature, word_hashes)
    return min_hash_signature
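# The two XOR-mask snippets above assume a masks array and an initial signature.
# One hypothetical initialization consistent with that trick (an assumption, not
# the authors' code): k random 64-bit integers for the masks and a signature
# filled with the largest int64 value so any observed hash replaces it.
import numpy as np

K = 128  # hypothetical number of simulated hash functions
rng = np.random.RandomState(42)
masks = rng.randint(np.iinfo(np.int64).min, np.iinfo(np.int64).max,
                    size=K, dtype=np.int64)

def new_min_hash_signature(k=K):
    """Start at the maximum value so np.minimum updates take effect."""
    return np.full(k, np.iinfo(np.int64).max, dtype=np.int64)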
def gen_features(self, instance, logtime, D):
    # Generate features based on the instance's attributes.
    # For each key, we generate hash((bin(logtime - time[i]), i, lastY[i])) % D
    for aggKey in self._keys:
        key_for_feature = hash64(str(tuple([key + '_' + instance[key] for key in aggKey])))
        for idx, content in enumerate(self._counter_map[key_for_feature]):
            time, lastY = content
            val = int(log((logtime - time).total_seconds() + 1.))
            yield abs(hash(str(aggKey) + '_' + str(idx) + '_' + str((val, lastY)))) % D, 1.
def murmurhash3_64(item, seed=0):
    """ Murmurhash 3 for 64-bit integers (returns the first of a tuple of two) """
    if type(item) is not str:
        item = str(item)
    if type(seed) is not int:
        seed = int(seed)
    return mmh3.hash64(item, seed=seed)
def params_stand(self, infos, res_id):
    """params to stand DB"""
    infos['issue_time'] = infos.pop("create_time")
    infos['res_id'] = res_id
    hstr = "%snaviappfeedback" % infos['res_id']
    infos['mid'] = mmh3.hash64(hstr)[0]  # unique
    infos['intelligence_source'] = 31
    infos['dispatch_flag'] = 0
    infos['update_time'] = infos['commit_time']
def hash64(key, seed):
    """
    Wrapper around mmh3.hash64 to get a single 64-bit value.

    This also does the extra work of ensuring that we always treat the
    returned value as a big-endian unsigned long, like smhasher used to do.
    """
    hash_val = mmh3.hash64(key, seed)[0]
    return struct.unpack('>Q', struct.pack('q', hash_val))[0]
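# The pack/unpack round-trip above reinterprets the signed half as unsigned and
# byte-swaps it on little-endian hosts. For the plain signed-to-unsigned
# reinterpretation without the swap, masking is equivalent; a small sketch
# (names here are illustrative only):
import mmh3

signed_half = mmh3.hash64(b"example", 0)[0]
unsigned_half = signed_half & 0xFFFFFFFFFFFFFFFF
assert unsigned_half == signed_half % 2**64   # same two's-complement reinterpretation
assert 0 <= unsigned_half < 2**64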
def go(self):
    hll = Hll(self.p)
    hashvalues = []
    for x in range(0, int(self.n)):
        hashvalues.append(mmh3.hash64(str(x))[0])
    debut = int(round(time.time() * 1000))
    for i in range(0, int(self.n)):
        hll.AddItem64(hashvalues[i])
    fin = int(round(time.time() * 1000))
    print hll.Count64()
    print "time = " + str(fin - debut) + "ms"
def find_match(threadName, size, ohash):
    global match
    global solution
    global found_by
    while match == 0:
        rands = str(bytearray(os.urandom(size)))
        h1 = mmh3.hash64(rands)
        if h1 == ohash:
            solution = rands
            match = 1
            found_by = threadName
def sim_shi4_mm3(text):
    # NB: It makes quite little sense to use both 64bit numbers to compare
    # hashes as pairwise Hamming distance using high 64bit is highly correlated
    # with the distance computed using low 64bit. It's actually expected, but
    # it means, that summing these distances is not linear and should be avoided.
    # -- https://gist.github.com/darkk/e2b2762c4fe053a3cf8a299520f0490e
    i1, i2 = itertools.tee(WORD_RE.finditer(text))
    for _ in xrange(3):  # 4 words per shingle
        next(i2, None)
    mm = [mmh3.hash64(text[m1.start():m2.end()])
          for m1, m2 in itertools.izip(i1, i2)]
    return (simhash.compute([_[0] & 0xffffffffffffffff for _ in mm]),
            simhash.compute([_[1] & 0xffffffffffffffff for _ in mm]))
def tx_partition(app, txid):
    """ Return a blob hash for a given application and transaction ID.

    Args:
        app: A string specifying the application ID.
        txid: An integer specifying the transaction ID.
    Returns:
        A bytearray that can be used as the transaction partition key.
    """
    murmur_int = mmh3.hash64(app + str(txid))[0]
    # Distribute the integer range evenly across the byte ordered token range.
    return bytearray(struct.pack('<q', murmur_int))
def compressor_worker(): while not q.empty(): w_base, w_rel_base, w_f = q.get() w_rel_base = '' if w_rel_base == '.' else w_rel_base abs_path = os.path.join(w_base, w_f) rel_path = os.path.join(w_rel_base, w_f) extension = os.path.splitext(rel_path)[1][1:] raw_filestring = open(abs_path).read() compressed_filestring = lzo.compress(raw_filestring, options.compression) len_raw = len(raw_filestring) len_compressed = len(compressed_filestring) compression_factor = (float(len_compressed) / len_raw) if len_raw else 0 compression_used = False if compression_factor < options.cutoff and False: compression_used = True string_final = compressed_filestring if compression_used else raw_filestring len_final = len(string_final) adler32_final = lzo.adler32(string_final) compressed_data_chunks.append({ 'path': rel_path, 'path_mmh3': mmh3.hash64(rel_path)[0], 'adler32': adler32_final, 'size_before': len_raw, 'size_after': len_final, 'factor': compression_factor, 'compression': 1 if compression_used else 0, 'extension_str': extension, 'extension': extensions[extension] if extension in extensions else 0, 'data': string_final }) if options.verbose: print('\t'.join(( 'Y' if compression_used else 'N', extension, '%.02f' % (compression_factor * 100.0), str(len_raw / 1024), str(len_final / 1024), str(adler32_final), rel_path ))) q.task_done()
def map_sketch(input, options):
    '''
    Returns a list of sketches (x, r, pos, d)
        x   = hashed sketch
        r   = ID of original sequence
        pos = index of kmer starting position in original sequence
        d   = count of kmers extracted
    '''
    id_seq = input.split('\t')
    # pp.pprint(type(id_seq[0]))
    sketches = [(mmh3.hash64(i[0])[0], (int(id_seq[0]), i[1], len(id_seq[1]) - options.kmer + 1))
                for i in gen_kmers(id_seq[1], options)]
    # if (mmh3.hash64(i)[0] % options.mod == 0)
    return sketches
def hash_line(line, n, size, order=1):
    line = line.strip().lower().split()
    res = []
    for w in ngrams(line, order):
        h1, h2 = mmh3.hash64(w)
        for s in range(n):
            hashval = (h1 + s * h2) % size
            res.append(int(hashval))
            # res.append(hash("%s\t%s" % (s, w)) % size)
    res = list(set(res))
    res.sort()
    # print("%d => %d" % (len(line), len(res)))
    return res
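# hash_line above relies on an ngrams helper that is not shown. Because the
# n-grams are passed straight to mmh3.hash64, they must be str or bytes; a
# hypothetical helper consistent with that (an assumption, not the original
# code) joins each token window into a single string:
def ngrams(tokens, order):
    """Yield space-joined n-grams of the given order from a token list."""
    for i in range(len(tokens) - order + 1):
        yield " ".join(tokens[i:i + order])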
def test_compute_librarylink_hash(inputdata, expected):
    bits128 = mmh3.hash64(inputdata)
    bits64 = bits128[0]
    hexbits64 = hex(bits64)
    hexbits128 = [hex(x) for x in bits128]
    octets = struct.pack('!q', bits64)
    octets_raw = [hex(c) for c in octets]
    octets_rawer = [hex(c)[2:].zfill(2) for c in octets]
    encoded = base64.urlsafe_b64encode(octets).rstrip(b"=")
    encoded_unsafe = base64.b64encode(octets).rstrip(b"=")
    assert encoded == expected, (encoded, expected)
    encoded = simple_hashstring(inputdata).encode('ascii')
    assert encoded == expected, (encoded, expected)
def search(self, tree, page=1): """ Search the database for a query and return the *n*\ th page of results. :param tree: The query to search for. :type tree: :py:class:`~.query.tree.Tree` :param page: The result page to display. :type page: int :return: The total number of results, and the *n*\ th page of results. :rtype: 2-tuple of (long, list of :py:class:`.Codelet`\ s) """ query1 = "SELECT 1 FROM cache WHERE cache_id = ?" query2 = """SELECT cdata_codelet, cache_count_mnt, cache_count_exp FROM cache INNER JOIN cache_data ON cache_id = cdata_cache WHERE cache_id = ? ORDER BY cdata_index ASC""" query3 = "INSERT INTO cache VALUES (?, ?, ?, DEFAULT)" query4 = "INSERT INTO cache_data VALUES (?, ?, ?)" cache_id = mmh3.hash64(str(page) + ":" + tree.serialize())[0] with self._conn.cursor() as cursor: cursor.execute(query1, (cache_id,)) cache_hit = cursor.fetchall() if cache_hit: cursor.execute(query2, (cache_id,)) rows = cursor.fetchall() num_results = rows[0][1] * (10 ** rows[0][2]) if rows else 0 ids = [row[0] for row in rows] else: ids, num_results = self._search_with_query(cursor, tree, page) num_exp = max(len(str(num_results)) - 3, 0) num_results = int(round(num_results, -num_exp)) num_mnt = num_results / (10 ** num_exp) cursor.execute(query3, (cache_id, num_mnt, num_exp)) cdata = [(cache_id, c_id, i) for i, c_id in enumerate(ids)] cursor.executemany(query4, cdata) codelet_gen = self._get_codelets_from_ids(cursor, ids, tree) return (num_results, list(codelet_gen))
def post(self): status, dinfo = self.storinfo() if status == "error": self.jsonError({"msg":status}) return # insert into source_db try: #sourcedb_result = yield self.db_s.ns_map_infoplat.rf_naviapp_feedback.update(\ # {"ugc_id":dinfo['ugc_id']}, dinfo, upsert=True, w=True) sourcedb_result = yield self.db_s.ns_map_infoplat.rf_naviapp_feedback.insert(dinfo) except pymongo.errors.ConnectionFailure: self.jsonError({"msg": "connect mongodb timeout"}) return ''' dinfo['res_id'] = sourcedb_result.get("upserted", "ERROR") if dinfo['res_id'] == "ERROR": self.jsonError({"msg": "insert source_db ERROR, ugc_id is duplicated"}) return ''' dinfo['res_id'] = sourcedb_result hstr = "%snaviappfeedback" % dinfo['res_id'] dinfo['mid'] = mmh3.hash64(hstr)[0] #unique dinfo['intelligence_source'] = 31 dinfo['dispatch_flag'] = 0 #form geom in mongodb for geo_field in ("siwei_link1_list", "siwei_link2_list", "current_path_list", "current_track_list"): dinfo[geo_field] = self.form_dict_geom(dinfo[geo_field]) # insert into stand_db try: standdb_result = yield self.db_r.info.inte_naviapp_feedback.insert(dinfo) except pymongo.errors.PyMongoError as e: self.jsonError({"msg": e}) return self.jsonOk() '''
def __init__(self, url): self.url = urlnorm.norm(url) self.tld = get_tld(url).encode('ascii','ignore') self.crawl_data = {} self.crawl_data['failure'] = False self.crawl_data['url'] = self.url self.crawl_data['id'] = mmh3.hash64(self.url)[0] self.cassandra_cluster = ['127.0.0.1'] self.keyspace = 'crawlr' self.cluster = Cluster(self.cassandra_cluster) self.session = self.cluster.connect(self.keyspace) self.session.row_factory = dict_factory # Prepared Cassandra queries. self.check_prepped_stmt = self.session.prepare( """ SELECT id FROM pages WHERE id = ?; """) self.add_fail_prepped_stmt = self.session.prepare( """ UPDATE failure_counts SET failures = failures + 1 WHERE id = ?; """) self.del_fail_prepped_stmt = self.session.prepare( """ DELETE failures FROM failure_counts where id = ?; """) self.add_crawl_prepped_stmt = self.session.prepare( """ INSERT INTO pages ( id, url, crawled_at, failure, title, body, internal_links, outbound_links) VALUES (?, ?, ?, ?, ?, ?, ?, ?) """)
def simple_hashstring(obj, bits=64):
    '''
    Creates a simple hash in brief string form from obj.
    bits is an optional bit width, defaulting to 64, and should be in
    multiples of 8 with a maximum of 64.

    >>> from bibframe.contrib.datachefids import simple_hashstring
    >>> simple_hashstring("The quick brown fox jumps over the lazy dog")
    'bBsHvHu8S-M'
    >>> simple_hashstring("The quick brown fox jumps over the lazy dog", bits=48)
    'B7x7vEvj'
    '''
    # Useful discussion of techniques here:
    # http://stackoverflow.com/questions/1303021/shortest-hash-in-python-to-name-cache-files
    # Use MurmurHash3.
    # Get a 64-bit integer, the first half of the 128-bit tuple from mmh, and
    # then bit shift it to get the desired bit length.
    basis = mmh3.hash64(str(obj))[0] >> (64 - bits)
    if bits == 64:
        raw_hash = struct.pack('!q', basis)
    else:
        raw_hash = struct.pack('!q', basis)[:-int((64 - bits) / 8)]
    hashstr = base64.urlsafe_b64encode(raw_hash).rstrip(b"=")
    return hashstr.decode('ascii')
def simple_hashstring(obj, bits=48):
    '''
    Creates a simple hash in brief string form from obj.
    bits is an optional bit width, defaulting to 48, and should be in
    multiples of 8.

    >>> from datachef.ids import simple_hashstring
    >>> simple_hashstring("The quick brown fox jumps over the lazy dog")
    B7x7vEvj
    '''
    # Useful discussion of techniques here:
    # http://stackoverflow.com/questions/1303021/shortest-hash-in-python-to-name-cache-files
    # Abandoned idea of using MD5 and truncating
    # raw_hash = hashlib.md5(title).digest()
    # Abandoned Adler32 for MurmurHash3
    # raw_hash = struct.pack('i', zlib.adler32(title[:plain_len]))
    # Use MurmurHash3.
    # Get a 64-bit integer, the first half of the 128-bit tuple from mmh, and
    # then bit shift it to get the desired bit length.
    basis = mmh3.hash64(str(obj))[0] >> (64 - bits)
    raw_hash = struct.pack('l', basis)[:-int((64 - bits) / 8)]
    hashstr = base64.urlsafe_b64encode(raw_hash).rstrip(b"=")
    return hashstr.decode('ascii')
CARD = 100000
NB = 1000
STEP = 500
values = []

# An array of CARD/STEP cardinalities
for i in range(CARD / STEP):
    values.append([])

x = 0
for x in range(NB):
    hll = Hll(14)
    for j in range(CARD):
        hll.AddItem64(mmh3.hash64(str(random.randrange(0, CARD * 10)))[0])
        # Every STEP items
        if j % STEP == 0:
            count = hll.Count64()
            x1 = min(valuesdata, key=lambda x: abs(x - count))
            y1 = data[x1]
            if valuesdata.index(x1) + 1 != len(valuesdata):
                x2 = valuesdata[valuesdata.index(x1) + 1]
                y2 = data[x2]
                inter = interpolation(count, x1, y1, x2, y2)
                values[j / STEP].append(inter)
            else:
                values[j / STEP].append(x2)
CARD = 100000
NB = 100
STEP = 500
values = []

# An array of CARD/STEP cardinalities
for i in range(CARD / STEP):
    values.append([])

x = 0
z = 0
# Run NB tests
for x in range(NB):
    print x
    hll = Hll(14)
    for j in range(CARD):
        hll.AddItem64(mmh3.hash64(str(z))[0])
        z += 1
        # Every STEP items
        if j % STEP == 0:
            count = hll.Count64()
            values[j / STEP].append(count)

for i in range(len(values)):
    sum = 0
    for j in range(len(values[i])):
        sum = sum + values[i][j]
    avg = sum / len(values[i])
    mid = percentile(values[i], 0.50)
    one = percentile(values[i], 0.01)
def vk_hash(string): return mmh3.hash64(string)[0]
def _indexes(self, key):
    h1, h2 = mmh3.hash64(key)
    for i in xrange(self.num_hashes):
        yield (h1 + i * h2) % self.num_bits
def test_hash64(self):
    h1, h2 = mmh3.hash64('hello')
    assert (h1, h2) == (-3758069500696749310, 6565844092913065241)
def generate_hash(s):
    """Generates hash for the string argument."""
    k1, k2 = mmh3.hash64(s)
    anded = 0xFFFFFFFFFFFFFFFF
    return '%016x%016x' % (k1 & anded, k2 & anded)
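# A quick usage check of the format produced above: both 64-bit halves are
# masked to unsigned and zero-padded to 16 hex digits, so the digest is always
# 32 lowercase hex characters.
digest = generate_hash("example")
assert len(digest) == 32
assert all(c in "0123456789abcdef" for c in digest)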
def main(argv): input = ["book", "surprise", "and", "nightmare", "or", "caravans", "not", "pray"] actions = ["and", "or", "not"] result = [] folderHashMap = {} wordsHashMap = {} indexDirect = {} indexReversed = {} if len(argv) > 0: # print("give something to search for!") argv = [x.lower() for x in argv] input = argv for i in range(1, len(input)): if input[i] in actions and input[i-1] in actions: return 1 # jet! currentDirName = os.path.basename(os.path.abspath(os.curdir)) print('file : ' + '.\\output\\' + currentDirName + '.folderHashMap') with open('.\\output\\' + currentDirName + '.folderHashMap', 'r') as infile: folderHashMap = json.load(infile) print('file : ' + '.\\output\\' + currentDirName + '.wordsHashMap') with open('.\\output\\' + currentDirName + '.wordsHashMap', 'r') as infile: wordsHashMap = json.load(infile) print('file : ' + '.\\output\\' + currentDirName + '.indexDirect') with open('.\\output\\' + currentDirName + '.indexDirect', 'r') as infile: indexDirect = json.load(infile) print('file : ' + '.\\output\\' + currentDirName + '.indexReversed\n') with open('.\\output\\' + currentDirName + '.indexReversed', 'r') as infile: indexReversed = json.load(infile) if len(folderHashMap) > 0 and len(wordsHashMap) > 0 and len(indexDirect) > 0 and len(indexReversed) > 0 : if len(input) > 1: # remove and operations on 0 position for i in range(0, len(input)): if i == 0 and input[i] in actions: input.pop(0) i = 0 else: break if len(input) > 1: # add missing operations for i in range(1, len(input)): if input[i] not in actions and input[i-1] not in actions: input.insert(i, "and") i += 2 action = input[1] # array of arrays like [[f1, nrOfOccurancies1], [f2, nrOfOccurancies2], ...] wordKey1=str(mmh3.hash64(input[0])[0]) wordKey2=str(mmh3.hash64(input[2])[0]) if wordKey1 in indexReversed: files_1 = [item[0] for item in indexReversed[wordKey1]] else: files_1=[] action="or" if wordKey2 in indexReversed: files_2 = [item[0] for item in indexReversed[wordKey2]] else: files_2=[] action="or" f1_len = len(files_1) f2_len = len(files_2) if action == "and": if f1_len > f2_len: result = [item for item in files_2 if item in files_1] else: result = [item for item in files_1 if item in files_2] elif action == "or": if f1_len > f2_len: aux = files_1 result = [item for item in files_2 if item not in aux] result += aux else: aux = files_2 result = [item for item in files_1 if item not in aux] result += aux elif action == "not": result = [item for item in files_1 if item not in files_2] else: print("what the ???\nsomthing is definitely wrong here!") print(input[0] + " " + input[1] + " " + input[2]) for r in result: print(folderHashMap[str(r)]) print("\n") size = len(input) if size > 3: for i in range(3, size-1): if input[i].lower() in actions and input[i+1] not in actions: action = input[i] # array of arrays like [[f1, nrOfOccurancies1], [f2, nrOfOccurancies2], ...] 
files_1 = result wordKey2=str(mmh3.hash64(input[i+1])[0]) if wordKey2 in indexReversed: files_2 = [item[0] for item in indexReversed[wordKey2]] else: files_2=[] action="or" f1_len = len(files_1) f2_len = len(files_2) if action == "and": if f1_len > f2_len: result = [item for item in files_2 if item in files_1] else: result = [item for item in files_1 if item in files_2] elif action == "or": if f1_len > f2_len: aux = files_1 result = [item for item in files_2 if item not in aux] result += aux else: aux = files_2 result = [item for item in files_1 if item not in aux] result += aux elif action == "not": result = [item for item in files_1 if item not in files_2] else: print("what the ???\nsomthing is definitely wrong here!") print("previous " + input[i] + " " + input[i+1]) for r in result: print(folderHashMap[str(r)]) print("\n") else: if input[0] not in actions: key = str(mmh3.hash64(input[0])[0]) if key in indexReversed: result = [item[0] for item in indexReversed[key]] else: if input[0] not in actions: key = str(mmh3.hash64(input[0])[0]) if key in indexReversed: result = [item[0] for item in indexReversed[key]] else: print("sorry, have no data to search in.") print("Files for \"" + str(input) + "\" query : ") for r in result: print(folderHashMap[str(r)])