def test_init(self):
    m1 = MinHash(4, 1, hashfunc=fake_hash_func)
    m2 = MinHash(4, 1, hashfunc=fake_hash_func)
    lm1 = LeanMinHash(m1)
    lm2 = LeanMinHash(m2)
    self.assertTrue(np.array_equal(lm1.hashvalues, lm2.hashvalues))
    self.assertTrue(np.array_equal(lm1.seed, lm2.seed))
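# The tests in this section rely on fake_hash_func and FakeHash fixtures from
# the datasketch test suite. A minimal sketch of what they might look like;
# any deterministic stand-in with these interfaces would work.
import struct

def fake_hash_func(data):
    # Deterministic stand-in for a hash function: MinHash only needs an
    # integer back, so the identity function suffices.
    return data

class FakeHash(object):
    # Deterministic stand-in for the deprecated hashobj interface, which
    # expects an object exposing a digest() method.
    def __init__(self, h):
        self.h = h

    def digest(self):
        return struct.pack('<Q', self.h)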
def joinable_column_search():
    query_id = request.args.get('id', None, type=uuid.UUID)
    if query_id is None:
        return jsonify([])
    limit = request.args.get('limit', default=50, type=int)
    original_host_filter = tuple(request.args.getlist('original_host'))
    cnx = cnxpool.getconn()
    # Obtain the MinHash of the query.
    with cnx.cursor(cursor_factory=RealDictCursor) as cursor:
        _execute_get_column_sketches(cursor, (query_id,))
        query = cursor.fetchone()
    if query is None:
        # The query does not exist.
        cnxpool.putconn(cnx)
        abort(404)
    # Query the LSH server.
    try:
        resp = requests.post(lshserver_endpoint + "/query",
                             json={"seed": query["seed"], "minhash": query["minhash"]})
        resp.raise_for_status()
    except requests.exceptions.HTTPError as err:
        app.logger.error("Error in querying the LSH server: {}".format(err))
        cnxpool.putconn(cnx)
        abort(500)
    column_ids = [column_id for column_id in resp.json()
                  if column_id != str(query_id)]
    if len(column_ids) == 0:
        # Return empty result.
        cnxpool.putconn(cnx)
        return jsonify([])
    # Create the final query results.
    results = []
    query_minhash = LeanMinHash(seed=query["seed"], hashvalues=query["minhash"])
    # Obtain the column sketches of the results.
    with cnx.cursor(cursor_factory=RealDictCursor) as cursor:
        _execute_get_column_sketches(cursor, tuple(column_ids),
                                     original_hosts=original_host_filter)
        for column in cursor:
            # Skip columns from the query's own table.
            if column["package_file_id"] == query["package_file_id"]:
                continue
            # Compute the similarities for each column in the result.
            jaccard = query_minhash.jaccard(LeanMinHash(
                seed=column["seed"], hashvalues=column["minhash"]))
            containment = _containment(jaccard, column["distinct_count"],
                                       query["distinct_count"])
            column.pop("seed")
            column.pop("minhash")
            column["jaccard"] = jaccard
            column["containment"] = containment
            # Keep only the top-`limit` results, ordered by containment.
            if len(results) < limit:
                heapq.heappush(results, (containment, column["id"], dict(column)))
            else:
                heapq.heappushpop(results, (containment, column["id"], dict(column)))
    # Done with SQL.
    cnxpool.putconn(cnx)
    results = [column for _, _, column in heapq.nlargest(limit, results)]
    return jsonify(results)
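# _containment above is defined elsewhere in the application. A plausible
# sketch, derived from the standard identity |A ∩ B| = j * (|A| + |B|) / (1 + j)
# for Jaccard similarity j; the real helper may differ.
def _containment(jaccard, candidate_count, query_count):
    # Estimate the intersection size from the Jaccard estimate and the two
    # distinct counts, then normalize by the query's distinct count.
    if query_count == 0 or jaccard == 0.0:
        return 0.0
    intersection = jaccard * (candidate_count + query_count) / (1.0 + jaccard)
    return min(intersection / query_count, 1.0)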
def test_init(self):
    m1 = MinHash(4, 1, hashobj=FakeHash)
    m2 = MinHash(4, 1, hashobj=FakeHash)
    lm1 = LeanMinHash(m1)
    lm2 = LeanMinHash(m2)
    self.assertTrue(np.array_equal(lm1.hashvalues, lm2.hashvalues))
    self.assertTrue(np.array_equal(lm1.seed, lm2.seed))
def test_union(self):
    m1 = MinHash(4, 1, hashfunc=fake_hash_func)
    m2 = MinHash(4, 1, hashfunc=fake_hash_func)
    m2.update(12)
    lm1 = LeanMinHash(m1)
    lm2 = LeanMinHash(m2)
    u = LeanMinHash.union(lm1, lm2)
    self.assertTrue(u.jaccard(lm2) == 1.0)
def test_update(self):
    m1 = MinHash(4, 1, hashfunc=fake_hash_func)
    try:
        lm1 = LeanMinHash(m1)
        lm1.update(12)
    except TypeError:
        pass
    else:
        self.fail("LeanMinHash.update should raise TypeError")
def test_update(self):
    m1 = MinHash(4, 1, hashobj=FakeHash)
    try:
        lm1 = LeanMinHash(m1)
        lm1.update(12)
    except TypeError:
        pass
    else:
        self.fail("LeanMinHash.update should raise TypeError")
def test_count(self):
    m = MinHash(hashobj=FakeHash)
    m.update(11)
    m.update(123)
    m.update(92)
    m.update(98)
    m.update(123218)
    m.update(32)
    lm = LeanMinHash(m)
    c = lm.count()
    self.assertGreaterEqual(c, 0)
def test_count(self):
    m = MinHash(hashfunc=fake_hash_func)
    m.update(11)
    m.update(123)
    m.update(92)
    m.update(98)
    m.update(123218)
    m.update(32)
    lm = LeanMinHash(m)
    c = lm.count()
    self.assertGreaterEqual(c, 0)
def test_jaccard(self):
    m1 = MinHash(4, 1, hashobj=FakeHash)
    m2 = MinHash(4, 1, hashobj=FakeHash)
    lm1 = LeanMinHash(m1)
    lm2 = LeanMinHash(m2)
    self.assertTrue(lm1.jaccard(lm2) == 1.0)
    m2.update(12)
    lm2 = LeanMinHash(m2)
    self.assertTrue(lm1.jaccard(lm2) == 0.0)
    m1.update(13)
    lm1 = LeanMinHash(m1)
    self.assertTrue(lm1.jaccard(lm2) < 1.0)
def test_jaccard(self):
    m1 = MinHash(4, 1, hashfunc=fake_hash_func)
    m2 = MinHash(4, 1, hashfunc=fake_hash_func)
    lm1 = LeanMinHash(m1)
    lm2 = LeanMinHash(m2)
    self.assertTrue(lm1.jaccard(lm2) == 1.0)
    m2.update(12)
    lm2 = LeanMinHash(m2)
    self.assertTrue(lm1.jaccard(lm2) == 0.0)
    m1.update(13)
    lm1 = LeanMinHash(m1)
    self.assertTrue(lm1.jaccard(lm2) < 1.0)
def _train_LSH(self):
    # Create the LSH index.
    lsh = MinHashLSH(num_perm=128, params=(5, 7))
    # Populate the LSH index with the new domains.
    for dom in self.new_domains:
        # Remove the TLD.
        tld_info = get_tld('http://' + dom, as_object=True, fail_silently=True)
        try:
            d = tld_info.domain
        except AttributeError:  # fail_silently=True yields None on failure
            continue
        # Ignore very short new domains.
        if len(d) <= 3:
            continue
        # Build the bigram set.
        bigrams = {d[i:i + 2] for i in range(len(d) - 1)}
        minhash = MinHash(num_perm=128)
        for b in bigrams:
            minhash.update(b.encode('utf-8'))
        minhash_lean = LeanMinHash(minhash)
        lsh.insert(dom, minhash_lean)
    print('LSH trained!')
    return lsh
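# A hedged sketch of how the index built by _train_LSH might be queried for a
# candidate domain. The bigram construction mirrors the training loop, and
# query_similar_domains is an illustrative name, not part of the original code.
def query_similar_domains(lsh, domain):
    bigrams = {domain[i:i + 2] for i in range(len(domain) - 1)}
    minhash = MinHash(num_perm=128)
    for b in bigrams:
        minhash.update(b.encode('utf-8'))
    # Returns the keys of indexed domains whose bigram sets are likely
    # similar under the LSH parameters chosen at build time.
    return lsh.query(minhash)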
def make_hash(htid, wordset, **kwargs):
    '''Make a LeanMinHash and serialize it as bytes.'''
    m = MinHash(**kwargs)
    for wordid in wordset:
        m.update(str(wordid).encode('utf-8'))
    lm = LeanMinHash(m)
    return serialize_lm(htid, lm)
def make_hash(string, nperm=N_PERM):
    """Generate the MinHash for a document represented as a list of words.

    The MD5 hash is used for speed, and global_perm is defined globally so the
    permutation parameters are generated once instead of re-seeding the random
    number generator on every call."""
    mh = MinHash(num_perm=nperm, permutations=global_perm, hashobj=md5)
    for word in string:
        mh.update(word.encode(encoding='utf-8'))
    return LeanMinHash(mh)
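# global_perm above is assumed to be created once at module load by reusing
# the permutation state of a throwaway MinHash (the datasketch API accepts an
# existing MinHash's permutations for fast initialization). A sketch:
from hashlib import md5
from datasketch import MinHash

N_PERM = 128
# Seed the random number generator once and share the resulting permutation
# parameters across every subsequent MinHash.
global_perm = MinHash(num_perm=N_PERM).permutations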
def test_hash(self):
    m = MinHash(hashfunc=fake_hash_func)
    m.update(11)
    m.update(123)
    m.update(92)
    m.update(98)
    m.update(123218)
    m.update(32)
    lm1 = LeanMinHash(m)
    lm2 = LeanMinHash(m)
    self.assertEqual(hash(lm1), hash(lm2))
    m.update(444)
    lm3 = LeanMinHash(m)
    self.assertNotEqual(hash(lm1), hash(lm3))
    d = dict()
    d[lm1] = True
    self.assertTrue(d[lm2])
def test_eq(self):
    m1 = MinHash(4, 1, hashfunc=fake_hash_func)
    m2 = MinHash(4, 1, hashfunc=fake_hash_func)
    m3 = MinHash(4, 2, hashfunc=fake_hash_func)
    m4 = MinHash(8, 1, hashfunc=fake_hash_func)
    m5 = MinHash(4, 1, hashfunc=fake_hash_func)
    m1.update(11)
    m2.update(12)
    m3.update(11)
    m4.update(11)
    m5.update(11)
    lm1 = LeanMinHash(m1)
    lm2 = LeanMinHash(m2)
    lm3 = LeanMinHash(m3)
    lm4 = LeanMinHash(m4)
    lm5 = LeanMinHash(m5)
    self.assertNotEqual(lm1, lm2)
    self.assertNotEqual(lm1, lm3)
    self.assertNotEqual(lm1, lm4)
    self.assertEqual(lm1, lm5)
    m1.update(12)
    m2.update(11)
    lm1 = LeanMinHash(m1)
    lm2 = LeanMinHash(m2)
    self.assertEqual(lm1, lm2)
def test_pickle(self):
    m = MinHash(4, 1, hashfunc=fake_hash_func)
    m.update(123)
    m.update(45)
    lm = LeanMinHash(m)
    p = pickle.loads(pickle.dumps(lm))
    self.assertEqual(p.seed, lm.seed)
    self.assertTrue(np.array_equal(p.hashvalues, lm.hashvalues))
def deserialize_minhash(column):
    """Deserialize the MinHash binary file for the given column and return
    the LeanMinHash, serializing it first if the file does not exist yet.

    @param column: dict with 'table' and 'column' keys identifying the column
    @return: the deserialized LeanMinHash
    """
    file_path = (f'{os.environ["WORKING_DIRECTORY"]}/results/minhashes/'
                 f'{column["table"]}.{column["column"]}.txt')
    if not os.path.isfile(file_path):
        serialize_min_hash([column])
    with open(file_path, 'rb') as file:
        minhash = LeanMinHash.deserialize(bytearray(file.read()))
    return minhash
def _create_minhash_from_file(self, current_dir_path, filename):
    file_path = f'{current_dir_path}/{filename}'
    with open(file_path) as f:
        article = Article(**json.load(f))
    if not article.body:
        os.remove(file_path)
        return
    minhash = MinHash()
    for word in article.body.split(' '):
        minhash.update(word.encode('utf8'))
    lean_minhash = LeanMinHash(minhash)
    self.minhashes[file_path] = lean_minhash
    self.lsh.insert(file_path, lean_minhash)
def make_minhash_mapping(item, shingles: int, num_perm: int):
    doc_id, doc = item
    # Create a MinHash from the document's shingle set.
    shingles_set = set(ngrams(doc, shingles))
    m = MinHash(num_perm=num_perm)
    for s in shingles_set:
        s = ''.join(s).encode('utf8')
        m.update(s)
    # Convert to the immutable, memory-lean form.
    m = LeanMinHash(m)
    return doc_id, m, len(shingles_set)
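# A small driver for make_minhash_mapping, assuming ngrams is nltk.util.ngrams
# (any shingling helper with the same signature would do); docs is a
# hypothetical two-document corpus.
from nltk.util import ngrams

docs = {0: "the quick brown fox", 1: "the quick brown dog"}
mapped = [make_minhash_mapping(item, shingles=3, num_perm=128)
          for item in docs.items()]
(id_a, mh_a, _), (id_b, mh_b, _) = mapped
print(mh_a.jaccard(mh_b))  # estimated character-shingle similarity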
def serialize_min_hash(columns, override=False):
    """Write the MinHash of each given column to a local file.

    @param columns: list of dicts with 'table' and 'column' keys
    @param override: overwrite existing files when True
    @return: None
    """
    for column in columns:
        output_file = (f'{os.environ["WORKING_DIRECTORY"]}/results/minhashes/'
                       f'{column["table"]}.{column["column"]}.txt')
        if os.path.isfile(output_file) and not override:
            continue
        values = queryDatabase.get_distnct_column_values(column['table'], column)
        tokens = tokenize(values)
        minhash = MinHash(num_perm=NUM_PERM)
        for token in tokens:
            minhash.update(token.encode('utf8'))
        lean_min_hash = LeanMinHash(minhash)
        buf = bytearray(lean_min_hash.bytesize())
        lean_min_hash.serialize(buf)
        with open(output_file, 'wb') as file:
            file.write(buf)
        print(f'Serialization is complete for {column["table"]}.{column["column"]}.')
    return
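# A round-trip sketch pairing serialize_min_hash with the deserialize_minhash
# reader earlier in this section; the column dict is hypothetical, and the
# file path follows the same WORKING_DIRECTORY convention.
column = {'table': 'users', 'column': 'email'}
serialize_min_hash([column], override=True)
lean = deserialize_minhash(column)
print(lean.jaccard(lean))  # identical sketches estimate 1.0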
def process_file(file_path, tqdm_func, global_tqdm):
    reader = Reader()
    minhashes = []
    previous_file_position = 0
    for document, metadata in reader.read_jsonl(file_path, get_meta=True):
        # MinHash the document's set of 5-grams.
        n_grams = extract_ngrams(document, 5)
        five_gram_set = set(n_grams)
        minhash = MinHash(num_perm=10)
        for five_gram in five_gram_set:
            minhash.update(five_gram.encode('utf8'))
        minhashes.append(LeanMinHash(minhash))

        # Update the progress bar.
        current_file_position = reader.fh.tell()
        global_tqdm.update(current_file_position - previous_file_position)
        previous_file_position = current_file_position
    return file_path, minhashes
def test_deserialize(self):
    m1 = MinHash(10, 1, hashfunc=fake_hash_func)
    m1.update(123)
    lm1 = LeanMinHash(m1)
    buf = bytearray(lm1.bytesize())
    lm1.serialize(buf)

    # Test if we get back the exact same LeanMinHash objects after
    # deserializing from bytes
    lm1d = LeanMinHash.deserialize(buf)
    self.assertEqual(lm1d.seed, lm1.seed)
    self.assertEqual(len(lm1d.hashvalues), len(lm1.hashvalues))
    self.assertTrue(
        all(hvd == hv for hv, hvd in zip(lm1.hashvalues, lm1d.hashvalues)))
def _load_lib_profile(profile_path_n_mode_n_repackage):
    (profile_path, mode, repackage) = profile_path_n_mode_n_repackage
    analyzer = LibAnalyzer(profile_path)
    lib_name_version = "{}_{}".format(analyzer.lib_name, analyzer.lib_version)
    minhash_list = []
    relationship_graphs = None
    if len(analyzer.classes_names) >= SHRINK_MINIMUM_NUMBER:
        if mode == MODE.ACCURATE:
            relationship_graphs = analyzer.get_relationship_graphs(repackage)
        classes_signatures = analyzer.get_classes_signatures()
        signature_set = set()
        lib_class_num = 0
        for class_name in classes_signatures:
            if classes_signatures[class_name]:
                signature_set.update(classes_signatures[class_name])
                lib_class_num += 1
        for class_name in classes_signatures:
            class_signatures = classes_signatures[class_name]
            if class_signatures:
                m = MinHash(num_perm=LSH_PERM_NUM)
                for signature in class_signatures:
                    m.update(signature.encode('utf8'))
                lm = LeanMinHash(m)
                key = "{}|{}|{}|{}|{}|->{}".format(
                    lib_name_version, analyzer.root_package, lib_class_num,
                    len(signature_set), analyzer.category, class_name)
                minhash_list.append((key, lm, len(class_signatures)))
    return (lib_name_version, minhash_list, relationship_graphs)
def test_deserialize_byteorder(self):
    for byteorder in "@=<>!":
        m1 = MinHash(10, 1, hashobj=FakeHash)
        m1.update(123)
        lm1 = LeanMinHash(m1)
        buf = bytearray(lm1.bytesize(byteorder))
        lm1.serialize(buf, byteorder)

        # Test if we get back the exact same LeanMinHash objects after
        # deserializing from bytes
        lm1d = LeanMinHash.deserialize(buf, byteorder)
        lm1d.hashobj = FakeHash
        self.assertEqual(lm1d.seed, lm1.seed)
        self.assertEqual(len(lm1d.hashvalues), len(lm1.hashvalues))
        self.assertTrue(
            all(hvd == hv for hv, hvd in zip(lm1.hashvalues, lm1d.hashvalues)))
def tokenize_words(url, text):
    # Preprocessing step: normalize whitespace and convert to lowercase.
    words = text.replace(" ", " ").replace("\n", " ").lower().strip()
    # The tokenizing step.
    tokens = word_tokenize(words)
    # This is the list of valid tokens.
    ftokens = []
    mh = MinHash(num_perm=128)
    # Iterate through the full token list to filter out invalid tokens.
    for t in tokens:
        # Do not include the token if it is a stopword.
        if t in stopwords.words():
            continue
        # Do not include the token if there are no alphanumeric characters.
        if not re.match('[A-Za-z0-9]+', t):
            continue
        # Remove any non-alphanumeric characters from the token.
        t2 = re.sub('[^A-Za-z0-9]+', '', t).strip()
        # Skip the token if it is an empty string.
        if len(t2) <= 0:
            continue
        ftokens.append(t2)
        mh.update(t2.encode("utf8"))
    return ftokens, LeanMinHash(mh)
def __init__(self, hash_type=None, bits=None, hash_func=None, params=None):
    self.hash_type = hash_type
    self.hash_func = hash_func
    self.hash_bits = bits
    self.hashfunc = sha1_hash32
    if self.hash_bits in {32, "32", None}:
        if self.hash_func == "mmh3":
            self.hashfunc = mmh3.hash
        elif self.hash_func == "farmhash":
            self.hashfunc = farmhash.hash32
        elif self.hash_func == "xxhash":
            self.hashfunc = xxhash.xxh32
        else:  # "hash32", "default"
            self.hashfunc = sha1_hash32
    elif self.hash_bits in {64, "64"}:
        if self.hash_func == "mmh3":
            self.hashfunc = mmh3.hash64
        elif self.hash_func == "farmhash":
            self.hashfunc = farmhash.hash64
        elif self.hash_func == "xxhash":
            self.hashfunc = xxhash.xxh64
        else:
            self.hashfunc = sha1_hash64
    elif self.hash_bits in {128, "128"}:
        if self.hash_func == "mmh3":
            self.hashfunc = mmh3.hash128
        elif self.hash_func == "farmhash":
            self.hashfunc = farmhash.hash128
        else:
            raise ValueError("Unsupported hash function for this bit width")
    else:
        raise ValueError("Unsupported hash bit width")
    if not params:
        params = {}
    """
    To use Redis alone as the storage layer, configure:
    storage_config={
        'type': 'redis',
        'redis': {'host': 'localhost', 'port': 6379},
    }
    To insert a large number of MinHashes sequentially, use an insertion
    session; it reduces the number of network calls during bulk insertion.
    data_list = [("m1", m1), ("m2", m2), ("m3", m3)]
    with lsh.insertion_session() as session:
        for key, minhash in data_list:
            session.insert(key, minhash)
    Note that querying the LSH object while an insertion session is open may
    lead to inconsistency.

    MinHash LSH also supports a Cassandra cluster as the storage layer. Using
    long-term storage for your LSH covers all use cases where the application
    needs to update the LSH object continuously (e.g. when you use MinHashLSH
    to incrementally cluster documents). The Cassandra storage option can be
    configured as follows:
    storage_config={
        'type': 'cassandra',
        'cassandra': {
            'seeds': ['127.0.0.1'],
            'keyspace': 'lsh_test',
            'replication': {
                'class': 'SimpleStrategy',
                'replication_factor': '1',
            },
            'drop_keyspace': False,
            'drop_tables': False,
        }}
    The seeds parameter specifies the list of seed nodes used to connect to
    the Cassandra cluster. The keyspace and replication options specify the
    parameters used when creating the keyspace if it does not already exist.
    To force the creation of tables or the keyspace (dropping existing ones),
    set the drop_tables and drop_keyspace options to True.
    As with Redis, an insertion session is recommended to reduce the number of
    network calls during bulk insertion.

    +------------------- Connecting to an existing MinHash LSH -------------------+
    If your LSH uses an external storage layer (e.g. Redis), you can share it
    across multiple processes. There are two ways to do this.
    The recommended way is pickling: the MinHash LSH object is serializable,
    so you can call pickle:
    import pickle
    # Create your LSH object
    lsh = ...
    # Serialize the LSH
    data = pickle.dumps(lsh)
    # Now you can pass it as an argument to a forked process or simply save it
    # in an external storage.
    # In a different process, deserialize the LSH
    lsh = pickle.loads(data)
    With pickle, everything the LSH needs, such as all parameter settings, is
    kept in one place.
    Alternatively, you can specify a basename in the storage config when the
    LSH is first created. For example:
    # For Redis.
    lsh = MinHashLSH(
        threshold=0.5, num_perm=128, storage_config={
            'type': 'redis',
            'basename': b'unique_name_6ac4fg',
            'redis': {'host': 'localhost', 'port': 6379},
        }
    )
    # For Cassandra.
    lsh = MinHashLSH(
        threshold=0.5, num_perm=128, storage_config={
            'type': 'cassandra',
            'basename': b'unique_name',
            'cassandra': {
                'seeds': ['127.0.0.1'],
                'keyspace': 'lsh_test',
                'replication': {
                    'class': 'SimpleStrategy',
                    'replication_factor': '1',
                },
                'drop_keyspace': False,
                'drop_tables': False,
            }
        }
    )
    The basename is used to generate key prefixes that uniquely identify the
    data associated with this LSH in the storage layer, so a new LSH object
    created with the same basename will reuse the underlying data of the old
    LSH. If no basename is specified, MinHash LSH generates a random string as
    the basename, and collisions are extremely unlikely.
    See the documentation for details: http://ekzhu.com/datasketch/lsh.html
    """
    if self.hash_type in {"minhash", "MinHash"}:
        # Estimates the Jaccard similarity between sets of arbitrary size in
        # linear time, using a small fixed amount of memory.
        self.hash = MinHash(
            num_perm=params.get("num_perm", 128),  # int, optional; ignored if hashvalues is set. Number of random permutation functions, controls precision
            seed=params.get("seed", 1),  # random seed, optional
            hashfunc=self.hashfunc,  # optional; hash function applied to values passed to update(); must return an integer encodable in 32 bits
            hashobj=params.get("hashobj", None),  # deprecated; replaced by hashfunc
            hashvalues=params.get("hashvalues", None),  # optional, array or list
            permutations=params.get("permutations", None))  # optional; reuse another MinHash's existing state for fast initialization
    elif self.hash_type in {"weightedminhashlsh", "mhlsh", "WeightedMinHashLSH",
                            "wmhlsh", "MinHashLSH"}:
        # Weighted MinHash LSH; WeightedMinHashLSH() is equivalent to
        # MinHashLSH. Queries by weighted Jaccard similarity. Does not support
        # top-k queries, but MinHashLSHForest does.
        self.hash = MinHashLSH(
            threshold=params.get("threshold", 0.9),  # Jaccard similarity threshold
            num_perm=params.get("num_perm", 128),  # number of permutation functions; the sample size for weighted MinHash
            weights=params.get("weights", (0.5, 0.5)),  # tuple, optional; tunes precision around the Jaccard threshold
            params=params.get("params", None),  # tuple, optional; number and size of bands
            storage_config=params.get("storage_config", None),  # storage configuration
            prepickle=params.get("prepickle", None))  # pickle keys before storing
    elif self.hash_type in {"leanminhash", "lmh", "LeanMinHash", "LMH"}:
        # Lower-memory variant of MinHash.
        self.hash = LeanMinHash(minhash=params.get("minhash", None),
                                seed=params.get("seed", None),
                                hashvalues=params.get("hashvalues", None))
    elif self.hash_type in {"MinHashLSHForest", "minhashlshforest", "mhlshf", "MHLSHF"}:
        self.hash = MinHashLSHForest(num_perm=params.get("num_perm", 128),
                                     l=params.get("l", 8))
    elif self.hash_type in {"MinHashLSHEnsemble", "minhashlshensemble", "mhlshe", "MHLSHE"}:
        # MinHash LSH with a different similarity measure: containment.
        self.hash = MinHashLSHEnsemble(
            threshold=params.get("threshold", 0.9),
            num_perm=params.get("num_perm", 128),
            num_part=params.get("num_part", 16),
            # m=params.get("m", 8),
            weights=params.get("weights", (0.5, 0.5)),
            storage_config=params.get("storage_config", None),
            prepickle=params.get("prepickle", None))
    elif self.hash_type in {"HyperLogLog", "hyperloglog", "hll", "HLL"}:
        # HyperLogLog estimates the cardinality of a dataset (the number of
        # distinct values) in a single pass, using a small fixed amount of
        # memory. The parameter p plays the role num_perm plays for MinHash.
        self.hash = HyperLogLog(
            p=params.get("p", 8),  # controls precision
            reg=params.get("reg", None),
            hashfunc=params.get("hashfunc", sha1_hash32),  # internal hash algorithm
            hashobj=params.get("hashobj", None))  # deprecated; replaced by hashfunc
    elif self.hash_type in {"hyperloglogplusplus", "HyperLogLogPlusPlus",
                            "HyperLogLog++", "hyperloglog++", "HLLPP", "hllpp",
                            "HLL++", "hll++"}:
        # Same interface as HyperLogLog, but with a 64-bit hash algorithm.
        self.hash = HyperLogLogPlusPlus(
            p=params.get("p", 8),
            reg=params.get("reg", None),
            hashfunc=params.get("hashfunc", sha1_hash64),
            hashobj=params.get("hashobj", None))
    else:
        raise ValueError("Please choose a valid hash type")
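# A hedged usage sketch of the factory above. The enclosing class name
# (HashFactory here) is an assumption, since only __init__ is shown.
factory = HashFactory(hash_type="minhash", bits=32, hash_func="mmh3",
                      params={"num_perm": 128, "seed": 42})
for token in "minhash lsh sketch".split():
    factory.hash.update(token.encode("utf8"))
print(factory.hash.jaccard(factory.hash))  # 1.0 against itself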
def get_minhash(self, shingles, num_perm):
    mh = MinHash(num_perm=num_perm)
    for d in shingles:
        mh.update(d.encode('utf8'))
    return LeanMinHash(mh)
def test_serialize(self):
    m1 = MinHash(2, 1, hashfunc=fake_hash_func)
    lm1 = LeanMinHash(m1)
    buf = bytearray(lm1.bytesize())
    # Only test for syntax
    lm1.serialize(buf)

    m2 = MinHash(2, 1, hashfunc=fake_hash_func)
    lm2 = LeanMinHash(m2)
    size = lm1.bytesize()
    buf = bytearray(size * 2)
    lm1.serialize(buf)
    lm2.serialize(buf[size:])
def test_bytesize(self):
    m1 = MinHash(4, 1, hashfunc=fake_hash_func)
    lm1 = LeanMinHash(m1)
    # 4 hashvalues of 4 bytes each, plus 4 bytes for the length field and
    # 8 bytes for the seed.
    self.assertTrue(lm1.bytesize() == (4 * 4) + 4 + 8)
def test_is_empty(self):
    m = MinHash()
    lm = LeanMinHash(m)
    self.assertTrue(lm.is_empty())