def build_lsh(self, threshold=0.5):
    start = time.time()
    print('Building LSH...')
    lsh = MinHashLSH(threshold=threshold, num_perm=128)
    with lsh.insertion_session() as session:
        for i, entity in enumerate(self.entities):
            session.insert(i, self.minhash(entity.value))
    print('[{} s]'.format(time.time() - start))
    return lsh
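# A minimal, self-contained sketch of the insertion_session pattern used in
# build_lsh above, assuming only the datasketch library; the helper name and
# sample data are illustrative, not part of the original class.
from datasketch import MinHash, MinHashLSH

def _demo_minhash(text, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for token in text.split():
        m.update(token.encode('utf-8'))
    return m

_values = ["the quick brown fox jumps",
           "the quick brown fox leaps",
           "lorem ipsum dolor"]
_lsh = MinHashLSH(threshold=0.5, num_perm=128)
with _lsh.insertion_session() as session:
    for i, v in enumerate(_values):
        session.insert(i, _demo_minhash(v))
# Docs 0 and 1 share 4 of 6 tokens (Jaccard ~0.67 > 0.5), so both should be
# returned with high probability; doc 2 should not.
print(_lsh.query(_demo_minhash("the quick brown fox jumps")))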
def main() -> None:
    minhashes = []
    files = []
    for iterator in tqdm(range(config.COUNT_UNQ_MHS), desc="Generate minHashes:"):
        minhash = MinHash(num_perm=256)
        file = []
        for _ in range(200):
            rand_string = ''.join(
                random.choice(string.ascii_lowercase) for _ in range(5))
            file.append(rand_string)
        files.append(file)
        minhash.update_batch([s.encode('utf-8') for s in file])
        minhashes.append(("key" + str(iterator), minhash))

    lsh = MinHashLSH(threshold=0.5, num_perm=256, storage_config={
        'type': 'cassandra',
        'basename': b'perftest',
        'cassandra': {
            'seeds': ['127.0.0.1'],
            'keyspace': config.KEY_SPACE,
            'replication': {
                'class': 'SimpleStrategy',
                'replication_factor': '1',
            },
            'drop_keyspace': False,
            'drop_tables': False,
        }
    })

    # Single-pass loop kept only for the tqdm progress label.
    for _ in tqdm(range(1), desc="Insert 100 minHashes:"):
        with lsh.insertion_session(buffer_size=100) as session:
            for key, minhash in minhashes:
                session.insert(key, minhash)

    f_disc_mhs = open('minhashes.txt', 'w+')
    for minhash in tqdm(minhashes, desc="Log minHashes:"):
        log(f_disc_mhs, minhash[0], minhash[1].digest())
    f_disc_mhs.close()

    f_disc_files = open('files.txt', 'w+')
    for iterator in tqdm(range(len(files)), desc="Log files:"):
        log(f_disc_files, minhashes[iterator][0], files[iterator])
    f_disc_files.close()  # was f_disc_mhs.close(), which closed the wrong handle
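# Hedged follow-up sketch: re-attaching to the Cassandra-backed index built in
# main() from a separate process. Reusing the same basename, num_perm, and
# threshold is assumed to reconnect to the existing tables rather than start
# fresh (hence drop_keyspace/drop_tables stay False). The function name is
# illustrative; config mirrors the module above.
from datasketch import MinHash, MinHashLSH

def query_existing_index(query_strings):
    lsh = MinHashLSH(threshold=0.5, num_perm=256, storage_config={
        'type': 'cassandra',
        'basename': b'perftest',
        'cassandra': {
            'seeds': ['127.0.0.1'],
            'keyspace': config.KEY_SPACE,
            'replication': {'class': 'SimpleStrategy',
                            'replication_factor': '1'},
            'drop_keyspace': False,
            'drop_tables': False,
        },
    })
    m = MinHash(num_perm=256)
    m.update_batch([s.encode('utf-8') for s in query_strings])
    return lsh.query(m)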
def similarity_threshold_bulk(self, df_library, df_query,
                              only_positive=False, return_df=False):
    """
    Takes a dataframe of 'library' strings to query against and a dataframe
    of query strings, and gives each row a unique ID. Transforms both the
    library and the query strings into MinHash objects.

    If return_df is True, df_query is returned with a column showing how
    many similar utterances were found in df_library.

    TODO: maybe use redis in production
    """
    from datasketch import MinHashLSH

    lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm)
    data_library = self.dataframe_to_data_list(df_library, 'lib_')
    data_query = self.dataframe_to_data_list(df_query, 'query_')

    # Use an insertion session to build an LSH index over all the library
    # data, which can then be queried.
    with lsh.insertion_session() as session:
        for key, minhash in data_library:
            session.insert(key, minhash)

    # Bulk-query the data_query objects against the index.
    query_results = []
    df_query['no_similar'] = 0
    for key, minhash in data_query:
        query_result = lsh.query(minhash)
        query_result_length = len(query_result)
        if return_df:
            df_query.loc[key, 'no_similar'] = query_result_length
        elif only_positive:
            # only_positive only matters when not returning a dataframe
            if query_result_length > 0:
                query_results.append((key, query_result, query_result_length))
        else:
            query_results.append((key, query_result, query_result_length))

    if return_df:
        return df_query
    return query_results
def lsh_clustering(
    signatures: List[np.ndarray],
    threshold: float = 0.5,
    num_perm: int = 128,
):
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    with lsh.insertion_session() as session:
        for key, minhash in enumerate(signatures):
            session.insert(f"id-{key}",
                           MinHash(num_perm=num_perm, hashvalues=minhash))

    neighbors: List[List[int]] = []
    for key, minhash in enumerate(signatures):
        result = lsh.query(MinHash(num_perm=num_perm, hashvalues=minhash))
        neighbors.append([int(x.split("-")[1]) for x in result])
    return neighbors
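# Illustrative call to lsh_clustering above: the signatures are raw MinHash
# hashvalues arrays; the helper name and token sets are made up for the
# example, not part of the original module.
from datasketch import MinHash

def _demo_signature(tokens, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for t in tokens:
        m.update(t.encode('utf-8'))
    return m.hashvalues

_sigs = [_demo_signature(["a", "b", "c", "d"]),
         _demo_signature(["a", "b", "c", "e"]),
         _demo_signature(["x", "y", "z", "w"])]
# Docs 0 and 1 share 3 of 5 tokens (Jaccard 0.6 > 0.5), so each is likely to
# list the other; every doc also matches itself.
print(lsh_clustering(_sigs))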
def mass_values_jaccard(cols1: List[Column], cols2: List[Column]):
    # Index cols1 in a Redis-backed LSH, then query each column of cols2
    # against it. Assumes each `col.values` holds a precomputed MinHash.
    lsh = MinHashLSH(
        threshold=0.2,
        num_perm=128,
        storage_config={
            "type": "redis",
            "redis": {
                "host": "localhost",
                "port": 6379
            }
        },
    )
    with lsh.insertion_session() as session:
        for idx, col in enumerate(cols1):
            session.insert(str(idx), col.values)
    # The original called lsh.query() with no argument and never used cols2;
    # query() requires a MinHash, so query once per column in cols2.
    return [lsh.query(col.values) for col in cols2]
def init_lshs(directory, type, threshold):
    """Initialize and calculate LSH for the document database

    Args:
        directory (str): the directory with source files
        type (str): type of ngrams to use ('char', 'word')
        threshold (float): Jaccard threshold value

    Returns:
        lsh: datasketch object
    """
    # Create a MinHashLSH index using Redis as the storage layer
    lsh = MinHashLSH(threshold=threshold, num_perm=128,
                     storage_config={'type': 'redis',
                                     'redis': {'host': 'localhost',
                                               'port': 6379,
                                               'db': 1},
                                     'name': 1})
    data_list = []
    for f in os.listdir(directory):
        minhash = MinHash(num_perm=128)
        if type == 'char':
            filename, text = utils.read_file(os.path.join(directory, f))
            print(filename)
            for d in nltk.ngrams(text, 3):
                minhash.update("".join(d).encode('utf-8'))
        elif type == 'word':
            filename, text = utils.tokenize_file(os.path.join(directory, f))
            print(filename)
            for d in nltk.ngrams(text, 3):
                minhash.update(" ".join(d).encode('utf-8'))
        data_list.append((filename, minhash))

    with lsh.insertion_session() as session:
        for key, minhash in data_list:
            session.insert(key, minhash)

    return lsh
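# Hedged usage sketch for init_lshs: query the Redis-backed index for
# documents similar to a new file. The directory and file names are
# illustrative, and the query-building code is an assumption modeled on the
# 'char' branch above.
lsh = init_lshs('corpus/', 'char', threshold=0.6)
query_mh = MinHash(num_perm=128)
_, text = utils.read_file('suspect.txt')
for d in nltk.ngrams(text, 3):
    query_mh.update("".join(d).encode('utf-8'))
print(lsh.query(query_mh))  # filenames of candidate near-duplicates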
def main(SAMPLE_SIZE, output_type):
    tweet_data = {}
    conn = connect_to_database()
    executor = conn.cursor()
    executor.execute("SELECT * FROM tweets")
    tweet_query = executor.fetchall()

    print("Getting tweets...")
    tweets = get_tweets(tweet_query, SAMPLE_SIZE)
    pool = cf.ProcessPoolExecutor()
    tweet_results = [pool.submit(get_words, tweet) for tweet in tweets]
    for tweet in cf.as_completed(tweet_results):
        tweet_data[tweet.result()["nameid"]] = tweet.result()
    pool.shutdown()

    word_counts = list(map(lambda d: d["word_counts"], tweet_data.values()))
    packages = []
    # Note: this slicing drops the tail when SAMPLE_SIZE is not divisible by 8.
    for i in range(8):
        packages.append(word_counts[i * (SAMPLE_SIZE // 8):(i + 1) * (SAMPLE_SIZE // 8)])
    package_pool = cf.ProcessPoolExecutor(max_workers=8)
    package_results = [
        package_pool.submit(sum, counts, collections.Counter())
        for counts in packages
    ]
    word_sums = [f.result() for f in cf.as_completed(package_results)]
    package_pool.shutdown()
    all_words_seen = sum(word_sums, collections.Counter())
    words_to_remove = build_people_and_find_words(tweets, all_words_seen)

    print("Removing extraneous...")
    tweets_to_remove = []
    for tweet in tqdm(tweet_data.values(), desc="tweets"):
        for word in words_to_remove:
            if word in tweet["word_counts"]:
                del tweet["word_counts"][word]
        tweet["square_sum"] = math.sqrt(
            sum(map((lambda x: x**2), tweet["word_counts"].values())))
        if tweet["square_sum"] == 0:
            tweets_to_remove.append(tweet["nameid"])
    for nameid in tweets_to_remove:
        del tweet_data[nameid]
    tweets = tweet_data.values()

    print("Preliminary pairing...")
    prelim_data = list(
        map(lambda d: (d["nameid"], set_to_minhash(d["word_counts"])), tweets))
    prelim_similarities = MinHashLSH(threshold=LSH_LENIENCY, num_perm=128)  # .6
    with prelim_similarities.insertion_session() as session:
        for (key, minhash) in prelim_data:
            session.insert(key, minhash)

    pairs_to_check = {}
    for tweet in tqdm(tweets):
        pairs = [
            match for match in prelim_similarities.query(tweet["minHash"])
            if match != tweet["nameid"]
        ]
        if len(pairs) > 0:
            pairs_to_check[tweet["nameid"]] = pairs
            for pair in pairs:
                if pair not in pairs_to_check:
                    pairs_to_check[pair] = []
    tweets_to_remove = []
    for tweet in tweet_data:
        if tweet not in pairs_to_check:
            tweets_to_remove.append(tweet)
    for tweet in tweets_to_remove:
        del tweet_data[tweet]

    print("Sanity Checks...")
    people = list(tweet_data.keys())
    p1 = people[0]
    p2 = people[0]  # first check: distance of p1 to itself
    print(cos_dist(tweet_data[p1], tweet_data[p2]))
    p1_name = tweet_data[p1]["user"]["name"]
    for (nameid, tweet) in tweet_data.items():
        if tweet["user"]["name"] == p1_name and tweet["nameid"] != p1:
            print("found other tweet")
            p2 = nameid
            break
    print(cos_dist(tweet_data[p1], tweet_data[p2]))
    for (nameid, tweet) in tweet_data.items():
        if tweet["user"]["name"] != p1_name:
            print("found separate tweet")
            p2 = nameid
            break
    print(cos_dist(tweet_data[p1], tweet_data[p2]))

    print("Pairing...")
    distance_pool = cf.ProcessPoolExecutor(max_workers=8)
    future_results = []
    similarities = {}
    for (person, potentials) in tqdm(pairs_to_check.items(), desc="prep"):
        if person in tweet_data:
            tweet_data[person]["processed"] = True
            similarities[person] = {}
            for relation in potentials:
                if not tweet_data.get(relation, {"processed": True})["processed"]:
                    future_results.append(
                        distance_pool.submit(cos_dist, tweet_data[person],
                                             tweet_data[relation]))
    for comparison in tqdm(cf.as_completed(future_results), desc="futures"):
        result = comparison.result()
        similarities[result[0]][result[1]] = result[2]
    distance_pool.shutdown()

    # Snapshot items() before iterating: the loop adds new keys, and mutating
    # a dict while iterating it raises a RuntimeError in Python 3.
    for (person, comparisons) in list(similarities.items()):
        for (relation, weight) in comparisons.items():
            if relation not in similarities:
                similarities[relation] = {}
            if person not in similarities[relation]:
                similarities[relation][person] = weight

    print("Outputting...")
    if output_type == "csv":
        similarity_frame = pd.DataFrame(similarities)
        similarity_frame.to_csv("./similarity_matrix.csv", na_rep=1)
    elif output_type == "json":
        output_to_json("./writeTest.json", similarities, tweet_data)
    elif output_type == "csv+json":
        similarity_frame = pd.DataFrame(similarities)
        similarity_frame.to_csv("./similarity_matrix.csv", na_rep=1)
        print("Outputted to csv")
        output_to_json("./writeTest.json", similarities, tweet_data)
    elif output_type == "none":
        print("Did not write data.")
    print("Completed.")
def insertion_session_syncredis(lsh: MinHashLSH, data: list, buffer_size: int):
    # Buffered bulk insert; check_duplication=False skips per-key existence
    # checks, so callers must guarantee unique keys.
    with lsh.insertion_session(buffer_size=buffer_size) as session:
        for key, minhash in data:
            session.insert(key, minhash, check_duplication=False)
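# Sketch of driving insertion_session_syncredis with a Redis-backed index;
# the host/port and sample data are illustrative assumptions.
from datasketch import MinHash, MinHashLSH

lsh = MinHashLSH(threshold=0.5, num_perm=128,
                 storage_config={'type': 'redis',
                                 'redis': {'host': 'localhost', 'port': 6379}})
data = []
for i in range(1000):
    m = MinHash(num_perm=128)
    m.update(str(i).encode('utf-8'))
    data.append((f'doc-{i}', m))
# Larger buffers mean fewer round-trips to Redis; keys here are unique, as
# the helper's check_duplication=False requires.
insertion_session_syncredis(lsh, data, buffer_size=500)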
def main(corpus: str, mode: str, lsh_file: str, minhash_file: str,
         num_perm: int, shingles: int, threshold: float, n_jobs: int,
         output_dir: str):
    if mode != 'query':
        assert not lsh_file
    if mode == 'minhash-only':
        assert not minhash_file

    print("Making output dir:", output_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir()

    if minhash_file:
        print('Loading MinHashes from disk:', minhash_file)
        start = time.time()
        with open(minhash_file, 'rb') as f:
            cached_minhashes = pickle.load(f)
        print("Done loading MinHashes, time elapsed (sec):", time.time() - start)
        corpus_len = len(cached_minhashes)
        minhash_iter = starmap(lambda k, v: (k, v, None), cached_minhashes.items())
        minhashes = None  # Set to None to disable saving minhashes again
    else:
        if corpus == 'webtext':
            corpus_len = 8_282_020
            corpus_iter = make_corpus_iter(DATA_DIR / 'webtext_detokenized')
        elif corpus == 'openwebtext':
            corpus_len = 8_013_769
            corpus_iter = make_corpus_iter(DATA_DIR / 'openwebtext_shards')
        else:
            raise RuntimeError
        print("Using", n_jobs, "processes for MinHashing")
        minhashes = {}
        minhash_iter = parallel_create_minhashes(corpus_iter,
                                                 shingles=shingles,
                                                 num_perm=num_perm,
                                                 n_jobs=n_jobs)

    print("Starting...")
    if mode == 'lsh' or mode == 'lsh-ensemble':
        if mode == 'lsh':
            lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
            with lsh.insertion_session() as session:
                for key, minhash, size in tqdm(minhash_iter, total=corpus_len,
                                               desc='Making MinHashLSH'):
                    # `is not None` rather than truthiness: a fresh, empty
                    # dict is falsy, so `if minhashes:` would never populate it.
                    if minhashes is not None:
                        minhashes[key] = minhash
                    session.insert(key, minhash,
                                   check_duplication=False)  # All keys are unique doc ids
        else:
            assert mode == 'lsh-ensemble'
            lsh = MinHashLSHEnsemble(threshold=threshold, num_perm=num_perm,
                                     num_part=16)  # TODO: try 32
            lsh.index(
                tqdm(minhash_iter, total=corpus_len,
                     desc='Making MinHashLSHEnsemble'))

        # Save LSH
        print("Saving LSH...")
        start = time.time()
        with open(output_dir / 'lsh.pkl', 'wb') as f:
            pickle.dump(lsh, f)
        print("Done saving LSH, time elapsed (sec):", time.time() - start)
    elif mode == 'query':
        print('Loading LSH:', lsh_file)
        start = time.time()
        with open(lsh_file, 'rb') as f:
            lsh = pickle.load(f)
            assert isinstance(lsh, MinHashLSH) and lsh.h == num_perm
        print("Done loading LSH, time elapsed (sec):", time.time() - start)

        duplicates_file = output_dir / 'duplicates.jsonl'
        print("Writing duplicates to", duplicates_file)
        with open(duplicates_file, 'a') as f:
            for key, minhash, size in tqdm(minhash_iter, total=corpus_len,
                                           desc='Querying MinHashLSH'):
                if minhashes is not None:
                    minhashes[key] = minhash
                duplicates = lsh.query(minhash)
                if duplicates:
                    json.dump({key: duplicates}, f)
                    f.write('\n')
    elif mode == 'minhash-only':
        assert minhashes is not None
        for key, minhash, size in tqdm(minhash_iter, total=corpus_len,
                                       desc='MinHashing'):
            minhashes[key] = minhash
    else:
        raise RuntimeError

    # Save MinHashes
    if not minhash_file:
        print("Saving MinHashes...")
        start = time.time()
        with open(output_dir / 'minhashes.pkl', 'wb') as f:
            pickle.dump(minhashes, f)
        print("Done saving MinHashes, time elapsed (sec):", time.time() - start)
class StreamData:
    """ Stream in data sequentially. Uses pandas dataframes for intra-batch
    deduping and optional lsh_hash for historical deduping (sublinear
    complexity) """

    def __init__(self, filename, chunk=250, min_len=25,
                 clean_fnc=clean_string, lsh_hash=True, use_column=None):
        self.__dict__.update(locals())
        self.generator = pd.read_csv(filename, chunksize=chunk)
        self.n_processed = 0
        if self.lsh_hash is True:
            self.lsh_hash = MinHashLSH(threshold=0.995, num_perm=128)

    def __call__(self):
        """ Get a batch from the generator """
        return self._process(self.stream())

    def stream(self):
        """ Iterate generator """
        return next(self.generator)

    def _init_data(self, num_chunks):
        """ Generate a bunch of data to serve as initialization """
        return pd.concat([self.__call__() for _ in range(num_chunks)])

    def _process(self, batch):
        """ If use_column is specified, use it to make a new column of
        processed text data and remove rows whose processed text is shorter
        than min_len words. From the resulting dataframe, remove duplicates. """
        if self.use_column is not None:
            batch = batch.assign(processed=self._clean(batch[self.use_column]))
            batch = batch[[
                len(s.split()) > self.min_len for s in batch.processed
            ]]
        deduped = self._dedupe(batch)
        self.n_processed += len(deduped)
        return deduped

    def _clean(self, batch):
        """ Clean data using some function """
        if self.clean_fnc is not None:
            return [self.clean_fnc(sent) for sent in batch]
        return batch

    def _dedupe(self, dataframe):
        """ Delete duplicates of a dataframe. If use_column is specified,
        operate on the processed text. After deduping within the dataframe,
        if lsh_hash is enabled, also check that rows have not already been
        seen before. """
        if self.use_column is not None:
            deduped = dataframe.drop_duplicates(subset=['processed'])
            if isinstance(self.lsh_hash, MinHashLSH):
                # Pass the intra-batch deduped frame to the historical check
                # (the original passed the raw `dataframe`, silently
                # discarding the drop_duplicates result).
                deduped = self._hash(deduped)
        else:
            deduped = dataframe.drop_duplicates()
        deduped.index = range(self.n_processed, self.n_processed + len(deduped))
        return deduped

    def _hash(self, dataframe):
        """ Process dataframe to delete duplicates based on Jaccard
        similarity, then update the hash """
        # Convert current batch to hash table
        hash_batch = self._batch_to_hash(dataframe.processed, dataframe.index)
        # Greedy, locality sensitive query to see if each row is a duplicate
        kept_hashes, kept_idx = self._query_hash(hash_batch)
        # Keep only non-duplicates
        dataframe = dataframe[kept_idx]
        # Get new indexes for updates to keep things consistent in lsh dict
        indexes = range(self.n_processed, self.n_processed + len(dataframe))
        # Realign the kept hash update keys
        updates = [(i, h[1]) for i, h in zip(indexes, kept_hashes)]
        # Update the hash table
        self._update_hash(updates)
        return dataframe

    def _update_hash(self, hash_batch):
        """ After processing a batch, update lsh_hash with new entries """
        with self.lsh_hash.insertion_session() as session:
            for idx, hasher in hash_batch:
                session.insert(idx, hasher)

    def _query_hash(self, hash_batch):
        """ Query lsh_hash and ignore entries that have already been seen """
        keep_hashes, keep_idx = [], []
        for hasher in hash_batch:
            if not self.lsh_hash.query(hasher[1]):
                keep_hashes.append(hasher)
                keep_idx.append(True)
            else:
                keep_idx.append(False)
        return keep_hashes, keep_idx

    def _batch_to_hash(self, batch, indexes):
        """ Convert a list of strings to a list of tuples (index, hash object) """
        return [(idx, self._str_to_hash(string))
                for idx, string in zip(indexes, batch)]

    def _str_to_hash(self, string):
        """ Convert string to locality sensitive min-hash """
        data = set(string.split())
        hasher = MinHash(num_perm=128)
        for d in data:
            hasher.update(d.encode('utf-8'))
        return hasher
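# Hedged usage sketch for StreamData: stream a CSV in chunks and dedupe both
# within each batch and against everything seen so far via the LSH index.
# The filename and column name are illustrative assumptions.
stream = StreamData('comments.csv', chunk=250, use_column='text')
batch = stream()             # one cleaned, deduped chunk as a DataFrame
seed = stream._init_data(4)  # concatenate four deduped chunks for initialization
print(len(batch), stream.n_processed)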
print(f'{time.time() - t1} secs was taken to initiate\n')
print("Starting minhash + shingle creation....")
with mp.Pool() as p:
    MAX_COUNT = len(k)
    for res in tqdm(p.imap(minhash_operation, params), total=MAX_COUNT):
        pass
print("Completed creating minhash\nIndexing documents complete")

t2 = time.time()
lsh = MinHashLSH(threshold=0.50, num_perm=NUM_PERMUTATION, weights=(0.5, 0.5))
with lsh.insertion_session() as session:
    for key in tqdm(Dict.keys(), desc="LSH processing"):
        session.insert(key=key, minhash=Dict[key])

# Use a few of the indexed documents as queries.
query = ['/OneDoc/120.txt', '/OneDoc/123.txt', '/OneDoc/117.txt']
print(query)
query = [Dict[i] for i in query]
print(f"{time.time() - t2} secs was taken to create LSH")

print("\nfinding candidate pairs.....")
res = similarities(query, lsh)
pprint(res)
with open('result.csv', 'w') as f:
    for key in res.keys():
        f.write("%s,%s\n" % (key, res[key]))
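# The `similarities` helper called above is defined elsewhere; this is a
# plausible stand-in (an assumption, not the original code): query each
# MinHash against the index and collect the candidate keys per query.
def similarities(query_minhashes, lsh):
    return {i: lsh.query(m) for i, m in enumerate(query_minhashes)}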
def main(output_type):  # SAMPLE_SIZE, output_type):
    print("Getting tweets...")
    tweets = get_tweets()  # SAMPLE_SIZE)
    SAMPLE_SIZE = len(tweets)
    pool = cf.ProcessPoolExecutor()
    tweet_results = [
        pool.submit(get_words, name, tweet)
        for (name, tweet) in tweets.items()
    ]
    tweet_data = {
        tweet.result()["nameid"]: tweet.result()
        for tweet in cf.as_completed(tweet_results)
    }
    del tweets

    print("Summing Counts...")
    word_counts = list(map(lambda d: d["word_counts"], tweet_data.values()))
    packages = []
    # Note: this slicing drops the tail when SAMPLE_SIZE is not divisible by 8.
    for i in tqdm(range(8), desc="breaking_up"):
        packages.append(word_counts[i * (SAMPLE_SIZE // 8):(i + 1) * (SAMPLE_SIZE // 8)])
    package_pool = cf.ProcessPoolExecutor(max_workers=8)
    package_results = [
        package_pool.submit(sum, counts, collections.Counter())
        for counts in packages
    ]
    word_sums = [f.result() for f in cf.as_completed(package_results)]
    all_words_seen = sum(word_sums, collections.Counter())
    words_to_remove = build_people_and_find_words(SAMPLE_SIZE, all_words_seen)
    for word in words_to_remove:
        del all_words_seen[word]

    print("Removing extraneous and square summing...")
    tweets_to_remove = []
    for tweet in tqdm(tweet_data.values(), desc="tweet loop"):
        for word in tqdm(words_to_remove, desc="word loop"):
            if word in tweet["word_counts"]:
                del tweet["word_counts"][word]
        if len(tweet["word_counts"].keys()) == 0:
            tweets_to_remove.append(tweet["nameid"])
    # removal_pool = cf.ProcessPoolExecutor(max_workers=8)
    # removal_results = [removal_pool.submit(remove_extraneous, tweet, words_to_remove)
    #                    for tweet in tweet_data.values()]
    # tweet_data = {}
    # for finished in cf.as_completed(removal_results):
    #     result = finished.result()
    #     if result[0]:
    #         tweet_data[result[1]] = result[2]
    for nameid in tweets_to_remove:
        del tweet_data[nameid]

    # sample_size = len(tweet_data)
    # idf = sum(map(lambda d: collections.Counter(d["word_counts"].keys()),
    #               tweet_data.values()), collections.Counter())
    # for key in idf:
    #     idf[key] = sample_size / idf[key]
    for tweet in tqdm(tweet_data.values(), desc="square sum"):
        # for word in tweet["word_counts"]:
        #     tweet["word_counts"][word] = tweet["word_counts"][word] * idf[word]
        tweet["square_sum"] = math.sqrt(
            sum(map((lambda x: x**2), tweet["word_counts"].values())))

    print("Preliminary pairing...")
    prelim_data = list(
        map(lambda d: (d["nameid"], set_to_minhash(d["word_counts"])),
            tweet_data.values()))
    prelim_similarities = MinHashLSH(threshold=0.3, num_perm=128)
    with prelim_similarities.insertion_session() as session:
        for (key, minhash) in prelim_data:
            session.insert(key, minhash)

    pairs_to_check = {}
    for tweet in tqdm(tweet_data.values()):
        pairs = [
            match for match in prelim_similarities.query(tweet["minHash"])
            if match != tweet["nameid"]
        ]
        if len(pairs) > 0:
            pairs_to_check[tweet["nameid"]] = pairs
            for pair in pairs:
                if pair not in pairs_to_check:
                    pairs_to_check[pair] = []

    print("Pairing...")
    distance_pool = cf.ProcessPoolExecutor(max_workers=8)
    future_results = []
    similarities = {}
    # for person in tweet_data:
    #     similarities[person] = {}
    #     for relation in tweet_data:
    #         if person != relation:
    #             future_results.append(distance_pool.submit(cos_dist, tweet_data[person],
    #                                                        tweet_data[relation]))
    for (person, potentials) in tqdm(pairs_to_check.items(), desc="person"):
        if person in tweet_data:
            tweet_data[person]["processed"] = True
            similarities[person] = {}
            for relation in tqdm(potentials, desc="submitting"):
                if not tweet_data.get(relation, {"processed": True})["processed"]:
                    future_results.append(
                        distance_pool.submit(cos_dist, tweet_data[person],
                                             tweet_data[relation]))
    for comparison in cf.as_completed(future_results):
        result = comparison.result()
        similarities[result[0]][result[1]] = result[2]
    # for (person, comparisons) in similarities.items():
    #     for (relation, weight) in comparisons.items():
    #         if relation not in similarities:
    #             similarities[relation] = {}
    #         if person not in similarities[relation]:
    #             similarities[relation][person] = weight

    print("Outputting...")
    if output_type == "csv":
        similarity_frame = pd.DataFrame(similarities)
        similarity_frame.to_csv("./similarity_matrix.csv", na_rep=1)
    elif output_type == "json":
        output_to_json("./writeTest.json", similarities, tweet_data)
    elif output_type == "csv+json":
        similarity_frame = pd.DataFrame(similarities)
        similarity_frame.to_csv("./similarity_matrix.csv", na_rep=1)
        output_to_json("./writeTest.json", similarities, tweet_data)
    elif output_type == "none":
        print("Did not write data.")
    print("Completed.")