def main():
    """Deduplicate ``./all.csv`` by its ``text`` column.

    Each text is normalised and MinHashed. A row is kept only when the LSH
    index holds no earlier kept row that collides with it; otherwise the row
    and its matches are printed. The kept rows (all original columns) are
    written to ``./all_deduplicated.csv``.

    The previous version called ``all_csv.drop(duplicates)`` without using
    the returned frame (and with the indices of the *kept* originals), so the
    file it wrote still contained every duplicate.
    """
    print('Загружаем корпус')
    all_csv = pd.read_csv("./all.csv", encoding="utf-8")
    raw_corpus = all_csv["text"]

    print('Приводим его к стандартному виду')
    normalized_copus: List[List[str]] = [
        normalize(proverb) for proverb in raw_corpus
    ]

    print('Составляем индекс для поиска дублей')
    lsh = MinHashLSH(num_perm=HASH_PERMUTATIONS_COUNT)

    # Row labels of the first occurrence of every distinct text.
    kept_rows = []
    for i, words in enumerate(normalized_copus):
        words_hash = to_minhash(words)
        duplicates = lsh.query(words_hash)
        if duplicates:
            # Row i repeats something already indexed: report it, do not keep.
            print(f'Найдены совпадения для ({i}): {raw_corpus[i]}')
            for idx in duplicates:
                print(f'\t{idx:>5d}. {raw_corpus[idx]}')
        else:
            lsh.insert(i, words_hash)
            kept_rows.append(i)

    print('Удалено дублей:', len(raw_corpus) - len(kept_rows))
    print(
        f'Сохраняем дедуплицированный корпус ({len(kept_rows)} рецензий)'
    )
    # read_csv without index_col yields a RangeIndex, so the enumerate()
    # positions used above are also valid row labels.
    all_csv.loc[kept_rows].to_csv(
        "./all_deduplicated.csv", encoding="utf-8", index=False)
def find_duplicates(minhashes, threshold, permutations, name_hashes):
    """Print, for every minhash, the other minhashes that look similar.

    Documents are numbered from 1 in the order given. For each document that
    has at least one match, a line ``<number>\\t<match> <match> ...`` is
    printed.

    Arguments:
    - minhashes: a list of minhashes
    - threshold: the Jaccard threshold passed to the LSH index
    - permutations: the number of permutations; must equal the value the
                    minhash objects were built with
    - name_hashes: list of document hashes (or any other ID). When it is not
                   empty, matches between entries that carry the same ID are
                   dropped from the report.
    """
    index = MinHashLSH(threshold=threshold, num_perm=permutations)
    for number, minhash in enumerate(minhashes, start=1):
        index.insert(str(number), minhash, check_duplication=False)

    for number, minhash in enumerate(minhashes, start=1):
        matches = index.query(minhash)
        # Every entry is returned for its own signature; drop that hit.
        matches.remove(str(number))
        if name_hashes:
            # Ignore matches that come from the same document.
            matches = [
                other for other in matches
                if name_hashes[number - 1] != name_hashes[int(other) - 1]
            ]
        if matches:
            print('{}\t{}'.format(number, ' '.join(matches)))
def get_topn_similarity_documents_lsh(keywords, n=3):
    """Return up to *n* English documents whose keywords best match *keywords*.

    All documents with ``lang == 'english'`` are read from ``docs_col``,
    their comma-separated ``keyword`` strings are MinHashed and put into an
    LSH index (threshold 0.1). The query keywords are hashed the same way;
    the candidates returned by the index are ranked by estimated Jaccard
    similarity to the query, and the best *n* are fetched again from
    ``docs_col`` with their ``_id`` field removed.

    Args:
        keywords: comma-separated keyword string.
        n: maximum number of documents to return; ``None`` returns every
           candidate. (This parameter was previously ignored.)

    Returns:
        list of document dicts, best match first. The list is also printed.
    """
    num_perm = 128
    lsh = MinHashLSH(threshold=0.1, num_perm=num_perm)
    documents_en = docs_col.find({"lang": 'english'})
    documents_min = [
        lsh_json(str(item["_id"]), item["keyword"]) for item in documents_en
    ]

    # key -> MinHash, kept so candidates can be ranked after the LSH lookup.
    signatures = {}
    for item in documents_min:
        minhash = MinHash(num_perm=num_perm)
        for k in item["keyword"].split(","):
            minhash.update(k.encode("utf-8"))
        key = str(item["id"])
        lsh.insert(key, minhash)
        signatures[key] = minhash

    # Named query_hash so the builtin min() is not shadowed.
    query_hash = MinHash(num_perm=num_perm)
    for k in keywords.split(","):
        query_hash.update(k.encode("utf-8"))

    # LSH returns an unordered candidate set; rank it before truncating.
    candidates = sorted(
        lsh.query(query_hash),
        key=lambda key: query_hash.jaccard(signatures[key]),
        reverse=True,
    )
    if n is not None:
        candidates = candidates[:n]

    list_docs = []
    for key in candidates:
        doc = docs_col.find_one({"_id": ObjectId(key)})
        if doc is None:
            # The document disappeared between the two reads; skip it.
            continue
        doc.pop('_id', None)
        list_docs.append(doc)
    print(list_docs)
    return list_docs
class LSH():
    """MinHash LSH index over the character shingles of a list of strings.

    On construction every document in *rawlist* is split into overlapping
    substrings of ``shingle_length`` characters, MinHashed with 128
    permutations and inserted into a ``MinHashLSH`` index under the document
    string itself as key.
    """

    def __init__(self, rawlist, shingle_length=2, threshold=0.8):
        self.indoc = rawlist
        self.make_lsh(shingle_length=shingle_length, threshold=threshold)

    def make_shingles(self, doc, length=2):
        """Return the list of all length-``length`` slices of *doc*, in order.

        A document shorter than ``length`` yields an empty list.
        """
        return [doc[i:i + length] for i in range(len(doc) - (length - 1))]

    def make_shingle_sets(self, doclst=None, length=2):
        """Map every document to its shingle list.

        Args:
            doclst: documents to process; defaults to the list given at
                    construction time.
            length: shingle length in characters.

        Returns:
            dict of document -> list of shingles (repeated documents collapse
            into one entry).
        """
        if doclst is None:
            doclst = self.indoc
        return {d: self.make_shingles(d, length) for d in doclst}

    def make_lsh(self, shingle_length=2, threshold=0.8):
        """(Re)build ``self.minhashes`` and ``self.lsh`` from ``self.indoc``."""
        print(f'Making LSH with threshold of {threshold}, shingle length of {shingle_length}')
        sets = self.make_shingle_sets(self.indoc, shingle_length)
        self.minhashes = {}
        self.lsh = MinHashLSH(threshold=threshold, num_perm=128)
        for k, shingles in sets.items():
            m = MinHash(num_perm=128)
            for item in shingles:
                m.update(item.encode('utf8'))
            self.minhashes[k] = m
            self.lsh.insert(k, m)

    def get_minhash(self, doc):
        """Return the stored MinHash of *doc* (``KeyError`` if it is unknown)."""
        return self.minhashes[doc]

    def get_bucket(self, target_mh):
        """Return the keys the LSH index reports as similar to *target_mh*."""
        return self.lsh.query(target_mh)
def LSH():
    """Find near-duplicate documents among the shingle sets from ``part1``.

    ``part1.readFile(k=4)`` supplies one collection of string shingles per
    document. Each collection is MinHashed once, indexed under its 1-based
    position (as a string), and then queried against the index. The
    per-document neighbour lists are passed through ``clean_data`` and
    returned.

    Compared with the previous version, each signature is computed once
    instead of twice and the progress message reports the threshold that is
    actually used (it used to claim 0.35).
    """
    return_result = []
    result = part1.readFile(k=4)
    num_perm = 1024
    # MinHashLSH parameters:
    #   threshold (float)         - Jaccard similarity threshold
    #   num_perm (int, optional)  - number of hash permutation functions
    #                               (the sample size for weighted MinHash)
    #   params (tuple, optional)  - number and size of the bands
    threshold = 0.9
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)

    # One signature per document; every `each` is a set of shingles.
    signatures = []
    for index, each in enumerate(result, start=1):
        doc = MinHash(num_perm=num_perm)
        for d in each:
            doc.update(d.encode('utf8'))
        lsh.insert(str(index), doc)
        signatures.append(doc)

    for doc_target in signatures:
        neighbours = lsh.query(doc_target)
        print("Approximate neighbours with Jaccard similarity > %s" % threshold,
              neighbours)
        return_result.append(neighbours)
    return clean_data(return_result)
def deduplicate_file(file_prefix, output_dir, threshold, permutations):
    """
    Deduplicates a set of minhashed documents (3 files with the same minhash
    prefix) and writes them to output_dir.

    A document is kept when the LSH index built so far returns no match for
    its minhash. A document whose key is already in the index is skipped as
    well, as deduplicate_self does; inserting such a key again would make
    the index raise.

    Warning: only works for full documents at this point!
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=permutations)
    file_base = op.basename(file_prefix)
    logging.info('Processing batch {}...'.format(file_base))
    total_read = 0
    with closing(
            BatchWriter(sys.maxsize, output_dir,
                        len(file_base), int(file_base))) as bw:
        for input_file, results in read_batch(file_prefix):
            minhashes, new_minhashes = results['minhash'], []
            doc_ids, new_doc_ids = results['id'], []
            total_read += len(doc_ids)
            for doc_id, minhash in zip(doc_ids, minhashes):
                key = '_'.join(doc_id)
                if key in lsh:
                    # Same document id seen before: treat as a duplicate.
                    continue
                if not lsh.query(minhash):
                    lsh.insert(key, minhash)
                    new_minhashes.append(minhash)
                    new_doc_ids.append(doc_id)
            bw.write_results(input_file, {
                'id': new_doc_ids,
                'minhash': new_minhashes
            })
            logging.debug('Kept {} documents out of {}'.format(
                len(new_doc_ids), len(doc_ids)))
    logging.info('Processed batch {}; kept {} documents out of {}.'.format(
        file_base, bw.total_written, total_read))
def build_content_sim_mh_text(network, mh_signatures):
    """Link nodes whose stored MinHash signatures look similar.

    Every ``(nid, signature)`` pair is turned into a 512-permutation
    ``MinHash`` whose hash values are set from the stored signature, and
    indexed in a ``MinHashLSH`` with threshold 0.7. Each node is then queried
    against the index and a ``Relation.CONTENT_SIM`` edge with score 1 is
    added from it to every other node returned.

    Returns:
        the populated ``MinHashLSH`` index.
    """
    content_index = MinHashLSH(threshold=0.7, num_perm=512)

    # Rebuild MinHash objects from the raw signatures and index them.
    indexed = []
    for nid, mh_sig in mh_signatures:
        mh_obj = MinHash(num_perm=512)
        mh_obj.hashvalues = np.asarray(mh_sig, dtype=int)
        content_index.insert(nid, mh_obj)
        indexed.append((nid, mh_obj))

    # Query every node and connect it to its neighbours (never to itself).
    for nid, mh_obj in indexed:
        for neighbour in content_index.query(mh_obj):
            if neighbour != nid:
                network.add_relation(nid, neighbour, Relation.CONTENT_SIM, 1)

    return content_index
def main() -> None:
    """Query a Cassandra-backed MinHash LSH index with a random signature.

    Builds a 256-permutation MinHash from 200 random 5-letter lowercase
    strings, connects to the existing ``perftest`` index and prints the keys
    it returns. Each step is wrapped in a one-iteration ``tqdm`` loop so its
    duration is displayed.

    Query failures are reported on stdout. Only ``Exception`` is caught, so
    Ctrl-C and ``SystemExit`` are no longer swallowed.
    """
    for _ in tqdm(range(1), desc="Create finding example:"):
        minhash = MinHash(num_perm=256)
        list_strings = [
            ''.join(random.choice(string.ascii_lowercase) for _ in range(5))
            for _ in range(200)
        ]
        minhash.update_batch([s.encode('utf-8') for s in list_strings])

    for _ in tqdm(range(1), desc="Connect to existing db:"):
        lsh = MinHashLSH(threshold=0.5, num_perm=256, storage_config={
            'type': 'cassandra',
            'basename': b'perftest',
            'cassandra': {
                'seeds': ['127.0.0.1'],
                'keyspace': config.KEY_SPACE,
                'replication': {
                    'class': 'SimpleStrategy',
                    'replication_factor': '1',
                },
                # Reuse the existing keyspace and tables as they are.
                'drop_keyspace': False,
                'drop_tables': False,
            }
        })

    try:
        for _ in tqdm(range(1), desc="Find minHash similarity:"):
            result = lsh.query(minhash)
        print("Approximate neighbours with Jaccard similarity > 0.5", result)
    except Exception as e:
        print(str(e))
        print("Error")
def remove_similar_tweets(df, text_col="text", lang_col="lang_x", max_jaccard_simularity=0.5):
    """
    Use locality-sensitive hashing to efficiently remove tweets that are
    similar to others (might be autogenerated or retweets).

    Tweets are cleaned with ``TweetsAnalysis.preprocess_tweet``, split on
    spaces and MinHashed (64 permutations). Going through ``df`` in order, a
    tweet is selected only when the index of previously selected tweets
    returns no match for it.

    Note: ``df`` is modified in place; the columns ``tweet_clean`` and
    ``select`` are added to it.

    Args:
        df: DataFrame holding the tweets.
        text_col: name of the column with the tweet text.
        lang_col: name of the column passed to the preprocessor as language.
        max_jaccard_simularity: threshold handed to ``MinHashLSH``.

    Returns:
        the rows of ``df`` whose ``select`` flag is True.
    """
    t0 = time.time()
    df["tweet_clean"] = np.vectorize(TweetsAnalysis.preprocess_tweet)(df[text_col], df[lang_col])
    tweets = [t.split(" ") for t in df["tweet_clean"]]
    t1 = time.time()
    # print(<single string>) gives the same output on Python 2 and Python 3.
    print("%s cleaned tweets" % (t1 - t0))

    lsh = MinHashLSH(threshold=max_jaccard_simularity, num_perm=64)  # jaccard similarity
    selected = set()
    df_indices = df.index.values.tolist()
    for idx, tweet in zip(df_indices, tweets):
        s = MinHash(num_perm=64)
        for word in tweet:
            s.update(word.encode('utf8'))
        # only add if the tweet is not similar to existing ones
        if not lsh.query(s):
            lsh.insert(idx, s)
            selected.add(idx)
    t2 = time.time()
    print("%s created lsh" % (t2 - t1))

    # only select the first tweet in a group of similar tweets
    df['select'] = pd.Series([idx in selected for idx in df_indices], index=df_indices)
    print(df["select"].value_counts())
    t3 = time.time()
    print("%s selected df" % (t3 - t2))
    return df[df["select"]]
def minHash_LSH(data):
    """Return the pairs of items in *data* that match exactly one other item.

    Each element of *data* is an iterable of strings. It is MinHashed with
    256 permutations and indexed under its position in a ``MinHashLSH`` with
    threshold 0.65. An item contributes a pair when its query returns exactly
    two keys (itself and one other).

    Pairs are reported once, as ``[low_index, high_index]``. The previous
    version compared the raw query lists, so the same pair could be reported
    twice if the index returned its members in a different order.

    Returns:
        list of two-element lists of positions in *data*.
    """
    # LSH index for Jaccard threshold 0.65 over 256-permutation MinHashes.
    lsh = MinHashLSH(threshold=0.65, num_perm=256)

    # Create MinHash objects; c is the position, i is the tuple of strings.
    minhashes = {}
    for c, i in enumerate(data):
        minhash = MinHash(num_perm=256)
        for el in i:
            minhash.update(el.encode('utf8'))
        lsh.insert(c, minhash)
        minhashes[c] = minhash

    res_match = []
    seen_pairs = set()
    for minhash in minhashes.values():
        result = lsh.query(minhash)
        if len(result) != 2:
            continue
        pair = tuple(sorted(result))
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            res_match.append(list(pair))
    return res_match
def search_lsh_jaccard_topk(index_data, query_data, b, r, k):
    """Top-k Jaccard search through a MinHash LSH index with b bands of r rows.

    Args:
        index_data: ``(sets, keys, minhashes)`` of the indexed collection;
                    ``minhashes`` is looked up by permutation count.
        query_data: ``(sets, keys, minhashes)`` of the queries, same layout.
        b: number of LSH bands.
        r: number of rows per band; ``b * r`` permutations are used.
        k: number of results to keep per query.

    Returns:
        ``(results, times)``: ``results`` holds ``(query_key, matches)`` with
        ``matches`` a list of ``[index_key, exact_jaccard]`` sorted by
        decreasing similarity and cut to *k*; ``times`` holds the duration of
        each query in seconds, including the exact re-ranking.
    """
    index_sets, index_keys, index_minhashes = index_data
    query_sets, query_keys, query_minhashes = query_data
    num_perm = b * r

    print("Building LSH Index.")
    build_started = time.perf_counter()
    index = MinHashLSH(num_perm=num_perm, params=(b, r))
    # The position of each indexed set serves as its key in the LSH.
    for position in range(len(index_keys)):
        index.insert(position, index_minhashes[num_perm][position])
    build_finished = time.perf_counter()
    print("Indexing time: {:.3f}.".format(build_finished - build_started))

    print("Querying.")
    times = []
    results = []
    queries = zip(query_minhashes[num_perm], query_keys, query_sets)
    for query_minhash, query_key, query_set in queries:
        query_started = time.perf_counter()
        candidates = index.query(query_minhash)
        # Score every candidate with its exact Jaccard similarity, then keep
        # the k best.
        scored = [
            [index_keys[position],
             compute_jaccard(query_set, index_sets[position])]
            for position in candidates
        ]
        scored.sort(key=lambda entry: entry[1], reverse=True)
        top = scored[:k]
        times.append(time.perf_counter() - query_started)
        results.append((query_key, top))
        sys.stdout.write(f"\rQueried {len(results)} sets")
    sys.stdout.write("\n")
    return (results, times)
def main():
    """Application entry point: deduplicate the news corpus.

    Reads every file found under ``corpus/clean``, normalises and MinHashes
    the texts and keeps a text only when the LSH index of already kept texts
    returns no match. Matches are logged to ``duplicate.txt``; kept texts are
    written under ``corpus/super clean/`` with the same relative path.

    The report file is now managed by a ``with`` block; previously it was
    opened and never closed.
    """
    import os

    corpus_root = Path('corpus/clean')
    # Collect the names of all corpus files.
    list_files = file_searcher(corpus_root)

    print('Загружаем корпус')
    raw_corpus = []
    for file in list_files:
        with open(file, 'r', encoding='utf-8') as src:
            text_news = '\n'.join([line.rstrip('\r\n') for line in src])
        raw_corpus.append(text_news)

    print('Приводим его к стандартному виду')
    normalized_copus: List[List[str]] = [
        normalize(news) for news in raw_corpus
    ]

    print('Составляем индекс для поиска дублей')
    lsh = MinHashLSH(num_perm=HASH_PERMUTATIONS_COUNT)
    deduplicated_corpus = []
    with open('duplicate.txt', 'w', encoding='utf-8') as report:
        for i, (file, words) in enumerate(zip(list_files, normalized_copus)):
            words_hash = to_minhash(words)
            duplicates = lsh.query(words_hash)
            if duplicates:
                print(f'Найдены совпадения для ({file}): {raw_corpus[i]}',
                      file=report)
                for idx in duplicates:
                    print(f'\t{list_files[idx]}. {raw_corpus[idx]}',
                          file=report)
                print('\n\n\n\n', file=report)
            else:
                lsh.insert(i, words_hash)
                deduplicated_corpus.append((raw_corpus[i], list_files[i]))
        print('Удалено дублей:', len(raw_corpus) - len(deduplicated_corpus),
              file=report)

    print(
        f'Сохраняем дедуплицированный корпус ({len(deduplicated_corpus)} новостей)'
    )

    # Create the (possibly empty) genre folders.
    all_genre = [
        'Политика', 'В мире', 'Экономика', 'Общество', 'Происшествия',
        'Армия', 'Наука', 'Культура', 'Религия', 'Спорт', 'Туризм'
    ]
    for genre in all_genre:
        os.makedirs('corpus/super clean/' + genre, exist_ok=True)

    # Save the corpus. name[13:] drops the first 13 characters of the source
    # path (the length of 'corpus/clean/').
    for text, name in deduplicated_corpus:
        with open('corpus/super clean/' + name[13:], 'w',
                  encoding='utf-8') as dst:
            print(text, file=dst)
class DuplicationIndex:
    """Incrementally groups near-duplicate items with a MinHash LSH index."""

    def __init__(
        self,
        *,
        duplication_jaccard_threshold: float = 0.85,
    ):
        self._duplication_jaccard_threshold = duplication_jaccard_threshold
        self._num_perm = NUM_PERM
        self._index = MinHashLSH(threshold=self._duplication_jaccard_threshold, num_perm=self._num_perm)

        # base key -> set of keys that were found close to it
        self._duplicate_clusters = defaultdict(set)

    def add(self, code_key: Tuple, min_hash: MinHash) -> None:
        """Index ``code_key`` and attach it to a cluster of close matches.

        The index is queried with ``min_hash`` before the key is inserted.
        If close matches exist, the key joins the cluster of the first match
        that is already a cluster base; when none is, the first match becomes
        the base of a new cluster. The resulting clusters therefore depend on
        the order in which keys are added. A key that is already indexed is
        reported on stdout and ignored.

        Args:
            code_key (Tuple of (index, repo_name, path)):
                Theoretically any hashable key. A tuple is used so the
                information can be retrieved later.
            min_hash: MinHash of the code_key.
        """
        close_duplicates = self._index.query(min_hash)
        if code_key in self._index.keys:
            print(f"Duplicate key {code_key}")
            return

        self._index.insert(code_key, min_hash)
        if not close_duplicates:
            return

        # Prefer a match that already heads a cluster; otherwise the first
        # match starts a new one.
        base = next(
            (match for match in close_duplicates
             if match in self._duplicate_clusters),
            close_duplicates[0],
        )
        self._duplicate_clusters[base].add(code_key)

    def get_duplicate_clusters(self) -> List[List[Dict]]:
        """Export the duplicate clusters.

        In every cluster the first element is the base element; the others
        are the keys that were attached to it by ``add``.

        Returns:
            duplicate_clusters (List[List[Dict]]): one list per cluster, each
            element a dict with ``base_index``, ``repo_name`` and ``path``.
        """
        return [
            [
                {"base_index": el[0], "repo_name": el[1], "path": el[2]}
                for el in [base, *duplicates]
            ]
            for base, duplicates in self._duplicate_clusters.items()
        ]

    def save(self, filepath) -> None:
        """Write the exported clusters to ``filepath`` as JSON."""
        clusters = self.get_duplicate_clusters()
        with open(filepath, "w") as f:
            json.dump(clusters, f)
def find_relation_class_name_matchings(network, kr_handlers):
    """Match relation (source) names to ontology class names by token overlap.

    Every distinct source name from ``network.iterate_values()`` and every
    class name from each handler in ``kr_handlers`` is normalised
    (camelCase -> snake_case, ``-``/``_`` -> space, lower-cased), stripped of
    English stopwords and MinHashed with 32 permutations. All signatures go
    into one LSH index (threshold 0.5); each relation is then queried and
    every hit of the other kind becomes a matching.

    The stopword list is now loaded once into a set, and seen source names
    are tracked in a set rather than a list.

    Returns:
        list of ``((db_name, source_name, "_"), (kr_name, class_name))``.
    """
    st = time.time()
    english_stopwords = set(stopwords.words('english'))

    def name_minhash(name):
        # Normalise an identifier and MinHash its non-stopword tokens.
        name = nlp.camelcase_to_snakecase(name)
        name = name.replace('-', ' ')
        name = name.replace('_', ' ')
        name = name.lower()
        m = MinHash(num_perm=32)
        for token in name.split():
            if token not in english_stopwords:
                m.update(token.encode('utf8'))
        return m

    # Retrieve relation names; each source name is indexed only once.
    names = []
    seen_sources = set()
    for (db_name, source_name, _, _) in network.iterate_values():
        if source_name not in seen_sources:
            seen_sources.add(source_name)
            names.append(
                ('relation', (db_name, source_name), name_minhash(source_name)))
    num_relations_inserted = len(names)

    # Retrieve class names
    for kr_name, kr_handler in kr_handlers.items():
        for cl in kr_handler.classes():
            names.append(('class', (kr_name, cl), name_minhash(cl)))

    # Index all the minhashes
    lsh_index = MinHashLSH(threshold=0.5, num_perm=32)
    for idx, (_, _, m) in enumerate(names):
        lsh_index.insert(idx, m)

    # Relations occupy the first num_relations_inserted slots; query only
    # those and keep hits of a different kind (i.e. classes).
    matchings = []
    for kind_q, (db_name, source_name), m in names[:num_relations_inserted]:
        for n in lsh_index.query(m):
            kind_n = names[n][0]
            if kind_n != kind_q:
                # match.format is db_name, source_name, field_name -> class_name
                match = ((db_name, source_name, "_"), names[n][1])
                matchings.append(match)
    et = time.time()
    print("Time to relation-class (name): " + str(et - st))
    return matchings
def perform_lsh(lsh_text, standard_labels, title_labels, char_ngram=5, savefile=''):
    """Find texts that are likely near-duplicates of each other via MinHash LSH.

    Each text is shingled with ``shingles``, MinHashed with 200 permutations
    and inserted into an LSH index (threshold 0.8) under its standard label.
    Every text is then queried; when more than one key comes back, the hits
    are recorded together with their titles taken from the module-level
    ``df_nos`` frame. Some candidates may be false positives.

    Args:
        lsh_text: iterable of texts to compare.
        standard_labels: label of each text, used as LSH key.
        title_labels: title of each text, used in the candidate keys.
        char_ngram: currently unused by this function.
        savefile: if non-empty, path of a CSV the candidates are written to.

    Returns:
        ``(candidates, shingled_desc, content, lsh)`` where ``candidates``
        maps ``"label: title"`` to a list of ``(label, title)`` hits, or maps
        ``label`` to ``'none'`` when nothing else matched, and ``content`` is
        the list of ``(label, MinHash)`` pairs.

    Fix: the last returned element used to be the undefined name ``lhs``,
    which raised ``NameError`` on every call.
    """
    t0 = time.time()
    shingled_desc = [shingles(desc) for desc in lsh_text]
    print_elapsed(t0, 'splitting the text into groups of characters')

    # Create hash signatures for shingles
    t0 = time.time()
    hash_objects = [MinHash(num_perm=200) for _ in shingled_desc]
    print_elapsed(t0, 'creating hash signatures')

    t0 = time.time()
    for hash_object, desc in zip(hash_objects, shingled_desc):
        for d in desc:
            hash_object.update(d.encode('utf8'))
    print_elapsed(t0, 'encoding hash objects')

    # Define LSH and Jaccard similarity threshold
    lsh = MinHashLSH(threshold=0.8, num_perm=200)
    content = [
        (standard_labels[ix], hash_objects[ix])
        for ix in range(len(shingled_desc))
    ]
    for label, hash_object in content:
        lsh.insert(label, hash_object)

    # For each standard search all signatures and identify potential clashes
    # (e.g. other standards with Jaccard similarity of shingle sets greater
    # or equal to the threshold).
    candidates = {}
    for ix in range(len(shingled_desc)):
        result = lsh.query(hash_objects[ix])
        if len(result) > 1:
            key = standard_labels[ix] + ': ' + title_labels[ix]
            hits = [(res, df_nos['Title'].loc[res]) for res in result]
            candidates[key] = hits
            print(key, ': ', hits)
            print('***************')
        else:
            candidates[standard_labels[ix]] = 'none'

    if len(savefile):
        pd.DataFrame.from_dict(candidates, orient='index').to_csv(savefile)
    return candidates, shingled_desc, content, lsh
def deduplicate_other_old(file_prefix, input_dir, output_dir,
                          threshold, permutations):
    """
    Removes from a set of minhashed documents (3 files with the same minhash
    prefix) every document that also occurs in another batch of input_dir.
    The batches to compare against are those returned by find_all_batches
    for this batch's number (per the original note: only batches with a
    higher number, i.e. an upper triangular matrix).

    Returns a tuple (documents kept, documents initially in the batch).

    Warning: only works for full documents at this point!
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=permutations)
    file_base = op.basename(file_prefix)
    logging.info('Processing batch {}...'.format(file_base))

    # Step 1: index the (already deduplicated) batch itself.
    for input_file, results in read_batch(file_prefix):
        for doc_id, minhash in zip(results['id'], results['minhash']):
            lsh.insert('\t'.join(doc_id), minhash)
    initial_len = len(lsh.keys)

    batch_number = int(file_prefix.rpartition(os.sep)[-1])
    to_match_with = find_all_batches(input_dir, batch_number)

    # Step 2: drop from the index every document matched by a document of
    # one of the other batches.
    for batch in to_match_with:
        initial_batch_len = len(lsh.keys)
        for _, results in read_batch(batch):
            for minhash in results['minhash']:
                for duplicate in lsh.query(minhash):
                    lsh.remove(duplicate)
        logging.info(
            'Cross-deduplicated batch {} with batch {}: {} -> {} documents.'.
            format(file_base, op.basename(batch),
                   initial_batch_len, len(lsh.keys)))

    # Step 3: write out what is left. To keep the file format, the original
    # batch has to be read a second time.
    with closing(
            BatchWriter(sys.maxsize, output_dir,
                        len(file_base), int(file_base))) as bw:
        for input_file, results in read_batch(file_prefix):
            survivors = [
                (doc_id, minhash)
                for doc_id, minhash in zip(results['id'], results['minhash'])
                if '\t'.join(doc_id) in lsh
            ]
            bw.write_results(input_file, {
                'id': [doc_id for doc_id, _ in survivors],
                'minhash': [minhash for _, minhash in survivors],
            })
    logging.info('Processed batch {}; kept {} out of {} documents.'.format(
        file_base, len(lsh.keys), initial_len))
    return len(lsh.keys), initial_len
def compare_products(product1, product2):
    """Checks if the two given series/dicts (strictly) belong to the same product.

    The keys should strictly be followed as per the dataset: ``imageUrl``,
    ``productId``, ``key_specs_text``, ``description`` and ``title``.

    The verdict is printed, nothing is returned:
    * identical image URLs -> ``Yes``;
    * otherwise the concatenated text columns are compared through MinHash
      LSH (character 3-grams, threshold 0.9); dissimilar text -> ``No``;
    * similar text -> both images are downloaded and ``Yes`` is printed when
      the cosine similarity of their features exceeds 0.5, else ``No``.

    Note: a ``full_text`` entry is written into both products.

    Fixes: both signatures now use the 256 permutations the index expects
    (they used 258), and the second text is encoded as UTF-8 (the codec name
    was ``'full_text'``, which raises ``LookupError``).
    """
    if product1['imageUrl'] == product2['imageUrl']:
        print('Yes')
        return

    text_cols = ['key_specs_text', 'description', 'title']
    id1 = product1['productId']
    num_perm = 256

    for product in (product1, product2):
        product['full_text'] = ''.join(
            ' ' + product[col].translate(table) for col in text_cols)

    def text_signature(text):
        # MinHash of the character 3-grams of `text`.
        signature = MinHash(num_perm=num_perm)
        for gram in ngrams(text, 3):
            # ngrams may yield tuples of characters or ready-made strings.
            if not isinstance(gram, str):
                gram = ''.join(gram)
            signature.update(gram.encode('utf-8'))
        return signature

    m1 = text_signature(product1['full_text'])
    m2 = text_signature(product2['full_text'])

    lsh = MinHashLSH(threshold=0.9, num_perm=num_perm)
    lsh.insert(id1, m1)
    if id1 not in lsh.query(m2):
        print('No')
        return

    print('Similar Text')
    # Text matches: confirm with the images.
    print(product1['imageUrl'])
    print(product2['imageUrl'])
    img1 = download_image(product1['imageUrl'])
    img2 = download_image(product2['imageUrl'])
    img1 = np.expand_dims(extract_features(product1['imageUrl'], img1), axis=0)
    img2 = np.expand_dims(extract_features(product2['imageUrl'], img2), axis=0)
    cosine_mat = cosine_similarity(img1, img2)
    if cosine_mat > 0.5:
        print('Yes')
    else:
        print('No')
def learn_duplicates(name, f, verbose=False):
    """Replay a crawl and report precision/recall of the duplicate predictor.

    A ``DupePredictor`` is seeded with the extracted text of up to 300 items.
    Every item from ``item_reader(f, name)`` is then processed in order:

    * its predicted duplicate probability is recorded;
    * if the probability is below 0.98 the model is updated with the page and
      the duplicates it reports are used as ground truth; otherwise the page
      is looked up in a separate MinHash LSH index instead and the model is
      left untouched;
    * the page is (re)inserted into the LSH index under its canonical URL.

    Precision and recall are printed every 100 items and once at the end.
    With ``verbose`` each false negative / false positive is printed too.
    """
    print(name)
    logging.basicConfig(level=logging.DEBUG)
    texts_sample = [
        item['extracted_text'] for item in item_reader(f, name, limit=300)]
    dupe_predictor = DupePredictor(texts_sample)

    lsh = MinHashLSH(threshold=0.9, num_perm=128)  # separate from dupe_predictor
    too_common_shingles = dupe_predictor.too_common_shingles

    threshold = 0.98
    y_pred, y_true = [], []

    def _report_pr():
        # Count outcomes in one pass; a probability exactly equal to the
        # threshold is counted nowhere.
        tp = fp = fn = 0
        for prob, is_dupe in zip(y_pred, y_true):
            if prob > threshold:
                if is_dupe:
                    tp += 1
                else:
                    fp += 1
            elif prob < threshold and is_dupe:
                fn += 1
        n_dup = tp + fn
        precision = tp / (tp + fp) if tp else 0.
        recall = tp / n_dup if n_dup else 0.
        print('precision: %.3f, recall %.3f at %.2f threshold '
              '(%d duplicates)' % (precision, recall, threshold, n_dup))

    for i, item in enumerate(item_reader(f, name)):
        url = item['url']
        dupe_prob = dupe_predictor.get_dupe_prob(url)
        y_pred.append(dupe_prob)
        min_hash = get_min_hash(item['extracted_text'], too_common_shingles)

        if dupe_prob < threshold:
            duplicates = [
                dupe_url for dupe_url, _ in dupe_predictor.update_model(
                    url, item['extracted_text'])]
        else:
            # We think this is a duplicate: replicate crawling
            # and do not update the model.
            duplicates = list(lsh.query(min_hash))

        key = canonicalize_url(url)
        if key in lsh:
            lsh.remove(key)
        lsh.insert(key, min_hash)
        y_true.append(bool(duplicates))

        if verbose:
            if duplicates and dupe_prob < threshold:
                path = _full_path(url)
                same_path = [
                    dupe_url for dupe_url in duplicates
                    if _full_path(dupe_url) == path]
                sample = same_path or duplicates
                print('false negative %s (%s, %d more)' % (
                    url, sample[0], len(sample) - 1))
            elif not duplicates and dupe_prob > threshold:
                print('false positive', url)

        if i % 100 == 0:
            _report_pr()
    _report_pr()
def main():
    """Drop near-duplicate and identical tweets from ``Processed.json``.

    Non-retweets are MinHashed on character 5-grams of ``orig_text`` and
    indexed in an LSH (threshold 0.5). Going through them in order, a tweet
    not yet flagged is kept and every other tweet the index returns for it
    is flagged as a duplicate. From the kept tweets, those whose joined
    ``text`` was already seen are dropped as identical. The result is written
    to ``ProcessedSimilarRemoved.json``; the number of identical tweets and
    the final count are printed.

    Change: the LSH query is skipped for tweets already flagged as
    duplicates (the result was computed and thrown away before), and the
    position of each tweet in ``data`` is taken from ``enumerate``.
    """
    path = Path('C:/Data/Python/JobLoss')
    with open(path / 'Processed.json') as f:
        data = json.load(f)

    # orig_data[k] is the text of the tweet at position ind_map[k] in data.
    orig_data = []
    ind_map = []
    for ind, tweet in enumerate(data):
        if tweet['type'] != 'retweet':
            orig_data.append(tweet['orig_text'])
            ind_map.append(ind)

    lsh = MinHashLSH(threshold=0.5, num_perm=128)
    minhashes = []
    for c, text in enumerate(orig_data):
        minhash = MinHash(num_perm=128)
        for d in ngrams(text, 5):
            minhash.update(''.join(d).encode('utf-8'))
        lsh.insert(c, minhash)
        minhashes.append(minhash)

    # Marker values: 0 = not visited yet, 1 = kept, 2 = duplicate of a kept one.
    markers = [0] * len(orig_data)
    for i, minhash in enumerate(minhashes):
        if markers[i] == 2:
            continue
        markers[i] = 1
        for j in lsh.query(minhash):
            if markers[j] != 1:
                markers[j] = 2

    similar_removed = [
        data[ind_map[k]] for k, val in enumerate(markers) if val != 2
    ]

    doc_set = set()
    final = []
    identicals = 0
    for line in similar_removed:
        doc = ' '.join(line['text'])
        if doc in doc_set:
            identicals += 1
            continue
        doc_set.add(doc)
        final.append(line)
    print(identicals)
    print(len(final))

    with open(path / 'ProcessedSimilarRemoved.json', 'w') as f:
        json.dump(final, f)
def lsh_similar(minhashes: Dict[T, MinHash], num_perm: int, bands: int, rows: int) -> Generator[Tuple[T, T], None, None]:
    """Yield every pair of keys whose minhashes collide in an LSH index.

    Items are processed in dictionary order; each one is compared with the
    items already indexed and then indexed itself, so a pair is produced
    once, as ``(earlier_key, later_key)``.

    minhashes - Dictionary of key to Minhash
    num_perm - Number of permutations used in Minhash
    bands - Number of bands to use in LSH
    rows - Number of rows to use in LSH
    """
    index = MinHashLSH(num_perm=num_perm, params=(bands, rows))
    for key, signature in minhashes.items():
        # Pairs with everything seen so far that lands in a common bucket.
        yield from ((seen_key, key) for seen_key in index.query(signature))
        # Only now does the item itself become visible to later queries.
        index.insert(key, signature)
def benchmark_lsh(threshold, index_data, query_data):
    """Time MinHash LSH queries.

    An index is built from ``index_data.filenames`` / ``index_data.minhashes``
    using the permutation count of the first minhash, then each minhash in
    ``query_data.minhashes`` is queried.

    Timing uses ``time.perf_counter()``; ``time.clock()`` was removed in
    Python 3.8.

    Returns:
        ``(times, results)``: per-query durations in seconds and the
        corresponding lists of returned keys.
    """
    print("Building LSH index")
    num_perm = len(index_data.minhashes[0].hashvalues)
    lsh = MinHashLSH(threshold, num_perm)
    for key, minhash in zip(index_data.filenames, index_data.minhashes):
        lsh.insert(key, minhash)
    print("Querying")
    times = []
    results = []
    for minhash in query_data.minhashes:
        start = time.perf_counter()
        result = lsh.query(minhash)
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(result)
    return times, results
def benchmark_lsh(threshold, index_data, query_data):
    """Time MinHash LSH queries.

    An index is built from ``index_data.filenames`` / ``index_data.minhashes``
    using the permutation count of the first minhash, then each minhash in
    ``query_data.minhashes`` is queried.

    Timing uses ``time.perf_counter()``; ``time.clock()`` was removed in
    Python 3.8.

    Returns:
        ``(times, results)``: per-query durations in seconds and the
        corresponding lists of returned keys.
    """
    print("Building LSH index")
    num_perm = len(index_data.minhashes[0].hashvalues)
    lsh = MinHashLSH(threshold, num_perm)
    for key, minhash in zip(index_data.filenames, index_data.minhashes):
        lsh.insert(key, minhash)
    print("Querying")
    times = []
    results = []
    for minhash in query_data.minhashes:
        start = time.perf_counter()
        result = lsh.query(minhash)
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(result)
    return times, results
class DuplicateChecker:
    """Finds similar dumped articles with MinHash LSH and deletes the copies."""

    def __init__(self):
        # file path -> LeanMinHash of the article body
        self.minhashes = {}
        self.lsh = MinHashLSH(threshold=THRESHOLD)

    def create_minhashes_reading_articles(self, start_date, end_date):
        """Fills the minhashes dict with the files paths as the keys and the
        minhashes from the articles bodies as the values.

        Every ``DUMP_DIR/<category>/<YYYY/MM/DD>`` directory for the known
        categories and the dates returned by ``get_dates_between`` is
        scanned; a missing directory or file is skipped."""
        for category in read_categories_from_file():
            for date_between in get_dates_between(start_date, end_date):
                try:
                    date_between = date_between.strftime('%Y/%m/%d')
                    current_dir_path = f'{DUMP_DIR}/{category}/{date_between}'
                    for filename in os.listdir(current_dir_path):
                        self._create_minhash_from_file(current_dir_path, filename)
                except FileNotFoundError:
                    pass

    def _create_minhash_from_file(self, current_dir_path, filename):
        """Hash one article file and index it; delete the file if it has no body."""
        file_path = f'{current_dir_path}/{filename}'
        with open(file_path) as f:
            article = Article(**json.load(f))
        if not article.body:
            os.remove(file_path)
            return
        minhash = MinHash()
        for word in article.body.split(' '):
            minhash.update(word.encode('utf8'))
        lean_minhash = LeanMinHash(minhash)
        self.minhashes[file_path] = lean_minhash
        self.lsh.insert(file_path, lean_minhash)

    def find_similar_articles(self):
        """Finds every similar article from the LSH index, and removes it from
        the index itself as well as the file from the disk.

        For each group of similar articles the first one encountered is kept.
        Articles that have already been removed are skipped: querying with
        their minhash would otherwise return the kept article and delete it
        too. Paths are compared with ``!=`` rather than by identity."""
        removed = set()
        for path, minhash in self.minhashes.items():
            if path in removed:
                continue
            # The LSH will find at least the path itself, so we need to filter it
            similar_paths = [x for x in self.lsh.query(minhash) if x != path]
            for similar_article_path in similar_paths:
                print(
                    f'\tremoving similar article from {similar_article_path}')
                self.lsh.remove(similar_article_path)
                removed.add(similar_article_path)
                with contextlib.suppress(FileNotFoundError):
                    os.remove(similar_article_path)
        # Forget the deleted articles so that a later call cannot use their
        # stale minhashes.
        for path in removed:
            self.minhashes.pop(path, None)
def benchmark_lsh(num_perm, threshold, index_data, query_data):
    """Time MinHash LSH queries and rank the hits by exact Jaccard similarity.

    Args:
        num_perm: permutation count; selects which precomputed minhashes of
                  ``index_data`` / ``query_data`` are used.
        threshold: Jaccard threshold of the LSH index.
        index_data: object with ``keys``, ``sets`` and ``minhashes``.
        query_data: object with ``sets`` and ``minhashes``.

    Only the LSH query itself is timed, with ``time.perf_counter()``
    (``time.clock()`` was removed in Python 3.8).

    Returns:
        ``(times, results)``: per-query durations in seconds and, per query,
        a list of ``[key, jaccard]`` sorted by decreasing similarity.
    """
    print("Building LSH index")
    lsh = MinHashLSH(threshold, num_perm)
    for key, minhash in zip(index_data.keys, index_data.minhashes[num_perm]):
        lsh.insert(key, minhash)
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        start = time.perf_counter()
        result = lsh.query(minhash)
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(sorted(
            [[key, _compute_jaccard(qs, index_data.sets[key])]
             for key in result],
            key=lambda x: x[1], reverse=True))
    return times, results
def similarity_threshold_bulk(self, df_library, df_query, only_positive=False, return_df=False):
    """
    Query every string of df_query against an LSH index built from df_library.

    Both dataframes are converted by self.dataframe_to_data_list into
    (key, minhash) pairs, with key prefixes 'lib_' and 'query_'
    respectively. The library pairs are bulk-inserted into a MinHashLSH
    configured from self.threshold and self.num_perm.

    A 'no_similar' column initialised to 0 is always added to df_query.

    If return_df is True, df_query is returned with 'no_similar' set, per
    query key, to the number of library hits; only_positive is ignored then.
    Otherwise a list of (key, hits, number_of_hits) tuples is returned,
    restricted to queries with at least one hit when only_positive is True.

    NOTE(review): df_query.loc[key, ...] assumes the keys produced by
    dataframe_to_data_list are labels of df_query's index - confirm.

    TODO: maybe use redis in production
    """
    from datasketch import MinHashLSH

    lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm)
    data_library = self.dataframe_to_data_list(df_library, 'lib_')
    data_query = self.dataframe_to_data_list(df_query, 'query_')

    # Load the whole library through one insertion session.
    with lsh.insertion_session() as session:
        for lib_key, lib_minhash in data_library:
            session.insert(lib_key, lib_minhash)

    # Query each item of data_query against the index.
    query_results = []
    df_query['no_similar'] = 0
    for key, minhash in data_query:
        hits = lsh.query(minhash)
        hit_count = len(hits)
        if return_df:
            df_query.loc[key, 'no_similar'] = hit_count
            continue
        # only_positive only matters when no dataframe is returned
        if only_positive and hit_count <= 0:
            continue
        query_results.append((key, hits, hit_count))

    return df_query if return_df else query_results
def _index_records(self, records):
    """
    Constructs Minhash LSH buckets for a given set of records

    Each record is MinHashed over the 2-grams (``self.nt.basic(d, 2)``) of
    its values and inserted into an LSH index. Records are then grouped into
    blocks: going through the records in order, every record not yet
    assigned opens a new block that receives all still-unassigned records
    returned by the index for it. Blocks are named ``"b<n>"`` using
    ``self._block_index``, which is advanced once per block, and finally
    handed to ``self._write_indexer``.

    Pending records are tracked in a set instead of a list (the list made
    blocking quadratic), and a record is always put in its own block, so the
    loop terminates even if a query did not return the record itself.

    Args:
        records (dict) : dict of (record_id -> record_value)

    Returns:
        None
    """
    indexer = defaultdict(list)

    # Create minhashes
    minhashes = {}
    for rid, values in records.items():
        m = MinHash(num_perm=self._num_perm)
        for d in values:
            for gram in set(self.nt.basic(d, 2)):
                m.update(gram.encode('utf-8'))
        minhashes[rid] = m

    # Create LSH instance and add min hashes. With the default band/row
    # setting the threshold drives the configuration; otherwise the explicit
    # (bands, rows) parameters are used.
    uses_default_params = (
        self._bands == MinHashLSHRecordDeduplication.BANDS
        and self._rows == MinHashLSHRecordDeduplication.ROWS
    )
    if uses_default_params:
        lsh = MinHashLSH(threshold=self._threshold, num_perm=self._num_perm)
    else:
        lsh = MinHashLSH(num_perm=self._num_perm,
                         params=(self._bands, self._rows))
    for rid in records:
        lsh.insert(rid, minhashes[rid])

    # Generate blocks
    unassigned = set(records)
    for key in records:
        if key not in unassigned:
            continue
        block_name = "b" + str(self._block_index)
        for rid in lsh.query(minhashes[key]):
            if rid in unassigned:
                unassigned.discard(rid)
                indexer[block_name].append(rid)
        if key in unassigned:
            # Safety net: the seed record belongs to its own block.
            unassigned.discard(key)
            indexer[block_name].append(key)
        self._block_index += 1

    self._write_indexer(indexer)
def lsh_clustering(
    signatures: List[np.ndarray],
    threshold: float = 0.5,
    num_perm: int = 128,
):
    """Return, for every signature, the positions of its LSH neighbours.

    Each array in *signatures* is wrapped in a ``MinHash`` (as its hash
    values) and inserted under the key ``"id-<position>"``. Every signature
    is then queried against the full index.

    Args:
        signatures: MinHash hash-value arrays, one per item.
        threshold: Jaccard threshold of the LSH index.
        num_perm: permutation count used for the index and the MinHashes.

    Returns:
        a list parallel to *signatures*; element ``i`` lists the positions
        the index returned for signature ``i``.
    """
    def as_minhash(values):
        return MinHash(num_perm=num_perm, hashvalues=values)

    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    with lsh.insertion_session() as session:
        for position, values in enumerate(signatures):
            session.insert(f"id-{position}", as_minhash(values))

    neighbors: List[List[int]] = []
    for values in signatures:
        hits = lsh.query(as_minhash(values))
        # Keys look like "id-<position>"; recover the integer part.
        neighbors.append([int(hit.split("-")[1]) for hit in hits])
    return neighbors
def mass_values_jaccard(cols1: List[Column], cols2: List[Column]):
    """Find, for each column of *cols2*, the similar columns of *cols1*.

    The ``values`` of every column in *cols1* are inserted into a
    Redis-backed ``MinHashLSH`` (threshold 0.2, 128 permutations, Redis at
    localhost:6379) under the column's position as a string. Each column of
    *cols2* is then queried against that index.

    NOTE(review): this assumes ``Column.values`` is a MinHash with 128
    permutations, since it is what gets inserted - confirm.

    The previous version ended with ``lsh.query()`` without an argument
    (a ``TypeError``), never used *cols2* and returned nothing.

    Returns:
        a list parallel to *cols2*; element ``j`` is the list of positions
        in *cols1* that the index returned for ``cols2[j]``.
    """
    lsh = MinHashLSH(
        threshold=0.2,
        num_perm=128,
        storage_config={
            "type": "redis",
            "redis": {
                "host": "localhost",
                "port": 6379
            }
        },
    )
    with lsh.insertion_session() as session:
        for idx, col in enumerate(cols1):
            session.insert(str(idx), col.values)

    # Keys may come back from Redis as bytes; int() accepts both str and
    # bytes digits.
    return [
        [int(key) for key in lsh.query(col.values)]
        for col in cols2
    ]
def validate(Session, event):
    """Reject a submission whose digest is similar to one already stored.

    The Redis location is read from the ``REDIS_URL`` environment variable.
    The request body (``event["body-json"]``) must contain ``digest`` and
    ``meta_media``; the latter is checked with ``validate_params``. The
    digest is converted to a MinHash and looked up in the Redis-backed LSH
    index ``digital_checker``. If nothing matches, the body is stored with
    ``insert_mysql`` and the MinHash is indexed under a new random id.

    ``REDIS_URL`` is parsed with ``urllib.parse`` so URLs that carry
    credentials, a database path or no explicit port (6379 is assumed) no
    longer break the ``split(":")`` unpacking.

    Returns:
        a response dict with ``statusCode`` 200 whose JSON ``body`` holds
        ``message``: ``"Duplicate"``, ``"Ok"`` (plus the new ``id``), or the
        text of the validation error.
    """
    from urllib.parse import urlparse

    parsed_url = urlparse(os.environ["REDIS_URL"])
    redis_host = parsed_url.hostname
    redis_port = parsed_url.port or 6379
    print({'host': redis_host, 'port': redis_port})
    lsh = MinHashLSH(
        storage_config={
            'type': 'redis',
            'redis': {
                'host': redis_host,
                'port': redis_port
            },
            'basename': b'digital_checker',
        })

    uid = uuid.uuid4().hex
    body = event["body-json"]
    print(body)
    api_key_id = event["context"]["api-key-id"]
    try:
        digest_str = body["digest"]
        meta_books = body["meta_media"]
        validate_params(Session, meta_books)
    except Exception as e:
        # Any validation problem is reported back to the caller as a message.
        print("Error " + str(e))
        return {"statusCode": 200, "body": json.dumps({"message": str(e)})}

    m1 = convert_str_to_minhash(digest_str)
    result = lsh.query(m1)
    if len(result) > 0:
        return {
            "statusCode": 200,
            "body": json.dumps({
                "message": "Duplicate",
            }),
        }

    insert_mysql(Session, api_key_id, uid, body)
    lsh.insert(key=uid, minhash=m1)
    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": "Ok",
            "id": uid,
        }),
    }
def deduplicate_self(file_prefix, output_dir, threshold, permutations):
    """
    Deduplicates a set of minhashed documents (3 files with the same minhash
    prefix) and writes them to output_dir.

    A document whose key is already indexed is counted as a duplicate url
    and dropped; a document whose minhash matches an indexed one is dropped
    as well; all others are indexed and written out.

    Returns a tuple (documents written, documents read).

    Warning: only works for full documents at this point!
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=permutations)
    file_base = op.basename(file_prefix)
    logging.info('Processing batch {}...'.format(file_base))

    total_read = 0
    duplicate_urls = 0
    writer = BatchWriter(sys.maxsize, output_dir,
                         len(file_base), int(file_base))
    with closing(writer) as bw:
        for input_file, results in read_batch(file_prefix):
            minhashes = results['minhash']
            doc_ids = results['id']
            total_read += len(doc_ids)

            kept_ids = []
            kept_minhashes = []
            input_duplicate_urls = 0
            for doc_id, minhash in zip(doc_ids, minhashes):
                key = '_'.join(doc_id)
                if key in lsh:
                    # Same url again: count it and move on.
                    input_duplicate_urls += 1
                elif not lsh.query(minhash):
                    lsh.insert(key, minhash)
                    kept_minhashes.append(minhash)
                    kept_ids.append(doc_id)

            bw.write_results(input_file, {
                'id': kept_ids,
                'minhash': kept_minhashes
            })
            duplicate_urls += input_duplicate_urls
            logging.debug('Kept {} documents out of {} in file {}; '
                          '{} duplicate urls.'.format(
                              len(kept_ids), len(doc_ids),
                              input_file, input_duplicate_urls))
    logging.info('Deduplicated batch {}; kept {} documents out of {}; '
                 '{} duplicate urls.'.format(
                     file_base, bw.total_written, total_read, duplicate_urls))
    return bw.total_written, total_read
def get_most_similar(self, threshold=0.5, num_perm=128, ngrams_num=3):
    """Report, for every file, the other files that LSH flags as similar.

    Each item from ``self.create_data()`` is hashed into a MinHash built
    from the n-grams of its ``'content'`` and inserted into a MinHashLSH
    index under its ``'file_name'``. The index is then queried with every
    MinHash, and the candidates other than the file itself are printed.

    Args:
        threshold: Jaccard threshold used to build the LSH index; it is
            also the value shown in the printed message.
        num_perm: number of permutations for every MinHash and the index.
        ngrams_num: n-gram size passed to ``ngrams``.

    Returns:
        A dict mapping each file name that has at least one candidate to
        the list of candidate file names.
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    minhashes = {}
    for single_data in self.create_data():
        file_name = single_data['file_name']
        minhash = MinHash(num_perm=num_perm)
        for gram in ngrams(single_data['content'], ngrams_num):
            minhash.update("".join(gram).encode('utf-8'))
        lsh.insert(file_name, minhash)
        minhashes[file_name] = minhash

    similar = {}
    for file_name, minhash in minhashes.items():
        # Exclude the file itself. Filtering (rather than list.remove)
        # cannot raise if the file is missing from its own result.
        candidates = [
            name for name in lsh.query(minhash) if name != file_name
        ]
        if candidates:
            similar[file_name] = candidates
            # Show the threshold actually in use instead of a fixed "0.5".
            print("Candidates with Jaccard similarity > %s for input "
                  % threshold, file_name, ":", candidates)
    return similar
def consolidate_dupes(self, agg_files):
    """Drop unusable files and near-duplicate files from ``agg_files``.

    ``agg_files`` maps a key to a dict whose ``'files'`` entry maps a file
    key to a dict that may carry ``'content_text'``. The mapping is
    modified in place:

    1. Files without ``'content_text'`` or with fewer than 100 characters
       of it are removed.
    2. The remaining texts are minhashed and indexed in a MinHashLSH.
    3. Files are visited in order of increasing text length; a file is
       removed when the index returns at least one other file that has
       not itself been removed.

    Returns:
        The same ``agg_files`` object.
    """
    # Step 1: remove missing and short items.
    for key, value in agg_files.items():
        files = value['files']
        for fkey in list(files):
            entry = files[fkey]
            if 'content_text' not in entry:
                print("Missing file:", key, fkey)
                files.pop(fkey)
            elif len(entry['content_text']) < 100:
                print("Removing short file: ", (key, fkey))
                files.pop(fkey)

    # Flat view: (outer key, file key) -> text of every surviving file.
    smap = {
        (key, fkey): fdata['content_text']
        for key, value in agg_files.items()
        for fkey, fdata in value['files'].items()
    }

    perms = 512
    gram_sz = 10
    thresh = 0.5
    lsh = MinHashLSH(threshold=thresh, num_perm=perms)

    # Step 2: hash in worker processes, index in this process.
    print("Loading word hashes")
    minhashes = {}
    with ProcessPoolExecutor(max_workers=10) as ex:
        print("Submitting jobs")
        futures = [
            (key, ex.submit(minhash_str, content, perms, gram_sz))
            for key, content in smap.items()
        ]
        print("Submitted %s jobs. Consuming futures" % len(futures))
        for key, future in tqdm.tqdm(futures, "Hashing"):
            minhash = future.result()
            lsh.insert(key, minhash)
            minhashes[key] = minhash

    # Group keys by text length so shorter texts are examined first.
    keys_by_len = {}
    for key, content in smap.items():
        keys_by_len.setdefault(len(content), []).append(key)

    # Step 3: eliminate duplicates.
    print("%s items in file map before dupe elimination" % len(smap))
    for clen in sorted(keys_by_len):
        for key in keys_by_len[clen]:
            if key not in smap or key not in minhashes:
                continue
            matches = lsh.query(minhashes[key])
            if key in matches:
                matches.remove(key)
            # Removed files stay in the index, so only matches that are
            # still present in smap count.
            if any(other in smap for other in matches):
                smap.pop(key)
                akey, fkey = key
                agg_files[akey]['files'].pop(fkey)

    print("%s items in file map after dupe elimination" % len(smap))
    return agg_files
def print_stats(
        f, show=None, skip_unique=False, max_int_value=5,
        duration_limit=None, print_duplicates=False, print_urls=False,
        limit=None):
    """Collect counters over the items read from ``f`` and print them.

    Args:
        f: source handed to ``item_reader`` and ``get_too_common_shingles``.
        show: if a counter key equals this value, the url of each item
            counted under it is printed.
        skip_unique: when true, the near-duplicate check is skipped
            entirely and no LSH index is built.
        max_int_value: integer metadata values at or above this are
            reported as ``"<max_int_value>+"``.
        duration_limit: stop reading once the difference between the
            first and the current timestamp, divided by 1000, exceeds it.
        print_duplicates: print the urls of up to 10 duplicates for every
            item that has any.
        print_urls: print the url of every item read.
        limit: passed to ``item_reader``.

    Returns:
        The ``Counter`` holding all collected statistics.
    """
    stats = Counter()
    check_unique = not skip_unique
    if check_unique:
        lsh = MinHashLSH(threshold=0.9, num_perm=128)
        too_common = get_too_common_shingles(f, limit=1000)
    urls = {}
    min_timestamp = None
    max_timestamp = None

    for i, item in enumerate(item_reader(f, limit=limit)):
        if print_urls:
            print(item['url'])

        content_type = item.get('content_type', 'missing')
        stats['content_type: ' + content_type] += 1
        stats['content_type[0]: ' + content_type.split('/')[0]] += 1

        if min_timestamp is None:
            min_timestamp = item['timestamp']
        max_timestamp = item['timestamp']
        elapsed = (max_timestamp - min_timestamp) / 1000
        if duration_limit and elapsed > duration_limit:
            break

        if 'extracted_text' not in item:
            # Items without extracted text are counted as documents only.
            assert item['obj_stored_url']
            stats['documents'] += 1
            continue
        stats['items'] += 1

        for key, value in item['extracted_metadata'].items():
            if key == 'forms':
                for form in value:
                    stats['form_{}'.format(form['form'])] += 1
                    stats.update(
                        'form_field {}'.format(field)
                        for field in form['fields'].values())
            # Lists are reported by their length.
            if isinstance(value, list):
                value = len(value)
            # Integers (but not booleans) become part of the key, capped
            # at max_int_value.
            if isinstance(value, int) and not isinstance(value, bool):
                if value >= max_int_value:
                    value = '{}+'.format(max_int_value)
                key = '{}_{}'.format(key, value)
            if value:
                stats[key] += 1
                if key == show:
                    print(item['url'])

        if check_unique:
            min_hash = get_min_hash(item['extracted_text'], too_common)
            duplicates = lsh.query(min_hash)
            if not duplicates:
                stats['unique_items'] += 1
            elif print_duplicates:
                print('{} {} duplicates: {}'.format(
                    item['url'], len(duplicates),
                    ' '.join(urls[k] for k in duplicates[:10])))
            item_key = 'item_{}'.format(i)
            lsh.insert(item_key, min_hash)
            urls[item_key] = item['url']

    if max_timestamp and min_timestamp:
        stats['duration'] = (max_timestamp - min_timestamp) / 1000
    for name, count in sorted(stats.items()):
        print(name.ljust(20), count)
    return stats
# Create one MinHash (128 permutations) per shingle set and feed it the
# UTF-8 encoded shingles of that set.
m = [MinHash(num_perm=128) for _ in range(len(allshingle))]
for i, signature in enumerate(m):
    for d in allshingle[i]:
        signature.update(d.encode('utf8'))

# Create a MinHashLSH index built for Jaccard threshold 1 that accepts
# MinHash objects with 128 permutation functions.
lsh = MinHashLSH(threshold=1, num_perm=128)

# Insert every MinHash into the index under the key "m<position>".
for i, signature in enumerate(m):
    lsh.insert("m%d" % i, signature)

# Keep the query result of every shingle set that matches more than 100
# indexed entries. The index is queried once per set and the result reused.
result = []
for signature in m:
    matches = lsh.query(signature)
    if len(matches) > 100:
        result.append(matches)

# Frequency (number of matches) of each retained shingle set.
index = [len(matches) for matches in result]
class NearDuplicate(EtlProcessor):
    """Detect duplicate and near-duplicate tweets in a raw tweet stream.

    Every incoming tweet is minhashed and looked up in a MinHashLSH index.
    Tweets without a match are forwarded through the connector and added
    to the index; tweets with matches get a near-duplicate count stored
    under ``tweet['minteressa']['nearduplicates']`` and are not forwarded.
    """

    # NOTE(review): this only matches one of ". , ; :" when it is followed
    # by a backslash and the text "xe2" -- confirm that is intended.
    punct = re.compile(r"[\.,;:]\\xe2", re.IGNORECASE)
    langs = {
        "es": "spanish",
        "en": "english"
    }
    # Number of messages handled so far; drives the periodic save().
    process_count = 0

    def __init__(
        self, connector=None, lang='en', threshold=0.8,
        permutations=90, autostart=True
    ):
        """Store the model parameters and optionally start processing.

        Args:
            connector: forwarded to ``EtlProcessor.__init__``.
            lang: language tag, used in the model file name.
            threshold: Jaccard threshold of the LSH index.
            permutations: number of MinHash permutations.
            autostart: when true, load the model and start ``listen()``
                right away; also forwarded to ``EtlProcessor.__init__``.
        """
        self.permutations = permutations
        self.threshold = threshold
        self.lang = lang
        self.connector = None
        self.lsh = None

        EtlProcessor.__init__(self, connector=connector, autostart=autostart)

        if autostart:
            self.load()
            self.listen()

    def _model_path(self):
        """Return the pickle path for the current lang/permutations/threshold."""
        return './minhash-%s--%d-%.2f.pkl' % (
            self.lang, self.permutations, self.threshold
        )

    def listen(self):
        """Consume messages and forward the tweets that are unique.

        Unique tweets are re-sent unchanged and a source/destination record
        is logged through the connector. If a ``ValueError`` is raised
        while handling a tweet, an error record is sent instead. The model
        is saved after every 1000 processed messages.
        """
        for msg in self.connector.listen():
            tweet = json.loads(msg.value())
            try:
                if self.is_unique(tweet):
                    self.connector.send(
                        msg.value()
                    )
                    self.connector.log(
                        json.dumps({
                            "id_str": tweet['id_str'],
                            "source": self.connector.consumer_topic,
                            "dest": self.connector.producer_topic
                        })
                    )
            except ValueError:
                self.connector.send(
                    json.dumps({
                        "id_str": tweet['id_str'],
                        "source": self.connector.consumer_topic,
                        "dest": "error"
                    })
                )
                continue
            finally:
                self.process_count += 1
                if self.process_count % 1000 == 0:
                    self.save()

    def load(self):
        """Load the model stored by a previous run, or create an empty one."""
        path = self._model_path()
        # Test for the very file save() writes; checking a differently
        # named file would mean the stored model is never picked up.
        if os.path.isfile(path):
            # NOTE: unpickling runs code embedded in the file; only load
            # model files written by this application.
            with open(path, 'rb') as model_file:
                self.lsh = pickle.load(model_file)
        else:
            self.lsh = MinHashLSH(
                threshold=self.threshold,
                num_perm=self.permutations
            )

    def save(self):
        """Store the current LSH index so a later run can load() it."""
        with open(self._model_path(), 'wb') as model_file:
            pickle.dump(self.lsh, model_file)

    def replace_urls(self, tweet):
        """Replace the URLs in the tweet text by their expanded versions.

        Each URL listed in ``tweet['entities']['urls']`` is cut out of
        ``tweet['text']`` and its ``expanded_url`` is appended at the end,
        so the same real URL is treated as such instead of being hidden
        behind a different t.co link in every tweet. The tweet is modified
        in place and returned.
        """
        removed_characters = 0
        if 'entities' in tweet and 'urls' in tweet['entities']:
            urls = tweet['entities']['urls']
            for url in urls:
                start, end = url['indices'][0], url['indices'][1]
                # NOTE(review): both cut positions are shifted one to the
                # left of the given indices; confirm this matches the
                # index convention of the incoming payload.
                tweet['text'] = (
                    tweet['text'][:(start - removed_characters - 1)] +
                    tweet['text'][(end - removed_characters - 1):]
                )
                # Later indices refer to the original text, so keep track
                # of how much has been cut so far.
                removed_characters += end - start

            for url in urls:
                tweet['text'] += ' ' + url['expanded_url']
        return tweet

    # maxsize must be an int: a float such as 1e06 is rejected by
    # lru_cache. The cache key includes self, so the cache also keeps
    # this instance alive.
    @lru_cache(maxsize=1000000)
    def minhash_tweet(self, tweet_text):
        """Return the MinHash of the whitespace-separated words of a text.

        Results are cached for up to 1M distinct texts to speed up the
        check when the same tweet text is seen again.
        """
        tweet_hash = MinHash(num_perm=self.permutations)
        for word in tweet_text.split():
            # Strip with the text pattern first and encode afterwards:
            # a str pattern cannot be applied to bytes.
            tweet_hash.update(
                self.punct.sub("", word).encode('utf8')
            )
        return tweet_hash

    def is_unique(self, tweet):
        """Tell whether the tweet has no near-duplicate in the index.

        A tweet without matches is inserted into the index under its
        ``id_str``. For a tweet with matches, the number of matches is
        added to ``tweet['minteressa']['nearduplicates']``.

        Returns:
            True when the tweet was inserted as unique; False when it has
            near-duplicates or when inserting it raised ``ValueError``.
        """
        urlfied_tweet = self.replace_urls(tweet)
        mht = self.minhash_tweet(urlfied_tweet['text'])
        if 'minteressa' not in tweet:
            tweet['minteressa'] = {}

        if self.lsh.is_empty():
            # Nothing to compare against yet: the first tweet is unique.
            self.lsh.insert(tweet['id_str'], mht)
            return True

        similars = self.lsh.query(mht)
        if similars:
            meta = tweet['minteressa']
            meta['nearduplicates'] = (
                meta.get('nearduplicates', 0) + len(similars)
            )
            return False

        try:
            self.lsh.insert(tweet['id_str'], mht)
        except ValueError as err:
            # Log the actual error instead of the exception class.
            logging.error(err)
            return False
        return True
def minhash_merger_series(interactive=True):
    """Print series whose alternate names look alike according to MinHash LSH.

    All series are fetched together with their alternate names. Every
    clean name of at least 5 characters is minhashed (512 permutations,
    3-grams) and indexed. Each name is then looked up, and matches that
    belong to a different series than the queried name are printed.

    Args:
        interactive: selects which callback is bound; when false, the
            match log is saved to ``./seriesname-matchset-minhash.json``
            at the end.
    """
    matchlogger = MatchLogBuilder()
    # NOTE(review): the callback is chosen here but never invoked below.
    callback = (askuser_callback_series
                if interactive else matchlogger.add_match_series)

    print("fetching series")
    with app.app_context():
        items = models.Series.query.options(
            joinedload(Series.alternatenames)
        ).all()

        # One row per alternate name:
        # (altname id, series id, clean name, series title)
        altn = [
            (name.id, name.series, name.cleanname, item.title)
            for item in items
            for name in item.alternatenames
        ]

        print("Building mapping dictionaries")
        # Map altname id to series id
        altnid_sid_dict = {row[0]: row[1] for row in altn}
        altnid_name_dict = {row[0]: row[2] for row in altn}
        sid_sname_dict = {row[1]: row[3] for row in altn}

        sid_altnid_dict = {}
        for nid, sid in altnid_sid_dict.items():
            sid_altnid_dict.setdefault(sid, []).append(nid)

        print("Have %s altnames for %s series" % (len(altnid_sid_dict), len(sid_altnid_dict)))

        perms = 512
        gram_sz = 3
        minhashes = {}
        lsh = MinHashLSH(threshold=SIMILARITY_RATIO, num_perm=perms)

        print("Building lsh minhash data structure")
        with ProcessPoolExecutor(max_workers=8) as ex:
            print("Submitting jobs")
            futures = [
                (key, ex.submit(minhash_str, content, perms, gram_sz))
                for key, content in altnid_name_dict.items()
                if len(content) >= 5
            ]
            print("Consuming futures")
            for key, future in tqdm.tqdm(futures):
                minhash = future.result()
                lsh.insert(key, minhash)
                minhashes[key] = minhash

        print("Doing search")
        for key, minhash in minhashes.items():
            result = lsh.query(minhash)
            if key in result:
                result.remove(key)
            if not result:
                continue

            src_sid = altnid_sid_dict[key]
            first_sid = altnid_sid_dict[result[0]]
            # Only continue when the first match is from another series.
            if first_sid == src_sid:
                continue

            # NOTE(review): the title is taken from the first match's
            # series but printed next to the source series id.
            sname = sid_sname_dict[first_sid]
            res_sids = {altnid_sid_dict[tmp] for tmp in result}

            names = sorted(
                (altnid_sid_dict[res_id], res_id, altnid_name_dict[res_id])
                for res_id in result
                if altnid_sid_dict[res_id] != src_sid
            )
            if names:
                print("Search returned %s results in %s series for %s:%s" % (len(result), len(res_sids), src_sid, sname))
                for match_sid, nid, name in names:
                    print(" %s -> %s: %s" % (str(match_sid).rjust(8), str(nid).rjust(8), name))

    if not interactive:
        matchlogger.save_log("./seriesname-matchset-minhash.json")