class LshCluster():
    """Cluster near-duplicate texts with locality-sensitive hashing.

    Thin wrapper around the snapy-style ``MinHash``/``LSH`` objects (assumed
    importable in the enclosing module).  Texts are normalized, MinHashed,
    and indexed at construction time; ``QuerySimilars`` then returns the
    labels of documents similar to a given index.
    """

    def __init__(self, Content, nGram=3, PrenutNum=16, BandNum=8, MinJaccard=0.2):
        """Build the LSH index from ``Content`` immediately.

        Content    -- iterable of strings to index.
        nGram      -- shingle size used by MinHash.
        PrenutNum  -- number of MinHash permutations.
        BandNum    -- number of LSH bands.
        MinJaccard -- Jaccard-similarity threshold for queries.
        """
        self.nGram = nGram
        self.BandNum = BandNum
        self.PrenutNum = PrenutNum
        self.MinJaccard = MinJaccard
        # Fixed seed keeps the MinHash signatures reproducible across runs.
        self.Seed = 3
        self.CreateLsh(Content)

    def Transform(self, Contexts):
        """Normalize each text: strip outer whitespace, drop '_', lowercase."""
        return [ctx.strip().replace("_", "").lower() for ctx in Contexts]

    def CreateLsh(self, InContext):
        """(Re)build MinHash signatures and the banded LSH index over ``InContext``."""
        # Concrete list of integer labels, not a lazy range object.
        Labels = list(range(len(InContext)))
        InContext = self.Transform(InContext)
        Hash = MinHash(InContext, n_gram=self.nGram, permutations=self.PrenutNum,
                       hash_bits=64, seed=self.Seed)
        self.Lsh = LSH(Hash, Labels, no_of_bands=self.BandNum)
        # adjacency_list yields one entry per indexed document, so its length
        # bounds the valid query indices.
        self.MaxIndex = len(self.Lsh.adjacency_list(min_jaccard=self.MinJaccard))

    def QuerySimilars(self, Index):
        """Return labels similar to document ``Index`` ([] if out of range)."""
        if Index >= self.MaxIndex:
            return []
        return self.Lsh.query(Index, min_jaccard=self.MinJaccard)
def find_adjacency(draws):
    """Print every draw whose LSH adjacency list is non-empty.

    draws -- mapping of draw label -> sequence of numbers; each sequence is
    serialized to a space-separated string before MinHashing.

    The first draw's content is appended a second time under the sentinel
    label 99999 so the index is guaranteed to contain at least one exact
    duplicate pair.
    """
    draws_idx = list(draws)
    draws_nos = list(draws.values())

    new_labels = list(draws_idx)
    new_content = [" ".join(map(str, nos)) for nos in draws_nos]

    # Sentinel copy of the first draw: a guaranteed duplicate.
    new_labels.append(99999)
    new_content.append(" ".join(map(str, draws_nos[0])))

    minhash = MinHash(new_content, n_gram=9, permutations=500, hash_bits=64, seed=3)
    lsh = LSH(minhash, new_labels, no_of_bands=100)
    adjacency_list = lsh.adjacency_list(min_jaccard=0.5)
    for key, value in adjacency_list.items():
        if value:
            print(key, value)
def test_lsh_adjacency_list():
    """adjacency_list honors sensitivity / min_jaccard and rejects bad values."""
    lsh = LSH(minhash, labels)

    # A sensitivity exceeding the band count must raise.
    with pytest.raises(ValueError):
        lsh.adjacency_list(sensitivity=1000)

    expected_at_sensitivity_2 = {
        1: [8, 4], 2: [], 3: [5], 4: [1, 8], 5: [3],
        6: [], 7: [], 8: [1, 4], 9: [],
    }
    assert lsh.adjacency_list(sensitivity=2) == expected_at_sensitivity_2

    expected_at_jaccard_06 = {
        1: [], 2: [], 3: [5], 4: [], 5: [3],
        6: [], 7: [], 8: [], 9: [],
    }
    assert lsh.adjacency_list(min_jaccard=0.6) == expected_at_jaccard_06

    # Defaults behave like sensitivity=2 on this fixture.
    assert lsh.adjacency_list() == expected_at_sensitivity_2
def _get_duplicate_ids(text: List[str], lsh: LSH, min_jaccard: float) -> Iterable[str]:
    """Uses the given `lsh` object to find near duplicate text in `text`.

    Returns a list of indices into `text` which point to duplicate texts.
    """
    adjacency_list = lsh.adjacency_list(min_jaccard=min_jaccard)
    duplicate_ids = set()
    with typer.progressbar(adjacency_list.items(), label="Deduplicating text") as progress:
        for query_id, similar_ids in progress:
            # Only expand queries not already marked as duplicates of an
            # earlier query, so one representative per group survives.
            if query_id not in duplicate_ids:
                duplicate_ids.update(similar_ids)
    typer.secho(
        f"{SEARCH} Found a total of {len(duplicate_ids)} duplicate texts.",
        bold=True,
    )
    return list(duplicate_ids)
#generate minhash aignitures for new text, and add new texts to LSH model new_text = [ 'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium', 'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.', ] new_labels = ['new_doc1', 'new_doc2'] #1.create minhash signitues for new text new_minhash = MinHash(new_text, n_gram=n_gram, permutations=permutations, hash_bits=hash_bits, seed=seed) #2.update lsh model with new hash signitures and verify lsh model updates reflected lsh.update(new_minhash, new_labels) print(lsh.contains()) #print the adjacency_list of all docs print(lsh.adjacency_list()) #print the edge list of all docs that are flagged as duplicates to plot in text similarity graph print(lsh.edge_list()) #remove text and label from model (if its not there , you will get an error returned) lsh.remove(6) print(lsh.contains()) #get matrix(n*m) of text signatures generated by minhash function (n=text row, m=selected permutations) minhash.signatures.shape
def identify_dublicates(self, ctnt_to_dedup):
    """Return near-duplicate candidates for ``ctnt_to_dedup`` via LSH.

    ctnt_to_dedup -- list of strings to deduplicate.

    Returns the LSH adjacency list: a mapping from each positional index in
    ``ctnt_to_dedup`` to the indices of its neighbours whose estimated
    Jaccard similarity is at least ``self.j_thresh``.
    """
    # Positional indices serve as the LSH labels.
    _ix = list(range(len(ctnt_to_dedup)))
    _mn_hash = MinHash(ctnt_to_dedup, n_gram=self.n_gram, seed=self.seed)
    _lsh = LSH(_mn_hash, _ix, no_of_bands=self.lsh_bands)
    candidates = _lsh.adjacency_list(min_jaccard=self.j_thresh)
    return candidates