class LshCluster():
    """Near-duplicate lookup over a list of texts using MinHash signatures and LSH.

    The texts are normalised with `Transform`, hashed with `MinHash`, and
    indexed in an `LSH` model whose labels are the positions of the texts in
    the input (0, 1, 2, ...).

    Args:
        Content: Sequence of strings to index.
        nGram: Shingle size, passed to MinHash as `n_gram`.
        PrenutNum: Passed to MinHash as `permutations`.
        BandNum: Passed to LSH as `no_of_bands`.
        MinJaccard: `min_jaccard` threshold used for every query.
    """

    def __init__(self, Content, nGram=3, PrenutNum=16, BandNum=8, MinJaccard=0.2):
        self.nGram = nGram
        self.BandNum = BandNum
        self.PrenutNum = PrenutNum
        self.MinJaccard = MinJaccard
        # Fixed seed value handed to MinHash.
        self.Seed = 3
        self.CreateLsh(Content)

    def Transform(self, Contexts):
        """Return a new list with each string stripped, underscores removed, lower-cased.

        The order of operations matters: surrounding whitespace is stripped
        *before* underscores are removed, so whitespace exposed by removing an
        underscore is kept.
        """
        return [ctx.strip().replace("_", "").lower() for ctx in Contexts]

    def CreateLsh(self, InContext):
        """Build `self.Lsh` from `InContext` and record `self.MaxIndex`."""
        # Labels are the positional indices of the input texts.
        Labels = range(len(InContext))
        InContext = self.Transform(InContext)
        Hash = MinHash(InContext, n_gram=self.nGram, permutations=self.PrenutNum,
                       hash_bits=64, seed=self.Seed)
        self.Lsh = LSH(Hash, Labels, no_of_bands=self.BandNum)
        # NOTE(review): this builds the whole adjacency list only to count its
        # entries; presumably that count equals len(InContext) -- confirm
        # against the LSH implementation before simplifying.
        self.MaxIndex = len(self.Lsh.adjacency_list(min_jaccard=self.MinJaccard))

    def QuerySimilars(self, Index):
        """Return the labels similar to the text at position `Index`.

        Returns an empty list when `Index` lies outside `[0, MaxIndex)`.
        Negative indices are treated as out of range as well, instead of being
        forwarded to the LSH model.
        """
        if not 0 <= Index < self.MaxIndex:
            return []
        return self.Lsh.query(Index, min_jaccard=self.MinJaccard)
def find_near_duplicate(dataset, query, targets, labels, min_jaccard_value,
                        no_of_bands, n_permutations, n_gram,
                        n_gram_type='char'):
    """Find near-duplicates of the first labelled string using MinHash + LSH.

    Args:
        dataset: Value echoed back under the "dataset" key of the result.
        query: Not used by this function; kept so existing callers still work.
        targets: Texts to hash; passed to MinHash unchanged.
        labels: Labels passed to LSH alongside `targets`. `labels[0]` is the
            label that gets queried.
        min_jaccard_value (float): Passed to the LSH query as `min_jaccard`.
        no_of_bands (int): Number of bands to break minhash signature into
            before hashing into buckets.
        n_permutations (int): Number of permutations used to create minhash
            signatures used in LSH model.
        n_gram (int): Size of each overlapping text shingle to break text into
            prior to hashing.
        n_gram_type (str): Passed to MinHash as `n_gram_type`.

    Returns:
        dict: `{"dataset": dataset, "query": labels[0], "duplicates": ...}`
        where "duplicates" is the space-separated list of labels returned by
        the query.
    """
    # Print every target whose length is below the shingle size.
    # NOTE(review): presumably a diagnostic for inputs too short to shingle;
    # the length is measured in characters whatever `n_gram_type` is.
    for text in targets:
        if len(text) < n_gram:
            pp(text)

    # Create MinHash object.
    minhash = MinHash(targets, n_gram=n_gram, n_gram_type=n_gram_type,
                      permutations=n_permutations, hash_bits=64, seed=SEED)

    # Create LSH model.
    lsh = LSH(minhash, labels, no_of_bands=no_of_bands)

    # Query the model for near duplicates of the first label.
    query_label = labels[0]
    closest_results = lsh.query(query_label, min_jaccard=min_jaccard_value)

    return {
        "dataset": dataset,
        "query": query_label,
        # Labels may not be strings (e.g. integer ids); stringify before joining
        # so the join cannot raise TypeError.
        "duplicates": ' '.join(str(label) for label in closest_results),
    }
def test_lsh_query():
    """Check `LSH.query` error cases and results for label 1.

    Uses the module-level `minhash` and `labels` fixtures.
    """
    model = LSH(minhash, labels)

    # Labels for which the query is expected to raise KeyError
    # (presumably absent from `labels` -- defined outside this function).
    for rejected_label in (10, 0):
        with pytest.raises(KeyError):
            model.query(rejected_label)

    # This sensitivity value is expected to be rejected with ValueError.
    with pytest.raises(ValueError):
        model.query(2, sensitivity=100)

    # (keyword arguments for the query, expected result) for label 1.
    expectations = (
        ({}, [8, 4]),
        ({"sensitivity": 29}, [4]),
        ({"min_jaccard": 0.55}, [4]),
    )
    for options, expected in expectations:
        result = model.query(1, **options)
        assert result == expected
#hash value size to be used to generate minhash signatures from shingles (32,64, or 128 bit). #NOTE: should be chosen based on text length and a trade off between performance and accuracy hash_bits = 64 # Create MinHash object. minhash = MinHash(content, n_gram=n_gram, permutations=permutations, hash_bits=hash_bits, seed=seed) # Create LSH model. lsh = LSH(minhash, labels, no_of_bands=50) #query to find near duplicates for text 1 print(lsh.query(1, min_jaccard=.5)) #update model #generate minhash signatures for new text, and add new texts to LSH model new_text = [ 'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium', 'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.', ] new_labels = ['new_doc1', 'new_doc2'] #1.create minhash signatures for new text new_minhash = MinHash(new_text, n_gram=n_gram, permutations=permutations, hash_bits=hash_bits,
# Script: parse contigs from a FASTA file, build a MinHash/LSH model over
# them, and print the near-duplicates of the first few contigs.
import numpy as np
from snapy import MinHash, LSH

from fasta_parser import parse_to_list

contigs_file = "../contigs-outputs/basic_k-mer24/basic_try_k-mer24.contigs.fa"

print("parsing contigs to list...")
contigs_list, num_contigs = parse_to_list(contigs_file)
print("number of contigs (shorter then 1,00bp):", num_contigs)

# Labels are the positional indices 0 .. num_contigs - 1.
labels = np.arange(num_contigs).tolist()

# Create MinHash object.
print("creating minhash object...")
minhash = MinHash(contigs_list, n_gram=24)

# Create LSH model.
print("creating LSH object...")
lsh = LSH(minhash, labels)

print("query object:")
# Query the first six labels; slicing keeps the script from asking for labels
# that do not exist when the input holds fewer than six contigs.
for label in labels[:6]:
    print("similar strings to the string in index {}:".format(label),
          lsh.query(label))

print(lsh.contains())