def test_minhash_errors():
    """Each invalid MinHash constructor argument must raise ValueError."""
    invalid_kwargs = (
        {'n_gram_type': 'words'},
        {'hash_bits': 65},
        {'method': 'universal'},
        {'n_gram': 63},
    )
    for kwargs in invalid_kwargs:
        with pytest.raises(ValueError):
            MinHash(content, **kwargs)
def create_lsh(self, content, no_of_bands, n_permutations, n_gram):
    """Create Minhash and Locality Sensitive Hashing (LSH) to detect near duplicate texts.

    Args:
        content (list): List with string to build LSH.
        no_of_bands (int): Number of bands to break minhash signature into before hashing into buckets.
        n_permutations (int): Number of permutations used to create minhash signatures used in LSH model.
        n_gram (int): Size of each overlapping text shingle to break text into prior to hashing.

    Returns:
        class 'snapy.lsh.LSH': Snapy LSH object.
    """
    # One integer label per input text, matching its index in `content`.
    labels = range(len(content))
    # Create MinHash object.
    minhash = MinHash(content, n_gram=n_gram, permutations=n_permutations, hash_bits=64, seed=SEED)
    # Create LSH model.
    lsh = LSH(minhash, labels, no_of_bands=no_of_bands)
    return lsh
def test_update_lsh():
    """LSH.update rejects duplicate labels and mismatched permutation counts."""
    lsh = LSH(minhash, labels)
    # Labels already present in the model cannot be re-added.
    with pytest.raises(ValueError):
        lsh.update(minhash, labels)
    extra_texts = [
        'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
        'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.'
    ]
    extra_labels = [11, 12]
    # A MinHash built with a different permutation count cannot extend the model.
    mismatched = MinHash(extra_texts, permutations=10)
    with pytest.raises(ValueError):
        lsh.update(mismatched, extra_labels)
    compatible = MinHash(extra_texts)
    lsh.update(compatible, extra_labels)
    assert lsh.permutations == 100
    assert list(lsh._i_bucket) == labels + extra_labels
def find_near_duplicate(dataset, query, targets, labels, min_jaccard_value, no_of_bands, n_permutations, n_gram, n_gram_type='char'):
    """Using LSH object finds the near duplicate strings.

    Args:
        dataset: Dataset identifier, echoed back unchanged in the result dict.
        query: Unused in this body; the query label is taken as labels[0] — TODO confirm intent.
        targets (list): Target strings to hash and index for near-duplicate search.
        labels (list): Labels for `targets`; labels[0] is used as the query label.
        min_jaccard_value (float): Minimum value for the Jaccard Distance.
        no_of_bands (int): Number of bands to break minhash signature into before hashing into buckets.
        n_permutations (int): Number of permutations used to create minhash signatures used in LSH model.
        n_gram (int): Size of each overlapping text shingle to break text into prior to hashing.
        n_gram_type (str): Type of shingle used when hashing (default 'char').

    Returns:
        dict: Keys 'dataset', 'query' (labels[0]) and 'duplicates' (space-joined query results).
    """
    # Debug aid: report any target shorter than the shingle size.
    for i in targets:
        if len(i) < n_gram:
            pp(i)
    # Create MinHash object.
    minhash = MinHash(targets, n_gram=n_gram, n_gram_type=n_gram_type, permutations=n_permutations, hash_bits=64, seed=SEED)
    # Create LSH model.
    lsh = LSH(minhash, labels, no_of_bands=no_of_bands)
    # Query to find near duplicates the string in `search`
    closest_results = lsh.query(labels[0], min_jaccard=min_jaccard_value)
    # print("QUERY: {}".format(labels[0]))
    # pp(closest_results)
    return {"dataset": dataset, "query": labels[0], "duplicates": ' '.join(closest_results)}
def find_adjacency(draws):
    """Print draws whose joined number strings are near duplicates of each other.

    Args:
        draws (dict): Mapping of draw label -> iterable of drawn numbers.

    Side effects:
        Prints each label whose LSH adjacency list (min Jaccard 0.5) is non-empty.
        The first draw is indexed a second time under sentinel label 99999 so it
        is guaranteed to have at least one match.
    """
    draws_nos = list(draws.values())
    labels = []
    content = []
    # One text per draw: its numbers joined into a single space-separated string.
    for label, numbers in zip(draws, draws_nos):
        labels.append(label)
        content.append(" ".join(map(str, numbers)))
    # Re-add the first draw under a sentinel label, as in the original design.
    labels.append(99999)
    content.append(" ".join(map(str, draws_nos[0])))
    minhash = MinHash(content, n_gram=9, permutations=500, hash_bits=64, seed=3)
    lsh = LSH(minhash, labels, no_of_bands=100)
    adjacency_list = lsh.adjacency_list(min_jaccard=0.5)
    for key, value in adjacency_list.items():
        if len(value) > 0:
            print(key, value)
def test_terms_minhash():
    """Term-based shingling with an explicit seed yields the expected signature matrix."""
    mh = MinHash(content, n_gram_type='term', seed=seed)
    assert mh.n_gram_type == 'term'
    sigs = mh.signatures
    assert type(sigs) is np.ndarray
    assert sigs.shape == (9, 100)
    # Corner values pin the signature content for this seed.
    assert sigs[0][0] == -8115786556000760185
    assert np.array(sigs[0][0]).dtype == 'int64'
    assert sigs[-1][-1] == -579511180950999701
def multi_hash_tests(first_hash, second_hash, hash_size):
    """Check multi_hash signatures at a given hash size against expected corner values."""
    mh = MinHash(content, hash_bits=hash_size, seed=seed)
    assert mh.seed == 3
    assert mh.method == 'multi_hash'
    sigs = mh.signatures
    assert type(sigs) is np.ndarray
    assert sigs.shape == (9, 100)
    # First and last signature entries pin the matrix content.
    assert sigs[0][0] == first_hash
    assert sigs[-1][-1] == second_hash
def test_terms_minhash():
    """Term-based shingling with the default seed yields the expected signature matrix."""
    mh = MinHash(content, n_gram_type='term')
    assert mh.n_gram_type == 'term'
    sigs = mh.signatures
    assert type(sigs) is np.ndarray
    assert sigs.shape == (9, 100)
    # Corner values pin the signature content for the default seed.
    assert sigs[0][0] == -3695830800917301951
    assert np.array(sigs[0][0]).dtype == 'int64'
    assert sigs[-1][-1] == -7737361925742332862
def k_smallest_hash_tests(first_hash, second_hash, hash_size):
    """Check k_smallest_values signatures and the permutations upper bound."""
    mh = MinHash(content, permutations=53, hash_bits=hash_size,
                 method='k_smallest_values', seed=seed)
    assert mh._hash_seeds == 83957611
    assert mh.method == 'k_smallest_values'
    sigs = mh.signatures
    assert type(sigs) is np.ndarray
    assert sigs.shape == (9, 53)
    # First and last signature entries pin the matrix content.
    assert sigs[0][0] == first_hash
    assert sigs[-1][-1] == second_hash
    # permutations=200 must be rejected for this method and input.
    with pytest.raises(ValueError):
        MinHash(content, permutations=200, hash_bits=hash_size,
                method='k_smallest_values', seed=seed)
def test_minhash_defaults():
    """MinHash built with no keyword arguments exposes the documented defaults."""
    mh = MinHash(content)
    assert type(mh.signatures) is np.ndarray
    assert mh.signatures.shape == (9, 100)
    expected_defaults = {
        'n_gram': 9,
        'n_gram_type': 'char',
        'permutations': 100,
        'hash_bits': 64,
        'method': 'multi_hash',
    }
    for attr, expected in expected_defaults.items():
        assert getattr(mh, attr) == expected
    assert mh._hash_seeds.shape[0] == 100
def get_lsh_model(documents, seed_int):
    """Build an LSH model over `documents`.

    Args:
        documents (list): Strings to hash and index.
        seed_int (int): Seed for the MinHash hash functions.

    Returns:
        snapy.lsh.LSH: LSH model labelled 0..len(documents)-1.
    """
    # Fix: the original body referenced undefined names `my_documents` and
    # `labels` instead of using the `documents` parameter.
    labels = range(len(documents))
    # Create MinHash object.
    minhash = MinHash(documents, n_gram=9, permutations=100, hash_bits=64, seed=seed_int)
    # Create LSH model.
    lsh = LSH(minhash, labels, no_of_bands=50)
    return lsh
def _create_lsh(
    text: List[str],
    labels: List[int],
    n_gram: int,
    n_permutations: int,
    hash_bits: int,
    no_of_bands: int,
) -> LSH:
    """Build a `snapy.lsh.LSH` object from `text` for near-duplicate detection."""
    signatures = MinHash(
        text,
        n_gram=n_gram,
        permutations=n_permutations,
        hash_bits=hash_bits,
        seed=SEED,
    )
    model = LSH(signatures, labels, no_of_bands=no_of_bands)
    typer.secho(
        f"{HASHING} Hashed the normalized text using Locality-Sensitive Hashing (LSH).",
        bold=True,
    )
    return model
def identify_dublicates(self, ctnt_to_dedup):
    """Return candidate near-duplicate groups within `ctnt_to_dedup`.

    Args:
        ctnt_to_dedup (list): Strings to deduplicate.

    Returns:
        Adjacency list mapping each index in `ctnt_to_dedup` to the indices
        of its near-duplicate candidates (min Jaccard = self.j_thresh).
    """
    # Label each text by its position in the input list.
    # Idiom fix: list(range(n)) instead of [i for i in range(n)].
    _ix = list(range(len(ctnt_to_dedup)))
    _mn_hash = MinHash(ctnt_to_dedup, n_gram=self.n_gram, seed=self.seed)
    _lsh = LSH(_mn_hash, _ix, no_of_bands=self.lsh_bands)
    candidates = _lsh.adjacency_list(min_jaccard=self.j_thresh)
    return candidates
# Fixture data shared by the LSH tests: nine texts about Jupiter, several of
# which are near-duplicates of each other, with one integer label per text.
# NOTE: the exact bytes of these strings matter — the bucket-hash assertions
# below depend on them.
labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
content = [
    'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
    'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',
    'A helium atom has about four times as much mass as a hydrogen atom, so the composition changes '
    'when described as the proportion of mass contributed by different atoms.',
    'Jupiter is primarily composed of hydrogen and a quarter of its mass being helium',
    'A helium atom has about four times as much mass as a hydrogen atom and the composition changes '
    'when described as a proportion of mass contributed by different atoms.',
    'Theoretical models indicate that if Jupiter had much more mass than it does at present, it would shrink.',
    'This process causes Jupiter to shrink by about 2 cm each year.',
    'Jupiter is mostly composed of hydrogen with a quarter of its mass being helium',
    'The Great Red Spot is large enough to accommodate Earth within its boundaries.'
]
minhash = MinHash(content, seed=3)


def test_initialize_from_empty_lsh():
    """An LSH created with no arguments starts empty and is populated via update()."""
    lsh = LSH()
    # Fresh model: no bands, no buckets, no permutation count yet.
    assert lsh.no_of_bands is None
    assert lsh._buckets == defaultdict(list)
    assert lsh._i_bucket == defaultdict(list)
    assert lsh.permutations is None
    lsh.update(minhash, labels)
    assert list(lsh._i_bucket) == labels
    buckets = lsh._buckets
    # Specific bucket hashes expected for seed=3 and the fixture texts above.
    assert buckets[4466445138223010106] == [1, 8]
    assert buckets[-3939654010681976230] == [1, 4, 8]
    # Defaults inherited from the MinHash object / LSH constructor.
    assert lsh.permutations == 100
    assert lsh.no_of_bands == 50
# Script: index assembled contigs with MinHash/LSH and print near-duplicate
# queries for the first few contigs.
from snapy import MinHash, LSH
import numpy as np
from fasta_parser import parse_to_list

# Assembly output to scan for near-duplicate contigs.
contigs_file = "../contigs-outputs/basic_k-mer24/basic_try_k-mer24.contigs.fa"
print("parsing contigs to list...")
contigs_list, num_contigs = parse_to_list(contigs_file)
print("number of contigs (shorter then 1,00bp):", num_contigs)
# One integer label per contig, matching its index in contigs_list.
labels = np.arange(num_contigs).tolist()
# Create MinHash object.
# n_gram=24 presumably matches the k-mer size in the filename — confirm.
print("creating minhash object...")
minhash = MinHash(contigs_list, n_gram=24)
# Create LSH model.
print("creating LSH object...")
lsh = LSH(minhash, labels)
print("query object:")
print("similar strings to the string in index 0:", lsh.query(0))
print("similar strings to the string in index 1:", lsh.query(1))
print("similar strings to the string in index 2:", lsh.query(2))
print("similar strings to the string in index 3:", lsh.query(3))
print("similar strings to the string in index 4:", lsh.query(4))
print("similar strings to the string in index 5:", lsh.query(5))
# Dump all labels currently held by the model.
print(lsh.contains())
def CreateLsh(self, InContext):
    """Build the instance's LSH model from `InContext` and record the adjacency-list size."""
    # Labels come from the original input length, before transformation.
    labels = range(len(InContext))
    InContext = self.Transform(InContext)
    signatures = MinHash(
        InContext,
        n_gram=self.nGram,
        permutations=self.PrenutNum,
        hash_bits=64,
        seed=self.Seed,
    )
    self.Lsh = LSH(signatures, labels, no_of_bands=self.BandNum)
    self.MaxIndex = len(self.Lsh.adjacency_list(min_jaccard=self.MinJaccard))
seed = 3 #size of each overlapping text shingle to break text into prior to hashing n_gram = 9 #number of randomly sampled hash values to use for generating each texts minhash signature (larger = more accurate & slower) permutations = 100 #hash value size to be used to generate minhash signitures from shingles (32,64, or 128 bit). #NOTE: should be chosen based on text length and a trade off between performance ad accuracy hash_bits = 64 # Create MinHash object. minhash = MinHash(content, n_gram=n_gram, permutations=permutations, hash_bits=hash_bits, seed=seed) # Create LSH model. lsh = LSH(minhash, labels, no_of_bands=50) #query to find near duplicates for text 1 print(lsh.query(1, min_jaccard=.5)) #update model #generate minhash aignitures for new text, and add new texts to LSH model new_text = [ 'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium', 'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.', ]
def test_string_input_minhash():
    """A bare string input (rather than a list) still produces a signature matrix."""
    mh = MinHash(content[0])
    sigs = mh.signatures
    assert type(sigs) is np.ndarray
    assert sigs.shape == (1, 100)