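The tests below reference a module-level `minhash` signature object and a `labels` list that are defined elsewhere in the test module, along with the `MinHash` and `LSH` classes (here assumed to come from the snapy package). A minimal sketch of the assumed setup, using placeholder content and placeholder integer labels (the real fixture text is not shown in this document; the assertions below reference labels 1, 4 and 8, and a later example removes label 6):

from collections import defaultdict

import pytest
from snapy import MinHash, LSH  # assumed import; the source does not name the package

# Placeholder fixture: nine short documents with integer labels 1-9.
content = ['placeholder document number {} about Jupiter'.format(i) for i in range(1, 10)]
labels = list(range(1, 10))

# Default settings give 100 permutations, so an LSH model built from this
# signature uses 50 bands (as asserted in the tests below).
minhash = MinHash(content)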
def test_initialize_from_empty_lsh():
    lsh = LSH()
    assert lsh.no_of_bands is None
    assert lsh._buckets == defaultdict(list)
    assert lsh._i_bucket == defaultdict(list)
    assert lsh.permutations is None
    lsh.update(minhash, labels)
    assert list(lsh._i_bucket) == labels
    buckets = lsh._buckets
    assert buckets[4466445138223010106] == [1, 8]
    assert buckets[-3939654010681976230] == [1, 4, 8]
    assert lsh.permutations == 100
    assert lsh.no_of_bands == 50
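The `_buckets` assertions above check the LSH banding structure: each signature of 100 MinHash values is split into 50 bands, each band is hashed, and labels whose signatures share a band hash land in the same bucket. The sketch below illustrates that banding idea only; it is not the library's internal implementation, and its hash values will not match the literal bucket keys asserted above:

from collections import defaultdict


def band_signatures(signatures, labels, no_of_bands=50):
    # Illustrative banding step: split each signature into equal-width bands
    # and bucket labels by the hash of each band.
    buckets = defaultdict(list)
    rows = signatures.shape[1] // no_of_bands  # e.g. 100 permutations / 50 bands = 2 values per band
    for label, signature in zip(labels, signatures):
        for b in range(no_of_bands):
            band = tuple(signature[b * rows:(b + 1) * rows])
            buckets[hash(band)].append(label)
    return buckets


# Labels that share a bucket for any band are candidate near-duplicates,
# which is what assertions such as [1, 8] and [1, 4, 8] above are checking.
buckets = band_signatures(minhash.signatures, labels)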
def test_update_lsh():
    lsh = LSH(minhash, labels)
    with pytest.raises(ValueError):
        lsh.update(minhash, labels)
    new_content = [
        'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
        'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.'
    ]
    new_labels = [11, 12]
    incorrect_minhash = MinHash(new_content, permutations=10)
    with pytest.raises(ValueError):
        lsh.update(incorrect_minhash, new_labels)
    correct_minhash = MinHash(new_content)
    lsh.update(correct_minhash, new_labels)
    assert lsh.permutations == 100
    assert list(lsh._i_bucket) == labels + [11, 12]
# Generate MinHash signatures for new text, and add the new texts to the LSH model.
new_text = [
    'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
    'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',
]
new_labels = ['new_doc1', 'new_doc2']

# 1. Create MinHash signatures for the new text.
new_minhash = MinHash(
    new_text,
    n_gram=n_gram,
    permutations=permutations,
    hash_bits=hash_bits,
    seed=seed
)

# 2. Update the LSH model with the new signatures and verify the update is reflected.
lsh.update(new_minhash, new_labels)
print(lsh.contains())

# Print the adjacency list of all docs.
print(lsh.adjacency_list())

# Print the edge list of all docs flagged as duplicates, to plot in a text similarity graph.
print(lsh.edge_list())

# Remove a text and its label from the model (an error is raised if the label is not present).
lsh.remove(6)
print(lsh.contains())

# Get the matrix (n x m) of text signatures generated by the MinHash function
# (n = number of texts, m = number of permutations).
minhash.signatures.shape
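Since `edge_list()` returns pairs of labels flagged as near-duplicates, those pairs can be fed straight into a graph library to draw the text similarity graph mentioned above. A minimal sketch, assuming the optional networkx package is installed and the default (unweighted) edge list is used:

import networkx as nx

# Build a similarity graph from the duplicate pairs returned by edge_list().
graph = nx.Graph()
graph.add_edges_from(lsh.edge_list())

# Each connected component is a cluster of near-duplicate documents.
print(list(nx.connected_components(graph)))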