# Generate MinHash signatures for new texts and add them to the existing LSH
# model. Relies on `MinHash`, `lsh`, `minhash`, `n_gram`, `permutations`,
# `hash_bits` and `seed` being defined earlier in the file.
new_text = [
    'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
    'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',
]
new_labels = ['new_doc1', 'new_doc2']

# 1. Create MinHash signatures for the new texts, reusing the same parameters
#    as the existing model.
new_minhash = MinHash(
    new_text,
    n_gram=n_gram,
    permutations=permutations,
    hash_bits=hash_bits,
    seed=seed,
)

# 2. Update the LSH model with the new signatures and verify that the update
#    is reflected in the model's contents.
lsh.update(new_minhash, new_labels)
print(lsh.contains())

# Print the adjacency list of all docs.
print(lsh.adjacency_list())

# Print the edge list of all docs flagged as duplicates, e.g. to plot them in
# a text similarity graph.
print(lsh.edge_list())

# Remove a text and its label from the model. Removing a label that is not in
# the model raises KeyError.
lsh.remove(6)
print(lsh.contains())

# Shape of the (n x m) matrix of text signatures generated by the MinHash
# function (n = text rows, m = selected permutations). Printed explicitly
# because a bare expression is silently discarded outside a REPL/notebook.
print(minhash.signatures.shape)
def test_lsh_remove():
    """Check that `LSH.remove` drops a label and rejects an unknown one.

    After removing label 5, the keys of `_i_bucket` must be the other
    labels in order; removing label 11 must raise `KeyError`.
    """
    model = LSH(minhash, labels)
    model.remove(5)

    remaining_labels = list(model._i_bucket)
    assert remaining_labels == [1, 2, 3, 4, 6, 7, 8, 9]

    # 11 is not among the labels asserted above, so removal must fail.
    with pytest.raises(KeyError):
        model.remove(11)