def test_lsh_edge_list():
    """Exercise ``LSH.edge_list`` with its default and keyword arguments.

    Builds an LSH model from the module-level ``minhash`` and ``labels``
    fixtures, checks that ``sensitivity=101`` is rejected with
    ``ValueError``, then compares the returned edge list against the
    expected pairs for each combination of keyword arguments.
    """
    model = LSH(minhash, labels)

    # sensitivity=101 is expected to be refused by edge_list.
    with pytest.raises(ValueError):
        model.edge_list(sensitivity=101)

    # (keyword arguments, expected edge list), checked in this order.
    cases = (
        ({}, [(8, 1), (8, 4), (5, 3), (4, 1)]),
        ({'sensitivity': 20}, [(8, 1), (5, 3), (4, 1)]),
        ({'min_jaccard': 0.7}, []),
        ({'min_jaccard': 0.6}, [(5, 3)]),
        # With jaccard_weighted=True each edge carries a third element.
        (
            {'jaccard_weighted': True, 'min_jaccard': 0.55},
            [(5, 3, 0.6), (4, 1, 0.58)],
        ),
    )
    for kwargs, expected_edges in cases:
        assert model.edge_list(**kwargs) == expected_edges
# Generate MinHash signatures for new texts and add them to the LSH model.
new_labels = ['new_doc1', 'new_doc2']
new_text = [
    'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
    'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',
]

# 1. Create MinHash signatures for the new texts, reusing the existing
#    n_gram / permutations / hash_bits / seed settings.
new_minhash = MinHash(
    new_text,
    n_gram=n_gram,
    permutations=permutations,
    hash_bits=hash_bits,
    seed=seed,
)

# 2. Update the LSH model with the new signatures, then print what the
#    model contains to verify the update is reflected.
lsh.update(new_minhash, new_labels)
print(lsh.contains())

# Print the adjacency list of all documents.
print(lsh.adjacency_list())

# Print the edge list of all documents flagged as duplicates, e.g. to plot
# in a text-similarity graph.
print(lsh.edge_list())

# Remove a text and its label from the model (per the original note, an
# error is returned if the label is not in the model).
lsh.remove(6)
print(lsh.contains())

# Matrix (n x m) of text signatures generated by the MinHash function
# (n = text rows, m = selected permutations).
# NOTE(review): this is a bare expression; its value is only shown in an
# interactive session/notebook and is discarded when run as a script.
minhash.signatures.shape