示例#1
0
def test_lsh_edge_list():
    """Check ``LSH.edge_list`` with default, sensitivity, and Jaccard options.

    Builds an ``LSH`` model from the module-level ``minhash`` and ``labels``
    fixtures and compares the returned edge lists against fixed expectations.
    """
    model = LSH(minhash, labels)

    # sensitivity=101 is expected to be rejected with a ValueError.
    with pytest.raises(ValueError):
        model.edge_list(sensitivity=101)

    # Default arguments.
    default_edges = model.edge_list()
    assert default_edges == [(8, 1), (8, 4), (5, 3), (4, 1)]

    # Explicit sensitivity value.
    sensitive_edges = model.edge_list(sensitivity=20)
    assert sensitive_edges == [(8, 1), (5, 3), (4, 1)]

    # Minimum Jaccard thresholds: 0.7 yields no edges, 0.6 yields one.
    strict_edges = model.edge_list(min_jaccard=0.7)
    assert strict_edges == []
    relaxed_edges = model.edge_list(min_jaccard=0.6)
    assert relaxed_edges == [(5, 3)]

    # With jaccard_weighted=True each edge carries a third, weight element.
    weighted_edges = model.edge_list(jaccard_weighted=True, min_jaccard=0.55)
    assert weighted_edges == [(5, 3, 0.6), (4, 1, 0.58)]
示例#2
0
# Generate MinHash signatures for new texts and add those texts to the
# existing LSH model.
# NOTE: `lsh`, `minhash`, `MinHash`, `n_gram`, `permutations`, `hash_bits`
# and `seed` are not defined in this snippet; they must already be in scope.
new_text = [
    'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
    'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',
]

# One label per entry in `new_text`, in the same order.
new_labels = ['new_doc1', 'new_doc2']

# 1. Create MinHash signatures for the new texts, reusing the parameter
#    values (`n_gram`, `permutations`, `hash_bits`, `seed`) already in scope.
new_minhash = MinHash(new_text,
                      n_gram=n_gram,
                      permutations=permutations,
                      hash_bits=hash_bits,
                      seed=seed)

# 2. Update the LSH model with the new signatures and labels, then print
#    `lsh.contains()` to check that the update is reflected.
lsh.update(new_minhash, new_labels)
print(lsh.contains())

# Print the adjacency list of all documents in the model.
print(lsh.adjacency_list())

# Print the edge list of documents flagged as duplicates, e.g. for plotting
# a text-similarity graph.
print(lsh.edge_list())

# Remove the text with label 6 from the model, then print the contents again.
# Per the original author's note, removing a label that is not in the model
# produces an error -- not verifiable from this snippet; confirm against LSH.
lsh.remove(6)
print(lsh.contains())

# Shape of the signature matrix produced by MinHash; per the original note it
# is n x m (n = text rows, m = selected permutations).
# NOTE(review): this is a bare expression on `minhash` (not `new_minhash`);
# its value is only displayed in a REPL/notebook. Wrap it in print() if this
# is run as a script.
minhash.signatures.shape