Пример #1
0
def test_initialize_from_empty_lsh():
    """Check that an argument-less ``LSH`` starts empty and is filled by ``update``.

    ``minhash`` and ``labels`` are defined outside this function
    (presumably module-level fixtures -- confirm against the test module).
    """
    model = LSH()

    # Freshly constructed model: nothing configured, nothing stored.
    assert model.no_of_bands is None
    empty_index = defaultdict(list)
    assert model._buckets == empty_index
    assert model._i_bucket == empty_index
    assert model.permutations is None

    # Feeding signatures and labels populates the model in place.
    model.update(minhash, labels)

    stored_labels = list(model._i_bucket)
    assert stored_labels == labels
    assert model.permutations == 100
    assert model.no_of_bands == 50
Пример #2
0
def test_initialize_from_empty_lsh():
    """Check that an empty ``LSH`` is filled by ``update``, including bucket contents.

    ``minhash`` and ``labels`` are defined outside this function
    (presumably module-level fixtures -- confirm against the test module).
    """
    model = LSH()

    # Freshly constructed model: nothing configured, nothing stored.
    assert model.no_of_bands is None
    empty_index = defaultdict(list)
    assert model._buckets == empty_index
    assert model._i_bucket == empty_index
    assert model.permutations is None

    model.update(minhash, labels)

    stored_labels = list(model._i_bucket)
    assert stored_labels == labels

    # Spot-check two buckets by their hash key and the labels they hold.
    bucket_map = model._buckets
    expected_members = (
        (4466445138223010106, [1, 8]),
        (-3939654010681976230, [1, 4, 8]),
    )
    for bucket_key, members in expected_members:
        assert bucket_map[bucket_key] == members

    assert model.permutations == 100
    assert model.no_of_bands == 50
Пример #3
0
def test_update_lsh():
    """Check which ``LSH.update`` calls are rejected and which one is accepted.

    ``minhash`` and ``labels`` are defined outside this function
    (presumably module-level fixtures -- confirm against the test module).
    """
    model = LSH(minhash, labels)

    # Re-submitting the data the model was built from must raise
    # (presumably because the labels already exist -- confirm).
    with pytest.raises(ValueError):
        model.update(minhash, labels)

    extra_docs = [
        'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
        'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.'
    ]
    extra_labels = [11, 12]

    # A MinHash built with permutations=10 must be rejected
    # (presumably a mismatch with the model's permutation count -- confirm).
    mismatched = MinHash(extra_docs, permutations=10)
    with pytest.raises(ValueError):
        model.update(mismatched, extra_labels)

    # A MinHash built with default settings is accepted.
    compatible = MinHash(extra_docs)
    model.update(compatible, extra_labels)

    assert model.permutations == 100
    stored_labels = list(model._i_bucket)
    assert stored_labels == labels + [11, 12]
Пример #4
0
# Generate MinHash signatures for new texts and add the new texts to the LSH model.
# NOTE(review): lsh, minhash, n_gram, permutations, hash_bits and seed are
# defined outside this snippet.
new_text = [
    'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
    'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',
]

new_labels = ['new_doc1', 'new_doc2']

# 1. Create MinHash signatures for the new texts.
# NOTE(review): presumably these settings must match the ones used to build
# the existing model -- confirm.
new_minhash = MinHash(new_text,
                      n_gram=n_gram,
                      permutations=permutations,
                      hash_bits=hash_bits,
                      seed=seed)

# 2. Update the LSH model with the new signatures and print lsh.contains()
#    to verify that the update is reflected.
lsh.update(new_minhash, new_labels)
print(lsh.contains())

# Print the adjacency list of all docs.
print(lsh.adjacency_list())

# Print the edge list of all docs flagged as duplicates, e.g. to plot a text
# similarity graph.
print(lsh.edge_list())

# Remove a text and its label from the model (per the original note, an error
# is reported if the label is not present).
lsh.remove(6)
print(lsh.contains())

# Shape (n, m) of the signature matrix produced by MinHash
# (n = text rows, m = selected permutations, per the original note).
# NOTE: this is a bare expression, so the value is only displayed in an
# interactive session (REPL/notebook); in a script it is evaluated and discarded.
minhash.signatures.shape