Example No. 1
# imports assumed for these test snippets (module paths assumed from the lsh package layout)
import numpy as np
import pytest

from lsh.cache import Cache
from lsh.minhash import MinHasher


def test_hasher_json_serialisation(default_hasher, tmpdir):
    path = str(tmpdir.join("hasher.json"))

    default_hasher.to_json(path)
    loaded_hasher = MinHasher.from_json_file(path)

    doc = 'Once upon a time in a galaxy far far away and what not'
    np.testing.assert_array_equal(default_hasher.fingerprint(doc),
                                  loaded_hasher.fingerprint(doc))
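The round-trip matters because fingerprints are only comparable when they come from identically seeded hashers. A minimal sketch of how two fingerprints are typically compared, assuming the imports above; estimated_jaccard is a hypothetical helper, not part of the library:

def estimated_jaccard(hasher, doc_a, doc_b):
    # the fraction of positions where the two minhash fingerprints agree
    # estimates the Jaccard similarity of the documents' shingle sets
    fp_a = hasher.fingerprint(doc_a)
    fp_b = hasher.fingerprint(doc_b)
    return float(np.mean(fp_a == fp_b))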
Example No. 2
def test_cache(char_ngram, hashbytes, num_bands, seed):
    hasher = MinHasher(seeds=200,
                       char_ngram=char_ngram,
                       hashbytes=hashbytes,
                       random_state=seed)
    lsh = Cache(hasher, num_bands=num_bands)
    # very small band width => always find duplicates

    short_doc = 'This is a simple document'
    another_doc = 'Some text about animals.'
    long_doc = ('A much longer document that contains lots of information in '
                'different words. The document produces many more shingles.')

    assert not lsh.is_duplicate(short_doc)
    lsh.add_doc(short_doc, 0)
    assert lsh.get_duplicates_of(short_doc) == {0}
    assert lsh.is_duplicate(short_doc, doc_id=0)
    assert lsh.is_duplicate(short_doc)

    assert not lsh.is_duplicate(long_doc)
    lsh.add_doc(long_doc, 1)
    lsh.add_doc(another_doc, 2)
    assert lsh.is_duplicate(another_doc)

    assert lsh.is_duplicate(long_doc, doc_id=1)
    assert lsh.is_duplicate(long_doc)

    words = long_doc.split()
    long_doc_missing_word = ' '.join([words[0]] + words[2:])

    assert lsh.get_duplicates_of(long_doc_missing_word) == {1}
    assert lsh.is_duplicate(long_doc_missing_word)
    assert lsh.is_duplicate(long_doc + ' Word.')

    assert lsh.get_all_duplicates() == set()
    lsh.add_doc(long_doc_missing_word, 3)
    assert lsh.get_all_duplicates() == {(1, 3)}

    lsh.add_doc(long_doc_missing_word, 4)
    assert lsh.get_all_duplicates() == {(1, 3), (1, 4), (3, 4)}
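Outside of the test, the same pattern condenses to a few lines. A minimal sketch, assuming the imports from Example No. 1; the corpus, ids, and parameter values are made up:

hasher = MinHasher(seeds=200, char_ngram=5, hashbytes=8, random_state=42)
lsh = Cache(hasher, num_bands=50)   # small band width => near-duplicates are caught

corpus = {
    1: 'The quick brown fox jumps over the lazy dog near the river bank.',
    2: 'The quick brown fox jumped over the lazy dog near the river bank.',
    3: 'A completely unrelated sentence about spreadsheets and reports.',
}
for doc_id, text in corpus.items():
    lsh.add_doc(text, doc_id)

# documents 1 and 2 differ by a single word, so they should be flagged
print(lsh.get_duplicates_of(corpus[1]))   # expected: {1, 2}
print(lsh.get_all_duplicates())           # expected: {(1, 2)}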
Example No. 3
    @staticmethod
    def from_json(path):
        with open(path) as inf:
            data = json.load(inf)

        # rebuild the MinHasher from its serialised form; the remaining
        # top-level entries are forwarded to the Cache constructor
        cache = Cache(MinHasher.from_json_str(data.pop('hasher')),
                      **data)

        # JSON object keys are always strings, so restore each band's buckets
        # as a defaultdict(set) keyed by the integer bucket hash
        bins = []
        for stored_bin in data['bins']:
            b1 = defaultdict(set)
            b1.update({int(k): set(v) for k, v in stored_bin.items()})
            bins.append(b1)
        cache.bins = bins

        # document ids also lose their type when used as JSON keys; cast them
        # back to the type recorded at serialisation time
        key_typecast = {
            'int': int,
            'str': str,
            '': lambda x: x
        }
        func = key_typecast[data.pop('id_key_type', '')]
        cache.fingerprints = {func(k[0]): np.array(v)
                              for k, v in data['fingerprints'].items()}
        return cache
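For orientation, the parser above implies a payload shaped roughly like the following. The hasher, bins, fingerprints, and id_key_type keys come from the code; num_bands and all values are made-up assumptions:

assumed_payload = {
    'hasher': '{...}',             # MinHasher serialised to a JSON string
    'num_bands': 10,               # forwarded to the Cache constructor (assumed)
    'id_key_type': 'int',          # how to cast fingerprint keys back from strings
    'bins': [                      # one bucket table per band
        {'1844674407': [0, 3]},    # bucket hash (stringified) -> doc ids
    ],
    'fingerprints': {              # doc id (stringified) -> minhash values
        '0': [12, 98, 53],
        '3': [12, 98, 54],
    },
}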
Example No. 4
def test_num_bands(doc):
    """
    add near-duplicate documents to three caches with different settings
    check that hashers with low band_width finds more matches (over 50 runs)
    """
    suffixes = ['teamless', 'retired', 'awesome', 'overweight']
    duplicates = []
    divisors_of_200 = [4, 10, 20, 25, 40, 50, 100]

    for seed in range(10):
        hasher = MinHasher(seeds=200, char_ngram=5, random_state=seed)
        caches = [Cache(hasher, num_bands=n) for n in divisors_of_200]

        for c in caches:
            c.add_doc(doc + suffixes[0], 0)

        for s in suffixes[1:]:
            duplicates.append([c.is_duplicate(doc + s) for c in caches])

    sums = np.array(duplicates).sum(axis=0)
    print(sums)
    assert is_nondecreasing(sums)
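The expected ordering follows from the standard LSH banding analysis rather than anything specific to this library: with b bands of r = seeds / b rows each, a pair with Jaccard similarity s becomes a candidate with probability 1 - (1 - s**r)**b, which grows with b. A small illustrative calculation (the similarity value is made up):

seeds, s = 200, 0.9
for b in [4, 10, 20, 25, 40, 50, 100]:
    r = seeds // b                      # band width: minhashes per band
    p = 1 - (1 - s ** r) ** b           # probability the pair shares a bucket
    print(f'num_bands={b:3d}  band_width={r:2d}  P(candidate)={p:.3f}')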
Example No. 5
    def dedup(self):
        # fingerprint every record's text (first element of each row) and
        # return the id pairs whose estimated Jaccard similarity is >= 0.8
        deduper = Cache(MinHasher(100))
        for x, doc in enumerate(self.data):
            deduper.add_doc(doc[0], x)
        dups = deduper.get_all_duplicates(min_jaccard=0.80)
        return dups
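get_all_duplicates returns a set of id pairs, not a deduplicated dataset. One simple, conservative follow-up (illustrative, not from the source) is to keep the lower-indexed member of every pair and drop the rest:

def rows_to_drop(duplicate_pairs):
    # e.g. {(1, 3), (1, 4), (3, 4)} -> {3, 4}, keeping row 1
    return {max(a, b) for a, b in duplicate_pairs}

# deduped = [row for i, row in enumerate(self.data) if i not in rows_to_drop(dups)]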
Example No. 6
@pytest.fixture
def default_hasher():
    # shared fixture consumed by the tests above (e.g. test_hasher_json_serialisation)
    return MinHasher(seeds=100)