Пример #1
0
def test_nilsimsa_near_duplicates_speed_perf(  # noqa
    label_store, store, num_texts=5,
    num_exact_dups_each=10,
    num_near_dups_each=10,
):
    """Time the near-duplicate predicate over exact and mutated copies.

    For each of ``num_texts`` random texts the candidate list receives the
    original feature collection, ``num_exact_dups_each`` deep copies of it,
    and ``num_near_dups_each`` collections built from ``mutate(text, 10)``.
    The first candidate is removed from the list, written to ``store`` and
    used as the query.  The remaining candidates are filtered through the
    predicate, the throughput is printed, and the test asserts that
    ``num_texts - 1`` candidates pass.

    :param label_store: forwarded to ``nilsimsa_near_duplicates``
    :param store: receives the query item; forwarded to
        ``nilsimsa_near_duplicates``
    :param num_texts: number of distinct random texts to generate
    :param num_exact_dups_each: deep copies added per text
    :param num_near_dups_each: mutated variants added per text
    """
    different_texts = [random_text() for _ in range(num_texts)]

    fcs = []
    for idx1, text in enumerate(different_texts):
        fc = make_fc(text)
        fcs.append(('%d-original-exact' % idx1, fc))
        fcs.extend(
            ('%d-%d-exact' % (idx1, idx2), copy.deepcopy(fc))
            for idx2 in range(num_exact_dups_each))
        # Mutated variants get their own "-near" suffix; reusing the
        # "-exact" template gave them the same content ids as the copies.
        fcs.extend(
            ('%d-%d-near' % (idx1, idx2), make_fc(mutate(text, 10)))
            for idx2 in range(num_near_dups_each))

    query_content_id, query_fc = fcs.pop(0)
    store.put([(query_content_id, query_fc)])
    accumulating_predicate = nilsimsa_near_duplicates(
        label_store, store,
        threshold=0.85).set_query_id(query_content_id).create_predicate()

    start = time.time()
    # list() keeps len() working where filter() returns a lazy iterator.
    results = list(filter(accumulating_predicate, fcs))
    elapsed = time.time() - start
    # A coarse clock can report zero elapsed time; avoid dividing by it.
    rate = len(fcs) / elapsed if elapsed > 0 else float('inf')
    print('%d filtered to %d in %f seconds, %f per second' % (
        len(fcs), len(results), elapsed, rate))
    assert len(results) == num_texts - 1  # minus the query
Пример #2
0
def test_nilsimsa_near_duplicates_basic(label_store, store):  # noqa
    """Check that no candidate passes the predicate when threshold is 0.

    One feature collection is built per entry of ``near_duplicate_texts``.
    The first one is removed from the candidates, written to ``store`` and
    used as the query; the test asserts that none of the rest pass.
    """
    candidates = [
        (str(position), make_fc(body))
        for position, body in enumerate(near_duplicate_texts)
    ]
    query_content_id, query_fc = candidates.pop(0)
    store.put([(query_content_id, query_fc)])

    # lower threshold for short test strings
    builder = nilsimsa_near_duplicates(label_store, store, threshold=0)
    accumulating_predicate = builder.set_query_id(
        query_content_id).create_predicate()

    survivors = [pair for pair in candidates if accumulating_predicate(pair)]
    assert len(survivors) == 0
Пример #3
0
def test_nilsimsa_near_duplicates_update_logic(label_store, store):  # noqa
    """Filter 1000 repetitions of the sample texts and expect 3 survivors.

    ``near_duplicate_texts`` is repeated 1000 times and each text becomes
    a candidate whose content id is its running index.  The first
    candidate is removed from the list, written to ``store`` and used as
    the query.  The rest are filtered through the predicate, the
    throughput is printed, and the test asserts that exactly three
    candidates pass.

    :param label_store: forwarded to ``nilsimsa_near_duplicates``
    :param store: receives the query item; forwarded to
        ``nilsimsa_near_duplicates``
    """
    # from_iterable avoids unpacking 1000 positional arguments into chain().
    repeated_texts = chain.from_iterable(repeat(near_duplicate_texts, 1000))
    fcs = [(str(idx), make_fc(text))
           for idx, text in enumerate(repeated_texts)]

    query_content_id, query_fc = fcs.pop(0)

    store.put([(query_content_id, query_fc)])

    accumulating_predicate = nilsimsa_near_duplicates(
        label_store, store,
        # lower threshold for short test strings
        # NOTE(review): the default threshold is not visible here -- confirm
        # that 120 really is lower than it.
        threshold=120).set_query_id(query_content_id).create_predicate()

    start = time.time()
    # list() keeps len() working where filter() returns a lazy iterator.
    results = list(filter(accumulating_predicate, fcs))
    elapsed = time.time() - start
    # A coarse clock can report zero elapsed time; avoid dividing by it.
    rate = len(fcs) / elapsed if elapsed > 0 else float('inf')
    print('%d filtered to %d in %f seconds, %f per second' % (
        len(fcs), len(results), elapsed, rate))
    assert len(results) == 3