def test_nilsimsa_near_duplicates_speed_perf(  # noqa
        label_store, store,
        num_texts=5, num_exact_dups_each=10, num_near_dups_each=10):
    """Time the near-duplicate predicate over a generated candidate list.

    For each of ``num_texts`` random texts the candidate list receives the
    original feature collection, ``num_exact_dups_each`` deep copies of it,
    and ``num_near_dups_each`` collections built from ``mutate(text, 10)``.
    The first candidate is removed, stored, and used as the query.  The test
    prints the filtering throughput and asserts that ``num_texts - 1``
    candidates survive the filter.

    :param label_store: fixture forwarded to ``nilsimsa_near_duplicates``
    :param store: fixture that receives the query and is forwarded as well
    :param num_texts: number of distinct random texts to generate
    :param num_exact_dups_each: deep copies added per text
    :param num_near_dups_each: mutated variants added per text
    """
    different_texts = [random_text() for _ in range(num_texts)]

    fcs = []
    for idx1, text in enumerate(different_texts):
        fc = make_fc(text)
        fcs.append(('%d-original-exact' % idx1, fc))
        fcs.extend(
            ('%d-%d-exact' % (idx1, idx2), copy.deepcopy(fc))
            for idx2 in range(num_exact_dups_each))
        # Mutated variants get their own '-near' suffix so their content ids
        # do not collide with the exact copies added just above.
        fcs.extend(
            ('%d-%d-near' % (idx1, idx2), make_fc(mutate(text, 10)))
            for idx2 in range(num_near_dups_each))

    # The very first candidate becomes the query and must be in the store
    # before the predicate is created.
    query_content_id, query_fc = fcs.pop(0)
    store.put([(query_content_id, query_fc)])

    accumulating_predicate = nilsimsa_near_duplicates(
        label_store, store,
        threshold=0.85).set_query_id(query_content_id).create_predicate()

    start = time.time()
    # list(...) forces evaluation inside the timed region; on Python 3
    # ``filter`` is lazy and has no ``len``.
    results = list(filter(accumulating_predicate, fcs))
    elapsed = time.time() - start

    # A coarse clock may report zero elapsed time; avoid dividing by it.
    rate = len(fcs) / elapsed if elapsed > 0 else float('inf')
    print('%d filtered to %d in %f seconds, %f per second' % (
        len(fcs), len(results), elapsed, rate))

    assert len(results) == num_texts - 1  # minus the query
def test_nilsimsa_near_duplicates_basic(label_store, store):  # noqa
    """Check that no candidate from ``near_duplicate_texts`` survives.

    The first text is stored and used as the query; the remaining ones are
    passed through the predicate, and the test asserts that none is kept.

    :param label_store: fixture forwarded to ``nilsimsa_near_duplicates``
    :param store: fixture that receives the query and is forwarded as well
    """
    fcs = [(str(idx), make_fc(text))
           for idx, text in enumerate(near_duplicate_texts)]

    # The first entry becomes the query and must be in the store before the
    # predicate is created.
    query_content_id, query_fc = fcs.pop(0)
    store.put([(query_content_id, query_fc)])

    accumulating_predicate = nilsimsa_near_duplicates(
        label_store, store,
        # lower threshold for short test strings
        threshold=0).set_query_id(query_content_id).create_predicate()

    # Materialise the result: on Python 3 ``filter`` is lazy and has no
    # ``len``.
    survivors = list(filter(accumulating_predicate, fcs))
    assert len(survivors) == 0
def test_nilsimsa_near_duplicates_update_logic(label_store, store):  # noqa
    """Run the predicate over ``near_duplicate_texts`` repeated 1000 times.

    The first candidate is stored and used as the query.  The test prints
    the filtering throughput and asserts that exactly three candidates
    survive the filter.

    :param label_store: fixture forwarded to ``nilsimsa_near_duplicates``
    :param store: fixture that receives the query and is forwarded as well
    """
    repeated_texts = chain.from_iterable(repeat(near_duplicate_texts, 1000))
    fcs = [(str(idx), make_fc(text))
           for idx, text in enumerate(repeated_texts)]

    # The first entry becomes the query and must be in the store before the
    # predicate is created.
    query_content_id, query_fc = fcs.pop(0)
    store.put([(query_content_id, query_fc)])

    accumulating_predicate = nilsimsa_near_duplicates(
        label_store, store,
        # lower threshold for short test strings
        # NOTE(review): 120 does not look like a lowered value next to the
        # 0 used in test_nilsimsa_near_duplicates_basic -- confirm this
        # comment still applies.
        threshold=120).set_query_id(query_content_id).create_predicate()

    start = time.time()
    # list(...) forces evaluation inside the timed region; on Python 3
    # ``filter`` is lazy and has no ``len``.
    results = list(filter(accumulating_predicate, fcs))
    elapsed = time.time() - start

    # A coarse clock may report zero elapsed time; avoid dividing by it.
    rate = len(fcs) / elapsed if elapsed > 0 else float('inf')
    print('%d filtered to %d in %f seconds, %f per second' % (
        len(fcs), len(results), elapsed, rate))

    assert len(results) == 3