def test_nilsimsa_near_duplicates_speed_perf( # noqa label_store, store, num_texts=5, num_exact_dups_each=10, num_near_dups_each=10, ): different_texts = [random_text() for _ in range(num_texts)] fcs = [] for idx1, text in enumerate(different_texts): fc = make_fc(text) fcs.append(('%d-original-exact' % idx1, fc)) for idx2 in range(num_exact_dups_each): fcs.append(('%d-%d-exact' % (idx1, idx2), copy.deepcopy(fc))) for idx2 in range(num_near_dups_each): fcs.append( ('%d-%d-exact' % (idx1, idx2), make_fc(mutate(text, 10)))) query_content_id, query_fc = fcs.pop(0) store.put([(query_content_id, query_fc)]) accumulating_predicate = nilsimsa_near_duplicates( label_store, store, threshold=0.85).set_query_id(query_content_id).create_predicate() start = time.time() results = filter(accumulating_predicate, fcs) elapsed = time.time() - start print '%d filtered to %d in %f seconds, %f per second' % ( len(fcs), len(results), elapsed, len(fcs) / elapsed) assert len(results) == num_texts - 1 # minus the query
def test_nilsimsa_near_duplicates_basic(label_store, store): # noqa
    '''With the threshold dropped to zero for the short sample strings,
    the accumulating predicate should report no near duplicates of the
    query among the remaining candidates.
    '''
    candidates = [(str(n), make_fc(doc))
                  for n, doc in enumerate(near_duplicate_texts)]
    query_id, query_fc = candidates.pop(0)
    store.put([(query_id, query_fc)])
    predicate = nilsimsa_near_duplicates(
        label_store, store,
        # lower threshold for short test strings
        threshold=0).set_query_id(query_id).create_predicate()
    assert len(filter(predicate, candidates)) == 0
def test_nilsimsa_near_duplicates_update_logic(label_store, store): # noqa fcs = [(str(idx), make_fc(text)) for idx, text in enumerate(chain(*repeat(near_duplicate_texts, 1000)))] query_content_id, query_fc = fcs.pop(0) store.put([(query_content_id, query_fc)]) accumulating_predicate = nilsimsa_near_duplicates( label_store, store, # lower threshold for short test strings threshold=120).set_query_id(query_content_id).create_predicate() start = time.time() results = filter(accumulating_predicate, fcs) elapsed = time.time() - start print '%d filtered to %d in %f seconds, %f per second' % ( len(fcs), len(results), elapsed, len(fcs) / elapsed) assert len(results) == 3
def test_fc_get(store): # noqa
    '''Store a feature collection under a db-mapped id and fetch it back
    through the v1 GET route by its visible id.
    '''
    stored_fc = FeatureCollection({'foo': {'a': 1}})
    store.put([(visid_to_dbid('abc'), stored_fc)])
    fetched = routes.v1_fc_get(dbid_to_visid, store, 'abc')
    assert fetched['foo']['a'] == 1
def test_random_no_name_index(store): # noqa
    '''The random search engine should run without error when a NAME
    feature exists but no name index does; just make sure it runs.
    '''
    fc = FeatureCollection({u'NAME': {'bar': 1}})
    store.put([('foo', fc)])
    search_engines.random(store).set_query_id('foo').results()