class TestSimhashIndex(TestCase):
    """Check near-duplicate lookup on a SimhashIndex across delete/add cycles."""

    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }

    def setUp(self):
        """Build ``self.index`` (k=10) with one Simhash per ``data`` entry."""
        entries = []
        for key, text in self.data.items():
            entries.append((str(key), Simhash(text)))
        self.index = SimhashIndex(entries, k=10)

    def test_get_near_dup(self):
        """Near-dup count follows deletes/adds of entry '1'; repeats are no-ops."""
        probe = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')

        # Each step: optional mutation of entry '1', then the expected number
        # of near duplicates reported for the probe afterwards.
        steps = (
            (None, 3),
            (self.index.delete, 2),
            (self.index.delete, 2),  # deleting again changes nothing
            (self.index.add, 3),
            (self.index.add, 3),  # adding again changes nothing
        )
        for mutate, expected in steps:
            if mutate is not None:
                mutate('1', Simhash(self.data[1]))
            self.assertEqual(expected, len(self.index.get_near_dups(probe)))
class TestSimhashIndex(TestCase):
    """Check near-duplicate lookup on a default SimhashIndex across delete/add cycles."""

    data = {
        1: u'How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.',
        2: u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than',
        3: u'This is a different one.',
    }

    def setUp(self):
        """Build ``self.index`` with one Simhash per ``data`` entry."""
        self.index = SimhashIndex(
            [(str(key), Simhash(text)) for key, text in self.data.items()]
        )

    def _near_dup_count(self, probe):
        """Return how many near duplicates the index reports for *probe*."""
        found = self.index.get_near_dups(probe)
        return len(found)

    def test_get_near_dup(self):
        """Near-dup count follows deletes/adds of entry '1'; repeats are no-ops."""
        probe = Simhash(
            u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank'
        )

        # Two entries are reported as near duplicates of the probe initially.
        self.assertEqual(self._near_dup_count(probe), 2)

        # Removing entry '1' leaves a single near duplicate ...
        self.index.delete('1', Simhash(self.data[1]))
        self.assertEqual(self._near_dup_count(probe), 1)

        # ... and removing it a second time does not change the result.
        self.index.delete('1', Simhash(self.data[1]))
        self.assertEqual(self._near_dup_count(probe), 1)

        # Re-adding entry '1' restores the original count ...
        self.index.add('1', Simhash(self.data[1]))
        self.assertEqual(self._near_dup_count(probe), 2)

        # ... and adding it again does not produce an extra hit.
        self.index.add('1', Simhash(self.data[1]))
        self.assertEqual(self._near_dup_count(probe), 2)
class TestSimhashIndex(TestCase):
    """Check near-duplicate lookup on a SimhashIndex (k=10) across delete/add cycles."""

    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }

    def setUp(self):
        """Build ``self.index`` (k=10) with one Simhash per ``data`` entry."""
        pairs = [(str(ident), Simhash(text)) for ident, text in self.data.items()]
        self.index = SimhashIndex(pairs, k=10)

    def test_get_near_dup(self):
        """Near-dup count follows deletes/adds of entry '1'; repeats are no-ops."""
        query = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')

        def check(expected):
            # Look the query up again and compare the number of hits.
            hits = self.index.get_near_dups(query)
            self.assertEqual(len(hits), expected)

        def first_entry():
            # A fresh Simhash of entry 1 for every mutation.
            return Simhash(self.data[1])

        check(3)

        self.index.delete('1', first_entry())
        check(2)

        # Deleting an already deleted entry leaves the result unchanged.
        self.index.delete('1', first_entry())
        check(2)

        self.index.add('1', first_entry())
        check(3)

        # Adding an already present entry leaves the result unchanged.
        self.index.add('1', first_entry())
        check(3)
def find_near_matches(session, collection, index_size, probability_index_near_match):
    """Record a ``near_match`` relation for tweets that near-duplicate an earlier tweet.

    Iterates over the ``(tweet_id, simhash)`` rows selected for *collection*
    and keeps a bounded SimhashIndex (``k=7``) of tweets that had no near
    match. A tweet that has near matches in the index is not indexed itself;
    instead a ``'near_match'`` row pointing at ``min()`` of the matching ids
    is inserted into ``model.relation``.

    Args:
        session: Database session used to run the select/insert statements
            and to commit.
        collection: Value compared against ``model.Tweet.collection`` and
            stored in every inserted relation row.
        index_size: Maximum number of tweets kept in the index; once
            exceeded, the oldest indexed tweet is evicted.
        probability_index_near_match: Unused by this function; kept so that
            existing callers keep working.
            NOTE(review): confirm whether it can be dropped.
    """
    from collections import deque

    from simhash import Simhash, SimhashIndex

    # Raises the ROOT logger threshold to CRITICAL for the whole process.
    # NOTE(review): presumably meant to silence simhash's own logging;
    # loggers that inherit their level from root are silenced as well.
    logging.getLogger().setLevel(logging.CRITICAL)

    tweet_id_simhash_value = session.execute(
        sa.select([model.Tweet.tweet_id, model.Tweet.features['filter', 'simhash']])
        .where(model.Tweet.collection == collection)
    )

    simhash_index = SimhashIndex([], k=7)
    insert_relation_stmt = pg.insert(model.relation)

    # Insertion-ordered window of the (tweet_id, simhash) pairs currently in
    # the index. A deque makes evicting the oldest entry O(1).
    indexed_tweets = deque()

    for i, (tweet_id, simhash_value) in enumerate(tweet_id_simhash_value):
        # Periodic commit: fires at i == 1000, 101000, 201000, ...
        if (i % 100000) == 1000:
            logger.info('Processed %s tweets. Committing.', i)
            session.commit()

        simhash = Simhash(simhash_value)
        near_matches_ids = simhash_index.get_near_dups(simhash)

        if not near_matches_ids:
            # Nothing similar is indexed: this tweet becomes a reference.
            simhash_index.add(tweet_id, simhash)
            indexed_tweets.append((tweet_id, simhash))

            if len(indexed_tweets) > index_size:
                # Keep the index bounded by dropping the oldest reference.
                simhash_index.delete(*indexed_tweets.popleft())
        else:
            # NOTE(review): if get_near_dups returns ids as strings, min()
            # compares them lexicographically, not numerically - verify.
            near_match_id = min(near_matches_ids)

            logger.debug('A near match %s for tweet %s', near_match_id, tweet_id)

            session.execute(
                insert_relation_stmt.values(
                    [(tweet_id, collection, 'near_match', near_match_id)]
                )
            )

    # Persist whatever was inserted since the last periodic commit.
    session.commit()