class TestSimhashIndex(TestCase):
    """Check near-duplicate lookup, deletion and re-insertion on a SimhashIndex."""

    # Fixture texts keyed by integer id; setUp indexes each one under str(id).
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }

    def setUp(self):
        """Build a fresh index over every fixture text before each test."""
        objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
        # NOTE(review): k=10 is presumably the bit-difference tolerance used
        # for matching -- confirm against SimhashIndex.
        self.index = SimhashIndex(objs, k=10)

    def _assert_near_dupe_count(self, probe, expected):
        """Query the index with *probe*, assert the match count, return the matches."""
        dupes = self.index.get_near_dupes(probe)
        self.assertEqual(len(dupes), expected)
        return dupes

    def test_get_near_dup(self):
        """The match count must follow delete/add, and repeating either is a no-op."""
        s1 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')

        # Assert the count before indexing so an empty result fails with a
        # readable assertion instead of an IndexError.
        dupes = self._assert_near_dupe_count(s1, 3)
        first = list(dupes)[0]
        self.assertIsInstance(first, tuple)
        self.assertEqual(len(first), 2)

        # Removing entry '1' drops one match.
        self.index.delete('1', Simhash(self.data[1]))
        self._assert_near_dupe_count(s1, 2)

        # Deleting the same entry again must not change the count.
        self.index.delete('1', Simhash(self.data[1]))
        self._assert_near_dupe_count(s1, 2)

        # Re-adding entry '1' restores the original count.
        self.index.add('1', Simhash(self.data[1]))
        self._assert_near_dupe_count(s1, 3)

        # Adding the same entry a second time must not produce a duplicate match.
        self.index.add('1', Simhash(self.data[1]))
        self._assert_near_dupe_count(s1, 3)