def test_keyblocking_couples(self):
    """Check the id pairs produced by a soundex-keyed ``KeyBlocking``.

    The blocking is fitted on ``SOUNDEX_REFSET`` / ``SOUNDEX_TARGETSET``
    using attribute index 1 on both sides; ``iter_id_pairs`` must yield
    exactly 8 pairs, and every pair of ``SOUNDEX_PAIRS`` must be among them.
    """
    soundex_key = partial(soundexcode, language='english')
    blocker = KeyBlocking(
        ref_attr_index=1,
        target_attr_index=1,
        callback=soundex_key,
    )
    blocker.fit(SOUNDEX_REFSET, SOUNDEX_TARGETSET)

    found_pairs = list(blocker.iter_id_pairs())

    self.assertEqual(len(found_pairs), 8)
    for expected in SOUNDEX_PAIRS:
        self.assertIn(expected, found_pairs)
def test_baseblocking_indice_blocks(self):
    """Check the index blocks produced by a soundex-keyed ``KeyBlocking``.

    After fitting on ``SOUNDEX_REFSET`` / ``SOUNDEX_TARGETSET``,
    ``iter_indice_blocks`` must yield exactly 3 blocks, each a
    ``(reference indices, target indices)`` tuple listed below.
    """
    soundex_key = partial(soundexcode, language='english')
    blocker = KeyBlocking(
        ref_attr_index=1,
        target_attr_index=1,
        callback=soundex_key,
    )
    blocker.fit(SOUNDEX_REFSET, SOUNDEX_TARGETSET)

    found_blocks = list(blocker.iter_indice_blocks())

    expected_blocks = (
        ([0, 6], [2, 5]),
        ([1, 4], [3]),
        ([2], [0, 1]),
    )
    self.assertEqual(len(found_blocks), 3)
    for expected in expected_blocks:
        self.assertIn(expected, found_blocks)
def test_baseblocking_id_blocks(self):
    """Check the id blocks produced by a soundex-keyed ``KeyBlocking``.

    After fitting on ``SOUNDEX_REFSET`` / ``SOUNDEX_TARGETSET``,
    ``iter_id_blocks`` must yield exactly 3 blocks, each a
    ``(reference ids, target ids)`` tuple listed below.
    """
    soundex_key = partial(soundexcode, language='english')
    blocker = KeyBlocking(
        ref_attr_index=1,
        target_attr_index=1,
        callback=soundex_key,
    )
    blocker.fit(SOUNDEX_REFSET, SOUNDEX_TARGETSET)

    found_blocks = list(blocker.iter_id_blocks())

    expected_blocks = (
        (['a1', 'a7'], ['b3', 'b6']),
        (['a2', 'a5'], ['b4']),
        (['a3'], ['b1', 'b2']),
    )
    self.assertEqual(len(found_blocks), 3)
    for expected in expected_blocks:
        self.assertIn(expected, found_blocks)
def test_baseblocking_blocks(self):
    """Check the full blocks produced by a soundex-keyed ``KeyBlocking``.

    After fitting on ``SOUNDEX_REFSET`` / ``SOUNDEX_TARGETSET``,
    ``iter_blocks`` must yield exactly 3 blocks; each side of a block is a
    list of ``(index, id)`` tuples as listed below.
    """
    soundex_key = partial(soundexcode, language='english')
    blocker = KeyBlocking(
        ref_attr_index=1,
        target_attr_index=1,
        callback=soundex_key,
    )
    blocker.fit(SOUNDEX_REFSET, SOUNDEX_TARGETSET)

    found_blocks = list(blocker.iter_blocks())

    expected_blocks = (
        ([(0, 'a1'), (6, 'a7')], [(2, 'b3'), (5, 'b6')]),
        ([(1, 'a2'), (4, 'a5')], [(3, 'b4')]),
        ([(2, 'a3')], [(0, 'b1'), (1, 'b2')]),
    )
    self.assertEqual(len(found_blocks), 3)
    for expected in expected_blocks:
        self.assertIn(expected, found_blocks)
def test_baseblocking_pairs(self):
    """Check the ``(index, id)`` pairs produced by a soundex-keyed ``KeyBlocking``.

    The expected pairs are rebuilt from ``SOUNDEX_PAIRS`` by looking up the
    position of each id (first field of a record) in ``SOUNDEX_REFSET`` and
    ``SOUNDEX_TARGETSET``. ``iter_pairs`` must yield as many pairs as
    expected, and every expected pair must be among them.
    """
    soundex_key = partial(soundexcode, language='english')
    blocker = KeyBlocking(
        ref_attr_index=1,
        target_attr_index=1,
        callback=soundex_key,
    )
    blocker.fit(SOUNDEX_REFSET, SOUNDEX_TARGETSET)
    found_pairs = list(blocker.iter_pairs())

    # Map each record id (first field) to its position in its dataset.
    ref_position = {
        record[0]: position for position, record in enumerate(SOUNDEX_REFSET)
    }
    target_position = {
        record[0]: position for position, record in enumerate(SOUNDEX_TARGETSET)
    }

    # Turn each (ref id, target id) pair into ((index, id), (index, id)).
    expected_pairs = []
    for id_pair in SOUNDEX_PAIRS:
        ref_id = id_pair[0]
        target_id = id_pair[1]
        expected_pairs.append(
            ((ref_position[ref_id], ref_id), (target_position[target_id], target_id))
        )

    self.assertEqual(len(found_pairs), len(expected_pairs))
    for expected in expected_pairs:
        self.assertIn(expected, found_pairs)