def test_ngram_blocks(self):
    """Check the id blocks NGramBlocking yields for the SOUNDEX fixtures.

    The blocking is fitted on attribute index 1 of both SOUNDEX_REFSET and
    SOUNDEX_TARGETSET, leaving every other option at its default value.
    """
    ngram_blocking = NGramBlocking(ref_attr_index=1, target_attr_index=1)
    ngram_blocking.fit(SOUNDEX_REFSET, SOUNDEX_TARGETSET)
    id_blocks = list(ngram_blocking.iter_id_blocks())

    # Exactly three blocks are expected; membership is checked one by one
    # so the test does not depend on the order in which blocks are yielded.
    self.assertEqual(len(id_blocks), 3)
    expected_blocks = (
        (['a3'], ['b1', 'b2']),
        (['a5'], ['b4']),
        (['a1', 'a4'], ['b3']),
    )
    for expected in expected_blocks:
        self.assertIn(expected, id_blocks)
def test_ngram_blocks_2(self):
    """Check the id pairs NGramBlocking yields on a small hand-made dataset.

    Both sets share the same value at attribute index 1 and differ at
    attribute index 2, which is the attribute used for blocking here
    (``ngram_size=2``, ``depth=1``). Record '3' is expected to be paired
    with 'c' and record '4' with 'd', and nothing else.
    """
    refset = [
        ['3', 'ccdd', 'aabb'],
        ['4', 'ccdd', 'bbaa'],
    ]
    targetset = [
        ['c', 'ccdd', 'aabb'],
        ['d', 'ccdd', 'bbaa'],
    ]
    true_pairs = [('3', 'c'), ('4', 'd')]

    blocking = NGramBlocking(ref_attr_index=2, target_attr_index=2,
                             ngram_size=2, depth=1)
    blocking.fit(refset, targetset)
    pairs = list(blocking.iter_id_pairs())

    # Comparing lengths alone would accept any two pairs, even wrong ones.
    # Same length plus every expected pair present means the result is
    # exactly the expected pairs, regardless of the order they are yielded.
    # NOTE(review): assumes iter_id_pairs() yields (ref_id, target_id)
    # tuples, as the shape of true_pairs implies -- confirm.
    self.assertEqual(len(pairs), len(true_pairs))
    for expected_pair in true_pairs:
        self.assertIn(expected_pair, pairs)