def test_set_padding(self):
    """Turning padding off removes the prefix/suffix boundary q-grams."""
    tokenizer = QgramTokenizer()
    # Padding is enabled by default, so '#'/'$' boundary grams appear.
    self.assertEqual(tokenizer.get_padding(), True)
    self.assertEqual(
        tokenizer.tokenize('database'),
        ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$'])
    # After disabling padding, only interior bigrams remain.
    tokenizer.set_padding(False)
    self.assertEqual(tokenizer.get_padding(), False)
    self.assertEqual(tokenizer.tokenize('database'),
                     ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se'])
def test_set_suffix_pad(self):
    """Changing the suffix pad character is reflected in the last q-gram."""
    tokenizer = QgramTokenizer()
    # Default suffix pad is '$'.
    self.assertEqual(tokenizer.get_suffix_pad(), '$')
    self.assertEqual(
        tokenizer.tokenize('database'),
        ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$'])
    # Swap in '!' and verify only the trailing gram changes.
    tokenizer.set_suffix_pad('!')
    self.assertEqual(tokenizer.get_suffix_pad(), '!')
    self.assertEqual(
        tokenizer.tokenize('database'),
        ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e!'])
def test_apply_matcher_with_join_attr_of_type_int(self):
    """apply_matcher produces the expected pairs when joining on an int column."""
    tokenizer = QgramTokenizer(qval=2, return_set=True)
    jaccard_fn = get_sim_function('JACCARD')
    threshold = 0.3
    comp_op = '>='
    l_join_attr = 'A.zipcode'
    r_join_attr = 'B.zipcode'

    # Score every pair in the cartesian product to derive the ground truth
    # (join attributes are cast to str before tokenizing).
    cartprod = self.cartprod
    cartprod['sim_score'] = cartprod.apply(
        lambda row: jaccard_fn(tokenizer.tokenize(str(row[l_join_attr])),
                               tokenizer.tokenize(str(row[r_join_attr]))),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]
    # Pairs satisfying the threshold form the expected output set.
    expected_pairs = set()
    for _, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), threshold):
            expected_pairs.add(','.join((str(row[self.l_key_attr]),
                                         str(row[self.r_key_attr]))))

    # Build a candidate set with the overlap filter, then match on it.
    overlap_filter = OverlapFilter(tokenizer, 1, comp_op)
    candset = overlap_filter.filter_tables(
        self.ltable, self.rtable,
        self.l_key_attr, self.r_key_attr,
        l_join_attr, r_join_attr)

    # Apply a jaccard matcher to the candidate set.
    output_candset = apply_matcher(
        candset,
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        self.ltable, self.rtable,
        self.l_key_attr, self.r_key_attr,
        l_join_attr, r_join_attr,
        tokenizer, jaccard_fn, threshold)

    expected_output_attrs = [
        '_id',
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        '_sim_score'
    ]

    # The output table must expose exactly the expected columns.
    assert_list_equal(list(output_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for _, row in output_candset.iterrows():
        actual_pairs.add(','.join(
            (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
             str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

    # The actual pairs must coincide with the expected pairs.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def test_apply_matcher(self):
    """apply_matcher returns the same pairs as a brute-force cartesian scan."""
    tokenizer = QgramTokenizer(qval=2, return_set=True)
    jaccard_fn = get_sim_function('JACCARD')
    threshold = 0.3
    comp_op = '>='

    # Score the whole cartesian product to derive the ground-truth pairs.
    cartprod = self.cartprod
    cartprod['sim_score'] = cartprod.apply(
        lambda row: jaccard_fn(tokenizer.tokenize(str(row[self.l_join_attr])),
                               tokenizer.tokenize(str(row[self.r_join_attr]))),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]
    expected_pairs = set()
    for _, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), threshold):
            expected_pairs.add(','.join((str(row[self.l_key_attr]),
                                         str(row[self.r_key_attr]))))

    # Obtain a candidate set via the overlap filter.
    overlap_filter = OverlapFilter(tokenizer, 1, comp_op)
    candset = overlap_filter.filter_tables(
        self.ltable, self.rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr)

    # Run the jaccard matcher, also projecting the join attributes
    # into the output and emitting the similarity score.
    output_candset = apply_matcher(
        candset,
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        self.ltable, self.rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr,
        tokenizer, jaccard_fn, threshold, comp_op, False,
        [self.l_join_attr], [self.r_join_attr],
        out_sim_score=True)

    expected_output_attrs = [
        '_id',
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        DEFAULT_L_OUT_PREFIX + self.l_join_attr,
        DEFAULT_R_OUT_PREFIX + self.r_join_attr,
        '_sim_score'
    ]

    # The output table must expose exactly the expected columns.
    assert_list_equal(list(output_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for _, row in output_candset.iterrows():
        actual_pairs.add(','.join(
            (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
             str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

    # The actual pairs must coincide with the expected pairs.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def __init__(self, vals):
    """Initialize the instance with candidate values and similarity functions.

    Args:
        vals: iterable (or dict) of candidate values; a dict is additionally
            kept as ``self.val_map`` for later lookup.
    """
    # Sort values case-insensitively in descending order.
    self.vals = sorted(vals, key=lambda x: x.lower(), reverse=True)
    # Preserve the original mapping only when a dict was supplied.
    self.val_map = vals if isinstance(vals, dict) else None
    # whether to show debugging info or not
    self.show = False

    # Build the string-similarity registry. Instantiate each measure once
    # (the original created JaroWinkler/Levenshtein twice and left the first
    # instances unused).
    jarowinkler_sim = JaroWinkler()
    levenshtein_sim = Levenshtein()
    jaccard_sim = Jaccard()
    qgtok = QgramTokenizer(qval=3, padding=True)

    def jaccard_3gram(x, y):
        # Jaccard similarity over padded 3-gram tokenizations.
        return jaccard_sim.get_sim_score(qgtok.tokenize(x), qgtok.tokenize(y))

    self.str_sims = {
        'Jaro-Winkler': jarowinkler_sim.get_sim_score,
        'Levenshtein': levenshtein_sim.get_sim_score,
        '3gram Jaccard': jaccard_3gram,
    }
    # Hierarchical-clustering linkage strategies supported downstream.
    self.linkages = ['single', 'average', 'complete']
def test_set_qval(self):
    """set_qval changes the gram length used by subsequent tokenize calls."""
    # Unpadded tokenizer: default q is 2.
    tokenizer = QgramTokenizer(padding=False)
    self.assertEqual(tokenizer.get_qval(), 2)
    self.assertEqual(tokenizer.tokenize('database'),
                     ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se'])
    self.assertEqual(tokenizer.set_qval(3), True)
    self.assertEqual(tokenizer.get_qval(), 3)
    self.assertEqual(tokenizer.tokenize('database'),
                     ['dat', 'ata', 'tab', 'aba', 'bas', 'ase'])

    # Padded tokenizer: pad width grows with the new q value (q-1 pads).
    tokenizer = QgramTokenizer()
    self.assertEqual(tokenizer.get_qval(), 2)
    self.assertEqual(
        tokenizer.tokenize('database'),
        ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$'])
    self.assertEqual(tokenizer.set_qval(3), True)
    self.assertEqual(tokenizer.get_qval(), 3)
    self.assertEqual(
        tokenizer.tokenize('database'),
        ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase',
         'se$', 'e$$'])
def test_set_return_set(self):
    """set_return_set toggles duplicate-elimination in the token list."""
    # Unpadded tokenizer.
    tokenizer = QgramTokenizer(padding=False)
    self.assertEqual(tokenizer.get_return_set(), False)
    self.assertEqual(
        tokenizer.tokenize('aabaabcdba'),
        ['aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba'])
    # Enable set semantics: duplicates collapse.
    self.assertEqual(tokenizer.set_return_set(True), True)
    self.assertEqual(tokenizer.get_return_set(), True)
    self.assertEqual(tokenizer.tokenize('aabaabcdba'),
                     ['aa', 'ab', 'ba', 'bc', 'cd', 'db'])
    # Disable again: duplicates reappear.
    self.assertEqual(tokenizer.set_return_set(False), True)
    self.assertEqual(tokenizer.get_return_set(), False)
    self.assertEqual(
        tokenizer.tokenize('aabaabcdba'),
        ['aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba'])

    # Same round-trip with a padded tokenizer.
    tokenizer = QgramTokenizer()
    self.assertEqual(tokenizer.get_return_set(), False)
    self.assertEqual(
        tokenizer.tokenize('aabaabcdba'),
        ['#a', 'aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba', 'a$'])
    self.assertEqual(tokenizer.set_return_set(True), True)
    self.assertEqual(tokenizer.get_return_set(), True)
    self.assertEqual(tokenizer.tokenize('aabaabcdba'),
                     ['#a', 'aa', 'ab', 'ba', 'bc', 'cd', 'db', 'a$'])
    self.assertEqual(tokenizer.set_return_set(False), True)
    self.assertEqual(tokenizer.get_return_set(), False)
    self.assertEqual(
        tokenizer.tokenize('aabaabcdba'),
        ['#a', 'aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba', 'a$'])
def test_apply_matcher_with_allow_missing(self):
    """With allow_missing=True, pairs having a NaN join value are retained."""
    tokenizer = QgramTokenizer(qval=2, return_set=True)
    jaccard_fn = get_sim_function('JACCARD')
    threshold = 0.3
    comp_op = '>='

    # Score the whole cartesian product to derive ground-truth matches.
    cartprod = self.cartprod
    cartprod['sim_score'] = cartprod.apply(
        lambda row: jaccard_fn(tokenizer.tokenize(str(row[self.l_join_attr])),
                               tokenizer.tokenize(str(row[self.r_join_attr]))),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]
    expected_pairs = set()
    for _, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), threshold):
            expected_pairs.add(','.join((str(row[self.l_key_attr]),
                                         str(row[self.r_key_attr]))))

    # Any pair with a missing join value on either side must also appear
    # in the output when allow_missing is enabled.
    missing_pairs = set()
    for _, l_row in self.orig_ltable.iterrows():
        for _, r_row in self.orig_rtable.iterrows():
            if (pd.isnull(l_row[self.l_join_attr]) or
                    pd.isnull(r_row[self.r_join_attr])):
                missing_pairs.add(','.join((str(l_row[self.l_key_attr]),
                                            str(r_row[self.r_key_attr]))))
    expected_pairs = expected_pairs.union(missing_pairs)

    # Candidate set from the overlap filter, also with allow_missing=True.
    overlap_filter = OverlapFilter(tokenizer, 1, comp_op, allow_missing=True)
    candset = overlap_filter.filter_tables(
        self.orig_ltable, self.orig_rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr)

    # Apply the jaccard matcher with allow_missing set to True.
    output_candset = apply_matcher(
        candset,
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        self.orig_ltable, self.orig_rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr,
        tokenizer, jaccard_fn, threshold, comp_op, True,
        out_sim_score=True)

    expected_output_attrs = [
        '_id',
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        '_sim_score'
    ]

    # The output table must expose exactly the expected columns.
    assert_list_equal(list(output_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for _, row in output_candset.iterrows():
        actual_pairs.add(','.join(
            (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
             str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

    # The actual pairs must coincide with the expected pairs.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def get_tversky_index(proto, query, n=3, beta=0.5):
    """Return the Tversky index between the n-gram tokenizations of two strings.

    Args:
        proto: second string to compare.
        query: first string to compare.
        n: gram length for tokenization (default 3).
        beta: Tversky beta weight (default 0.5).
    """
    measure = TverskyIndex(beta=beta)
    tokenizer = QgramTokenizer(qval=n, padding=False)
    # Tokenize both strings into unpadded n-grams and score the token lists.
    return measure.get_sim_score(tokenizer.tokenize(query),
                                 tokenizer.tokenize(proto))
class QgramTokenizerTestCases(unittest.TestCase):
    """Unit tests for QgramTokenizer: tokenization output, getters/setters,
    padding behavior, and input validation."""

    def setUp(self):
        # Unpadded tokenizers of varying q.
        self.qg1_tok = QgramTokenizer(qval=1, padding=False)
        self.qg2_tok = QgramTokenizer(padding=False)
        self.qg2_tok_return_set = QgramTokenizer(padding=False,
                                                 return_set=True)
        self.qg3_tok = QgramTokenizer(qval=3, padding=False)
        # Padded counterparts (padding defaults to True).
        self.qg1_tok_wipad = QgramTokenizer(qval=1)
        self.qg2_tok_wipad = QgramTokenizer()
        self.qg2_tok_wipad_return_set = QgramTokenizer(return_set=True)
        self.qg3_tok_wipad = QgramTokenizer(qval=3)
        # Padded tokenizer with non-default pad characters.
        self.qg3_tok_wipad_diffpad = QgramTokenizer(qval=3, prefix_pad='^',
                                                    suffix_pad='!')

    def test_qgrams_valid(self):
        """Tokenization output for valid inputs across q, padding, return_set."""
        # Unpadded: strings shorter than q yield no tokens.
        self.assertEqual(self.qg2_tok.tokenize(''), [])
        self.assertEqual(self.qg2_tok.tokenize('a'), [])
        self.assertEqual(self.qg2_tok.tokenize('aa'), ['aa'])
        self.assertEqual(self.qg2_tok.tokenize('database'),
                         ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se'])
        self.assertEqual(
            self.qg2_tok.tokenize('aabaabcdba'),
            ['aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba'])
        # return_set=True removes duplicate grams.
        self.assertEqual(self.qg2_tok_return_set.tokenize('aabaabcdba'),
                         ['aa', 'ab', 'ba', 'bc', 'cd', 'db'])
        self.assertEqual(self.qg1_tok.tokenize('d'), ['d'])
        self.assertEqual(self.qg3_tok.tokenize('database'),
                         ['dat', 'ata', 'tab', 'aba', 'bas', 'ase'])
        # Padded: even the empty string produces boundary grams.
        self.assertEqual(self.qg2_tok_wipad.tokenize(''), ['#$'])
        self.assertEqual(self.qg2_tok_wipad.tokenize('a'), ['#a', 'a$'])
        self.assertEqual(self.qg2_tok_wipad.tokenize('aa'),
                         ['#a', 'aa', 'a$'])
        self.assertEqual(
            self.qg2_tok_wipad.tokenize('database'),
            ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$'])
        self.assertEqual(
            self.qg2_tok_wipad.tokenize('aabaabcdba'),
            ['#a', 'aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba',
             'a$'])
        self.assertEqual(self.qg2_tok_wipad_return_set.tokenize('aabaabcdba'),
                         ['#a', 'aa', 'ab', 'ba', 'bc', 'cd', 'db', 'a$'])
        self.assertEqual(self.qg1_tok_wipad.tokenize('d'), ['d'])
        # q=3 padding uses q-1 = 2 pad characters at each end.
        self.assertEqual(self.qg3_tok_wipad.tokenize('database'), [
            '##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$',
            'e$$'
        ])
        # Custom pad characters replace '#' and '$'.
        self.assertEqual(self.qg3_tok_wipad_diffpad.tokenize('database'), [
            '^^d', '^da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se!',
            'e!!'
        ])

    def test_get_return_set(self):
        """get_return_set reports the value given at construction."""
        self.assertEqual(self.qg2_tok.get_return_set(), False)
        self.assertEqual(self.qg2_tok_return_set.get_return_set(), True)
        self.assertEqual(self.qg2_tok_wipad.get_return_set(), False)
        self.assertEqual(self.qg2_tok_wipad_return_set.get_return_set(), True)

    def test_get_qval(self):
        """get_qval reports the gram length given at construction."""
        self.assertEqual(self.qg2_tok.get_qval(), 2)
        self.assertEqual(self.qg3_tok.get_qval(), 3)
        self.assertEqual(self.qg2_tok_wipad.get_qval(), 2)
        self.assertEqual(self.qg3_tok_wipad.get_qval(), 3)

    def test_set_return_set(self):
        """set_return_set toggles duplicate elimination in the output."""
        tok = QgramTokenizer(padding=False)
        self.assertEqual(tok.get_return_set(), False)
        self.assertEqual(
            tok.tokenize('aabaabcdba'),
            ['aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba'])
        self.assertEqual(tok.set_return_set(True), True)
        self.assertEqual(tok.get_return_set(), True)
        self.assertEqual(tok.tokenize('aabaabcdba'),
                         ['aa', 'ab', 'ba', 'bc', 'cd', 'db'])
        self.assertEqual(tok.set_return_set(False), True)
        self.assertEqual(tok.get_return_set(), False)
        self.assertEqual(
            tok.tokenize('aabaabcdba'),
            ['aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba'])
        # Same round-trip with a padded tokenizer.
        tok = QgramTokenizer()
        self.assertEqual(tok.get_return_set(), False)
        self.assertEqual(
            tok.tokenize('aabaabcdba'),
            ['#a', 'aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba',
             'a$'])
        self.assertEqual(tok.set_return_set(True), True)
        self.assertEqual(tok.get_return_set(), True)
        self.assertEqual(tok.tokenize('aabaabcdba'),
                         ['#a', 'aa', 'ab', 'ba', 'bc', 'cd', 'db', 'a$'])
        self.assertEqual(tok.set_return_set(False), True)
        self.assertEqual(tok.get_return_set(), False)
        self.assertEqual(
            tok.tokenize('aabaabcdba'),
            ['#a', 'aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba',
             'a$'])

    def test_set_qval(self):
        """set_qval changes the gram length for subsequent tokenize calls."""
        tok = QgramTokenizer(padding=False)
        self.assertEqual(tok.get_qval(), 2)
        self.assertEqual(tok.tokenize('database'),
                         ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se'])
        self.assertEqual(tok.set_qval(3), True)
        self.assertEqual(tok.get_qval(), 3)
        self.assertEqual(tok.tokenize('database'),
                         ['dat', 'ata', 'tab', 'aba', 'bas', 'ase'])
        # With padding, the pad width follows the new q value.
        tok = QgramTokenizer()
        self.assertEqual(tok.get_qval(), 2)
        self.assertEqual(
            tok.tokenize('database'),
            ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$'])
        self.assertEqual(tok.set_qval(3), True)
        self.assertEqual(tok.get_qval(), 3)
        self.assertEqual(tok.tokenize('database'), [
            '##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$',
            'e$$'
        ])

    def test_set_padding(self):
        """Turning padding off removes the boundary q-grams."""
        tok = QgramTokenizer()
        self.assertEqual(tok.get_padding(), True)
        self.assertEqual(
            tok.tokenize('database'),
            ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$'])
        tok.set_padding(False)
        self.assertEqual(tok.get_padding(), False)
        self.assertEqual(tok.tokenize('database'),
                         ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se'])

    def test_set_prefix_pad(self):
        """Changing the prefix pad character is reflected in the first gram."""
        tok = QgramTokenizer()
        self.assertEqual(tok.get_prefix_pad(), '#')
        self.assertEqual(
            tok.tokenize('database'),
            ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$'])
        tok.set_prefix_pad('^')
        self.assertEqual(tok.get_prefix_pad(), '^')
        self.assertEqual(
            tok.tokenize('database'),
            ['^d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$'])

    def test_set_suffix_pad(self):
        """Changing the suffix pad character is reflected in the last gram."""
        tok = QgramTokenizer()
        self.assertEqual(tok.get_suffix_pad(), '$')
        self.assertEqual(
            tok.tokenize('database'),
            ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$'])
        tok.set_suffix_pad('!')
        self.assertEqual(tok.get_suffix_pad(), '!')
        self.assertEqual(
            tok.tokenize('database'),
            ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e!'])

    # --- input-validation tests ---

    @raises(TypeError)
    def test_qgrams_none(self):
        # Tokenizing None must raise TypeError.
        self.qg2_tok.tokenize(None)

    @raises(AssertionError)
    def test_qgrams_invalid1(self):
        # q must be at least 1.
        invalid_qg_tok = QgramTokenizer(0)

    @raises(TypeError)
    def test_qgrams_invalid2(self):
        # Tokenizing a non-string must raise TypeError.
        self.qg2_tok.tokenize(99)

    @raises(AssertionError)
    def test_set_qval_invalid(self):
        qg_tok = QgramTokenizer()
        qg_tok.set_qval(0)

    @raises(AssertionError)
    def test_padding_invalid(self):
        # padding must be a boolean.
        _ = QgramTokenizer(padding=10)

    @raises(AssertionError)
    def test_set_padding_invalid(self):
        qg = QgramTokenizer()
        qg.set_padding(10)

    @raises(AssertionError)
    def test_prefixpad_invalid1(self):
        # prefix pad must be a string.
        _ = QgramTokenizer(prefix_pad=10)

    @raises(AssertionError)
    def test_prefixpad_invalid2(self):
        # prefix pad must be a single character.
        _ = QgramTokenizer(prefix_pad="###")

    @raises(AssertionError)
    def test_set_prefix_pad_invalid1(self):
        qg = QgramTokenizer()
        qg.set_prefix_pad(10)

    @raises(AssertionError)
    def test_set_prefix_pad_invalid2(self):
        qg = QgramTokenizer()
        qg.set_prefix_pad('###')

    @raises(AssertionError)
    def test_suffixpad_invalid1(self):
        # suffix pad must be a string.
        _ = QgramTokenizer(suffix_pad=10)

    @raises(AssertionError)
    def test_suffixpad_invalid2(self):
        # suffix pad must be a single character.
        _ = QgramTokenizer(suffix_pad="###")

    @raises(AssertionError)
    def test_set_suffix_pad_invalid1(self):
        qg = QgramTokenizer()
        qg.set_suffix_pad(10)

    @raises(AssertionError)
    def test_set_suffix_pad_invalid2(self):
        qg = QgramTokenizer()
        qg.set_suffix_pad('###')
def test_apply_matcher_with_allow_missing(self):
    """Pairs with a NaN join attribute survive matching when allow_missing=True."""
    tok2 = QgramTokenizer(qval=2, return_set=True)
    sim_fn = get_sim_function('JACCARD')
    threshold = 0.3
    comp_op = '>='

    # Brute-force the cartesian product for the ground-truth match set.
    cartprod = self.cartprod
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_fn(tok2.tokenize(str(row[self.l_join_attr])),
                           tok2.tokenize(str(row[self.r_join_attr]))),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]
    expected_pairs = set()
    for _, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), threshold):
            expected_pairs.add(','.join((str(row[self.l_key_attr]),
                                         str(row[self.r_key_attr]))))

    # Collect pairs where either side's join attribute is missing; these
    # must be kept when allow_missing is enabled.
    missing_pairs = set()
    for _, left_row in self.orig_ltable.iterrows():
        for _, right_row in self.orig_rtable.iterrows():
            if (pd.isnull(left_row[self.l_join_attr]) or
                    pd.isnull(right_row[self.r_join_attr])):
                missing_pairs.add(','.join(
                    (str(left_row[self.l_key_attr]),
                     str(right_row[self.r_key_attr]))))
    expected_pairs = expected_pairs.union(missing_pairs)

    # Candidate generation with allow_missing=True.
    overlap_filter = OverlapFilter(tok2, 1, comp_op, allow_missing=True)
    candset = overlap_filter.filter_tables(
        self.orig_ltable, self.orig_rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr)

    # Matching with allow_missing=True and similarity scores emitted.
    output_candset = apply_matcher(
        candset,
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        self.orig_ltable, self.orig_rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr,
        tok2, sim_fn, threshold, comp_op, True,
        out_sim_score=True)

    expected_output_attrs = [
        '_id',
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        '_sim_score'
    ]

    # The output table must expose exactly the expected columns.
    assert_list_equal(list(output_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for _, row in output_candset.iterrows():
        actual_pairs.add(','.join(
            (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
             str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

    # The actual pairs must coincide with the expected pairs.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))