def test_invalid_tokenizer(self):
    """Call apply_matcher with a similarity function where the tokenizer goes.

    The candidate set is an empty DataFrame with the columns
    '_id', 'l_A.ID' and 'r_B.ID'; every other argument comes from the
    fixture attributes on ``self``.

    NOTE(review): no assertion or expected-exception decorator is visible
    in this excerpt; presumably the call is expected to raise -- confirm.
    NOTE(review): another method with this name is visible in the file; if
    both live in the same class the later definition replaces the earlier.
    """
    jaccard = get_sim_function('JACCARD')
    min_score = 0.3

    empty_candset = pd.DataFrame([], columns=['_id', 'l_A.ID', 'r_B.ID'])
    cand_l_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
    cand_r_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr

    # the similarity function is passed twice on purpose: the first
    # occurrence sits in the position where the sibling tests pass a
    # tokenizer object.
    apply_matcher(empty_candset, cand_l_key, cand_r_key,
                  self.ltable, self.rtable,
                  self.l_key_attr, self.r_key_attr,
                  self.l_join_attr, self.r_join_attr,
                  jaccard, jaccard, min_score)
def test_invalid_candset(self):
    """Call apply_matcher with a plain list in place of the candidate set.

    All remaining arguments are the regular fixture values, a 2-gram set
    tokenizer, the 'JACCARD' similarity function and a 0.3 threshold.

    NOTE(review): no assertion or expected-exception decorator is visible
    in this excerpt; presumably the call is expected to raise -- confirm.
    NOTE(review): another method with this name is visible in the file; if
    both live in the same class the later definition replaces the earlier.
    """
    tokenizer = QgramTokenizer(qval=2, return_set=True)
    jaccard = get_sim_function('JACCARD')
    min_score = 0.3

    # an empty list stands in for the candset argument.
    bad_candset = []
    cand_l_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
    cand_r_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr

    apply_matcher(bad_candset, cand_l_key, cand_r_key,
                  self.ltable, self.rtable,
                  self.l_key_attr, self.r_key_attr,
                  self.l_join_attr, self.r_join_attr,
                  tokenizer, jaccard, min_score)
def test_invalid_rtable(self):
    """Call apply_matcher with a plain list in place of the right table.

    The candidate set is an empty DataFrame with the columns
    '_id', 'l_A.ID' and 'r_B.ID'; the left table and the attribute names
    come from the fixture attributes on ``self``.

    NOTE(review): no assertion or expected-exception decorator is visible
    in this excerpt; presumably the call is expected to raise -- confirm.
    """
    tokenizer = QgramTokenizer(qval=2, return_set=True)
    jaccard = get_sim_function('JACCARD')
    min_score = 0.3

    empty_candset = pd.DataFrame([], columns=['_id', 'l_A.ID', 'r_B.ID'])
    cand_l_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
    cand_r_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr

    # an empty list stands in for the rtable argument.
    bad_rtable = []

    apply_matcher(empty_candset, cand_l_key, cand_r_key,
                  self.ltable, bad_rtable,
                  self.l_key_attr, self.r_key_attr,
                  self.l_join_attr, self.r_join_attr,
                  tokenizer, jaccard, min_score)
def test_invalid_tokenizer(self):
    """Pass the similarity function in the tokenizer position of apply_matcher.

    Uses an empty candidate DataFrame (columns '_id', 'l_A.ID', 'r_B.ID')
    and the fixture tables and attribute names stored on ``self``.

    NOTE(review): no assertion or expected-exception decorator is visible
    in this excerpt; presumably the call is expected to raise -- confirm.
    NOTE(review): another method with this name is visible in the file; if
    both live in the same class the later definition replaces the earlier.
    """
    sim_measure = get_sim_function('JACCARD')
    cutoff = 0.3

    candset = pd.DataFrame([], columns=['_id', 'l_A.ID', 'r_B.ID'])
    l_cand_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
    r_cand_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr

    apply_matcher(
        candset,
        l_cand_key,
        r_cand_key,
        self.ltable,
        self.rtable,
        self.l_key_attr,
        self.r_key_attr,
        self.l_join_attr,
        self.r_join_attr,
        sim_measure,  # deliberately not a tokenizer
        sim_measure,
        cutoff,
    )
def test_invalid_r_out_attr(self):
    """Call apply_matcher with r_out_attrs=['invalid_attr'].

    Every other argument is a regular fixture value; the candidate set is
    an empty DataFrame with the columns '_id', 'l_A.ID' and 'r_B.ID'.

    NOTE(review): no assertion or expected-exception decorator is visible
    in this excerpt; presumably the call is expected to raise -- confirm.
    """
    tokenizer = QgramTokenizer(qval=2, return_set=True)
    jaccard = get_sim_function('JACCARD')
    min_score = 0.3

    empty_candset = pd.DataFrame([], columns=['_id', 'l_A.ID', 'r_B.ID'])
    cand_l_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
    cand_r_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr

    # request an output attribute for the right table under a name that
    # is presumably absent from self.rtable.
    requested_r_attrs = ['invalid_attr']

    apply_matcher(empty_candset, cand_l_key, cand_r_key,
                  self.ltable, self.rtable,
                  self.l_key_attr, self.r_key_attr,
                  self.l_join_attr, self.r_join_attr,
                  tokenizer, jaccard, min_score,
                  r_out_attrs=requested_r_attrs)
def test_invalid_candset(self):
    """Hand apply_matcher an empty list where the candidate set is expected.

    The tokenizer (2-grams, returned as a set), the 'JACCARD' similarity
    function, the 0.3 threshold and the fixture tables are all regular
    values.

    NOTE(review): no assertion or expected-exception decorator is visible
    in this excerpt; presumably the call is expected to raise -- confirm.
    NOTE(review): another method with this name is visible in the file; if
    both live in the same class the later definition replaces the earlier.
    """
    qgram_tok = QgramTokenizer(qval=2, return_set=True)
    sim_measure = get_sim_function('JACCARD')
    cutoff = 0.3

    l_cand_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
    r_cand_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr

    apply_matcher(
        [],  # not a DataFrame
        l_cand_key,
        r_cand_key,
        self.ltable,
        self.rtable,
        self.l_key_attr,
        self.r_key_attr,
        self.l_join_attr,
        self.r_join_attr,
        qgram_tok,
        sim_measure,
        cutoff,
    )
def test_empty_candset(self):
    """Run apply_matcher on a candidate set that has columns but no rows.

    The empty DataFrame carries exactly the two prefixed key columns that
    are also passed as the candset key attribute names. The return value
    is discarded, so the test only checks that the call completes.

    NOTE(review): another method with this name is visible in the file; if
    both live in the same class the later definition replaces the earlier.
    """
    tokenizer = QgramTokenizer(qval=2, return_set=True)
    jaccard = get_sim_function('JACCARD')
    min_score = 0.3

    # the same two column names are used to build the frame and to tell
    # apply_matcher where the keys are.
    cand_l_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
    cand_r_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr
    empty_candset = pd.DataFrame(columns=[cand_l_key, cand_r_key])

    apply_matcher(empty_candset, cand_l_key, cand_r_key,
                  self.ltable, self.rtable,
                  self.l_key_attr, self.r_key_attr,
                  self.l_join_attr, self.r_join_attr,
                  tokenizer, jaccard, min_score)
def test_empty_candset(self):
    """Apply the matcher to a zero-row candidate set.

    The candidate DataFrame is built with only the prefixed left and right
    key columns. Nothing is asserted on the result; the call merely has to
    finish.

    NOTE(review): another method with this name is visible in the file; if
    both live in the same class the later definition replaces the earlier.
    """
    qgram_tok = QgramTokenizer(qval=2, return_set=True)
    sim_measure = get_sim_function('JACCARD')
    cutoff = 0.3

    key_columns = [
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
    ]
    no_rows = pd.DataFrame(columns=key_columns)

    apply_matcher(
        no_rows,
        key_columns[0],
        key_columns[1],
        self.ltable,
        self.rtable,
        self.l_key_attr,
        self.r_key_attr,
        self.l_join_attr,
        self.r_join_attr,
        qgram_tok,
        sim_measure,
        cutoff,
    )
def test_apply_matcher_with_join_attr_of_type_int(self):
    """Check apply_matcher on the 'A.zipcode' / 'B.zipcode' join attributes.

    The expected pairs are computed by scoring every row of the cartesian
    product with Jaccard over 2-gram sets (values are converted with
    ``str`` before tokenizing) and keeping rows whose score satisfies
    ``>= 0.3``. The actual pairs come from running apply_matcher on a
    candidate set produced by an overlap filter. The test asserts the
    output columns and that both pair sets coincide.
    """
    tokenizer = QgramTokenizer(qval=2, return_set=True)
    jaccard = get_sim_function('JACCARD')
    threshold = 0.3
    comp_op = '>='
    l_join_attr = 'A.zipcode'
    r_join_attr = 'B.zipcode'

    def score_row(row):
        # tokenize the stringified join values and compare the token sets.
        return jaccard(tokenizer.tokenize(str(row[l_join_attr])),
                       tokenizer.tokenize(str(row[r_join_attr])))

    # score the entire cartesian product to obtain the ground truth.
    # cartprod is the same object as self.cartprod, so the 'sim_score'
    # column is written onto that fixture frame.
    cartprod = self.cartprod
    cartprod['sim_score'] = cartprod.apply(score_row, axis=1)

    satisfies = COMP_OP_MAP[comp_op]

    # expected output pairs, encoded as "<left key>,<right key>".
    expected_pairs = {
        ','.join((str(row[self.l_key_attr]), str(row[self.r_key_attr])))
        for _, row in cartprod.iterrows()
        if satisfies(float(row['sim_score']), threshold)
    }

    # obtain a candset with an overlap filter.
    candset = OverlapFilter(tokenizer, 1, comp_op).filter_tables(
        self.ltable, self.rtable,
        self.l_key_attr, self.r_key_attr,
        l_join_attr, r_join_attr)

    out_l_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
    out_r_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr

    # run the jaccard matcher over the candset.
    output_candset = apply_matcher(candset, out_l_key, out_r_key,
                                   self.ltable, self.rtable,
                                   self.l_key_attr, self.r_key_attr,
                                   l_join_attr, r_join_attr,
                                   tokenizer, jaccard, threshold)

    # the output table must expose exactly these attributes, in order.
    expected_output_attrs = ['_id', out_l_key, out_r_key, '_sim_score']
    assert_list_equal(list(output_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = {
        ','.join((str(row[out_l_key]), str(row[out_r_key])))
        for _, row in output_candset.iterrows()
    }

    # same size and full overlap means the two sets are equal.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs & expected_pairs
    assert_equal(len(common_pairs), len(expected_pairs))
def test_apply_matcher(self):
    """Check apply_matcher output pairs and columns on the fixture tables.

    Ground truth is obtained by scoring each row of the cartesian product
    with Jaccard over 2-gram sets and keeping rows whose score satisfies
    ``>= 0.3``. apply_matcher is then run on an overlap-filter candidate
    set, requesting the join attributes as extra output columns and
    ``out_sim_score=True``. The test asserts the output column list and
    that the produced pairs equal the expected pairs.
    """
    tokenizer = QgramTokenizer(qval=2, return_set=True)
    jaccard = get_sim_function('JACCARD')
    threshold = 0.3
    comp_op = '>='

    def score_row(row):
        # tokenize the stringified join values and compare the token sets.
        return jaccard(tokenizer.tokenize(str(row[self.l_join_attr])),
                       tokenizer.tokenize(str(row[self.r_join_attr])))

    # score the entire cartesian product to obtain the ground truth.
    # cartprod is the same object as self.cartprod, so the 'sim_score'
    # column is written onto that fixture frame.
    cartprod = self.cartprod
    cartprod['sim_score'] = cartprod.apply(score_row, axis=1)

    satisfies = COMP_OP_MAP[comp_op]

    # expected output pairs, encoded as "<left key>,<right key>".
    expected_pairs = {
        ','.join((str(row[self.l_key_attr]), str(row[self.r_key_attr])))
        for _, row in cartprod.iterrows()
        if satisfies(float(row['sim_score']), threshold)
    }

    # obtain a candset with an overlap filter.
    candset = OverlapFilter(tokenizer, 1, comp_op).filter_tables(
        self.ltable, self.rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr)

    out_l_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
    out_r_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr

    # run the jaccard matcher over the candset; the two single-element
    # lists ask for the join attributes as extra output columns.
    output_candset = apply_matcher(candset, out_l_key, out_r_key,
                                   self.ltable, self.rtable,
                                   self.l_key_attr, self.r_key_attr,
                                   self.l_join_attr, self.r_join_attr,
                                   tokenizer, jaccard, threshold,
                                   comp_op, False,
                                   [self.l_join_attr], [self.r_join_attr],
                                   out_sim_score=True)

    # the output table must expose exactly these attributes, in order.
    expected_output_attrs = [
        '_id',
        out_l_key,
        out_r_key,
        DEFAULT_L_OUT_PREFIX + self.l_join_attr,
        DEFAULT_R_OUT_PREFIX + self.r_join_attr,
        '_sim_score',
    ]
    assert_list_equal(list(output_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = {
        ','.join((str(row[out_l_key]), str(row[out_r_key])))
        for _, row in output_candset.iterrows()
    }

    # same size and full overlap means the two sets are equal.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs & expected_pairs
    assert_equal(len(common_pairs), len(expected_pairs))
def test_apply_matcher_with_allow_missing(self):
    """Check apply_matcher on the original tables with allow_missing on.

    The expected pairs are the union of (a) cartesian-product rows whose
    Jaccard score over 2-gram sets satisfies ``>= 0.3`` and (b) every
    left/right row combination from ``self.orig_ltable`` and
    ``self.orig_rtable`` in which at least one join value is null. The
    actual pairs come from apply_matcher run on an overlap-filter candset
    built with ``allow_missing=True``. The test asserts the output columns
    and that both pair sets coincide.

    NOTE(review): another method with this name is visible in the file; if
    both live in the same class the later definition replaces the earlier.
    """
    tokenizer = QgramTokenizer(qval=2, return_set=True)
    jaccard = get_sim_function('JACCARD')
    threshold = 0.3
    comp_op = '>='

    def score_row(row):
        # tokenize the stringified join values and compare the token sets.
        return jaccard(tokenizer.tokenize(str(row[self.l_join_attr])),
                       tokenizer.tokenize(str(row[self.r_join_attr])))

    # score the entire cartesian product to obtain the ground truth.
    # cartprod is the same object as self.cartprod, so the 'sim_score'
    # column is written onto that fixture frame.
    cartprod = self.cartprod
    cartprod['sim_score'] = cartprod.apply(score_row, axis=1)

    satisfies = COMP_OP_MAP[comp_op]

    # pairs that pass the threshold, encoded as "<left key>,<right key>".
    scored_pairs = {
        ','.join((str(row[self.l_key_attr]), str(row[self.r_key_attr])))
        for _, row in cartprod.iterrows()
        if satisfies(float(row['sim_score']), threshold)
    }

    # pairs that must appear in the output because one of the join
    # attributes holds a missing value.
    missing_pairs = {
        ','.join((str(l_row[self.l_key_attr]), str(r_row[self.r_key_attr])))
        for _, l_row in self.orig_ltable.iterrows()
        for _, r_row in self.orig_rtable.iterrows()
        if (pd.isnull(l_row[self.l_join_attr]) or
            pd.isnull(r_row[self.r_join_attr]))
    }

    expected_pairs = scored_pairs | missing_pairs

    # obtain a candset with an overlap filter that allows missing values.
    overlap_filter = OverlapFilter(tokenizer, 1, comp_op, allow_missing=True)
    candset = overlap_filter.filter_tables(
        self.orig_ltable, self.orig_rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr)

    out_l_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
    out_r_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr

    # run the jaccard matcher over the candset with allow_missing set to
    # True (the positional argument right after comp_op).
    output_candset = apply_matcher(candset, out_l_key, out_r_key,
                                   self.orig_ltable, self.orig_rtable,
                                   self.l_key_attr, self.r_key_attr,
                                   self.l_join_attr, self.r_join_attr,
                                   tokenizer, jaccard, threshold,
                                   comp_op, True,
                                   out_sim_score=True)

    # the output table must expose exactly these attributes, in order.
    expected_output_attrs = ['_id', out_l_key, out_r_key, '_sim_score']
    assert_list_equal(list(output_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = {
        ','.join((str(row[out_l_key]), str(row[out_r_key])))
        for _, row in output_candset.iterrows()
    }

    # same size and full overlap means the two sets are equal.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs & expected_pairs
    assert_equal(len(common_pairs), len(expected_pairs))
def test_apply_matcher_with_allow_missing(self):
    """Verify apply_matcher keeps pairs with missing join values.

    Expected pairs = rows of the cartesian product whose Jaccard score
    (2-gram sets) satisfies ``>= 0.3``, plus every combination of rows
    from ``self.orig_ltable`` and ``self.orig_rtable`` where either join
    value is null. Actual pairs are read from the apply_matcher output for
    an overlap-filter candset created with ``allow_missing=True``. Both
    the output column list and the pair sets are asserted.

    NOTE(review): another method with this name is visible in the file; if
    both live in the same class the later definition replaces the earlier.
    """
    qgram_tok = QgramTokenizer(qval=2, return_set=True)
    sim_measure = get_sim_function('JACCARD')
    threshold = 0.3
    comp_op = '>='

    def pair_key(left_id, right_id):
        # canonical "<left key>,<right key>" encoding of a pair.
        return ','.join((str(left_id), str(right_id)))

    def similarity(row):
        left_tokens = qgram_tok.tokenize(str(row[self.l_join_attr]))
        right_tokens = qgram_tok.tokenize(str(row[self.r_join_attr]))
        return sim_measure(left_tokens, right_tokens)

    # apply the sim function to the whole cartesian product; the column
    # is added to the object referenced by self.cartprod.
    cartprod = self.cartprod
    cartprod['sim_score'] = cartprod.apply(similarity, axis=1)

    comp_fn = COMP_OP_MAP[comp_op]

    # pairs satisfying the threshold.
    expected_pairs = set()
    for _, prod_row in cartprod.iterrows():
        if not comp_fn(float(prod_row['sim_score']), threshold):
            continue
        expected_pairs.add(pair_key(prod_row[self.l_key_attr],
                                    prod_row[self.r_key_attr]))

    # pairs that must be present because a join attribute is missing.
    missing_pairs = set()
    for _, left_row in self.orig_ltable.iterrows():
        for _, right_row in self.orig_rtable.iterrows():
            left_missing = pd.isnull(left_row[self.l_join_attr])
            if left_missing or pd.isnull(right_row[self.r_join_attr]):
                missing_pairs.add(pair_key(left_row[self.l_key_attr],
                                           right_row[self.r_key_attr]))

    expected_pairs = expected_pairs | missing_pairs

    # candset from an overlap filter with allow_missing set to True.
    candset = OverlapFilter(
        qgram_tok, 1, comp_op, allow_missing=True
    ).filter_tables(
        self.orig_ltable, self.orig_rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr)

    l_out_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
    r_out_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr

    # jaccard matcher over the candset, allow_missing set to True (the
    # positional argument right after comp_op).
    output_candset = apply_matcher(
        candset,
        l_out_key,
        r_out_key,
        self.orig_ltable,
        self.orig_rtable,
        self.l_key_attr,
        self.r_key_attr,
        self.l_join_attr,
        self.r_join_attr,
        qgram_tok,
        sim_measure,
        threshold,
        comp_op,
        True,
        out_sim_score=True,
    )

    # verify the output table has the necessary attributes.
    expected_output_attrs = ['_id', l_out_key, r_out_key, '_sim_score']
    assert_list_equal(list(output_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for _, out_row in output_candset.iterrows():
        actual_pairs.add(pair_key(out_row[l_out_key], out_row[r_out_key]))

    # verify the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))