def test_feature_fn_valid_nosim_tok(self):
    # Build a feature from a feature string while handing get_feature_fn two
    # empty dictionaries. Nothing is asserted in this body.
    left_table = read_csv_metadata(path_a)
    right_table = read_csv_metadata(path_b, key='ID')
    matching_features = get_features_for_matching(left_table, right_table)
    initial_count = len(matching_features)
    zipcode_feature = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    first_empty, second_empty = {}, {}
    feature_spec = get_feature_fn(zipcode_feature, first_empty, second_empty)
def test_feature_fn_valid_nosim_tok(self):
    # Build a feature from a feature string while handing get_feature_fn two
    # empty dictionaries; the feature table is requested with
    # validate_inferred_attr_types=False. Nothing is asserted in this body.
    left_table = read_csv_metadata(path_a)
    right_table = read_csv_metadata(path_b, key='ID')
    matching_features = get_features_for_matching(
        left_table, right_table, validate_inferred_attr_types=False)
    initial_count = len(matching_features)
    zipcode_feature = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    first_empty, second_empty = {}, {}
    feature_spec = get_feature_fn(zipcode_feature, first_empty, second_empty)
def test_add_feature_invalid_df_columns(self):
    # Try to add a feature to an empty DataFrame (one with no columns).
    # NOTE(review): no assertion is visible in this body; any expected-exception
    # check presumably sits in a decorator outside this snippet - confirm.
    left_table = read_csv_metadata(path_a)
    right_table = read_csv_metadata(path_b, key='ID')
    zipcode_feature = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    tokenizers = get_tokenizers_for_matching()
    sim_funs = get_sim_funs_for_matching()
    feature_spec = get_feature_fn(zipcode_feature, tokenizers, sim_funs)
    columnless_frame = pd.DataFrame()
    add_feature(columnless_frame, 'test', feature_spec)
def test_add_feature_name_already_present(self):
    # Add the same feature under the name 'test' twice to a fresh feature table.
    # NOTE(review): no assertion is visible in this body; any expected-exception
    # check presumably sits in a decorator outside this snippet - confirm.
    left_table = read_csv_metadata(path_a)
    right_table = read_csv_metadata(path_b, key='ID')
    fresh_table = create_feature_table()
    initial_count = len(fresh_table)
    zipcode_feature = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    tokenizers = get_tokenizers_for_matching()
    sim_funs = get_sim_funs_for_matching()
    feature_spec = get_feature_fn(zipcode_feature, tokenizers, sim_funs)
    # Two identical registrations, one after the other.
    add_feature(fresh_table, 'test', feature_spec)
    add_feature(fresh_table, 'test', feature_spec)
def test_rulebased_matcher_rule_wi_no_auto_gen_feature(self):
    # Register a jaccard/qgm_3 feature on 'name' as 'test', add a rule that
    # thresholds it, and compare the predictions with expected_labels_1.
    name_jaccard = "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))"
    tokenizers = get_tokenizers_for_blocking()
    sim_funs = get_sim_funs_for_blocking()
    feature_spec = get_feature_fn(name_jaccard, tokenizers, sim_funs)
    add_feature(self.feature_table, 'test', feature_spec)
    # same as rule_1
    threshold_rule = ['test(ltuple, rtuple) > 0.4']
    self.brm.add_rule(threshold_rule, self.feature_table)
    predicted = self.brm.predict(table=self.C)
    assert_equal(expected_labels_1, predicted)
def test_rulebased_matcher_rule_wi_overlap_coeff_sim_fn(self):
    # Register an overlap_coeff/dlm_dc0 feature on 'name' as 'test', flagged as
    # auto-generated, then check a '> 1' rule against expected_labels_all_zeroes.
    name_overlap = "overlap_coeff(dlm_dc0(ltuple['name']), dlm_dc0(rtuple['name']))"
    tokenizers = get_tokenizers_for_blocking()
    sim_funs = get_sim_funs_for_blocking()
    feature_spec = get_feature_fn(name_overlap, tokenizers, sim_funs)
    feature_spec['is_auto_generated'] = True
    add_feature(self.feature_table, 'test', feature_spec)
    # should return an empty set
    threshold_rule = ['test(ltuple, rtuple) > 1']
    self.brm.add_rule(threshold_rule, self.feature_table)
    predicted = self.brm.predict(table=self.C)
    assert_equal(expected_labels_all_zeroes, predicted)
def test_rb_block_tables_rule_wi_no_auto_gen_feature(self):
    # Register a jaccard/qgm_3 feature on 'name' as 'test', block tables A and B
    # with a rule on it, and validate the result against expected_ids_1.
    name_jaccard = "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))"
    tokenizers = get_tokenizers_for_blocking()
    sim_funs = get_sim_funs_for_blocking()
    feature_spec = get_feature_fn(name_jaccard, tokenizers, sim_funs)
    add_feature(self.feature_table, 'test', feature_spec)
    # same as rule_1
    threshold_rule = ['test(ltuple, rtuple) < 0.3']
    self.rb.add_rule(threshold_rule, self.feature_table)
    candidates = self.rb.block_tables(self.A, self.B, show_progress=False)
    validate_metadata(candidates)
    validate_data(candidates, expected_ids_1)
def test_add_features_valid_1(self):
    # Add one feature named 'test' to the table returned by
    # get_features_for_matching, then check that the table grew by exactly one
    # row and that the stored function returns 1.0 for the chosen tuple pair.
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    feature_table = get_features_for_matching(A, B, validate_inferred_attr_types=False)
    len1 = len(feature_table)
    feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(),
                            get_sim_funs_for_matching())
    add_feature(feature_table, 'test', f_dict)
    len2 = len(feature_table)
    self.assertEqual(len1 + 1, len2)
    # DataFrame.ix was deprecated in pandas 0.20 and removed in pandas 1.0, so
    # the lookups use label-based .loc instead.
    # NOTE(review): assumes A, B and feature_table carry default integer index
    # labels, for which .ix and .loc select the same rows - confirm.
    newest_feature_fn = feature_table.loc[len(feature_table) - 1, 'function']
    self.assertEqual(newest_feature_fn(A.loc[1], B.loc[2]), 1.0)
def test_rb_block_tables_rule_wi_overlap_coeff_sim_fn(self):
    # Register an overlap_coeff/dlm_dc0 feature on 'name' as 'test', flagged as
    # auto-generated, block tables A and B with a '<= 1' rule, and validate the
    # result without passing any expected ids.
    name_overlap = "overlap_coeff(dlm_dc0(ltuple['name']), dlm_dc0(rtuple['name']))"
    tokenizers = get_tokenizers_for_blocking()
    sim_funs = get_sim_funs_for_blocking()
    feature_spec = get_feature_fn(name_overlap, tokenizers, sim_funs)
    feature_spec['is_auto_generated'] = True
    add_feature(self.feature_table, 'test', feature_spec)
    # should return an empty set
    threshold_rule = ['test(ltuple, rtuple) <= 1']
    self.rb.add_rule(threshold_rule, self.feature_table)
    candidates = self.rb.block_tables(self.A, self.B, show_progress=False)
    validate_metadata(candidates)
    validate_data(candidates)
def test_trigger_rule_wi_no_auto_gen_feature(self):
    # Register a jaccard/qgm_3 feature on 'name' as 'test', configure the
    # trigger (condition rule, status False, action 0), execute it on C and
    # compare the 'neg_trig_labels' column with expected_labels_1.
    name_jaccard = "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))"
    tokenizers = get_tokenizers_for_blocking()
    sim_funs = get_sim_funs_for_blocking()
    feature_spec = get_feature_fn(name_jaccard, tokenizers, sim_funs)
    add_feature(self.feature_table, 'test', feature_spec)
    # same as rule_1
    condition_rule = ['test(ltuple, rtuple) > 0.4']
    self.mt.add_cond_rule(condition_rule, self.feature_table)
    self.mt.add_cond_status(False)
    self.mt.add_action(0)
    label_column = 'neg_trig_labels'
    triggered = self.mt.execute(self.C, label_column, inplace=False)
    predicted = triggered[label_column].tolist()
    assert_equal(expected_labels_1, predicted)
def test_trigger_rule_wi_diff_tokenizers(self):
    # Register a jaccard feature on 'address' that tokenizes the left side with
    # qgm_3 and the right side with dlm_dc0, flagged as auto-generated; then
    # configure the trigger (condition rule, status False, action 0), execute
    # it on C and compare 'neg_trig_labels' with expected_labels_zeros.
    mixed_tokenizer_feature = "jaccard(qgm_3(ltuple['address']), dlm_dc0(rtuple['address']))"
    tokenizers = get_tokenizers_for_blocking()
    sim_funs = get_sim_funs_for_blocking()
    feature_spec = get_feature_fn(mixed_tokenizer_feature, tokenizers, sim_funs)
    feature_spec['is_auto_generated'] = True
    add_feature(self.feature_table, 'test', feature_spec)
    # should return an empty set
    condition_rule = ['test(ltuple, rtuple) > 1']
    self.mt.add_cond_rule(condition_rule, self.feature_table)
    self.mt.add_cond_status(False)
    self.mt.add_action(0)
    label_column = 'neg_trig_labels'
    triggered = self.mt.execute(self.C, label_column, inplace=False)
    predicted = triggered[label_column].tolist()
    assert_equal(expected_labels_zeros, predicted)
def test_add_features_valid_1(self):
    # Add one feature named 'test' to the table returned by
    # get_features_for_matching, then check that the table grew by exactly one
    # row and that the stored function returns 1.0 for the chosen tuple pair.
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    feature_table = get_features_for_matching(A, B)
    len1 = len(feature_table)
    feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(),
                            get_sim_funs_for_matching())
    add_feature(feature_table, 'test', f_dict)
    len2 = len(feature_table)
    self.assertEqual(len1 + 1, len2)
    # DataFrame.ix was deprecated in pandas 0.20 and removed in pandas 1.0, so
    # the lookups use label-based .loc instead.
    # NOTE(review): assumes A, B and feature_table carry default integer index
    # labels, for which .ix and .loc select the same rows - confirm.
    newest_feature_fn = feature_table.loc[len(feature_table) - 1, 'function']
    self.assertEqual(newest_feature_fn(A.loc[1], B.loc[2]), 1.0)
def test_add_feature_invalid_df_columns(self):
    # Adding a feature to an empty DataFrame is expected to raise an
    # AssertionError whose message matches the text below exactly.
    left_table = read_csv_metadata(path_a)
    right_table = read_csv_metadata(path_b, key='ID')
    zipcode_feature = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    tokenizers = get_tokenizers_for_matching()
    sim_funs = get_sim_funs_for_matching()
    feature_spec = get_feature_fn(zipcode_feature, tokenizers, sim_funs)

    with self.assertRaises(AssertionError) as raised:
        add_feature(pd.DataFrame(), 'test', feature_spec)

    message = str(raised.exception)
    print(message)
    # Adjacent literals are concatenated into one expected message.
    wanted = (
        'Feature table does not have all required columns\n '
        'The following columns are missing: feature_name, left_attribute, right_attribute, '
        'left_attr_tokenizer,'
        ' right_attr_tokenizer, simfunction, function, function_source, is_auto_generated'
    )
    self.assertEqual(message, wanted)
def test_get_feature_fn_invalid_feat_str(self):
    # Call get_feature_fn with None in place of the feature string and two
    # empty dictionaries for the remaining arguments.
    # NOTE(review): no assertion is visible in this body; any expected-exception
    # check presumably sits in a decorator outside this snippet - confirm.
    missing_feature_string = None
    get_feature_fn(missing_feature_string, {}, {})
def test_get_feature_fn_invalid_sim(self):
    # Call get_feature_fn with an empty feature string, None as the second
    # argument and an empty dictionary as the third.
    # NOTE(review): other get_feature_fn calls in this file pass tokenizers
    # second and similarity functions third, so this None appears to sit in the
    # tokenizers slot despite the "sim" in the test name - confirm intent.
    # NOTE(review): no assertion is visible in this body; any expected-exception
    # check presumably sits in a decorator outside this snippet - confirm.
    blank_feature_string = ""
    second_arg = None
    get_feature_fn(blank_feature_string, second_arg, {})
def test_get_feature_fn_invalid_tok(self):
    # Call get_feature_fn with an empty feature string, an empty dictionary as
    # the second argument and None as the third.
    # NOTE(review): other get_feature_fn calls in this file pass tokenizers
    # second and similarity functions third, so this None appears to sit in the
    # similarity-function slot despite the "tok" in the test name - confirm.
    # NOTE(review): no assertion is visible in this body; any expected-exception
    # check presumably sits in a decorator outside this snippet - confirm.
    blank_feature_string = ""
    third_arg = None
    get_feature_fn(blank_feature_string, {}, third_arg)