def test_ml_matcher_inplace_false_predict(self):
    """Predict with ``inplace=False`` and ``append=True`` on a decision tree.

    Checks that the returned table is a different object from ``test``, has
    the same number of rows, contains every column of ``test``, and ends
    with the ``predicted`` column.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
    split = mu.train_test_split(feature_vectors)
    train = split['train']
    test = split['test']
    matcher = DTMatcher(name='DecisionTree')

    # Strip the foreign-key columns from both splits, then the label column
    # from the test split only.
    for frame in (train, test):
        for fk_col in ('ltable.id', 'rtable.id'):
            frame.drop(fk_col, axis=1, inplace=True)
    test.drop('gold', axis=1, inplace=True)

    matcher.fit(table=train, exclude_attrs='_id', target_attr='gold')
    predictions = matcher.predict(
        table=test,
        exclude_attrs='_id',
        target_attr='predicted',
        inplace=False,
        append=True,
    )

    self.assertIsNot(predictions, test)
    self.assertEqual(len(predictions), len(test))
    self.assertTrue(set(test.columns).issubset(predictions.columns))
    self.assertEqual(predictions.columns[-1], 'predicted')
def test_ml_matcher_invalid_input_combn_fit(self):
    """Call ``fit`` with both ``x`` and ``table`` supplied.

    NOTE(review): no assertion is made here; presumably the expected
    failure is declared outside this block (e.g. by a decorator) - confirm.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
    split = mu.train_test_split(feature_vectors)
    train = split['train']
    matcher = DTMatcher(name='DecisionTree')
    matcher.fit(x=train, table=train)
def test_ml_matcher_invalid_df_predict(self):
    """Fit on the training split, then call ``predict`` with ``table=""``.

    NOTE(review): no assertion is made here; presumably the expected
    failure is declared outside this block (e.g. by a decorator) - confirm.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
    split = mu.train_test_split(feature_vectors)
    train = split['train']
    matcher = DTMatcher(name='DecisionTree')
    matcher.fit(
        table=train,
        exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
        target_attr='gold',
    )
    # An empty string stands in for the table argument.
    matcher.predict(
        table="",
        exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
        target_attr='predicted',
        append=True,
    )
def test_ml_matcher_target_attr_not_present_fit(self):
    """Call ``fit`` with ``target_attr='gold1'`` after dropping the fk columns.

    NOTE(review): no assertion is made here; presumably 'gold1' is absent
    from the table and the expected failure is declared outside this block
    - confirm.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
    split = mu.train_test_split(feature_vectors)
    train = split['train']
    test = split['test']
    matcher = DTMatcher(name='DecisionTree')

    # Remove the foreign-key columns from both splits.
    for frame in (train, test):
        for fk_col in ('ltable.id', 'rtable.id'):
            frame.drop(fk_col, axis=1, inplace=True)

    matcher.fit(table=train, exclude_attrs='_id', target_attr='gold1')
def test_ml_matcher_invalid_df_predict(self):
    """Fit a decision tree matcher, then pass ``table=""`` to ``predict``.

    NOTE(review): the block asserts nothing itself; presumably the expected
    failure is declared outside this block (e.g. by a decorator) - confirm.
    """
    excluded = ['ltable.id', 'rtable.id', '_id', 'gold']

    table_a = read_csv_metadata(fpath_a, key='id')
    table_b = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=table_a, rtable=table_b)
    train = mu.train_test_split(feature_vectors)['train']

    tree_matcher = DTMatcher(name='DecisionTree')
    tree_matcher.fit(table=train, exclude_attrs=list(excluded), target_attr='gold')
    # The table argument is deliberately an empty string.
    tree_matcher.predict(
        table="",
        exclude_attrs=list(excluded),
        target_attr='predicted',
        append=True,
    )
def test_select_matcher_valid_3(self):
    """Select a matcher by recall from explicit ``x``/``y`` inputs.

    Checks that the first three columns of the cross-validation statistics
    are 'Name', 'Matcher' and 'Num folds' (in any order), that the last
    column is 'Mean score', and that the selected matcher's mean score is
    the maximum mean score in the table.
    """
    A = read_csv_metadata(path_a, key='id')
    B = read_csv_metadata(path_b, key='id')
    # Feature vectors are loaded precomputed from path_f rather than being
    # extracted from a candidate set.
    feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)

    matchers = [
        DTMatcher(),
        NBMatcher(),
        RFMatcher(),
        SVMMatcher(),
        LinRegMatcher(),
        LogRegMatcher(),
    ]

    # Everything except the key, the two foreign keys and the label column
    # is used as a feature.
    non_feature_cols = [
        cm.get_key(feature_vectors),
        cm.get_fk_ltable(feature_vectors),
        cm.get_fk_rtable(feature_vectors),
        'gold',
    ]
    feature_cols = list_diff(list(feature_vectors.columns), non_feature_cols)
    X = feature_vectors[feature_cols]
    Y = feature_vectors['gold']

    result = select_matcher(matchers, x=X, y=Y, metric='recall')

    header = ['Name', 'Matcher', 'Num folds']
    result_df = result['cv_stats']
    self.assertEqual(set(header), set(result_df.columns[[0, 1, 2]]))
    self.assertEqual('Mean score', result_df.columns[-1])

    stats_by_name = result_df.set_index('Name')
    # `.loc` replaces the `.ix` indexer and `Series.max()` replaces
    # `pd.np.max`; both originals have been removed from pandas.
    p_max = stats_by_name.loc[result['selected_matcher'].name, 'Mean score']
    a_max = stats_by_name['Mean score'].max()
    self.assertEqual(p_max, a_max)
def test_select_matcher_valid_1(self):
    """Select a matcher from a feature-vector table using 7 folds.

    The table, excluded attributes and target attribute are passed instead
    of explicit ``x``/``y``. Checks that the first three columns of the
    cross-validation statistics are 'Name', 'Matcher' and 'Num folds' (in
    any order), that the last column is 'Mean score', and that the selected
    matcher's mean score is the maximum mean score in the table.
    """
    A = read_csv_metadata(path_a, key='id')
    B = read_csv_metadata(path_b, key='id')
    # Feature vectors are loaded precomputed from path_f.
    feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)

    matchers = [
        DTMatcher(),
        NBMatcher(),
        RFMatcher(),
        SVMMatcher(),
        LinRegMatcher(),
        LogRegMatcher(),
    ]

    result = select_matcher(
        matchers,
        x=None,
        y=None,
        table=feature_vectors,
        exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
        target_attr='gold',
        k=7,
    )

    header = ['Name', 'Matcher', 'Num folds']
    result_df = result['cv_stats']
    self.assertEqual(set(header), set(result_df.columns[[0, 1, 2]]))
    self.assertEqual('Mean score', result_df.columns[-1])

    stats_by_name = result_df.set_index('Name')
    # `.loc` replaces the `.ix` indexer and `Series.max()` replaces
    # `pd.np.max`; both originals have been removed from pandas.
    p_max = stats_by_name.loc[result['selected_matcher'].name, 'Mean score']
    a_max = stats_by_name['Mean score'].max()
    self.assertEqual(p_max, a_max)
def test_select_matcher_target_attr_not_present(self):
    """Call ``select_matcher`` with ``target_attr='labels1'``.

    The foreign-key columns are removed from the feature-vector table before
    the call.

    NOTE(review): no assertion is made here; presumably 'labels1' is not a
    column of the table and the expected failure is declared outside this
    block - confirm.
    """
    ltable = read_csv_metadata(path_a, key='id')
    rtable = read_csv_metadata(path_b, key='id')
    # Feature vectors are loaded precomputed from path_f.
    feature_vectors = read_csv_metadata(path_f, ltable=ltable, rtable=rtable)

    candidate_matchers = [
        DTMatcher(),
        NBMatcher(),
        RFMatcher(),
        SVMMatcher(),
        LinRegMatcher(),
        LogRegMatcher(),
    ]

    all_columns = list(feature_vectors.columns)
    fk_columns = [
        cm.get_fk_ltable(feature_vectors),
        cm.get_fk_rtable(feature_vectors),
    ]
    kept_columns = list_diff(all_columns, fk_columns)
    feature_vectors = feature_vectors[kept_columns]

    select_matcher(
        candidate_matchers,
        x=None,
        y=None,
        table=feature_vectors,
        exclude_attrs='_id',
        target_attr='labels1',
        k=2,
    )
def test_vis_debug_matcher_dt_valid_1(self):
    """Run ``_vis_debug_dt`` on a train/test split with the window hidden.

    Labels (7 zeros followed by 8 ones) are attached to the candidate set
    before feature vectors are extracted. The test only exercises the call;
    it makes no assertion on the outcome.
    """
    ltable = read_csv_metadata(path_a)
    rtable = read_csv_metadata(path_b, key='ID')
    candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
    candset['labels'] = [0] * 7 + [1] * 8

    feature_table = get_features_for_matching(ltable, rtable)
    feature_vectors = extract_feature_vecs(
        candset, feature_table=feature_table, attrs_after='labels')

    matcher = DTMatcher()
    split = mu.train_test_split(feature_vectors)
    _vis_debug_dt(
        matcher,
        split['train'],
        split['test'],
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
        target_attr='labels',
        show_window=False,
    )
def test_ml_matcher_valid_1(self):
    """Fit and predict with a decision tree matcher using list-valued excludes.

    Checks that the predictions have one row per test row, that every
    prediction column is also a column of ``test``, and that the last
    prediction column is ``predicted``.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
    split = mu.train_test_split(feature_vectors)
    train = split['train']
    test = split['test']

    matcher = DTMatcher(name='DecisionTree')
    matcher.fit(
        table=train,
        exclude_attrs=['ltable.id', 'rtable.id', '_id'],
        target_attr='gold',
    )
    predictions = matcher.predict(
        table=test,
        exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
        target_attr='predicted',
        append=True,
    )

    self.assertEqual(len(predictions), len(test))
    self.assertTrue(set(predictions.columns).issubset(test.columns))
    self.assertEqual(predictions.columns[-1], 'predicted')
def test_visualize_tree_invalid_df(self):
    """Call ``visualize_tree`` with a column index in place of a table.

    A decision tree matcher is fitted on labelled feature vectors first; the
    fitted ``clf`` and ``feature_vectors.columns`` are then passed on.

    NOTE(review): no assertion is made here; presumably the expected
    failure is declared outside this block - confirm.
    """
    excluded = ['_id', 'ltable_ID', 'rtable_ID', 'labels']

    ltable = read_csv_metadata(path_a)
    rtable = read_csv_metadata(path_b, key='ID')
    candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
    candset['labels'] = [0] * 7 + [1] * 8

    feature_table = get_features_for_matching(ltable, rtable)
    feature_vectors = extract_feature_vecs(
        candset, feature_table=feature_table, attrs_after='labels')

    matcher = DTMatcher()
    matcher.fit(table=feature_vectors, exclude_attrs=list(excluded),
                target_attr='labels')
    visualize_tree(matcher.clf, feature_vectors.columns,
                   exclude_attrs=list(excluded))
def test_debug_dt_matcher_valid(self):
    """Run ``debug_decisiontree_matcher`` on one tuple from each input table.

    A decision tree matcher is fitted on labelled feature vectors (7 zeros
    followed by 8 ones), then the debugger is invoked with row 1 of ``A``
    and row 2 of ``B``. The test only exercises the call; it makes no
    assertion on the outcome.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C['labels'] = [0] * 7 + [1] * 8

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(
        C, feature_table=feature_table, attrs_after='labels')

    dt = DTMatcher()
    dt.fit(
        table=feature_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
        target_attr='labels',
    )

    # `.loc` replaces the `.ix` indexer, which has been removed from pandas.
    # Assumes A and B carry the default integer index from CSV loading, where
    # `.ix` was label-based as well - TODO confirm.
    debug_decisiontree_matcher(
        dt,
        A.loc[1],
        B.loc[2],
        feat_table=feature_table,
        fv_columns=feature_vectors.columns,
        exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'],
    )
def test_vis_debug_matcher_dt_invalid_tar_attr(self):
    """Call ``_vis_debug_dt`` with ``target_attr=None`` and empty data frames.

    NOTE(review): no assertion is made here; presumably the expected
    failure is declared outside this block - confirm.
    """
    matcher = DTMatcher()
    empty_train = pd.DataFrame()
    empty_test = pd.DataFrame()
    _vis_debug_dt(
        matcher,
        empty_train,
        empty_test,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
        target_attr=None,
        show_window=False,
    )
def test_ml_matcher_valid_2(self):
    """Fit and predict with explicit ``x``/``y`` inputs.

    The feature columns are all columns except the key, the two foreign
    keys and 'gold'. Checks that one prediction is produced per test row.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
    split = mu.train_test_split(feature_vectors)
    train = split['train']
    test = split['test']
    matcher = DTMatcher(name='DecisionTree')

    all_columns = list(feature_vectors.columns)
    non_feature_cols = [
        cm.get_key(feature_vectors),
        cm.get_fk_ltable(feature_vectors),
        cm.get_fk_rtable(feature_vectors),
        'gold',
    ]
    feature_cols = list_diff(all_columns, non_feature_cols)

    matcher.fit(x=train[feature_cols], y=train['gold'])
    predictions = matcher.predict(test[feature_cols])
    self.assertEqual(len(predictions), len(test))
def test_ml_matcher_append_false_predict(self):
    """Predict with ``append=False`` and check the number of predictions.

    The foreign-key columns are dropped from both splits and 'gold' from the
    test split before fitting. Checks that one prediction is produced per
    test row.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
    split = mu.train_test_split(feature_vectors)
    train = split['train']
    test = split['test']
    matcher = DTMatcher(name='DecisionTree')

    for frame in (train, test):
        for fk_col in ('ltable.id', 'rtable.id'):
            frame.drop(fk_col, axis=1, inplace=True)
    test.drop('gold', axis=1, inplace=True)

    matcher.fit(table=train, exclude_attrs='_id', target_attr='gold')
    predictions = matcher.predict(
        table=test,
        exclude_attrs='_id',
        target_attr='predicted',
        append=False,
    )
    self.assertEqual(len(predictions), len(test))
def test_vis_tuple_debug_dt_matcher_valid_2(self):
    """Run ``vis_tuple_debug_dt_matcher`` with the fitted ``clf`` object.

    A decision tree matcher is fitted on labelled feature vectors (7 zeros
    followed by 8 ones); the first feature vector is then passed as a
    one-row data frame together with ``dt.clf``. The test only exercises
    the call; it makes no assertion on the outcome.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C['labels'] = [0] * 7 + [1] * 8

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(
        C, feature_table=feature_table, attrs_after='labels')

    dt = DTMatcher()
    dt.fit(
        table=feature_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
        target_attr='labels',
    )

    # `.loc` replaces the `.ix` indexer, which has been removed from pandas.
    # Assumes feature_vectors has an integer index containing label 0 -
    # TODO confirm.
    s = pd.DataFrame(feature_vectors.loc[0])
    # The Series becomes a single column; transpose it into a single row.
    s1 = s.T
    vis_tuple_debug_dt_matcher(
        dt.clf, s1,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])
def test_ml_matcher_valid_with_id_in_y(self):
    """Fit with a ``y`` frame that carries '_id' alongside 'gold'.

    The feature columns are all columns except the two foreign keys and
    'gold'. Checks that one prediction is produced per test row.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
    split = mu.train_test_split(feature_vectors)
    train = split['train']
    test = split['test']
    matcher = DTMatcher(name='DecisionTree')

    all_columns = list(feature_vectors.columns)
    dropped_cols = [
        cm.get_fk_ltable(feature_vectors),
        cm.get_fk_rtable(feature_vectors),
        'gold',
    ]
    feature_cols = list_diff(all_columns, dropped_cols)

    matcher.fit(x=train[feature_cols], y=train[['_id', 'gold']])
    predictions = matcher.predict(test[feature_cols])
    self.assertEqual(len(predictions), len(test))
def test_visualize_tree_invalid_df(self):
    """Pass ``feature_vectors.columns`` to ``visualize_tree`` after fitting.

    NOTE(review): the block asserts nothing itself; presumably the expected
    failure is declared outside this block - confirm.
    """
    table_a = read_csv_metadata(path_a)
    table_b = read_csv_metadata(path_b, key='ID')
    candidates = read_csv_metadata(path_c, ltable=table_a, rtable=table_b)
    # 7 negative labels followed by 8 positive labels.
    candidates['labels'] = [0] * 7 + [1] * 8

    feature_table = get_features_for_matching(table_a, table_b)
    feature_vectors = extract_feature_vecs(
        candidates, feature_table=feature_table, attrs_after='labels')

    tree_matcher = DTMatcher()
    tree_matcher.fit(
        table=feature_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
        target_attr='labels',
    )
    visualize_tree(
        tree_matcher.clf,
        feature_vectors.columns,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
    )
def test_vis_tuple_debug_dt_matcher_valid_1(self):
    """Run ``vis_tuple_debug_dt_matcher`` with the matcher object itself.

    A decision tree matcher is fitted on labelled feature vectors (7 zeros
    followed by 8 ones); the first feature vector is then passed as a
    one-row data frame together with ``dt``. The test only exercises the
    call; it makes no assertion on the outcome.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C['labels'] = [0] * 7 + [1] * 8

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(
        C, feature_table=feature_table, attrs_after='labels')

    dt = DTMatcher()
    dt.fit(
        table=feature_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
        target_attr='labels',
    )

    # `.loc` replaces the `.ix` indexer, which has been removed from pandas.
    # Assumes feature_vectors has an integer index containing label 0 -
    # TODO confirm.
    s = pd.DataFrame(feature_vectors.loc[0])
    # The Series becomes a single column; transpose it into a single row.
    s1 = s.T
    vis_tuple_debug_dt_matcher(
        dt, s1,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])
def test_ml_matcher_ex_attrs_not_list(self):
    """Fit with ``exclude_attrs`` given as a plain string instead of a list.

    Checks that the predictions have one row per test row, that they contain
    no column missing from ``test``, and that the last column is
    ``predicted``.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
    split = mu.train_test_split(feature_vectors)
    train = split['train']
    test = split['test']
    matcher = DTMatcher(name='DecisionTree')

    for frame in (train, test):
        for fk_col in ('ltable.id', 'rtable.id'):
            frame.drop(fk_col, axis=1, inplace=True)

    matcher.fit(table=train, exclude_attrs='_id', target_attr='gold')
    predictions = matcher.predict(
        table=test,
        exclude_attrs=['_id', 'gold'],
        target_attr='predicted',
        append=True,
    )

    self.assertEqual(len(predictions), len(test))
    extra_columns = set(predictions.columns).difference(test.columns)
    self.assertEqual(len(extra_columns), 0)
    self.assertEqual(predictions.columns[-1], 'predicted')
def test_ml_matcher_target_attr_present_in_ex_attrs(self):
    """Fit with the target attribute 'gold' also listed in ``exclude_attrs``.

    Checks that the predictions have one row per test row, that they contain
    no column missing from ``test``, and that the last column is
    ``predicted``.
    """
    ltable = read_csv_metadata(fpath_a, key='id')
    rtable = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
    split = mu.train_test_split(feature_vectors)
    train = split['train']
    test = split['test']

    matcher = DTMatcher(name='DecisionTree')
    matcher.fit(
        table=train,
        exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
        target_attr='gold',
    )
    predictions = matcher.predict(
        table=test,
        exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
        target_attr='predicted',
        append=True,
    )

    self.assertEqual(len(predictions), len(test))
    extra_columns = set(predictions.columns).difference(test.columns)
    self.assertEqual(len(extra_columns), 0)
    self.assertEqual(predictions.columns[-1], 'predicted')
def test_debug_dt_matcher_valid(self):
    """Debug a fitted decision tree matcher on a pair of input tuples.

    The matcher is fitted on feature vectors labelled with 7 zeros followed
    by 8 ones; ``debug_decisiontree_matcher`` is then called with row 1 of
    ``A`` and row 2 of ``B``. The test only exercises the call; it makes no
    assertion on the outcome.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C['labels'] = [0] * 7 + [1] * 8

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(
        C, feature_table=feature_table, attrs_after='labels')

    dt = DTMatcher()
    dt.fit(
        table=feature_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
        target_attr='labels',
    )

    # `.loc` replaces the `.ix` indexer, which has been removed from pandas.
    # Assumes A and B carry the default integer index from CSV loading, where
    # `.ix` was label-based as well - TODO confirm.
    debug_decisiontree_matcher(
        dt,
        A.loc[1],
        B.loc[2],
        feat_table=feature_table,
        fv_columns=feature_vectors.columns,
        exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'],
    )
def test_valid_names_for_matchers(self):
    """Check matcher names with and without an explicit ``name`` argument.

    Matchers built without a name must expose a string ``name``; matchers
    built with ``name='temp'`` must report exactly 'temp'.
    """
    default_named = {
        "DT": DTMatcher(),
        "LinReg": LinRegMatcher(),
        "LogReg": LogRegMatcher(),
        "NB": NBMatcher(),
        "RF": RFMatcher(),
        "SVM": SVMMatcher(),
    }
    explicitly_named = {
        "DT": DTMatcher(name='temp'),
        "LinReg": LinRegMatcher(name='temp'),
        "LogReg": LogRegMatcher(name='temp'),
        "NB": NBMatcher(name='temp'),
        "RF": RFMatcher(name='temp'),
        "SVM": SVMMatcher(name='temp'),
    }

    # Only the matcher objects matter; the dictionary keys are just labels.
    for matcher in default_named.values():
        self.assertTrue(isinstance(matcher.name, six.string_types))

    for matcher in explicitly_named.values():
        self.assertEqual(matcher.name, 'temp')
def test_ml_matcher_invalid_df(self):
    """Call ``fit`` with an empty string as the ``table`` argument.

    NOTE(review): no assertion is made here; presumably the expected
    failure is declared outside this block - confirm.
    """
    matcher = DTMatcher(name='DecisionTree')
    excluded = ['ltable.id', 'rtable.id', '_id']
    matcher.fit(table="", exclude_attrs=excluded, target_attr='gold')
def test_ml_invalid_predict_sign(self):
    """Call ``predict`` with no arguments on an unfitted matcher.

    NOTE(review): no assertion is made here; presumably the expected
    failure is declared outside this block - confirm.
    """
    matcher = DTMatcher(name='DecisionTree')
    matcher.predict()
def test_ml_matcher_set_name(self):
    """Check that ``get_name`` returns the value given to ``set_name``."""
    new_name = 'Decision Tree'
    matcher = DTMatcher()
    matcher.set_name(new_name)
    self.assertEqual(matcher.get_name(), new_name)
def test_ml_matcher_invalid_df_1(self):
    """Call ``fit`` with empty strings for both ``x`` and ``y``.

    NOTE(review): no assertion is made here; presumably the expected
    failure is declared outside this block - confirm.
    """
    matcher = DTMatcher(name='DecisionTree')
    matcher.fit(x="", y="")
# Script: fit a decision tree matcher on precomputed feature vectors and
# append its predictions to the held-out split.
import os

import magellan.matcher.matcherutils as mu
from magellan.io.parsers import read_csv_metadata
from magellan.matcher.dtmatcher import DTMatcher
from magellan.utils.generic_helper import get_install_path

# Directory holding the matcher-selector test datasets inside the install tree.
feat_datasets_path = os.sep.join(
    [get_install_path(), 'datasets', 'test_datasets', 'matcherselector'])

# Paths of the two input tables, the labelled pairs and the feature vectors.
fpath_a, fpath_b, fpath_c, fpath_f = (
    os.sep.join([feat_datasets_path, file_name])
    for file_name in (
        'DBLP_demo.csv',
        'ACM_demo.csv',
        'dblp_acm_demo_labels.csv',
        'feat_vecs.csv',
    )
)

A = read_csv_metadata(fpath_a, key='id')
B = read_csv_metadata(fpath_b, key='id')
feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)

train_test = mu.train_test_split(feature_vectors)
train = train_test['train']
test = train_test['test']

dt = DTMatcher(name='DecisionTree')
dt.fit(
    table=train,
    exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
    target_attr='gold',
)
predictions = dt.predict(
    table=test,
    exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
    target_attr='predicted',
    append=True,
)
print('Done')
# Script: train a decision tree matcher on the demo feature vectors, predict
# on the test split with the predictions appended, and report completion.
import os

import magellan.matcher.matcherutils as mu
from magellan.io.parsers import read_csv_metadata
from magellan.matcher.dtmatcher import DTMatcher
from magellan.utils.generic_helper import get_install_path

# Root of the matcher-selector test datasets.
feat_datasets_path = os.sep.join(
    [get_install_path(), 'datasets', 'test_datasets', 'matcherselector'])

# Dataset file paths: left table, right table, labels, feature vectors.
fpath_a, fpath_b, fpath_c, fpath_f = [
    os.sep.join([feat_datasets_path, csv_name])
    for csv_name in ('DBLP_demo.csv', 'ACM_demo.csv',
                     'dblp_acm_demo_labels.csv', 'feat_vecs.csv')
]

A = read_csv_metadata(fpath_a, key='id')
B = read_csv_metadata(fpath_b, key='id')
feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)

train_test = mu.train_test_split(feature_vectors)
train = train_test['train']
test = train_test['test']

dt = DTMatcher(name='DecisionTree')
dt.fit(table=train,
       exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
       target_attr='gold')
predictions = dt.predict(table=test,
                         exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                         target_attr='predicted',
                         append=True)
print('Done')