def test_vis_debug_matcher_dt_valid_1(self):
    """Drive ``_vis_debug_dt`` with a decision-tree matcher on a train/test split.

    The candidate set is labelled with seven 0s followed by eight 1s,
    feature vectors are extracted with the label column appended, and the
    result of ``mu.split_train_test`` is handed to ``_vis_debug_dt`` with
    ``show_window=False``.  Nothing is asserted beyond the call completing.
    """
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    pairs = read_csv_metadata(path_c, ltable=left, rtable=right)
    pairs['labels'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(left, right)
    vectors = extract_feature_vecs(pairs, feature_table=features,
                                   attrs_after='labels')

    matcher = DTMatcher()
    split = mu.split_train_test(vectors)
    _vis_debug_dt(matcher, split['train'], split['test'],
                  exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                  target_attr='labels',
                  show_window=False)
def test_vis_debug_matcher_rf_ex_attrs_notin_test(self):
    """Call ``_vis_debug_rf`` when an excluded attribute is missing from test.

    The ``'_id'`` column is dropped from the test split in place, yet
    ``'_id'`` is still listed in ``exclude_attrs``.

    NOTE(review): presumably this call is expected to be rejected; the
    expected exception is not visible in this block -- confirm.
    """
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    pairs = read_csv_metadata(path_c, ltable=left, rtable=right)
    pairs['labels'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(left, right)
    vectors = extract_feature_vecs(pairs, feature_table=features,
                                   attrs_after='labels')

    matcher = RFMatcher()
    split = mu.split_train_test(vectors)
    held_out = split['test']
    # Remove a column that exclude_attrs below still refers to.
    held_out.drop('_id', axis=1, inplace=True)

    _vis_debug_rf(matcher, split['train'], held_out,
                  exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                  target_attr='labels',
                  show_window=False)
def test_extract_feature_vecs_invalid_feature_table(self):
    """Call ``extract_feature_vecs`` with ``feature_table=None``.

    A zero-filled ``'label'`` column is appended to the candidate set first.

    NOTE(review): presumably this call is expected to be rejected; the
    expected exception is not visible in this block -- confirm.
    """
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    pairs = read_csv_metadata(path_c, ltable=left, rtable=right)
    pairs.insert(len(pairs.columns), 'label', [0] * len(pairs))

    # The feature table is generated but not handed to the call below,
    # which receives None instead.
    get_features_for_matching(left, right,
                              validate_inferred_attr_types=False)

    extract_feature_vecs(pairs,
                         attrs_before='ltable_name',
                         feature_table=None,
                         attrs_after=['label', '_id'])
def test_extract_feature_vecs_invalid_feature_table(self):
    """Call ``extract_feature_vecs`` with ``feature_table=None``.

    A zero-filled ``'label'`` column is appended to the candidate set first.

    NOTE(review): presumably this call is expected to be rejected; the
    expected exception is not visible in this block -- confirm.
    """
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    pairs = read_csv_metadata(path_c, ltable=left, rtable=right)
    pairs.insert(len(pairs.columns), 'label', [0] * len(pairs))

    # The feature table is generated but not handed to the call below,
    # which receives None instead.
    get_features_for_matching(left, right)

    extract_feature_vecs(pairs,
                         attrs_before='ltable_name',
                         feature_table=None,
                         attrs_after=['label', '_id'])
def test_extract_feature_vecs_invalid_attrs_after(self):
    """Call ``extract_feature_vecs`` with an ``attrs_after`` name not in the table.

    Only a ``'label'`` column is added to the candidate set, while
    ``attrs_after`` asks for ``'label1'``.  The feature table passed in is
    an empty frame that reuses the generated feature table's columns.

    NOTE(review): presumably this call is expected to be rejected; the
    expected exception is not visible in this block -- confirm.
    """
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    pairs = read_csv_metadata(path_c, ltable=left, rtable=right)
    pairs.insert(len(pairs.columns), 'label', [0] * len(pairs))

    features = get_features_for_matching(
        left, right, validate_inferred_attr_types=False)
    empty_features = pd.DataFrame(columns=features.columns)

    extract_feature_vecs(pairs,
                         attrs_before='ltable_name',
                         feature_table=empty_features,
                         attrs_after=['label1', '_id'])
def test_extract_feature_vecs_with_parralel_job_count_less_than_zero(self):
    """Check the output layout of ``extract_feature_vecs`` with ``n_jobs=-1``.

    Verifies that the result is a DataFrame whose first three columns are
    ``'_id'`` and the two foreign-key columns of the candidate set, that
    column 4 is ``'rtable_name'``, that the last column is not ``'label'``
    (no ``attrs_after`` was requested), and that the catalog properties of
    the result equal those of the candidate set.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C.insert(len(C.columns), 'label', [0] * len(C))
    feature_table = get_features_for_matching(
        A, B, validate_inferred_attr_types=False)

    F = extract_feature_vecs(C,
                             attrs_before=['ltable_name', 'rtable_name'],
                             feature_table=feature_table,
                             n_jobs=-1)

    # Dedicated assert helpers report the offending values on failure,
    # unlike comparing a pre-computed boolean against True/False.
    self.assertIsInstance(F, pd.DataFrame)
    self.assertEqual(F.columns[0], '_id')
    self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
    self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
    self.assertEqual(F.columns[4], 'rtable_name')
    self.assertNotEqual(F.columns[-1], 'label')
    self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(F))
def test_visualize_tree_invalid_df(self):
    """Call ``visualize_tree`` with a column index where a table might go.

    A decision-tree matcher is fitted on labelled feature vectors, then
    ``visualize_tree`` receives the fitted ``clf`` together with
    ``feature_vectors.columns`` as its second argument.

    NOTE(review): presumably this call is expected to be rejected; the
    expected exception is not visible in this block -- confirm.
    """
    excluded = ('_id', 'ltable_ID', 'rtable_ID', 'labels')

    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    pairs = read_csv_metadata(path_c, ltable=left, rtable=right)
    pairs['labels'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(left, right)
    vectors = extract_feature_vecs(pairs, feature_table=features,
                                   attrs_after='labels')

    matcher = DTMatcher()
    # A fresh list per call, exactly as two separate literals would give.
    matcher.fit(table=vectors, exclude_attrs=list(excluded),
                target_attr='labels')
    visualize_tree(matcher.clf, vectors.columns,
                   exclude_attrs=list(excluded))
def test_extract_feature_vecs_valid_8(self):
    """Check ``extract_feature_vecs`` output when given an empty feature table.

    The feature table is an empty frame carrying only the generated feature
    table's columns, and ``attrs_after`` is ``['label', '_id']``.  Verifies
    that the result is a DataFrame whose first three columns are ``'_id'``
    and the two foreign-key columns, that the last column is ``'label'``,
    and that the catalog properties match those of the candidate set.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C.insert(len(C.columns), 'label', [0] * len(C))
    feature_table = get_features_for_matching(
        A, B, validate_inferred_attr_types=False)

    F = extract_feature_vecs(
        C,
        feature_table=pd.DataFrame(columns=feature_table.columns),
        attrs_after=['label', '_id'])

    # Dedicated assert helpers report the offending values on failure,
    # unlike comparing a pre-computed boolean against True.
    self.assertIsInstance(F, pd.DataFrame)
    self.assertEqual(F.columns[0], '_id')
    self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
    self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
    # No attrs_before is passed here, so column 3 is deliberately unchecked.
    self.assertEqual(F.columns[-1], 'label')
    self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(F))
def test_debug_dt_matcher_valid(self):
    """Run ``debug_decisiontree_matcher`` on one tuple from each input table.

    A decision-tree matcher is fitted on labelled feature vectors, then the
    debugger is invoked with row label 1 of the left table and row label 2
    of the right table.  Nothing is asserted beyond the call completing.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    labels = [0] * 7
    labels.extend([1] * 8)
    C['labels'] = labels

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                           attrs_after='labels')

    dt = DTMatcher()
    dt.fit(table=feature_vectors,
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
           target_attr='labels')

    # .ix was removed from pandas; .loc is the label-based replacement.
    debug_decisiontree_matcher(
        dt, A.loc[1], B.loc[2],
        feature_table=feature_table,
        table_columns=feature_vectors.columns,
        exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'])
def test_debug_rf_matcher_invalid_feat_table(self):
    """Call ``debug_randomforest_matcher`` with ``feature_table=None``.

    A random-forest matcher is fitted on labelled feature vectors, then the
    debugger is invoked with row label 1 of the left table and row label 2
    of the right table, but without a feature table.

    NOTE(review): presumably this call is expected to be rejected; the
    expected exception is not visible in this block -- confirm.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    labels = [0] * 7
    labels.extend([1] * 8)
    C['labels'] = labels

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                           attrs_after='labels')

    rf = RFMatcher()
    rf.fit(table=feature_vectors,
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
           target_attr='labels')

    # .ix was removed from pandas; .loc is the label-based replacement.
    debug_randomforest_matcher(
        rf, A.loc[1], B.loc[2],
        feature_table=None,
        table_columns=feature_vectors.columns,
        exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'])
def test_extract_feature_vecs_valid_2(self):
    """Check the output layout of ``extract_feature_vecs`` with ``attrs_before``.

    Verifies that the result is a DataFrame whose first three columns are
    ``'_id'`` and the two foreign-key columns of the candidate set, that
    columns 3 and 4 are the requested ``attrs_before`` names, that the last
    column is not ``'label'`` (no ``attrs_after`` was requested), and that
    the catalog properties match those of the candidate set.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C.insert(len(C.columns), 'label', [0] * len(C))
    feature_table = get_features_for_matching(A, B)

    F = extract_feature_vecs(C,
                             attrs_before=['ltable_name', 'rtable_name'],
                             feature_table=feature_table)

    # Dedicated assert helpers report the offending values on failure,
    # unlike comparing a pre-computed boolean against True/False.
    self.assertIsInstance(F, pd.DataFrame)
    self.assertEqual(F.columns[0], '_id')
    self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
    self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
    self.assertEqual(F.columns[3], 'ltable_name')
    self.assertEqual(F.columns[4], 'rtable_name')
    self.assertNotEqual(F.columns[-1], 'label')
    self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(F))
def test_vis_tuple_debug_rf_matcher_valid_1(self):
    """Run ``vis_tuple_debug_rf_matcher`` on a single feature-vector row.

    A random-forest matcher is fitted on labelled feature vectors; the row
    with label 0 is then turned into a one-row DataFrame and passed to the
    debugger together with the matcher.  Nothing is asserted beyond the
    call completing.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    labels = [0] * 7
    labels.extend([1] * 8)
    C['labels'] = labels

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                           attrs_after='labels')

    rf = RFMatcher()
    rf.fit(table=feature_vectors,
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
           target_attr='labels')

    # .ix was removed from pandas; .loc is the label-based replacement.
    # The Series becomes a one-column frame, transposed into a single row.
    s = pd.DataFrame(feature_vectors.loc[0])
    s1 = s.T
    vis_tuple_debug_rf_matcher(
        rf, s1,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])
def test_extract_feature_vecs_valid_8(self):
    """Check ``extract_feature_vecs`` output when given an empty feature table.

    The feature table is an empty frame carrying only the generated feature
    table's columns, and ``attrs_after`` is ``['label', '_id']``.  Verifies
    that the result is a DataFrame whose first three columns are ``'_id'``
    and the two foreign-key columns, that the last column is ``'label'``,
    and that the catalog properties match those of the candidate set.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C.insert(len(C.columns), 'label', [0] * len(C))
    feature_table = get_features_for_matching(
        A, B, validate_inferred_attr_types=False)

    F = extract_feature_vecs(
        C,
        feature_table=pd.DataFrame(columns=feature_table.columns),
        attrs_after=['label', '_id'])

    # Dedicated assert helpers report the offending values on failure,
    # unlike comparing a pre-computed boolean against True.
    self.assertIsInstance(F, pd.DataFrame)
    self.assertEqual(F.columns[0], '_id')
    self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
    self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
    # No attrs_before is passed here, so column 3 is deliberately unchecked.
    self.assertEqual(F.columns[-1], 'label')
    self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(F))
def test_vis_debug_matcher_rf_label_col_wi_sp_name(self):
    """Call ``_vis_debug_rf`` when the label column is named ``'_predicted'``.

    The labels (seven 0s then eight 1s) are stored under ``'_predicted'``,
    which is used as ``target_attr`` and is not listed in ``exclude_attrs``.
    Nothing is asserted beyond the call completing.
    """
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    pairs = read_csv_metadata(path_c, ltable=left, rtable=right)
    pairs['_predicted'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(left, right)
    vectors = extract_feature_vecs(pairs, feature_table=features,
                                   attrs_after='_predicted')

    matcher = RFMatcher()
    split = mu.split_train_test(vectors)
    _vis_debug_rf(matcher, split['train'], split['test'],
                  exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
                  target_attr='_predicted',
                  show_window=False)
def test_extract_feature_vecs_with_parralel_job_count_more_than_one(self):
    """Check the output layout of ``extract_feature_vecs`` with ``n_jobs=2``.

    Verifies that the result is a DataFrame whose first three columns are
    ``'_id'`` and the two foreign-key columns of the candidate set, that
    column 4 is ``'rtable_name'``, that the last column is not ``'label'``
    (no ``attrs_after`` was requested), and that the catalog properties of
    the result equal those of the candidate set.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C.insert(len(C.columns), 'label', [0] * len(C))
    feature_table = get_features_for_matching(
        A, B, validate_inferred_attr_types=False)

    F = extract_feature_vecs(C,
                             attrs_before=['ltable_name', 'rtable_name'],
                             feature_table=feature_table,
                             n_jobs=2)

    # Dedicated assert helpers report the offending values on failure,
    # unlike comparing a pre-computed boolean against True/False.
    self.assertIsInstance(F, pd.DataFrame)
    self.assertEqual(F.columns[0], '_id')
    self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
    self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
    self.assertEqual(F.columns[4], 'rtable_name')
    self.assertNotEqual(F.columns[-1], 'label')
    self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(F))
def test_vis_debug_matcher_dt_tar_attr_notin_train(self):
    """Call ``_vis_debug_dt`` with a ``target_attr`` that was never created.

    The label column is stored as ``'labels'`` while ``target_attr`` is
    ``'labels1'``.

    NOTE(review): presumably this call is expected to be rejected; the
    expected exception is not visible in this block -- confirm.
    """
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    pairs = read_csv_metadata(path_c, ltable=left, rtable=right)
    pairs['labels'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(left, right)
    vectors = extract_feature_vecs(pairs, feature_table=features,
                                   attrs_after='labels')

    matcher = DTMatcher()
    split = mu.split_train_test(vectors)
    _vis_debug_dt(matcher, split['train'], split['test'],
                  exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                  target_attr='labels1',
                  show_window=False)
def test_visualize_tree_invalid_df(self):
    """Call ``visualize_tree`` with a column index where a table might go.

    A decision-tree matcher is fitted on labelled feature vectors, then
    ``visualize_tree`` receives the fitted ``clf`` together with
    ``feature_vectors.columns`` as its second argument.

    NOTE(review): presumably this call is expected to be rejected; the
    expected exception is not visible in this block -- confirm.
    """
    excluded = ('_id', 'ltable_ID', 'rtable_ID', 'labels')

    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    pairs = read_csv_metadata(path_c, ltable=left, rtable=right)
    pairs['labels'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(left, right)
    vectors = extract_feature_vecs(pairs, feature_table=features,
                                   attrs_after='labels')

    matcher = DTMatcher()
    # A fresh list per call, exactly as two separate literals would give.
    matcher.fit(table=vectors, exclude_attrs=list(excluded),
                target_attr='labels')
    visualize_tree(matcher.clf, vectors.columns,
                   exclude_attrs=list(excluded))
def test_vis_tuple_debug_dt_matcher_valid_3(self):
    """Run ``vis_tuple_debug_dt_matcher`` with a raw ``clf`` and no exclusions.

    After fitting, the id, foreign-key and label columns are dropped from
    the feature vectors in place, so the single row handed to the debugger
    holds only the remaining columns.  The debugger receives ``dt.clf``
    rather than the matcher itself, with ``exclude_attrs=None``.  Nothing
    is asserted beyond the call completing.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    labels = [0] * 7
    labels.extend([1] * 8)
    C['labels'] = labels

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                           attrs_after='labels')

    dt = DTMatcher()
    dt.fit(table=feature_vectors,
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
           target_attr='labels')

    feature_vectors.drop(['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                         axis=1, inplace=True)
    # .ix was removed from pandas; .loc is the label-based replacement.
    # The Series becomes a one-column frame, transposed into a single row.
    s = pd.DataFrame(feature_vectors.loc[0])
    s1 = s.T
    vis_tuple_debug_dt_matcher(dt.clf, s1, exclude_attrs=None)
def test_vis_debug_matcher_rf_ex_attrs_notin_test(self):
    """Call ``_vis_debug_rf`` when an excluded attribute is missing from test.

    The ``'_id'`` column is dropped from the test split in place, yet
    ``'_id'`` is still listed in ``exclude_attrs``.

    NOTE(review): presumably this call is expected to be rejected; the
    expected exception is not visible in this block -- confirm.
    """
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    pairs = read_csv_metadata(path_c, ltable=left, rtable=right)
    pairs['labels'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(left, right)
    vectors = extract_feature_vecs(pairs, feature_table=features,
                                   attrs_after='labels')

    matcher = RFMatcher()
    split = mu.split_train_test(vectors)
    held_out = split['test']
    # Remove a column that exclude_attrs below still refers to.
    held_out.drop('_id', axis=1, inplace=True)

    _vis_debug_rf(matcher, split['train'], held_out,
                  exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                  target_attr='labels',
                  show_window=False)
def test_vis_tuple_debug_dt_matcher_valid_1(self):
    """Run ``vis_tuple_debug_dt_matcher`` on a single feature-vector row.

    A decision-tree matcher is fitted on labelled feature vectors; the row
    with label 0 is then turned into a one-row DataFrame and passed to the
    debugger together with the matcher.  Nothing is asserted beyond the
    call completing.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    labels = [0] * 7
    labels.extend([1] * 8)
    C['labels'] = labels

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                           attrs_after='labels')

    dt = DTMatcher()
    dt.fit(table=feature_vectors,
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
           target_attr='labels')

    # .ix was removed from pandas; .loc is the label-based replacement.
    # The Series becomes a one-column frame, transposed into a single row.
    s = pd.DataFrame(feature_vectors.loc[0])
    s1 = s.T
    vis_tuple_debug_dt_matcher(
        dt, s1,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])
def test_debug_dt_matcher_valid(self):
    """Run ``debug_decisiontree_matcher`` on one tuple from each input table.

    A decision-tree matcher is fitted on labelled feature vectors, then the
    debugger is invoked with row label 1 of the left table and row label 2
    of the right table.  Nothing is asserted beyond the call completing.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    labels = [0] * 7
    labels.extend([1] * 8)
    C['labels'] = labels

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                           attrs_after='labels')

    dt = DTMatcher()
    dt.fit(table=feature_vectors,
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
           target_attr='labels')

    # .ix was removed from pandas; .loc is the label-based replacement.
    debug_decisiontree_matcher(
        dt, A.loc[1], B.loc[2],
        feature_table=feature_table,
        table_columns=feature_vectors.columns,
        exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'])
def test_vis_tuple_debug_rf_matcher_valid_3(self):
    """Run ``vis_tuple_debug_rf_matcher`` with a raw ``clf`` and no exclusions.

    After fitting, the id, foreign-key and label columns are dropped from
    the feature vectors in place, so the single row handed to the debugger
    holds only the remaining columns.  The debugger receives ``rf.clf``
    rather than the matcher itself, with ``exclude_attrs=None``.  Nothing
    is asserted beyond the call completing.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    labels = [0] * 7
    labels.extend([1] * 8)
    C['labels'] = labels

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                           attrs_after='labels')

    rf = RFMatcher()
    rf.fit(table=feature_vectors,
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
           target_attr='labels')

    feature_vectors.drop(['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                         axis=1, inplace=True)
    # .ix was removed from pandas; .loc is the label-based replacement.
    # The Series becomes a one-column frame, transposed into a single row.
    s = pd.DataFrame(feature_vectors.loc[0])
    s1 = s.T
    vis_tuple_debug_rf_matcher(rf.clf, s1, exclude_attrs=None)
def test_vis_debug_matcher_rf_label_col_wi_sp_name(self):
    """Call ``_vis_debug_rf`` when the label column is named ``'_predicted'``.

    The labels (seven 0s then eight 1s) are stored under ``'_predicted'``,
    which is used as ``target_attr`` and is not listed in ``exclude_attrs``.
    Nothing is asserted beyond the call completing.
    """
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    pairs = read_csv_metadata(path_c, ltable=left, rtable=right)
    pairs['_predicted'] = [0] * 7 + [1] * 8

    features = get_features_for_matching(left, right)
    vectors = extract_feature_vecs(pairs, feature_table=features,
                                   attrs_after='_predicted')

    matcher = RFMatcher()
    split = mu.split_train_test(vectors)
    _vis_debug_rf(matcher, split['train'], split['test'],
                  exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
                  target_attr='_predicted',
                  show_window=False)
def test_debug_rf_matcher_invalid_feat_table(self):
    """Call ``debug_randomforest_matcher`` with ``feature_table=None``.

    A random-forest matcher is fitted on labelled feature vectors, then the
    debugger is invoked with row label 1 of the left table and row label 2
    of the right table, but without a feature table.

    NOTE(review): presumably this call is expected to be rejected; the
    expected exception is not visible in this block -- confirm.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    labels = [0] * 7
    labels.extend([1] * 8)
    C['labels'] = labels

    feature_table = get_features_for_matching(A, B)
    feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                           attrs_after='labels')

    rf = RFMatcher()
    rf.fit(table=feature_vectors,
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
           target_attr='labels')

    # .ix was removed from pandas; .loc is the label-based replacement.
    debug_randomforest_matcher(
        rf, A.loc[1], B.loc[2],
        feature_table=None,
        table_columns=feature_vectors.columns,
        exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'])
def test_extract_feature_vecs_invalid_df(self):
    """Call ``extract_feature_vecs`` with ``None`` as the candidate set.

    The feature table is an empty DataFrame.

    NOTE(review): presumably this call is expected to be rejected; the
    expected exception is not visible in this block -- confirm.
    """
    empty_features = pd.DataFrame()
    extract_feature_vecs(None,
                         attrs_before='ltable_name',
                         feature_table=empty_features,
                         attrs_after=['label', '_id'])
def test_extract_feature_vecs_invalid_df(self):
    """Call ``extract_feature_vecs`` with ``None`` as the candidate set.

    The feature table is an empty DataFrame.

    NOTE(review): presumably this call is expected to be rejected; the
    expected exception is not visible in this block -- confirm.
    """
    empty_features = pd.DataFrame()
    extract_feature_vecs(None,
                         attrs_before='ltable_name',
                         feature_table=empty_features,
                         attrs_after=['label', '_id'])