def test_select_matcher_valid_multiple_metrics(self):
    """A k=7 CV run should produce precision/recall/f1 drill-down tables,
    each headed by Name/Matcher/Num folds and ending in a 'Mean score'
    column, and the selected matcher must have the best mean precision.
    """
    A = read_csv_metadata(path_a, key='id')
    B = read_csv_metadata(path_b, key='id')
    feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
    dtmatcher = DTMatcher()
    nbmatcher = NBMatcher()
    rfmatcher = RFMatcher()
    svmmatcher = SVMMatcher()
    linregmatcher = LinRegMatcher()
    logregmatcher = LogRegMatcher()
    matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher,
                linregmatcher, logregmatcher]
    result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                            target_attr='gold', k=7)
    header = ['Name', 'Matcher', 'Num folds']
    result_df_p = result['drill_down_cv_stats']['precision']
    result_df_f = result['drill_down_cv_stats']['f1']
    result_df_r = result['drill_down_cv_stats']['recall']
    # Check header of precision dataframe
    self.assertEqual(set(header) == set(list(result_df_p.columns[[0, 1, 2]])), True)
    self.assertEqual('Mean score', result_df_p.columns[len(result_df_p.columns) - 1])
    # Check header of f1 dataframe
    self.assertEqual(set(header) == set(list(result_df_f.columns[[0, 1, 2]])), True)
    self.assertEqual('Mean score', result_df_f.columns[len(result_df_f.columns) - 1])
    # Check header of recall dataframe
    self.assertEqual(set(header) == set(list(result_df_r.columns[[0, 1, 2]])), True)
    # BUG FIX: the original asserted on result_df_p.columns indexed with
    # len(result_df_r.columns) — a copy-paste slip; the recall dataframe
    # itself must end in 'Mean score'.
    self.assertEqual('Mean score', result_df_r.columns[len(result_df_r.columns) - 1])
    d = result_df_p.set_index('Name')
    # NOTE(review): .ix and pd.np are deprecated — presumably this suite
    # pins an older pandas; confirm before upgrading.
    p_max = d.ix[result['selected_matcher'].name, 'Mean score']
    a_max = pd.np.max(d['Mean score'])
    self.assertEqual(p_max, a_max)
def test_select_matcher_valid_2(self):
    """select_matcher driven by explicit x/y matrices (instead of a table
    plus exclude_attrs) should still produce a precision drill-down table
    headed by Name/Matcher/Num folds and ending in 'Mean score', with the
    selected matcher achieving the best mean score.
    """
    A = read_csv_metadata(path_a, key='id')
    B = read_csv_metadata(path_b, key='id')
    # C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable.id',
    # fk_rtable='rtable.id', key='_id')
    # labels = [0] * 7
    # labels.extend([1] * 8)
    # C['labels'] = labels
    # feature_table = get_features_for_matching(A, B)
    # feature_vectors = extract_feature_vecs(C, feature_table=feature_table, attrs_after='gold')
    # feature_vectors.fillna(0, inplace=True)
    feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
    dtmatcher = DTMatcher()
    nbmatcher = NBMatcher()
    rfmatcher = RFMatcher()
    svmmatcher = SVMMatcher()
    linregmatcher = LinRegMatcher()
    logregmatcher = LogRegMatcher()
    matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher,
                logregmatcher]
    col_list = list(feature_vectors.columns)
    # Drop the key, both foreign keys and the label column to form the
    # feature matrix X; the label column alone becomes Y.
    l = list_diff(col_list, [cm.get_key(feature_vectors),
                             cm.get_fk_ltable(feature_vectors),
                             cm.get_fk_rtable(feature_vectors),
                             'gold'])
    X = feature_vectors[l]
    Y = feature_vectors['gold']
    result = select_matcher(matchers, x=X, y=Y)
    header = ['Name', 'Matcher', 'Num folds']
    result_df = result['drill_down_cv_stats']['precision']
    self.assertEqual(set(header) == set(list(result_df.columns[[0, 1, 2]])), True)
    self.assertEqual('Mean score', result_df.columns[len(result_df.columns) - 1])
    d = result_df.set_index('Name')
    # NOTE(review): .ix and pd.np are deprecated — presumably an older
    # pandas is pinned; confirm before upgrading.
    p_max = d.ix[result['selected_matcher'].name, 'Mean score']
    a_max = pd.np.max(d['Mean score'])
    self.assertEqual(p_max, a_max)
def test_select_matcher_valid_cv_stats_3(self):
    """With metric_to_select_matcher and metrics_to_display both set to
    'recall', cv_stats should carry Matcher/Average recall columns and the
    winner's average recall must equal the best mean score in the recall
    drill-down table.
    """
    A = read_csv_metadata(path_a, key='id')
    B = read_csv_metadata(path_b, key='id')
    feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
    dtmatcher = DTMatcher()
    nbmatcher = NBMatcher()
    rfmatcher = RFMatcher()
    svmmatcher = SVMMatcher()
    linregmatcher = LinRegMatcher()
    logregmatcher = LogRegMatcher()
    matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher,
                logregmatcher]
    result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                            metric_to_select_matcher='recall',
                            metrics_to_display='recall',
                            target_attr='gold', k=7)
    header = ['Matcher', 'Average recall']
    result_df = result['cv_stats']
    result_df_r = result['drill_down_cv_stats']['recall']
    self.assertEqual(set(header) == set(list(result_df.columns[[0, 1]])), True)
    d = result_df.set_index('Matcher')
    # NOTE(review): .ix and pd.np are deprecated — assumes an older pandas.
    p_max = d.ix[result['selected_matcher'].name, 'Average recall']
    a_max = pd.np.max(result_df_r['Mean score'])
    self.assertEqual(p_max, a_max)
def test_select_matcher_valid_1(self):
    """Basic select_matcher run over a feature-vector table: the precision
    drill-down table is headed by Name/Matcher/Num folds, ends in
    'Mean score', and the selected matcher has the best mean score.
    """
    A = read_csv_metadata(path_a, key='id')
    B = read_csv_metadata(path_b, key='id')
    # C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable.id',
    # fk_rtable='rtable.id', key='_id')
    # C['labels'] = labels
    feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
    dtmatcher = DTMatcher()
    nbmatcher = NBMatcher()
    rfmatcher = RFMatcher()
    svmmatcher = SVMMatcher()
    linregmatcher = LinRegMatcher()
    logregmatcher = LogRegMatcher()
    # xgmatcher = XGBoostMatcher()
    matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher,
                logregmatcher]
    result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                            target_attr='gold', k=7)
    header = ['Name', 'Matcher', 'Num folds']
    result_df = result['drill_down_cv_stats']['precision']
    self.assertEqual(set(header) == set(list(result_df.columns[[0, 1, 2]])), True)
    self.assertEqual('Mean score', result_df.columns[len(result_df.columns) - 1])
    d = result_df.set_index('Name')
    # NOTE(review): .ix and pd.np are deprecated — assumes an older pandas.
    p_max = d.ix[result['selected_matcher'].name, 'Mean score']
    a_max = pd.np.max(d['Mean score'])
    self.assertEqual(p_max, a_max)
def test_check_table_order_invalid_df2(self):
    """_check_table_order is exercised with a missing (None) right table."""
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    left_types = au.get_attr_types(left)
    right_types = au.get_attr_types(right)
    corres = au.get_attr_corres(left, right)
    # Deliberately pass None in place of the right table.
    status = afg._check_table_order(left, None, left_types, right_types,
                                    corres)
def test_feature_fn_valid_nosim_tok(self):
    """get_feature_fn is exercised with a feature string that uses neither
    the supplied tokenizers nor similarity functions."""
    table_a = read_csv_metadata(path_a)
    table_b = read_csv_metadata(path_b, key='ID')
    feature_table = get_features_for_matching(
        table_a, table_b, validate_inferred_attr_types=False)
    len1 = len(feature_table)
    feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    # Empty tokenizer and sim-function maps.
    f_dict = get_feature_fn(feature_string, dict(), dict())
def test_validate_attr_types_invalid_corres(self):
    """validate_attr_types is exercised with a None correspondence list."""
    tbl_a = read_csv_metadata(path_a)
    tbl_b = read_csv_metadata(path_b, key='ID')
    types_l = au.get_attr_types(tbl_a)
    types_r = au.get_attr_types(tbl_b)
    # attr_corres = au.get_attr_corres(A, B)
    # Correspondences deliberately omitted (None) instead of computed.
    response = afg.validate_attr_types(types_l, types_r, None)
def test_debugblocker_14(self):
    """debug_blocker with an explicit attr_corres list should produce a
    result table whose columns combine both tables' lineage, and whose
    top-ranked pair matches the expected left/right keys.
    """
    path_ltable = os.sep.join([debugblocker_datasets_path,
                               'test_debugblocker_ltable.csv'])
    path_rtable = os.sep.join([debugblocker_datasets_path,
                               'test_debugblocker_rtable.csv'])
    path_cand = os.sep.join([debugblocker_datasets_path,
                             'test_debugblocker_cand.csv'])
    ltable = read_csv_metadata(path_ltable, key='ID')
    rtable = read_csv_metadata(path_rtable, key='book_id')
    cand_set = read_csv_metadata(path_cand, ltable=ltable, rtable=rtable,
                                 fk_ltable='ltable_ID',
                                 fk_rtable='rtable_book_id', key='_id')
    attr_corres = [('title', 'book_title'), ('price', 'price'),
                   ('desc', 'description'), ('genre', 'book_genre'),
                   ('year', 'pub_year'), ('lang', 'language'),
                   ('author', 'author'), ('publisher', 'publisher')]
    output_size = 1
    ret_dataframe = db.debug_blocker(cand_set, ltable, rtable, output_size,
                                     attr_corres)
    expected_columns = ['_id', 'ltable_ID', 'rtable_book_id', 'ltable_title',
                        'ltable_desc', 'ltable_year', 'ltable_lang',
                        'ltable_author', 'ltable_publisher',
                        'rtable_book_title', 'rtable_description',
                        'rtable_pub_year', 'rtable_language', 'rtable_author',
                        'rtable_publisher']
    self.assertEqual(list(ret_dataframe.columns), expected_columns)
    # NOTE(review): .ix is deprecated — presumably an older pandas is pinned.
    ret_record = list(ret_dataframe.ix[0])
    # Only indices 2 and 3 are asserted below, so the later fields
    # (including the NaN and the fused 'ABC publisher10.00') never matter.
    expected_record = [0, 1, 'B001', 'data analysis',
                       'introduction to data analysis', 2015, 'ENG',
                       'Jane Doe', 'BCD publisher',
                       'introduction to data analysis', float('nan'),
                       'English', 'introduction to data analysis',
                       'John Doe', 'ABC publisher10.00']
    print(ret_record)
    print(expected_record)
    self.assertEqual(expected_record[2], ret_record[2])
    self.assertEqual(expected_record[3], ret_record[3])
def test_eval_matches_valid_3(self):
    """eval_matches on a zero-row labeled table should return all-zero
    counts and scores.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
    # num_ones equals len(C1), so num_zeros is 0 here.
    num_ones = len(C1)
    num_zeros = len(C1) - num_ones
    gold = [0]*num_ones
    # gold.extend([1]*num_zeros)
    predicted = [1]* (num_zeros + num_ones)
    ln = len(C1.columns)
    C1.insert(ln, 'gold', gold)
    C1.insert(ln+1, 'predicted', predicted)
    # D keeps C1's columns and C's catalog metadata but has no rows —
    # this empty table is the actual input under test.
    D = pd.DataFrame(columns=C1.columns)
    cm.copy_properties(C, D)
    result = eval_matches(D, 'gold', 'predicted')
    self.assertEqual(isinstance(result, dict), True)
    self.assertEqual(result['prec_numerator'], 0)
    self.assertEqual(result['prec_denominator'], 0)
    self.assertAlmostEqual(result['precision'], 0)
    self.assertEqual(result['recall_numerator'], 0)
    self.assertEqual(result['recall_denominator'], 0)
    self.assertEqual(result['recall'], 0)
    self.assertEqual(result['f1'], 0)
    self.assertEqual(result['pred_pos_num'], 0)
    self.assertEqual(result['false_pos_num'], 0.0)
    self.assertEqual(len(result['false_pos_ls']), 0)
    self.assertEqual(result['pred_neg_num'], 0)
    self.assertEqual(result['false_neg_num'], 0.0)
    self.assertEqual(len(result['false_neg_ls']), 0)
def test_select_matcher_target_attr_not_present(self):
    """select_matcher should fail when target_attr ('labels1') is not a
    column of the feature-vector table; the expected error is presumably
    declared via a decorator outside this view — confirm.
    """
    A = read_csv_metadata(path_a, key='id')
    B = read_csv_metadata(path_b, key='id')
    # C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable.id',
    # fk_rtable='rtable.id', key='_id')
    # labels = [0] * 7
    # labels.extend([1] * 8)
    # C['labels'] = labels
    # feature_table = get_features_for_matching(A, B)
    # feature_vectors = extract_feature_vecs(C, feature_table=feature_table, attrs_after='gold')
    # feature_vectors.fillna(0, inplace=True)
    feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
    dtmatcher = DTMatcher()
    nbmatcher = NBMatcher()
    rfmatcher = RFMatcher()
    svmmatcher = SVMMatcher()
    linregmatcher = LinRegMatcher()
    logregmatcher = LogRegMatcher()
    matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher,
                logregmatcher]
    col_list = list(feature_vectors.columns)
    # Keep everything except the two foreign-key columns.
    l = list_diff(col_list, [cm.get_fk_ltable(feature_vectors),
                             cm.get_fk_rtable(feature_vectors)
                             ])
    feature_vectors = feature_vectors[l]
    # 'labels1' is not a column of feature_vectors — this call must error.
    result = select_matcher(matchers, x=None, y=None,
                            table=feature_vectors,
                            exclude_attrs='_id',
                            target_attr='labels1', k=2)
def test_ml_matcher_return_probs_true_predict_diff_colname(self):
    """predict(..., return_probs=True, probs_attr='probas') should append
    a 'predicted' column followed by a 'probas' column, leave the input
    table unmodified (inplace=False), and produce probabilities in [0, 1].
    """
    A = read_csv_metadata(fpath_a, key='id')
    B = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)
    train_test = mu.split_train_test(feature_vectors)
    train, test = train_test['train'], train_test['test']
    dt = DTMatcher(name='DecisionTree')
    # Strip metadata columns so only '_id' needs excluding below; the test
    # split additionally loses its label column.
    train.drop('ltable.id', axis=1, inplace=True)
    train.drop('rtable.id', axis=1, inplace=True)
    test.drop('ltable.id', axis=1, inplace=True)
    test.drop('rtable.id', axis=1, inplace=True)
    test.drop('gold', axis=1, inplace=True)
    dt.fit(table=train, exclude_attrs='_id', target_attr='gold')
    predictions = dt.predict(table=test, exclude_attrs='_id',
                             target_attr='predicted', probs_attr='probas',
                             inplace=False, append=True, return_probs=True)
    # A fresh object, same length, superset of the input's columns.
    self.assertNotEqual(id(predictions), id(test))
    self.assertEqual(len(predictions), len(test))
    self.assertEqual(set(list(test.columns)).issubset(list(predictions.columns)), True)
    # Second-to-last column is the label, last one the probabilities.
    p_col = predictions.columns[len(predictions.columns)-2]
    self.assertEqual(p_col, 'predicted')
    r_col = predictions.columns[len(predictions.columns) - 1]
    self.assertEqual(r_col, 'probas')
    # Every probability must lie within [0, 1].
    self.assertEqual(sum((predictions[r_col] >= 0.0) & (predictions[r_col] <= 1.0)), len(predictions))
def test_label_table_valid_3(self):
    """label_table must copy every catalog property onto the labeled copy."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    labeled = label_table(C, 'label')
    props_before = cm.get_all_properties(C)
    props_after = cm.get_all_properties(labeled)
    self.assertEqual(props_before, props_after)
def test_valid_path_df_metadata_split_betn_file_kw(self):
    """Metadata split between the CSV's metadata file and keyword
    arguments should still be stitched together by read_csv_metadata."""
    cm.del_catalog()
    del_files_in_dir(sndbx_path)
    ltbl = read_csv_metadata(path_a)
    rtbl = read_csv_metadata(path_b, key='ID')
    partial_path = os.sep.join([io_datasets_path, 'C_partialmeta.csv'])
    # fk_rtable comes from the metadata file; fk_ltable from the kwarg.
    C = read_csv_metadata(partial_path, ltable=ltbl, rtable=rtbl,
                          fk_ltable='ltable_ID')
def test_blocker_combiner_valid_8(self):
    """combine_blocker_outputs_via_union on two candidate sets whose fk
    columns were renamed should reproduce the expected union table and
    its catalog metadata.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C1 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C4_ex_1.csv']),
                           ltable=A, rtable=B)
    # Rename the fk columns and re-register them in the catalog.
    C1.rename(columns={'l_ID':'ltable_ID'}, inplace=True)
    C1.rename(columns={'r_ID':'rtable_ID'}, inplace=True)
    cm.set_fk_ltable(C1, 'ltable_ID')
    cm.set_fk_rtable(C1, 'rtable_ID')
    C2 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C4_ex_2.csv']),
                           ltable=A, rtable=B)
    C2.rename(columns={'l_ID':'ltable_ID'}, inplace=True)
    C2.rename(columns={'r_ID':'rtable_ID'}, inplace=True)
    cm.set_fk_ltable(C2, 'ltable_ID')
    cm.set_fk_rtable(C2, 'rtable_ID')
    C = combine_blocker_outputs_via_union([C1, C2], 'l_', 'r_')
    # Build the expected table the same way from the reference CSV.
    C_exp = read_csv_metadata(os.sep.join([bc_datasets_path, 'C_ex_4.csv']),
                              ltable=A, rtable=B)
    C_exp.rename(columns={'l_ID':'ltable_ID'}, inplace=True)
    C_exp.rename(columns={'r_ID':'rtable_ID'}, inplace=True)
    cm.set_fk_ltable(C_exp, 'ltable_ID')
    cm.set_fk_rtable(C_exp, 'rtable_ID')
    # C_exp.sort_values(['l_ID', 'r_ID'], inplace=True)
    # C_exp.reset_index(inplace=True, drop=True)
    # C_exp['_id'] = six.moves.range(0, len(C_exp))
    # C_exp.drop('r_address', axis=1, inplace=True)
    # Exact-equality check is skipped on Windows — presumably due to a
    # platform-dependent ordering/line-ending issue; confirm.
    if os.name != 'nt':
        self.assertEqual(C.equals(C_exp), True)
    p1 = cm.get_all_properties(C)
    p2 = cm.get_all_properties(C_exp)
    self.assertEqual(p1, p2)
def test_sample_table_valid_1(self):
    """sample_table must return exactly 10 rows and carry over metadata."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    cand = read_csv_metadata(path_c, ltable=A, rtable=B)
    sampled = sample_table(cand, 10, False)
    self.assertEqual(cm.get_all_properties(cand),
                     cm.get_all_properties(sampled))
    self.assertEqual(len(sampled), 10)
def test_eval_matches_valid_2(self):
    """eval_matches on a table with one gold-0 row and all-1 predictions
    should report 14/14 precision, 14/15 recall and one false negative
    ('a1', 'b1'). Note the attribute arguments here are ('predicted',
    'gold') — reversed relative to the other eval tests; presumably
    deliberate, confirm against eval_matches' signature.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
    # One 0-label followed by len(C1)-1 1-labels; predictions are all 1s.
    num_ones = 1
    num_zeros = len(C1) - num_ones
    gold = [0] * num_ones
    gold.extend([1] * num_zeros)
    predicted = [1] * (num_zeros + num_ones)
    ln = len(C1.columns)
    C1.insert(ln, 'gold', gold)
    C1.insert(ln + 1, 'predicted', predicted)
    cm.copy_properties(C, C1)
    result = eval_matches(C1, 'predicted', 'gold')
    self.assertEqual(isinstance(result, dict), True)
    self.assertEqual(result['prec_numerator'], 14)
    self.assertEqual(result['prec_denominator'], 14)
    self.assertAlmostEqual(result['precision'], 1)
    self.assertEqual(result['recall_numerator'], 14)
    self.assertEqual(result['recall_denominator'], 15)
    self.assertEqual(result['recall'], 0.9333333333333333)
    self.assertEqual(result['f1'], 0.9655172413793104)
    self.assertEqual(result['pred_pos_num'], 14)
    self.assertEqual(result['false_pos_num'], 0.0)
    self.assertEqual(len(result['false_pos_ls']), 0)
    self.assertEqual(result['pred_neg_num'], 1)
    self.assertEqual(result['false_neg_num'], 1.0)
    self.assertEqual(len(result['false_neg_ls']), 1)
    # The single false negative is the (a1, b1) pair.
    t = result['false_neg_ls'][0]
    self.assertEqual(t[0], 'a1')
    self.assertEqual(t[1], 'b1')
def test_assemble_topk_table_2(self):
    """_assemble_topk_table should turn a 3-entry (score, l_idx, r_idx)
    heap into a 3-row dataframe with fully-prefixed columns and the
    expected record contents.
    """
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    A_key = em.get_key(A)
    B_key = em.get_key(B)
    # Each heap entry is (similarity score, ltable row index, rtable row index).
    topk_heap = [(0.2727272727272727, 1, 0), (0.23076923076923078, 0, 4),
                 (0.16666666666666666, 0, 3)]
    ret_dataframe = db._assemble_topk_table(topk_heap, A, B, A_key, B_key)
    expected_columns = ['_id', 'ltable_ID', 'rtable_ID', 'ltable_name',
                        'ltable_birth_year', 'ltable_hourly_wage',
                        'ltable_address', 'ltable_zipcode', 'rtable_name',
                        'rtable_birth_year', 'rtable_hourly_wage',
                        'rtable_address', 'rtable_zipcode']
    self.assertEqual(len(ret_dataframe), 3)
    self.assertEqual(list(ret_dataframe.columns), expected_columns)
    expected_recs = [[0, 'a2', 'b1', 'Michael Franklin', 1988, 27.5,
                      '1652 Stockton St, San Francisco', 94122,
                      'Mark Levene', 1987, 29.5,
                      '108 Clement St, San Francisco', 94107],
                     [1, 'a1', 'b5', 'Kevin Smith', 1989, 30.0,
                      '607 From St, San Francisco', 94107,
                      'Alfons Kemper', 1984, 35.0,
                      '170 Post St, Apt 4, San Francisco', 94122],
                     [2, 'a1', 'b4', 'Kevin Smith', 1989, 30.0,
                      '607 From St, San Francisco', 94107,
                      'Joseph Kuan', 1982, 26.0,
                      '108 South Park, San Francisco', 94122]]
    # NOTE(review): .ix is deprecated — presumably an older pandas is pinned.
    self.assertEqual(list(ret_dataframe.ix[0]), expected_recs[0])
    self.assertEqual(list(ret_dataframe.ix[1]), expected_recs[1])
    self.assertEqual(list(ret_dataframe.ix[2]), expected_recs[2])
def test_ml_matcher_invalid_input_combn_fit(self):
    """fit() is given both x= and table= — an invalid combination."""
    left = read_csv_metadata(fpath_a, key='id')
    right = read_csv_metadata(fpath_b, key='id')
    fv = read_csv_metadata(fpath_f, ltable=left, rtable=right)
    split = mu.split_train_test(fv)
    train, test = split['train'], split['test']
    matcher = DTMatcher(name='DecisionTree')
    matcher.fit(x=train, table=train)
def test_valid_path_df_metadata_invalid_rtable(self):
    """read_csv_metadata is given a plain string as rtable."""
    cm.del_catalog()
    del_files_in_dir(sndbx_path)
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    # path_c = os.sep.join([io_datasets_path, 'C_partialmeta.csv'])
    # rtable is a string rather than a DataFrame.
    C = read_csv_metadata(path_c, rtable="temp", ltable=A)
def test_valid_path_type_is_not_string(self):
    """A non-string path must raise AssertionError with a clear message."""
    cm.del_catalog()
    with self.assertRaises(AssertionError) as ctx:
        read_csv_metadata(1001)
    self.assertEqual(str(ctx.exception),
                     'Input file path: 1001 \nis not of type string')
def test_debugblocker_18(self):
    """debug_blocker is exercised on the multi-process path (n_jobs=2)."""
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    cand = read_csv_metadata(path_c, ltable=A, rtable=B,
                             fk_ltable='ltable_ID', fk_rtable='rtable_ID',
                             key='_id')
    ret_table = db.debug_blocker(cand, A, B, n_jobs=2)
def test_select_features_1(self):
    """_select_features should pick the expected feature indices."""
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    key_a = em.get_key(A)
    key_b = em.get_key(B)
    chosen = db._select_features(A, B, key_a, key_b)
    self.assertEqual(chosen, [1, 3, 4, 2, 5])
def test_check_table_order_valid(self):
    """_check_table_order returns True when tables match the metadata."""
    tbl_a = read_csv_metadata(path_a)
    tbl_b = read_csv_metadata(path_b, key='ID')
    types_a = au.get_attr_types(tbl_a)
    types_b = au.get_attr_types(tbl_b)
    corres = au.get_attr_corres(tbl_a, tbl_b)
    status = afg._check_table_order(tbl_a, tbl_b, types_a, types_b, corres)
    self.assertEqual(status, True)
def test_get_features_invalid_ltable_rtable_switch(self):
    """get_features is fed attr_corres built with the tables swapped."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    types_l = au.get_attr_types(A)
    types_r = au.get_attr_types(B)
    # Correspondences computed with B and A deliberately switched.
    corres = au.get_attr_corres(B, A)
    toks = get_tokenizers_for_matching()
    sims = get_sim_funs_for_matching()
    feat_table = afg.get_features(A, B, types_l, types_r, corres, toks,
                                  sims)
def test_debugblocker_7(self):
    """debug_blocker is given a duplicated attribute correspondence."""
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B,
                          fk_ltable='ltable_ID', fk_rtable='rtable_ID',
                          key='_id')
    # Same pair twice, once as a tuple and once as a list.
    corres = [('ID', 'ID'), ['ID', 'ID']]
    db.debug_blocker(C, A, B, 200, corres)
def test_check_table_order_invalid_attrcorres_ltable(self):
    """_check_table_order returns False when attr_corres['ltable'] is not
    the same table that was passed in."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    types_l = au.get_attr_types(A)
    types_r = au.get_attr_types(B)
    corres = au.get_attr_corres(A, B)
    corres['ltable'] = pd.DataFrame()  # break the ltable reference
    status = afg._check_table_order(A, B, types_l, types_r, corres)
    self.assertEqual(status, False)
def test_get_features_for_blocking_valid(self):
    """Every generated blocking feature must be callable on a tuple pair
    and return a non-negative score."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    feat_table = afg.get_features_for_blocking(
        A, B, validate_inferred_attr_types=False)
    self.assertEqual(isinstance(feat_table, pd.DataFrame), True)
    # Apply each feature function to one sample record pair.
    for fn in feat_table['function']:
        score = fn(A.ix[1], B.ix[2])
        self.assertEqual(score >= 0, True)
def test_get_attr_corres_valid_1(self):
    """On identical schemas, get_attr_corres should pair each attribute
    with itself and record both source tables."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    corres = get_attr_corres(A, B)
    for pair in corres['corres']:
        self.assertEqual(pair[0], pair[1])
    self.assertEqual(all(corres['ltable'] == A), True)
    self.assertEqual(all(corres['rtable'] == B), True)
def test_assemble_topk_table_1(self):
    """An empty top-k heap should produce an empty dataframe."""
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    key_a = em.get_key(A)
    key_b = em.get_key(B)
    empty_heap = []
    result = db._assemble_topk_table(empty_heap, A, B, key_a, key_b)
    self.assertEqual(len(result), 0)
    self.assertEqual(list(result.columns), [])
def test_validate_types_1(self):
    """_validate_types should accept well-formed inputs with attr_corres
    left as None."""
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B,
                          fk_ltable='ltable_ID', fk_rtable='rtable_ID',
                          key='_id')
    key_a = em.get_key(A)
    key_b = em.get_key(B)
    corres = None
    db._validate_types(A, B, C, 100, corres, False)
def test_down_sample_inv_index_valid_1(self):
    """_inv_index on a non-empty table should yield a non-empty index."""
    table = read_csv_metadata(path_a)
    index = _inv_index(table)
    self.assertNotEqual(len(index), 0)
def test_valid_path_wi_valid_metadata(self):
    """read_csv_metadata should load the same data as plain pandas and
    pick up the key from the metadata file."""
    cm.del_catalog()
    loaded = read_csv_metadata(path_a)
    raw = pd.read_csv(path_a)
    self.assertEqual(loaded.equals(raw), True)
    self.assertEqual(cm.get_key(loaded), 'ID')
def test_invalid_nonstr_path(self):
    """read_csv_metadata is given a numeric path argument."""
    cm.del_catalog()
    table = read_csv_metadata(10)
def test_invalid_str_path(self):
    """read_csv_metadata is given a path to a file that does not exist."""
    cm.del_catalog()
    missing = os.sep.join([io_datasets_path, 'xyz.csv'])
    table = read_csv_metadata(missing)
def test_debugblocker_10(self):
    """debug_blocker is given empty rtable/candidate-set dataframes."""
    A = read_csv_metadata(path_a)
    empty_b = pd.DataFrame([])
    empty_c = pd.DataFrame([])
    db.debug_blocker(empty_c, A, empty_b)
def test_valid_path_wi_invalidmetadata_wrongformat(self):
    """read_csv_metadata is given a malformed metadata file plus an
    explicit key."""
    cm.del_catalog()
    bad_path = os.sep.join([io_datasets_path, 'A_md_wrongformat.csv'])
    IM = read_csv_metadata(bad_path, key='ID')
def test_check_input_field_correspondence_list_5(self):
    """A correspondence naming a field missing from B is exercised."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b)
    # 'fdsa' does not exist in B.
    pairs = [('address', 'fdsa')]
    db._check_input_field_correspondence_list(A, B, pairs)
def setUp(self):
    """Load the left (A) and right (B) tables fresh before each test."""
    self.A = read_csv_metadata(path_a, key='ID')
    self.B = read_csv_metadata(path_b, key='ID')
def test_get_record_id_to_index_map_1(self):
    """_get_record_id_to_index_map should map each key to its row index."""
    A = read_csv_metadata(path_a, key='ID')
    key = em.get_key(A)
    mapping = db._get_record_id_to_index_map(A, key)
    expected = {'a1': 0, 'a3': 2, 'a2': 1, 'a5': 4, 'a4': 3}
    self.assertEqual(mapping, expected)
def test_invalid_path_2(self):
    """to_csv_metadata is given a None output path."""
    cm.del_catalog()
    del_files_in_dir(sndbx_path)
    table = read_csv_metadata(path_a)
    to_csv_metadata(table, None)
def test_check_input_field_correspondence_list_2(self):
    """An empty correspondence list is exercised."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b)
    pairs = []
    db._check_input_field_correspondence_list(A, B, pairs)
def test_debugblocker_2(self):
    """debug_blocker is given plain lists for rtable and candidate set."""
    A = read_csv_metadata(path_a)
    bad_rtable = []
    bad_cand = []
    db.debug_blocker(bad_cand, A, bad_rtable)
def test_debugblocker_3(self):
    """debug_blocker is given a None candidate set."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b)
    db.debug_blocker(None, A, B)
def test_debugblocker_4(self):
    """debug_blocker is given a string instead of an integer output size."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b)
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    bad_size = '200'
    db.debug_blocker(C, A, B, bad_size)
def test_down_sample_probe_index_validchk1(self):
    """_probe_index should return a non-empty collection of indices."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    index = _inv_index(A)
    sampled = _probe_index(B, 5, len(A), index)
    self.assertNotEqual(len(sampled), 0)
def test_valid_path_wi_invalidmetadata_replace_key(self):
    """An explicit key argument should take effect even when the metadata
    file declares a different key."""
    cm.del_catalog()
    meta_path = os.sep.join([io_datasets_path, 'A_key_zipcode.csv'])
    table = read_csv_metadata(meta_path, key='ID')
    self.assertEqual(cm.is_dfinfo_present(table), True)
    self.assertEqual(cm.is_property_present_for_df(table, 'key'), True)
def test_debugblocker_11(self):
    """debug_blocker is given a zero output size."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b)
    empty_cand = pd.DataFrame([])
    zero_size = 0
    db.debug_blocker(empty_cand, A, B, zero_size)
def test_validpath_metadata_set_to_none_1(self):
    """key=None should still register the dataframe in the catalog; the
    trailing get_key call then exercises the missing-key path."""
    cm.del_catalog()
    del_files_in_dir(sndbx_path)
    table = read_csv_metadata(path_a, key=None)
    self.assertEqual(cm.is_dfinfo_present(table), True)
    cm.get_key(table)
def test_valid_path_wi_invalidmetadata_wrongkey(self):
    """read_csv_metadata is given a metadata file declaring a bad key."""
    cm.del_catalog()
    bad_path = os.sep.join([io_datasets_path, 'InvalidMetadata2.csv'])
    IM = read_csv_metadata(bad_path)
def test_valid_path_wi_metadata_unknownprop(self):
    """Unknown metadata properties should still be stored verbatim."""
    cm.del_catalog()
    prop_path = os.sep.join([io_datasets_path, 'InvalidMetadata1.csv'])
    table = read_csv_metadata(prop_path)
    self.assertEqual(cm.is_dfinfo_present(table), True)
    self.assertEqual(cm.get_property(table, 'key1'), 'ID')
def test_valid_path_wo_metadata(self):
    """Loading without a metadata file should match plain pandas and
    still register the dataframe in the catalog."""
    cm.del_catalog()
    loaded = read_csv_metadata(path_b)
    raw = pd.read_csv(path_b)
    self.assertEqual(loaded.equals(raw), True)
    self.assertEqual(cm.is_dfinfo_present(loaded), True)
def test_down_sample_get_str_cols_list_valid2(self):
    """_get_str_cols_list should find string columns in table A."""
    table = read_csv_metadata(path_a)
    str_cols = _get_str_cols_list(table)
    self.assertNotEqual(len(str_cols), 0)
def test_debugblocker_5(self):
    """debug_blocker is given a set (not a list) for attr_corres."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b)
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    bad_corres = set()
    db.debug_blocker(C, A, B, 200, bad_corres)
def test_down_sample_inv_index_value_check(self):
    """The inverted index must have entries for the token 'beach'."""
    table = read_csv_metadata(path_a)
    index = _inv_index(table)
    self.assertNotEqual(len(index.get('beach')), 0)
def test_down_sample_probe_index_invalid_set(self):
    """_probe_index must return a plain set."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    index = _inv_index(A)
    picked = _probe_index(B, 5, len(A), index)
    self.assertTrue(type(picked) is set)
def test_check_input_field_correspondence_list_7(self):
    """Correspondences naming fields present in both tables are exercised."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b)
    pairs = [('zipcode', 'zipcode'), ('birth_year', 'birth_year')]
    db._check_input_field_correspondence_list(A, B, pairs)
def test_down_sample_get_str_cols_list_valid1(self):
    """_get_str_cols_list is exercised with an empty dataframe."""
    table = read_csv_metadata(path_a)
    str_cols = _get_str_cols_list(pd.DataFrame())
def test_get_features_for_blocking_invalid_df2(self):
    """get_features_for_blocking is given a None right table."""
    A = read_csv_metadata(path_a)
    # B = read_csv_metadata(path_b, key='ID')
    feat_table = afg.get_features_for_blocking(
        A, None, validate_inferred_attr_types=False)
def test_down_sample_inv_index_key_check(self):
    """The inverted index must contain the token 'meadows'."""
    table = read_csv_metadata(path_a)
    index = _inv_index(table)
    self.assertTrue('meadows' in index)