def test_eval_matches_valid_2(self):
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
    num_ones = 1
    num_zeros = len(C1) - num_ones
    gold = [0] * num_ones
    gold.extend([1] * num_zeros)
    predicted = [1] * (num_zeros + num_ones)
    ln = len(C1.columns)
    C1.insert(ln, 'gold', gold)
    C1.insert(ln + 1, 'predicted', predicted)
    cm.copy_properties(C, C1)
    result = eval_matches(C1, 'predicted', 'gold')

    self.assertEqual(isinstance(result, dict), True)
    self.assertEqual(result['prec_numerator'], 14)
    self.assertEqual(result['prec_denominator'], 14)
    self.assertAlmostEqual(result['precision'], 1)
    self.assertEqual(result['recall_numerator'], 14)
    self.assertEqual(result['recall_denominator'], 15)
    self.assertEqual(result['recall'], 0.9333333333333333)
    self.assertEqual(result['f1'], 0.9655172413793104)
    self.assertEqual(result['pred_pos_num'], 14)
    self.assertEqual(result['false_pos_num'], 0.0)
    self.assertEqual(len(result['false_pos_ls']), 0)
    self.assertEqual(result['pred_neg_num'], 1)
    self.assertEqual(result['false_neg_num'], 1.0)
    self.assertEqual(len(result['false_neg_ls']), 1)
    t = result['false_neg_ls'][0]
    self.assertEqual(t[0], 'a1')
    self.assertEqual(t[1], 'b1')
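# Arithmetic behind the expected values above: eval_matches is called with the
# argument roles swapped, so the all-ones 'predicted' column acts as the gold
# labels (15 positives) and the 'gold' column (one 0 followed by fourteen 1s)
# acts as the predictions (14 predicted positives, 1 predicted negative).
#   precision = 14 / 14 = 1.0
#   recall    = 14 / 15 = 0.9333333333333333
#   f1        = 2*P*R / (P + R) = (28/15) / (29/15) = 28/29 = 0.9655172413793104
# The single predicted negative is the true pair ('a1', 'b1'), which is why it
# shows up as the lone entry in false_neg_ls.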
def test_eval_matches_valid_3(self):
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
    num_ones = len(C1)
    num_zeros = len(C1) - num_ones
    gold = [0] * num_ones
    # gold.extend([1] * num_zeros)
    predicted = [1] * (num_zeros + num_ones)
    ln = len(C1.columns)
    C1.insert(ln, 'gold', gold)
    C1.insert(ln + 1, 'predicted', predicted)
    D = pd.DataFrame(columns=C1.columns)
    cm.copy_properties(C, D)
    result = eval_matches(D, 'gold', 'predicted')

    self.assertEqual(isinstance(result, dict), True)
    self.assertEqual(result['prec_numerator'], 0)
    self.assertEqual(result['prec_denominator'], 0)
    self.assertAlmostEqual(result['precision'], 0)
    self.assertEqual(result['recall_numerator'], 0)
    self.assertEqual(result['recall_denominator'], 0)
    self.assertEqual(result['recall'], 0)
    self.assertEqual(result['f1'], 0)
    self.assertEqual(result['pred_pos_num'], 0)
    self.assertEqual(result['false_pos_num'], 0.0)
    self.assertEqual(len(result['false_pos_ls']), 0)
    self.assertEqual(result['pred_neg_num'], 0)
    self.assertEqual(result['false_neg_num'], 0.0)
    self.assertEqual(len(result['false_neg_ls']), 0)
def test_eval_matches_predicted_attr_not_in_df(self):
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
    num_ones = 1
    num_zeros = len(C1) - num_ones
    gold = [0] * num_ones
    gold.extend([1] * num_zeros)
    predicted = [1] * (num_zeros + num_ones)
    ln = len(C1.columns)
    C1.insert(ln, 'gold', gold)
    C1.insert(ln + 1, 'predicted', predicted)
    cm.copy_properties(C, C1)
    # 'predicted1' is not a column in C1, so eval_matches is expected to
    # reject the predicted attribute and raise an error.
    result = eval_matches(C1, 'gold', 'predicted1')
def cv_matcher_and_trigger(matcher, triggers, table, exclude_attrs,
                           target_attr, k=5, metric=None, random_state=None):
    """
    Cross validate matcher and trigger.

    Parameters
    ----------
    matcher : object
        An ML-matcher object in Magellan.
    triggers : list
        A list of MatchTrigger objects.
    table : MTable
        The table on which match + trigger should be done.
    exclude_attrs : list of strings
        Attribute names that should be excluded from training and evaluation.
    target_attr : string
        Attribute name containing labels in the 'table'.
    k : integer
        The number of folds for cross-validation. The default value is 5.
    metric : list of strings
        Currently, the following values are allowed: 'precision', 'recall',
        'f1'. The list should form a subset of ['precision', 'recall', 'f1'].
        The default value is None; if None, all three metrics are computed
        for each fold and returned to the user.
    random_state : int
        Pseudo-random number generator state used for random sampling.
        If None, the default numpy RNG is used for shuffling.

    Returns
    -------
    res : OrderedDict
        An ordered dictionary with two keys: 'cv_stats', a DataFrame with the
        per-fold and mean scores for each requested metric, and 'fold_stats',
        the list of per-fold evaluation summaries.
    """
    metric = validate_and_get_metric_as_list(metric)
    folds = KFold(len(table), k, shuffle=True, random_state=random_state)
    table = table.copy()
    # Allow a single trigger to be passed without wrapping it in a list.
    if not isinstance(triggers, list):
        triggers = [triggers]
    eval_ls = []
    ltable = table.get_property('ltable')
    rtable = table.get_property('rtable')
    foreign_key_ltable = table.get_property('foreign_key_ltable')
    foreign_key_rtable = table.get_property('foreign_key_rtable')
    if mg._progbar:
        bar = pyprind.ProgBar(k)
    for train_ind, test_ind in folds:
        train = mg.create_mtable(table.iloc[train_ind], key=table.get_key(),
                                 ltable=ltable, rtable=rtable,
                                 foreign_key_ltable=foreign_key_ltable,
                                 foreign_key_rtable=foreign_key_rtable)
        test = mg.create_mtable(table.iloc[test_ind], key=table.get_key(),
                                ltable=ltable, rtable=rtable,
                                foreign_key_ltable=foreign_key_ltable,
                                foreign_key_rtable=foreign_key_rtable)
        if isinstance(matcher, BooleanRuleMatcher):
            # Rule-based matchers need no fitting; predict on the test fold
            # directly.
            pred_col = get_name_for_predict_column(table.columns)
            predicted = matcher.predict(table=test, append=True,
                                        target_attr=pred_col, inplace=False)
        else:
            # Clone the underlying classifier so each fold trains from
            # scratch instead of reusing the previous fold's fitted state.
            matcher.clf = clone(matcher.clf)
            matcher.fit(table=train, exclude_attrs=exclude_attrs,
                        target_attr=target_attr)
            pred_col = get_name_for_predict_column(table.columns)
            predicted = matcher.predict(table=test,
                                        exclude_attrs=exclude_attrs,
                                        append=True, target_attr=pred_col,
                                        inplace=False)
        # Apply each trigger to revise the fold's predictions in place.
        for t in triggers:
            t.execute(predicted, pred_col, inplace=True)
        eval_summary = eval_matches(predicted, target_attr, pred_col)
        eval_ls.append(eval_summary)
        if mg._progbar:
            bar.update()
    header = ['Metric', 'Num folds']
    fold_header = ['Fold ' + str(i + 1) for i in range(k)]
    header.extend(fold_header)
    header.append('Mean score')
    dict_list = []
    for m in metric:
        d = get_metric_dict(eval_ls, k, m, header)
        dict_list.append(d)
    stats = pd.DataFrame(dict_list)
    stats = stats[header]
    res = OrderedDict()
    res['cv_stats'] = stats
    res['fold_stats'] = eval_ls
    return res
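# A minimal usage sketch for cv_matcher_and_trigger, assuming the Magellan
# (mg) API. The matcher/trigger construction below is an assumption based on
# Magellan's documented DTMatcher and MatchTrigger interfaces; F (a feature
# table), G (a labeled candidate set), and the rule string are hypothetical
# placeholders. Only the cv_matcher_and_trigger call itself follows the
# function defined above.
#
#     dt = mg.DTMatcher()          # decision-tree ML-matcher (assumed)
#     trigger = mg.MatchTrigger()  # assumed trigger constructor
#     trigger.add_cond_rule('name_name_lev(ltuple, rtuple) < 0.4', F)
#     trigger.add_cond_status(True)   # fire when the rule evaluates to True
#     trigger.add_action(0)           # flip the prediction to 0 for fired pairs
#     res = cv_matcher_and_trigger(dt, [trigger], table=G,
#                                  exclude_attrs=['_id', 'ltable_ID',
#                                                 'rtable_ID'],
#                                  target_attr='gold', k=5,
#                                  metric=['precision', 'recall', 'f1'])
#     res['cv_stats']  # one row per metric: per-fold scores plus the mean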
def test_eval_matches_invalid_predicted_attr(self):
    # Expected to fail: the predicted attribute must be a string, not None.
    eval_matches(pd.DataFrame(), "", None)

def test_eval_matches_invalid_gold_attr(self):
    # Expected to fail: the gold attribute must be a string, not None.
    eval_matches(pd.DataFrame(), None, "")

def test_eval_matches_invalid_df(self):
    # Expected to fail: the input table must be a DataFrame, not None.
    eval_matches(None, "", "")