예제 #1
0
 def test_get_property_valid_df_name_2(self):
     # Read tables A and B, read C with A/B passed as its ltable/rtable, and
     # check every property the catalog then reports for C.
     # NOTE: the catalog reset below is disabled, so the zero-length check
     # relies on the catalog already being empty when this test starts.
     # cm.del_catalog()
     self.assertEqual(cm.get_catalog_len(), 0)
     left = read_csv_metadata(path_a)
     right = read_csv_metadata(path_b, key='ID')
     linked = read_csv_metadata(path_c, ltable=left, rtable=right)

     # Plain-valued properties, checked in the same order as before.
     expected_props = (
         ('key', '_id'),
         ('fk_ltable', 'ltable_ID'),
         ('fk_rtable', 'rtable_ID'),
     )
     for prop_name, expected in expected_props:
         self.assertEqual(cm.get_property(linked, prop_name), expected)

     # The table-valued properties must compare equal to the frames passed in.
     ltable_same = cm.get_property(linked, 'ltable').equals(left)
     self.assertEqual(ltable_same, True)
     rtable_same = cm.get_property(linked, 'rtable').equals(right)
     self.assertEqual(rtable_same, True)
예제 #2
0
def eval_matches(X, gold_label_attr, predicted_label_attr):
    """
    Compare predicted labels against gold labels and summarize the result.

    A label value of 1 is treated as a positive and 0 as a negative. Rows
    whose label is neither 0 nor 1 in a column are not counted in any of the
    true/false positive/negative buckets.

    Args:
        X (DataFrame): Table containing both label columns. Its 'fk_ltable'
            and 'fk_rtable' properties are looked up with
            ``cm.get_property`` to identify the mis-predicted rows.
        gold_label_attr (str): Name of the column holding the gold labels.
        predicted_label_attr (str): Name of the column holding the
            predicted labels.

    Returns:
        OrderedDict: With keys 'prec_numerator', 'prec_denominator',
        'precision', 'recall_numerator', 'recall_denominator', 'recall',
        'f1', 'pred_pos_num', 'false_pos_num', 'false_pos_ls',
        'pred_neg_num', 'false_neg_num' and 'false_neg_ls'. Counts are
        floats; the two '*_ls' entries are lists of the
        (fk_ltable, fk_rtable) index values of the false positive and
        false negative rows. Precision, recall and f1 are 0.0 whenever
        their denominator would be zero.

    Raises:
        AssertionError: If X is not a DataFrame, if either attribute name
            is not a string, or if either attribute is not present in X.
    """
    if not isinstance(X, pd.DataFrame):
        logger.error('The input table is not of type dataframe')
        raise AssertionError('The input is not of type dataframe')

    if not isinstance(gold_label_attr, six.string_types):
        logger.error('The input gold_label_attr is not of type string')
        raise AssertionError('The input gold_label_attr is not of type string')

    if not isinstance(predicted_label_attr, six.string_types):
        logger.error('The input predicted_label_attr is not of type string')
        raise AssertionError(
            'The input predicted_label_attr is not of type string')

    if not check_attrs_present(X, gold_label_attr):
        logger.error('The gold_label_attr is not present in the input table')
        raise AssertionError(
            'The gold_label_attr is not present in the input table')

    if not check_attrs_present(X, predicted_label_attr):
        logger.error(
            'The predicted_label_attr is not present in the input table')
        raise AssertionError(
            'The predicted_label_attr is not present in the input table')

    # Work on a re-indexed copy: after reset_index the index is 0..n-1, so
    # the index values collected below are also row positions.
    Y = X.reset_index(drop=False, inplace=False)
    g = Y[gold_label_attr]
    p = Y[predicted_label_attr]

    # Row positions of the negative (0) and positive (1) labels.
    gold_neg = set(g[g == 0].index.values)
    gold_pos = set(g[g == 1].index.values)
    pred_neg = p[p == 0].index.values
    pred_pos = p[p == 1].index.values

    # Sorted so that the returned lists follow the row order of X instead of
    # an arbitrary set iteration order.
    fp_indices = sorted(gold_neg.intersection(pred_pos))
    tp_indices = sorted(gold_pos.intersection(pred_pos))
    fn_indices = sorted(gold_pos.intersection(pred_neg))
    tn_indices = sorted(gold_neg.intersection(pred_neg))

    n_tp = float(len(tp_indices))
    n_fp = float(len(fp_indices))
    n_fn = float(len(fn_indices))
    n_tn = float(len(tn_indices))

    prec_num = n_tp
    prec_den = n_tp + n_fp
    rec_num = n_tp
    rec_den = n_tp + n_fn

    # Guard every ratio against a zero denominator.
    precision = 0.0 if prec_den == 0.0 else prec_num / prec_den
    recall = 0.0 if rec_den == 0.0 else rec_num / rec_den
    if precision == 0.0 and recall == 0.0:
        f1 = 0.0
    else:
        f1 = (2.0 * precision * recall) / (precision + recall)

    fk_ltable = cm.get_property(X, 'fk_ltable')
    fk_rtable = cm.get_property(X, 'fk_rtable')

    # Re-key the rows by their foreign-key pair, then select by position.
    # .iloc is required here: the collected indices are row positions, and
    # the removed .ix indexer could treat integers as labels on this index.
    Y.set_index([fk_ltable, fk_rtable], drop=False, inplace=True)
    false_pos_ls = list(Y.iloc[fp_indices].index.values)
    false_neg_ls = list(Y.iloc[fn_indices].index.values)

    ret_dict = OrderedDict()
    ret_dict['prec_numerator'] = prec_num
    ret_dict['prec_denominator'] = prec_den
    ret_dict['precision'] = precision
    ret_dict['recall_numerator'] = rec_num
    ret_dict['recall_denominator'] = rec_den
    ret_dict['recall'] = recall
    ret_dict['f1'] = f1
    ret_dict['pred_pos_num'] = n_tp + n_fp
    ret_dict['false_pos_num'] = n_fp
    ret_dict['false_pos_ls'] = false_pos_ls
    ret_dict['pred_neg_num'] = n_fn + n_tn
    ret_dict['false_neg_num'] = n_fn
    ret_dict['false_neg_ls'] = false_neg_ls
    return ret_dict
예제 #3
0
 def test_valid_path_wi_metadata_unknownprop(self):
     # After clearing the catalog, read 'InvalidMetadata1.csv' and check that
     # the catalog holds an entry for it whose 'key1' property is 'ID'.
     cm.del_catalog()
     csv_path = os.sep.join((io_datasets_path, 'InvalidMetadata1.csv'))
     table = read_csv_metadata(csv_path)

     has_entry = cm.is_dfinfo_present(table)
     self.assertEqual(has_entry, True)

     key1_value = cm.get_property(table, 'key1')
     self.assertEqual(key1_value, 'ID')
예제 #4
0
 def test_set_property_valid_df_name_value(self):
     # Set the 'key' property on a frame loaded with plain pandas, then read
     # it back through the catalog.
     # cm.del_catalog()
     table = pd.read_csv(path_a)
     prop_name, prop_value = 'key', 'ID'
     cm.set_property(table, prop_name, prop_value)
     stored_value = cm.get_property(table, prop_name)
     self.assertEqual(stored_value, prop_value)
예제 #5
0
 def test_get_property_df_notin_catalog(self):
     # Ask for a property of a frame loaded with plain pd.read_csv rather
     # than read_csv_metadata. The call is not followed by any assertion.
     # NOTE(review): presumably an exception is expected here (e.g. via a
     # decorator that is not visible in this excerpt) - confirm.
     # cm.del_catalog()
     plain_frame = pd.read_csv(path_a)
     requested_prop = 'key'
     cm.get_property(plain_frame, requested_prop)
예제 #6
0
 def test_get_property_invalid_path_1(self):
     # Call get_property with None in place of a property name. No assertion
     # follows the call.
     # NOTE(review): presumably an exception is expected here - confirm.
     # cm.del_catalog()
     table = read_csv_metadata(path_a)
     missing_prop_name = None
     cm.get_property(table, missing_prop_name)
예제 #7
0
 def test_get_property_invalid_df_1(self):
     # Pass an integer as the first argument of get_property. No assertion
     # follows the call.
     # NOTE(review): presumably an exception is expected here - confirm.
     not_a_table = 10
     cm.get_property(not_a_table, 'key')
예제 #8
0
 def test_get_fk_rtable_valid(self):
     # get_fk_rtable must agree with the generic 'fk_rtable' property lookup
     # and must return 'rtable_ID' for the table read from path_c.
     left = read_csv_metadata(path_a)
     right = read_csv_metadata(path_b)
     linked = read_csv_metadata(path_c, ltable=left, rtable=right)

     via_accessor = cm.get_fk_rtable(linked)
     via_property = cm.get_property(linked, 'fk_rtable')
     self.assertEqual(via_accessor, via_property)

     # Queried a second time, as in the original sequence of calls.
     self.assertEqual(cm.get_fk_rtable(linked), 'rtable_ID')
예제 #9
0
 def test_get_property_valid_df_name_1(self):
     # A table read from path_a with read_csv_metadata must report 'ID' as
     # its 'key' property.
     # cm.del_catalog()
     table = read_csv_metadata(path_a)
     key_name = cm.get_property(table, 'key')
     self.assertEqual(key_name, 'ID')
예제 #10
0
 def test_valid_path_wi_metadata_unknownprop(self):
     # Start from an empty catalog, read 'InvalidMetadata1.csv', and verify
     # that the catalog has an entry for the resulting table and that its
     # 'key1' property equals 'ID'.
     cm.del_catalog()
     path_parts = (io_datasets_path, 'InvalidMetadata1.csv')
     loaded = read_csv_metadata(os.sep.join(path_parts))

     in_catalog = cm.is_dfinfo_present(loaded)
     self.assertEqual(in_catalog, True)

     reported_key1 = cm.get_property(loaded, 'key1')
     self.assertEqual(reported_key1, 'ID')
예제 #11
0
def eval_matches(X, gold_label_attr, predicted_label_attr):
    """
    Compare predicted labels against gold labels and summarize the result.

    A label value of 1 is treated as a positive and 0 as a negative. Rows
    whose label is neither 0 nor 1 in a column are not counted in any of the
    true/false positive/negative buckets.

    Args:
        X (DataFrame): Table containing both label columns. Its 'fk_ltable'
            and 'fk_rtable' properties are looked up with
            ``cm.get_property`` to identify the mis-predicted rows.
        gold_label_attr (str): Name of the column holding the gold labels.
        predicted_label_attr (str): Name of the column holding the
            predicted labels.

    Returns:
        OrderedDict: With keys 'prec_numerator', 'prec_denominator',
        'precision', 'recall_numerator', 'recall_denominator', 'recall',
        'f1', 'pred_pos_num', 'false_pos_num', 'false_pos_ls',
        'pred_neg_num', 'false_neg_num' and 'false_neg_ls'. Counts are
        floats; the two '*_ls' entries are lists of the
        (fk_ltable, fk_rtable) index values of the false positive and
        false negative rows. Precision, recall and f1 are 0.0 whenever
        their denominator would be zero.

    Raises:
        AssertionError: If X is not a DataFrame, if either attribute name
            is not a string, or if either attribute is not present in X.
    """
    if not isinstance(X, pd.DataFrame):
        logger.error('The input table is not of type dataframe')
        raise AssertionError('The input is not of type dataframe')

    if not isinstance(gold_label_attr, six.string_types):
        logger.error('The input gold_label_attr is not of type string')
        raise AssertionError('The input gold_label_attr is not of type string')

    if not isinstance(predicted_label_attr, six.string_types):
        logger.error('The input predicted_label_attr is not of type string')
        raise AssertionError('The input predicted_label_attr is not of type string')

    if not check_attrs_present(X, gold_label_attr):
        logger.error('The gold_label_attr is not present in the input table')
        raise AssertionError('The gold_label_attr is not present in the input table')

    if not check_attrs_present(X, predicted_label_attr):
        logger.error('The predicted_label_attr is not present in the input table')
        raise AssertionError('The predicted_label_attr is not present in the input table')

    # Work on a re-indexed copy: after reset_index the index is 0..n-1, so
    # the index values collected below are also row positions.
    Y = X.reset_index(drop=False, inplace=False)
    g = Y[gold_label_attr]
    p = Y[predicted_label_attr]

    # Row positions of the negative (0) and positive (1) labels.
    gold_neg = set(g[g == 0].index.values)
    gold_pos = set(g[g == 1].index.values)
    pred_neg = p[p == 0].index.values
    pred_pos = p[p == 1].index.values

    # Sorted so that the returned lists follow the row order of X instead of
    # an arbitrary set iteration order.
    fp_indices = sorted(gold_neg.intersection(pred_pos))
    tp_indices = sorted(gold_pos.intersection(pred_pos))
    fn_indices = sorted(gold_pos.intersection(pred_neg))
    tn_indices = sorted(gold_neg.intersection(pred_neg))

    n_tp = float(len(tp_indices))
    n_fp = float(len(fp_indices))
    n_fn = float(len(fn_indices))
    n_tn = float(len(tn_indices))

    prec_num = n_tp
    prec_den = n_tp + n_fp
    rec_num = n_tp
    rec_den = n_tp + n_fn

    # Guard every ratio against a zero denominator.
    precision = 0.0 if prec_den == 0.0 else prec_num / prec_den
    recall = 0.0 if rec_den == 0.0 else rec_num / rec_den
    if precision == 0.0 and recall == 0.0:
        f1 = 0.0
    else:
        f1 = (2.0 * precision * recall) / (precision + recall)

    fk_ltable = cm.get_property(X, 'fk_ltable')
    fk_rtable = cm.get_property(X, 'fk_rtable')

    # Re-key the rows by their foreign-key pair, then select by position.
    # .iloc is required here: the collected indices are row positions, and
    # the removed .ix indexer could treat integers as labels on this index.
    Y.set_index([fk_ltable, fk_rtable], drop=False, inplace=True)
    false_pos_ls = list(Y.iloc[fp_indices].index.values)
    false_neg_ls = list(Y.iloc[fn_indices].index.values)

    ret_dict = OrderedDict()
    ret_dict['prec_numerator'] = prec_num
    ret_dict['prec_denominator'] = prec_den
    ret_dict['precision'] = precision
    ret_dict['recall_numerator'] = rec_num
    ret_dict['recall_denominator'] = rec_den
    ret_dict['recall'] = recall
    ret_dict['f1'] = f1
    ret_dict['pred_pos_num'] = n_tp + n_fp
    ret_dict['false_pos_num'] = n_fp
    ret_dict['false_pos_ls'] = false_pos_ls
    ret_dict['pred_neg_num'] = n_fn + n_tn
    ret_dict['false_neg_num'] = n_fn
    ret_dict['false_neg_ls'] = false_neg_ls
    return ret_dict