def test_get_property_valid_df_name_2(self):
     # Start from an empty catalog, build a candidate table C linked to
     # its ltable/rtable, and verify every metadata property recorded for C.
     self.assertEqual(cm.get_catalog_len(), 0)
     left = read_csv_metadata(path_a)
     right = read_csv_metadata(path_b, key='ID')
     cand = read_csv_metadata(path_c, ltable=left, rtable=right)
     # Key and foreign-key attributes set by read_csv_metadata.
     self.assertEqual(cm.get_property(cand, 'key'), '_id')
     self.assertEqual(cm.get_property(cand, 'fk_ltable'), 'ltable_ID')
     self.assertEqual(cm.get_property(cand, 'fk_rtable'), 'rtable_ID')
     # The stored ltable/rtable must be the exact DataFrames we passed in.
     self.assertEqual(cm.get_property(cand, 'ltable').equals(left), True)
     self.assertEqual(cm.get_property(cand, 'rtable').equals(right), True)
 def test_set_property_valid_df_name_value(self):
     # Setting a property on a plain DataFrame should make it readable back
     # through the catalog with the same value.
     table = pd.read_csv(path_a)
     cm.set_property(table, 'key', 'ID')
     self.assertEqual(cm.get_property(table, 'key'), 'ID')
 def test_get_property_df_notin_catalog(self):
     # Querying a property for a DataFrame that was never registered in the
     # catalog; the decorated test (outside this view) presumably expects an
     # error — TODO confirm against the enclosing class.
     table = pd.read_csv(path_a)
     cm.get_property(table, 'key')
 def test_get_property_invalid_path_1(self):
     # Asking for a None property name on a catalog-registered DataFrame;
     # exercises get_property's argument validation.
     table = read_csv_metadata(path_a)
     cm.get_property(table, None)
 def test_get_property_invalid_df_1(self):
     # A non-DataFrame first argument exercises get_property's type check.
     cm.get_property(10, 'key')
 def test_get_fk_rtable_valid(self):
     # get_fk_rtable must be a shorthand for get_property(..., 'fk_rtable')
     # and must return the expected attribute name.
     left = read_csv_metadata(path_a)
     right = read_csv_metadata(path_b)
     cand = read_csv_metadata(path_c, ltable=left, rtable=right)
     self.assertEqual(cm.get_fk_rtable(cand),
                      cm.get_property(cand, 'fk_rtable'))
     self.assertEqual(cm.get_fk_rtable(cand), 'rtable_ID')
 def test_get_property_valid_df_name_1(self):
     # read_csv_metadata should record the key inferred from the metadata
     # file, retrievable via get_property.
     table = read_csv_metadata(path_a)
     self.assertEqual(cm.get_property(table, 'key'), 'ID')
Example #8
0
def eval_matches(data_frame, gold_label_attr, predicted_label_attr):
    """
    Evaluates the matches from the matcher.

    Specifically, given a DataFrame containing golden labels and predicted
    labels, this function would evaluate the matches and return the accuracy
    results such as precision, recall and F1.

    Args:
        data_frame (DataFrame): The input pandas DataFrame containing "gold"
            labels and "predicted" labels.
        gold_label_attr (string): An attribute in the input DataFrame containing
            "gold" labels.
        predicted_label_attr (string): An attribute in the input DataFrame
            containing "predicted" labels.

    Returns:
        A Python dictionary containing the accuracy measures such as
        precision, recall, F1.

    Raises:
        AssertionError: If `data_frame` is not of type
            pandas DataFrame.
        AssertionError: If `gold_label_attr` is not of
            type string.
        AssertionError: If `predicted_label_attr` is not of
            type string.
        AssertionError: If the `gold_label_attr` is not in
            the input dataFrame.
        AssertionError: If the `predicted_label_attr` is not in
            the input dataFrame.
        AssertionError: If the `fk_ltable` or `fk_rtable` column of the
            input DataFrame contains missing values.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feat_vecs(G, feat_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> pred_table = dt.predict(table=H,  exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'],  append=True, target_attr='predicted_labels')
        >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels')
    """
    # Validate input parameters

    # # We expect the input object to be of type pandas DataFrame
    validate_object_type(data_frame, pd.DataFrame, 'The input table')

    # # We expect the input attribute (gold_label_attr) to be of type string
    validate_object_type(gold_label_attr, six.string_types,
                         'The input gold_label_attr')

    # # We expect the input attribute (predicted_label_attr) to be of type
    # string
    validate_object_type(predicted_label_attr, six.string_types,
                         'The input predicted_label_attr')

    # Check whether the gold label attribute is present in the input table
    if not ch.check_attrs_present(data_frame, gold_label_attr):
        logger.error(
            'The gold_label_attr is not present in the input DataFrame')
        raise AssertionError(
            'The gold_label_attr is not present in the input DataFrame')

    # Check whether the predicted label attribute is present in the input table
    if not ch.check_attrs_present(data_frame, predicted_label_attr):
        logger.error(
            'The predicted_label_attr is not present in the input DataFrame')
        raise AssertionError(
            'The predicted_label_attr is not present in the input DataFrame')

    # Reset the index to get the indices set as 0..len(table), so the
    # positional index sets computed below line up across gold/predicted.
    new_data_frame = data_frame.reset_index(drop=False, inplace=False)

    # Project out the gold and label attributes.
    gold = new_data_frame[gold_label_attr]
    predicted = new_data_frame[predicted_label_attr]

    # Get gold negatives, positives (labels are assumed to be 0/1)
    gold_negative = gold[gold == 0].index.values
    gold_positive = gold[gold == 1].index.values

    # Get predicted negatives, positives
    predicted_negative = predicted[predicted == 0].index.values
    predicted_positive = predicted[predicted == 1].index.values

    # get false positive indices (gold says no-match, matcher said match)
    false_positive_indices = \
        list(set(gold_negative).intersection(predicted_positive))

    # get true positive indices
    true_positive_indices = \
        list(set(gold_positive).intersection(predicted_positive))

    # get false negative indices (gold says match, matcher said no-match)
    false_negative_indices = \
        list(set(gold_positive).intersection(predicted_negative))

    # get true negative indices
    true_negative_indices = \
        list(set(gold_negative).intersection(predicted_negative))

    # Get the number of TP, FP, FN, TN (as floats for the divisions below)
    num_true_positives = float(len(true_positive_indices))
    num_false_positives = float(len(false_positive_indices))
    num_false_negatives = float(len(false_negative_indices))
    num_true_negatives = float(len(true_negative_indices))

    # Precision = num_tp / (num_tp + num_fp)
    # Get precision numerator, denominator
    precision_numerator = num_true_positives
    precision_denominator = num_true_positives + num_false_positives

    # Recall = num_tp / (num_tp + num_fn)
    # Get recall numerator, denominator
    recall_numerator = num_true_positives
    recall_denominator = num_true_positives + num_false_negatives

    # Compute precision (0.0 when there are no predicted positives)
    if precision_denominator == 0.0:
        precision = 0.0
    else:
        precision = precision_numerator / precision_denominator

    # Compute recall (0.0 when there are no gold positives)
    if recall_denominator == 0.0:
        recall = 0.0
    else:
        recall = recall_numerator / recall_denominator

    # Compute F1, the harmonic mean of precision and recall
    if precision == 0.0 and recall == 0.0:
        F1 = 0.0
    else:
        F1 = (2.0 * precision * recall) / (precision + recall)

    # Get the fk_ltable and fk_rtable from the catalog; these identify the
    # tuple pairs so the false positive/negative lists are reported as
    # (ltable key, rtable key) pairs.
    fk_ltable = cm.get_property(data_frame, 'fk_ltable')
    fk_rtable = cm.get_property(data_frame, 'fk_rtable')

    # Check if the fk_ltable contain any missing values
    if ch.does_contain_missing_vals(data_frame, fk_ltable):
        logger.error('The fk_ltable (%s) contains missing values' % fk_ltable)
        raise AssertionError('The fk_ltable (%s) contains missing values' %
                             fk_ltable)

    # Check if the fk_rtable contain any missing values
    if ch.does_contain_missing_vals(data_frame, fk_rtable):
        logger.error('The fk_rtable (%s) contains missing values' % fk_rtable)
        raise AssertionError('The fk_rtable (%s) contains missing values' %
                             fk_rtable)

    # Set the index values to fk_ltable and fk_rtable
    new_data_frame.set_index([fk_ltable, fk_rtable], drop=False, inplace=True)

    # Get the list of false positives and false negatives as key pairs.
    false_pos_ls = list(
        new_data_frame.iloc[false_positive_indices].index.values)
    false_neg_ls = list(
        new_data_frame.iloc[false_negative_indices].index.values)

    # Store and return the accuracy results.
    accuracy_results = collections.OrderedDict()
    accuracy_results['prec_numerator'] = precision_numerator
    accuracy_results['prec_denominator'] = precision_denominator
    accuracy_results['precision'] = precision
    accuracy_results['recall_numerator'] = recall_numerator
    accuracy_results['recall_denominator'] = recall_denominator
    accuracy_results['recall'] = recall
    accuracy_results['f1'] = F1
    accuracy_results['pred_pos_num'] = num_true_positives + num_false_positives
    accuracy_results['false_pos_num'] = num_false_positives
    accuracy_results['false_pos_ls'] = false_pos_ls
    accuracy_results['pred_neg_num'] = num_false_negatives + num_true_negatives
    accuracy_results['false_neg_num'] = num_false_negatives
    accuracy_results['false_neg_ls'] = false_neg_ls
    return accuracy_results
 def test_valid_path_wi_metadata_unknownprop(self):
     # Reading a file whose metadata header contains an unknown property
     # ('key1') should still register the DataFrame and store the property
     # verbatim in the catalog.
     cm.del_catalog()
     meta_path = os.sep.join([io_datasets_path, 'InvalidMetadata1.csv'])
     table = read_csv_metadata(meta_path)
     self.assertEqual(cm.is_dfinfo_present(table), True)
     self.assertEqual(cm.get_property(table, 'key1'), 'ID')
 # NOTE(review): this method was an exact duplicate of the definition above;
 # in a class body the second `def` silently shadows the first, so only one
 # of the two tests would ever run. Renamed so both are collected.
 def test_valid_path_wi_metadata_unknownprop_2(self):
     # Reading a file whose metadata header contains an unknown property
     # ('key1') should still register the DataFrame and store the property
     # verbatim in the catalog.
     cm.del_catalog()
     p = os.sep.join([io_datasets_path, 'InvalidMetadata1.csv'])
     IM = read_csv_metadata(p)
     self.assertEqual(cm.is_dfinfo_present(IM), True)
     self.assertEqual(cm.get_property(IM, 'key1'), 'ID')