Example #1
    def test_recall(self):

        # confusion matrix
        cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED)

        assert rl.recall(LINKS_TRUE, LINKS_PRED) == 1 / 3
        assert rl.recall(cm) == 1 / 3
Example #2
    def test_recall(self):

        # confusion matrix
        cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED)

        self.assertEqual(rl.recall(LINKS_TRUE, LINKS_PRED), 1 / 3)
        self.assertEqual(rl.recall(cm), 1 / 3)
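Both variants assert the same thing; a minimal hypothetical reconstruction of fixtures consistent with them (the real LINKS_TRUE and LINKS_PRED live in the test module):

import pandas as pd
import recordlinkage as rl

# Hypothetical fixtures: three true pairs, of which exactly one is
# predicted, so recall = TP / (TP + FN) = 1 / 3.
LINKS_TRUE = pd.MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)])
LINKS_PRED = pd.MultiIndex.from_tuples([(1, 1)])

print(rl.recall(LINKS_TRUE, LINKS_PRED))  # 0.3333...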
Example #3
def metrics(links_true, links_pred, pairs):
    if len(links_pred) > 0:

        # precision
        precision = rl.precision(links_true, links_pred)

        # recall
        recall = rl.recall(links_true, links_pred)

        # F-score for this classification
        fscore = rl.fscore(links_true, links_pred)

        return {
            'pairs': len(pairs),
            '#duplicates': len(links_pred),
            'precision': precision,
            'recall': recall,
            'fscore': fscore
        }
    else:
        return {
            'pairs': 0,
            '#duplicates': 0,
            'precision': 0,
            'recall': 0,
            'fscore': 0
        }
Example #4
    def test_fscore(self):

        # confusion matrix
        cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))
        prec = rl.precision(LINKS_TRUE, LINKS_PRED)
        rec = rl.recall(LINKS_TRUE, LINKS_PRED)
        expected = float(2 * prec * rec / (prec + rec))

        self.assertEqual(rl.fscore(LINKS_TRUE, LINKS_PRED), expected)
        self.assertEqual(rl.fscore(cm), expected)
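The expected value is simply the harmonic mean of precision and recall, F1 = 2 * precision * recall / (precision + recall); as with recall and precision, rl.fscore accepts either the true/predicted link indexes or a precomputed confusion matrix.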
Example #5
def _compute_performance(test_index, predictions, test_vectors_size):
    LOGGER.info('Running performance evaluation ...')

    confusion_matrix = rl.confusion_matrix(test_index,
                                           predictions,
                                           total=test_vectors_size)
    precision = rl.precision(test_index, predictions)
    recall = rl.recall(test_index, predictions)
    f_score = rl.fscore(confusion_matrix)

    LOGGER.info('Precision: %f - Recall: %f - F-score: %f', precision, recall,
                f_score)
    LOGGER.info('Confusion matrix: %s', confusion_matrix)

    return precision, recall, f_score, confusion_matrix
Example #6
import copy

import numpy
import pandas
import recordlinkage
from sklearn.model_selection import StratifiedKFold


def cross_val_score(classifier,
                    comparison_vector,
                    link_true,
                    cv=5,
                    method='fscore'):
    skfolds = StratifiedKFold(n_splits=cv)

    # binary stratification target: 1 for candidate pairs that are true links
    y = pandas.Series(0, index=comparison_vector.index)
    y.loc[link_true.index.intersection(comparison_vector.index)] = 1

    scores = []
    for train_index, test_index in skfolds.split(comparison_vector.values,
                                                 y.values):
        classifier_copy = copy.deepcopy(classifier)
        X_train_folds = comparison_vector.iloc[train_index]
        X_test_folds = comparison_vector.iloc[test_index]
        y_train_folds = X_train_folds.index.intersection(link_true.index)
        y_test_folds = X_test_folds.index.intersection(link_true.index)

        # train the classifier on the training fold
        classifier_copy.fit(X_train_folds, y_train_folds)

        # predict matches for the test fold
        y_pred = classifier_copy.predict(X_test_folds)

        if method == 'fscore':
            score = recordlinkage.fscore(y_test_folds, y_pred)
        elif method == 'precision':
            score = recordlinkage.precision(y_test_folds, y_pred)
        elif method == 'recall':
            score = recordlinkage.recall(y_test_folds, y_pred)
        elif method == 'accuracy':
            score = recordlinkage.accuracy(y_test_folds, y_pred,
                                           len(X_test_folds))
        elif method == 'specificity':
            score = recordlinkage.specificity(y_test_folds, y_pred,
                                              len(X_test_folds))
        else:
            raise ValueError('unknown method: {}'.format(method))
        scores.append(score)

    return numpy.array(scores)
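A minimal hypothetical usage sketch for cross_val_score (toy data and names invented for illustration):

import pandas as pd
import recordlinkage as rl

# Ten hypothetical candidate pairs with two binary comparison features;
# the pairs that agree on both features are the true links.
index = pd.MultiIndex.from_tuples([(i, i) for i in range(10)])
comparison_vectors = pd.DataFrame(
    {'name': [1, 1, 1, 1, 0, 0, 0, 0, 1, 0],
     'dob':  [1, 1, 1, 0, 0, 0, 1, 0, 1, 0]},
    index=index)
true_links = comparison_vectors[comparison_vectors.sum(axis=1) == 2]

scores = cross_val_score(rl.NaiveBayesClassifier(), comparison_vectors,
                         true_links, cv=2, method='fscore')
print(scores.mean())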
Example #7
def log_quality_results(logger, result, true_links, total_pairs, params=None):
    logger.info("Number of Results %d", len(result))
    logger.info("Confusion Matrix %s", str(
        recordlinkage.confusion_matrix(true_links, result, total_pairs)))
    fscore = 0
    try:
        fscore = recordlinkage.fscore(true_links, result)
        accuracy = recordlinkage.accuracy(true_links, result, total_pairs)
        precision = recordlinkage.precision(true_links, result)
        recall = recordlinkage.recall(true_links, result)
        logger.info("FScore: %.2f Accuracy : %.2f", fscore, accuracy)
        logger.info("Precision: %.2f Recall %.2f", precision, recall)
        logger.info("For params : %s", str(params))
        write_results(logger.name, fscore, accuracy, precision, recall, params)

    except ZeroDivisionError:
        logger.error("ZeroDivisionError!!")
    return fscore
Example #8
def print_experiment_evaluation(matches, description):
    precision = 0
    recall = 0
    fscore = 0

    if len(matches) > 0:
        precision = recordlinkage.precision(links_test, matches)
        recall = recordlinkage.recall(links_test, matches)
        if precision + recall > 0:
            fscore = recordlinkage.fscore(links_test, matches)

    print(f"Configuration: {description}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F-score: {fscore}")
    print(recordlinkage.confusion_matrix(links_test, matches))

    return precision, recall, fscore
Example #9
def evalution(X_data, links_true):
    # classify with a logistic regression classifier
    cl = recordlinkage.LogisticRegressionClassifier()
    cl.fit(X_data, links_true)
    # predict links with the fitted model
    links_pred = cl.predict(X_data)
    print("links_pred:{}".format(links_pred.shape))
    # print the confusion matrix
    cm = recordlinkage.confusion_matrix(links_true, links_pred, total=len(X_data))
    print("Confusion matrix:\n", cm)
    # compute the F-score for this classification
    fscore = recordlinkage.fscore(cm)
    print('fscore', fscore)
    # compute recall for this classification
    recall = recordlinkage.recall(links_true, links_pred)
    print('recall', recall)
    # compute precision for this classification
    precision = recordlinkage.precision(links_true, links_pred)
    print('precision', precision)
Example #10
def metrics(links_true, links_pred, comparison_vector):

    if len(links_pred) > 0:
        # confusion matrix
        matrix = recordlinkage.confusion_matrix(links_true, links_pred,
                                                len(comparison_vector))

        # precision
        precision = recordlinkage.precision(links_true, links_pred)

        # recall
        recall = recordlinkage.recall(links_true, links_pred)

        # F-score for this classification
        fscore = recordlinkage.fscore(links_true, links_pred)

        return matrix, precision, recall, fscore
    else:
        return 0, 0, 0, 0
Example #11
print('feature shape', features.shape)

# use the Logistic Regression Classifier
# this classifier is equivalent to the deterministic record linkage approach
intercept = -9.5
coefficients = [2.0, 3.0, 7.0, 6.0, 2.5, 5.0, 5.5]

print('Deterministic classifier')
print('intercept', intercept)
print('coefficients', coefficients)

logreg = rl.LogisticRegressionClassifier(
    coefficients=coefficients, intercept=intercept)
links = logreg.predict(features)

print(len(links), 'links/matches')

# compute the confusion matrix
conf_logreg = rl.confusion_matrix(true_links, links, len(candidate_links))
print('confusion matrix')
print(conf_logreg)

# compute the F-score for this classification
fscore = rl.fscore(conf_logreg)
print('fscore', fscore)
recall = rl.recall(true_links, links)
print('recall', recall)
precision = rl.precision(true_links, links)
print('precision', precision)
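For intuition, the deterministic rule the fixed weights encode can be checked by hand (hypothetical agreement pattern):

# Hypothetical pair that agrees only on the features weighted 3.0 and
# 7.0. With fixed coefficients, LogisticRegressionClassifier labels a
# pair a link exactly when the linear score exceeds zero (match
# probability above 0.5).
x = [0, 1, 1, 0, 0, 0, 0]
score = intercept + sum(c * xi for c, xi in zip(coefficients, x))
print(score, score > 0)  # 0.5 True -> classified as a link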
Example #12
print('Num. of many-to-many predicted links: {}'.format(len(links_pred)))

# Take the match with highest probability for each Twitter user
links_prob = classifier.prob(comparison_vectors)
links_prob = links_prob[links_prob.index.isin(links_pred.values)]
links_prob = links_prob.to_frame()
links_prob.index.names = ['index_twitter', 'index_voter']
links_prob.columns = ['match_prob']
links_prob.reset_index(inplace=True)
links_prob = links_prob.sort_values(
    'match_prob', ascending=False).drop_duplicates('index_twitter')
links_prob.set_index(['index_twitter', 'index_voter'], inplace=True)
links_pred = links_prob.index

print('Num. of many-to-one predicted links: {}'.format(len(links_pred)))

cm = recordlinkage.confusion_matrix(links_true,
                                    links_pred,
                                    total=len(df_twitter) * len(df_voter))
print('TP: {}\nFN: {}\nFP: {}\nTN: {}\n'.format(cm[0][0], cm[0][1], cm[1][0],
                                                cm[1][1]))

# compute the F-score for this classification
fscore = recordlinkage.fscore(cm)
print('F-score: {:.2f}'.format(fscore))
recall = recordlinkage.recall(links_true, links_pred)
print('Recall: {:.2f}'.format(recall))
precision = recordlinkage.precision(links_true, links_pred)
print('Precision: {:.2f}'.format(precision))

print(classifier.log_weights)
Example #13
    def test_recall(self):

        self.assertEqual(recordlinkage.recall(CONF_M1), 1.0)
        self.assertEqual(recordlinkage.recall(CONF_M5), 0.0)
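For reference, hypothetical confusion matrices consistent with these assertions (the real CONF_M1 and CONF_M5 live in the test module); as Example #12 shows, recordlinkage uses the layout [[TP, FN], [FP, TN]]:

import numpy as np
import recordlinkage

CONF_M1 = np.array([[3, 0], [1, 6]])  # FN == 0 -> recall 3 / (3 + 0) = 1.0
CONF_M5 = np.array([[0, 3], [1, 6]])  # TP == 0 -> recall 0 / (0 + 3) = 0.0
print(recordlinkage.recall(CONF_M1), recordlinkage.recall(CONF_M5))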
Example #14
cl = rl.NaiveBayesClassifier()
cl.fit(X_data, links_true)

# Print the parameters that are trained (m, u and p). Note that the estimates
# are very good.
print("p probability P(Match):", cl.p)
print("m probabilities P(x_i=1|Match):", cl.m_probs)
print("u probabilities P(x_i=1|Non-Match):", cl.u_probs)
print("log m probabilities P(x_i=1|Match):", cl.log_m_probs)
print("log u probabilities P(x_i=1|Non-Match):", cl.log_u_probs)
print("log weights of features:", cl.log_weights)
print("weights of features:", cl.weights)

# evaluate the model
links_pred = cl.predict(X_data)
print("Predicted number of links:", len(links_pred))

cm = rl.confusion_matrix(links_true, links_pred, total=len(X_data))
print("Confusion matrix:\n", cm)

# compute the F-score for this classification
fscore = rl.fscore(cm)
print('fscore', fscore)
recall = rl.recall(links_true, links_pred)
print('recall', recall)
precision = rl.precision(links_true, links_pred)
print('precision', precision)

# Predict the match probability for each pair in the dataset.
probs = cl.prob(X_data)
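In Fellegi–Sunter terms, the m and u values printed above are the probabilities of a feature agreeing given a match and given a non-match, and the per-feature weights are based on the ratio of the two, which is why well-separated m and u estimates translate into strong classification weights.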