예제 #1
0
    def test_fscore(self):
        """Check ``rl.fscore`` against the harmonic mean of precision and recall.

        The expected value is derived from ``rl.precision`` and ``rl.recall``
        and compared with ``rl.fscore`` for both call forms used here: the
        pair of link indexes, and a precomputed confusion matrix.
        """
        matrix = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))
        precision = rl.precision(LINKS_TRUE, LINKS_PRED)
        recall = rl.recall(LINKS_TRUE, LINKS_PRED)
        harmonic_mean = float(2 * precision * recall / (precision + recall))

        # Call form 1: true links and predicted links.
        score_from_links = rl.fscore(LINKS_TRUE, LINKS_PRED)
        self.assertEqual(score_from_links, harmonic_mean)

        # Call form 2: the confusion matrix built above.
        score_from_matrix = rl.fscore(matrix)
        self.assertEqual(score_from_matrix, harmonic_mean)
예제 #2
0
def metrics(links_true, links_pred, pairs):
    """Summarise a link classification as a dictionary of quality figures.

    Args:
        links_true: The known true links.
        links_pred: The links predicted by the classifier.
        pairs: The candidate pairs that were classified; only its length
            is used.

    Returns:
        A dict with the keys ``'pairs'``, ``'#duplicates'``, ``'precision'``,
        ``'recall'`` and ``'fscore'``. Every value is ``0`` when
        ``links_pred`` is empty.
    """
    # Nothing predicted: report an all-zero summary without calling rl.
    if len(links_pred) == 0:
        return {
            'pairs': 0,
            '#duplicates': 0,
            'precision': 0,
            'recall': 0,
            'fscore': 0
        }

    prec = rl.precision(links_true, links_pred)
    rec = rl.recall(links_true, links_pred)
    f_measure = rl.fscore(links_true, links_pred)

    summary = {
        'pairs': len(pairs),
        '#duplicates': len(links_pred),
        'precision': prec,
        'recall': rec,
        'fscore': f_measure
    }
    return summary
예제 #3
0
def _compute_performance(test_index, predictions, test_vectors_size):
    """Evaluate predictions against the test links and log the outcome.

    Args:
        test_index: The true links of the test set.
        predictions: The links predicted for the test set.
        test_vectors_size: Passed as ``total`` to ``rl.confusion_matrix``.

    Returns:
        A ``(precision, recall, f_score, confusion_matrix)`` tuple.
    """
    LOGGER.info('Running performance evaluation ...')

    matrix = rl.confusion_matrix(
        test_index, predictions, total=test_vectors_size)

    # The F-score is derived from the confusion matrix computed above;
    # precision and recall are computed directly from the links.
    scores = (
        rl.precision(test_index, predictions),
        rl.recall(test_index, predictions),
        rl.fscore(matrix),
    )

    LOGGER.info('Precision: %f - Recall: %f - F-score: %f', *scores)
    LOGGER.info('Confusion matrix: %s', matrix)

    return (*scores, matrix)
예제 #4
0
def log_quality_results(logger, result, true_links, total_pairs, params=None):
    """Log quality metrics for a set of predicted links and persist them.

    Args:
        logger: Logger used for the report; ``logger.name`` is also passed
            to ``write_results``.
        result: The predicted links.
        true_links: The known true links.
        total_pairs: Total number of pairs, used for the confusion matrix
            and the accuracy.
        params: Optional parameter description that is logged and handed to
            ``write_results`` together with the scores.

    Returns:
        The F-score, or ``None`` when a ``ZeroDivisionError`` prevented the
        F-score from being computed.
    """
    logger.info("Number of Results %d", len(result))
    logger.info("Confusion Matrix %s", str(
        recordlinkage.confusion_matrix(true_links, result, total_pairs)))

    # Bind the return value up front: if the first metric raises
    # ZeroDivisionError the name would otherwise be unbound at `return`,
    # turning the handled error into an UnboundLocalError.
    fscore = None
    try:
        fscore = recordlinkage.fscore(true_links, result)
        accuracy = recordlinkage.accuracy(true_links, result, total_pairs)
        precision = recordlinkage.precision(true_links, result)
        recall = recordlinkage.recall(true_links, result)
        logger.info("FScore: %.2f Accuracy : %.2f", fscore, accuracy)
        logger.info("Precision: %.2f Recall %.2f", precision, recall)
        logger.info("For params : %s", str(params))
        write_results(logger.name, fscore, accuracy, precision, recall, params)

    except ZeroDivisionError:
        logger.error("ZeroDivisionError!!")
    return fscore
def cross_val_score(classifier,
                    comparison_vector,
                    link_true,
                    cv=5,
                    method='fscore'):
    """Score a record-linkage classifier with stratified k-fold cross-validation.

    For every fold a deep copy of ``classifier`` is fitted on the training
    part of ``comparison_vector`` and evaluated on the held-out part.

    Args:
        classifier: Classifier exposing ``fit(vectors, links)`` and
            ``predict(vectors)``. It is deep-copied per fold, so the object
            passed in is never fitted.
        comparison_vector: DataFrame of comparison vectors, indexed by
            record pair.
        link_true: Object whose ``.index`` holds the true links.
        cv: Number of folds passed to ``StratifiedKFold``.
        method: One of ``'fscore'``, ``'precision'``, ``'recall'``,
            ``'accuracy'`` or ``'specificity'``.

    Returns:
        A numpy array with one score per fold.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    supported = ('fscore', 'precision', 'recall', 'accuracy', 'specificity')
    if method not in supported:
        # Fail before any work is done; previously an unknown method only
        # surfaced as an unbound ``score`` inside the first fold.
        raise ValueError(
            "Unknown method {!r}; expected one of {}".format(
                method, ', '.join(supported)))

    skfolds = StratifiedKFold(n_splits=cv)

    # Binary label per pair (1 = true link), used only for stratification.
    # Index.intersection is used instead of `&`: on pandas >= 2.0 `&` is an
    # element-wise logical AND, not a set intersection.
    y = pandas.Series(0, index=comparison_vector.index)
    y.loc[link_true.index.intersection(comparison_vector.index)] = 1

    X_train = comparison_vector.values
    y_train = y.values
    scores = []
    for train_index, test_index in skfolds.split(X_train, y_train):
        classifier_copy = copy.deepcopy(classifier)
        X_train_folds = comparison_vector.iloc[train_index]
        X_test_folds = comparison_vector.iloc[test_index]
        # True links restricted to the pairs of each part of the fold.
        y_train_folds = X_train_folds.index.intersection(link_true.index)
        y_test_folds = X_test_folds.index.intersection(link_true.index)

        # Train the classifier
        classifier_copy.fit(X_train_folds, y_train_folds)

        # Predict matches for the held-out pairs
        y_pred = classifier_copy.predict(X_test_folds)

        # Both link sets only cover the held-out pairs, so the total number
        # of pairs for accuracy/specificity is the size of the test fold,
        # not of the whole data set.
        fold_total = len(X_test_folds)

        if method == 'fscore':
            score = recordlinkage.fscore(y_test_folds, y_pred)
        elif method == 'precision':
            score = recordlinkage.precision(y_test_folds, y_pred)
        elif method == 'recall':
            score = recordlinkage.recall(y_test_folds, y_pred)
        elif method == 'accuracy':
            score = recordlinkage.accuracy(y_test_folds, y_pred, fold_total)
        else:  # 'specificity' (method was validated above)
            score = recordlinkage.specificity(y_test_folds, y_pred,
                                              fold_total)
        scores.append(score)

    return numpy.array(scores)
예제 #6
0
def diagnose_links(true_links, pred_links, total_n_links, similarity_df=None):
    """Print a diagnostic report comparing predicted links with true links.

    The report contains the confusion matrix (absolute and as percentages
    of ``total_n_links``), recall-oriented counts, precision-oriented
    counts and the F1 score. When ``similarity_df`` is given, the ROC AUC
    of its ``"similarity_score"`` column is printed as well.

    Args:
        true_links: Index of the known true links.
        pred_links: Index of the predicted links.
        total_n_links: Total number of pairs; passed as ``total`` to
            ``rl.confusion_matrix`` and used as the percentage denominator.
        similarity_df: Optional frame indexed by pair with a
            ``"similarity_score"`` column.

    Returns:
        None. All output goes to stdout.
    """
    confusion_mat = rl.confusion_matrix(true_links,
                                        pred_links,
                                        total=total_n_links)
    print("Confusion Matrix")
    print(confusion_mat)

    print("Confusion Matrix (percentages)")
    print(confusion_mat * 100 / total_n_links)
    print("")

    # Boolean mask over the true links: True where the link was predicted.
    matched_true_links = true_links.isin(pred_links)
    print("Recall Metrics:")
    print("Num of True Links: {:,}".format(len(true_links)))
    print("Num of True Links Matched: {:,}".format(matched_true_links.sum()))
    print("Num of True Links Unmatched: {:,}".format(
        len(matched_true_links) - matched_true_links.sum()))
    print("Percent of True Links Matched: {:.2f}%".format(
        matched_true_links.mean() * 100))
    print("Percent of True Links Unmatched: {:.2f}%".format(
        (1 - matched_true_links.mean()) * 100))
    print("")

    # Boolean mask over the predictions: True where the prediction is a
    # true link.
    correct_predictions = pred_links.isin(true_links)
    print("Precision Metrics:")
    print("Num of Predicted Matches: {:,}".format(len(pred_links)))
    print("Num of Correct Predicted Matches: {:,}".format(
        correct_predictions.sum()))
    print("Num of Incorrect Predicted Matches: {:,}".format(
        len(correct_predictions) - correct_predictions.sum()))
    print("Percent of Predictions which are Correct: {:.2f}%".format(
        correct_predictions.mean() * 100))
    print("Percent of Predictions which are Incorrect: {:.2f}%".format(
        (1 - correct_predictions.mean()) * 100))
    print("")

    f1_score = rl.fscore(true_links, pred_links)
    print("F1 Score is {:.2f}%".format(f1_score * 100))

    if similarity_df is not None:
        # Label every scored pair by whether it is a true link, then
        # measure how well the similarity score ranks links above
        # non-links.
        is_true_link = similarity_df.index.isin(true_links)

        auc = roc_auc_score(y_true=is_true_link,
                            y_score=similarity_df["similarity_score"])
        print("AUC of ROC of Similarity Scores is {:.2f}%".format(auc * 100))
예제 #7
0
def print_experiment_evaluation(matches, description):
    """Print and return precision, recall and F-score for one experiment.

    The predicted ``matches`` are compared with the module-level
    ``links_test``. All three scores stay ``0`` when ``matches`` is empty,
    and the F-score stays ``0`` when precision and recall sum to zero.

    Args:
        matches: The predicted links.
        description: Label printed as the configuration name.

    Returns:
        A ``(precision, recall, fscore)`` tuple.
    """
    precision = recall = fscore = 0

    has_predictions = len(matches) > 0
    if has_predictions:
        precision = recordlinkage.precision(links_test, matches)
        recall = recordlinkage.recall(links_test, matches)
        # Only ask for the F-score when it is defined.
        if recall + precision > 0:
            fscore = recordlinkage.fscore(links_test, matches)

    print(f"Configuration: {description}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F-score: {fscore}")
    print(recordlinkage.confusion_matrix(links_test, matches))

    return (precision, recall, fscore)
예제 #8
0
def evalution(X_data, links_true):
    """Fit a logistic-regression classifier and print its quality figures.

    The classifier is fitted on ``X_data`` / ``links_true`` and then used
    to predict links for the same ``X_data``. The shape of the prediction,
    the confusion matrix, F-score, recall and precision are printed.

    Args:
        X_data: Comparison vectors used for both fitting and prediction.
        links_true: The known true links.

    Returns:
        None.
    """
    # Classify with a logistic regression classifier.
    classifier = recordlinkage.LogisticRegressionClassifier()
    classifier.fit(X_data, links_true)

    # Predict with the fitted model.
    links_pred = classifier.predict(X_data)
    print("links_pred:{}".format(links_pred.shape))

    # Print the confusion matrix.
    n_pairs = len(X_data)
    matrix = recordlinkage.confusion_matrix(
        links_true, links_pred, total=n_pairs)
    print("Confusion matrix:\n", matrix)

    # F-score, derived from the confusion matrix.
    f_measure = recordlinkage.fscore(matrix)
    print('fscore', f_measure)

    # Recall of the prediction.
    rec = recordlinkage.recall(links_true, links_pred)
    print('recall', rec)

    # Precision of the prediction.
    prec = recordlinkage.precision(links_true, links_pred)
    print('precision', prec)
def metrics(links_true, links_pred, comparison_vector):
    """Return the confusion matrix, precision, recall and F-score.

    Args:
        links_true: The known true links.
        links_pred: The predicted links.
        comparison_vector: The compared pairs; only its length is used, as
            the total passed to ``recordlinkage.confusion_matrix``.

    Returns:
        A ``(matrix, precision, recall, fscore)`` tuple, or
        ``(0, 0, 0, 0)`` when ``links_pred`` is empty.
    """
    # Nothing predicted: there is nothing to evaluate.
    if len(links_pred) == 0:
        return 0, 0, 0, 0

    # confusion matrix
    conf = recordlinkage.confusion_matrix(
        links_true, links_pred, len(comparison_vector))

    # precision
    prec = recordlinkage.precision(links_true, links_pred)

    # recall
    rec = recordlinkage.recall(links_true, links_pred)

    # F-score of this classification
    f_measure = recordlinkage.fscore(links_true, links_pred)

    return conf, prec, rec, f_measure
예제 #10
0
def get_optimal_threshold(result_prob, true_pairs, min_threshold=0.1, max_threshold=1.0, step=0.05):
    """Scan thresholds and return the one with the highest F-score.

    Thresholds are scanned on a grid of hundredths from ``min_threshold``
    (inclusive) to ``max_threshold`` (exclusive). For each threshold the
    pairs of ``result_prob`` whose third element is ``<= threshold`` are
    kept and scored against ``true_pairs``. On ties the later threshold
    wins.

    Args:
        result_prob: Re-iterable collection of ``(e1, e2, d)`` triples; it
            is traversed once per threshold.
        true_pairs: Iterable of ``(e1, e2)`` tuples holding the true links.
        min_threshold: First threshold tested.
        max_threshold: Upper bound of the scan (not tested itself).
        step: Distance between thresholds; resolved to hundredths.

    Returns:
        An ``(optimal_threshold, max_fscore)`` tuple; ``(0, 0.0)`` when no
        threshold produced a score.

    Raises:
        ValueError: If ``step`` rounds to zero hundredths.
    """
    logger = get_logger('RL.OPTIMAL_THRESHOLD')

    # Build the grid with round() rather than int(): products such as
    # 0.29 * 100 evaluate to 28.999..., which int() would truncate to 28.
    start = int(round(min_threshold * 100))
    stop = int(round(max_threshold * 100))
    stride = int(round(step * 100))
    if stride == 0:
        raise ValueError(
            "step must be at least 0.01 in magnitude (got %r)" % (step,))

    # Converted once, on first use, instead of on every iteration.
    true_index = None
    max_fscore = 0.0
    optimal_threshold = 0
    for hundredths in range(start, stop, stride):
        threshold = hundredths / 100.0
        result = [(e1, e2) for (e1, e2, d) in result_prob if d <= threshold]
        if not result:
            logger.info("No results for threshold: %.2f", threshold)
            continue
        result = pd.MultiIndex.from_tuples(result)
        if true_index is None:
            true_index = pd.MultiIndex.from_tuples(true_pairs)
        try:
            fscore = recordlinkage.fscore(true_index, result)
        except ZeroDivisionError:
            logger.info("ZeroDivisionError in recordlinkage.fscore")
            continue
        logger.debug("For threshold: %f fscore: %f", threshold, fscore)
        if fscore >= max_fscore:
            max_fscore = fscore
            optimal_threshold = threshold

    logger.info("Found optimal threshold %f with max_fscore: %f", optimal_threshold, max_fscore)
    return (optimal_threshold, max_fscore)
예제 #11
0
# Load the comparison vectors, indexed by the (sku_1, sku_2) record pair.
data = pandas.read_csv('/home/jake/Documents/matching/functions/comparison_index.csv', index_col=['sku_1', 'sku_2'])

# Training sample: shuffle all pairs and keep the first 5000.
golden_pairs = data.sample(frac=1)
golden_pairs = golden_pairs[0:5000]
# Sampled pairs that are also known matches. Index.intersection replaces
# `&`, which on pandas >= 2.0 is an element-wise logical AND rather than a
# set intersection.
# NOTE(review): `matches` is defined outside this snippet.
golden_matches_index = golden_pairs.index.intersection(matches)
print(golden_matches_index)


# The same CSV is read a second time to serve as the prediction input.
data_2 = pandas.read_csv('/home/jake/Documents/matching/functions/comparison_index.csv', index_col=['sku_1', 'sku_2'])


# Fit a logistic regression classifier on the sampled pairs.
logreg = recordlinkage.LogisticRegressionClassifier()

logreg.fit(golden_pairs, golden_matches_index)
print("Intercept: ", logreg.intercept)
print("Coefficients: ", logreg.coefficients)

# Predict links for every pair and report the quality of the prediction.
result_logreg = logreg.predict(data_2)

print(len(result_logreg))
print(result_logreg)

print(recordlinkage.confusion_matrix(matches, result_logreg, len(data_2)))

print(recordlinkage.fscore(matches, result_logreg))

# Second classifier with hand-set coefficients and intercept (no fitting).
coefficients = [2, -0.08400654, -0.41432631, -0.12138752, -0.31617086, -0.42389099, -0.33185166, 0.02173983, 0]
predicter = recordlinkage.LogisticRegressionClassifier(coefficients=coefficients, intercept=-5.379865263857996)

y = predicter.predict(data_2)
예제 #12
0
# Classify the comparison vectors in `features` with a logistic regression
# classifier whose weights are set by hand, then report its quality against
# `true_links`.
# NOTE(review): `features`, `true_links` and `candidate_links` are defined
# outside this snippet.
print('feature shape', features.shape)

# use the Logistic Regression Classifier
# this classifier is equivalent to the deterministic record linkage approach
# (the intercept and coefficients below are fixed; no fit() call is made)
intercept = -9.5
coefficients = [2.0, 3.0, 7.0, 6.0, 2.5, 5.0, 5.5]

print('Deterministic classifier')
print('intercept', intercept)
print('coefficients', coefficients)

logreg = rl.LogisticRegressionClassifier(
    coefficients=coefficients, intercept=intercept)
links = logreg.predict(features)

print(len(links), 'links/matches')

# compute and print the confusion matrix; the number of candidate links is
# passed as the total number of pairs
conf_logreg = rl.confusion_matrix(true_links, links, len(candidate_links))
print('confusion matrix')
print(conf_logreg)

# compute the F-score for this classification (from the confusion matrix),
# then recall and precision directly from the links
fscore = rl.fscore(conf_logreg)
print('fscore', fscore)
recall = rl.recall(true_links, links)
print('recall', recall)
precision = rl.precision(true_links, links)
print('precision', precision)
예제 #13
0
# Reduce the predicted links to at most one link per Twitter user and
# evaluate the result against `links_true`.
# NOTE(review): `links_pred`, `classifier`, `comparison_vectors`,
# `links_true`, `df_twitter` and `df_voter` are defined outside this snippet.
print('Num. of many-to-many predicted links: {}'.format(len(links_pred)))

# Take the match with highest probability for each Twitter user
links_prob = classifier.prob(comparison_vectors)
# keep only the probabilities of pairs that were predicted as links
links_prob = links_prob[links_prob.index.isin(links_pred.values)]
links_prob = links_prob.to_frame()
links_prob.index.names = ['index_twitter', 'index_voter']
links_prob.columns = ['match_prob']
links_prob.reset_index(inplace=True)
# sorting by descending probability first means drop_duplicates retains the
# most probable row for each Twitter index
links_prob = links_prob.sort_values(
    'match_prob', ascending=False).drop_duplicates('index_twitter')
links_prob.set_index(['index_twitter', 'index_voter'], inplace=True)
# from here on links_pred holds the reduced set of links
links_pred = links_prob.index

print('Num. of many-to-one predicted links: {}'.format(len(links_pred)))

# the total number of pairs is taken as the product of both frame lengths
cm = recordlinkage.confusion_matrix(links_true,
                                    links_pred,
                                    total=len(df_twitter) * len(df_voter))
# cells are printed as [0][0]=TP, [0][1]=FN, [1][0]=FP, [1][1]=TN
print('TP: {}\nFN: {}\nFP: {}\nTN: {}\n'.format(cm[0][0], cm[0][1], cm[1][0],
                                                cm[1][1]))

# compute the F-score for this classification
fscore = recordlinkage.fscore(cm)
print('F-score: {:.2f}'.format(fscore))
recall = recordlinkage.recall(links_true, links_pred)
print('Recall: {:.2f}'.format(recall))
precision = recordlinkage.precision(links_true, links_pred)
print('Precision: {:.2f}'.format(precision))

print(classifier.log_weights)
예제 #14
0
# Fit a Naive Bayes classifier on labelled comparison vectors, print its
# trained parameters and evaluate it on the same data it was fitted on.
# NOTE(review): `X_data` and `links_true` are defined outside this snippet.
cl = rl.NaiveBayesClassifier()
cl.fit(X_data, links_true)

# Print the parameters that are trained (m, u and p). Note that the estimates
# are very good.
print("p probability P(Match):", cl.p)
print("m probabilities P(x_i=1|Match):", cl.m_probs)
print("u probabilities P(x_i=1|Non-Match):", cl.u_probs)
print("log m probabilities P(x_i=1|Match):", cl.log_m_probs)
print("log u probabilities P(x_i=1|Non-Match):", cl.log_u_probs)
print("log weights of features:", cl.log_weights)
print("weights of features:", cl.weights)

# evaluate the model (predictions are made for the training data itself)
links_pred = cl.predict(X_data)
print("Predicted number of links:", len(links_pred))

# the number of comparison vectors is used as the total number of pairs
cm = rl.confusion_matrix(links_true, links_pred, total=len(X_data))
print("Confusion matrix:\n", cm)

# compute the F-score for this classification (from the confusion matrix),
# then recall and precision directly from the links
fscore = rl.fscore(cm)
print('fscore', fscore)
recall = rl.recall(links_true, links_pred)
print('recall', recall)
precision = rl.precision(links_true, links_pred)
print('precision', precision)

# Predict the match probability for each pair in the dataset.
probs = cl.prob(X_data)
예제 #15
0
    def test_fscore(self):
        """``fscore`` is 1.0 for CONF_M1 and raises ZeroDivisionError for CONF_M5."""
        perfect_score = recordlinkage.fscore(CONF_M1)
        self.assertEqual(perfect_score, 1.0)

        with self.assertRaises(ZeroDivisionError):
            recordlinkage.fscore(CONF_M5)
import recordlinkage as rl
from recordlinkage.datasets import load_krebsregister

# Load the Krebsregister comparison vectors and true links; missing values
# are requested as 0 via the `missing_values` argument.
krebs_X, krebs_true_links = load_krebsregister(missing_values=0)

print(krebs_true_links)

# Train the classifier
# (fit_predict receives only the comparison vectors; the true links are
# used below for evaluation only)
ecm = rl.ECMClassifier(binarize=0.8)
result_ecm = ecm.fit_predict(krebs_X)

# bare expression: the value is discarded when this runs as a script
len(result_ecm)

# confusion matrix, with the number of comparison vectors as the total
print(rl.confusion_matrix(krebs_true_links, result_ecm, len(krebs_X)))

# The F-score for this classification is
print(rl.fscore(krebs_true_links, result_ecm))

print(ecm.log_weights)
예제 #17
0
#nonmatches = data_sample[data_sample.sum(axis=1) < 4]
# Index of the known matches.
# NOTE(review): `matches`, `data_sample` and `data` are defined outside this
# snippet.
match_index = matches.index

# Training set: the first 2,000,000 rows of the sample.
golden_pairs = data_sample[0:2000000]
# Training pairs that are known matches. Index.intersection replaces `&`,
# which on pandas >= 2.0 is an element-wise logical AND rather than a set
# intersection.
golden_matches_index = golden_pairs.index.intersection(match_index)

# Train the classifier
# NOTE(review): `learn` looks like the older name of the recordlinkage
# training API - confirm it exists in the installed version.
svm = rl.SVMClassifier()
svm.learn(golden_pairs, golden_matches_index)
# Predict the match status for all record pairs
result_svm = svm.predict(data)
# bare expression: the value is discarded when this runs as a script
len(result_svm)

# Confusion matrix, with the number of rows in `data` as the total.
conf_svm = rl.confusion_matrix(match_index, result_svm, len(data))
conf_svm

# The F-score for this classification is
rl.fscore(conf_svm)

m_last = pd.DataFrame(result_svm)

# Load the data used for manual review.
dfB = pd.read_csv('for_linkage_data1.csv', sep=',',encoding='utf-8')

# After inspecting m_last, look up selected records by label to review the
# matches.
dfB.loc[['LID000000000','LID000157274','LID000217044','LID000491999','LID000558481','LID000589541']]
예제 #18
0
    def test_fscore(self):
        """``fscore`` is 1.0 for CONF_M1 and raises ZeroDivisionError for CONF_M5."""
        perfect_score = recordlinkage.fscore(CONF_M1)
        self.assertEqual(perfect_score, 1.0)

        with self.assertRaises(ZeroDivisionError):
            recordlinkage.fscore(CONF_M5)