Example #1
    def test_confusion_matrix(self):

        result_len = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED,
                                         len(FULL_INDEX))
        result_full_index = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED,
                                                FULL_INDEX)
        expected = numpy.array([[1, 2], [3, 3]])

        numpy.testing.assert_array_equal(result_len, expected)
        numpy.testing.assert_array_equal(result_full_index, expected)
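In recordlinkage the confusion matrix is laid out as [[TP, FN], [FP, TN]], and the total number of candidate pairs (len(FULL_INDEX), or the full index itself) is only needed to derive the TN cell. A minimal sketch of a fixture that reproduces the expected [[1, 2], [3, 3]] above; the index values here are assumptions, not the library's own test data:

import pandas
import recordlinkage as rl

FULL_INDEX = pandas.MultiIndex.from_product(
    [[1, 2, 3], ["a", "b", "c"]])    # 9 candidate pairs in total
LINKS_TRUE = FULL_INDEX[:3]          # 3 true links
LINKS_PRED = FULL_INDEX[2:6]         # 4 predicted links, 1 of them correct

cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))
print(cm)    # [[1 2], [3 3]]: TP=1, FN=2, FP=3, TN=3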
Example #2
    def test_confusion_matrix(self):

        conf_m = recordlinkage.confusion_matrix(
            gold_matches_index, matches_index, len(index))
        conf_m_2 = recordlinkage.confusion_matrix(
            gold_matches_index, matches_index, index)

        self.assertEqual(numpy.sum(conf_m), len(pairs))
        self.assertEqual(numpy.sum(conf_m[0, :]), len(gold_matches_index))
        self.assertEqual(numpy.sum(conf_m[:, 0]), len(matches_index))

        numpy.testing.assert_array_equal(conf_m, conf_m_2)
Example #3
    def test_confusion_matrix(self):

        conf_m = recordlinkage.confusion_matrix(gold_matches_index,
                                                matches_index, len(index))
        conf_m_2 = recordlinkage.confusion_matrix(gold_matches_index,
                                                  matches_index, index)

        self.assertEqual(numpy.sum(conf_m), len(pairs))
        self.assertEqual(numpy.sum(conf_m[0, :]), len(gold_matches_index))
        self.assertEqual(numpy.sum(conf_m[:, 0]), len(matches_index))

        numpy.testing.assert_array_equal(conf_m, conf_m_2)
Example #4
    def test_specificity(self):

        # confusion matrix
        cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))

        assert rl.specificity(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) == 1 / 2
        assert rl.specificity(cm) == 1 / 2
Example #5
    def test_recall(self):

        # confusion matrix
        cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED)

        self.assertEqual(rl.recall(LINKS_TRUE, LINKS_PRED), 1 / 3)
        self.assertEqual(rl.recall(cm), 1 / 3)
Example #6
    def test_precision(self):

        # confusion matrix
        cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))

        self.assertEqual(rl.precision(LINKS_TRUE, LINKS_PRED), 1 / 4)
        self.assertEqual(rl.precision(cm), 1 / 4)
Example #7
    def test_precision(self):

        # confusion matrix
        cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))

        assert rl.precision(LINKS_TRUE, LINKS_PRED) == 1 / 4
        assert rl.precision(cm) == 1 / 4
Example #8
    def test_recall(self):

        # confusion matrix
        cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED)

        assert rl.recall(LINKS_TRUE, LINKS_PRED) == 1 / 3
        assert rl.recall(cm) == 1 / 3
Example #9
    def test_accuracy(self):

        # confusion matrix
        cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))

        assert rl.accuracy(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)) == 4 / 9
        assert rl.accuracy(cm) == 4 / 9
Example #10
    def test_accuracy(self):

        # confusion matrix
        cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))

        self.assertEqual(rl.accuracy(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)),
                         4 / 9)
        self.assertEqual(rl.accuracy(cm), 4 / 9)
Example #11
    def test_specificity(self):

        # confusion matrix
        cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))

        self.assertEqual(
            rl.specificity(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX)), 1 / 2)
        self.assertEqual(rl.specificity(cm), 1 / 2)
Example #12
    def test_fscore(self):

        # confusion matrix
        cm = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, len(FULL_INDEX))
        prec = rl.precision(LINKS_TRUE, LINKS_PRED)
        rec = rl.recall(LINKS_TRUE, LINKS_PRED)
        expected = float(2 * prec * rec / (prec + rec))

        self.assertEqual(rl.fscore(LINKS_TRUE, LINKS_PRED), expected)
        self.assertEqual(rl.fscore(cm), expected)
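All of the expected values in these tests can be recovered by hand from the confusion-matrix cells; a short sketch of the arithmetic, assuming the [[TP, FN], [FP, TN]] layout and the cm computed in the test above:

TP, FN, FP, TN = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
precision = TP / (TP + FP)                   # 1 / 4
recall = TP / (TP + FN)                      # 1 / 3
accuracy = (TP + TN) / cm.sum()              # 4 / 9
specificity = TN / (TN + FP)                 # 1 / 2
fscore = 2 * precision * recall / (precision + recall)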
Example #13
def _compute_performance(test_index, predictions, test_vectors_size):
    LOGGER.info('Running performance evaluation ...')

    confusion_matrix = rl.confusion_matrix(test_index,
                                           predictions,
                                           total=test_vectors_size)
    precision = rl.precision(test_index, predictions)
    recall = rl.recall(test_index, predictions)
    f_score = rl.fscore(confusion_matrix)

    LOGGER.info('Precision: %f - Recall: %f - F-score: %f', precision, recall,
                f_score)
    LOGGER.info('Confusion matrix: %s', confusion_matrix)

    return precision, recall, f_score, confusion_matrix
Example #14
def log_quality_results(logger, result, true_links, total_pairs, params=None):
    logger.info("Number of Results %d", len(result))
    logger.info("Confusion Matrix %s", str(
        recordlinkage.confusion_matrix(true_links, result, total_pairs)))
    fscore = None  # guard the return below if a metric raises
    try:
        fscore = recordlinkage.fscore(true_links, result)
        accuracy = recordlinkage.accuracy(true_links, result, total_pairs)
        precision = recordlinkage.precision(true_links, result)
        recall = recordlinkage.recall(true_links, result)
        logger.info("FScore: %.2f Accuracy : %.2f", fscore, accuracy)
        logger.info("Precision: %.2f Recall %.2f", precision, recall)
        logger.info("For params : %s", str(params))
        write_results(logger.name, fscore, accuracy, precision, recall, params)

    except ZeroDivisionError:
        logger.error("ZeroDivisionError!!")
    return fscore
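A hypothetical invocation; the logger setup and the links_true/links_pred/candidate_links fixtures are assumptions, not part of the snippet:

import logging
import recordlinkage

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("linkage-eval")

# links_true, links_pred and candidate_links are assumed to exist
fscore = log_quality_results(logger, links_pred, links_true,
                             total_pairs=len(candidate_links))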
Example #15
import recordlinkage as rl
from sklearn.metrics import roc_auc_score

def diagnose_links(true_links, pred_links, total_n_links, similarity_df=None):
    confusion_mat = rl.confusion_matrix(true_links,
                                        pred_links,
                                        total=total_n_links)
    print("Confusion Matrix")
    print(confusion_mat)

    print("Comfusion Matrix (percentages)")
    print(confusion_mat * 100 / total_n_links)
    print("")

    matched_true_links = true_links.isin(pred_links)
    print("Recall Metrics:")
    print("Num of True Links: {:,}".format(len(true_links)))
    print("Num of True Links Matched: {:,}".format(matched_true_links.sum()))
    print("Num of True Links Unmatched: {:,}".format(
        len(matched_true_links) - matched_true_links.sum()))
    print("Percent of True Links Matched: {:.2f}%".format(
        matched_true_links.mean() * 100))
    print("Percent of True Links Unmatched: {:.2f}%".format(
        (1 - matched_true_links.mean()) * 100))
    print("")

    correct_predictions = pred_links.isin(true_links)
    print("Precision Metrics:")
    print("Num of Predicted Matches: {:,}".format(len(pred_links)))
    print("Num of Correct Predicted Matches: {:,}".format(
        correct_predictions.sum()))
    print("Num of Incorrect Predicted Matches: {:,}".format(
        len(correct_predictions) - correct_predictions.sum()))
    print("Percent of Predictions which are Correct: {:.2f}%".format(
        correct_predictions.mean() * 100))
    print("Percent of Predictions which are Incorrect: {:.2f}%".format(
        (1 - correct_predictions.mean()) * 100))
    print("")

    f1_score = rl.fscore(true_links, pred_links)
    print("F1 Score is {:.2f}%".format(f1_score * 100))

    if similarity_df is not None:
        is_true_link = similarity_df.index.isin(true_links)

        auc = roc_auc_score(y_true=is_true_link,
                            y_score=similarity_df["similarity_score"])
        print("AUC of ROC of Similarity Scores is {:.2f}%".format(auc * 100))
Example #16
def print_experiment_evaluation(matches, description):
    precision = 0
    recall = 0
    fscore = 0

    if len(matches) > 0:
        precision = recordlinkage.precision(links_test, matches)
        recall = recordlinkage.recall(links_test, matches)
        if recall + precision > 0:
            fscore = recordlinkage.fscore(links_test, matches)

    print(f"Configuration: {description}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F-score: {fscore}")
    print(recordlinkage.confusion_matrix(links_test, matches))

    return precision, recall, fscore
Example #17
def run_metrics(app_results, option_classifiers, is_gold_standard):
    logging.info("Running metrics..")

    df_a = app_results['data']

    features = app_results['comparison_vector']
    index_name = app_results['index_name']
    if is_gold_standard:
        index_name_1 = app_results['index_name_1']
        index_name_2 = app_results['index_name_2']
        df_true_links = app_results['df_true_links']

    results_dict = {}

    # For each classifier calculate metrics in results_dict
    for i, select_classifier in enumerate(option_classifiers):
        matches = app_results[select_classifier]['matches']
        decision_proba = app_results[select_classifier]['decision_proba']

        results_dict[select_classifier] = {}
        results_dict[select_classifier]['matches'] = matches
        m = matches.to_frame(index=False).columns.to_list()

        results_dict[select_classifier]['unique'] = get_unique(
            df_a, matches, index_name, m[0], m[1])
        results_dict[select_classifier]['metrics'] = {}
        results_dict[select_classifier]['metrics']['nunique'] = results_dict[
            select_classifier]['unique']['nunique']

        # FIXME: can we separate metrics calculation from UI rendering?
        if is_gold_standard:
            results_dict[select_classifier]['unique'] = get_unique(
                df_a, matches, index_name, index_name_1, index_name_2)
            results_dict[select_classifier]['metrics'] = metrics(
                df_true_links, matches, features)
            results_dict[select_classifier]['matrix'] = rl.confusion_matrix(
                df_true_links, matches, len(features))
            results_dict[select_classifier]['roc'] = show_roc_curve(
                df_true_links, decision_proba)
            results_dict[select_classifier][
                'pr'] = show_precision_recall_curve(df_true_links,
                                                    decision_proba)

    return results_dict
Example #18
def evalution(X_data, links_true):
    # classify with a logistic regression classifier
    cl = recordlinkage.LogisticRegressionClassifier()
    cl.fit(X_data, links_true)
    # predict with the fitted model
    links_pred = cl.predict(X_data)
    print("links_pred:{}".format(links_pred.shape))
    # print the confusion matrix
    cm = recordlinkage.confusion_matrix(links_true, links_pred, total=len(X_data))
    print("Confusion matrix:\n", cm)
    # compute the F-score for this classification
    fscore = recordlinkage.fscore(cm)
    print('fscore', fscore)
    # compute recall for this classification
    recall = recordlinkage.recall(links_true, links_pred)
    print('recall', recall)
    # compute precision for this classification
    precision = recordlinkage.precision(links_true, links_pred)
    print('precision', precision)
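Note the snippet fits and scores on the same X_data, so the reported metrics are optimistic. A hedged variant that evaluates on a held-out portion; the split fraction and helper name are assumptions:

def evalution_holdout(X_data, links_true, train_frac=0.75):
    # split the comparison vectors into train / test portions
    n_train = int(len(X_data) * train_frac)
    X_train, X_test = X_data.iloc[:n_train], X_data.iloc[n_train:]
    links_train = links_true.intersection(X_train.index)
    links_test = links_true.intersection(X_test.index)

    cl = recordlinkage.LogisticRegressionClassifier()
    cl.fit(X_train, links_train)
    links_pred = cl.predict(X_test)

    cm = recordlinkage.confusion_matrix(links_test, links_pred,
                                        total=len(X_test))
    print("Held-out confusion matrix:\n", cm)
    print("Held-out F-score:", recordlinkage.fscore(cm))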
Example #19
def metrics(links_true, links_pred, comparison_vector):

    if len(links_pred) > 0:
        # confusion matrix
        matrix = recordlinkage.confusion_matrix(links_true, links_pred,
                                                len(comparison_vector))

        # precision
        precision = recordlinkage.precision(links_true, links_pred)

        # recall
        recall = recordlinkage.recall(links_true, links_pred)

        # The F-score for this classification is
        fscore = recordlinkage.fscore(links_true, links_pred)

        return matrix, precision, recall, fscore
    else:
        return 0, 0, 0, 0
Example #20
# Take the match with highest probability for each Twitter user
links_prob = classifier.prob(comparison_vectors)
links_prob = links_prob[links_prob.index.isin(links_pred.values)]
links_prob = links_prob.to_frame()
links_prob.index.names = ['index_twitter', 'index_voter']
links_prob.columns = ['match_prob']
links_prob.reset_index(inplace=True)
links_prob = links_prob.sort_values(
    'match_prob', ascending=False).drop_duplicates('index_twitter')
links_prob.set_index(['index_twitter', 'index_voter'], inplace=True)
links_pred = links_prob.index

print('Num. of many-to-one predicted links: {}'.format(len(links_pred)))

cm = recordlinkage.confusion_matrix(links_true,
                                    links_pred,
                                    total=num_all_comparisons)
print('TP: {}\nFN: {}\nFP: {}\nTN: {}\n'.format(cm[0][0], cm[0][1], cm[1][0],
                                                cm[1][1]))

# compute the F-score for this classification
fscore = recordlinkage.fscore(cm)
print('F-score: {:.2f}'.format(fscore))
recall = recordlinkage.recall(links_true, links_pred)
print('Recall: {:.2f}'.format(recall))
precision = recordlinkage.precision(links_true, links_pred)
print('Precision: {:.2f}'.format(precision))

print(classifier.log_weights)
Example #21
data = pandas.read_csv('/home/jake/Documents/matching/functions/comparison_index.csv', index_col=['sku_1', 'sku_2'])

golden_pairs = data.sample(frac=1)
golden_pairs = golden_pairs[0:5000]
golden_matches_index = golden_pairs.index.intersection(matches)  # "&" on Index is deprecated
print(golden_matches_index)


data_2 = pandas.read_csv('/home/jake/Documents/matching/functions/comparison_index.csv', index_col=['sku_1', 'sku_2'])


logreg = recordlinkage.LogisticRegressionClassifier()

logreg.fit(golden_pairs, golden_matches_index)
print ("Intercept: ", logreg.intercept)
print ("Coefficients: ", logreg.coefficients)

result_logreg = logreg.predict(data_2)

print(len(result_logreg))
print(result_logreg)

print(recordlinkage.confusion_matrix(matches, result_logreg, len(data_2)))

print(recordlinkage.fscore(matches, result_logreg))

coefficients = [2, -0.08400654, -0.41432631, -0.12138752, -0.31617086, -0.42389099, -0.33185166, 0.02173983, 0]
predicter = recordlinkage.LogisticRegressionClassifier(coefficients=coefficients, intercept=-5.379865263857996)

y = predicter.predict(data_2)
Example #22
print('feature shape', features.shape)

# use the Logistic Regression Classifier
# this classifier is equivalent to the deterministic record linkage approach
intercept = -9.5
coefficients = [2.0, 3.0, 7.0, 6.0, 2.5, 5.0, 5.5]

print('Deterministic classifier')
print('intercept', intercept)
print('coefficients', coefficients)

logreg = rl.LogisticRegressionClassifier(
    coefficients=coefficients, intercept=intercept)
links = logreg.predict(features)

print(len(links), 'links/matches')

# return the confusion matrix
conf_logreg = rl.confusion_matrix(true_links, links, len(candidate_links))
print('confusion matrix')
print(conf_logreg)

# compute the F-score for this classification
fscore = rl.fscore(conf_logreg)
print('fscore', fscore)
recall = rl.recall(true_links, links)
print('recall', recall)
precision = rl.precision(true_links, links)
print('precision', precision)
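The "deterministic" reading of this classifier: a pair is predicted as a match exactly when intercept + sum(w_i * x_i) >= 0, i.e. when the weighted comparison score reaches 9.5. A tiny check of that arithmetic (the comparison vector is made up):

import numpy as np

w = np.array([2.0, 3.0, 7.0, 6.0, 2.5, 5.0, 5.5])
x = np.array([1, 1, 0, 0, 1, 0, 1])      # hypothetical comparison vector
score = w @ x                             # 13.0
print(score >= 9.5)                       # True -> classified as a match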
Example #23
print('Num. of many-to-many predicted links: {}'.format(len(links_pred)))

# Take the match with highest probability for each Twitter user
links_prob = classifier.prob(comparison_vectors)
links_prob = links_prob[links_prob.index.isin(links_pred.values)]
links_prob = links_prob.to_frame()
links_prob.index.names = ['index_twitter', 'index_voter']
links_prob.columns = ['match_prob']
links_prob.reset_index(inplace=True)
links_prob = links_prob.sort_values(
    'match_prob', ascending=False).drop_duplicates('index_twitter')
links_prob.set_index(['index_twitter', 'index_voter'], inplace=True)
links_pred = links_prob.index

print('Num. of many-to-one predicted links: {}'.format(len(links_pred)))

cm = recordlinkage.confusion_matrix(links_true,
                                    links_pred,
                                    total=len(df_twitter) * len(df_voter))
print('TP: {}\nFN: {}\nFP: {}\nTN: {}\n'.format(cm[0][0], cm[0][1], cm[1][0],
                                                cm[1][1]))

# compute the F-score for this classification
fscore = recordlinkage.fscore(cm)
print('F-score: {:.2f}'.format(fscore))
recall = recordlinkage.recall(links_true, links_pred)
print('Recall: {:.2f}'.format(recall))
precision = recordlinkage.precision(links_true, links_pred)
print('Precision: {:.2f}'.format(precision))

print(classifier.log_weights)
Example #24
import recordlinkage as rl
from recordlinkage.datasets import load_krebsregister

krebs_X, krebs_true_links = load_krebsregister(missing_values=0)

print(krebs_true_links)

# Train the classifier
ecm = rl.ECMClassifier(binarize=0.8)
result_ecm = ecm.fit_predict(krebs_X)

len(result_ecm)

print(rl.confusion_matrix(krebs_true_links, result_ecm, len(krebs_X)))

# The F-score for this classification is
print(rl.fscore(krebs_true_links, result_ecm))

print(ecm.log_weights)
Example #25
#nonmatches = data_sample[data_sample.sum(axis=1) < 4]
#creating match index
match_index = matches.index

#creating a training dataset
golden_pairs = data_sample[0:2000000]
golden_matches_index = golden_pairs.index.intersection(match_index)  # "&" on Index is deprecated

# Train the classifier
svm = rl.SVMClassifier()
svm.fit(golden_pairs, golden_matches_index)  # learn() is the deprecated alias for fit()
# Predict the match status for all record pairs
result_svm = svm.predict(data)
len(result_svm)

#creating a confusion matrix
conf_svm = rl.confusion_matrix(match_index, result_svm, len(data))
conf_svm

# The F-score for this classification is
rl.fscore(conf_svm)

m_last = pd.DataFrame(result_svm)

#loading data for review
dfB = pd.read_csv('for_linkage_data1.csv', sep=',',encoding='utf-8')

#after review the dataframe m_last
#to review the matches
dfB.loc[['LID000000000','LID000157274','LID000217044','LID000491999','LID000558481','LID000589541']]
Example #26
def simple_evaluation(source, links_pred, links_true, links_candidates):
    return {
        'rratio': reduction_ratio(links_pred, source),
        'cmatrix': confusion_matrix(links_true, links_pred, links_candidates)
    }
Example #27
def get_matches(locu_train_path, foursquare_train_path, matches_train_path,
                locu_test_path, foursquare_test_path):
    four_train = pd.read_json(foursquare_train_path)
    locu_train = pd.read_json(locu_train_path)

    four_test = pd.read_json(foursquare_test_path)
    locu_test = pd.read_json(locu_test_path)

    matches_train = pd.read_csv(matches_train_path)

    # visualize missing data
    #     msno.matrix(four_train)
    #     msno.matrix(locu_train)
    #     msno.matrix(four_test)
    #     msno.matrix(locu_test)

    locu_train, four_train = preprocess(locu_train, four_train)
    locu_test, four_test = preprocess(locu_test, four_test)
    matches_train = preprocess_matches(matches_train)

    candidate_pairs = index_pairs(locu_train, four_train)
    test_candidate_pairs = index_pairs(locu_test, four_test)
    #     print (len(locu_train), len(four_train), len(candidate_pairs))
    #     print (len(locu_test), len(four_test), len(test_candidate_pairs))

    features = compare_strings(locu_train, four_train, candidate_pairs)
    test_features = compare_strings(locu_test, four_test, test_candidate_pairs)

    #     features = features.loc[features['street_address'] > .1]
    #     features = features.loc[features['name'] > .1]

    train_pairs, train_matches_index, all_matches_index = traintestsplit(
        features, matches_train)

    # Train Logistic Regression classifier
    logreg = recordlinkage.LogisticRegressionClassifier()
    logreg.fit(train_pairs, train_matches_index)  # learn() is the deprecated alias for fit()
    #     print ("LogReg Intercept: ", logreg.intercept)
    #     print ("LogReg Coefficients: ", logreg.coefficients)

    # Train SVM classifier
    svm = recordlinkage.SVMClassifier()
    svm.fit(train_pairs, train_matches_index)

    # Predict on training data with both classifiers
    svm_results_index = predict(features, svm)
    logreg_results_index = predict(features, logreg)

    # To view pairs
    #     features.index = features.index.rename(['locu_id', 'foursquare_id'])
    #     train_matches = features.loc[svm_results_index]
    #     train_matches

    # Training results
    svm_confn_matrix = recordlinkage.confusion_matrix(all_matches_index,
                                                      svm_results_index,
                                                      len(features))
    #     print("SVM Confusion Matrix: ", svm_confn_matrix)
    #     print("SVM Precision: ", recordlinkage.precision(svm_confn_matrix))
    #     print("SVM Recall:    ", recordlinkage.recall(svm_confn_matrix))
    #     print("SVM Accuracy:  ", recordlinkage.accuracy(svm_confn_matrix))
    #     print("SVM F1 Score:  ", recordlinkage.fscore(svm_confn_matrix))

    logreg_confn_matrix = recordlinkage.confusion_matrix(
        all_matches_index, logreg_results_index, len(features))
    #     print("Logistic Regression Confusion Matrix: ", logreg_confn_matrix)
    #     print("Logistic Regression Precision: ", recordlinkage.precision(logreg_confn_matrix))
    #     print("Logistic Regression Recall:    ", recordlinkage.recall(logreg_confn_matrix))
    #     print("Logistic Regression Accuracy:  ", recordlinkage.accuracy(logreg_confn_matrix))
    #     print("Logistic Regression F1 Score:  ", recordlinkage.fscore(logreg_confn_matrix))

    # Predict on test data with SVM
    test_results_index = predict(test_features, svm)

    # Format and write to CSV
    test_features.index = test_features.index.rename(
        ['locu_id', 'foursquare_id'])
    test_match_pairs = test_features.loc[test_results_index]
    matches_test = test_match_pairs.drop(test_match_pairs.columns[::], axis=1)  # keep only the index pairs
    #     matches_test
    matches_test.to_csv('matches_test.csv')

    # create a dataframe for both fourquare and locu of pairs that get matched
    test_tuples = list(matches_test.index)
    test_locu_index = [i[0] for i in test_tuples]
    test_four_index = [i[1] for i in test_tuples]
    test_locu_matches = locu_test.loc[test_locu_index]
    test_four_matches = four_test.loc[test_four_index]

    # for viewing full match dataset
    temp = matches_test.reset_index().join(test_four_matches,
                                           on=['foursquare_id'])
    test_match_pairs = temp.join(test_locu_matches,
                                 on=['locu_id'],
                                 lsuffix='_foursquare',
                                 rsuffix='_locu').set_index(
                                     matches_test.index.names)

    cols = np.array(test_match_pairs.columns.tolist())
    order = [0, 7, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13]
    cols = list(cols[order])
    test_matches_reordered = test_match_pairs[cols]
    #     display(test_matches_reordered)
    #     print("Successfully wrote results to matches_test.csv")
    return
Example #28
cl = rl.NaiveBayesClassifier()
cl.fit(X_data, links_true)

# Print the parameters that are trained (m, u and p). Note that the estimates
# are very good.
print("p probability P(Match):", cl.p)
print("m probabilities P(x_i=1|Match):", cl.m_probs)
print("u probabilities P(x_i=1|Non-Match):", cl.u_probs)
print("log m probabilities P(x_i=1|Match):", cl.log_m_probs)
print("log u probabilities P(x_i=1|Non-Match):", cl.log_u_probs)
print("log weights of features:", cl.log_weights)
print("weights of features:", cl.weights)

# evaluate the model
links_pred = cl.predict(X_data)
print("Predicted number of links:", len(links_pred))

cm = rl.confusion_matrix(links_true, links_pred, total=len(X_data))
print("Confusion matrix:\n", cm)

# compute the F-score for this classification
fscore = rl.fscore(cm)
print('fscore', fscore)
recall = rl.recall(links_true, links_pred)
print('recall', recall)
precision = rl.precision(links_true, links_pred)
print('precision', precision)

# Predict the match probability for each pair in the dataset.
probs = cl.prob(X_data)
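The probabilities returned by prob() can also be thresholded by hand when a stricter cut-off than predict() uses is wanted; a small sketch (the 0.99 threshold is an arbitrary assumption):

# keep only the pairs the model is very confident about
links_pred_strict = probs[probs > 0.99].index
print("Predicted links at p > 0.99:", len(links_pred_strict))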
Example #29
def linkDB(df1, df2, type, classifier):

    # 1 - INDEXING

    indexer = recordlinkage.Index()

    if type == "sortedneighbourhood":
        indexer.sortedneighbourhood(left_on="0_restaurant",
                                    right_on="1_restaurant")
    elif type == "full":
        indexer.full()
    elif type == "block":
        indexer.block(left_on="0_addressGoogle", right_on="1_addressGoogle")

    candidate_links = indexer.index(df1, df2)

    test_pairs = candidate_links[0:100]

    #https://recordlinkage.readthedocs.io/en/latest/annotation.html
    """
	df1.columns = df1.columns.str.replace(r'0_', '')
	df2.columns = df2.columns.str.replace(r'1_', '')
	
	recordlinkage.write_annotation_file(
		"check_matches.json", candidate_links[0:100], df1, df2, dataset_a_name="firstDF", dataset_b_name="secondDF")
	
	df1 = df1.add_prefix('0_')
	df2 = df2.add_prefix('1_')
	"""

    annotations = recordlinkage.read_annotation_file('result.json')

    # 2 - COMPARISON
    comp = recordlinkage.Compare()
    comp.string('0_restaurant',
                '1_restaurant',
                threshold=0.95,
                method='jarowinkler',
                label='ristorante')
    comp.string('0_neighborhood',
                '1_neighborhood',
                method='jarowinkler',
                threshold=0.85,
                label='quartiere')
    comp.exact('0_addressGoogle', '1_addressGoogle', label='indirizzoGoogle')

    features = comp.compute(candidate_links, df1, df2)
    test_features = comp.compute(test_pairs, df1, df2)

    # 3 - CLASSIFICATION
    # https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html#unsupervised

    matches = []
    drop1 = []
    drop2 = []

    if classifier == "ecm":
        ecm = recordlinkage.ECMClassifier(init='jaro',
                                          binarize=None,
                                          max_iter=100,
                                          atol=0.0001,
                                          use_col_names=True)
        e_matches = ecm.fit_predict(features)  # fit and predict in one step
        for i, j in e_matches:
            if i not in drop1:
                drop1.append(i)
            if j not in drop2:
                drop2.append(j)
            record_1 = df1.loc[i]
            record_2 = df2.loc[j]
            record = tuple(record_1) + tuple(record_2)
            matches.append(record)
    elif classifier == "kmeans":
        kmeans = recordlinkage.KMeansClassifier()
        k_matches = kmeans.fit_predict(features)  # fit and predict in one step
        for i, j in k_matches:
            if i not in drop1:
                drop1.append(i)
            if j not in drop2:
                drop2.append(j)
            record_1 = df1.loc[i]
            record_2 = df2.loc[j]
            record = tuple(record_1) + tuple(record_2)
            matches.append(record)

    head = tuple(df1.columns) + tuple(df2.columns)
    matches_result = pd.DataFrame(matches)
    matches_result.columns = head

    df1t = df1.drop(drop1, axis=0)
    df2t = df2.drop(drop2, axis=0)
    result = pd.concat([df1t, df2t, matches_result])  # DataFrame.append was removed in pandas 2.0

    result.reset_index(drop=True, inplace=True)

    # 4 - EVALUATION

    if classifier == "ecm":
        test_matches = ecm.predict(test_features)
        cm = recordlinkage.confusion_matrix(annotations.links,
                                            test_matches,
                                            total=100)
        acc = recordlinkage.accuracy(annotations.links,
                                     test_matches,
                                     total=100)
    elif classifier == "kmeans":
        test_matches = kmeans.fit_predict(test_features)
        cm = recordlinkage.confusion_matrix(annotations.links,
                                            test_matches,
                                            total=100)
        acc = recordlinkage.accuracy(annotations.links,
                                     test_matches,
                                     total=100)

    print(cm, acc)

    return result