def test_accuracy(self):
        """Check ``rl.accuracy`` from raw link sets and from a confusion matrix.

        Both call styles are expected to give 4/9 for the module-level
        fixtures ``LINKS_TRUE``, ``LINKS_PRED`` and ``FULL_INDEX``.
        """
        n_pairs = len(FULL_INDEX)
        expected = 4 / 9

        # Precomputed confusion matrix for the second call style.
        confusion = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, n_pairs)

        # Accuracy straight from the true and predicted links.
        from_links = rl.accuracy(LINKS_TRUE, LINKS_PRED, n_pairs)
        assert from_links == expected

        # Accuracy derived from the confusion matrix.
        from_matrix = rl.accuracy(confusion)
        assert from_matrix == expected
Exemplo n.º 2
0
    def test_accuracy(self):
        """Accuracy must be 4/9 from both link sets and a confusion matrix."""
        total_pairs = len(FULL_INDEX)
        expected = 4 / 9

        # Confusion matrix built once, used for the matrix-based call below.
        matrix = rl.confusion_matrix(LINKS_TRUE, LINKS_PRED, total_pairs)

        # Call style 1: true links, predicted links and the number of pairs.
        from_links = rl.accuracy(LINKS_TRUE, LINKS_PRED, total_pairs)
        self.assertEqual(from_links, expected)

        # Call style 2: a precomputed confusion matrix.
        from_matrix = rl.accuracy(matrix)
        self.assertEqual(from_matrix, expected)
def cross_val_score(classifier,
                    comparison_vector,
                    link_true,
                    cv=5,
                    method='fscore'):
    """Score a record-linkage classifier with stratified k-fold validation.

    For every fold a deep copy of ``classifier`` is fitted on the training
    pairs and evaluated on the held-out pairs, so the instance passed in is
    never fitted by this function.

    Parameters
    ----------
    classifier : object
        Classifier exposing ``fit(features, links)`` and
        ``predict(features)``.
    comparison_vector : pandas.DataFrame
        Comparison vectors; the index identifies the candidate record pairs.
    link_true : pandas object
        Object whose ``index`` holds the true links.
    cv : int, optional
        Number of stratified folds (default 5).
    method : str, optional
        Metric to compute: ``'fscore'``, ``'precision'``, ``'recall'``,
        ``'accuracy'`` or ``'specificity'`` (default ``'fscore'``).

    Returns
    -------
    numpy.ndarray
        One score per fold.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported metric names.
    """
    # These metrics count true negatives and so need the number of pairs.
    metrics_needing_total = ('accuracy', 'specificity')
    supported = ('fscore', 'precision', 'recall') + metrics_needing_total
    if method not in supported:
        # Fail early: otherwise ``score`` would be unbound (or stale from a
        # previous fold) inside the loop.
        raise ValueError("Unknown method %r; expected one of: %s" %
                         (method, ', '.join(supported)))
    # Each supported name matches the recordlinkage function of that name.
    metric = getattr(recordlinkage, method)

    skfolds = StratifiedKFold(n_splits=cv)

    # Binary label per candidate pair (1 = true link), used for stratifying.
    # ``Index.intersection`` is used because the ``&`` operator no longer
    # performs a set intersection on pandas indexes.
    y = pandas.Series(0, index=comparison_vector.index)
    y.loc[link_true.index.intersection(comparison_vector.index)] = 1

    scores = []
    for train_index, test_index in skfolds.split(comparison_vector.values,
                                                 y.values):
        fold_classifier = copy.deepcopy(classifier)

        X_train_fold = comparison_vector.iloc[train_index]
        X_test_fold = comparison_vector.iloc[test_index]
        links_train = X_train_fold.index.intersection(link_true.index)
        links_test = X_test_fold.index.intersection(link_true.index)

        # Train on this fold's training pairs, predict on the held-out pairs.
        fold_classifier.fit(X_train_fold, links_train)
        links_pred = fold_classifier.predict(X_test_fold)

        if method in metrics_needing_total:
            # The total must be the size of the evaluated fold; using the
            # whole data set would count training pairs as true negatives.
            score = metric(links_test, links_pred, len(X_test_fold))
        else:
            score = metric(links_test, links_pred)
        scores.append(score)

    return numpy.array(scores)
Exemplo n.º 4
0
def log_quality_results(logger, result, true_links, total_pairs, params=None):
    """Log quality metrics for predicted links and return the F-score.

    Logs the number of results and the confusion matrix, then the F-score,
    accuracy, precision and recall, and passes those metrics to
    ``write_results`` together with ``logger.name`` and ``params``.

    Parameters
    ----------
    logger : logging.Logger
        Logger that receives the messages; its ``name`` is forwarded to
        ``write_results``.
    result :
        Predicted links.
    true_links :
        Known true links.
    total_pairs : int
        Total number of record pairs, used for the confusion matrix and
        the accuracy.
    params : optional
        Parameters of the run; logged and forwarded to ``write_results``.

    Returns
    -------
    The F-score, or ``None`` when a ``ZeroDivisionError`` occurred before it
    could be computed.
    """
    logger.info("Number of Results %d", len(result))
    logger.info("Confusion Matrix %s", str(
        recordlinkage.confusion_matrix(true_links, result, total_pairs)))

    # Bind the return value up front: if the F-score computation itself
    # raises ZeroDivisionError, the final ``return`` would otherwise fail
    # with UnboundLocalError right after the error was logged.
    fscore = None
    try:
        fscore = recordlinkage.fscore(true_links, result)
        accuracy = recordlinkage.accuracy(true_links, result, total_pairs)
        precision = recordlinkage.precision(true_links, result)
        recall = recordlinkage.recall(true_links, result)
        logger.info("FScore: %.2f Accuracy : %.2f", fscore, accuracy)
        logger.info("Precision: %.2f Recall %.2f", precision, recall)
        logger.info("For params : %s", str(params))
        write_results(logger.name, fscore, accuracy, precision, recall, params)

    except ZeroDivisionError:
        logger.error("ZeroDivisionError!!")
    return fscore
Exemplo n.º 5
0
    def test_accuracy(self):
        """Accuracy is 1.0 for ``CONF_M1`` and 0.0 for ``CONF_M5``."""
        # (confusion matrix fixture, expected accuracy)
        cases = (
            (CONF_M1, 1.0),
            (CONF_M5, 0.0),
        )
        for matrix, expected in cases:
            self.assertEqual(recordlinkage.accuracy(matrix), expected)
Exemplo n.º 6
0
def linkDB(df1, df2, type, classifier):
    """Link two restaurant data sets and merge the matched records.

    Runs indexing, comparison, unsupervised classification and evaluation,
    then returns one DataFrame holding the rows of ``df1`` and ``df2`` that
    were not matched, followed by one combined row per predicted match.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Data sets to link. ``df1`` must provide the columns
        ``0_restaurant``, ``0_neighborhood`` and ``0_addressGoogle``;
        ``df2`` the same columns with a ``1_`` prefix.
    type : str
        Indexing strategy: ``"sortedneighbourhood"``, ``"full"`` or
        ``"block"``.
    classifier : str
        Unsupervised classifier: ``"ecm"`` or ``"kmeans"``.

    Returns
    -------
    pandas.DataFrame
        Unmatched ``df1`` rows, unmatched ``df2`` rows and the merged
        matches, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``type`` or ``classifier`` is not one of the supported values.

    Notes
    -----
    Reads the annotation file ``result.json`` and prints the confusion
    matrix and accuracy measured on the first (up to) 100 candidate pairs.
    """
    # Validate up front so bad arguments fail with a clear message instead
    # of an obscure error deep inside the pipeline.
    if type not in ("sortedneighbourhood", "full", "block"):
        raise ValueError("Unknown indexing type: %r" % (type,))
    if classifier not in ("ecm", "kmeans"):
        raise ValueError("Unknown classifier: %r" % (classifier,))

    # 1 - INDEXING

    indexer = recordlinkage.Index()

    if type == "sortedneighbourhood":
        indexer.sortedneighbourhood(left_on="0_restaurant",
                                    right_on="1_restaurant")
    elif type == "full":
        indexer.full()
    else:  # "block"
        indexer.block(left_on="0_addressGoogle", right_on="1_addressGoogle")

    candidate_links = indexer.index(df1, df2)

    # Subset of candidate pairs used for the evaluation step.
    test_pairs = candidate_links[0:100]

    # Manually annotated links for the evaluation. The file must already
    # exist; it can be produced with recordlinkage.write_annotation_file, see
    # https://recordlinkage.readthedocs.io/en/latest/annotation.html
    annotations = recordlinkage.read_annotation_file('result.json')

    # 2 - COMPARISON
    comp = recordlinkage.Compare()
    comp.string('0_restaurant',
                '1_restaurant',
                threshold=0.95,
                method='jarowinkler',
                label='ristorante')
    comp.string('0_neighborhood',
                '1_neighborhood',
                method='jarowinkler',
                threshold=0.85,
                label='quartiere')
    comp.exact('0_addressGoogle', '1_addressGoogle', label='indirizzoGoogle')

    features = comp.compute(candidate_links, df1, df2)
    test_features = comp.compute(test_pairs, df1, df2)

    # 3 - CLASSIFICATION
    # https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html#unsupervised

    if classifier == "ecm":
        model = recordlinkage.ECMClassifier(init='jaro',
                                            binarize=None,
                                            max_iter=100,
                                            atol=0.0001,
                                            use_col_names=True)
    else:  # "kmeans"
        model = recordlinkage.KMeansClassifier()

    # Train on all candidate pairs and keep the predicted links.
    predicted_links = model.fit_predict(features)

    # One merged row (df1 values followed by df2 values) per predicted match.
    matches = [
        tuple(df1.loc[i]) + tuple(df2.loc[j]) for i, j in predicted_links
    ]
    # Labels of matched source rows, de-duplicated in first-seen order.
    matched_left = list(dict.fromkeys(i for i, _ in predicted_links))
    matched_right = list(dict.fromkeys(j for _, j in predicted_links))

    # Passing the columns to the constructor also works with zero matches,
    # where assigning ``.columns`` afterwards would raise.
    merged_columns = list(df1.columns) + list(df2.columns)
    matches_result = pd.DataFrame(matches, columns=merged_columns)

    # Unmatched rows of both inputs followed by the merged matches.
    # ``pd.concat`` is used because ``DataFrame.append`` no longer exists in
    # current pandas; ``ignore_index`` renumbers the rows 0..n-1.
    result = pd.concat(
        [
            df1.drop(matched_left, axis=0),
            df2.drop(matched_right, axis=0),
            matches_result,
        ],
        ignore_index=True,
    )

    # 4 - EVALUATION

    # Evaluate the model trained above. Both classifiers use ``predict`` so
    # the k-means model is not refitted on the evaluation pairs.
    test_matches = model.predict(test_features)
    # Use the real number of evaluated pairs (may be fewer than 100).
    n_test_pairs = len(test_pairs)
    cm = recordlinkage.confusion_matrix(annotations.links,
                                        test_matches,
                                        total=n_test_pairs)
    acc = recordlinkage.accuracy(annotations.links,
                                 test_matches,
                                 total=n_test_pairs)

    print(cm, acc)

    return result
Exemplo n.º 7
0
    def test_accuracy(self):
        """Check accuracy on the ``CONF_M1`` and ``CONF_M5`` fixtures."""
        # ``CONF_M1`` is expected to yield an accuracy of 1.0.
        accuracy_m1 = recordlinkage.accuracy(CONF_M1)
        self.assertEqual(accuracy_m1, 1.0)

        # ``CONF_M5`` is expected to yield an accuracy of 0.0.
        accuracy_m5 = recordlinkage.accuracy(CONF_M5)
        self.assertEqual(accuracy_m5, 0.0)