예제 #1
0
    def test_ecm_init_jaro_1value(self):
        """Fit ECM with ``init='jaro'`` on data whose first feature is constant.

        The first entry of both the m and u vectors is 1.0.  The test expects
        that no parameter is stored for value 0 of 'c_1' (a ``KeyError``) and
        that the remaining estimates are close to the simulated values.
        """
        match_probs = np.array([1.0, 0.85, .85, .81, .85, .81])
        nonmatch_probs = np.array([1.0, .10, .50, .23, .30, 0.13])

        # Simulate the training vectors; the returned true links are unused.
        X_train, _ = binary_vectors(
            1000,
            500,
            m=match_probs,
            u=nonmatch_probs,
            random_state=535,
            return_links=True,
        )

        classifier = rl.ECMClassifier(init='jaro')
        classifier.fit(X_train)
        classifier.predict(X_train)

        # Looking up value 0 of the first feature must fail.
        with pytest.raises(KeyError):
            classifier.m_probs['c_1'][0]

        m_est = classifier.m_probs
        assert math.isclose(m_est['c_1'][1], 1.0, abs_tol=0.01)
        assert math.isclose(m_est['c_2'][1], 0.85, abs_tol=0.08)

        u_est = classifier.u_probs
        assert math.isclose(u_est['c_1'][1], 1.0, abs_tol=0.01)
        assert math.isclose(u_est['c_2'][1], 0.1, abs_tol=0.05)

        assert math.isclose(classifier.p, 0.5, abs_tol=0.05)
예제 #2
0
def em_classifier(features):
    """Classify *features* with an ECM classifier binarized at 0.85.

    Returns:
        A pair ``(matches, scores)``: the result of ``fit_predict`` on
        *features*, and a DataFrame built from ``prob(features)`` whose
        columns are renamed to ``['score']``.
    """
    classifier = rl.ECMClassifier(binarize=0.85)
    predicted_links = classifier.fit_predict(features)

    score_frame = pd.DataFrame(classifier.prob(features))
    score_frame.columns = ['score']

    return predicted_links, score_frame
예제 #3
0
    def test_binarize_input(self):
        """Smoke test: ECM with ``binarize=True`` accepts non-binary input.

        Simulated binary vectors are scaled by random factors so that the
        values are no longer restricted to 0 and 1.  The test only checks
        that fitting and predicting run without raising.
        """
        match_probs = np.array([1, .81, .85, .81, .85, .81])
        nonmatch_probs = np.array([1, .23, .50, .23, .30, 0.13])
        simulation = dict(m=match_probs,
                          u=nonmatch_probs,
                          random_state=535,
                          return_links=True)

        # Create the train dataset and scale it by random factors.
        X_train, _ = binary_vectors(1000, 500, **simulation)
        X_train = X_train * np.random.rand(*X_train.shape)

        # Create the test dataset the same way.
        X_test, _ = binary_vectors(1000, 500, **simulation)
        X_test = X_test * np.random.rand(*X_test.shape)

        classifier = rl.ECMClassifier(binarize=True)
        classifier.fit(X_train)
        classifier.predict(X_test)
예제 #4
0
    def test_ecm_atol_none(self):
        """Fit ECM with ``atol=None`` and check the u estimates of 'c_1'.

        The first u probability used for the simulation is 0; the test
        expects the estimate for value 1 of 'c_1' to be about 0 and the
        estimate for value 0 to be about 1.
        """
        match_probs = np.array([0.95, .81, .85, .81, .85, .81])
        nonmatch_probs = np.array([0, .23, .50, .23, .30, 0.13])
        simulation = dict(m=match_probs,
                          u=nonmatch_probs,
                          random_state=535,
                          return_links=True)

        # Create the train dataset.
        X_train, _ = binary_vectors(10000, 500, **simulation)

        # Create the test dataset.
        X_test, _ = binary_vectors(1000, 500, **simulation)

        classifier = rl.ECMClassifier(atol=None)
        classifier.fit(X_train)
        classifier.predict(X_test)

        u_first = classifier.u_probs['c_1']
        assert math.isclose(u_first[1], 0.0, abs_tol=1e-3)
        assert math.isclose(u_first[0], 1.0, abs_tol=1e-3)
    def test_em(self):
        """Smoke test of learn / predict / prob on rounded comparison vectors.

        After learning, the estimated ``p`` attribute must be set.
        """
        classifier = recordlinkage.ECMClassifier()

        # Learn on the rounded training vectors.
        classifier.learn(self.y_train.round())

        # Predicting and scoring only need to run without raising.
        classifier.predict(self.y.round())
        classifier.prob(self.y.round())

        assert classifier.p is not None
    def test_cora(self):
        """Fit ECM on the Cora training pairs and log quality for every split.

        The classifier is fitted once on the training features and then used
        to predict on the training, validation and test features.  Quality
        figures are logged per split, followed by the metrics computed by
        ``InformationRetrievalMetrics`` on the test split.  The test makes
        no assertions.
        """
        log = get_logger('RL.Test.ECMClassifier.CORA')

        # Read the train data of datasets A and B.
        dataset = Cora()

        # Extract the training features.
        train_features = dataset.get_comparision_object().compute(
            dataset.candidate_links, dataset.trainDataA, dataset.trainDataB)
        log.info("Train Features %s", str(train_features.describe()))

        # Train the ECM classifier.
        classifier = recordlinkage.ECMClassifier()
        classifier.fit(train_features)

        train_pred = classifier.predict(train_features)
        log_quality_results(log, train_pred, dataset.true_links,
                            len(dataset.candidate_links))

        # Validate the classifier.
        val_features = dataset.get_comparision_object().compute(
            dataset.val_links, dataset.valDataA, dataset.valDataB)
        log.info("Validation Features %s", str(val_features.describe()))

        val_pred = classifier.predict(val_features)
        log_quality_results(log, val_pred, dataset.true_val_links,
                            len(dataset.val_links))

        # Test the classifier.
        test_features = dataset.get_comparision_object().compute(
            dataset.test_links, dataset.testDataA, dataset.testDataB)
        log.info("Test Features %s", str(test_features.describe()))

        test_pred = classifier.predict(test_features)
        log_quality_results(log, test_pred, dataset.true_test_links,
                            len(dataset.test_links))

        # Log IR stats (MRR, MAP, MP@K) on "1 - probability" per test pair.
        inverted = [1 - p for p in classifier.prob(test_features).tolist()]
        scored_pairs = [
            (dataset.test_links[pos][0], dataset.test_links[pos][1], value)
            for pos, value in enumerate(inverted)
        ]
        metrics = InformationRetrievalMetrics(scored_pairs,
                                              dataset.true_test_links)
        metrics.log_metrics(log)
예제 #7
0
    def test_ecm_init(self):
        """Fit ECM with ``init='random'`` and check the m estimate of 'c_2'.

        The estimated parameters are printed, then the m probability for
        value 1 of 'c_2' is compared with 0.85 (tolerance 0.08).
        """
        match_probs = np.array([0.23, .81, .85, .81, .85, .81])
        nonmatch_probs = np.array([0.34, .23, .50, .23, .30, 0.13])

        # Create the train dataset; the returned true links are unused.
        X_train, _ = binary_vectors(
            1000,
            500,
            m=match_probs,
            u=nonmatch_probs,
            random_state=535,
            return_links=True,
        )

        classifier = rl.ECMClassifier(init='random')
        classifier.fit(X_train)
        classifier.predict(X_train)

        # Print the estimates in the same order as before.
        for attribute in ('m_probs', 'log_m_probs', 'u_probs', 'log_u_probs'):
            print(getattr(classifier, attribute))

        assert math.isclose(classifier.m_probs['c_2'][1], 0.85, abs_tol=0.08)
예제 #8
0
# Simulation settings: number of candidate pairs, number of true matches and
# the per-feature m and u probabilities handed to binary_vectors.
n_pairs = 50000
n_matches = 7000
m_simulate = np.array([.94, .81, .85, .90, .99, .70, .56, .92])
u_simulate = np.array([.19, .23, .50, .11, .20, .14, .50, .09])

# Create the dataset and return the true links (seeded for reproducibility).
X_data, links_true = binary_vectors(n_pairs,
                                    n_matches,
                                    m=m_simulate,
                                    u=u_simulate,
                                    random_state=535,
                                    return_links=True)

# Initialise the Expectation-Conditional Maximisation classifier and fit it.
cl = rl.ECMClassifier()
cl.fit(X_data)

# Print the trained parameters (m, u and p) and the derived weights.  Each
# attribute is read right before it is printed.
for _label, _attribute in (
        ("p probability P(Match):", 'p'),
        ("m probabilities P(x_i=1|Match):", 'm_probs'),
        ("u probabilities P(x_i=1|Non-Match):", 'u_probs'),
        ("log m probabilities P(x_i=1|Match):", 'log_m_probs'),
        ("log u probabilities P(x_i=1|Non-Match):", 'log_u_probs'),
        ("log weights of features:", 'log_weights'),
        ("weights of features:", 'weights'),
):
    print(_label, getattr(cl, _attribute))

# Evaluate the model on the data it was fitted on.
links_pred = cl.predict(X_data)
print("Predicted number of links:", len(links_pred))
    def test_ecm(self):
        """Fit ECM on the Census training pairs, log quality and export results.

        The classifier is fitted on the training features and used to predict
        on the training, validation and test features; quality figures are
        logged for each split.  For the test split the pairs are then
        re-expressed as positions in the lists of entity names and passed to
        the export helpers.  The test makes no assertions.

        Entity positions are resolved through dictionaries that map each name
        to its first position, instead of calling ``list.index`` once per
        pair, which made every conversion quadratic in the number of records.
        """
        logger = get_logger('RL.Test.ECMClassifier.Census')

        census = Census()

        compare_cl = census.get_comparision_object()
        features = compare_cl.compute(census.candidate_links,
                                      census.trainDataA, census.trainDataB)
        logger.info("Train Features %s", str(features.describe()))

        # Train ECM Classifier
        logrg = recordlinkage.ECMClassifier()
        logrg.fit(features)

        result = logrg.predict(features)
        log_quality_results(logger, result, census.true_links,
                            len(census.candidate_links))

        # Validate the classifier
        compare_cl = census.get_comparision_object()
        features = compare_cl.compute(census.val_links, census.valDataA,
                                      census.valDataB)
        logger.info("Validation Features %s", str(features.describe()))

        result = logrg.predict(features)
        log_quality_results(logger, result, census.true_val_links,
                            len(census.val_links))

        # Test the classifier
        compare_cl = census.get_comparision_object()
        features = compare_cl.compute(census.test_links, census.testDataA,
                                      census.testDataB)
        logger.info("Test Features %s", str(features.describe()))

        result = logrg.predict(features)
        log_quality_results(logger, result, census.true_test_links,
                            len(census.test_links))

        logger.info("ECM weights: %s", str(logrg.weights))

        # Log IR Stats: MRR, MAP, MP@K
        prob_series = logrg.prob(features)
        prob = [(1 - p) for p in prob_series.tolist()]
        result_prob = [(census.test_links[i][0], census.test_links[i][1], p)
                       for i, p in enumerate(prob)]
        ir_metrics = InformationRetrievalMetrics(result_prob,
                                                 census.true_test_links)
        ir_metrics.log_metrics(logger)

        # Export false positives and result probabilities.  Only pairs that
        # were predicted as links keep their feature values here.
        result_feature_mapping = [
            (e1, e2, [str(v) for v in features.loc[(e1, e2)].values], d)
            for (e1, e2, d) in result_prob if (e1, e2) in result
        ]

        id_column = census.field_map[CensusFields.ID_INDIVIDUAL]
        dni_column = census.field_map[CensusFields.DNI]

        def entity_name(record):
            """Return the '<individual id>_<DNI>' name of one record."""
            return "_".join([str(record[id_column]), str(record[dni_column])])

        def first_positions(names):
            """Map each name to its first position (what list.index returns)."""
            positions = {}
            for position, name in enumerate(names):
                positions.setdefault(name, position)
            return positions

        start_time = timeit.default_timer()
        entitiesA = [
            entity_name(census.testDataA.iloc[i])
            for i in range(int(census.testDataA.shape[0]))
        ]
        entitiesB = [
            entity_name(census.testDataB.iloc[i])
            for i in range(int(census.testDataB.shape[0]))
        ]
        positionsA = first_positions(entitiesA)
        positionsB = first_positions(entitiesB)
        logger.info("Entities built in %s",
                    str(timeit.default_timer() - start_time))

        def position_a(label):
            """Position in entitiesA of the record with index label *label*."""
            return positionsA[entity_name(census.testDataA.loc[int(label)])]

        def position_b(label):
            """Position in entitiesB of the record with index label *label*."""
            return positionsB[entity_name(census.testDataB.loc[int(label)])]

        start_time = timeit.default_timer()
        result_prob = [(position_a(a), position_b(b), p)
                       for (a, b, p) in result_prob]
        logger.info("Result prob in %s",
                    str(timeit.default_timer() - start_time))

        start_time = timeit.default_timer()
        true_links = [(position_a(a), position_b(b))
                      for (a, b) in census.true_test_links]
        logger.info("true_links in %s",
                    str(timeit.default_timer() - start_time))

        start_time = timeit.default_timer()
        export_result_prob(Census, 'ECM', 'census', 'ecm', entitiesA,
                           result_prob, true_links, entitiesB)
        logger.info("Result prob EXPORTED in %s",
                    str(timeit.default_timer() - start_time))

        start_time = timeit.default_timer()
        result = [(position_a(a), position_b(b)) for (a, b) in result]
        export_false_negatives(Census, 'ECM', 'census', 'ecm', entitiesA,
                               result_prob, true_links, result, entitiesB)
        export_false_positives(Census, 'ECM', 'census', 'ecm', entitiesA,
                               result_prob, true_links, result, entitiesB)
        logger.info("FP & FN EXPORTED in %s",
                    str(timeit.default_timer() - start_time))

        result_feature_mapping = [
            (position_a(a), position_b(b), w, p)
            for (a, b, w, p) in result_feature_mapping
        ]
        export_human_readable_results(Census, 'ECM', 'census', 'ecm',
                                      entitiesA, result_feature_mapping,
                                      entitiesB)
        logger.info("Exported Human Readable Results")
예제 #10
0
def run_experiment(win_len, preproc, comparison_variant, run_only=None):
    """Run one linkage experiment and collect the result of each classifier.

    The module-level DBLP/Scholar train and test frames are indexed on
    'year', the 'title', 'authors' and 'venue' columns (with the suffix
    selected by ``preproc``) are compared, and the comparison vectors are
    classified.

    Args:
        win_len: 0 selects a block index on 'year'; a positive value is the
            window of a sorted-neighbourhood index on 'year'.
        preproc: 0-5, selecting the column suffix '', '_clean', '_soundex',
            '_nysiis', '_metaphone' or '_match_rating'.
        comparison_variant: 0 for exact comparison, 1-7 for one of the
            string similarity methods listed in the body.
        run_only: optional classifier name ('logreg', 'bayes', 'svm',
            'kmeans' or 'ecm'); when given, all other classifiers are
            skipped.

    Returns:
        A list with one tuple per classifier that ran: (index description,
        preprocessing description, comparison description, classifier name,
        predicted matches, compare time, training time, classification
        time), the three times being multiplied by 1000.

    Raises:
        ValueError: if ``win_len`` is neither 0 nor positive, or if
            ``preproc`` or ``comparison_variant`` is not a known variant.
    """
    # window length
    if win_len == 0:
        index_description = "block"
        indexer = recordlinkage.BlockIndex('year')
    elif win_len > 0:
        index_description = f"nb{win_len}"
        indexer = recordlinkage.SortedNeighbourhoodIndex('year',
                                                         window=win_len)
    else:
        raise ValueError(f"Invalid window length {win_len}")
    pairs_train = indexer.index(dataDBLP_train, dataScholar_train)
    pairs_test = indexer.index(dataDBLP_test, dataScholar_test)
    if debug:
        print(f"Number of candidates (index={index_description}):")
        print(f"{len(pairs_train)} (train), {len(pairs_test)} (test)")

    # preprocessing: (variant, banner, column suffix, description).  The
    # table is scanned with ``==`` in order, like the if/elif chain it
    # replaces.
    preproc_table = (
        (0, "No preprocesing", "", "none"),
        (1, "Cleaned fields", "_clean", "clean"),
        (2, "Soundex encoding", "_soundex", "soundex"),
        (3, "Nysiis encoding", "_nysiis", "nysiis"),
        (4, "Metaphone encoding", "_metaphone", "metaphone"),
        (5, "Match-rating encoding", "_match_rating", "match_rating"),
    )
    for variant, banner, suffix, description in preproc_table:
        if preproc == variant:
            print(banner)
            field_suffix = suffix
            preproc_description = description
            break
    else:
        raise ValueError(f"Unknown preprocessing variant {preproc}")
    print(f"Preprocessing used: {preproc_description}")

    # comparator: every variant compares the same three columns of both
    # frames; variant 0 compares exactly, 1-7 use a string similarity whose
    # method name doubles as the description.
    string_methods = (
        (1, "levenshtein"),
        (2, "damerau_levenshtein"),
        (3, "jaro"),
        (4, "jarowinkler"),
        (5, "qgram"),
        (6, "cosine"),
        (7, "smith_waterman"),
    )
    comp = recordlinkage.Compare()
    if comparison_variant == 0:
        comp_description = "exact"
        for base in ('title', 'authors', 'venue'):
            comp.add(compare.Exact(base + field_suffix, base + field_suffix))
    else:
        for variant, method in string_methods:
            if comparison_variant == variant:
                comp_description = method
                for base in ('title', 'authors', 'venue'):
                    comp.add(
                        compare.String(base + field_suffix,
                                       base + field_suffix,
                                       method=method))
                break
        else:
            raise ValueError(
                f"Unknown comparison variant {comparison_variant}")
    print(f"String comparison: {comp_description}")

    print("Start compare for training data set")
    start = time.time()
    result_train = comp.compute(pairs_train, dataDBLP_train, dataScholar_train)
    print("Compare on training data took %.2fs" % (time.time() - start))
    print("Start compare for test data set")
    start = time.time()
    result_test = comp.compute(pairs_test, dataDBLP_test, dataScholar_test)
    # save time compare for evaluation
    time_compare = time.time() - start
    print("Compare on test data took %.2fs" % (time_compare))

    # (name, banner, factory, supervised).  Factories keep construction lazy
    # so that skipped classifiers are never instantiated.
    classifier_table = (
        ('logreg', "Logistic Regression classifier",
         lambda: recordlinkage.LogisticRegressionClassifier(), True),
        ('bayes', "Naive Bayes classifier",
         lambda: recordlinkage.NaiveBayesClassifier(binarize=0.75), True),
        ('svm', "Support Vector Machine classifier",
         lambda: recordlinkage.SVMClassifier(), True),
        ('kmeans', "KMeans classifier",
         lambda: recordlinkage.KMeansClassifier(), False),
        ('ecm', "ECM classifier",
         lambda: recordlinkage.ECMClassifier(binarize=0.75), False),
    )

    matches = []
    for classifier_description, banner, build, supervised in classifier_table:
        # skip others if only one classifier is requested
        if run_only is not None and run_only != classifier_description:
            continue
        print(banner)
        classifier = build()

        if supervised:
            start = time.time()
            classifier.fit(result_train, links_train)
            time_train = time.time() - start
            start = time.time()
            match = classifier.predict(result_test)
            time_classify = time.time() - start
        else:
            # Unsupervised classifiers are fitted on the test vectors; no
            # separate training time is recorded.
            start = time.time()
            match = classifier.fit_predict(result_test)
            time_classify = time.time() - start
            time_train = 0
        matches.append(
            (index_description, preproc_description, comp_description,
             classifier_description, match, 1000 * time_compare,
             1000 * time_train, 1000 * time_classify))

        if debug:
            print("%d matches" % len(match))
            print_experiment_evaluation(
                match, "-".join((index_description, preproc_description,
                                 comp_description)))

    return matches
예제 #11
0
    def test_ecm_predict(self):
        """Smoke test: fit on rounded training vectors, predict on test data."""
        classifier = rl.ECMClassifier()

        rounded_train = self.X_train.round()
        classifier.fit(rounded_train)

        classifier.predict(self.X_test)
예제 #12
0
# Rows with exactly three links that agree on company, corporate family and
# phone number (one combined mask instead of four successive filters).
featuresbcd3 = features[(features['totallinks'] == 3)
                        & (features['Company'] == 1)
                        & (features['Corporate Family'] == 1)
                        & (features['Phone Number'] == 1)]

# Rows where all four links are present.
featuresabcd = features[features['totallinks'] == 4]

# Write each subset to its own workbook, without the index.
featuresabc3.to_excel(
    r'/Users/Adam/Desktop/featuresabc.xlsx',
    sheet_name='featuresabc',
    index=False,
)
featuresabd3.to_excel(
    r'/Users/Adam/Desktop/featuresabd.xlsx',
    sheet_name='featuresabd',
    index=False,
)
featuresbcd3.to_excel(
    r'/Users/Adam/Desktop/featuresbcd.xlsx',
    sheet_name='featuresbcd',
    index=False,
)
featuresabcd.to_excel(
    r'/Users/Adam/Desktop/featuresabcd.xlsx',
    sheet_name='featuresabcd',
    index=False,
)

# %%

# Fit an ECM classifier on all feature vectors and predict in one step.
ecm = recordlinkage.ECMClassifier()
matchdf = ecm.fit_predict(features)

# Turn the predicted index into a plain frame and export it.
matchdffinal = matchdf.to_frame(index=False)
matchdffinal.to_excel(
    r'/Users/Adam/Desktop/HEALTHCAREMATCHES.xlsx',
    sheet_name='HEALTHCAREMATCHES',
    index=False,
)
예제 #13
0
    def test_ecm_probs(self):
        """After fitting, the estimated ``p`` must lie in the interval [0, 1]."""
        classifier = rl.ECMClassifier()
        classifier.fit(self.X_train.round())

        assert 0.0 <= classifier.p <= 1.0
예제 #14
0
# See the outcome
df_comparison_results.head()
df_comparison_results[df_comparison_results.sum(axis=1) > 3].head()

# Use an unsupervised technique on all features except the one that was used
# for blocking.

# Refined data: keep only the listed features and convert them to int in one
# step.  Assigning the converted columns back into a freshly sliced frame
# (as done before) triggers pandas' chained-assignment warning.
list_features = ['SUBURB', 'STATE', 'SURNAME', 'DATE_OF_BIRTH', 'ADDRESS_1']
df_comparison_results = df_comparison_results[list_features].astype(int)
df_comparison_results.head()

# Build model object
classifier = recordlinkage.ECMClassifier()

# Train
classifier.fit(df_comparison_results)

# Predict
pred = classifier.predict(df_comparison_results)

# Convert to a DataFrame for readability
df = pd.DataFrame([pred]).transpose()
df.head()

# Release the names of this section before the next one starts.
del (train, list_text_data, list_int_data, indexer, candidate_links,
     compare_rl, df_comparison_results, list_features, classifier, pred, df)
#%% Entity resolution /Deduplication from two table
# https://recordlinkage.readthedocs.io/en/latest/notebooks/link_two_dataframes.html
예제 #15
0
def linkDB(df1, df2, type, classifier):
    """Link two restaurant tables, merge matched rows and print the accuracy.

    Args:
        df1: first table; its columns are expected to carry the prefix '0_'
            ('0_restaurant', '0_neighborhood', '0_addressGoogle').
        df2: second table, with the same columns prefixed by '1_'.
        type: indexing strategy, one of "sortedneighbourhood", "full" or
            "block".  (The name shadows the builtin but is kept because it
            is part of the signature.)
        classifier: "ecm" or "kmeans".

    Returns:
        A DataFrame with the unmatched rows of ``df1``, the unmatched rows
        of ``df2`` and one combined row per predicted match, re-indexed from
        0.

    Raises:
        ValueError: if ``classifier`` is not "ecm" or "kmeans".  Previously
            such a call ran to the end and failed with a ``NameError``.

    Side effects:
        Reads the annotations from 'result.json' and prints the confusion
        matrix and accuracy of the predictions on the first candidate pairs
        (at most 100).
    """
    if classifier not in ("ecm", "kmeans"):
        raise ValueError(
            "Unknown classifier {!r}; expected 'ecm' or 'kmeans'".format(
                classifier))

    # 1 - INDEXING

    indexer = recordlinkage.Index()

    if type == "sortedneighbourhood":
        indexer.sortedneighbourhood(left_on="0_restaurant",
                                    right_on="1_restaurant")
    elif type == "full":
        indexer.full()
    elif type == "block":
        indexer.block(left_on="0_addressGoogle", right_on="1_addressGoogle")

    candidate_links = indexer.index(df1, df2)

    test_pairs = candidate_links[0:100]

    # The annotation file read below is documented at
    # https://recordlinkage.readthedocs.io/en/latest/annotation.html
    # NOTE(review): presumably 'result.json' holds the manual annotation of
    # the first 100 candidate pairs, produced with
    # recordlinkage.write_annotation_file -- confirm.
    annotations = recordlinkage.read_annotation_file('result.json')

    # 2 - COMPARISON
    comp = recordlinkage.Compare()
    comp.string('0_restaurant',
                '1_restaurant',
                threshold=0.95,
                method='jarowinkler',
                label='ristorante')
    comp.string('0_neighborhood',
                '1_neighborhood',
                method='jarowinkler',
                threshold=0.85,
                label='quartiere')
    comp.exact('0_addressGoogle', '1_addressGoogle', label='indirizzoGoogle')

    features = comp.compute(candidate_links, df1, df2)
    test_features = comp.compute(test_pairs, df1, df2)

    # 3 - CLASSIFICATION
    # https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html#unsupervised

    if classifier == "ecm":
        model = recordlinkage.ECMClassifier(init='jaro',
                                            binarize=None,
                                            max_iter=100,
                                            atol=0.0001,
                                            use_col_names=True)
    else:
        model = recordlinkage.KMeansClassifier()

    # fit_predict already returns the predicted links, so no second
    # predict() call on the same features is needed.
    links = model.fit_predict(features)

    matches = []
    # Dict keys act as insertion-ordered sets of the labels to drop.
    drop1 = {}
    drop2 = {}
    for i, j in links:
        drop1[i] = None
        drop2[j] = None
        matches.append(tuple(df1.loc[i]) + tuple(df2.loc[j]))

    # Passing the columns to the constructor also works when there are no
    # matches; assigning them afterwards raised a length mismatch.
    head = list(df1.columns) + list(df2.columns)
    matches_result = pd.DataFrame(matches, columns=head)

    df1t = df1.drop(list(drop1), axis=0)
    df2t = df2.drop(list(drop2), axis=0)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    result = pd.concat([df1t, df2t, matches_result], ignore_index=True)

    # 4 - EVALUATION

    if classifier == "ecm":
        test_matches = model.predict(test_features)
    else:
        # NOTE(review): k-means is re-fitted on the test pairs here rather
        # than reusing the model fitted above; kept as in the original --
        # confirm this is intended.
        test_matches = model.fit_predict(test_features)

    # Use the real number of test pairs; it is 100 unless fewer candidate
    # links exist.
    total = len(test_pairs)
    cm = recordlinkage.confusion_matrix(annotations.links,
                                        test_matches,
                                        total=total)
    acc = recordlinkage.accuracy(annotations.links,
                                 test_matches,
                                 total=total)

    print(cm, acc)

    return result
import recordlinkage as rl
from recordlinkage.datasets import load_krebsregister

# Load the Krebsregister comparison vectors together with the true links,
# passing 0 as the value for missing entries.
krebs_X, krebs_true_links = load_krebsregister(missing_values=0)

print(krebs_true_links)

# Train the classifier and predict on the same vectors.
ecm = rl.ECMClassifier(binarize=0.8)
result_ecm = ecm.fit_predict(krebs_X)

# Number of predicted links (only displayed in an interactive session).
len(result_ecm)

print(
    rl.confusion_matrix(krebs_true_links, result_ecm, total=len(krebs_X))
)

# The F-score for this classification is
print(rl.fscore(krebs_true_links, result_ecm))

print(ecm.log_weights)
예제 #17
0
def link_reduce(from_rest: str, dfs: dict, window: int, th: float, classifier: str, thFusion: float) -> "pd.DataFrame":
    """Link every pair of source tables and drop the matched duplicates.

    Starting with ``from_rest``, each table is linked against every table
    that has not been used as the left side yet.  Rows of the right table
    that are matched, and accepted by ``combine``, are removed from it.

    Args:
        from_rest: key in ``dfs`` of the table that is processed first.
        dfs: mapping from source name to DataFrame.  Each frame needs the
            columns 'restaurant', 'addressGoogle' and 'neighborhood'.
        window: window of the sorted-neighbourhood indexes.
        th: threshold of the Jaro-Winkler string comparisons; also passed to
            ``combine``.
        classifier: "ecm" or "kmeans".
        thFusion: passed to ``combine`` for each matched pair.

    Returns:
        The concatenation of all reduced tables, without rows lacking an
        'addressGoogle' value and without duplicate rows.

    Raises:
        ValueError: if a pair of tables has to be classified and
            ``classifier`` is not "ecm" or "kmeans" (previously a
            ``TypeError`` from iterating ``None``).

    Note:
        The frames in ``dfs`` are no longer modified: matched rows are
        dropped from working copies only.
    """
    dfs_copy = {from_rest: dfs[from_rest]}
    dfs_reduce = dfs.copy()

    # Make copy of dfs with from_rest moved on top
    for rest, df in dfs.items():
        if rest != from_rest:
            dfs_copy[rest] = df.copy()

    for rest, df in dfs_copy.items():
        for rr, ddf in dfs_reduce.items():
            if rr == rest:
                continue

            columns_to_check = ['restaurant']
            print(f"{rest} -> {rr}")
            # Optional columns are only used when both tables have at least
            # one non-null value in them.
            for optional in ('addressGoogle', 'neighborhood'):
                if df[optional].notna().any() and ddf[optional].notna().any():
                    columns_to_check.append(optional)

            indexer = recordlinkage.Index()

            # 1 - INDEXING
            for col in columns_to_check:
                indexer.sortedneighbourhood(
                    left_on=col, right_on=col, window=window)
            candidate_links = indexer.index(df, ddf)

            # 2 - COMPARISON
            compare_cl = recordlinkage.Compare(n_jobs=-1)
            for col in columns_to_check:
                if col == 'addressGoogle':
                    compare_cl.exact(col, col)
                else:
                    compare_cl.string(col, col, label=col,
                                      threshold=th, method='jarowinkler')
            features = compare_cl.compute(candidate_links, df, ddf)

            # 3 - CLASSIFICATION
            if classifier == "ecm":
                model = recordlinkage.ECMClassifier(
                    init='jaro', binarize=None, max_iter=100, atol=0.0001, use_col_names=True)
            elif classifier == "kmeans":
                model = recordlinkage.KMeansClassifier()
            else:
                raise ValueError(
                    f"Unknown classifier {classifier!r}; expected 'ecm' or 'kmeans'")
            # fit_predict already returns the predicted links.
            matches = model.fit_predict(features)

            # 4 - COMBINE INFORMATION
            # Collect the pairs rejected by combine() and drop them in one
            # go instead of rebuilding the index once per rejected pair.
            rejected = [
                (left, right) for left, right in matches
                if not combine(df.loc[left], ddf.loc[right], thFusion, th)
            ]
            if rejected:
                matches = matches.drop(rejected)

            print(f"\tmatches: {len(matches)}")
            dfs_copy[rest] = df.copy()

            # 5 - DROP RIGHT ON MATCHES INDEX
            index_to_drop = list(set(matches.get_level_values(1)))
            print(f"\t{rr} before drop: {len(ddf.index)}")
            # Not in place: during the first pass ddf is still the frame the
            # caller passed in.
            reduced = ddf.drop(index_to_drop)
            dfs_copy[rr] = reduced.copy()
            dfs_reduce[rr] = reduced
            print(f"\t{rr} after drop: {len(dfs_reduce[rr].index)}\n")

        # This table has been linked against all remaining ones.
        del dfs_reduce[rest]

    final_df = pd.concat(list(dfs_copy.values()))
    final_df.dropna(subset=['addressGoogle'], inplace=True)
    final_df.drop_duplicates(inplace=True)
    return final_df
예제 #18
0
    del data["Total_Score"]
    del data["rec_id"]
    del data["rec_id.1"]
    ###calculate known matches, then delete for classification
    del data['rec_num']
    data.to_csv('feature_vectors_clean.csv', sep=",", encoding='utf-8')
    return data


# NOTE: this section uses Python 2 print statements.
# The return value of this first prepData() call is discarded; the function
# is called again below, where its result is used for classification.
prepData()
# ---- Evaluate the vector scoring methodology ----
# Each helper below is defined elsewhere in the file; the results are only
# printed.
known_matches = knownMatches()
missed_matches = missedMatches()
false_positives = findFalsePos()
pairs = ScoreRecords()
print "number of comparison pairs in index:", len(pairs)
print "number of matching pairs:", known_matches
print "number of missed matches:", missed_matches
print "number of false positives:", false_positives
# ---- Supervised methods: none in this section ----
# ---- Unsupervised methods ----
# K-means
data = prepData()
kmeans = rl.KMeansClassifier()
result_kmeans = kmeans.learn(data)
print 'number of predicted pairs using K-means clustering:', len(result_kmeans)
# ECM: the feature values are turned into 0/1 with a threshold of 0.8
# before learning.
ecm = rl.ECMClassifier()
result_ecm = ecm.learn((data > 0.8).astype(int))
print 'the number of predicted pairs using ECM Maximization:', len(result_ecm)