def test_logistic_regression_manual(self):
        """
        Exercise the LogisticRegressionClassifier when its parameters are
        assigned by hand instead of being learned.

        The classifier first predicts with manually set coefficients and
        intercept, is then trained, and finally the shape of the
        coefficients and the type of the intercept are checked.

        """

        # Seed numpy so the randomly drawn parameters are reproducible.
        numpy.random.seed(535)
        random_coefficients = numpy.random.randn(self.y_train.shape[1])
        random_intercept = numpy.random.rand()

        classifier = recordlinkage.LogisticRegressionClassifier()

        # A freshly built classifier reports no parameters yet.
        assert classifier.coefficients is None
        assert classifier.intercept is None

        # Assign the parameters by hand and predict without any training.
        classifier.coefficients = random_coefficients
        classifier.intercept = random_intercept
        classifier.predict(self.y)

        # Training after the manual assignment must still work.
        classifier.learn(self.y_train, self.matches_index)
        classifier.predict(self.y)

        # One coefficient per feature column, and a plain float intercept.
        coefficient_array = numpy.array(classifier.coefficients)
        expected_shape = (self.y_train.shape[1], )
        assert coefficient_array.shape == expected_shape
        assert isinstance(classifier.intercept, float)
示例#2
0
def logreg_classifier(features, links_true, train_size=0.2, cv=None):
    """Classify record pairs with a logistic regression classifier.

    Args:
        features: Comparison vectors, one row per candidate record pair.
        links_true: Object whose ``.index`` holds the known true links.
        train_size: Fraction of ``features`` (taken from the start) used
            for training when ``cv`` is None.
        cv: Cross-validation argument forwarded to ``cross_val_predict``.
            When None, a single positional train split is used instead.

    Returns:
        A ``(matches, df_logreg_prob)`` tuple: the predicted matches and
        the match probabilities.
    """
    logreg = rl.LogisticRegressionClassifier()

    if cv is None:
        # Use the explicit set operation: `&` between Index objects is an
        # element-wise logical AND in current pandas, not an intersection.
        golden_match_index = features.index.intersection(links_true.index)
        train_index = int(len(features) * train_size)

        # Train on the leading slice of the comparison vectors.
        logreg.fit(features[0:train_index], golden_match_index)

        # Predict the match status for all record pairs.
        matches = logreg.predict(features)

        df_logreg_prob = pd.DataFrame(logreg.prob(features))
        df_logreg_prob.columns = ['score']
    else:
        # `cv` must be passed by keyword: positionally it would land in the
        # `groups` slot of cross_val_predict (or be rejected outright when
        # that parameter is keyword-only).
        df_results = cross_val_predict(logreg,
                                       features,
                                       links_true,
                                       cv=cv,
                                       method='predict')
        # NOTE(review): cross_val_predict is documented to return an
        # ndarray, which has no `.index` -- confirm this branch is exercised.
        matches = df_results.index
        df_logreg_prob = cross_val_predict(logreg,
                                           features,
                                           links_true,
                                           cv=cv,
                                           method='predict_proba')

    return matches, df_logreg_prob
示例#3
0
    def test_logistic_regression_manual(self):
        """Predict with hand-assigned parameters, then retrain and re-check.

        Verifies that a new classifier has no ``coefficients`` or
        ``intercept`` attribute, that prediction works once both are set
        manually, and that after fitting the coefficients have one entry
        per feature and the intercept is a float.
        """

        # Seed numpy so the randomly drawn parameters are reproducible.
        np.random.seed(535)
        random_coefficients = np.random.randn(self.X_train.shape[1])
        random_intercept = np.random.rand()

        classifier = rl.LogisticRegressionClassifier()

        # Before any assignment neither attribute exists.
        for attribute in ('coefficients', 'intercept'):
            assert not hasattr(classifier, attribute)

        # Assign the parameters by hand and predict without training.
        classifier.coefficients = random_coefficients
        classifier.intercept = random_intercept
        classifier.predict(self.X_test)

        # Fitting after the manual assignment must still work.
        classifier.fit(self.X_train, self.y_train)
        classifier.predict(self.X_test)

        fitted_coefficients = classifier.coefficients
        expected_shape = (self.X_train.shape[1], )
        assert fitted_coefficients.shape == expected_shape
        assert isinstance(classifier.intercept, float)
示例#4
0
    def test_logistic_regression_basic(self):
        """Smoke-test fit, predict and prob on the logistic classifier."""

        classifier = rl.LogisticRegressionClassifier()

        # Train first, then exercise both inference entry points.
        classifier.fit(self.X_train, self.y_train)
        classifier.predict(self.X_test)
        classifier.prob(self.X_train)
示例#5
0
def ProcessData(patientDataList, fetchedHospitalData):
    """Link patient records against a fetched hospital CSV file.

    Args:
        patientDataList: Patient records, used as the left-hand data set.
        fetchedHospitalData: File name of the hospital CSV, appended to the
            hard-coded fetched-data directory.

    Returns:
        The links predicted by a logistic regression classifier, or None
        when the comparison produced no feature rows.

    Note:
        ``golden_pairs`` and ``golden_matches_index`` are not defined in
        this function; they must be available from the enclosing module.
    """
    # Load the hospital records from the fetched-data directory.
    hospital_records = pd.read_csv(
        '/home/bizzzzzzzzzzzzu/Music/MedicalPortal/MedicPortal DataProcessing/FetchedData/'
        + fetchedHospitalData)

    # Indexation step: block candidate pairs on the father's name.
    blocker = p.Index()
    blocker.add(Block(left_on='fatherName', right_on='fatherName'))
    pairs = blocker.index(patientDataList, hospital_records)

    # Comparison step: exact match on each field, labelled by field name.
    # '_id', 'address' and 'phoneNumber' are currently not compared.
    compared_fields = (
        'name',
        'fatherName',
        'grandFatherName',
        'gender',
        'dateOfBirth',
        'dayOfBirth',
        'monthOfBirth',
        'yearOfBirth',
        'age',
    )
    comparer = p.Compare()
    for field in compared_fields:
        comparer.exact(field, field, label=field)

    features = comparer.compute(pairs, patientDataList, hospital_records)

    # Nothing to classify when no comparison vectors were produced.
    if features.empty:
        return None

    # Classification step: a supervised logistic regression classifier
    # trained on the module-level golden pairs.
    classifier = p.LogisticRegressionClassifier()
    classifier.fit(golden_pairs, golden_matches_index)

    return classifier.predict(features)
    def test_logistic_regression_basic(self):
        """
        Smoke-test the LogisticRegressionClassifier: train it, predict on a
        data set and request the probabilities.

        """

        classifier = recordlinkage.LogisticRegressionClassifier()

        # Train first, then exercise both inference entry points.
        classifier.learn(self.y_train, self.matches_index)
        classifier.predict(self.y)
        classifier.prob(self.y)
示例#7
0
    def test_cora(self):
        """Train a logistic regression classifier on CORA and log its quality.

        The classifier is fitted on the training pairs and evaluated on
        the training, validation and test pairs; information-retrieval
        metrics are then logged for the test pairs.
        """
        logger = get_logger('RL.Test.LogisticRegression.CORA')

        # Load the CORA data set wrapper.
        cora = Cora()

        # Extract features for the training pairs.
        train_features = cora.get_comparision_object().compute(
            cora.candidate_links, cora.trainDataA, cora.trainDataB)
        logger.info("Train Features %s", str(train_features.describe()))

        # Train the classifier and score it on its own training data.
        classifier = recordlinkage.LogisticRegressionClassifier()
        classifier.fit(train_features, cora.true_links)

        train_result = classifier.predict(train_features)
        log_quality_results(logger, train_result, cora.true_links,
                            len(cora.candidate_links))

        # Validate the classifier.
        val_features = cora.get_comparision_object().compute(
            cora.val_links, cora.valDataA, cora.valDataB)
        logger.info("Validation Features %s", str(val_features.describe()))

        val_result = classifier.predict(val_features)
        log_quality_results(logger, val_result, cora.true_val_links,
                            len(cora.val_links))

        # Test the classifier.
        test_features = cora.get_comparision_object().compute(
            cora.test_links, cora.testDataA, cora.testDataB)
        logger.info("Test Features %s", str(test_features.describe()))

        test_result = classifier.predict(test_features)
        log_quality_results(logger, test_result, cora.true_test_links,
                            len(cora.test_links))

        # Log IR stats (MRR, MAP, MP@K) using 1 - probability as the score.
        probabilities = classifier.prob(test_features)
        scores = [(1 - value) for value in probabilities.tolist()]
        result_prob = [(cora.test_links[pos][0], cora.test_links[pos][1],
                        score) for pos, score in enumerate(scores)]
        metrics = InformationRetrievalMetrics(result_prob,
                                              cora.true_test_links)
        metrics.log_metrics(logger)
示例#8
0
def init_model(classifier: str, num_features: int, **kwargs):
    """Instantiate the classifier identified by ``classifier``.

    Args:
        classifier: Key naming the model; compared against the constants
            in ``keys``.
        num_features: Number of input features, forwarded to the models
            that take it (perceptrons and ensemble classifiers).
        **kwargs: Extra constructor arguments. For naive Bayes, logistic
            regression and linear SVM they are merged over the defaults
            from ``constants``, with caller values taking precedence.

    Returns:
        The initialized model instance.

    Raises:
        ValueError: If ``classifier`` matches none of the supported keys.
    """
    # Keys are compared with `==`, not `is`: identity on strings only
    # succeeds when both operands happen to be the same interned object.
    if classifier == keys.NAIVE_BAYES:
        # Start from the default parameters; caller kwargs override them.
        kwargs = {**constants.NAIVE_BAYES_PARAMS, **kwargs}
        model = rl.NaiveBayesClassifier(**kwargs)

    elif classifier == keys.LOGISTIC_REGRESSION:
        kwargs = {**constants.LOGISTIC_REGRESSION_PARAMS, **kwargs}
        model = rl.LogisticRegressionClassifier(**kwargs)

    elif classifier == keys.LINEAR_SVM:
        kwargs = {**constants.LINEAR_SVM_PARAMS, **kwargs}
        model = rl.SVMClassifier(**kwargs)

    elif classifier == keys.SVM:
        model = classifiers.SVCClassifier(**kwargs)

    elif classifier == keys.RANDOM_FOREST:
        model = classifiers.RandomForest(**kwargs)

    elif classifier == keys.SINGLE_LAYER_PERCEPTRON:
        model = classifiers.SingleLayerPerceptron(num_features, **kwargs)

    elif classifier == keys.MULTI_LAYER_PERCEPTRON:
        model = classifiers.MultiLayerPerceptron(num_features, **kwargs)

    elif classifier == keys.VOTING_CLASSIFIER:
        model = classifiers.VotingClassifier(num_features, **kwargs)

    elif classifier == keys.GATED_CLASSIFIER:
        model = classifiers.GatedEnsembleClassifier(num_features, **kwargs)

    elif classifier == keys.STACKED_CLASSIFIER:
        model = classifiers.StackedEnsembleClassifier(num_features, **kwargs)

    else:
        err_msg = (
            f'Classifier not supported: {classifier}. '
            f'It should be one of {set(constants.CLASSIFIERS)}'
        )
        LOGGER.critical(err_msg)
        raise ValueError(err_msg)

    LOGGER.info('Model initialized: %s', model)

    return model
示例#9
0
    def _test_logistic_transh(self, dataset, params):
        """Fit logistic regression on TransH entity embeddings and log quality.

        Note: Zero aligned pairs are returned, require fixation.
        """
        model = dataset()
        logger = get_logger('RL.Test.LogisticTransH.' + str(model))
        (entity, relation, triples, entity_pairs,
         true_pairs) = model.get_er_model()

        # Train the TransH embedding model with the supplied parameters.
        transh = TransH(entity,
                        relation,
                        triples,
                        entity_pairs,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'])
        loss = transh.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)

        # One array per entity; both sides of the comparison are built
        # from the same embeddings.
        raw_embeddings = transh.get_ent_embeddings()
        embedding_rows = [
            np.array(raw_embeddings[row])
            for row in range(raw_embeddings.shape[0])
        ]
        trainDataA = pd.DataFrame(data=embedding_rows)
        trainDataB = pd.DataFrame(data=embedding_rows)

        # Compare every embedding dimension numerically (gauss method).
        comparer = recordlinkage.Compare()
        for dim in range(params['dimension']):
            comparer.numeric(dim, dim, label=str(dim), method='gauss')

        pair_index = pd.MultiIndex.from_tuples(entity_pairs)
        features = comparer.compute(pair_index, trainDataA, trainDataB)
        logger.info("Features %s", str(features.describe()))

        classifier = recordlinkage.LogisticRegressionClassifier()
        classifier.fit(features, true_pairs)

        predicted = classifier.predict(features)
        log_quality_results(logger, predicted, true_pairs, len(entity_pairs))

        # Score each pair with 1 - probability for the IR metrics.
        probabilities = classifier.prob(features)
        scores = [(1 - value) for value in probabilities.tolist()]
        result_prob = [(entity_pairs[pos][0], entity_pairs[pos][1], score)
                       for pos, score in enumerate(scores)]
        metrics = InformationRetrievalMetrics(result_prob, true_pairs)
        metrics.log_metrics(logger, params)
示例#10
0
def evalution(X_data, links_true):
    """Fit a logistic regression classifier on ``X_data`` and print metrics.

    The classifier is trained and evaluated on the same data; the shape of
    the predicted links, the confusion matrix, F-score, recall and
    precision are printed. Nothing is returned.
    """
    # Use a logistic regression classifier for the classification.
    classifier = recordlinkage.LogisticRegressionClassifier()
    classifier.fit(X_data, links_true)

    # Predict with the fitted model.
    links_pred = classifier.predict(X_data)
    print("links_pred:{}".format(links_pred.shape))

    # Print the confusion matrix.
    conf_matrix = recordlinkage.confusion_matrix(links_true,
                                                 links_pred,
                                                 total=len(X_data))
    print("Confusion matrix:\n", conf_matrix)

    # F-score derived from the confusion matrix.
    f_value = recordlinkage.fscore(conf_matrix)
    print('fscore', f_value)

    # Recall of the predicted links against the true links.
    recall_value = recordlinkage.recall(links_true, links_pred)
    print('recall', recall_value)

    # Precision of the predicted links against the true links.
    precision_value = recordlinkage.precision(links_true, links_pred)
    print('precision', precision_value)
示例#11
0
    def test_census_new(self):
        """Evaluate logistic regression on RLTransE-based Census features.

        Trains an RLTransE model on the Census value-entity graph, builds
        one feature row per candidate pair from the value and relation
        embeddings, fits a LogisticRegressionClassifier on the training
        pairs and logs quality and information-retrieval metrics for the
        training and test pairs.
        """
        c = Census()
        graph = Graph_VEG(Census)
        logger = get_logger("RL.Test.LogisticRLTransE.Census")
        logger.info("values for name : %s",
                    str(graph.relation_value_map[graph.relation[1]][:10]))
        logger.info("relation: %s", str(graph.relation))
        logger.info("train_triples: %s", str(graph.train_triples[:10]))
        logger.info("set train_triples size %d", len(set(graph.train_triples)))

        # Train the embedding model with the default hyper-parameters.
        params = self.get_default_params()
        transe = RLTransE(graph,
                          dimension=params['dimension'],
                          learning_rate=params['learning_rate'],
                          margin=params['margin'],
                          regularizer_scale=params['regularizer_scale'],
                          batchSize=params['batchSize'],
                          neg_rate=params['neg_rate'],
                          neg_rel_rate=params['neg_rel_rate'])
        loss, val_loss = transe.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f val_loss: %f", loss,
                    val_loss)

        value_embeddings = transe.get_val_embeddings()
        relation_embeddings = transe.get_rel_embeddings()

        # Map of fields in the census DataFrame to VEG relations.
        field_relation_map = {
            c.field_map[CensusFields.FIRST_NAME]: "name",
            c.field_map[CensusFields.SURNAME_1]: "surname",
            c.field_map[CensusFields.SURNAME_2]: "surname2",
            c.field_map[CensusFields.YOB]: "yob",
            c.field_map[CensusFields.CIVIL_STATUS]: "civil",
            c.field_map[CensusFields.OCCUPATION]: "occupation",
            c.field_map[CensusFields.RELATION]: "relation"
        }

        # Values that could not be found in the graph's relation/value map.
        missing_values = []
        train_features = []  #Size samples*(dimension*rel_count)
        test_features = []
        # The same feature construction runs for the train and test splits;
        # `features` is the list that collects the rows of the current split.
        for (candidate_links, dataA, dataB, features) in \
                            [(c.candidate_links, c.trainDataA, c.trainDataB, train_features),
                            (c.test_links, c.testDataA, c.testDataB, test_features)]:
            for (a, b) in candidate_links:
                row_a = dataA.loc[a]
                row_b = dataB.loc[b]
                distance = []

                for f in field_relation_map:
                    val_a = row_a[f]
                    val_b = row_b[f]
                    # Only differing values contribute entries, so rows can
                    # end up with different lengths.
                    if val_a != val_b:
                        rel = field_relation_map[f]
                        try:
                            val_index_a = graph.relation_value_map[rel].index(
                                val_a)
                        except ValueError:
                            # Unknown value: record it and use a block of
                            # 1.0 entries in place of the embedding term.
                            missing_values.append(val_a)
                            distance.extend([1.0] * params['dimension'])
                            continue
                        try:
                            val_index_b = graph.relation_value_map[rel].index(
                                val_b)
                        except ValueError:
                            missing_values.append(val_b)
                            distance.extend([1.0] * params['dimension'])
                            continue
                        rel_index = graph.relation.index(field_relation_map[f])

                        # Translation residual: value_a + relation - value_b.
                        distance.extend(value_embeddings[rel][val_index_a] + \
                            relation_embeddings[rel_index] - value_embeddings[rel][val_index_b])

                # Name the row after its record pair.
                features.append(pd.Series(distance).rename((a, b)))
                #logger.info("a: %d, b: %d distance: %f true_pairs: %s", a, b, distance, (a,b) in c.true_test_links)
        logger.info("No. of missing values: %d", len(missing_values))
        logger.info("Unique No. of missing values: %d",
                    len(set(missing_values)))

        # Positions left empty in the resulting frames are filled with 1.
        train_features = pd.DataFrame(data=train_features).fillna(1)
        test_features = pd.DataFrame(data=test_features).fillna(1)
        logger.info("Shape of Train features: %s", str(train_features.shape))
        logger.info("Shape of Test features: %s", str(test_features.shape))

        # Train the logistic regression model.
        logrg = recordlinkage.LogisticRegressionClassifier()
        logrg.fit(train_features, c.true_links)
        result = logrg.predict(train_features)
        # Rebuild the predicted links as a MultiIndex before scoring.
        result = pd.MultiIndex.from_tuples(result.to_series())
        log_quality_results(logger, result, c.true_links,
                            len(c.candidate_links), params)

        # Test the classifier.
        result = logrg.predict(test_features)
        result = pd.MultiIndex.from_tuples(result.to_series())
        log_quality_results(logger, result, c.true_test_links,
                            len(c.test_links), params)
        """
        Todo: Export Embeddings and probabilities.
        try:
            entities = ["value\trelation"]
            for r in graph.relation_value_map:
                for v in graph.relation_value_map[r]:
                    entities.append("\t".join([v,r]))

            embeddings = []
            for rel in value_embeddings:
                val_count = len(graph.relation_value_map[rel])
                embeddings.extend(value_embeddings[rel][:val_count])

            #Write Embeddings to file
            export_embeddings('veg', str(c), 'LogisticRLTransE', entities, embeddings)
        except Exception as e:
            logger.error("Failed to export embeddings")
            logger.error(e)
        export_result_prob(Census, 'veg', str(c), 'RLTransE', graph.values, result_prob, c.true_test_links)
        """
        # Score every test pair with 1 - probability for the IR metrics.
        prob_series = logrg.prob(test_features)
        prob = [(1 - p) for p in prob_series.tolist()]
        result_prob = [(c.test_links[i][0], c.test_links[i][1], prob[i])
                       for i in range(0, len(prob))]
        # Log MAP, MRR and Hits@K.
        ir_metrics = InformationRetrievalMetrics(result_prob,
                                                 c.true_test_links)
        # The returned value is stored but not used afterwards.
        precison_at_1 = ir_metrics.log_metrics(logger, params)

        transe.close_tf_session()
示例#12
0
def create_and_train_logistic_regression():
    """
    Creates a logistic regression classifier and returns the result of
    passing it to train_supervised_classifier.
    """
    classifier = rl.LogisticRegressionClassifier()
    return train_supervised_classifier(classifier)
示例#13
0
# Script: train a logistic regression classifier on a random sample of
# comparison vectors and evaluate it against the known matches.
# NOTE(review): `data` must already be defined before this point; the first
# 316 rows are taken as the known matches -- confirm against the caller.
matches = data[0:316]
matches = matches[['sku_1', 'sku_2']]
matches = pandas.MultiIndex.from_frame(matches)

# Comparison vectors indexed by the (sku_1, sku_2) record pair.
data = pandas.read_csv('/home/jake/Documents/matching/functions/comparison_index.csv', index_col=['sku_1', 'sku_2'])

# Shuffle the comparison vectors and keep the first 5000 as training pairs.
golden_pairs = data.sample(frac=1)
golden_pairs = golden_pairs[0:5000]
# Use the explicit set operation: `&` between Index objects is an
# element-wise logical AND in current pandas, not an intersection.
golden_matches_index = golden_pairs.index.intersection(matches)
print(golden_matches_index)


# The full set of comparison vectors, read again for prediction.
data_2 = pandas.read_csv('/home/jake/Documents/matching/functions/comparison_index.csv', index_col=['sku_1', 'sku_2'])


logreg = recordlinkage.LogisticRegressionClassifier()

logreg.fit(golden_pairs, golden_matches_index)
print("Intercept: ", logreg.intercept)
print("Coefficients: ", logreg.coefficients)

result_logreg = logreg.predict(data_2)

print(len(result_logreg))
print(result_logreg)

print(recordlinkage.confusion_matrix(matches, result_logreg, len(data_2)))

print(recordlinkage.fscore(matches, result_logreg))

# Hand-picked coefficients; not used by any statement above.
coefficients = [2, -0.08400654, -0.41432631, -0.12138752, -0.31617086, -0.42389099, -0.33185166, 0.02173983, 0]
示例#14
0
# Script: split the candidate matches into train/test sets, fit a logistic
# regression classifier on the training part and evaluate the test part.

# Adjust the 'Phone' scores; the meaning of 0.6 is defined by
# lnk.adjust_scores.
match_can_df = lnk.adjust_scores(match_can_df, 'Phone', 0.6)

# NOTE(review): presumably fills in the true 'Match' labels in place, since
# the return value is ignored and `Match` is read below -- confirm.
lnk.get_true_match_vals(match_can_df, slp, tm)

test_size = 0.3
random_state = 456

# Stratify on the label so both splits keep the same match ratio.
train, test = train_test_split(match_can_df,
                               stratify=match_can_df.Match,
                               test_size=test_size,
                               random_state=random_state)

# Rows labelled as matches; only their index is used for training below.
train_matches_index = train[train.Match == 1]
test_matches_index = test[test.Match == 1]
# Remove the label column so it is not used as a feature.
train.drop(columns='Match', inplace=True)
test.drop(columns='Match', inplace=True)

lr_all = recordlinkage.LogisticRegressionClassifier()

# Fit on the training vectors; the returned predictions are discarded.
lr_all.fit_predict(train, train_matches_index.index)

# NOTE(review): `lr` is not defined in this snippet; the classifier trained
# above is `lr_all` -- confirm which one is intended.
test = pm.get_predictions(test, lr)

lnk.get_true_match_vals(test, slp, tm)

pm.get_cf_mat(test)

test = pm.add_col_from_df(test, slp, 'country')

pm.get_country_roc_curves(test)
示例#15
0
# Script: compute comparison vectors and classify them with a logistic
# regression classifier whose parameters are set by hand.

# Compare 'address_2' as a string with a 0.85 threshold.
comparer.add(String('address_2', 'address_2', threshold=0.85,
                    label='address_2'))
# Only one data frame is passed, so dfA is compared against itself.
features = comparer.compute(candidate_links, dfA)

print('feature shape', features.shape)

# Use the Logistic Regression Classifier.
# This classifier is equivalent to the deterministic record linkage approach:
# the intercept and coefficients are supplied directly and the classifier is
# never fitted.
intercept = -9.5
coefficients = [2.0, 3.0, 7.0, 6.0, 2.5, 5.0, 5.5]

print('Deterministic classifier')
print('intercept', intercept)
print('coefficients', coefficients)

logreg = rl.LogisticRegressionClassifier(
    coefficients=coefficients, intercept=intercept)
links = logreg.predict(features)

print(len(links), 'links/matches')

# Return the confusion matrix.
conf_logreg = rl.confusion_matrix(true_links, links, len(candidate_links))
print('confusion matrix')
print(conf_logreg)

# Compute the F-score, recall and precision for this classification.
fscore = rl.fscore(conf_logreg)
print('fscore', fscore)
recall = rl.recall(true_links, links)
print('recall', recall)
precision = rl.precision(true_links, links)
    def test_probs(self):
        """An unknown ``return_type`` passed to ``prob`` raises ValueError."""

        classifier = recordlinkage.LogisticRegressionClassifier()

        with pytest.raises(ValueError):
            classifier.prob(self.y, return_type='unknown_return_type')
def get_matches(locu_train_path, foursquare_train_path, matches_train_path,
                locu_test_path, foursquare_test_path):
    """Match Locu and Foursquare records and write test matches to CSV.

    Trains a logistic regression and an SVM classifier on the training
    pairs, predicts on the test pairs with the SVM and writes the matched
    (locu_id, foursquare_id) index to 'matches_test.csv'.

    Args:
        locu_train_path: Path to the Locu training JSON file.
        foursquare_train_path: Path to the Foursquare training JSON file.
        matches_train_path: Path to the CSV of known training matches.
        locu_test_path: Path to the Locu test JSON file.
        foursquare_test_path: Path to the Foursquare test JSON file.

    Returns:
        None. The only output is the CSV file written to the working
        directory.
    """
    four_train = pd.read_json(foursquare_train_path)
    locu_train = pd.read_json(locu_train_path)

    four_test = pd.read_json(foursquare_test_path)
    locu_test = pd.read_json(locu_test_path)

    matches_train = pd.read_csv(matches_train_path)

    # visualize missing data
    #     msno.matrix(four_train)
    #     msno.matrix(locu_train)
    #     msno.matrix(four_test)
    #     msno.matrix(locu_test)

    # preprocess, preprocess_matches, index_pairs, compare_strings,
    # traintestsplit and predict are helpers defined outside this function.
    locu_train, four_train = preprocess(locu_train, four_train)
    locu_test, four_test = preprocess(locu_test, four_test)
    matches_train = preprocess_matches(matches_train)

    candidate_pairs = index_pairs(locu_train, four_train)
    test_candidate_pairs = index_pairs(locu_test, four_test)
    #     print (len(locu_train), len(four_train), len(candidate_pairs))
    #     print (len(locu_test), len(four_test), len(test_candidate_pairs))

    features = compare_strings(locu_train, four_train, candidate_pairs)
    test_features = compare_strings(locu_test, four_test, test_candidate_pairs)

    #     features = features.loc[features['street_address'] > .1]
    #     features = features.loc[features['name'] > .1]

    train_pairs, train_matches_index, all_matches_index = traintestsplit(
        features, matches_train)

    # Train Logistic Regression classifier
    logreg = recordlinkage.LogisticRegressionClassifier()
    logreg.learn(train_pairs, train_matches_index)
    #     print ("LogReg Intercept: ", logreg.intercept)
    #     print ("LogReg Coefficients: ", logreg.coefficients)

    # Train SVM classifier
    svm = recordlinkage.SVMClassifier()
    svm.learn(train_pairs, train_matches_index)

    # Predict on training data with both classifiers
    svm_results_index = predict(features, svm)
    logreg_results_index = predict(features, logreg)

    # To view pairs
    #     features.index = features.index.rename(['locu_id', 'foursquare_id'])
    #     train_matches = features.loc[svm_results_index]
    #     train_matches

    # Training results. Both confusion matrices are computed but only used
    # by the commented-out prints below.
    svm_confn_matrix = recordlinkage.confusion_matrix(all_matches_index,
                                                      svm_results_index,
                                                      len(features))
    #     print("SVM Confusion Matrix: ", svm_confn_matrix)
    #     print("SVM Precision: ", recordlinkage.precision(svm_confn_matrix))
    #     print("SVM Recall:    ", recordlinkage.recall(svm_confn_matrix))
    #     print("SVM Accuracy:  ", recordlinkage.accuracy(svm_confn_matrix))
    #     print("SVM F1 Score:  ", recordlinkage.fscore(svm_confn_matrix))

    logreg_confn_matrix = recordlinkage.confusion_matrix(
        all_matches_index, logreg_results_index, len(features))
    #     print("Logistic Regression Confusion Matrix: ", logreg_confn_matrix)
    #     print("Logistic Regression Precision: ", recordlinkage.precision(logreg_confn_matrix))
    #     print("Logistic Regression Recall:    ", recordlinkage.recall(logreg_confn_matrix))
    #     print("Logistic Regression Accuracy:  ", recordlinkage.accuracy(logreg_confn_matrix))
    #     print("Logistic Regression F1 Score:  ", recordlinkage.fscore(logreg_confn_matrix))

    # Predict on test data with SVM
    test_results_index = predict(test_features, svm)

    # Format and write to CSV
    test_features.index = test_features.index.rename(
        ['locu_id', 'foursquare_id'])
    test_match_pairs = test_features.loc[test_results_index]
    # Drop every feature column so only the pair index is written out.
    matches_test = test_match_pairs.drop(test_match_pairs.columns[::], axis=1)
    #     matches_test
    matches_test.to_csv('matches_test.csv')

    # Create a dataframe for both Foursquare and Locu of pairs that get
    # matched. Nothing computed from here on is written out or returned.
    test_tuples = list(matches_test.index)
    test_locu_index = [i[0] for i in test_tuples]
    test_four_index = [i[1] for i in test_tuples]
    test_locu_matches = locu_test.loc[test_locu_index]
    test_four_matches = four_test.loc[test_four_index]

    # for viewing full match dataset
    temp = matches_test.reset_index().join(test_four_matches,
                                           on=['foursquare_id'])
    test_match_pairs = temp.join(test_locu_matches,
                                 on=['locu_id'],
                                 lsuffix='_foursquare',
                                 rsuffix='_locu').set_index(
                                     matches_test.index.names)

    # Reorder the columns by position.
    # NOTE(review): assumes the joined frame has 14 columns, presumably 7
    # per source so the two sources interleave -- confirm.
    cols = np.array(test_match_pairs.columns.tolist())
    order = [0, 7, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13]
    cols = list(cols[order])
    test_matches_reordered = test_match_pairs[cols]
    #     display(test_matches_reordered)
    #     print("Successfully wrote results to matches_test.csv")
    return
示例#18
0
    def _test_logistic_transh_erer(self, dataset, params):
        """Evaluate logistic regression on TransH embeddings of two data sets.

        Separate TransH models are trained for data sets A and B. A
        logistic regression classifier is fitted on the prior pairs plus
        one sampled negative pair each, evaluated on all entity pairs, and
        the embeddings and pair probabilities are exported.

        Args:
            dataset: Callable returning the data set model; it is also
                passed on to ``export_result_prob``.
            params: Hyper-parameters; reads 'dimension', 'learning_rate',
                'margin', 'regularizer_scale', 'batchSize' and 'epochs'.
        """
        model = dataset()
        logger = get_logger('RL.Test.erer.LogisticTransH.ERER.' + str(model))
        entA, entB, relA, relB, triA, triB, entity_pairs, prior_pairs, true_pairs = model.get_erer_model(
        )

        # Every true pair must be among the candidate entity pairs.
        self.assertTrue(all([(tp in entity_pairs) for tp in true_pairs]))
        # Generate embeddings for data set A.
        transh = TransH(entA,
                        relA,
                        triA,
                        prior_pairs,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'])
        loss = transh.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)
        ent_embeddingsA = transh.get_ent_embeddings()
        transh.close_tf_session()
        del transh

        # Generate embeddings for data set B.
        transh = TransH(entB,
                        relB,
                        triB,
                        entity_pairs,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'])
        loss = transh.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)
        ent_embeddingsB = transh.get_ent_embeddings()
        transh.close_tf_session()

        # One array per entity, so each entity becomes a DataFrame row.
        ent_embeddingsA = [
            np.array(ent_embeddingsA[i])
            for i in range(ent_embeddingsA.shape[0])
        ]
        ent_embeddingsB = [
            np.array(ent_embeddingsB[i])
            for i in range(ent_embeddingsB.shape[0])
        ]
        trainDataA = pd.DataFrame(data=ent_embeddingsA)
        trainDataB = pd.DataFrame(data=ent_embeddingsB)

        # Define the comparison: one numeric comparison per dimension.
        compare_cl = recordlinkage.Compare()
        for i in range(0, params['dimension']):
            compare_cl.numeric(i, i, label=str(i))  #method='exp')

        # Sample one negative pair for every prior (positive) pair.
        train_pairs = []
        tuple_pp = set(map(tuple, prior_pairs))
        logger.info("Number of prior_pairs: %d", len(prior_pairs))
        # `range` instead of `xrange`, which does not exist on Python 3;
        # built once because len(entB) does not change inside the loop.
        entity_b_indices = range(0, len(entB))
        for e1, e2 in prior_pairs:
            train_pairs.append((e1, e2))
            # Redraw until the candidate is neither the positive partner
            # nor part of another prior pair.
            while True:
                neg_e2 = random.choice(entity_b_indices)
                if neg_e2 == e2 or (e1, neg_e2) in tuple_pp:
                    continue
                else:
                    train_pairs.append((e1, neg_e2))
                    break
        logger.info("Number of Train Pairs: %d", len(train_pairs))
        candidate_links = pd.MultiIndex.from_tuples(train_pairs)
        features = compare_cl.compute(candidate_links, trainDataA, trainDataB)
        logger.info("Train Features %s", str(features.describe()))

        # Train the logistic regression model; the prior pairs are the
        # positive class.
        logrg = recordlinkage.LogisticRegressionClassifier()
        candidate_links = pd.MultiIndex.from_tuples(prior_pairs)
        logrg.fit(features, candidate_links)

        # Test the classifier on all entity pairs.
        compare_cl = recordlinkage.Compare()
        for i in range(0, params['dimension']):
            compare_cl.numeric(i, i, label=str(i))
        candidate_links = pd.MultiIndex.from_tuples(entity_pairs)
        features = compare_cl.compute(candidate_links, trainDataA, trainDataB)
        logger.info("Test Features %s", str(features.describe()))
        result = logrg.predict(features)
        log_quality_results(logger, result, true_pairs, len(entity_pairs))

        # Score every pair with 1 - probability for the IR metrics.
        prob_series = logrg.prob(features)
        prob = [(1 - p) for p in prob_series.tolist()]
        result_prob = [(entity_pairs[i][0], entity_pairs[i][1], prob[i])
                       for i in range(0, len(prob))]
        ir_metrics = InformationRetrievalMetrics(result_prob, true_pairs)
        # NOTE(review): `params` is passed twice here -- confirm against the
        # log_metrics signature.
        ir_metrics.log_metrics(logger, params, params)

        # Export results.
        export_embeddings('erer', str(model), 'LogTransH', entA,
                          ent_embeddingsA)
        export_embeddings('erer', str(model), 'LogTransH', entB,
                          ent_embeddingsB)
        export_result_prob(dataset, 'erer', str(model), 'LogTransH', entA,
                           result_prob, true_pairs, entB)
示例#19
0
def run_experiment(win_len, preproc, comparison_variant, run_only=None):
    """Run one indexing / comparison / classification experiment.

    The function reads the module-level globals ``dataDBLP_train``,
    ``dataScholar_train``, ``dataDBLP_test``, ``dataScholar_test``,
    ``links_train`` and ``debug``, and calls ``print_experiment_evaluation``
    when ``debug`` is truthy.

    Args:
        win_len: ``0`` selects ``recordlinkage.BlockIndex('year')``; a
            positive value selects
            ``recordlinkage.SortedNeighbourhoodIndex('year', window=win_len)``.
        preproc: Preprocessing variant ``0``-``5``; it selects the suffix
            appended to the compared field names (``""``, ``"_clean"``,
            ``"_soundex"``, ``"_nysiis"``, ``"_metaphone"``,
            ``"_match_rating"``).
        comparison_variant: ``0`` for exact comparison, ``1``-``7`` for a
            string comparison using levenshtein, damerau_levenshtein, jaro,
            jarowinkler, qgram, cosine or smith_waterman respectively.
        run_only: Optional classifier name (``'logreg'``, ``'bayes'``,
            ``'svm'``, ``'kmeans'`` or ``'ecm'``). When given, only that
            classifier is run; otherwise all five are run in that order.

    Returns:
        A list with one tuple per classifier that was run:
        ``(index_description, preproc_description, comp_description,
        classifier_description, match, compare_ms, train_ms, classify_ms)``.
        The three timings are ``time.time()`` differences multiplied by 1000.

    Raises:
        ValueError: If ``win_len`` is negative, or if ``preproc``,
            ``comparison_variant`` or ``run_only`` is not a known variant.
    """
    # (name, banner, factory, supervised) in execution order.  Factories are
    # lambdas so a classifier class is only looked up when it is really used.
    classifier_variants = (
        ('logreg', "Logistic Regression classifier",
         lambda: recordlinkage.LogisticRegressionClassifier(), True),
        ('bayes', "Naive Bayes classifier",
         lambda: recordlinkage.NaiveBayesClassifier(binarize=0.75), True),
        ('svm', "Support Vector Machine classifier",
         lambda: recordlinkage.SVMClassifier(), True),
        ('kmeans', "KMeans classifier",
         lambda: recordlinkage.KMeansClassifier(), False),
        ('ecm', "ECM classifier",
         lambda: recordlinkage.ECMClassifier(binarize=0.75), False),
    )
    # Fail fast: previously an unknown name silently skipped every classifier
    # and returned an empty list after all the expensive comparison work.
    if run_only is not None and all(
            run_only != name for name, _, _, _ in classifier_variants):
        raise ValueError(f"Unknown classifier variant {run_only}")

    # window length
    if win_len == 0:
        index_description = "block"
        indexer = recordlinkage.BlockIndex('year')
    elif win_len > 0:
        index_description = f"nb{win_len}"
        indexer = recordlinkage.SortedNeighbourhoodIndex('year',
                                                         window=win_len)
    else:
        raise ValueError(f"Invalid window length {win_len}")
    pairs_train = indexer.index(dataDBLP_train, dataScholar_train)
    pairs_test = indexer.index(dataDBLP_test, dataScholar_test)
    if debug:
        print(f"Number of candidates (index={index_description}):")
        print(f"{len(pairs_train)} (train), {len(pairs_test)} (test)")

    # preprocessing: variant -> (banner, field suffix, description)
    preproc_variants = {
        0: ("No preprocesing", "", "none"),
        1: ("Cleaned fields", "_clean", "clean"),
        2: ("Soundex encoding", "_soundex", "soundex"),
        3: ("Nysiis encoding", "_nysiis", "nysiis"),
        4: ("Metaphone encoding", "_metaphone", "metaphone"),
        5: ("Match-rating encoding", "_match_rating", "match_rating"),
    }
    try:
        banner, field_suffix, preproc_description = preproc_variants[preproc]
    except (KeyError, TypeError):
        raise ValueError(f"Unknown preprocessing variant {preproc}") from None
    print(banner)
    print(f"Preprocessing used: {preproc_description}")

    # comparator: variant 0 is an exact match, 1-7 are string similarity
    # methods whose description doubles as the `method` argument.
    string_methods = {
        1: "levenshtein",
        2: "damerau_levenshtein",
        3: "jaro",
        4: "jarowinkler",
        5: "qgram",
        6: "cosine",
        7: "smith_waterman",
    }
    columns = [field + field_suffix
               for field in ('title', 'authors', 'venue')]
    comp = recordlinkage.Compare()
    if comparison_variant == 0:
        comp_description = "exact"
        for column in columns:
            comp.add(compare.Exact(column, column))
    else:
        try:
            comp_description = string_methods[comparison_variant]
        except (KeyError, TypeError):
            raise ValueError(
                f"Unknown comparison variant {comparison_variant}") from None
        for column in columns:
            comp.add(
                compare.String(column, column, method=comp_description))
    print(f"String comparison: {comp_description}")

    print("Start compare for training data set")
    start = time.time()
    result_train = comp.compute(pairs_train, dataDBLP_train, dataScholar_train)
    print("Compare on training data took %.2fs" % (time.time() - start))
    print("Start compare for test data set")
    start = time.time()
    result_test = comp.compute(pairs_test, dataDBLP_test, dataScholar_test)
    # save time compare for evaluation
    time_compare = time.time() - start
    print("Compare on test data took %.2fs" % (time_compare))

    matches = []
    for classifier_description, banner, factory, supervised in \
            classifier_variants:
        # skip others if only one classifier is requested
        if run_only is not None and run_only != classifier_description:
            continue
        print(banner)
        classifier = factory()

        if supervised:
            # Supervised: fit on the training comparison vectors, then
            # classify the test vectors; both phases are timed separately.
            start = time.time()
            classifier.fit(result_train, links_train)
            time_train = time.time() - start
            start = time.time()
            match = classifier.predict(result_test)
            time_classify = time.time() - start
        else:
            # Unsupervised: fitting and prediction happen in one call on the
            # test vectors, so no separate training time is recorded.
            start = time.time()
            match = classifier.fit_predict(result_test)
            time_classify = time.time() - start
            time_train = 0
        matches.append(
            (index_description, preproc_description, comp_description,
             classifier_description, match, 1000 * time_compare,
             1000 * time_train, 1000 * time_classify))

        if debug:
            print("%d matches" % len(match))
            print_experiment_evaluation(
                match, "-".join((index_description, preproc_description,
                                 comp_description)))

    return matches
    def test_logistic(self):
        """Fit a logistic regression classifier on Census and export results.

        Steps, in order:

        1. Compute comparison features for the training candidate links, fit
           ``recordlinkage.LogisticRegressionClassifier`` on them and log the
           quality of its predictions on the same features.
        2. Log prediction quality on the validation links and on the test
           links.
        3. Log information-retrieval metrics computed from ``1 - p`` for each
           value ``p`` returned by ``logrg.prob`` on the test features.
        4. Translate record labels into positions within the generated
           entity-name lists and export result probabilities, false
           negatives, false positives and a human readable feature mapping.
        """
        logger = get_logger('RL.Test.LogisticRegression.Census')

        census = Census()

        compare_cl = census.get_comparision_object()
        features = compare_cl.compute(census.candidate_links,
                                      census.trainDataA, census.trainDataB)
        logger.info("Train Features %s", str(features.describe()))

        # Train the logistic regression classifier
        logrg = recordlinkage.LogisticRegressionClassifier()
        logrg.fit(features, census.true_links)

        result = logrg.predict(features)
        log_quality_results(logger, result, census.true_links,
                            len(census.candidate_links))

        # Validate the classifier
        compare_cl = census.get_comparision_object()
        features = compare_cl.compute(census.val_links, census.valDataA,
                                      census.valDataB)
        logger.info("Validation Features %s", str(features.describe()))
        result = logrg.predict(features)
        log_quality_results(logger, result, census.true_val_links,
                            len(census.val_links))

        # Test the classifier
        compare_cl = census.get_comparision_object()
        features = compare_cl.compute(census.test_links, census.testDataA,
                                      census.testDataB)
        logger.info("Test Features %s", str(features.describe()))

        result = logrg.predict(features)
        log_quality_results(logger, result, census.true_test_links,
                            len(census.test_links))

        logger.info("logrg coefficients: %s", str(logrg.coefficients))
        # Log IR stats: MRR, MAP, MP@K
        prob_series = logrg.prob(features)
        prob = [(1 - p) for p in prob_series.tolist()]
        result_prob = [(census.test_links[i][0], census.test_links[i][1], p)
                       for i, p in enumerate(prob)]
        ir_metrics = InformationRetrievalMetrics(result_prob,
                                                 census.true_test_links)
        ir_metrics.log_metrics(logger)

        # Export false positives and result probabilities.
        # Pairs predicted as matches, with their feature values as strings;
        # still keyed by the original record labels at this point.
        result_feature_mapping = [
            (e1, e2, [str(v) for v in features.loc[(e1, e2)].values], d)
            for (e1, e2, d) in result_prob if (e1, e2) in result
        ]

        id_column = census.field_map[CensusFields.ID_INDIVIDUAL]
        dni_column = census.field_map[CensusFields.DNI]

        def entity_name(row):
            """Build the "<individual id>_<dni>" name of one record row."""
            return "_".join([str(row[id_column]), str(row[dni_column])])

        def make_locator(data, names):
            """Return a function mapping a record label to its position.

            The position is the first index of the record's entity name in
            ``names`` (the same answer ``names.index`` gives), resolved via a
            dict instead of a linear scan and memoised per label.
            """
            first_position = {}
            for position, name in enumerate(names):
                first_position.setdefault(name, position)
            resolved = {}

            def locate(label):
                key = int(label)
                if key not in resolved:
                    name = entity_name(data.loc[key])
                    try:
                        resolved[key] = first_position[name]
                    except KeyError:
                        # Keep the exception type list.index() would raise.
                        raise ValueError(
                            f"{name!r} is not in list") from None
                return resolved[key]

            return locate

        entitiesA = [
            entity_name(census.testDataA.iloc[i])
            for i in range(int(census.testDataA.shape[0]))
        ]
        entitiesB = [
            entity_name(census.testDataB.iloc[i])
            for i in range(int(census.testDataB.shape[0]))
        ]
        locate_a = make_locator(census.testDataA, entitiesA)
        locate_b = make_locator(census.testDataB, entitiesB)

        # From here on pairs are expressed as positions in entitiesA/B.
        result_prob = [(locate_a(a), locate_b(b), p)
                       for (a, b, p) in result_prob]
        true_links = [(locate_a(a), locate_b(b))
                      for (a, b) in census.true_test_links]
        export_result_prob(Census, 'LogisticRegression', 'census', 'logistic',
                           entitiesA, result_prob, true_links, entitiesB)

        result = [(locate_a(a), locate_b(b)) for (a, b) in result]
        export_false_negatives(Census, 'LogisticRegression', 'census',
                               'logistic', entitiesA, result_prob, true_links,
                               result, entitiesB)
        export_false_positives(Census, 'LogisticRegression', 'census',
                               'logistic', entitiesA, result_prob, true_links,
                               result, entitiesB)

        # NOTE(review): this reassigned `result` is not read by any code
        # visible below; kept as-is in case later code relies on it.
        weights = logrg.coefficients
        predicted_pairs = set(result)
        result = [
            (e1, e2,
             [str("%.2f" % (float(d * w) / sum(weights))) for w in weights], d)
            for (e1, e2, d) in result_prob if (e1, e2) in predicted_pairs
        ]

        result_feature_mapping = [
            (locate_a(a), locate_b(b), w, p)
            for (a, b, w, p) in result_feature_mapping
        ]
        export_human_readable_results(Census, 'LogisticRegression', 'census',
                                      'logistic', entitiesA,
                                      result_feature_mapping, entitiesB)