def svm_classifier(features, links_true, train_size=0.2, cv=None):
    """Support vector machine classifier.

    Without ``cv`` the classifier is fitted on the leading ``train_size``
    fraction of ``features`` and then used to predict on all of ``features``.
    With ``cv`` both outputs are taken from ``cross_val_predict``.

    Args:
        features: Comparison vectors; must expose ``.index`` and support
            ``len()`` and positional slicing.
        links_true: Known true links; only its ``.index`` is read in the
            non-``cv`` branch.
        train_size: Fraction of ``features`` (taken from the start) used for
            training when ``cv`` is None. Must satisfy ``0 < train_size < 1``.
        cv: Passed through to ``cross_val_predict``; ``None`` selects the
            simple leading-fraction split.

    Returns:
        Tuple ``(matches, df_svm)``: the predicted matches and the decision
        scores (a one-column ``'score'`` DataFrame in the non-``cv`` branch).

    Raises:
        ValueError: If ``cv`` is None and ``train_size`` is not strictly
            between 0 and 1.
    """
    # Fail fast, before building the classifier; train_size is only used
    # by the non-cv branch, so cv callers are not affected.
    if cv is None and not 0 < train_size < 1:
        raise ValueError(
            "train_size must be greater than 0 and less than 1, "
            f"got {train_size!r}"
        )

    svm = rl.SVMClassifier()

    if cv is None:
        # Explicit set intersection: the `&` operator on Index objects is
        # deprecated as a set operation and is element-wise in pandas 2.
        golden_match_index = features.index.intersection(links_true.index)
        train_index = int(len(features) * train_size)

        # Train the model on the leading slice only
        svm.fit(features[0:train_index], golden_match_index)

        # Predict the match status for all record pairs
        matches = svm.predict(features)

        df_svm = pd.DataFrame(svm.kernel.decision_function(features))
        df_svm.columns = ['score']
    else:
        # NOTE(review): if this is scikit-learn's cross_val_predict, the
        # fourth positional argument is not `cv` and the result is an array
        # without `.index` - confirm which implementation is imported.
        df_results = cross_val_predict(svm, features, links_true, cv,
                                       method='predict')
        matches = df_results.index
        df_svm = cross_val_predict(svm, features, links_true, cv,
                                   method='decision_function')

    return matches, df_svm
def init_model(classifier, num_features, **kwargs):
    """Instantiate the classifier identified by ``classifier``.

    Args:
        classifier: One of the classifier keys defined in ``keys``.
        num_features: Forwarded to the perceptron classifiers only.
        **kwargs: Forwarded to the selected classifier's constructor.

    Returns:
        The newly constructed model.

    Raises:
        ValueError: If ``classifier`` matches none of the supported keys.
    """
    # Keys are compared with `==`: an identity check (`is`) rejects a value
    # that is equal to a key but is not the very same object.
    if classifier == keys.NAIVE_BAYES:
        # add `binarize` threshold if not already specified
        if "binarize" not in kwargs:
            kwargs["binarize"] = constants.NAIVE_BAYES_BINARIZE
        model = rl.NaiveBayesClassifier(**kwargs)

    elif classifier == keys.LINEAR_SVM:
        model = rl.SVMClassifier(**kwargs)

    elif classifier == keys.SVM:
        model = classifiers.SVCClassifier(**kwargs)

    elif classifier == keys.SINGLE_LAYER_PERCEPTRON:
        model = classifiers.SingleLayerPerceptron(num_features, **kwargs)

    elif classifier == keys.MULTI_LAYER_PERCEPTRON:
        model = classifiers.MultiLayerPerceptron(num_features, **kwargs)

    else:
        err_msg = (
            f'Classifier not supported: {classifier}. '
            f'It should be one of {set(constants.CLASSIFIERS)}'
        )
        LOGGER.critical(err_msg)
        raise ValueError(err_msg)

    LOGGER.info('Model initialized: %s', model)
    return model
def test_svm(self):
    """Fit and predict with the SVM classifier; ``prob`` must be unavailable."""
    classifier = rl.SVMClassifier()
    classifier.fit(self.X_train, self.y_train)
    classifier.predict(self.X_test)

    # There are no probabilities
    with pytest.raises(AttributeError):
        classifier.prob(self.X_train)
def test_svm(self):
    """Learn and predict with the SVM classifier; ``prob`` must be unavailable."""
    classifier = recordlinkage.SVMClassifier()
    classifier.learn(self.y_train, self.matches_index)
    classifier.predict(self.y)

    # There are no probabilities
    with pytest.raises(AttributeError):
        classifier.prob(self.y)
def init_model(classifier: str, num_features: int, **kwargs):
    """Instantiate the classifier identified by ``classifier``.

    For the naive Bayes, logistic regression and linear SVM classifiers the
    project defaults from ``constants`` are merged with ``kwargs``; values
    supplied by the caller win over the defaults.

    Args:
        classifier: One of the classifier keys defined in ``keys``.
        num_features: Forwarded to the perceptron and ensemble classifiers.
        **kwargs: Forwarded to the selected classifier's constructor.

    Returns:
        The newly constructed model.

    Raises:
        ValueError: If ``classifier`` matches none of the supported keys.
    """
    # Keys are strings, so compare with `==`: an identity check (`is`)
    # rejects an equal string that is not the very same object.
    if classifier == keys.NAIVE_BAYES:
        # Defaults first, caller overrides second
        kwargs = {**constants.NAIVE_BAYES_PARAMS, **kwargs}
        model = rl.NaiveBayesClassifier(**kwargs)

    elif classifier == keys.LOGISTIC_REGRESSION:
        kwargs = {**constants.LOGISTIC_REGRESSION_PARAMS, **kwargs}
        model = rl.LogisticRegressionClassifier(**kwargs)

    elif classifier == keys.LINEAR_SVM:
        kwargs = {**constants.LINEAR_SVM_PARAMS, **kwargs}
        model = rl.SVMClassifier(**kwargs)

    elif classifier == keys.SVM:
        model = classifiers.SVCClassifier(**kwargs)

    elif classifier == keys.RANDOM_FOREST:
        model = classifiers.RandomForest(**kwargs)

    elif classifier == keys.SINGLE_LAYER_PERCEPTRON:
        model = classifiers.SingleLayerPerceptron(num_features, **kwargs)

    elif classifier == keys.MULTI_LAYER_PERCEPTRON:
        model = classifiers.MultiLayerPerceptron(num_features, **kwargs)

    elif classifier == keys.VOTING_CLASSIFIER:
        model = classifiers.VotingClassifier(num_features, **kwargs)

    elif classifier == keys.GATED_CLASSIFIER:
        model = classifiers.GatedEnsembleClassifier(num_features, **kwargs)

    elif classifier == keys.STACKED_CLASSIFIER:
        model = classifiers.StackedEnsembleClassifier(num_features, **kwargs)

    else:
        err_msg = (
            f'Classifier not supported: {classifier}. '
            f'It should be one of {set(constants.CLASSIFIERS)}'
        )
        LOGGER.critical(err_msg)
        raise ValueError(err_msg)

    LOGGER.info('Model initialized: %s', model)
    return model
# Draw a random sample of at most 2,000,000 rows from the dataframe
data_sample = data.take(np.random.permutation(len(data))[:2000000])

# Tentative matches: rows whose values sum to more than 6
# NOTE(review): this reads `dfA_sample`, not the `data_sample` built above -
# confirm which frame is intended.
matches = dfA_sample[dfA_sample.sum(axis=1) > 6]

# Tentative non-matches (disabled)
#nonmatches = data_sample[data_sample.sum(axis=1) < 4]

# Index of the tentative matches
match_index = matches.index

# Training dataset and the matches that fall inside it
golden_pairs = data_sample[0:2000000]
# Explicit set intersection: the `&` operator on Index objects is deprecated
# as a set operation and is element-wise in pandas 2.
golden_matches_index = golden_pairs.index.intersection(match_index)

# Train the classifier
svm = rl.SVMClassifier()
svm.learn(golden_pairs, golden_matches_index)

# Predict the match status for all record pairs
result_svm = svm.predict(data)
len(result_svm)

# Confusion matrix of the tentative matches against the predictions
conf_svm = rl.confusion_matrix(match_index, result_svm, len(data))
conf_svm

# The F-score for this classification is
rl.fscore(conf_svm)

m_last = pd.DataFrame(result_svm)

#loading data for review
# Preprocessing variants: id -> (console banner, column suffix, short label).
_PREPROC_VARIANTS = {
    0: ("No preprocesing", "", "none"),
    1: ("Cleaned fields", "_clean", "clean"),
    2: ("Soundex encoding", "_soundex", "soundex"),
    3: ("Nysiis encoding", "_nysiis", "nysiis"),
    4: ("Metaphone encoding", "_metaphone", "metaphone"),
    5: ("Match-rating encoding", "_match_rating", "match_rating"),
}

# Comparison variants 1-7 all use ``compare.String``: id -> ``method`` value,
# which doubles as the short label. Variant 0 (exact) is handled separately.
_STRING_COMPARE_METHODS = {
    1: "levenshtein",
    2: "damerau_levenshtein",
    3: "jaro",
    4: "jarowinkler",
    5: "qgram",
    6: "cosine",
    7: "smith_waterman",
}

# Base column names compared between the two data sets, in insertion order.
_COMPARED_FIELDS = ("title", "authors", "venue")

# Classifiers tried per experiment, in order:
# (short label, console banner, lazy factory, needs training data).
_EXPERIMENT_CLASSIFIERS = (
    ("logreg", "Logistic Regression classifier",
     lambda: recordlinkage.LogisticRegressionClassifier(), True),
    ("bayes", "Naive Bayes classifier",
     lambda: recordlinkage.NaiveBayesClassifier(binarize=0.75), True),
    ("svm", "Support Vector Machine classifier",
     lambda: recordlinkage.SVMClassifier(), True),
    ("kmeans", "KMeans classifier",
     lambda: recordlinkage.KMeansClassifier(), False),
    ("ecm", "ECM classifier",
     lambda: recordlinkage.ECMClassifier(binarize=0.75), False),
)


def _build_indexer(win_len):
    """Return ``(indexer, label)`` for the given window length."""
    if win_len == 0:
        return recordlinkage.BlockIndex('year'), "block"
    if win_len > 0:
        indexer = recordlinkage.SortedNeighbourhoodIndex('year',
                                                         window=win_len)
        return indexer, f"nb{win_len}"
    raise ValueError(f"Invalid window length {win_len}")


def _build_comparator(comparison_variant, field_suffix):
    """Return ``(comparator, label)`` comparing title, authors and venue."""
    comp = recordlinkage.Compare()
    columns = [field + field_suffix for field in _COMPARED_FIELDS]

    if comparison_variant == 0:
        label = "exact"
        for column in columns:
            comp.add(compare.Exact(column, column))
        return comp, label

    try:
        label = _STRING_COMPARE_METHODS[comparison_variant]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unknown comparison variant {comparison_variant}") from None
    for column in columns:
        comp.add(compare.String(column, column, method=label))
    return comp, label


def _train_and_classify(classifier, supervised, result_train, result_test):
    """Return ``(match, time_train, time_classify)`` for one classifier."""
    if supervised:
        start = time.time()
        classifier.fit(result_train, links_train)
        time_train = time.time() - start

        start = time.time()
        match = classifier.predict(result_test)
        time_classify = time.time() - start
    else:
        # Unsupervised classifiers fit and predict on the test data in one
        # step, so no separate training time is recorded.
        start = time.time()
        match = classifier.fit_predict(result_test)
        time_classify = time.time() - start
        time_train = 0
    return match, time_train, time_classify


def run_experiment(win_len, preproc, comparison_variant, run_only=None):
    """Run one indexing / comparison configuration through the classifiers.

    Reads the module-level ``dataDBLP_*``, ``dataScholar_*``, ``links_train``
    and ``debug`` names.

    Args:
        win_len: ``0`` selects blocking on ``'year'``; a positive value is
            the sorted-neighbourhood window on ``'year'``.
        preproc: Preprocessing variant id (0-5); selects the column suffix.
        comparison_variant: ``0`` for exact comparison, 1-7 for one of the
            string similarity methods.
        run_only: If given, only the classifier with this label is run
            (``'logreg'``, ``'bayes'``, ``'svm'``, ``'kmeans'``, ``'ecm'``).
            Any other label runs nothing and an empty list is returned.

    Returns:
        A list with one tuple per classifier run: ``(index_description,
        preproc_description, comp_description, classifier_description, match,
        compare_ms, train_ms, classify_ms)``.

    Raises:
        ValueError: For a negative ``win_len`` or an unknown ``preproc`` or
            ``comparison_variant``.
    """
    # window length
    indexer, index_description = _build_indexer(win_len)

    pairs_train = indexer.index(dataDBLP_train, dataScholar_train)
    pairs_test = indexer.index(dataDBLP_test, dataScholar_test)
    if debug:
        print(f"Number of candidates (index={index_description}):")
        print(f"{len(pairs_train)} (train), {len(pairs_test)} (test)")

    # preprocessing
    try:
        banner, field_suffix, preproc_description = _PREPROC_VARIANTS[preproc]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unknown preprocessing variant {preproc}") from None
    print(banner)
    print(f"Preprocessing used: {preproc_description}")

    # comparator
    comp, comp_description = _build_comparator(comparison_variant,
                                               field_suffix)
    print(f"String comparison: {comp_description}")

    print("Start compare for training data set")
    start = time.time()
    result_train = comp.compute(pairs_train, dataDBLP_train,
                                dataScholar_train)
    print(f"Compare on training data took {time.time() - start:.2f}s")

    print("Start compare for test data set")
    start = time.time()
    result_test = comp.compute(pairs_test, dataDBLP_test, dataScholar_test)
    # save time compare for evaluation
    time_compare = time.time() - start
    print(f"Compare on test data took {time_compare:.2f}s")

    matches = []
    for (classifier_description, banner, make_classifier,
         supervised) in _EXPERIMENT_CLASSIFIERS:
        # skip others if only one classifier is requested
        if run_only is not None and run_only != classifier_description:
            continue

        print(banner)
        match, time_train, time_classify = _train_and_classify(
            make_classifier(), supervised, result_train, result_test)

        # Timings are stored in milliseconds
        matches.append(
            (index_description, preproc_description, comp_description,
             classifier_description, match, 1000 * time_compare,
             1000 * time_train, 1000 * time_classify))

        if debug:
            print(f"{len(match)} matches")
            print_experiment_evaluation(
                match, "-".join((index_description, preproc_description,
                                 comp_description)))

    return matches
def get_matches(locu_train_path, foursquare_train_path, matches_train_path,
                locu_test_path, foursquare_test_path):
    """Train on the labelled data and write the predicted test matches.

    Loads the Locu and Foursquare train/test JSON files and the training
    matches CSV, and builds comparison features with the module's helpers.
    It then trains a logistic regression and an SVM classifier, and writes
    the pairs the SVM predicts on the test data to ``matches_test.csv``.

    The confusion matrices and the joined, column-interleaved view of the
    matched records are computed but neither printed nor returned.

    Returns:
        None.
    """
    # Load the raw inputs
    foursquare_train = pd.read_json(foursquare_train_path)
    locu_train = pd.read_json(locu_train_path)
    foursquare_test = pd.read_json(foursquare_test_path)
    locu_test = pd.read_json(locu_test_path)
    labelled_matches = pd.read_csv(matches_train_path)

    # Clean both data sets and the labelled matches
    locu_train, foursquare_train = preprocess(locu_train, foursquare_train)
    locu_test, foursquare_test = preprocess(locu_test, foursquare_test)
    labelled_matches = preprocess_matches(labelled_matches)

    # Candidate pairs, then comparison features for those pairs
    train_candidates = index_pairs(locu_train, foursquare_train)
    test_candidates = index_pairs(locu_test, foursquare_test)

    features = compare_strings(locu_train, foursquare_train,
                               train_candidates)
    test_features = compare_strings(locu_test, foursquare_test,
                                    test_candidates)

    train_pairs, train_matches_index, all_matches_index = traintestsplit(
        features, labelled_matches)

    # Train Logistic Regression classifier
    logreg_model = recordlinkage.LogisticRegressionClassifier()
    logreg_model.learn(train_pairs, train_matches_index)

    # Train SVM classifier
    svm_model = recordlinkage.SVMClassifier()
    svm_model.learn(train_pairs, train_matches_index)

    # Predict on training data with both classifiers
    svm_train_hits = predict(features, svm_model)
    logreg_train_hits = predict(features, logreg_model)

    # Training results (kept for inspection; not printed)
    pair_count = len(features)
    svm_confn_matrix = recordlinkage.confusion_matrix(
        all_matches_index, svm_train_hits, pair_count)
    logreg_confn_matrix = recordlinkage.confusion_matrix(
        all_matches_index, logreg_train_hits, len(features))

    # Predict on test data with SVM
    svm_test_hits = predict(test_features, svm_model)

    # Format and write to CSV: only the (locu_id, foursquare_id) index is
    # kept, every feature column is dropped.
    test_features.index = test_features.index.rename(
        ['locu_id', 'foursquare_id'])
    predicted_rows = test_features.loc[svm_test_hits]
    matches_test = predicted_rows.drop(predicted_rows.columns[::], axis=1)
    matches_test.to_csv('matches_test.csv')

    # Look up the matched records on each side
    matched_pairs = list(matches_test.index)
    locu_ids = [pair[0] for pair in matched_pairs]
    foursquare_ids = [pair[1] for pair in matched_pairs]
    locu_hits = locu_test.loc[locu_ids]
    foursquare_hits = foursquare_test.loc[foursquare_ids]

    # Full side-by-side view of the matched records
    with_foursquare = matches_test.reset_index().join(
        foursquare_hits, on=['foursquare_id'])
    with_both = with_foursquare.join(locu_hits, on=['locu_id'],
                                     lsuffix='_foursquare',
                                     rsuffix='_locu')
    full_view = with_both.set_index(matches_test.index.names)

    # Interleave the first seven columns with the next seven
    column_names = np.array(full_view.columns.tolist())
    interleaved = [0, 7, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13]
    test_matches_reordered = full_view[list(column_names[interleaved])]

    # print("Successfully wrote results to matches_test.csv")
    return
def create_and_train_svm():
    """Build an ``rl.SVMClassifier`` and return the result of training it.

    The new classifier is handed to ``train_supervised_classifier`` and
    whatever that returns is passed back to the caller.
    """
    classifier = rl.SVMClassifier()
    return train_supervised_classifier(classifier)