def test_language_score(self):
    """Tests that the language comparison is correctly implemented"""
    article1 = Article(language='eng')
    article2 = Article(language='eng')
    article3 = Article(language='ENG')

    self.assertEqual(1, ArticlePair(article1, article2).get_language_score())
    self.assertEqual(1, ArticlePair(article1, article3).get_language_score())
def test_mesh_score(self):
    """Tests that the number of MeSH keywords shared by the articles is correctly computed, ignoring lower/upper-case differences"""
    article1 = Article(mesh_terms=['test', 'Testing', 'unittest'])
    article2 = Article(mesh_terms=['test', 'TESTING'])
    article3 = Article(mesh_terms=[])

    self.assertEqual(2, ArticlePair(article1, article2).get_mesh_score())
    self.assertEqual(0, ArticlePair(article1, article3).get_mesh_score())
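
# Illustrative only: a minimal sketch of the case-insensitive shared-term count
# that get_mesh_score is expected to perform. The standalone helper and its name
# are assumptions made for readability; the real logic lives in ArticlePair.
def _example_shared_term_count(terms_a, terms_b):
    """Counts terms present in both lists, ignoring upper/lower case."""
    return len({t.lower() for t in terms_a} & {t.lower() for t in terms_b})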
def test_authors_score(self):
    """Tests that the number of co-authors shared by the two articles is correctly computed"""
    author1 = Author("Testing", "Test", "T.T.")
    author2 = Author("Resting", "Rest", "R.R.")
    author3 = Author("John", "Doe", "J.D.")

    article1 = Article(main_author=author1, authors=[author1, author2, author3])
    article2 = Article(main_author=author1, authors=[author1, author2, author3])
    article3 = Article(main_author=author1, authors=[author1, author2])

    self.assertEqual(2, ArticlePair(article1, article2).get_coauthors_score())
    self.assertEqual(1, ArticlePair(article1, article3).get_coauthors_score())
def test_initials_score(self):
    """Tests that the binary information (main authors' initials match) is correctly detected"""
    author1 = Author("Lastname", "Forename", "L.F.")
    author2 = Author("Lastname", "Test", "L.T.")

    article1 = Article(main_author=author1)
    article2 = Article(main_author=author2)
    article3 = Article()

    self.assertEqual(1, ArticlePair(article1, article1).get_initials_score())
    self.assertEqual(0, ArticlePair(article1, article2).get_initials_score())
    self.assertEqual(-1, ArticlePair(article1, article3).get_initials_score())
def print_feature_importances(classifier):
    """Prints the relative importance of each feature used by the classifier"""
    features_importances = classifier.feature_importances_()
    features_names = ArticlePair.feature_names()

    print("\nFeature Importances:")
    for i in range(len(features_names)):
        print(features_names[i] + ": " + str(features_importances[i] * 100) + "%")
def test_vector_score(self):
    """Tests that the cosine similarity between the articles' vectors is correctly calculated"""
    x1, y1, z1 = 3, 5, 2
    x2, y2, z2 = 9, 2, 3

    v1 = [x1, y1, z1]
    v2 = [x2, y2, z2]

    article1 = Article(vector=v1)
    article2 = Article(vector=v2)
    pair = ArticlePair(article1, article2)

    expected = ((x1 * x2) + (y1 * y2) + (z1 * z2)) \
        / (math.sqrt(x1**2 + y1**2 + z1**2) * math.sqrt(x2**2 + y2**2 + z2**2))
    score = pair.get_vector_score()

    self.assertEqual(expected, score)
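
# Illustrative only: a minimal sketch of the cosine similarity that
# get_vector_score is expected to return. The standalone helper and its name are
# assumptions made for readability; the tested implementation belongs to ArticlePair.
def _example_cosine_similarity(v1, v2):
    """Returns the cosine of the angle between two equal-length vectors."""
    import math  # local import keeps the sketch self-contained

    dot = sum(a * b for a, b in zip(v1, v2))
    norm1 = math.sqrt(sum(a ** 2 for a in v1))
    norm2 = math.sqrt(sum(b ** 2 for b in v2))
    return dot / (norm1 * norm2)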
def test_date_score(self):
    """Tests that the date score reflects the absolute distance between the articles' dates"""
    d1 = datetime.datetime(2018, 1, 1)
    d2 = datetime.datetime(2019, 1, 1)

    article1 = Article(date=d1)
    article2 = Article(date=d2)

    same_pair = ArticlePair(article1, article1)
    real_pair1 = ArticlePair(article1, article2)
    real_pair2 = ArticlePair(article2, article1)

    self.assertEqual(0, same_pair.get_date_score())
    self.assertEqual(1, real_pair1.get_date_score())
    self.assertEqual(1, real_pair2.get_date_score())
def test_ambiguity_score(self):
    """Tests that the ambiguity score is correctly averaged"""
    article1 = Article(ambiguity=0.10)
    article2 = Article(ambiguity=0.20)

    ambiguity = ArticlePair(article1, article2).get_ambiguity_score()
    error = abs(ambiguity - 0.15)  # Floating-point approximation error is around 2.77e-17

    self.assertTrue(error < 1e-16)
def test_lnlength_score(self):
    """Tests that the articles' last name lengths are correctly averaged"""
    author1 = Author('Pulfer', 'Brian', 'P.B.')
    author2 = Author('Doe', 'John', 'DJ')
    author3 = Author('Case', 'Test', 'C.T.')

    article1 = Article(authors=[author1, author2])
    article2 = Article(authors=[author1, author3])
    article3 = Article(authors=[author2, author3])

    ap1 = ArticlePair(article1, article2)
    ap2 = ArticlePair(article1, article3)
    ap3 = ArticlePair(article2, article3)

    self.assertEqual(6, ap1.get_lnlength_score())
    self.assertEqual(4.5, ap2.get_lnlength_score())
    self.assertEqual(4.5, ap3.get_lnlength_score())
def test_location_score(self):
    """Tests that the score is 1 for articles from the same country, 0 otherwise"""
    c1 = 'France'
    c2 = ' france '
    c3 = 'Greece'

    a1 = Article(loc_list=[c1])
    a2 = Article(loc_list=[c2])
    a3 = Article(loc_list=[c3])

    ap1 = ArticlePair(a1, a2)
    ap2 = ArticlePair(a1, a3)

    self.assertEqual(1, ap1.get_location_score())  # Countries are equal
    self.assertEqual(0, ap2.get_location_score())  # Countries are different
def test_Levenshtein_scores(self):
    """Tests that all the scores based on the Levenshtein distance (e-mail, first name) work properly"""
    author1 = Author('lastname', 'firstname', 'L.F.')
    author2 = Author('test', 'name', 'T.N.')

    email1 = '*****@*****.**'
    email2 = '*****@*****.**'

    article1 = Article(e_mail=email1, main_author=author1)
    article2 = Article(e_mail=email1, main_author=author1)
    article3 = Article(e_mail=email2, main_author=author2)

    ap1 = ArticlePair(article1, article2)
    ap2 = ArticlePair(article1, article3)

    # Testing e-mail score
    self.assertEqual(1, ap1.get_email_score())
    self.assertEqual(0, ap2.get_email_score())

    # Testing first name score
    self.assertEqual(0, ap1.get_firstname_score())
    self.assertEqual(5, ap2.get_firstname_score())
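
# Illustrative only: a plain dynamic-programming Levenshtein distance, the kind of
# edit distance the e-mail and first name scores above are built on (e.g. turning
# 'firstname' into 'name' takes 5 edits). The helper name is an assumption;
# ArticlePair presumably delegates to its own implementation or to a library.
def _example_levenshtein(s1, s2):
    """Returns the minimum number of single-character edits turning s1 into s2."""
    previous = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1, start=1):
        current = [i]
        for j, c2 in enumerate(s2, start=1):
            cost = 0 if c1 == c2 else 1
            current.append(min(previous[j] + 1,          # deletion
                               current[j - 1] + 1,       # insertion
                               previous[j - 1] + cost))  # substitution
        previous = current
    return previous[-1]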
def test_jdst_score(self):
    """Tests that the number of shared Journal Descriptors and Semantic Types is correctly computed"""
    article1 = Article(jds=['A', 'B', 'C'], sts=['X', 'Y', 'Z'])
    article2 = Article(jds=['A', 'C'], sts=['Z'])

    self.assertEqual(3, ArticlePair(article1, article2).get_jdst_score())
def main(training_set_path="./../dataset/1500_pairs_train.csv",
         testing_set_path="./../dataset/400_pairs_test.csv"):
    """Main method - Trains and tests various classifiers"""
    # Positive instances in the training set: 970. Negative instances in the training set: 503
    # Positive instances in the testing set: 217. Negative instances in the testing set: 182
    # Nominal training set length is 1500 instances. After filtering out null values, 1473 instances remain
    # Nominal testing set length is 400 instances. After filtering out null values, 397 instances remain

    # Retrieving sets
    training_set = get_set(training_set_path)
    testing_set = get_set(testing_set_path)

    x_train = list()
    training_labels = np.array(training_set[:, 8])

    x_test = list()
    testing_labels = np.array(testing_set[:, 8])

    y_train, y_test = list(), list()

    # Filling training set data
    for i in range(len(training_set)):
        pmid_left = int(training_set[i][0])
        pmid_right = int(training_set[i][4])

        # Loading first article and setting its info
        article1 = article_loader.load_article(pmid_left)
        author1 = Author(lastname=str(training_set[i][1]),
                         forename=str(training_set[i][3]),
                         initials=str(training_set[i][2]))
        article1.set_main_author(author1)
        article1.set_ambiguity(
            ambiguity_score.get_ambiguity_score(
                namespace_lastname=training_set[i][1],
                namespace_initial=training_set[i][2],
                dataset=training_set,
                ds_ln1_col=1, ds_fn1_col=2, ds_ln2_col=5, ds_fn2_col=6))

        # Loading second article and setting its info
        article2 = article_loader.load_article(pmid_right)
        author2 = Author(lastname=str(training_set[i][5]),
                         forename=str(training_set[i][7]),
                         initials=str(training_set[i][6]))
        article2.set_main_author(author2)
        article2.set_ambiguity(
            ambiguity_score.get_ambiguity_score(
                namespace_lastname=training_set[i][5],
                namespace_initial=training_set[i][6],
                dataset=training_set,
                ds_ln1_col=1, ds_fn1_col=2, ds_ln2_col=5, ds_fn2_col=6))

        # Creating the pair
        article_pair = ArticlePair(article1, article2)

        # Putting the pair's vector in the training set
        x_train.append(article_pair.scores())
        if 'NO' in training_labels[i]:
            y_train.append(0)
        else:
            y_train.append(1)

    # Filling testing set data
    for i in range(len(testing_set)):
        pmid_left = int(testing_set[i][0])
        pmid_right = int(testing_set[i][4])

        # Loading first article and setting its info
        article1 = article_loader.load_article(pmid_left)
        author1 = Author(lastname=str(testing_set[i][1]),
                         forename=str(testing_set[i][3]),
                         initials=str(testing_set[i][2]))
        article1.set_main_author(author1)
        article1.set_ambiguity(
            ambiguity_score.get_ambiguity_score(testing_set[i][1], testing_set[i][2],
                                                testing_set, 1, 2, 5, 6))

        # Loading second article and setting its info
        article2 = article_loader.load_article(pmid_right)
        author2 = Author(lastname=str(testing_set[i][5]),
                         forename=str(testing_set[i][7]),
                         initials=str(testing_set[i][6]))
        article2.set_main_author(author2)
        article2.set_ambiguity(
            ambiguity_score.get_ambiguity_score(testing_set[i][5], testing_set[i][6],
                                                testing_set, 1, 2, 5, 6))

        # Creating the pair
        article_pair = ArticlePair(article1, article2)

        # Putting the pair's vector in the testing set
        x_test.append(article_pair.scores())
        if 'NO' in testing_labels[i]:
            y_test.append(0)
        else:
            y_test.append(1)

    y_train = np.array(y_train).astype('int')
    y_test = np.array(y_test).astype('int')

    # Filling empty values (-1) with average values
    x_train_filled = fill_empty_with_average(x_train)  # ALTERNATIVE: fill_empty_with_random(x_train)
    x_test_filled = fill_empty_with_average(x_test)    # ALTERNATIVE: fill_empty_with_random(x_test)

    # Normalizing data
    binaries_features = ArticlePair.binary_scores()
    x_train_norm = normalize_set(x_train_filled, binaries_features).astype('float64')
    x_test_norm = normalize_set(x_test_filled, binaries_features).astype('float64')

    # Testing many classifiers (OPTIONAL)
    test_classifiers(x_train_norm, y_train, x_test_norm, y_test)

    # K-fold cross-validating the best classifier (Random Forest)
    k = 10
    data = np.concatenate((x_train_norm, x_test_norm), axis=0)
    target = np.concatenate((y_train, y_test), axis=0)

    scores = cross_validate(data, target, k)
    accuracy = scores.mean()
    print("\nRandom Forest accuracy with " + str(k) + "-fold cross validation: "
          + str(int(accuracy * 100)) + "%")

    # Running the main (best) classifier
    best_classifier = RandomForest(50)
    best_classifier.fit(x_train_norm, y_train)
    predictions = best_classifier.predict(x_test_norm)
    print("\nBest classifier accuracy: " + str(compute_accuracy(predictions, y_test) * 100) + "%")

    # Printing each feature importance for the classifier
    print_feature_importances(best_classifier)
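
# Illustrative only: a minimal sketch of what fill_empty_with_average is assumed to
# do, i.e. replace the -1 placeholders of missing scores with the per-feature mean
# of the known values. The helper name mirrors the one used in main(), but this
# standalone version is an assumption and may differ from the real implementation.
def _example_fill_empty_with_average(rows):
    """Returns a copy of rows where every -1 entry is replaced by its column mean."""
    columns = list(zip(*rows))
    means = []
    for column in columns:
        known = [v for v in column if v != -1]
        means.append(sum(known) / len(known) if known else 0)

    filled = []
    for row in rows:
        filled.append([means[j] if v == -1 else v for j, v in enumerate(row)])
    return filled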