def test_language_score(self):
        """Checks that the language score is 1 when both articles carry the same
        language code, whether or not the letter case matches."""

        reference = Article(language='eng')
        same_case = Article(language='eng')
        upper_case = Article(language='ENG')

        # Both comparisons are expected to be reported as a match
        for other in (same_case, upper_case):
            pair = ArticlePair(reference, other)
            self.assertEqual(1, pair.get_language_score())
    def test_mesh_score(self):
        """Checks that the mesh score counts the keywords shared by two articles,
        treating terms that differ only by letter case as equal."""

        three_terms = Article(mesh_terms=['test', 'Testing', 'unittest'])
        two_terms = Article(mesh_terms=['test', 'TESTING'])
        no_terms = Article(mesh_terms=[])

        # (expected number of shared terms, article compared with three_terms)
        cases = ((2, two_terms), (0, no_terms))
        for expected, other in cases:
            pair = ArticlePair(three_terms, other)
            self.assertEqual(expected, pair.get_mesh_score())
    def test_authors_score(self):
        """Checks the co-authors score of article pairs sharing the same main author.

        The expected score is 2 when both articles list the same three authors and
        1 when the second article lists only two of them (the main author being
        one of the listed authors in every case)."""
        main = Author("Testing", "Test", "T.T.")
        second = Author("Resting", "Rest", "R.R.")
        third = Author("John", "Doe", "J.D.")

        full_list = Article(main_author=main, authors=[main, second, third])
        same_list = Article(main_author=main, authors=[main, second, third])
        short_list = Article(main_author=main, authors=[main, second])

        # (expected score, article compared with full_list)
        for expected, other in ((2, same_list), (1, short_list)):
            pair = ArticlePair(full_list, other)
            self.assertEqual(expected, pair.get_coauthors_score())
    def test_initials_score(self):
        """Checks the three expected outcomes of the initials score: 1 when the main
        authors' initials match, 0 when they differ and -1 when one of the articles
        has no main author."""
        first_author = Author("Lastname", "Forename", "L.F.")
        second_author = Author("Lastname", "Test", "L.T.")

        with_first = Article(main_author=first_author)
        with_second = Article(main_author=second_author)
        without_author = Article()

        # (expected score, article compared with with_first)
        cases = (
            (1, with_first),        # same article on both sides
            (0, with_second),       # different initials
            (-1, without_author),   # main author missing
        )
        for expected, other in cases:
            pair = ArticlePair(with_first, other)
            self.assertEqual(expected, pair.get_initials_score())
def print_feature_importances(classifier):
    """Print each feature name followed by its importance expressed as a percentage.

    The names come from ``ArticlePair.feature_names()`` and the importances from
    ``classifier.feature_importances_()``; both are read by position, so the two
    sequences are expected to be aligned.

    NOTE(review): ``feature_importances_`` is invoked as a method here; confirm
    that the classifier wrapper exposes it as a callable and not as an attribute.
    """
    importances = classifier.feature_importances_()
    names = ArticlePair.feature_names()

    print("\nFeature Importances:")
    for position, name in enumerate(names):
        percentage = importances[position] * 100
        print(name + str(percentage) + "%")
    def test_vector_score(self):
        """Tests that the cosine similarity between the articles' vectors is correctly calculated"""

        x1, y1, z1 = 3, 5, 2
        x2, y2, z2 = 9, 2, 3

        v1 = [x1, y1, z1]
        v2 = [x2, y2, z2]

        article1 = Article(vector=v1)
        article2 = Article(vector=v2)

        pair = ArticlePair(article1, article2)

        # Reference value: dot product divided by the product of the two norms
        expected = ((x1 * x2) + (y1 * y2) + (z1 * z2)) \
                   / (math.sqrt(x1**2 + y1**2 + z1**2) * math.sqrt(x2**2 + y2**2 + z2**2))
        score = pair.get_vector_score()

        # The score is a floating-point value that may be computed with a different
        # order of operations than the reference above, so compare with a tolerance
        # instead of requiring bit-for-bit equality.
        self.assertAlmostEqual(expected, score)
    def test_date_score(self):
        """Checks that the date score is an absolute distance between the articles' dates.

        The expected score is 0 for identical dates and 1 for dates exactly one year
        apart, whichever article comes first in the pair."""

        earlier = Article(date=datetime.datetime(2018, 1, 1))
        later = Article(date=datetime.datetime(2019, 1, 1))

        identical = ArticlePair(earlier, earlier)
        forward = ArticlePair(earlier, later)
        backward = ArticlePair(later, earlier)

        self.assertEqual(0, identical.get_date_score())

        # Swapping the articles must not change the distance
        for pair in (forward, backward):
            self.assertEqual(1, pair.get_date_score())
    def test_ambiguity_score(self):
        """Tests that the ambiguity score of a pair is the average of the articles' ambiguities"""

        article1 = Article(ambiguity=0.10)
        article2 = Article(ambiguity=0.20)

        ambiguity = ArticlePair(article1, article2).get_ambiguity_score()

        # Floating-point approximation error is about 2.77e-17. The comparison is made
        # on the absolute difference so that a score which is too small is rejected as
        # well (a signed difference would let any value below 0.15 pass).
        self.assertAlmostEqual(0.15, ambiguity, delta=float(0.0000000000000001))
    def test_lnlength_score(self):
        """Checks that the last-name length score of a pair is the expected average.

        Each article lists two authors and no explicit main author; the expected
        values (6, 4.5, 4.5) match the mean last-name length of the first author
        listed by each article of the pair."""

        pulfer = Author('Pulfer', 'Brian', 'P.B.')
        doe = Author('Doe', 'John', 'DJ')
        case = Author('Case', 'Test', 'C.T.')

        pulfer_doe = Article(authors=[pulfer, doe])
        pulfer_case = Article(authors=[pulfer, case])
        doe_case = Article(authors=[doe, case])

        first_pair = ArticlePair(pulfer_doe, pulfer_case)
        second_pair = ArticlePair(pulfer_doe, doe_case)
        third_pair = ArticlePair(pulfer_case, doe_case)

        # (expected score, pair under test)
        for expected, pair in ((6, first_pair), (4.5, second_pair), (4.5, third_pair)):
            self.assertEqual(expected, pair.get_lnlength_score())
    def test_location_score(self):
        """Checks that the location score is 1 for articles of the same country and 0
        otherwise; the matching pair differs only by letter case and surrounding
        whitespace."""

        france = Article(loc_list=['France'])
        france_variant = Article(loc_list=['  france '])
        greece = Article(loc_list=['Greece'])

        same_country = ArticlePair(france, france_variant)
        other_country = ArticlePair(france, greece)

        # Countries are equal
        self.assertEqual(1, same_country.get_location_score())
        # Countries are different
        self.assertEqual(0, other_country.get_location_score())
    def test_Levenshtein_scores(self):
        """Tests the scores computed from string comparisons of the main authors' data
        (e-mail address and first name).

        The e-mail score is expected to be 1 for identical addresses and 0 for
        different ones; the first-name score is expected to be 0 for identical
        first names and 5 for 'firstname' versus 'name'."""

        author1 = Author('lastname', 'firstname', 'L.F.')
        author2 = Author('test', 'name', 'T.N.')

        # The two addresses must differ, otherwise the mismatch case below
        # would compare an address with itself.
        email1 = 'lastname.firstname@example.com'
        email2 = 'test.name@example.org'

        article1 = Article(e_mail=email1, main_author=author1)
        article2 = Article(e_mail=email1, main_author=author1)
        article3 = Article(e_mail=email2, main_author=author2)

        ap1 = ArticlePair(article1, article2)   # same e-mail, same main author
        ap2 = ArticlePair(article1, article3)   # different e-mail, different main author

        # Testing e-mail score
        self.assertEqual(1, ap1.get_email_score())
        self.assertEqual(0, ap2.get_email_score())

        # Testing first name score
        self.assertEqual(0, ap1.get_firstname_score())
        self.assertEqual(5, ap2.get_firstname_score())
    def test_jdst_score(self):
        """Checks that the score adds up the Journal Descriptors and the Semantic Types
        shared by the two articles (two shared descriptors plus one shared type here)."""
        richer = Article(jds=['A', 'B', 'C'], sts=['X', 'Y', 'Z'])
        poorer = Article(jds=['A', 'C'], sts=['Z'])

        pair = ArticlePair(richer, poorer)

        self.assertEqual(3, pair.get_jdst_score())
def _prepare_article(pmid, lastname, forename, initials, dataset):
    """Load one article and attach the main author and ambiguity score of a dataset row.

    :param pmid: identifier handed to ``article_loader.load_article``
    :param lastname: last name of the author to disambiguate, as stored in the dataset
    :param forename: forename of the author, as stored in the dataset
    :param initials: initials of the author, as stored in the dataset
    :param dataset: the whole set the row belongs to, used to compute the ambiguity
    :return: the loaded article with main author and ambiguity set
    """
    article = article_loader.load_article(pmid)

    article.set_main_author(
        Author(lastname=str(lastname),
               forename=str(forename),
               initials=str(initials)))

    # Last names are read from columns 1 and 5, initials from columns 2 and 6
    article.set_ambiguity(
        ambiguity_score.get_ambiguity_score(
            namespace_lastname=lastname,
            namespace_initial=initials,
            dataset=dataset,
            ds_ln1_col=1,
            ds_fn1_col=2,
            ds_ln2_col=5,
            ds_fn2_col=6))

    return article


def _build_pair_features(dataset):
    """Turn every row of a dataset into a feature vector and a binary label.

    Row layout, as read by this function: column 0 is the left PMID, columns 1-3 are
    the left author's last name, initials and forename, column 4 is the right PMID,
    columns 5-7 are the right author's last name, initials and forename, and column 8
    is the label.

    :param dataset: the set returned by ``get_set``
    :return: a tuple ``(features, labels)`` of two lists; a label is 0 when the
             row's label contains 'NO' and 1 otherwise
    """
    features, labels = [], []

    for row in dataset:
        pmid_left = int(row[0])
        pmid_right = int(row[4])

        # Loading both articles and setting their infos
        article1 = _prepare_article(pmid_left, row[1], row[3], row[2], dataset)
        article2 = _prepare_article(pmid_right, row[5], row[7], row[6], dataset)

        # Putting the pair's vector and label in the set
        features.append(ArticlePair(article1, article2).scores())
        labels.append(0 if 'NO' in row[8] else 1)

    return features, labels


def main(training_set_path="./../dataset/1500_pairs_train.csv",
         testing_set_path="./../dataset/400_pairs_test.csv"):
    """Main method - Trains and tests various classifiers.

    Builds the feature vectors of the training and testing pairs, fills the missing
    values, normalizes the data, compares several classifiers, cross-validates the
    Random Forest and finally prints its accuracy and feature importances.

    :param training_set_path: path of the CSV file holding the training pairs
    :param testing_set_path: path of the CSV file holding the testing pairs
    """
    # Positive instances in the training set: 970. Negative instances in the training set: 503
    # Positive instances in the testing set: 217. Negative instances in the testing set: 182
    # Nominative training set length is 1500 Instances. After filtering out null values there are 1473 instances
    # Nominative testing set length is 400 Instances. After filtering out null values there are 397 instances

    # Retrieving sets
    training_set = get_set(training_set_path)
    testing_set = get_set(testing_set_path)

    # Filling training and testing sets data
    x_train, y_train = _build_pair_features(training_set)
    x_test, y_test = _build_pair_features(testing_set)

    y_train = np.array(y_train).astype('int')
    y_test = np.array(y_test).astype('int')

    # Filling empty datas (-1) with average values
    x_train_filled = fill_empty_with_average(
        x_train)  # ALTERNATIVE: fill_empty_with_random(x_train)
    x_test_filled = fill_empty_with_average(
        x_test)  # ALTERNATIVE: fill_empty_with_random(x_test)

    # Normalizing data
    binaries_features = ArticlePair.binary_scores()

    x_train_norm = normalize_set(x_train_filled,
                                 binaries_features).astype('float64')
    x_test_norm = normalize_set(x_test_filled,
                                binaries_features).astype('float64')

    # Testing many classifiers (OPTIONAL)
    test_classifiers(x_train_norm, y_train, x_test_norm, y_test)

    # K-Cross validating the best classifier (Random Forest) on the union of both sets
    k = 10
    data = np.concatenate((x_train_norm, x_test_norm), axis=0)
    target = np.concatenate((y_train, y_test), axis=0)
    scores = cross_validate(data, target, k)
    accuracy = scores.mean()

    print("\nRandom Forest accuracy with " + str(k) +
          "-fold cross validation: " + str(int(accuracy * 100)) + "%")

    # Running the main (best) classifier
    best_classifier = RandomForest(50)

    best_classifier.fit(x_train_norm, y_train)
    predictions = best_classifier.predict(x_test_norm)

    print("\nBest classifier accuracy: " +
          str(compute_accuracy(predictions, y_test) * 100) + "%")

    # Printing each feature importance for the classifier
    print_feature_importances(best_classifier)