Python AuthorDisambiguation示例，src.author_disambiguation.AuthorDisambiguation Python示例

示例#1

0

显示文件

 def test__makePairs(self):
     """Verify ``_makePairs`` against a hand-built list of candidates.

     The test asserts that the target entry comes back in the excluded
     list and that one pair is produced for every other candidate.
     """
     print("INFO: Running _makePairs tests")
     log_path = self.log_path + 'make_pairs.log'
     # Opening in write mode creates the log file or empties an old one.
     with open(log_path, 'w'):
         pass

     target = ["A1-1002 yang-liu", 1]
     candidates = [
         ["A1-{} yang-liu".format(paper_no), 1]
         for paper_no in range(1000, 1006)
     ]
     # Pair keys are "<target key> <candidate key>"; the target is never
     # paired with itself.
     expected_pairs = [
         [" ".join([target[0], other[0]]), 1, 1]
         for other in candidates if other[0] != target[0]
     ]

     author_processor = AuthorDisambiguation(
         papers=self.test_papers,
         id_to_name=self.id_to_name,
         compare_args=self.compare_authors_args,
         log_path=log_path,
         name_similarity_cutoff=.95)
     pairs, excluded = author_processor._makePairs(target, candidates)

     self.assertEqual([["A1-1002 yang-liu", 1]], excluded)
     self.compareList(expected_pairs, pairs)

示例#2

0

显示文件

    def test_makeAmbiguousPairs(self):
        """Verify the pairs and exclusions returned by ``_makeAmbiguousPairs``.

        Three ambiguous "yang-liu-georgetown" papers are checked against
        one co-author entry and three "yang-liu-edinburgh" entries.
        """
        print("INFO: Running makeAmbiguousPairs tests")
        log_path = self.log_path + 'makeAmbiguousPairs.log'
        # Opening in write mode creates the log file or empties an old one.
        with open(log_path, 'w'):
            pass
        author_processor = AuthorDisambiguation(
            papers=self.test_papers,
            id_to_name=self.id_to_name,
            compare_args=self.compare_authors_args,
            log_path=log_path,
            name_similarity_cutoff=.95,
            sim_overrides=True)

        ambiguous_papers = {
            "yang-liu-georgetown": ["W19-2708", "W19-2710", "W19-2717"]
        }
        check_authors = {
            "yang-liu-georgetown": {("W19-2708", "amir-zeldes"),
                                    ("Q18-1005", "yang-liu-edinburgh"),
                                    ("N19-1173", "yang-liu-edinburgh"),
                                    ("P15-2047", "yang-liu-edinburgh")},
        }
        authors_to_get = ["amir-zeldes", "yang-liu-edinburgh"]

        results, excluded = author_processor._makeAmbiguousPairs(
            ambiguous_papers, check_authors, authors_to_get)

        expected_excluded = {
            'W19-2708 yang-liu-georgetown': ['W19-2708 amir-zeldes']
        }
        for pair_key, expected in expected_excluded.items():
            self.assertEqual(expected, excluded[pair_key])

        # The Edinburgh entries are expected for every ambiguous paper; the
        # zeldes entry is expected for all but W19-2708, where it is
        # listed as excluded above.
        edinburgh = [
            "Q18-1005 yang-liu-edinburgh",
            "N19-1173 yang-liu-edinburgh",
            "P15-2047 yang-liu-edinburgh",
        ]
        with_zeldes = ["W19-2708 amir-zeldes"] + edinburgh
        expected_results = {
            "W19-2708 yang-liu-georgetown": list(edinburgh),
            "W19-2710 yang-liu-georgetown": list(with_zeldes),
            "W19-2717 yang-liu-georgetown": list(with_zeldes),
        }
        for k, info in results.items():
            if k not in expected_results:
                print(k)
                self.fail()
            actual_keys = [pair[0] for pair in info]
            wanted_keys = [k + " " + other for other in expected_results[k]]
            self.compareList(actual_keys, wanted_keys)

示例#3

0

显示文件

 def test__findData(self):
     """Exercise ``_findData`` with a missing file and an expected file."""
     print("INFO: Running _findData tests")
     log_path = self.log_path + 'find_data.log'
     # Opening in write mode creates the log file or empties an old one.
     with open(log_path, 'w'):
         pass
     author_processor = AuthorDisambiguation(DEBUG_MODE=True,
                                             log_path=log_path)

     # A file that does not exist: no data, status -1 and an error message.
     data, status, error_msg = author_processor._findData("blah.json")
     self.assertIsNone(data)
     self.assertEqual(-1, status)
     self.assertEqual("blah.json not found in any subdirectory", error_msg)

     # A file the test expects to be found: status 0 and an empty message.
     # The returned data itself is not inspected.
     _, status, error_msg = author_processor._findData("parsed_papers.json")
     self.assertEqual(0, status)
     self.assertEqual("", error_msg)

示例#4

0

显示文件

    def test_makePredictions(self):
        """Run compare -> consolidate -> predict and check prediction counts.

        Authors that appear twice in the comparison list are expected to
        get two predictions; every other author is expected to get one.
        """
        print("INFO: Running makePredictions tests")
        log_path = self.log_path + 'make_predictions.log'
        # Opening in write mode creates the log file or empties an old one.
        with open(log_path, 'w'):
            pass
        author_processor = AuthorDisambiguation(
            papers=self.test_papers,
            id_to_name=self.id_to_name,
            compare_args=self.compare_authors_args,
            log_path=log_path,
            name_similarity_cutoff=.95,
            sim_overrides=True,
            model_path=os.getcwd(),
            model_name="SoftVoting")

        test_target = ["D17-1207", "yang-liu-ict"]
        target_key = " ".join(test_target)
        others = [["C10-2059", "yajuan-lu"], ["P16-1159", "yong-cheng"],
                  ["P09-2066", "yang-liu-icsi"], ["D14-1076", "yang-liu-icsi"],
                  ["D15-1210", "yang-liu-ict"], ["P16-1159", "yang-liu-ict"]]

        target_info = getAuthorInfo(
            [self.test_papers[test_target[0]], test_target[1]])[1]
        info_dict = {target_key: target_info}
        pairs = []
        for paper_id, author_id in others:
            other_key = paper_id + " " + author_id
            other_info = getAuthorInfo(
                [self.test_papers[paper_id], author_id])[1]
            info_dict[other_key] = other_info
            pairs.append([
                " ".join([target_key, other_key]), target_info, other_info
            ])

        comparator = CompareAuthors(**self.compare_authors_args)
        key, res = author_processor._compareAuthors(
            [comparator, target_key, pairs])
        consolidated = author_processor._consolidateResults({key: res})
        predictions, probabilities = author_processor._makePredictions(
            consolidated)

        twice_compared = ("yang-liu-icsi", "yang-liu-ict")
        # NOTE: probabilities are only checked for the top-level key; the
        # per-author probability checks are disabled in the original test.
        for k, info in predictions.items():
            self.assertTrue(k in probabilities)
            for author_id, predict in info.items():
                expected_count = 2 if author_id in twice_compared else 1
                self.assertEqual(expected_count, len(predict))

示例#5

0

显示文件

    def test_consolidateResults(self):
        """Check that ``_consolidateResults`` groups comparison rows by author.

        Results for three "yang-liu-georgetown" papers (3 + 2 + 1 rows) are
        expected to be combined into six rows under that author, and the
        single "yang-liu-edinburgh" row is expected to stay as one row.
        """
        print("INFO: Running consolidateResults tests")
        log_path = self.log_path + 'consolidate_results.log'
        # Opening in write mode creates the log file or empties an old one.
        with open(log_path, 'w'):
            pass
        author_processor = AuthorDisambiguation(
            papers=self.test_papers,
            id_to_name=self.id_to_name,
            compare_args=self.compare_authors_args,
            log_path=log_path,
            name_similarity_cutoff=.95,
            sim_overrides=True)

        width = 24
        expected_compare_array = np.array([1] * width)

        def rows(count):
            # ``count`` independent comparison rows, each all ones.
            return [[1] * width for _ in range(count)]

        test_results = {
            "W19-2708 yang-liu-georgetown": {"yang-liu-edinburgh": rows(3)},
            "W19-2710 yang-liu-georgetown": {"yang-liu-edinburgh": rows(2)},
            "W19-2717 yang-liu-georgetown": {"yang-liu-edinburgh": rows(1)},
            "Q18-1005 yang-liu-edinburgh": {"amir-zeldes": rows(1)},
        }
        res = author_processor._consolidateResults(test_results)

        self.assertEqual(2, len(res))
        self.assertEqual(["yang-liu-georgetown", "yang-liu-edinburgh"],
                         list(res.keys()))

        expected_row_counts = {
            "yang-liu-georgetown": 6,
            "yang-liu-edinburgh": 1,
        }
        for author_id, compared in res.items():
            self.assertEqual(1, len(compared))
            for matrix in compared.values():
                if author_id in expected_row_counts:
                    self.assertEqual(expected_row_counts[author_id],
                                     matrix.shape[0])
                for row in range(matrix.shape[0]):
                    np.testing.assert_array_equal(matrix[row],
                                                  expected_compare_array)

示例#6

0

显示文件

    def test_compareAuthors(self):
        """Check ``_compareAuthors`` output against direct comparator calls.

        Each compared author is expected to appear once in the result, with
        the same array the comparator returns when called on that pair.
        """
        test_target = ["D17-1207", "yang-liu-ict"]
        target_key = " ".join(test_target)
        others = [["C10-2059", "yajuan-lu"], ["P16-1159", "yong-cheng"],
                  ["P09-2066", "yang-liu-icsi"]]

        target_info = getAuthorInfo(
            [self.test_papers[test_target[0]], test_target[1]])[1]
        info_dict = {target_key: target_info}
        pairs = []
        for paper_id, author_id in others:
            other_key = paper_id + " " + author_id
            other_info = getAuthorInfo(
                [self.test_papers[paper_id], author_id])[1]
            info_dict[other_key] = other_info
            pairs.append([
                " ".join([target_key, other_key]), target_info, other_info
            ])

        print("INFO: Running compareAuthors tests")
        log_path = self.log_path + 'compare_authors.log'
        # Opening in write mode creates the log file or empties an old one.
        with open(log_path, 'w'):
            pass
        author_processor = AuthorDisambiguation(
            papers=self.test_papers,
            id_to_name=self.id_to_name,
            compare_args=self.compare_authors_args,
            log_path=log_path,
            name_similarity_cutoff=.95,
            allow_authors_not_in_override=False)
        comparator = CompareAuthors(**self.compare_authors_args)
        key, res = author_processor._compareAuthors(
            [comparator, target_key, pairs])

        self.assertEqual(target_key, key)
        self.assertNotEqual(0, len(res))
        for other_key, other_info in info_dict.items():
            if other_key != target_key:
                # Keys look like "<paper id> <author id>"; results are
                # looked up by the author id.
                other_id = other_key.split()[1]
                self.assertTrue(other_id in res)
                self.assertEqual(1, len(res[other_id]))
                expected = comparator([
                    " ".join([target_key, other_key]), 0,
                    target_info, other_info
                ])[-1]
                np.testing.assert_array_equal(expected, res[other_id][0])

示例#7

0

显示文件

 def test__getSimilarAuthors(self):
     """Check the ids returned by ``_getSimilarAuthors`` for "yang liu".

     The lookup is run once with the last argument ``False`` and once with
     it ``True`` (and ``sim_overrides`` enabled); the second run is
     expected to additionally return "yang-liu-georgetown".
     """
     print("INFO: Running _getSimilarAuthors tests")
     log_path = self.log_path + 'get_similar_authors.log'
     # Opening in write mode creates the log file or empties an old one.
     with open(log_path, 'w'):
         pass
     expected_plain = [
         "yang-liu-edinburgh",
         "yang-liu-ict",
         "yang-liu",
         "yang-liu-icsi",
         "yang-li",
     ]
     expected_with_overrides = [
         "yang-liu-georgetown", "yang-liu-edinburgh", "yang-liu-ict",
         "yang-liu", "yang-liu-icsi", "yang-li"
     ]
     author_processor = AuthorDisambiguation(
         papers=self.test_papers,
         id_to_name=self.id_to_name,
         compare_args=self.compare_authors_args,
         log_path=log_path,
         name_similarity_cutoff=.95)
     args = [
         "yang-liu", "yang liu", author_processor.author_name,
         getAlgo("jaro", "similarity"), .95, False
     ]

     found = author_processor._getSimilarAuthors(args)[1]
     self.compareList(expected_plain, found)

     # Same query with the final flag switched on.
     args[-1] = True
     author_processor.sim_overrides = True
     found = author_processor._getSimilarAuthors(args)[1]
     self.compareList(expected_with_overrides, found)

     # This last lookup is only printed; nothing is asserted about it.
     args[:2] = ["eugenio-martinez-camara1", "eugenio martinez camara"]
     print(author_processor._getSimilarAuthors(args))

示例#8

0

显示文件

 def test__determineCorrectAuthor(self):
     """Check ``_determineCorrectAuthor`` on four 0/1 vote tables.

     Each table maps a candidate id to a list of 0/1 values. The test
     asserts both the chosen candidate (or ``None``) and the second
     returned value for every table.
     """

     def lone_hit():
         # A fresh ten-element row whose only 1 is in the first slot.
         return [1] + [0] * 9

     test_1 = {
         "a": [1, 1, 1, 1, 1, 0, 1, 1, 1, 0],
         "b": [1, 0, 0, 1, 0, 1, 0, 0, 0, 0],
         "c": [1, 0, 0, 1, 0, 1, 0, 0, 0, 0],
         "d": [1, 0, 0, 1, 0, 1, 0, 0, 0, 0],
         "e": [1, 0, 1, 1, 0, 0, 0, 1, 1, 1],
         "f": [1, 0, 0, 1, 0, 0, 0, 1, 0, 0],
         "g": [1, 0, 0, 1, 0, 0, 0, 1, 0, 0],
     }
     test_2 = {
         "a": [1, 1, 1, 1, 1, 0, 1, 1, 1, 0],
         "b": [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
     }
     test_2.update((name, lone_hit()) for name in "cdefg")
     test_3 = {"a": [1, 0, 0, 0, 0, 0, 0, 1, 1, 0]}
     test_3.update((name, lone_hit()) for name in "bcdefg")
     # Rows of different lengths.
     test_4 = {
         "a": [1, 1, 1, 1, 1, 0],
         "b": [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
     }

     print("INFO: Running determineCorrect tests")
     log_path = self.log_path + 'determine_correct.log'
     # Opening in write mode creates the log file or empties an old one.
     with open(log_path, 'w'):
         pass
     author_processor = AuthorDisambiguation(
         papers=self.test_papers,
         id_to_name=self.id_to_name,
         compare_args=self.compare_authors_args,
         log_path=log_path,
         name_similarity_cutoff=.95,
         threshold=.7)

     # (input table, expected winner, expected second return value)
     cases = [
         (test_1, "a", []),
         (test_2, "b", [['a', 0.8], ['b', 0.9]]),
         (test_3, None,
          [['a', 0.3], ['b', 0.1], ['c', 0.1], ['d', 0.1], ['e', 0.1],
           ['f', 0.1], ['g', 0.1]]),
         (test_4, "b", [['a', 5 / 6], ['b', .8]]),
     ]
     for table, winner, expected_above in cases:
         res, above = author_processor._determineCorrectAuthor(table)
         if winner is None:
             self.assertIsNone(res)
         else:
             self.assertEqual(winner, res)
         self.assertEqual(expected_above, above)

示例#9

0

显示文件

 def test__getAuthorInfos(self):
     """Check ``_getAuthorInfos`` results and its two error counters.

     Of the four requested ids, "yang-liu" has no entry in the
     author-to-papers map and "bob-newman" points at a paper id missing
     from the papers dict; the test expects each error counter to be 1.
     """
     print("INFO: Running _getAuthorInfos tests")
     log_path = self.log_path + 'get_author_info.log'
     # Opening in write mode creates the log file or empties an old one.
     with open(log_path, 'w'):
         pass
     requested = ["yang-liu-ict", "luyang-liu", "bob-newman", "yang-liu"]
     papers_subset = {
         paper_id: self.test_papers[paper_id]
         for paper_id in ("D17-1207", "C18-1172")
     }
     papers_by_author = {
         "yang-liu-ict": ["D17-1207"],
         "luyang-liu": ["C18-1172"],
         "bob-newman": ["A0-0000"]
     }
     author_processor = AuthorDisambiguation(
         papers=papers_subset,
         author_papers=papers_by_author,
         compare_args=self.compare_authors_args,
         log_path=log_path,
         file_log_level=logging.WARNING)
     res, error_auth, error_paper = author_processor._getAuthorInfos(
         requested)
     self.assertEqual(1, error_auth)
     self.assertEqual(1, error_paper)

     # Result key -> (paper id, author id) used to rebuild the expected info.
     verifiable = {
         "D17-1207 yang-liu-ict": ("D17-1207", "yang-liu-ict"),
         "C18-1172 luyang-liu": ("C18-1172", "luyang-liu"),
     }
     for info_key, info in res.items():
         if info_key in verifiable:
             paper_id, author_id = verifiable[info_key]
             self.compareInfoDict(
                 info,
                 getAuthorInfo([self.test_papers[paper_id], author_id])[1])

示例#10

0

显示文件

    def test_checkCallErrors(self):
        """Check that ``_errorCheckCallArgs`` rejects bad input and accepts good.

        Every invalid (targets, override) combination is run inside its own
        ``assertRaises`` context. A single shared context stops at the first
        exception, so any call placed after the first one in the same
        ``with`` block would never execute.
        """
        print("INFO: Running checkCallError tests")
        log_path = self.log_path + 'call_error_check.log'
        # Opening in write mode creates the log file or empties an old one.
        with open(log_path, 'w'):
            pass
        author_processor = AuthorDisambiguation(
            papers=self.test_papers,
            id_to_name=self.id_to_name,
            compare_args=self.compare_authors_args,
            log_path=log_path,
            name_similarity_cutoff=.95,
            allow_authors_not_in_override=False)
        test_1 = [["abcsasd-adad"], {}]
        test_2 = [["yang-liu"], {"abc-de": ["yang-liu-ict"]}]
        test_3 = [["yang-liu"], {"yang-liu": "yang-liu-ict"}]
        # test_4 is identical to test_3; kept so the case list is unchanged.
        test_4 = [["yang-liu"], {"yang-liu": "yang-liu-ict"}]
        test_5 = [["yang-liu"], {"yang-liu": ["yang-liu-ict", "yang-liu"]}]
        test_6 = [["yang-liu"], {
            "yang-liu": ["yang-liu-ict", "no-one-should-ever-have-this-name"]
        }]

        # Cases expected to raise ValueError.
        for targets, override in (test_2, test_3, test_4, test_5):
            with self.subTest(targets=targets, override=override):
                with self.assertRaises(ValueError):
                    author_processor._errorCheckCallArgs(targets, override)

        # Cases expected to raise KeyError.
        for targets, override in (test_1, test_6):
            with self.subTest(targets=targets, override=override):
                with self.assertRaises(KeyError):
                    author_processor._errorCheckCallArgs(targets, override)

        # Valid input: the target with an override entry is expected in the
        # first returned list and the one without in the second.
        should_work_targets = ["luyang-liu", "yang-liu"]
        should_work_override = {"yang-liu": ["yang-liu-ict"]}
        a, b = author_processor._errorCheckCallArgs(should_work_targets,
                                                    should_work_override)
        self.assertEqual(["yang-liu"], a)
        self.assertEqual(["luyang-liu"], b)

示例#11

0

显示文件

    def test_removeKnownDifferent(self):
        """Check that ``_removeKnownDifferent`` drops pairs with excluded authors.

        "amir-zeldes" is listed as excluded for one "yang-liu-georgetown"
        paper; the test expects the zeldes pairs to be removed from every
        "yang-liu-georgetown" entry and left in the "yang-liu-edinburgh"
        entry.
        """
        print("INFO: Running removeKnownDifferent tests")
        log_path = self.log_path + 'remove_known_different.log'
        # Opening in write mode creates the log file or empties an old one.
        with open(log_path, 'w'):
            pass
        author_processor = AuthorDisambiguation(
            papers=self.test_papers,
            id_to_name=self.id_to_name,
            compare_args=self.compare_authors_args,
            log_path=log_path,
            name_similarity_cutoff=.95,
            sim_overrides=True)

        zeldes = "W19-2708 amir-zeldes"
        edinburgh = [
            "Q18-1005 yang-liu-edinburgh",
            "N19-1173 yang-liu-edinburgh",
            "P15-2047 yang-liu-edinburgh",
        ]
        georgetown_keys = [
            "W19-2708 yang-liu-georgetown",
            "W19-2710 yang-liu-georgetown",
            "W19-2717 yang-liu-georgetown",
        ]
        edinburgh_key = "Q18-1005 yang-liu-edinburgh"

        # Input: the last two georgetown papers start out paired with zeldes.
        tmp_pairs = {
            georgetown_keys[0]: list(edinburgh),
            georgetown_keys[1]: [zeldes] + edinburgh,
            georgetown_keys[2]: [zeldes] + edinburgh,
            edinburgh_key: [zeldes],
        }
        test_pairs = {
            k: [[" ".join([k, other]), 1] for other in info]
            for k, info in tmp_pairs.items()
        }
        test_excluded = {
            'W19-2708 yang-liu-georgetown': ['W19-2708 amir-zeldes'],
            "W19-2710 yang-liu-georgetown": []
        }
        expected_different = {"yang-liu-georgetown": ["amir-zeldes"]}
        expected_pairs = {k: list(edinburgh) for k in georgetown_keys}
        expected_pairs[edinburgh_key] = [zeldes]

        fixed, diff = author_processor._removeKnownDifferent(
            test_pairs, test_excluded)
        self.assertDictEqual(expected_different, diff)
        for k, info in fixed.items():
            if k not in expected_pairs:
                print(k)
                self.fail()
            actual_keys = [pair[0] for pair in info]
            wanted_keys = [k + " " + other for other in expected_pairs[k]]
            self.compareList(actual_keys, wanted_keys)

示例#12

0

显示文件

    def test_makeAmbiguousAuthors(self):
        """Check the five values returned by ``_makeAmbiguousAuthors``.

        One target ("luyang-liu") has an explicit override and one
        ("yang-liu") does not; the test compares the returned papers,
        names, check-author keys, authors-to-get list and excluded list
        with hand-built expectations.
        """
        print("INFO: Running makeAmbiguousAuthor tests")
        log_path = self.log_path + 'make_ambiguous_author.log'
        # Opening in write mode creates the log file or empties an old one.
        with open(log_path, 'w'):
            pass
        author_processor = AuthorDisambiguation(
            papers=self.test_papers,
            id_to_name=self.id_to_name,
            compare_args=self.compare_authors_args,
            log_path=log_path,
            name_similarity_cutoff=.95,
            sim_overrides=True)
        test_override = {"luyang-liu": ["bo-li"]}
        test_has_authors = ["luyang-liu"]
        test_no_authors = ["yang-liu"]
        expected_authors_to_get = [
            "bo-li", "yang-liu-georgetown", "yang-liu-edinburgh",
            "yang-liu-ict", "yang-liu-icsi", "yang-li"
        ]
        expected_excluded = []
        expected_names = {
            "yang-liu": "yang liu",
            "luyang-liu": "luyang liu",
        }
        expected_author_papers = {
            "luyang-liu": ["C18-1172"],
            "yang-liu": ["K18-1018", "I13-1154", "C12-2073"],
        }
        # Every (paper, author) entry of the authors whose id contains
        # "yang-li"; "bo-li" is covered by the explicit list below.
        yang_li_entries = [
            (paper_id, author_id)
            for author_id in expected_authors_to_get
            if "yang-li" in author_id
            for paper_id in self.author_papers[author_id]
        ]
        expected_check_authors = {
            "luyang-liu": [("D18-1212", "bo-li"), ("C18-1025", "bo-li"),
                           ("P19-1130", "bo-li")],
            "yang-liu": yang_li_entries,
        }

        (ambiguous_author_papers, ambiguous_author_names, check_author_keys,
         authors_get_info, excluded) = author_processor._makeAmbiguousAuthors(
             test_has_authors, test_no_authors, test_override)

        self.assertEqual(expected_excluded, excluded)
        for author_id, paper_ids in ambiguous_author_papers.items():
            if author_id not in expected_author_papers:
                print(author_id)
                self.assertTrue(author_id in expected_author_papers)
            self.compareList(paper_ids, expected_author_papers[author_id])
            # The test expects ambiguous ids to be absent from the
            # processor's own author_papers.
            self.assertTrue(author_id not in author_processor.author_papers)
        for author_id, name in ambiguous_author_names.items():
            if author_id not in expected_names:
                print(author_id)
                self.assertTrue(author_id in expected_names)
            self.assertEqual(expected_names[author_id], name)

        self.compareList(authors_get_info, expected_authors_to_get)

        for author_id, entries in check_author_keys.items():
            self.assertTrue(author_id in expected_check_authors)
            self.compareList(entries, expected_check_authors[author_id])

示例#13

0

显示文件

        "qin-lu1",
        "manuel-carlos-diaz-galiano1",
        "luis-nieto-pina1",
        "yang-liu",
        "luciano-del-corro1",
        "izzeddin-gur1",
        "gia-h-ngo1",
    ]
    target_creator = TargetCreator(parsed, id_to_name, author_papers)
    targets = []
    for k in input_handler.targets:
        for p in author_papers[k]:
            if k not in parsed[p].affiliations:
                continue
            targets.extend(target_creator.createTarget(k, [p]))
        # rtr = input_handler.handleInput(k, test_papers[k])
    target_papers, target_authors, target_ids = target_creator.fillData()
    compare_authors_args = {
        "company_corpus": org_corpus,
        "department_corpus": department_corpus,
        "threshold": .4,
        "str_algorithm": ["jaro", "similarity"]
    }
    disambiguation = AuthorDisambiguation(papers=target_papers,
                                          author_papers=target_authors,
                                          compare_args=compare_authors_args,
                                          id_to_name=target_ids,
                                          **config["AuthorDisambiguation"])

    results = disambiguation(targets)