def _getAuthorInfo(self, a):
     """Print the id, cleaned name and known papers of author ``a``.

     Every line is emitted through ``printLogToConsole`` at ``logging.INFO``.
     Papers listed for the author that are not present in ``self.papers``
     are skipped. The final line reports how many author ids share the
     author's cleaned name according to ``self.names``.

     The lookups of ``a`` in ``self.id_to_name`` and ``self.author_papers``
     are unguarded.

     :param a: author id to describe
     """
     self.logger.debug("Getting info for {}".format(a))
     raw_name = remove_weird_notes.sub(" ", nameFromDict(self.id_to_name[a]))
     name = cleanName(raw_name.replace("  ", " ")).replace("  ", " ")

     header_lines = (
         "id={}".format(a),
         "name={}".format(name),
         "Papers for {}:".format(a),
     )
     for line in header_lines:
         printLogToConsole(self.console_log_level,
                           line,
                           logging.INFO,
                           logger=self.logger)

     for p in self.author_papers[a]:
         if p not in self.papers:
             continue
         paper = self.papers[p]
         # Papers may be objects exposing ``.title`` or plain mappings with a
         # "title" key; only the missing-attribute case falls back to the
         # mapping lookup, so unrelated errors are no longer swallowed.
         try:
             title = paper.title
         except AttributeError:
             title = paper["title"]
         printLogToConsole(self.console_log_level,
                           "\t{}\t{}".format(p, title),
                           logging.INFO,
                           logger=self.logger)

     printLogToConsole(self.console_log_level,
                       "{} Author(s) have this name".format(
                           len(self.names[name])),
                       logging.INFO,
                       logger=self.logger)
 def _genAuthorOverride(self, a):
     """Fill ``self.override_authors[a]`` with the other authors sharing a's name.

     Nothing is generated when ``a`` already has an override entry or when
     ``a`` is not in ``self.targets``; a message is printed instead. The
     other author ids registered under the same cleaned name in
     ``self.names`` are always printed. They are stored as the override
     list unless exactly one id is registered under that name.

     :param a: target author id
     """
     self.logger.debug(
         "Received generate override command for {}".format(a))
     if a in self.override_authors:
         printLogToConsole(
             self.console_log_level,
             "{} already has authors to compare with".format(a),
             logging.INFO,
             logger=self.logger)
         return
     if a not in self.targets:
         printLogToConsole(self.console_log_level,
                           "{} is not a target".format(a),
                           logging.INFO,
                           logger=self.logger)
         return

     raw_name = remove_weird_notes.sub(" ", nameFromDict(self.id_to_name[a]))
     name = cleanName(raw_name.replace("  ", " ")).replace("  ", " ")
     same_name = self.names[name]
     others = [x for x in same_name if x != a]

     print("INFO: Other authors with the same name:")
     for other_a in others:
         print("INFO: {}".format(other_a))

     if len(same_name) == 1:
         printLogToConsole(
             self.console_log_level,
             "{} only has {}, will not add authors to compare with".format(
                 name, a),
             logging.INFO,
             logger=self.logger)
     else:
         self.override_authors[a] = others
         # Report the number actually stored, which excludes the target.
         self.logger.debug("{} authors added to override_authors".format(
             len(others)))
Пример #3
0
 def compareInfoDict(self, actual, expected):
     """Assert that the info dict ``actual`` agrees with ``expected``.

     Name, affiliation name and title are compared against the cleaned
     (``cleanName``) expected values; a falsy expected affiliation name is
     compared as-is. The address must be equal as a dict. For co-author
     names and every list-valued key, each element of ``actual`` must occur
     in ``expected``. The remaining string keys must be equal. On some
     failures the offending values are printed before the assertion error
     is re-raised.

     :param actual: dict produced by the code under test
     :param expected: dict holding the reference values
     """
     # Author name.
     self.assertTrue("name" in actual)
     self.assertEqual(actual["name"], cleanName(expected["name"]))

     # Co-author names: each actual name must be one of the cleaned expected ones.
     self.assertTrue("co_authors_name" in actual)
     cleaned_co_authors = [
         cleanName(co_author) for co_author in expected["co_authors_name"]
     ]
     for co_author in actual["co_authors_name"]:
         self.assertTrue(co_author in cleaned_co_authors)

     # Affiliation name: only cleaned when the expected value is truthy.
     self.assertTrue("aff_name" in actual)
     wanted_aff = expected["aff_name"]
     if wanted_aff:
         wanted_aff = cleanName(wanted_aff)
     self.assertEqual(actual["aff_name"], wanted_aff)

     # Address.
     self.assertTrue("address" in actual)
     try:
         self.assertDictEqual(actual["address"], expected["address"])
     except Exception:
         print(actual["address"])
         print(expected["address"])
         raise

     # Title.
     self.assertTrue("title" in actual)
     self.assertEqual(actual["title"], cleanName(expected["title"]))

     subset_keys = (
         "title_tokenized", "co_authors_id", "department",
         "co_authors_email", "co_authors_aff", "co_authors_aff_type",
         "citations", "citations_tokenized", "sections",
         "sections_tokenized",
     )
     for key in subset_keys:
         try:
             self.assertTrue(key in actual)
         except Exception:
             print(key)
             raise
         for item in actual[key]:
             try:
                 self.assertTrue(item in expected[key])
             except Exception:
                 print(actual["name"])
                 print(expected["name"])
                 print(item)
                 print(expected[key])
                 raise

     for key in ("aff_type", "email_user", "email_domain"):
         self.assertTrue(key in actual)
         self.assertEqual(actual[key], expected[key])
    def __init__(self,
                 papers,
                 author_papers,
                 id_to_name,
                 console_log_level=logging.ERROR,
                 file_log_level=logging.DEBUG,
                 log_format=None,
                 log_path=None,
                 target_path=None,
                 save_data=False,
                 ext_directory=False,
                 save_path=None,
                 cores=4):
        """Set up logging, the name index, the targets and the command tables.

        :param papers: stored as ``self.papers``
        :param author_papers: stored as ``self.author_papers``
        :param id_to_name: mapping of author id to a name record; each record
            is passed through ``nameFromDict`` and ``cleanName`` to build
            ``self.names`` (cleaned name -> list of author ids)
        :param console_log_level: level handed to ``createLogger`` and kept
            as ``self.console_log_level``
        :param file_log_level: level handed to ``createLogger``
        :param log_format: logging format string; a default is used when falsy
        :param log_path: log file path; defaults to
            ``<cwd>/logs/disambiguation.log`` when falsy
        :param target_path: optional ``.json`` file (target id -> override
            authors) or ``.txt`` file (one target id per line)
        :param save_data: stored as ``self.save_data``
        :param ext_directory: stored as ``self.ext_directory``
        :param save_path: stored as ``self.save_path``
        :param cores: accepted but not used by this constructor
        :raises ValueError: if ``target_path`` has an extension other than
            ``json`` or ``txt``
        """
        if not log_format:
            log_format = '%(asctime)s|%(levelname)8s|%(module)20s|%(funcName)20s: %(message)s'
        if not log_path:
            log_path = os.getcwd() + "/logs/disambiguation.log"
        self.logger = createLogger("input_handler", log_path, log_format,
                                   console_log_level, file_log_level)
        self.console_log_level = console_log_level
        self.papers = papers
        self.author_papers = author_papers
        self.id_to_name = id_to_name

        # Index author ids by their cleaned name.
        self.names = defaultdict(list)
        for k, name in id_to_name.items():
            name_cleaned = cleanName(
                remove_weird_notes.sub(" ", nameFromDict(name)).replace(
                    "  ", " ")).replace("  ", " ")
            self.names[name_cleaned].append(k)

        self.save_data = save_data
        self.save_path = save_path
        self.ext_directory = ext_directory
        self.override_authors = {}
        # Always start with a list so the json branch below can append to it.
        self.targets = []

        if not target_path:
            self.logger.debug("No path was passed for target_path")
        else:
            self.logger.debug("Opening {}".format(target_path))
            extension = target_path.split(".")[-1]
            if extension == "json":
                self.logger.debug("Parsing json...")
                with self._openTargetFile(target_path) as target_file:
                    targets_dict = json.load(target_file)
                for k, v in targets_dict.items():
                    self.logger.debug("Found target {}".format(k))
                    self.targets.append(k)
                    self.override_authors[k] = v
            elif extension == "txt":
                self.logger.debug("Parsing txt file...")
                with self._openTargetFile(target_path) as target_file:
                    self.targets = [x.strip() for x in target_file]
            else:
                self.logger.error(
                    "File type {} is not supported".format(extension))
                raise ValueError(
                    "File type {} is not supported".format(extension))

            self.logger.debug("Found {} targets".format(len(self.targets)))
            self.logger.debug("Found {} overrides".format(
                len(self.override_authors)))

        self.valid_main_commands = {
            "t": {
                "required": "target-id",
                "desc": "Specify target",
                "action": self._addTarget
            },
            "d": {
                "required": None,
                "desc": "Display a target or all targets",
                "action": self._displayTargets
            },
            "r": {
                "required": None,
                "desc": "Remove a target",
                "action": self._removeTarget
            },
            "g": {
                "required": "target-id",
                "desc": "Generate authors to compare with based on their name",
                "action": self._genAuthorOverride
            },
            "c": {
                "required": "target-id",
                "desc": "Clear target's authors to compare with",
                "action": self._clearAuthorOverride
            },
            "o": {
                "required": None,
                "desc": "Display override authors",
                "action": self._displayOverride
            },
            "h": {
                "required": None,
                "desc": "Help",
                "action": self._printHelp
            },
            "s": {
                "required": None,
                "desc": "Save targets",
                "action": self._save
            },
            "e": {
                "required": None,
                "desc": "Finish and continue",
                "action": None
            },
        }
        self.valid_target_commands = {
            "a": {
                "required": "target-id",
                "optional": None,
                "desc": "Specify author to compare target to"
            },
            "g": {
                "required": "Author Name",
                "optional": None,
                "desc":
                "Generate a list of authors to compare to by their name"
            },
            "d": {
                "required": None,
                "optional": None,
                "desc": "Display list of authors"
            },
            "r": {
                "required": "author-id",
                "optional": None,
                "desc": "Remove an author-id"
            },
            "e": {
                "required": None,
                "optional": None,
                "desc": "Finish editing target"
            }
        }

    def _openTargetFile(self, target_path):
        """Open ``target_path``, retrying underneath ``os.getcwd()`` if missing.

        :param target_path: path of the targets file
        :return: an open text file object; the caller is responsible for
            closing it (use it in a ``with`` statement)
        :raises FileNotFoundError: if neither location exists
        """
        try:
            return open(target_path)
        except FileNotFoundError:
            self.logger.debug(
                "File path was not found, trying to open with adding os.getcwd()"
            )
            # Strip leading slashes so the path is joined below the working
            # directory instead of replacing it.
            return open(os.path.join(os.getcwd(), target_path.lstrip("/")))
    def _makeAmbiguousAuthors(self, has_authors, needs_authors, override_authors):
        """Pull the target authors out of the corpus and pick authors to compare.

        Every id in ``has_authors`` and ``needs_authors`` is popped from
        ``self.author_papers`` and deleted from ``self.author_name``. Targets
        in ``has_authors`` are compared against ``override_authors[target]``;
        for targets in ``needs_authors`` similar authors are searched with
        ``self._getSimilarAuthors`` in a multiprocessing pool.

        :param has_authors: target ids that already have authors to compare to
        :param needs_authors: target ids whose comparison authors are searched
        :param override_authors: mapping of target id -> author ids
        :return: tuple ``(ambiguous_author_papers, ambiguous_author_names,
            check_author_keys, authors_get_info, excluded)``
        :raises ValueError: if a target from ``needs_authors`` is also a key
            of ``override_authors``
        """
        ambiguous_author_papers = defaultdict(list)
        ambiguous_author_names = dict()
        authors_get_info = list()
        check_author_keys = defaultdict(list)
        excluded = []
        for i in [*has_authors, *needs_authors]:
            ambiguous_author_papers[i] = self.author_papers.pop(i)
            # NOTE(review): a KeyError from either the id_to_name lookup or
            # the author_name deletion excludes the target, although the
            # warning only mentions id_to_name.
            try:
                ambiguous_author_names[i] = cleanName(nameFromDict(self.id_to_name[i])).lower()
                del self.author_name[i]
            except KeyError:
                self.logger.warning("{} is not in id_to_name".format(i))
                excluded.append(i)

        for a in has_authors:
            if a in excluded:
                self.logger.debug("Skipping {} because it is in excluded".format(a))
                continue
            authors_get_info.extend(override_authors[a])
            check_author_keys[a] = self._makeCheckAuthors(override_authors[a])

        # One packed argument list per target, as expected by _getSimilarAuthors.
        args = []
        for a in needs_authors:
            if a in excluded:
                self.logger.debug("Skipping {} because it is in excluded".format(a))
                continue
            args.append([a, ambiguous_author_names[a], self.author_name, self.str_algorithm, self.name_similarity_cutoff,
                         self.sim_overrides])
        printLogToConsole(self.console_log_level, "Getting similar authors in parallel with {} cores".format(self.cores),
                          logging.INFO)
        self.logger.info("Getting similar authors in parallel with {} cores".format(self.cores))
        sim_authors = []
        with mp.Pool(self.cores) as Pool:
            imap_results = list(tqdm(Pool.imap_unordered(self._getSimilarAuthors, args), total=len(args), file=sys.stdout))
            # Workers hand their log lines back so they are logged here.
            for target, auth, warnings, debug in imap_results:
                self.logger.debug("Adding authors from {}".format(target))
                self.logger.debug("len(auth)={}".format(len(auth)))
                sim_authors.append([target, auth])
                for i in warnings:
                    self.logger.warning(i)
                for i in debug:
                    self.logger.debug(i)
        self.logger.debug("len(sim_authors)={}".format(len(sim_authors)))

        # The context manager closes the progress bar even when ValueError is raised.
        with tqdm(total=len(sim_authors), file=sys.stdout) as pbar:
            for a, auths in sim_authors:
                if a in override_authors:
                    message = "{} is in need authors, but is already in override_authors".format(a)
                    # Logger.exception is only meaningful inside an exception
                    # handler; log the error text directly before raising.
                    self.logger.error(message)
                    raise ValueError(message)
                pbar.write("INFO: Checking similar authors to {}".format(a))
                self.logger.info("Checking similar authors to {}".format(a))
                if not auths:
                    self.logger.warning("{} has no similar authors".format(a))
                    excluded.append(a)
                else:
                    authors_get_info.extend(auths)
                    check_author_keys[a] = self._makeCheckAuthors(auths)
                    if len(check_author_keys[a]) == 0:
                        self.logger.debug("{} had at least 1 similar author, but nothing in check author keys".format(a))
                pbar.update()
        authors_get_info = list(set(authors_get_info))
        return ambiguous_author_papers, ambiguous_author_names, check_author_keys, authors_get_info, excluded
    @staticmethod
    def _getSimilarAuthors(args):
        """Collect the author ids whose names are similar to one target author.

        Takes a single packed argument so it can be mapped over a pool. It is
        a static method because it has no ``self`` parameter yet is invoked
        through ``self._getSimilarAuthors``.

        :param args: sequence ``(target_id, target_author, author_name,
            str_algorithm, name_similarity_cutoff, sim_overrides)``:
            ``target_id`` is the id of the target, ``target_author`` its
            name, ``author_name`` a mapping of author id -> name,
            ``str_algorithm`` a callable scoring two strings,
            ``name_similarity_cutoff`` the minimum score, and
            ``sim_overrides`` whether a passing id similarity may override
            the initials checks.
        :return: ``(target_id, similar_ids, warnings, debug)`` where
            ``target_id`` is the id exactly as received and ``warnings`` and
            ``debug`` are lists of messages for the caller to log
        """
        (target_id, target_author, author_name, str_algorithm,
         name_similarity_cutoff, sim_overrides) = args
        out = []
        warnings = []
        debug = []

        # Keep the id as received for the result; compare on the id with
        # the characters matched by remove_numbers stripped.
        original_target_id = target_id
        target_id = remove_numbers.sub("", target_id)

        # Lower-case before taking initials: candidate initials come from
        # lower-cased names, so a cased target could never match otherwise.
        target_author = target_author.lower()
        target_words = target_author.split()
        if not target_words:
            warnings.append(
                "{} has an empty name, cannot look for similar authors".format(original_target_id))
            return original_target_id, out, warnings, debug
        target_initials = [w[0] for w in target_words]
        target_first_name = target_words[0]
        target_first_letter = target_author[0].lower()
        scaled_cutoff = name_similarity_cutoff * 100

        # Cheap pre-filter: only names starting with the same letter.
        authors_use = []
        for _id, name in author_name.items():
            if not name:
                debug.append("{} has an empty name, skipping".format(_id))
                continue
            if name[0].lower() == target_first_letter:
                authors_use.append([_id, name.lower()])
        debug.append("{} authors with the same first letter as {}".format(len(authors_use), target_id))

        for _id, name in authors_use:
            cleaned_name = cleanName(name).lower()
            cleaned_words = cleaned_name.split()
            if not cleaned_words:
                debug.append("{} has an empty cleaned name, skipping".format(_id))
                continue

            # Compare ids using only as many id parts as the target has name parts.
            tmp_id = "-".join(_id.split("-")[:len(target_initials)])
            pass_sim_test = str_algorithm(target_id, tmp_id) * 100 >= scaled_cutoff
            override_with_sim = sim_overrides and pass_sim_test

            # Do not override the first name check b/c the first name check prevents authors with the targets name in
            # their name from being used.
            # For example:
            #   target is yang-liu
            #   the author it is looking at is luyang-liu.
            #   It would pass the similarity test, but we know it is not the same because the first name is
            #   different.
            if str_algorithm(cleaned_words[0], target_first_name) * 100 < scaled_cutoff:
                if pass_sim_test:
                    warnings.append(
                        "{} passed the similarity test, but does not have the same first name".format(_id))
                    warnings.append("author name ={}".format(name))
                continue

            # For the initials, override does have an effect due to some people having weird notes in their name.
            # For example:
            #   yang-liu-georgetown's name is Yang (Janet) Liu
            #   For the time being, clean name does not remove the (Janet) from the name (might change later)
            #   So yang-liu-georgetown's initials are [y,j,l]. But we WANT to compare this to the target of yang-liu,
            #   so we override it
            cleaned_initials = [w[0] for w in cleaned_words]
            if len(cleaned_initials) != len(target_initials) and not override_with_sim:
                if pass_sim_test:
                    warnings.append(
                        "{} passed the similarity test, but does not have the same number of initials".format(_id))
                    warnings.append("target name ={}".format(target_id))
                    warnings.append("author name ={}".format(name))
                continue

            # zip stops at the shorter list, so only the common prefix is compared.
            same_initials = all(
                t == c for t, c in zip(target_initials, cleaned_initials))
            if not same_initials and not override_with_sim:
                if pass_sim_test:
                    warnings.append("{} passed the similarity test, but does not have the same initials".format(_id))
                    warnings.append("target name ={}".format(target_id))
                    warnings.append("author name ={}".format(name))
                continue

            if pass_sim_test:
                if override_with_sim and not same_initials:
                    debug.append("{} was added due to overriding with sim score".format(_id))
                debug.append("{} is similar to {}".format(_id, target_id))
                out.append(_id)
        debug.append("Found {} similar authors".format(len(out)))
        return original_target_id, out, warnings, debug
def getAuthorInfo(args):
    """Build the info dict describing one author on one paper.

    Takes a single packed argument so it can be mapped over an iterable of
    ``(paper, author)`` pairs.

    The co-author lists (``co_authors_id``, ``co_authors_name``,
    ``co_authors_email``, ``co_authors_aff_type``, ``co_authors_aff``) are
    parallel: each co-author contributes exactly one entry to each of them.

    :param args: pair ``(paper, author)``; ``paper`` must expose ``pid``,
        ``authors``, ``affiliations``, ``title``, ``title_tokenized``,
        ``citations``, ``citations_tokenized``, ``sections`` and
        ``sections_tokenized``; ``author`` is the author's id on that paper
    :return: ``(pair_key, info)`` where ``pair_key`` is
        ``"<pid> <author>"`` and ``info`` is the assembled dict
    """
    paper, author = args
    pair_key = paper.pid + " " + author
    out = {
        "pid": paper.pid,
        "name": cleanName(paper.authors[author]),
        "co_authors_id": [],
        "co_authors_name": [],
        "co_authors_email": [],
        "co_authors_aff_type": [],
        "co_authors_aff": []
    }
    for a in paper.authors.keys():
        if a == author:
            continue
        out["co_authors_id"].append(a)
        out["co_authors_name"].append(cleanName(paper.authors[a]))

        if a not in paper.affiliations:
            # No affiliation record: exactly one placeholder per list so the
            # co-author lists stay aligned.
            out["co_authors_email"].append([None, None])
            out["co_authors_aff_type"].append(None)
            out["co_authors_aff"].append(None)
            continue

        auth_aff = paper.affiliations[a]
        if auth_aff["email"]:
            out["co_authors_email"].append(auth_aff["email"].split("@"))
        else:
            out["co_authors_email"].append([None, None])
        # A missing, empty or None affiliation type means "unknown"; any
        # other error is allowed to propagate.
        try:
            auth_aff_type = auth_aff["affiliation"]["type"][0]
        except (IndexError, KeyError, TypeError):
            auth_aff_type = None
        if auth_aff_type:
            out["co_authors_aff"].append(
                cleanName(auth_aff["affiliation"]["info"][auth_aff_type][0]))
        else:
            out["co_authors_aff"].append(None)
        out["co_authors_aff_type"].append(auth_aff_type)

    aff_info = paper.affiliations[author]["affiliation"]
    email = paper.affiliations[author]["email"]
    if email:
        email = email.split("@")
        out["email_user"] = email[0]
        # Only a single "@" yields an unambiguous domain.
        out["email_domain"] = email[1] if len(email) == 2 else None
    else:
        out["email_user"] = None
        out["email_domain"] = None

    try:
        out["aff_type"] = aff_info["type"][0]
    except (IndexError, KeyError):
        out["aff_type"] = None

    if out["aff_type"]:
        out["aff_name"] = cleanName(aff_info["info"][out["aff_type"]][0])
        out["department"] = aff_info["info"]["department"]
    else:
        out["aff_name"] = None
        out["department"] = []

    try:
        # Work on a copy so the paper's own address record is not modified.
        address = dict(aff_info["address"])
        if address["settlement"]:
            address["settlement"] = cleanName(address["settlement"])
        if address["country"]:
            address["country"] = cleanName(address["country"])
        out["address"] = address
    except (IndexError, KeyError):
        out["address"] = {}

    out["title"] = cleanName(paper.title)
    out["title_tokenized"] = paper.title_tokenized
    out["citations"] = paper.citations
    out["citations_tokenized"] = paper.citations_tokenized
    out["sections"] = paper.sections
    out["sections_tokenized"] = paper.sections_tokenized
    return pair_key, out