Exemplo n.º 1
0
    def get_orcid_ids(self, doi_string, family_names=None):
        """Collect the ORCID records of the people associated with a DOI.

        The records returned by ``self.get_orcid_records`` are scanned for
        ORCID identifiers and, for every identifier whose personal details
        can be obtained through ``self.get_orcid_data``, one entry is added
        to the result.

        :param doi_string: the DOI forwarded to ``self.get_orcid_records``.
        :param family_names: the family names forwarded to
            ``self.get_orcid_records``; ``None`` (the default) stands for an
            empty list.
        :return: a list with one ``da(...)``-wrapped mapping per ORCID,
            having the keys ``orcid``, ``given``, ``family``, ``credit`` and
            ``other``. The list is empty when nothing has been found.
        """
        # A new list is created at every call: a literal ``[]`` default would
        # be one object shared (and possibly mutated) across all the calls.
        if family_names is None:
            family_names = []

        result = []

        records = self.get_orcid_records(doi_string, family_names)
        if records is None:
            return result

        # A lookup that yields nothing is handled as "no identifiers" rather
        # than failing with a TypeError when the loop starts.
        orcid_ids = dg(records, ["result", "orcid-identifier", "path"]) or []
        for orcid_id in orcid_ids:
            personal_details = self.get_orcid_data(orcid_id)
            if personal_details is None:
                continue

            result.append(da({
                "orcid": orcid_id,
                "given": dg(personal_details,
                            ["name", "given-names", "value"]),
                "family": dg(personal_details,
                             ["name", "family-name", "value"]),
                "credit": dg(personal_details,
                             ["name", "credit-name", "value"]),
                "other": dg(personal_details,
                            ["other-names", "other-name", "content"]),
            }))

        return result
Exemplo n.º 2
0
    def __get_paper_data(self, source, paper_id):
        """Look up the identifiers of a paper through the paper API.

        The query ``ext_id:<paper_id>+src:<source>`` is appended to
        ``self.paper_api`` and only the first matched result is inspected.

        :param source: the value used for the ``src`` part of the query.
        :param paper_id: the value used for the ``ext_id`` part of the query.
        :return: a dictionary with the keys ``doi``, ``pmid`` and ``pmcid``;
            all the values are ``None`` when no result has been matched.
        """
        query = "ext_id:%s+src:%s" % (paper_id, source)
        paper_data = self.__get_data(self.paper_api + query)

        identifiers = {"doi": None, "pmid": None, "pmcid": None}

        matches = dg(paper_data, ["resultList", "result"])
        if matches is None or len(matches) == 0:
            return identifiers

        # Only the first match is taken into account
        first_match = matches[0]
        for id_name in ("doi", "pmid", "pmcid"):
            identifiers[id_name] = dg(first_match, [id_name])

        return identifiers
Exemplo n.º 3
0
    def process_citing_entity(self):
        """Create or retrieve the citing entity and link it to what it cites.

        The citing entity is looked for, in order, by means of ``self.occ``
        and of ``self.doi``; if neither lookup gives a result, a brand new
        entity is created and a sentence is added to ``self.repok``. The
        PMID and PMCID are then attached and the references are processed:
        every cited entity is linked to the citing one and, when the
        related entry has a non-blank ``bibentry``, a bibliographic entry
        with that text is created as well.

        :return: ``self.g_set``, or ``None`` when ``self.process_references``
            returns ``None``.
        """
        citing_br = None

        # First attempt: the resource identified by self.occ
        if self.occ is not None:
            known_resource = self.rf.retrieve_entity(
                self.occ, GraphEntity.expression)
            citing_br = self.g_set.add_br(
                self.name, self.id, self.source_provider, known_resource)

        # Second attempt: the DOI
        if citing_br is None and self.doi is not None:
            citing_br = self.process_doi(
                self.doi, self.curator, self.source_provider)

        # Fallback: a new entity is created
        if citing_br is None:
            citing_br = self.g_set.add_br(self.name)
            self.__add_doi(citing_br, self.doi, self.curator)
            self.rf.update_graph_set(self.g_set)
            self.repok.add_sentence(
                self.message(
                    "The citing entity has been created even if no results have "
                    "been returned by the API.",
                    "doi", self.doi))

        # Add other ids if they exist
        self.__add_pmid(citing_br, self.pmid)
        self.__add_pmcid(citing_br, self.pmcid)

        cited_brs = self.process_references()
        if cited_brs is None:
            return None

        for position, cited_br in enumerate(cited_brs):
            citing_br.has_citation(cited_br)

            # The i-th cited entity corresponds to the i-th input entry
            entry_text = dg(self.entries[position], ["bibentry"])
            if entry_text is None or not entry_text.strip():
                continue

            bib_entry = self.g_set.add_be(
                self.curator, self.source_provider, self.source)
            citing_br.contains_in_reference_list(bib_entry)
            cited_br.has_reference(bib_entry)
            bib_entry.create_content(entry_text.strip())

        return self.g_set
Exemplo n.º 4
0
    def get_orcid_ids(self, doi_string, family_names=None):
        """Collect the ORCID records of the people associated with a DOI.

        The shape of what ``self.get_orcid_records`` returns depends on the
        query interface in use:

        * with a ``RemoteQuery`` interface, the records contain ORCID
          identifiers only, and the personal details of each of them are
          obtained through ``self.get_orcid_data``;
        * with any other interface, the records are iterated directly and
          each of them is expected to have the keys ``orcid``,
          ``given_names`` and ``family_name``.

        :param doi_string: the DOI forwarded to ``self.get_orcid_records``.
        :param family_names: the family names forwarded to
            ``self.get_orcid_records``; ``None`` (the default) stands for an
            empty list.
        :return: a list with one ``da(...)``-wrapped mapping per person,
            having the keys ``orcid``, ``given``, ``family``, ``credit`` and
            ``other``. The list is empty when nothing has been found.
        """
        # A new list is created at every call: a literal ``[]`` default would
        # be one object shared (and possibly mutated) across all the calls.
        if family_names is None:
            family_names = []

        result = []
        records = self.get_orcid_records(doi_string, family_names)
        if records is None:
            return result

        if isinstance(self.query_interface, RemoteQuery):
            # A lookup that yields nothing is handled as "no identifiers"
            # rather than failing with a TypeError when the loop starts.
            orcid_ids = dg(records,
                           ["result", "orcid-identifier", "path"]) or []
            for orcid_id in orcid_ids:
                personal_details = self.get_orcid_data(orcid_id)
                if personal_details is None:
                    continue

                result.append(da({
                    "orcid": orcid_id,
                    "given": dg(personal_details,
                                ["name", "given-names", "value"]),
                    "family": dg(personal_details,
                                 ["name", "family-name", "value"]),
                    "credit": dg(personal_details,
                                 ["name", "credit-name", "value"]),
                    "other": dg(personal_details,
                                ["other-names", "other-name", "content"]),
                }))
        else:
            for author in records:
                result.append(da({
                    "orcid": author['orcid'],
                    "given": author['given_names'],
                    "family": author['family_name'],
                    "credit": "",  # actually we don't manage this
                    "other": "",  # actually we don't manage this
                }))

        return result
Exemplo n.º 5
0
    def process_references(self, cur_source, cur_id):
        """Retrieve the reference list of a paper and store its references.

        The URL of the reference list is obtained by replacing the
        placeholders ``XXX`` and ``YYY`` of ``self.ref_list_api`` with the
        source and the identifier of the paper. Every reference returned is
        added to ``self.rs``, together with the identifiers available for it.

        :param cur_source: the source of the paper (replaces ``XXX``).
        :param cur_id: the identifier of the paper (replaces ``YYY``).
        :return: the URL that has been queried, or ``None`` when the
            response contains no reference list.
        """
        ref_list_url = self.ref_list_api \
            .replace("XXX", cur_source) \
            .replace("YYY", cur_id)

        response = self.__get_data(ref_list_url)
        references = dg(response, ["referenceList", "reference"])
        if references is None:
            return None

        self.rs.new_ref_list()
        for reference in references:
            ref_entry = self.__create_entry(reference)
            if ref_entry is None:
                # No text available: the flag defaults to True
                entry_text, process_entry_text = None, True
            else:
                entry_text, process_entry_text = ref_entry[0], ref_entry[1]

            if reference["match"] == "Y":
                # The reference matches with a record of the ePMC database:
                # its identifiers are taken from that record
                ref_id = reference["id"]
                ref_source = reference["source"]
                ref_localid = ref_source + "-" + ref_id

                matched_ids = self.__get_paper_data(ref_source, ref_id)
                ref_doi = self.normalise_doi(matched_ids["doi"])
                ref_pmid = matched_ids["pmid"]
                ref_pmcid = matched_ids["pmcid"]
            else:
                # Otherwise, only what is specified in the reference is used
                ref_localid = None
                ref_doi = self.normalise_doi(dg(reference, ["doi"]))
                ref_pmid = dg(reference, ["pmid"])
                ref_pmcid = dg(reference, ["pmcid"])

            ref_url = dg(reference, ["externalLink"])

            self.rs.add_reference(
                entry_text, process_entry_text, ref_localid,
                ref_doi, ref_pmid, ref_pmcid, ref_url,
                None)  # TODO none xmlid

        return ref_list_url
Exemplo n.º 6
0
    def __init__(self, full_entry, repok, reperr, query_interface, resourcefinder, get_bib_entry_doi, message,
                 process_existing_by_id,
                 do_process_entry=True):
        """Set up the processing of one bibliographic entry and start it.

        The fields of ``full_entry`` are read, the URL and DOI contained in
        them are extracted by means of ``FormatProcessor``, and finally
        ``self.process_remote()`` is called.

        :param full_entry: the entry to process; the keys ``bibentry``,
            ``doi``, ``pmid``, ``pmcid``, ``url`` and ``process_entry`` are
            read from it.
        :param repok: stored as ``self.repok``.
        :param reperr: stored as ``self.reperr``.
        :param query_interface: stored as ``self.query_interface``.
        :param resourcefinder: stored as ``self.rf``.
        :param get_bib_entry_doi: stored as ``self.get_bib_entry_doi``.
        :param message: stored as ``self.message``.
        :param process_existing_by_id: stored as
            ``self.process_existing_by_id``.
        :param do_process_entry: used only when the entry does not specify
            ``process_entry``; otherwise the flag is true only if that field,
            lowercased and stripped, is equal to ``"true"``.
        """
        # Collaborators and configuration
        self.id = "Crossref"
        self.repok, self.reperr = repok, reperr
        self.query_interface = query_interface
        self.rf = resourcefinder
        self.get_bib_entry_doi = get_bib_entry_doi
        self.message = message
        self.process_existing_by_id = process_existing_by_id

        # Data provided as input with the entry
        self.entry = dg(full_entry, ["bibentry"])
        self.provided_doi = dg(full_entry, ["doi"])
        self.provided_pmid = dg(full_entry, ["pmid"])
        self.provided_pmcid = dg(full_entry, ["pmcid"])
        self.provided_url = dg(full_entry, ["url"])
        self.process_string = dg(full_entry, ["process_entry"])

        # What is specified in the entry wins over the parameter
        if self.process_string is None:
            self.do_process_entry = do_process_entry
        else:
            self.do_process_entry = \
                self.process_string.lower().strip() == "true"

        # Variables used to store results
        self.process_doi_result = self.process_pmid_result = None
        self.process_pmcid_result = self.process_url_result = None
        self.existing_bibref_entry = None
        self.cur_res = None
        self.existing_res_on_blazegraph = None
        self.cur_res_obtained_via = self.cur_json_obtained_via = None
        self.extracted_doi_used = False

        # Variable to use for disambiguation purposes
        self.to_be_considered = True

        # The URL is looked for in the entry text only if it has not been
        # provided explicitly
        self.extracted_url = None
        if self.provided_url is None:
            self.extracted_url = FormatProcessor.extract_url(self.entry)
        else:
            self.provided_url = FormatProcessor.extract_url(self.provided_url)

        self.extracted_doi = FormatProcessor.extract_doi(self.entry)

        # Start to query for data
        self.process_remote()
Exemplo n.º 7
0
    def process(self, oa=False, intext_refs=False):
        """Process the papers page by page until there is a reason to stop.

        The loop ends when ``self.stopper`` does not allow to proceed, when
        the data of a page cannot be retrieved, or when a page with no
        results is reached (in this case the page counting is reset).

        :param oa: forwarded to the page retrieval and to
            ``self.process_article``.
        :param intext_refs: forwarded to ``self.process_article``.
        """
        while True:
            # Process stopped due to external reasons
            if not self.stopper.can_proceed():
                self.repok.add_sentence(
                    "Process stopped due to external reasons.")
                break

            cur_page = self.__get_next_page()
            result, cur_get_url = self.__get_data_from_page(cur_page, oa)
            if result is None:
                # Re-run the query with the first page,
                # since a wrong page can be specified
                result, cur_get_url = self.__get_data_from_page("*", oa)

            # Stop if there were problems in getting the data
            if result is None:
                # NOTE(review): the attribute is spelled `reper` here; confirm
                # that it matches the name set by the constructor.
                self.reper.add_sentence(
                    "Problems in retrieving data for '%s'" % cur_get_url)
                break

            papers_retrieved = dg(result, ["resultList", "result"])
            if not papers_retrieved:
                # We have browsed all the pages with results, and thus
                # the counting is reset
                self.__reset_page_number()
                self.repok.add_sentence("All the pages have been processed.")
                break

            for paper in papers_retrieved:
                if not self.stopper.can_proceed():
                    break
                self.process_article(
                    dg(paper, ["id"]),
                    dg(paper, ["source"]),
                    self.normalise_doi(dg(paper, ["doi"])),
                    dg(paper, ["pmid"]),
                    dg(paper, ["pmcid"]),
                    oa,
                    intext_refs)

            # The next page is stored only if the current one has not been
            # interrupted
            if self.stopper.can_proceed():
                self.__store_page_number(dg(result, ["nextCursorMark"]))
Exemplo n.º 8
0
    def __create_entry(entry):
        """Build the text of the bibliographic entry of a reference record.

        When the record has an author string different from
        "author unknown", the text is assembled from the structured fields
        (author, year, title, editors, journal abbreviation, publication
        title, series name, volume, issue, pages, edition and DOI), each of
        them preceded by a separator that depends on how the text built so
        far ends. Otherwise, if the record has non-blank unstructured
        information, the text content obtained from
        ``html.document_fromstring`` is used.

        :param entry: the reference record, accessed by means of ``dg``.
        :return: a tuple ``(text, to_process)``, or ``None`` when the record
            has neither a usable author string nor unstructured
            information. ``to_process`` is ``False`` only for structured
            entries without a positive publication year or without a
            non-blank title.
        """
        author = dg(entry, ["authorString"])
        unstructured = dg(entry, ["unstructuredInformation"])

        if author is None or author.lower() == "author unknown":
            if unstructured is not None and unstructured.strip():
                return (html.document_fromstring(
                    unstructured.strip()).text_content(), True)
            return None

        # Raw string literal: the same pattern written as a plain string
        # contains invalid escape sequences ("\.", "\?", "\!"), which are
        # reported as warnings by recent Python versions.
        closing_mark = re.compile(r"[\.\?\!]$")

        def clean(field):
            """Return the stripped value of a field, None if missing/blank."""
            value = dg(entry, [field])
            if value is None or value.strip() == "":
                return None
            return value.strip()

        def unless_closed(text, separator):
            """Return the separator unless the text ends with '.', '?', '!'."""
            return "" if closing_mark.search(text) is not None else separator

        def unless_dot(text):
            """Return a dot unless the text already ends with one."""
            # endswith() (instead of text[-1]) also works when the text built
            # so far is empty, e.g. with an empty author string.
            return "" if text.endswith(".") else "."

        to_process = True
        entry_string = author

        # NOTE(review): the comparison assumes a numeric pubYear - confirm
        # against the data returned by the API.
        year = dg(entry, ["pubYear"])
        if year is not None and year > 0:
            entry_string += " (%s)" % str(year)
        else:
            to_process = False

        title = clean("title")
        if title is not None:
            entry_string += "%s %s" % (unless_dot(entry_string), title)
        else:
            to_process = False

        editors = clean("editors")
        if editors is not None:
            entry_string += "%s %s (Eds.)" % (unless_dot(entry_string),
                                              editors)

        # Journal, container and series share the same separator rule: a
        # comma after the editors, nothing after a closing mark, a dot
        # otherwise
        for field in ("journalAbbreviation", "publicationTitle",
                      "seriesName"):
            venue = clean(field)
            if venue is not None:
                if entry_string.endswith("(Eds.)"):
                    separator = ","
                else:
                    separator = unless_closed(entry_string, ".")
                entry_string += "%s %s" % (separator, venue)

        volume = clean("volume")
        if volume is not None:
            entry_string += "%s %s" % (unless_closed(entry_string, ","),
                                       volume)

        issue = clean("issue")
        if issue is not None:
            # The issue goes in parentheses only when it follows a digit
            # (presumably the volume number)
            if entry_string[-1:].isdigit():
                entry_string += " (%s)" % issue
            else:
                entry_string += " %s" % issue

        page = clean("pageInfo")
        if page is not None:
            entry_string += "%s %s" % (unless_closed(entry_string, ":"), page)

        edition = clean("edition")
        if edition is not None:
            entry_string += "%s %s" % (unless_closed(entry_string, "."),
                                       edition)

        doi = clean("doi")
        if doi is not None:
            entry_string += "%s https://doi.org/%s" % \
                            (unless_dot(entry_string), doi)

        return (entry_string, to_process)
Exemplo n.º 9
0
    def process_references(self):
        """Find or create one resource for each entry in ``self.entries``.

        For every entry, the lookups are tried in this order, stopping at
        the first one that returns a resource: the DOI, the PMID, the PMCID
        and the URL provided with the entry; the text of the entry (only
        when the processing of the entry is enabled); the DOI extracted
        from the text (only when ``self.get_bib_entry_doi`` is true); the
        URL extracted from the text (only when ``self.get_bib_entry_url``
        is true). If nothing is found, a new resource is created. The
        identifiers available are then attached to the resource.

        :return: the list of the resources, in the same order of
            ``self.entries``, or ``None`` as soon as ``self.reperr`` is not
            empty after the lookups of an entry.
        """
        result = []

        for full_entry in self.entries:
            # Each entry is reported separately
            self.repok.new_article()
            self.reperr.new_article()
            cur_res = None

            entry = dg(full_entry, ["bibentry"])
            # The entry text is processed unless the entry itself says
            # something different from "true" in 'process_entry'
            do_process_entry = True
            process_string = dg(full_entry, ["process_entry"])
            if process_string is not None:
                do_process_entry = process_string.lower().strip() == "true"
            provided_doi = dg(full_entry, ["doi"])
            provided_pmid = dg(full_entry, ["pmid"])
            provided_pmcid = dg(full_entry, ["pmcid"])
            provided_url = dg(full_entry, ["url"])

            # This is useful if additional data are stored in the field URL, e.g.:
            # 'http://pub.stat.ee/px/web.2001/dialog/statfile1.asp. Accessed on 2009'
            if provided_url is not None:
                provided_url = FormatProcessor.extract_url(provided_url)

            extracted_doi = FormatProcessor.extract_doi(entry)
            extracted_doi_used = False
            extracted_url = FormatProcessor.extract_url(entry)

            # 1) Lookup by the identifiers provided with the entry
            if provided_doi is not None:
                cur_res = self.process_doi(provided_doi, self.curator,
                                           self.source_provider)
                if cur_res is not None:
                    self.repok.add_sentence(
                        self.message(
                            "The entity has been found by means of the "
                            "DOI provided as input by %s." %
                            self.source_provider, "DOI", provided_doi))

            if cur_res is None and provided_pmid is not None:
                cur_res = self.process_pmid(provided_pmid)
                if cur_res is not None:
                    self.repok.add_sentence(
                        self.message(
                            "The entity has been found by means of the "
                            "PMID provided as input by %s." %
                            self.source_provider, "PMID", provided_pmid))

            if cur_res is None and provided_pmcid is not None:
                cur_res = self.process_pmcid(provided_pmcid)
                if cur_res is not None:
                    self.repok.add_sentence(
                        self.message(
                            "The entity has been found by means of the "
                            "PMCID provided as input by %s." %
                            self.source_provider, "PMCID", provided_pmcid))

            if cur_res is None and provided_url is not None:
                cur_res = self.process_url(provided_url)
                if cur_res is not None:
                    self.repok.add_sentence(
                        self.message(
                            "The entity has been found by means of the "
                            "URL provided as input by %s." %
                            self.source_provider, "URL", provided_url))

            # 2) Lookup by the text of the entry
            if cur_res is None and entry is not None:
                if do_process_entry:
                    cur_res = self.process_entry(entry)

                if cur_res is None:
                    # 3) Lookup by the DOI and the URL found in the text
                    if self.get_bib_entry_doi and extracted_doi is not None:
                        # The flag is set even if the lookup returns nothing,
                        # so that the extracted DOI is attached to the
                        # resource created below
                        extracted_doi_used = True
                        cur_res = self.process_doi(extracted_doi, self.name,
                                                   self.source_provider)
                        if cur_res is not None:
                            self.repok.add_sentence(
                                self.message(
                                    "The entity for '%s' has been found by means of the "
                                    "DOI extracted from it." % entry, "DOI",
                                    extracted_doi))
                    if cur_res is None and self.get_bib_entry_url and extracted_url is not None:
                        existing_res = self.rf.retrieve_from_url(extracted_url)
                        if existing_res is not None:
                            cur_res = self.g_set.add_br(
                                self.name, self.source_provider, self.source,
                                existing_res)
                            self.repok.add_sentence(
                                self.message(
                                    "The entity for '%s' has been found by means of the "
                                    "URL extracted from it." % entry, "URL",
                                    extracted_url))

                else:
                    self.repok.add_sentence(
                        self.message(
                            "The entity has been retrieved by using the search API.",
                            "entry", entry))

            # If no errors were generated, proceed
            if self.reperr.is_empty():
                # If no resource has been found, a new one is created
                if cur_res is None:
                    cur_res = self.g_set.add_br(self.name)
                    self.rf.update_graph_set(self.g_set)
                    self.repok.add_sentence(
                        self.message(
                            "The entity has been created even if no results have "
                            "been returned by the API.", "entry", entry))

                # Add the DOI, the PMID and the PMCID if they have been provided by the curator
                # (if they are not already associated to the resource)
                self.__add_doi(cur_res, provided_doi, self.curator)
                self.__add_pmid(cur_res, provided_pmid)
                self.__add_pmcid(cur_res, provided_pmcid)
                self.__add_url(cur_res, provided_url)

                # Add any DOI extracted from the entry if it is not already included (and only if
                # a resource has not been retrieved by a DOI specified in the entry explicitly, or
                # by a Crossref search).
                if self.get_bib_entry_doi and extracted_doi_used:
                    self.__add_doi(cur_res, extracted_doi, self.name)

                # Add any URL extracted from the entry if it is not already included
                if self.get_bib_entry_url:
                    self.__add_url(cur_res, extracted_url)

                result += [cur_res]
                self.rf.update_graph_set(self.g_set)

            else:  # If errors have been raised, stop the whole process (by returning None)
                return None

        # If the process comes here, then everything worked correctly
        return result
Exemplo n.º 10
0
    def author(self, cur_br, key, json, source, *args):
        """Create the agents and the roles for the people listed in a record.

        For each person in ``json[key]`` an agent is created (or reused, if
        an agent with the same ORCID is returned by ``self.rf``), together
        with a role that links it to ``cur_br``. The role is an editor role
        when ``json["type"]`` is ``"edited-book"``, an author role
        otherwise. Each role is declared as following the previous one, so
        as to keep the order of the list.

        An ORCID is associated to a person only when it can be matched
        unambiguously, by comparing family and given names with the records
        returned by ``self.of.get_orcid_ids`` for the DOI of the record.

        :param cur_br: the bibliographic resource the roles refer to.
        :param key: the key of ``json`` that contains the list of people.
        :param json: the record describing the bibliographic resource.
        :param source: the source passed to ``self.g_set`` when the new
            entities are created.
        :param args: not used.
        """
        # Get all ORCID of the authors (if any)
        all_authors = json[key]
        all_family_names = dg(all_authors, ["family"])
        author_orcid = []
        if "DOI" in json and all_family_names:
            doi_string = json["DOI"]
            if self.of is not None:
                author_orcid = self.of.get_orcid_ids(doi_string, all_family_names)

        # Used to create ordered list of authors/editors of bibliographic entities
        prev_role = None

        # Analyse all the people. The list read by means of `key` is used
        # here too, so that the loop and the ORCID matching below always
        # work on the same data (a literal "author" key was used before).
        for author in all_authors:
            given_name_string = author["given"] if "given" in author else None
            family_name_string = author["family"] if "family" in author else None

            cur_orcid_record = None  # TODO: handle if ORCID in Crossref
            if family_name_string:
                # Get all the ORCID/author records retrieved that share the
                # family name into consideration
                orcid_with_such_family = dgt(author_orcid, "family", family_name_string)
                author_with_such_family = dgt(all_authors, "family", family_name_string)
                if len(orcid_with_such_family) == 1 and len(author_with_such_family) == 1:
                    cur_orcid_record = orcid_with_such_family[0]
                elif given_name_string is not None and \
                        len(orcid_with_such_family) >= 1 and len(author_with_such_family) >= 1:
                    # From the previous lists of ORCID/author record, get the list
                    # of all the given name defined
                    orcid_given_with_such_family = dg(orcid_with_such_family, ["given"])
                    author_given_with_such_family = dg(author_with_such_family, ["given"])

                    # Get the indexes of the previous list that best match with the
                    # given name of the author we are considering
                    closest_orcid_matches_idx = \
                        slc(orcid_given_with_such_family, given_name_string)
                    closest_author_matches_idx = \
                        slc(author_given_with_such_family, given_name_string)
                    if len(closest_orcid_matches_idx) == 1 and \
                            len(closest_author_matches_idx) == 1:
                        # NOTE(review): the first ORCID given name is used
                        # here, not the one at closest_orcid_matches_idx[0] -
                        # confirm that this is intended.
                        closest_author_orcid_matches_idx = slc(
                            author_given_with_such_family, orcid_given_with_such_family[0])
                        if closest_author_orcid_matches_idx == closest_author_matches_idx:
                            cur_orcid_record = \
                                orcid_with_such_family[closest_orcid_matches_idx[0]]

            # An ORCID has been found to match with such author record, and we try to
            # see if such orcid (and thus, the author) has been already added in the
            # store
            retrieved_agent = None
            if cur_orcid_record is not None and self.rf is not None:  # TODO: handle if ORCID in Crossref
                retrieved_agent = self.rf.retrieve_from_orcid(cur_orcid_record["orcid"])

            # If the resource does not exist already, create a new one
            if retrieved_agent is None:
                cur_agent = self.g_set.add_ra(self.name, self.id, source)
                if cur_orcid_record is not None and self.of is not None:
                    cur_agent_orcid = self.g_set.add_id(self.of.name, self.of.id, self.of.get_last_query())
                    cur_agent_orcid.create_orcid(cur_orcid_record["orcid"])
                    cur_agent.has_id(cur_agent_orcid)
                    # The resource finder is optional (see the check done
                    # before retrieving the agent), thus it is checked here
                    # as well
                    if self.rf is not None:
                        self.rf.add_orcid_to_store(cur_agent, cur_agent_orcid, cur_orcid_record["orcid"])

                # The names in the record are preferred to those of the ORCID
                if given_name_string is not None:
                    cur_agent.create_given_name(given_name_string)
                elif cur_orcid_record is not None and "given" in cur_orcid_record:
                    cur_agent.create_given_name(cur_orcid_record["given"])

                if family_name_string is not None:
                    cur_agent.create_family_name(family_name_string)
                elif cur_orcid_record is not None and "family" in cur_orcid_record:
                    cur_agent.create_family_name(cur_orcid_record["family"])
            else:
                cur_agent = self.g_set.add_ra(self.name, self.id, source, retrieved_agent)

            # Add statements related to the author resource (that could or could not
            # exist in the store)
            cur_role = self.g_set.add_ar(self.name, self.id, source)
            if json["type"] == "edited-book":
                cur_role.create_editor(cur_br)
            else:
                cur_role.create_author(cur_br)
            cur_agent.has_role(cur_role)

            if prev_role is not None:
                cur_role.follows(prev_role)

            prev_role = cur_role
Exemplo n.º 11
0
    def process_citing_entity(self):
        """Create or retrieve the citing entity and link it to what it cites.

        The citing entity is looked for, in order, by means of ``self.occ``
        and of ``self.doi`` (both lookups are requested with
        ``typ='only_blazegraph'``); if neither gives a result, a new entity
        is created and a sentence is added to ``self.repok``. The PMID and
        PMCID are then attached and the references are processed: every
        cited entity is linked to the citing one and, when the related
        entry has a non-blank ``bibentry``, a bibliographic entry with that
        text (and with the ``xmlid`` of the entry) is created as well.
        Finally, if ``self.intext_refs`` is true, the reference pointers
        are processed by ``jt.process_reference_pointers``.

        :return: ``self.g_set``, or ``None`` when ``self.process_references``
            returns ``None``.
        """
        citing_br = None

        # First attempt: the resource identified by self.occ
        if self.occ is not None:
            stored_resource = self.rf.retrieve_entity(
                self.occ, GraphEntity.expression, typ='only_blazegraph')
            citing_br = self.g_set.add_br(
                self.name, self.id, self.source_provider, stored_resource)

        # Second attempt: the DOI
        if citing_br is None and self.doi is not None:
            citing_br = self.process_doi_query(
                self.doi, self.curator, self.source_provider,
                typ='only_blazegraph')

        # Fallback: if the citing entity hasn't been found, then create one
        if citing_br is None:
            citing_br = self.g_set.add_br(self.name)
            self.__add_doi(citing_br, self.doi, self.curator)
            self.repok.add_sentence(
                self.message(
                    "The citing entity has been created even if no results have "
                    "been returned by the API.", "doi", self.doi))

        # Add other ids if they exist
        self.__add_pmid(citing_br, self.pmid)
        self.__add_pmcid(citing_br, self.pmcid)

        # Process all the references contained and return related entities
        cited_brs = self.process_references(citing_entity=citing_br,
                                            citing_doi=self.doi)
        if cited_brs is None:
            return None

        # Triples (cited entity, xmlid, bibliographic entry) of the
        # references that have a textual entry
        linked_entries = []
        for position, cited_br in enumerate(cited_brs):
            citing_br.has_citation(cited_br)

            # The i-th cited entity corresponds to the i-th input entry
            source_entry = self.entries[position]
            entry_text = dg(source_entry, ["bibentry"])
            entry_xmlid = dg(source_entry, ["xmlid"])
            if entry_text is None or not entry_text.strip():
                continue

            bib_entry = self.g_set.add_be(
                self.curator, self.source_provider, self.source)
            citing_br.contains_in_reference_list(bib_entry)
            cited_br.has_reference(bib_entry)
            self.__add_xmlid(bib_entry, entry_xmlid)
            bib_entry.create_content(entry_text.strip())
            linked_entries.append((cited_br, entry_xmlid, bib_entry))

        # create rp, pl, de, ci, an
        if self.intext_refs:
            jt.process_reference_pointers(
                citing_br, linked_entries, self.reference_pointers,
                self.g_set, self.curator, self.source_provider, self.source)

        return self.g_set