Exemplos de dg em Python, exemplos de support.dg em Python

Exemplo n.º 1

0

Exibir arquivo

    def get_orcid_ids(self, doi_string, family_names=[]):
        """Collect ORCID identifiers and personal names for the given DOI.

        The raw records are obtained from ``self.get_orcid_records``. Every
        search result exposing an ORCID profile, an identifier path and a
        personal-details section contributes one item (wrapped with ``da``)
        with the keys "orcid", "given", "family", "credit" and "other".

        Returns a list, which is empty when no usable record is found.
        """
        # NOTE(review): the mutable default is only forwarded by this method,
        # never modified here.
        found = []

        records = self.get_orcid_records(doi_string, family_names)
        if records is None:
            return found

        search_results = dg(records, ["orcid-search-results", "orcid-search-result"])
        if search_results is None:
            return found

        for item in search_results:
            profile = dg(item, ["orcid-profile"])
            if profile is None:
                continue

            orcid_id = dg(profile, ["orcid-identifier", "path"])
            if orcid_id is None:
                continue

            details = dg(profile, ["orcid-bio", "personal-details"])
            if details is None:
                continue

            found.append(da({
                "orcid": orcid_id,
                "given": dg(details, ["given-names", "value"]),
                "family": dg(details, ["family-name", "value"]),
                "credit": dg(details, ["credit-name", "value"]),
                "other": dg(details, ["other-names", "other-name", "value"])
            }))

        return found

Exemplo n.º 2

0

Exibir arquivo

Arquivo: epmc_processor.py Projeto: project-renard-survey/opencitations

    def __get_paper_data(self, source, paper_id):
        """Return the identifiers of the paper matching an external id/source pair.

        The query URL is ``self.paper_api`` followed by
        ``ext_id:<paper_id>+src:<source>``. Only the first matched result is
        inspected. The returned dictionary always has the keys "doi", "pmid"
        and "pmcid"; a value is None when it could not be found.
        """
        paper_data = self.__get_data(
            self.paper_api + "ext_id:%s+src:%s" % (paper_id, source))

        identifiers = {"doi": None, "pmid": None, "pmcid": None}

        matches = dg(paper_data, ["resultList", "result"])
        if matches is not None and len(matches) > 0:
            # Only the first match is considered
            for key in ("doi", "pmid", "pmcid"):
                identifiers[key] = dg(matches[0], [key])

        return identifiers

Exemplo n.º 3

0

Exibir arquivo

Arquivo: crossref_processor.py Projeto: patmha/CiTOCrawler

    def process(self):
        """Create the citation data of the citing resource identified by a DOI.

        Returns the graph set (``self.g_set``) when the citing resource was not
        already known and its references have been processed. In all the other
        cases (no DOI specified, resource already processed, no cited entities
        available) the method returns None.
        """
        if self.doi is None:
            # No DOI has been specified for the citing resource
            self.reperr.add_sentence(
                "No DOI has been specified for the citing resource.")
            return None

        # Look for an existing citing resource, trying one identifier at a time
        citing_resource = self.rf.retrieve_citing_from_doi(self.doi)
        if citing_resource is None and self.pmid is not None:
            citing_resource = self.rf.retrieve_citing_from_pmid(self.pmid)
        if citing_resource is None and self.pmcid is not None:
            citing_resource = self.rf.retrieve_citing_from_pmcid(self.pmcid)
        if citing_resource is None and self.url is not None:
            citing_resource = self.rf.retrieve_citing_from_url(self.url)

        if citing_resource is not None:
            self.repok.add_sentence(
                "The citing entity with DOI '%s' has been already "
                "processed in the past." % self.doi)
            return None

        citing_entity = self.process_doi(
            self.doi, self.curator, self.source_provider)
        if citing_entity is None:
            # Fall back to a bare resource carrying only the DOI
            citing_entity = self.g_set.add_br(self.name)
            self.__add_doi(citing_entity, self.doi, self.curator)
            self.rf.update_graph_set(self.g_set)
            note = self.message(
                "The citing entity has been created even if no results have "
                "been returned by the API.", "doi", self.doi)
            self.repok.add_sentence(note)

        # Add other ids if they exist
        self.__add_pmid(citing_entity, self.pmid)
        self.__add_pmcid(citing_entity, self.pmcid)

        cited_entities = self.process_references()
        if cited_entities is None:
            return None

        for position, cited_entity in enumerate(cited_entities):
            citing_entity.has_citation(cited_entity)
            bibentry_text = dg(self.entries[position], ["bibentry"])
            if bibentry_text is None or not bibentry_text.strip():
                continue
            bib_entry = self.g_set.add_be(
                self.curator, self.source_provider, self.source)
            citing_entity.contains_in_reference_list(bib_entry)
            cited_entity.has_reference(bib_entry)
            bib_entry.create_content(bibentry_text.strip())

        return self.g_set

Exemplo n.º 4

0

Exibir arquivo

Arquivo: epmc_processor.py Projeto: project-renard-survey/opencitations

    def __process_references(self, cur_source, cur_id):
        """Fetch the reference list of a paper and register each reference.

        The URL is obtained from ``self.ref_list_api`` by replacing the
        placeholders "XXX" and "YYY" with ``cur_source`` and ``cur_id``.
        Every reference found is handed to ``self.rs.add_reference``.

        Returns the reference list URL, or None when the retrieved data
        contains no reference list.
        """
        ref_list_url = self.ref_list_api.replace(
            "XXX", cur_source).replace("YYY", cur_id)

        paper_references = self.__get_data(ref_list_url)
        references = dg(paper_references, ["referenceList", "reference"])
        if references is None:
            return None

        self.rs.new_ref_list()
        for reference in references:
            ref_entry = self.__create_entry(reference)
            if ref_entry is None:
                # No text available: leave the entry marked as to be processed
                entry_text, process_entry_text = None, True
            else:
                entry_text, process_entry_text = ref_entry[0], ref_entry[1]

            if reference["match"] == "Y":
                # Add special data if the reference matches with
                # the ePMC database
                ref_id = reference["id"]
                ref_source = reference["source"]
                ref_localid = ref_source + "-" + ref_id

                paper_ids = self.__get_paper_data(ref_source, ref_id)
                ref_doi, ref_pmid, ref_pmcid = (
                    paper_ids["doi"], paper_ids["pmid"], paper_ids["pmcid"])
            else:
                ref_localid = None
                ref_doi = dg(reference, ["doi"])
                ref_pmid = dg(reference, ["pmid"])
                ref_pmcid = dg(reference, ["pmcid"])
            ref_url = dg(reference, ["externalLink"])

            self.rs.add_reference(
                entry_text, process_entry_text, ref_localid,
                ref_doi, ref_pmid, ref_pmcid, ref_url)

        return ref_list_url

Exemplo n.º 5

0

Exibir arquivo

Arquivo: crossref_processor.py Projeto: essepuntato/opencitations

    def process(self):
        """Populate the graph set with the citations of the resource having ``self.doi``.

        The graph set (``self.g_set``) is returned only when the citing
        resource has not been processed before and a list of cited entities is
        available. Otherwise None is returned: when no DOI has been specified,
        when the citing resource is already known, or when
        ``self.process_references`` gives back None.
        """
        # Guard: nothing can be done without a DOI
        if self.doi is None:
            self.reperr.add_sentence("No DOI has been specified for the citing resource.")
            return None

        # Check, identifier by identifier, whether the citing resource exists already
        citing_resource = self.rf.retrieve_citing_from_doi(self.doi)
        if citing_resource is None and self.pmid is not None:
            citing_resource = self.rf.retrieve_citing_from_pmid(self.pmid)
        if citing_resource is None and self.pmcid is not None:
            citing_resource = self.rf.retrieve_citing_from_pmcid(self.pmcid)
        if citing_resource is None and self.url is not None:
            citing_resource = self.rf.retrieve_citing_from_url(self.url)

        # Guard: the resource has been handled in a previous run
        if citing_resource is not None:
            already_done = ("The citing entity with DOI '%s' has been already "
                            "processed in the past." % self.doi)
            self.repok.add_sentence(already_done)
            return None

        citing_entity = self.process_doi(self.doi, self.curator, self.source_provider)
        if citing_entity is None:
            # Nothing came back for the DOI: create a minimal entity anyway
            citing_entity = self.g_set.add_br(self.name)
            self.__add_doi(citing_entity, self.doi, self.curator)
            self.rf.update_graph_set(self.g_set)
            self.repok.add_sentence(
                self.message(
                    "The citing entity has been created even if no results have "
                    "been returned by the API.",
                    "doi", self.doi))

        # Add other ids if they exist
        self.__add_pmid(citing_entity, self.pmid)
        self.__add_pmcid(citing_entity, self.pmcid)

        cited_entities = self.process_references()
        if cited_entities is None:
            return None

        for idx, cited_entity in enumerate(cited_entities):
            citing_entity.has_citation(cited_entity)

            entry_text = dg(self.entries[idx], ["bibentry"])
            has_text = entry_text is not None and entry_text.strip()
            if has_text:
                new_be = self.g_set.add_be(self.curator, self.source_provider, self.source)
                citing_entity.contains_in_reference_list(new_be)
                cited_entity.has_reference(new_be)
                new_be.create_content(entry_text.strip())

        return self.g_set

Exemplo n.º 6

0

Exibir arquivo

Arquivo: epmc_processor.py Projeto: project-renard-survey/opencitations

    def process(self, oa=False):
        """Browse the result pages and store the references of every new article.

        The loop keeps requesting pages until one of these happens: the
        stopper forbids proceeding, the data of a page cannot be retrieved, or
        a page comes back without results (in which case the page counter is
        reset). For each article not stored yet, the reference list is built
        with ``__process_xml_source`` when ``oa`` is true, and with
        ``__process_references`` otherwise.

        The method returns nothing; outcomes are written to the reporters.
        """
        while True:
            if not self.stopper.can_proceed():
                # Process stopped due to external reasons
                self.repok.add_sentence(
                    "Process stopped due to external reasons.")
                break

            cur_page = self.__get_next_page()
            result, cur_get_url = self.__get_data_from_page(cur_page, oa)
            # Re-run the query with the first page,
            # since a wrong page can be specified
            if result is None:
                result, cur_get_url = self.__get_data_from_page("*", oa)

            # Stop if there were problems in getting the data
            if result is None:
                self.reper.add_sentence(
                    "Problems in retrieving data for '%s'" % cur_get_url)
                break

            papers_retrieved = dg(result, ["resultList", "result"])
            if papers_retrieved is None or not papers_retrieved:
                # We have browsed all the pages with results,
                # and thus the counting is reset
                self.__reset_page_number()
                self.repok.add_sentence("All the pages has been processed.")
                break

            for paper in papers_retrieved:
                if not self.stopper.can_proceed():
                    break

                paper_id = dg(paper, ["id"])
                paper_source = dg(paper, ["source"])
                doi = dg(paper, ["doi"])
                pmid = dg(paper, ["pmid"])
                pmcid = dg(paper, ["pmcid"])
                if doi is None and pmcid is not None:
                    doi = self.__get_doi_from_xml_source(pmcid)
                local_id = paper_source + "-" + paper_id
                id_list = [doi, pmid, pmcid, local_id]

                if self.rs.is_any_stored(id_list):
                    self.repok.add_sentence(
                        "The article '%s' has been already stored." % local_id)
                    continue

                self.repok.new_article()
                self.repok.add_sentence(
                    "Processing article with local id '%s'." % local_id)

                if oa:
                    ref_list_url = self.__process_xml_source(pmcid)
                else:
                    ref_list_url = self.__process_references(
                        paper_source, paper_id)

                if ref_list_url is None:
                    self.reper.add_sentence(
                        "The article '%s' has no references or its PubMed Central "
                        "ID is not defined." % local_id)
                    continue

                # The first available identifier is passed as first argument
                first_known_id = next(
                    item for item in id_list if item is not None)
                self.rs.store(
                    first_known_id, local_id, doi, pmid, pmcid,
                    self.name, self.provider, encode_url(ref_list_url))
                self.repok.add_sentence(
                    "References of '%s' have been stored." % local_id)

            # Remember the next page only if the run was not interrupted
            if self.stopper.can_proceed():
                self.__store_page_number(dg(result, ["nextCursorMark"]))

Exemplo n.º 7

0

Exibir arquivo

Arquivo: epmc_processor.py Projeto: project-renard-survey/opencitations

    def __create_entry(entry):
        """Build the textual form of a bibliographic reference.

        NOTE(review): the signature has no ``self``; presumably the method is
        declared as a static method outside the visible lines - confirm.

        Args:
            entry: the reference record, read through ``dg`` for the fields
                "authorString", "unstructuredInformation", "pubYear", "title",
                "editors", "journalAbbreviation", "publicationTitle",
                "seriesName", "volume", "issue", "pageInfo", "edition", "doi".

        Returns:
            None when the record has neither a usable author string nor
            unstructured information. Otherwise a tuple ``(text, flag)``:
            * with a usable author string, ``text`` is assembled from the
              available fields and ``flag`` is True only if both a positive
              year and a non-blank title were present;
            * otherwise ``text`` is the text content of the unstructured
              information and ``flag`` is True.
        """
        def ends_with_punctuation(text):
            # Raw string: the same pattern written as a plain literal contains
            # invalid escape sequences, which recent Python versions warn about.
            return re.search(r"[\.\?\!]$", text) is not None

        def non_blank(field):
            # Stripped value of the field, or None if missing or blank
            value = dg(entry, [field])
            if value is None or value.strip() == "":
                return None
            return value.strip()

        def container_separator(text):
            # A comma follows an editors list; otherwise a full stop is added
            # unless the text already ends with punctuation
            if text.endswith("(Eds.)"):
                return ","
            return "" if ends_with_punctuation(text) else "."

        author = dg(entry, ["authorString"])
        unstructured = dg(entry, ["unstructuredInformation"])

        if author is not None and author.lower() != "author unknown":
            to_process = True
            entry_string = author

            year = dg(entry, ["pubYear"])
            if year is not None and year > 0:
                entry_string += " (%s)" % str(year)
            else:
                to_process = False

            # str.endswith / slicing are used instead of entry_string[-1] so
            # that an empty author string does not raise IndexError
            title = non_blank("title")
            if title is not None:
                entry_string += "%s %s" % (
                    "" if entry_string.endswith(".") else ".", title)
            else:
                to_process = False

            editors = non_blank("editors")
            if editors is not None:
                entry_string += "%s %s (Eds.)" % (
                    "" if entry_string.endswith(".") else ".", editors)

            # Journal, container and series share the same separator rule
            for field in ("journalAbbreviation", "publicationTitle", "seriesName"):
                value = non_blank(field)
                if value is not None:
                    entry_string += "%s %s" % (
                        container_separator(entry_string), value)

            volume = non_blank("volume")
            if volume is not None:
                entry_string += "%s %s" % (
                    "" if ends_with_punctuation(entry_string) else ",", volume)

            issue = non_blank("issue")
            if issue is not None:
                # The issue goes in parentheses when it follows a digit
                if entry_string[-1:].isdigit():
                    entry_string += " (%s)" % issue
                else:
                    entry_string += " %s" % issue

            page = non_blank("pageInfo")
            if page is not None:
                entry_string += "%s %s" % (
                    "" if ends_with_punctuation(entry_string) else ":", page)

            edition = non_blank("edition")
            if edition is not None:
                entry_string += "%s %s" % (
                    "" if ends_with_punctuation(entry_string) else ".", edition)

            doi = non_blank("doi")
            if doi is not None:
                entry_string += "%s http://dx.doi.org/%s" % (
                    "" if entry_string.endswith(".") else ".", doi)

            return (entry_string, to_process)

        if unstructured is not None and len(unstructured.strip()):
            return (html.document_fromstring(
                unstructured.strip()).text_content(), True)

        return None

Exemplo n.º 8

0

Exibir arquivo

Arquivo: crossref_processor.py Projeto: essepuntato/opencitations

    def process_crossref_json(
            self, crossref_json, crossref_source,
            doi_curator=None, doi_source_provider=None, doi_source=None):
        # Check if the found bibliographic resource already exist either
        # in the triplestore or in the current graph set.
        self.rf.update_graph_set(self.g_set)
        retrieved_resource = self.rf.retrieve(self.__get_ids_for_type(crossref_json))

        if retrieved_resource is not None:
            cur_br = self.g_set.add_br(self.name, self.id, crossref_source, retrieved_resource)
        else:
            cur_br = self.g_set.add_br(self.name, self.id, crossref_source)
            for key in crossref_json:
                if key == "title":
                    cur_title = self.__create_title_from_list(crossref_json[key])
                    cur_br.create_title(cur_title)
                elif key == "subtitle":
                    cur_br.create_subtitle(self.__create_title_from_list(crossref_json[key]))
                elif key == "author":
                    # Get all ORCID of the authors (if any)
                    all_authors = crossref_json["author"]
                    all_family_names = dg(all_authors, ["family"])
                    author_orcid = []
                    if "DOI" in crossref_json and all_family_names:
                        doi_string = crossref_json["DOI"]
                        author_orcid = self.of.get_orcid_ids(doi_string, all_family_names)

                    # Used to create ordered list of authors/editors of bibliographic entities
                    prev_role = None

                    # Analyse all authors
                    for author in crossref_json["author"]:
                        given_name_string = None
                        if "given" in author:
                            given_name_string = author["given"]
                        family_name_string = None
                        if "family" in author:
                            family_name_string = author["family"]

                        cur_orcid_record = None
                        if family_name_string:
                            # Get all the ORCID/author records retrieved that share the
                            # family name into consideration
                            orcid_with_such_family = dgt(author_orcid, "family", family_name_string)
                            author_with_such_family = dgt(all_authors, "family", family_name_string)
                            if len(orcid_with_such_family) == 1 and len(author_with_such_family) == 1:
                                cur_orcid_record = orcid_with_such_family[0]
                            elif given_name_string is not None and \
                                 len(orcid_with_such_family) >= 1 and len(author_with_such_family) >= 1:
                                # From the previous lists of ORCID/author record, get the list
                                # of all the given name defined
                                orcid_given_with_such_family = dg(orcid_with_such_family, ["given"])
                                author_given_with_such_family = dg(author_with_such_family, ["given"])

                                # Get the indexes of the previous list that best match with the
                                # given name of the author we are considering
                                closest_orcid_matches_idx = \
                                    slc(orcid_given_with_such_family, given_name_string)
                                closest_author_matches_idx = \
                                    slc(author_given_with_such_family, given_name_string)
                                if len(closest_orcid_matches_idx) == 1 and \
                                   len(closest_author_matches_idx) == 1:
                                    closest_author_orcid_matches_idx = slc(
                                        author_given_with_such_family, orcid_given_with_such_family[0])
                                    if closest_author_orcid_matches_idx == closest_author_matches_idx:
                                        cur_orcid_record = \
                                            orcid_with_such_family[closest_orcid_matches_idx[0]]

                        # An ORCID has been found to match with such author record, and we try to
                        # see if such orcid (and thus, the author) has been already added in the
                        # store
                        retrieved_agent = None
                        if cur_orcid_record is not None:
                            retrieved_agent = self.rf.retrieve_from_orcid(cur_orcid_record["orcid"])

                        # If the resource does not exist already, create a new one
                        if retrieved_agent is None:
                            cur_agent = self.g_set.add_ra(self.name, self.id, crossref_source)
                            if cur_orcid_record is not None:
                                cur_agent_orcid = \
                                    self.g_set.add_id(self.of.name, self.of.id, self.of.get_last_query())
                                cur_agent_orcid.create_orcid(cur_orcid_record["orcid"])
                                cur_agent.has_id(cur_agent_orcid)

                            if given_name_string is not None:
                                cur_agent.create_given_name(given_name_string)
                            elif cur_orcid_record is not None and "given" in cur_orcid_record:
                                cur_agent.create_given_name(cur_orcid_record["given"])

                            if family_name_string is not None:
                                cur_agent.create_family_name(family_name_string)
                            elif cur_orcid_record is not None and "family" in cur_orcid_record:
                                cur_agent.create_family_name(cur_orcid_record["family"])
                        else:
                            cur_agent = self.g_set.add_ra(
                                self.name, self.id, crossref_source, retrieved_agent)

                        # Add statements related to the author resource (that could or could not
                        # exist in the store)
                        cur_role = self.g_set.add_ar(self.name, self.id, crossref_source)
                        if crossref_json["type"] == "edited-book":
                            cur_role.create_editor(cur_br)
                        else:
                            cur_role.create_author(cur_br)
                        cur_agent.has_role(cur_role)

                        if prev_role is not None:
                            cur_role.follows(prev_role)

                        prev_role = cur_role

                elif key == "publisher":
                    cur_agent = None

                    # Check if the publishier already exists
                    if "member" in crossref_json and crossref_json["member"] is not None:
                        cur_member_url = crossref_json["member"]
                        retrieved_agent = self.rf.retrieve_from_url(cur_member_url)
                        if retrieved_agent is not None:
                            cur_agent = self.g_set.add_ra(
                                self.name, self.id, crossref_source, retrieved_agent)
                    else:
                        cur_member_url = None

                    # If the publisher is not already defined in the knowledge base,
                    # create a new one.
                    if cur_agent is None:
                        cur_agent = self.g_set.add_ra(self.name, self.id, crossref_source)
                        cur_agent.create_name(crossref_json[key])

                        if cur_member_url is not None:
                            cur_agent_id = self.g_set.add_id(self.name, self.id, crossref_source)
                            cur_agent_id.create_url(crossref_json["member"])
                            cur_agent.has_id(cur_agent_id)

                    cur_role = self.g_set.add_ar(self.name, self.id, crossref_source)
                    cur_role.create_publisher(cur_br)
                    cur_agent.has_role(cur_role)
                elif key == "DOI":
                    cur_id = self.g_set.add_id(doi_curator, doi_source_provider, doi_source)
                    if cur_id.create_doi(crossref_json[key]):
                        cur_br.has_id(cur_id)
                elif key == "issued":
                    cur_br.create_pub_year(crossref_json[key]["date-parts"][0][0])
                elif key == "URL":
                    cur_id = self.g_set.add_id(self.name, self.id, crossref_source)
                    if cur_id.create_url(crossref_json[key]):
                        cur_br.has_id(cur_id)
                elif key == "page":
                    cur_page = crossref_json[key]
                    cur_re = self.g_set.add_re(self.name, self.id, crossref_source)
                    if cur_re.create_starting_page(cur_page):
                        cur_re.create_ending_page(cur_page)
                        cur_br.has_format(cur_re)
                elif key == "container-title":
                    retrieved_container = None
                    cont_br = None
                    cur_type = crossref_json["type"]

                    container_ids = self.__get_ids_for_container(crossref_json)
                    cur_issue_id = crossref_json["issue"] if "issue" in crossref_json else None
                    cur_volume_id = crossref_json["volume"] if "volume" in crossref_json else None
                    if cur_type == "journal-article":
                        if cur_issue_id is None:
                            if cur_volume_id is None:
                                retrieved_container = self.rf.retrieve(container_ids)
                            else:
                                retrieved_container = \
                                    self.rf.retrieve_volume_from_journal(container_ids, cur_volume_id)
                        else:
                            retrieved_container = self.rf.retrieve_issue_from_journal(
                                container_ids, cur_issue_id, cur_volume_id)
                    elif cur_type == "journal-issue":
                        if cur_volume_id is None:
                            retrieved_container = self.rf.retrieve(container_ids)
                        else:
                            retrieved_container = \
                                self.rf.retrieve_volume_from_journal(container_ids, cur_volume_id)
                    else:
                        retrieved_container = self.rf.retrieve(container_ids)

                    if retrieved_container is not None:
                        cont_br = self.g_set.add_br(
                            self.name, self.id, crossref_source, retrieved_container)
                    else:
                        cur_container_title = None
                        if len(crossref_json[key]) > 0:
                            cur_container_title = self.__create_title_from_list(crossref_json[key])

                        if cur_container_title is not None:
                            cur_container_type = None
                            cont_br = self.g_set.add_br(self.name, self.id, crossref_source)

                            if cur_type == "book-chapter":
                                cur_container_type = "book"
                                cont_br.create_book()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "book-part":
                                cur_container_type = "book"
                                cont_br.create_book()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "book-section":
                                cur_container_type = "book"
                                cont_br.create_book()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "book-track":
                                cur_container_type = "book-section"
                                cont_book = self.g_set.add_br(self.name, self.id, crossref_source)
                                cont_book.create_book()
                                cont_book.create_title(cur_container_title)
                                self.__associate_isbn(cont_book, crossref_json, crossref_source)
                                cont_book.has_part(cont_br)
                                cont_br.create_book_section()
                            elif cur_type == "component":
                                cur_container_type = "component"
                                cont_br.create_expression_collection()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "dataset":
                                cur_container_type = "dataset"
                                cont_br.create_expression_collection()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "journal-article":
                                if "issue" not in crossref_json and "volume" not in crossref_json:
                                    cur_container_type = "journal"
                                    jou_br = cont_br
                                    self.__add_journal_data(jou_br, cur_container_title)
                                else:
                                    # If we have an issue or a volume specified, the journal may have
                                    # been already added to the corpus in the past. Thus, check it
                                    # before creating a new object for that journal
                                    retrieved_journal = self.rf.retrieve(container_ids)
                                    if retrieved_journal is None:
                                        jou_br = self.g_set.add_br(self.name, self.id, crossref_source)
                                        self.__associate_issn(jou_br, crossref_json, crossref_source)
                                        self.__add_journal_data(
                                            jou_br, cur_container_title)
                                    else:
                                        jou_br = self.g_set.add_br(
                                            self.name, self.id, crossref_source, retrieved_journal)

                                    if "issue" in crossref_json:
                                        cur_container_type = "issue"
                                        cont_br.create_issue()
                                        cont_br.create_number(crossref_json["issue"])
                                        if "volume" not in crossref_json:
                                            jou_br.has_part(cont_br)

                                    if "volume" in crossref_json:
                                        cur_volume_id = crossref_json["volume"]
                                        if "issue" in crossref_json:
                                            # If we have an issue specified, the volume may have
                                            # been already added to the corpus in the past. Thus, check it
                                            # before creating a new object for that volume
                                            retrieved_volume = self.rf.retrieve_volume_from_journal(
                                                container_ids, cur_volume_id)
                                            if retrieved_volume is None:
                                                vol_br = self.g_set.add_br(
                                                    self.name, self.id, crossref_source)
                                                self.__add_volume_data(vol_br, cur_volume_id)
                                                jou_br.has_part(vol_br)
                                            else:
                                                vol_br = self.g_set.add_br(
                                                    self.name, self.id, crossref_source, retrieved_volume)
                                            vol_br.has_part(cont_br)
                                        else:
                                            cur_container_type = "volume"
                                            vol_br = cont_br
                                            self.__add_volume_data(vol_br, cur_volume_id)
                                            jou_br.has_part(vol_br)
                            elif cur_type == "journal-issue":
                                cur_container_type = "journal"
                                if "volume" in crossref_json:
                                    cur_container_type = "volume"
                                    self.__add_volume_data(cont_br, crossref_json["volume"])
                                    # If we have a volume specified, the journal may have
                                    # been already added to the corpus in the past. Thus, check it
                                    # before creating a new object for that journal
                                    retrieved_journal = self.rf.retrieve(container_ids)
                                    if retrieved_journal is None:
                                        jou_br = self.g_set.add_br(self.name, self.id, crossref_source)
                                        self.__associate_issn(jou_br, crossref_json, crossref_source)
                                        self.__add_journal_data(
                                            jou_br, cur_container_title)
                                    else:
                                        jou_br = self.g_set.add_br(
                                            self.name, self.id, crossref_source, retrieved_journal)

                                    jou_br.has_part(cont_br)
                                else:
                                    jou_br = cont_br
                                    self.__add_journal_data(jou_br, cur_container_title)

                            elif cur_type == "journal-volume":
                                cur_container_type = "volume"
                                self.__add_journal_data(cont_br, cur_container_title)
                                self.__associate_issn(cont_br, crossref_json, crossref_source)
                            elif cur_type == "other":
                                cont_br.create_expression_collection()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "proceedings-article":
                                cur_container_type = "proceedings"
                                cont_br.create_proceedings()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "reference-entry":
                                cur_container_type = "reference-book"
                                cont_br.create_expression_collection()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "report":
                                cur_container_type = "report-series"
                                cont_br.create_expression_collection()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "standard":
                                cur_container_type = "standard-series"
                                cont_br.create_expression_collection()
                                cont_br.create_title(cur_container_title)

                            # If the current type is in any of the ISSN or ISBN list
                            # add the identifier to the resource
                            if cur_container_type is not None:
                                if cur_container_type in self.issn_types:
                                    self.__associate_issn(cont_br, crossref_json, crossref_source)
                                if cur_container_type in self.isbn_types:
                                    self.__associate_isbn(cont_br, crossref_json, crossref_source)

                    if cont_br is not None:
                        cont_br.has_part(cur_br)
                elif key == "type":
                    cur_type = crossref_json[key]
                    if cur_type == "book":
                        cur_br.create_book()
                    elif cur_type == "book-chapter":
                        cur_br.create_book_chapter()
                    elif cur_type == "book-part":
                        cur_br.create_book_part()
                    elif cur_type == "book-section":
                        cur_br.create_book_section()
                    elif cur_type == "book-series":
                        cur_br.create_book_series()
                    elif cur_type == "book-set":
                        cur_br.create_book_set()
                    elif cur_type == "book-track":
                        cur_br.create_book_track()
                    elif cur_type == "component":
                        cur_br.create_component()
                    elif cur_type == "dataset":
                        cur_br.create_dataset()
                    elif cur_type == "dissertation":
                        cur_br.create_dissertation()
                    elif cur_type == "edited-book":
                        cur_br.create_edited_book()
                    elif cur_type == "journal":
                        self.__add_journal_data(cur_br, cur_title)
                    elif cur_type == "journal-article":
                        cur_br.create_journal_article()
                    elif cur_type == "journal-issue":
                        cur_br.create_issue()
                    elif cur_type == "journal-volume":
                        cur_br.create_volume()
                    elif cur_type == "monograph":
                        cur_br.create_monograph()
                    elif cur_type == "other":
                        cur_br.create_other()
                    elif cur_type == "proceedings":
                        cur_br.create_proceedings()
                    elif cur_type == "proceedings-article":
                        cur_br.create_proceedings_article()
                    elif cur_type == "reference-book":
                        cur_br.create_reference_book()
                    elif cur_type == "reference-entry":
                        cur_br.create_reference_entry()
                    elif cur_type == "report":
                        cur_br.create_report()
                    elif cur_type == "report-series":
                        cur_br.create_report_series()
                    elif cur_type == "standard":
                        cur_br.create_standard()
                    elif cur_type == "standard-series":
                        cur_br.create_standard_series()

                    # If the current type is in any of the ISSN or ISBN list
                    # add the identifier to the resource
                    if cur_type in self.issn_types:
                        self.__associate_issn(cur_br, crossref_json, crossref_source)
                    if cur_type in self.isbn_types:
                        self.__associate_isbn(cur_br, crossref_json, crossref_source)

        return cur_br

Exemplo n.º 9

0

Exibir arquivo

Arquivo: crossref_processor.py Projeto: essepuntato/opencitations

    def process_references(self):
        """Resolve each item of ``self.entries`` to a bibliographic resource.

        For every item the lookup is attempted, in order, through: the DOI,
        PMID, PMCID and URL supplied with the item, then ``process_entry`` on
        the ``bibentry`` text (unless ``process_entry`` is set to something
        other than "true"), then the DOI and URL extracted from that text
        (each gated by ``self.get_bib_entry_doi`` / ``self.get_bib_entry_url``).
        When nothing is found a brand new resource is created.

        Returns:
            The list of resources, one per item, or ``None`` as soon as the
            error reporter is not empty after handling an item.
        """
        resources = []

        for item in self.entries:
            self.repok.new_article()
            self.reperr.new_article()

            bib_text = dg(item, ["bibentry"])
            flag = dg(item, ["process_entry"])
            # The textual search is enabled by default; an explicit flag must
            # spell "true" (case/whitespace insensitive) to keep it enabled
            use_search = True if flag is None else flag.lower().strip() == "true"
            doi_in = dg(item, ["doi"])
            pmid_in = dg(item, ["pmid"])
            pmcid_in = dg(item, ["pmcid"])
            url_in = dg(item, ["url"])

            # This is useful if additional data are stored in the field URL, e.g.:
            # 'http://pub.stat.ee/px/web.2001/dialog/statfile1.asp. Accessed on 2009'
            if url_in is not None:
                url_in = FormatProcessor.extract_url(url_in)

            doi_found = FormatProcessor.extract_doi(bib_text)
            doi_found_tried = False
            url_found = FormatProcessor.extract_url(bib_text)

            resource = None

            # 1. Identifiers explicitly provided with the item
            if doi_in is not None:
                resource = self.process_doi(doi_in, self.curator, self.source_provider)
                if resource is not None:
                    self.repok.add_sentence(self.message(
                        "The entity has been found by means of the "
                        "DOI provided as input by %s." % self.source_provider,
                        "DOI", doi_in))

            if resource is None and pmid_in is not None:
                resource = self.process_pmid(pmid_in)
                if resource is not None:
                    self.repok.add_sentence(self.message(
                        "The entity has been found by means of the "
                        "PMID provided as input by %s." % self.source_provider,
                        "PMID", pmid_in))

            if resource is None and pmcid_in is not None:
                resource = self.process_pmcid(pmcid_in)
                if resource is not None:
                    self.repok.add_sentence(self.message(
                        "The entity has been found by means of the "
                        "PMCID provided as input by %s." % self.source_provider,
                        "PMCID", pmcid_in))

            if resource is None and url_in is not None:
                resource = self.process_url(url_in)
                if resource is not None:
                    self.repok.add_sentence(self.message(
                        "The entity has been found by means of the "
                        "URL provided as input by %s." % self.source_provider,
                        "URL", url_in))

            # 2. The bibliographic entry text itself
            if resource is None and bib_text is not None:
                if use_search:
                    resource = self.process_entry(bib_text)

                if resource is not None:
                    self.repok.add_sentence(self.message(
                        "The entity has been retrieved by using the search API.",
                        "entry", bib_text))
                else:
                    if self.get_bib_entry_doi and doi_found is not None:
                        # Remember the attempt even if the lookup fails, so that
                        # the extracted DOI is attached to the resource later on
                        doi_found_tried = True
                        resource = self.process_doi(
                            doi_found, self.name, self.source_provider)
                        if resource is not None:
                            self.repok.add_sentence(self.message(
                                "The entity for '%s' has been found by means of the "
                                "DOI extracted from it." % bib_text,
                                "DOI", doi_found))

                    if resource is None and self.get_bib_entry_url and url_found is not None:
                        known = self.rf.retrieve_from_url(url_found)
                        if known is not None:
                            resource = self.g_set.add_br(
                                self.name, self.source_provider, self.source, known)
                            self.repok.add_sentence(self.message(
                                "The entity for '%s' has been found by means of the "
                                "URL extracted from it." % bib_text,
                                "URL", url_found))

            # If errors have been raised, stop the whole process (by returning None)
            if not self.reperr.is_empty():
                return None

            # 3. Nothing matched: create a new resource anyway
            if resource is None:
                resource = self.g_set.add_br(self.name)
                self.rf.update_graph_set(self.g_set)
                self.repok.add_sentence(self.message(
                    "The entity has been created even if no results have "
                    "been returned by the API.",
                    "entry", bib_text))

            # Add the DOI, the PMID, the PMCID and the URL if they have been provided
            # by the curator (if they are not already associated to the resource)
            self.__add_doi(resource, doi_in, self.curator)
            self.__add_pmid(resource, pmid_in)
            self.__add_pmcid(resource, pmcid_in)
            self.__add_url(resource, url_in)

            # Add the DOI extracted from the entry only when it was actually used
            # for a lookup, i.e. the resource was not found through an explicitly
            # provided identifier or through the search on the entry text
            if self.get_bib_entry_doi and doi_found_tried:
                self.__add_doi(resource, doi_found, self.name)

            # Add any URL extracted from the entry if it is not already included
            if self.get_bib_entry_url:
                self.__add_url(resource, url_found)

            resources.append(resource)
            self.rf.update_graph_set(self.g_set)

        # If the process comes here, then everything worked correctly
        return resources

Exemplo n.º 10

0

Exibir arquivo

Arquivo: crossref_processor.py Projeto: patmha/CiTOCrawler

    def process_crossref_json(self,
                              crossref_json,
                              crossref_source,
                              doi_curator=None,
                              doi_source_provider=None,
                              doi_source=None):
        # Check if the found bibliographic resource already exist either
        # in the triplestore or in the current graph set.
        self.rf.update_graph_set(self.g_set)
        retrieved_resource = self.rf.retrieve(
            self.__get_ids_for_type(crossref_json))

        if retrieved_resource is not None:
            cur_br = self.g_set.add_br(self.name, self.id, crossref_source,
                                       retrieved_resource)
        else:
            cur_br = self.g_set.add_br(self.name, self.id, crossref_source)
            for key in crossref_json:
                if key == "title":
                    cur_title = self.__create_title_from_list(
                        crossref_json[key])
                    cur_br.create_title(cur_title)
                elif key == "subtitle":
                    cur_br.create_subtitle(
                        self.__create_title_from_list(crossref_json[key]))
                elif key == "author":
                    # Get all ORCID of the authors (if any)
                    all_authors = crossref_json["author"]
                    all_family_names = dg(all_authors, ["family"])
                    author_orcid = []
                    if "DOI" in crossref_json and all_family_names:
                        doi_string = crossref_json["DOI"]
                        author_orcid = self.of.get_orcid_ids(
                            doi_string, all_family_names)

                    # Used to create ordered list of authors/editors of bibliographic entities
                    prev_role = None

                    # Analyse all authors
                    for author in crossref_json["author"]:
                        given_name_string = None
                        if "given" in author:
                            given_name_string = author["given"]
                        family_name_string = None
                        if "family" in author:
                            family_name_string = author["family"]

                        cur_orcid_record = None
                        if family_name_string:
                            # Get all the ORCID/author records retrieved that share the
                            # family name into consideration
                            orcid_with_such_family = dgt(
                                author_orcid, "family", family_name_string)
                            author_with_such_family = dgt(
                                all_authors, "family", family_name_string)
                            if len(orcid_with_such_family) == 1 and len(
                                    author_with_such_family) == 1:
                                cur_orcid_record = orcid_with_such_family[0]
                            elif given_name_string is not None and \
                                 len(orcid_with_such_family) >= 1 and len(author_with_such_family) >= 1:
                                # From the previous lists of ORCID/author record, get the list
                                # of all the given name defined
                                orcid_given_with_such_family = dg(
                                    orcid_with_such_family, ["given"])
                                author_given_with_such_family = dg(
                                    author_with_such_family, ["given"])

                                # Get the indexes of the previous list that best match with the
                                # given name of the author we are considering
                                closest_orcid_matches_idx = \
                                    slc(orcid_given_with_such_family, given_name_string)
                                closest_author_matches_idx = \
                                    slc(author_given_with_such_family, given_name_string)
                                if len(closest_orcid_matches_idx) == 1 and \
                                   len(closest_author_matches_idx) == 1:
                                    closest_author_orcid_matches_idx = slc(
                                        author_given_with_such_family,
                                        orcid_given_with_such_family[0])
                                    if closest_author_orcid_matches_idx == closest_author_matches_idx:
                                        cur_orcid_record = \
                                            orcid_with_such_family[closest_orcid_matches_idx[0]]

                        # An ORCID has been found to match with such author record, and we try to
                        # see if such orcid (and thus, the author) has been already added in the
                        # store
                        retrieved_agent = None
                        if cur_orcid_record is not None:
                            retrieved_agent = self.rf.retrieve_from_orcid(
                                cur_orcid_record["orcid"])

                        # If the resource does not exist already, create a new one
                        if retrieved_agent is None:
                            cur_agent = self.g_set.add_ra(
                                self.name, self.id, crossref_source)
                            if cur_orcid_record is not None:
                                cur_agent_orcid = \
                                    self.g_set.add_id(self.of.name, self.of.id, self.of.get_last_query())
                                cur_agent_orcid.create_orcid(
                                    cur_orcid_record["orcid"])
                                cur_agent.has_id(cur_agent_orcid)

                            if given_name_string is not None:
                                cur_agent.create_given_name(given_name_string)
                            elif cur_orcid_record is not None and "given" in cur_orcid_record:
                                cur_agent.create_given_name(
                                    cur_orcid_record["given"])

                            if family_name_string is not None:
                                cur_agent.create_family_name(
                                    family_name_string)
                            elif cur_orcid_record is not None and "family" in cur_orcid_record:
                                cur_agent.create_family_name(
                                    cur_orcid_record["family"])
                        else:
                            cur_agent = self.g_set.add_ra(
                                self.name, self.id, crossref_source,
                                retrieved_agent)

                        # Add statements related to the author resource (that could or could not
                        # exist in the store)
                        cur_role = self.g_set.add_ar(self.name, self.id,
                                                     crossref_source)
                        if crossref_json["type"] == "edited-book":
                            cur_role.create_editor(cur_br)
                        else:
                            cur_role.create_author(cur_br)
                        cur_agent.has_role(cur_role)

                        if prev_role is not None:
                            cur_role.follows(prev_role)

                        prev_role = cur_role

                elif key == "publisher":
                    cur_agent = None

                    # Check if the publishier already exists
                    if "member" in crossref_json and crossref_json[
                            "member"] is not None:
                        cur_member_url = crossref_json["member"]
                        retrieved_agent = self.rf.retrieve_from_url(
                            cur_member_url)
                        if retrieved_agent is not None:
                            cur_agent = self.g_set.add_ra(
                                self.name, self.id, crossref_source,
                                retrieved_agent)
                    else:
                        cur_member_url = None

                    # If the publisher is not already defined in the knowledge base,
                    # create a new one.
                    if cur_agent is None:
                        cur_agent = self.g_set.add_ra(self.name, self.id,
                                                      crossref_source)
                        cur_agent.create_name(crossref_json[key])

                        if cur_member_url is not None:
                            cur_agent_id = self.g_set.add_id(
                                self.name, self.id, crossref_source)
                            cur_agent_id.create_url(crossref_json["member"])
                            cur_agent.has_id(cur_agent_id)

                    cur_role = self.g_set.add_ar(self.name, self.id,
                                                 crossref_source)
                    cur_role.create_publisher(cur_br)
                    cur_agent.has_role(cur_role)
                elif key == "DOI":
                    cur_id = self.g_set.add_id(doi_curator,
                                               doi_source_provider, doi_source)
                    if cur_id.create_doi(crossref_json[key]):
                        cur_br.has_id(cur_id)
                elif key == "issued":
                    cur_br.create_pub_year(
                        crossref_json[key]["date-parts"][0][0])
                elif key == "URL":
                    cur_id = self.g_set.add_id(self.name, self.id,
                                               crossref_source)
                    if cur_id.create_url(crossref_json[key]):
                        cur_br.has_id(cur_id)
                elif key == "page":
                    cur_page = crossref_json[key]
                    cur_re = self.g_set.add_re(self.name, self.id,
                                               crossref_source)
                    if cur_re.create_starting_page(cur_page):
                        cur_re.create_ending_page(cur_page)
                        cur_br.has_format(cur_re)
                elif key == "container-title":
                    retrieved_container = None
                    cont_br = None
                    cur_type = crossref_json["type"]

                    container_ids = self.__get_ids_for_container(crossref_json)
                    cur_issue_id = crossref_json[
                        "issue"] if "issue" in crossref_json else None
                    cur_volume_id = crossref_json[
                        "volume"] if "volume" in crossref_json else None
                    if cur_type == "journal-article":
                        if cur_issue_id is None:
                            if cur_volume_id is None:
                                retrieved_container = self.rf.retrieve(
                                    container_ids)
                            else:
                                retrieved_container = \
                                    self.rf.retrieve_volume_from_journal(container_ids, cur_volume_id)
                        else:
                            retrieved_container = self.rf.retrieve_issue_from_journal(
                                container_ids, cur_issue_id, cur_volume_id)
                    elif cur_type == "journal-issue":
                        if cur_volume_id is None:
                            retrieved_container = self.rf.retrieve(
                                container_ids)
                        else:
                            retrieved_container = \
                                self.rf.retrieve_volume_from_journal(container_ids, cur_volume_id)
                    else:
                        retrieved_container = self.rf.retrieve(container_ids)

                    if retrieved_container is not None:
                        cont_br = self.g_set.add_br(self.name, self.id,
                                                    crossref_source,
                                                    retrieved_container)
                    else:
                        cur_container_title = None
                        if len(crossref_json[key]) > 0:
                            cur_container_title = self.__create_title_from_list(
                                crossref_json[key])

                        if cur_container_title is not None:
                            cur_container_type = None
                            cont_br = self.g_set.add_br(
                                self.name, self.id, crossref_source)

                            if cur_type == "book-chapter":
                                cur_container_type = "book"
                                cont_br.create_book()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "book-part":
                                cur_container_type = "book"
                                cont_br.create_book()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "book-section":
                                cur_container_type = "book"
                                cont_br.create_book()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "book-track":
                                cur_container_type = "book-section"
                                cont_book = self.g_set.add_br(
                                    self.name, self.id, crossref_source)
                                cont_book.create_book()
                                cont_book.create_title(cur_container_title)
                                self.__associate_isbn(cont_book, crossref_json,
                                                      crossref_source)
                                cont_book.has_part(cont_br)
                                cont_br.create_book_section()
                            elif cur_type == "component":
                                cur_container_type = "component"
                                cont_br.create_expression_collection()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "dataset":
                                cur_container_type = "dataset"
                                cont_br.create_expression_collection()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "journal-article":
                                if "issue" not in crossref_json and "volume" not in crossref_json:
                                    cur_container_type = "journal"
                                    jou_br = cont_br
                                    self.__add_journal_data(
                                        jou_br, cur_container_title)
                                else:
                                    # If we have an issue or a volume specified, the journal may have
                                    # been already added to the corpus in the past. Thus, check it
                                    # before creating a new object for that journal
                                    retrieved_journal = self.rf.retrieve(
                                        container_ids)
                                    if retrieved_journal is None:
                                        jou_br = self.g_set.add_br(
                                            self.name, self.id,
                                            crossref_source)
                                        self.__associate_issn(
                                            jou_br, crossref_json,
                                            crossref_source)
                                        self.__add_journal_data(
                                            jou_br, cur_container_title)
                                    else:
                                        jou_br = self.g_set.add_br(
                                            self.name, self.id,
                                            crossref_source, retrieved_journal)

                                    if "issue" in crossref_json:
                                        cur_container_type = "issue"
                                        cont_br.create_issue()
                                        cont_br.create_number(
                                            crossref_json["issue"])
                                        if "volume" not in crossref_json:
                                            jou_br.has_part(cont_br)

                                    if "volume" in crossref_json:
                                        cur_volume_id = crossref_json["volume"]
                                        if "issue" in crossref_json:
                                            # If we have an issue specified, the volume may have
                                            # been already added to the corpus in the past. Thus, check it
                                            # before creating a new object for that volume
                                            retrieved_volume = self.rf.retrieve_volume_from_journal(
                                                container_ids, cur_volume_id)
                                            if retrieved_volume is None:
                                                vol_br = self.g_set.add_br(
                                                    self.name, self.id,
                                                    crossref_source)
                                                self.__add_volume_data(
                                                    vol_br, cur_volume_id)
                                                jou_br.has_part(vol_br)
                                            else:
                                                vol_br = self.g_set.add_br(
                                                    self.name, self.id,
                                                    crossref_source,
                                                    retrieved_volume)
                                            vol_br.has_part(cont_br)
                                        else:
                                            cur_container_type = "volume"
                                            vol_br = cont_br
                                            self.__add_volume_data(
                                                vol_br, cur_volume_id)
                                            jou_br.has_part(vol_br)
                            elif cur_type == "journal-issue":
                                cur_container_type = "journal"
                                if "volume" in crossref_json:
                                    cur_container_type = "volume"
                                    self.__add_volume_data(
                                        cont_br, crossref_json["volume"])
                                    # If we have a volume specified, the journal may have
                                    # been already added to the corpus in the past. Thus, check it
                                    # before creating a new object for that journal
                                    retrieved_journal = self.rf.retrieve(
                                        container_ids)
                                    if retrieved_journal is None:
                                        jou_br = self.g_set.add_br(
                                            self.name, self.id,
                                            crossref_source)
                                        self.__associate_issn(
                                            jou_br, crossref_json,
                                            crossref_source)
                                        self.__add_journal_data(
                                            jou_br, cur_container_title)
                                    else:
                                        jou_br = self.g_set.add_br(
                                            self.name, self.id,
                                            crossref_source, retrieved_journal)

                                    jou_br.has_part(cont_br)
                                else:
                                    jou_br = cont_br
                                    self.__add_journal_data(
                                        jou_br, cur_container_title)

                            elif cur_type == "journal-volume":
                                cur_container_type = "volume"
                                self.__add_journal_data(
                                    cont_br, cur_container_title)
                                self.__associate_issn(cont_br, crossref_json,
                                                      crossref_source)
                            elif cur_type == "other":
                                cont_br.create_expression_collection()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "proceedings-article":
                                cur_container_type = "proceedings"
                                cont_br.create_proceedings()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "reference-entry":
                                cur_container_type = "reference-book"
                                cont_br.create_expression_collection()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "report":
                                cur_container_type = "report-series"
                                cont_br.create_expression_collection()
                                cont_br.create_title(cur_container_title)
                            elif cur_type == "standard":
                                cur_container_type = "standard-series"
                                cont_br.create_expression_collection()
                                cont_br.create_title(cur_container_title)

                            # If the current type is in any of the ISSN or ISBN list
                            # add the identifier to the resource
                            if cur_container_type is not None:
                                if cur_container_type in self.issn_types:
                                    self.__associate_issn(
                                        cont_br, crossref_json,
                                        crossref_source)
                                if cur_container_type in self.isbn_types:
                                    self.__associate_isbn(
                                        cont_br, crossref_json,
                                        crossref_source)

                    if cont_br is not None:
                        cont_br.has_part(cur_br)
                elif key == "type":
                    cur_type = crossref_json[key]
                    if cur_type == "book":
                        cur_br.create_book()
                    elif cur_type == "book-chapter":
                        cur_br.create_book_chapter()
                    elif cur_type == "book-part":
                        cur_br.create_book_part()
                    elif cur_type == "book-section":
                        cur_br.create_book_section()
                    elif cur_type == "book-series":
                        cur_br.create_book_series()
                    elif cur_type == "book-set":
                        cur_br.create_book_set()
                    elif cur_type == "book-track":
                        cur_br.create_book_track()
                    elif cur_type == "component":
                        cur_br.create_component()
                    elif cur_type == "dataset":
                        cur_br.create_dataset()
                    elif cur_type == "dissertation":
                        cur_br.create_dissertation()
                    elif cur_type == "edited-book":
                        cur_br.create_edited_book()
                    elif cur_type == "journal":
                        self.__add_journal_data(cur_br, cur_title)
                    elif cur_type == "journal-article":
                        cur_br.create_journal_article()
                    elif cur_type == "journal-issue":
                        cur_br.create_issue()
                    elif cur_type == "journal-volume":
                        cur_br.create_volume()
                    elif cur_type == "monograph":
                        cur_br.create_monograph()
                    elif cur_type == "other":
                        cur_br.create_other()
                    elif cur_type == "proceedings":
                        cur_br.create_proceedings()
                    elif cur_type == "proceedings-article":
                        cur_br.create_proceedings_article()
                    elif cur_type == "reference-book":
                        cur_br.create_reference_book()
                    elif cur_type == "reference-entry":
                        cur_br.create_reference_entry()
                    elif cur_type == "report":
                        cur_br.create_report()
                    elif cur_type == "report-series":
                        cur_br.create_report_series()
                    elif cur_type == "standard":
                        cur_br.create_standard()
                    elif cur_type == "standard-series":
                        cur_br.create_standard_series()

                    # If the current type is in any of the ISSN or ISBN list
                    # add the identifier to the resource
                    if cur_type in self.issn_types:
                        self.__associate_issn(cur_br, crossref_json,
                                              crossref_source)
                    if cur_type in self.isbn_types:
                        self.__associate_isbn(cur_br, crossref_json,
                                              crossref_source)

        return cur_br

Exemplo n.º 11

0

Exibir arquivo

Arquivo: crossref_processor.py Projeto: patmha/CiTOCrawler

    def process_references(self):
        """Resolve every item of ``self.entries`` to a bibliographic resource.

        For each item the lookup strategies are tried in a fixed order, and
        the first one that yields a resource wins:

        1. the DOI, PMID, PMCID and URL given explicitly in the item;
        2. a search on the textual bibliographic entry (``bibentry``), which
           is skipped when the item's ``process_entry`` field is present and
           is not the string "true" (case and surrounding spaces ignored);
        3. the DOI extracted from the entry text, when
           ``self.get_bib_entry_doi`` is set;
        4. an already known resource matching the URL extracted from the
           entry text, when ``self.get_bib_entry_url`` is set.

        When nothing is found, a new resource is created. The provided (and,
        where applicable, extracted) identifiers are then attached to it.

        :return: the list of resources, one per item and in the same order,
            or ``None`` as soon as ``self.reperr`` is not empty after an item
            has been handled.
        """
        resources = []

        for item in self.entries:
            self.repok.new_article()
            self.reperr.new_article()

            bib_text = dg(item, ["bibentry"])

            # A missing flag means that the textual entry may be searched.
            flag = dg(item, ["process_entry"])
            search_allowed = flag is None or flag.lower().strip() == "true"

            given_doi = dg(item, ["doi"])
            given_pmid = dg(item, ["pmid"])
            given_pmcid = dg(item, ["pmcid"])
            given_url = dg(item, ["url"])

            # The URL field may carry trailing text besides the URL, e.g.:
            # 'http://pub.stat.ee/px/web.2001/dialog/statfile1.asp. Accessed on 2009'
            if given_url is not None:
                given_url = FormatProcessor.extract_url(given_url)

            doi_in_text = FormatProcessor.extract_doi(bib_text)
            doi_in_text_tried = False
            url_in_text = FormatProcessor.extract_url(bib_text)

            found = None

            # --- Step 1: identifiers given explicitly in the item ---
            if given_doi is not None:
                found = self.process_doi(
                    given_doi, self.curator, self.source_provider)
                if found is not None:
                    sentence = self.message(
                        "The entity has been found by means of the "
                        "DOI provided as input by %s." % self.source_provider,
                        "DOI", given_doi)
                    self.repok.add_sentence(sentence)

            if found is None and given_pmid is not None:
                found = self.process_pmid(given_pmid)
                if found is not None:
                    sentence = self.message(
                        "The entity has been found by means of the "
                        "PMID provided as input by %s." % self.source_provider,
                        "PMID", given_pmid)
                    self.repok.add_sentence(sentence)

            if found is None and given_pmcid is not None:
                found = self.process_pmcid(given_pmcid)
                if found is not None:
                    sentence = self.message(
                        "The entity has been found by means of the "
                        "PMCID provided as input by %s." % self.source_provider,
                        "PMCID", given_pmcid)
                    self.repok.add_sentence(sentence)

            if found is None and given_url is not None:
                found = self.process_url(given_url)
                if found is not None:
                    sentence = self.message(
                        "The entity has been found by means of the "
                        "URL provided as input by %s." % self.source_provider,
                        "URL", given_url)
                    self.repok.add_sentence(sentence)

            # --- Steps 2-4: fall back to the textual entry ---
            if found is None and bib_text is not None:
                if search_allowed:
                    found = self.process_entry(bib_text)

                if found is not None:
                    sentence = self.message(
                        "The entity has been retrieved by using the search API.",
                        "entry", bib_text)
                    self.repok.add_sentence(sentence)
                else:
                    if self.get_bib_entry_doi and doi_in_text is not None:
                        # Remember the attempt even if the lookup fails: the
                        # extracted DOI is attached to the resource later on.
                        doi_in_text_tried = True
                        found = self.process_doi(
                            doi_in_text, self.name, self.source_provider)
                        if found is not None:
                            sentence = self.message(
                                "The entity for '%s' has been found by means of the "
                                "DOI extracted from it." % bib_text,
                                "DOI", doi_in_text)
                            self.repok.add_sentence(sentence)

                    if (found is None and self.get_bib_entry_url
                            and url_in_text is not None):
                        known_res = self.rf.retrieve_from_url(url_in_text)
                        if known_res is not None:
                            found = self.g_set.add_br(
                                self.name, self.source_provider,
                                self.source, known_res)
                            sentence = self.message(
                                "The entity for '%s' has been found by means of the "
                                "URL extracted from it." % bib_text,
                                "URL", url_in_text)
                            self.repok.add_sentence(sentence)

            # Any error reported so far stops the whole process
            if not self.reperr.is_empty():
                return None

            # No strategy succeeded: create a brand new resource
            if found is None:
                found = self.g_set.add_br(self.name)
                self.rf.update_graph_set(self.g_set)
                sentence = self.message(
                    "The entity has been created even if no results have "
                    "been returned by the API.", "entry", bib_text)
                self.repok.add_sentence(sentence)

            # Attach the identifiers provided in the item
            self.__add_doi(found, given_doi, self.curator)
            self.__add_pmid(found, given_pmid)
            self.__add_pmcid(found, given_pmcid)
            self.__add_url(found, given_url)

            # Attach the DOI extracted from the entry text, but only when the
            # lookup by that DOI was actually attempted above.
            if self.get_bib_entry_doi and doi_in_text_tried:
                self.__add_doi(found, doi_in_text, self.name)

            # Attach the URL extracted from the entry text
            if self.get_bib_entry_url:
                self.__add_url(found, url_in_text)

            resources.append(found)
            self.rf.update_graph_set(self.g_set)

        # Reaching this point means that no item reported errors
        return resources