Example #1
from grobid_tei_xml import parse_document_xml


def test_grobid_parse() -> None:
    """
    Equivalent to test_grobid_parse_legacy(), but using the GrobidDocument type directly
    """

    with open("tests/files/example_grobid.tei.xml", "r") as f:
        blob = f.read()

    doc = parse_document_xml(blob)

    assert (
        doc.header.title ==
        "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"
    )

    assert doc.citations is not None
    ref = [c for c in doc.citations if c.id == "b12"][0]
    assert ref.authors[0].given_name == "K"
    assert ref.authors[0].full_name == "K Tasa"
    assert ref.authors[0].surname == "Tasa"
    assert ref.journal == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    assert ref.date == "1996"
    assert ref.pages == "206-225"
    assert ref.volume == "8"
    assert (
        ref.unstructured ==
        "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )
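
For reference, a minimal sketch of the grobid_tei_xml API exercised by this test; the attribute names come straight from the assertions above, while the file path is illustrative:

from grobid_tei_xml import parse_document_xml

# any GROBID TEI-XML document works here; the path is hypothetical
with open("document.tei.xml", "r") as f:
    doc = parse_document_xml(f.read())

print(doc.header.title)
if doc.citations:
    for cite in doc.citations:
        print(cite.id, cite.title, cite.date)
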
Example #2
from grobid_tei_xml import parse_document_xml


def test_grobid_parse_legacy() -> None:
    """
    This function formerly tested the grobid2json file in this project. Now it
    tests backwards-compatibility of the grobid_tei_xml library.
    """

    with open("tests/files/example_grobid.tei.xml", "r") as f:
        blob = f.read()

    doc = parse_document_xml(blob)
    obj = doc.to_legacy_dict()

    assert (
        obj["title"] ==
        "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"
    )

    ref = [c for c in obj["citations"] if c["id"] == "b12"][0]
    assert ref["authors"][0] == {
        "given_name": "K",
        "name": "K Tasa",
        "surname": "Tasa"
    }
    assert ref["journal"] == "Quality Management in Health Care"
    assert ref["title"] == "Using patient feedback for quality improvement"
    assert ref["date"] == "1996"
    assert ref["pages"] == "206-225"
    assert ref["volume"] == "8"
    assert (
        ref["unstructured"] ==
        "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )
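
One detail worth noting: the legacy dict renames some fields. As the two tests above show, an author's full name is full_name on the typed dataclass but "name" in the to_legacy_dict() output. A minimal sketch of the correspondence, reusing the same test fixture:

from grobid_tei_xml import parse_document_xml

with open("tests/files/example_grobid.tei.xml", "r") as f:
    doc = parse_document_xml(f.read())
legacy = doc.to_legacy_dict()

assert doc.citations is not None
ref = [c for c in doc.citations if c.id == "b12"][0]
legacy_ref = [c for c in legacy["citations"] if c["id"] == "b12"][0]
# 'full_name' (typed access) corresponds to 'name' (legacy dict)
assert ref.authors[0].full_name == legacy_ref["authors"][0]["name"]
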
Example #3
import xml.etree.ElementTree
from typing import Any, Dict, Optional

from grobid_tei_xml import parse_document_xml


# method on a GrobidClient class (see Example #5)
def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    if result["status"] != "success":
        return None
    try:
        tei_doc = parse_document_xml(result["tei_xml"])
    except xml.etree.ElementTree.ParseError as pe:
        result["status"] = "bad-grobid-xml"
        return dict(error_msg=str(pe)[:1000])
    tei_doc.remove_encumbered()
    tei_json = tei_doc.to_legacy_dict()
    meta = dict()
    biblio = dict()
    for k in (
        "title",
        "authors",
        "journal",
        "date",
        "doi",
    ):
        if tei_json.get(k):
            biblio[k] = tei_json[k]
    meta["biblio"] = biblio
    for k in ("grobid_version", "grobid_timestamp", "fatcat_release", "language_code"):
        if tei_json.get(k):
            meta[k] = tei_json[k]
    return meta
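
A hedged sketch of how this method behaves on the three input shapes it distinguishes; the result dicts are illustrative, and GrobidClient is the class this method belongs to (see Example #5):

client = GrobidClient()

# non-success records are skipped entirely
assert client.metadata({"status": "error"}) is None

# unparseable TEI flags the record in place and returns the error message
bad = {"status": "success", "tei_xml": "this is not XML"}
out = client.metadata(bad)
assert bad["status"] == "bad-grobid-xml"
assert out is not None and "error_msg" in out

# valid TEI yields {"biblio": {...}} plus grobid_version etc. when present
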
Example #4
import json

from grobid_tei_xml import parse_document_xml


def test_small_xml():
    """
    This used to be a test of grobid2json; now it is a compatibility test for
    the to_legacy_dict() feature of grobid_tei_xml.
    """

    with open("tests/files/small.xml", "r") as f:
        tei_xml = f.read()
    with open("tests/files/small.json", "r") as f:
        json_form = json.loads(f.read())

    tei_doc = parse_document_xml(tei_xml)
    assert tei_doc.to_legacy_dict() == json_form
Example #5
def run_transform(args):
    """Read GROBID result JSON lines and print transformed JSON, one document per line."""
    grobid_client = GrobidClient()
    for line in args.json_file:
        if not line.strip():
            continue
        line = json.loads(line)
        if args.metadata_only:
            out = grobid_client.metadata(line)
        else:
            tei_doc = parse_document_xml(line["tei_xml"])
            out = tei_doc.to_legacy_dict()
        if out:
            if "source" in line:
                out["source"] = line["source"]
            print(json.dumps(out))
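
Judging by the fields read above, each input line is a JSON record shaped roughly like this (values illustrative): "tei_xml" feeds the parser, "status" is checked by GrobidClient.metadata() in the metadata-only path, and "source" is copied onto the output if present:

import json

record = {
    "status": "success",
    "tei_xml": "<TEI>...</TEI>",
    "source": "ingest-request",  # optional passthrough field
}
line = json.dumps(record)
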
Example #6
def run(mode="hbase"):
    for line in sys.stdin:
        if mode == "hbase":
            sha1hex, tei_xml = parse_hbase(line)
        elif mode == "pg":
            sha1hex, tei_xml = parse_pg(line)
        else:
            raise NotImplementedError("parse mode: {}".format(mode))

        tei_doc = parse_document_xml(tei_xml)
        tei_doc.remove_encumbered()
        obj = tei_doc.to_legacy_dict()

        affiliations = []
        for author in obj["authors"]:
            if author.get("affiliation"):
                affiliations.append(author["affiliation"])
        if affiliations:
            # deduplicate affiliations via a JSON round-trip (dicts are unhashable)
            affiliations = list(set([json.dumps(a) for a in affiliations]))
            affiliations = [json.loads(a) for a in affiliations]
            print("\t".join([sha1hex, json.dumps(affiliations)]))
Example #7
def test_transform_refs_grobid() -> None:

    with open("tests/files/example_grobid.tei.xml", "r") as f:
        blob = f.read()

    dummy_release = ReleaseEntity(
        ident="releasedummy22222222222222",
        work_id="workdummy22222222222222222",
        release_year=1234,
        release_stage="accepted",
        ext_ids={},
    )

    tei_doc = parse_document_xml(blob)
    refs = refs_from_grobid(dummy_release, tei_doc)

    ref = refs[12]
    assert ref.release_ident == "releasedummy22222222222222"
    assert ref.work_ident == "workdummy22222222222222222"
    assert ref.release_stage == "accepted"
    assert ref.release_year == 1234
    assert ref.ref_source == "grobid"
    assert ref.key == "b12"
    assert ref.index == 13
    assert ref.locator is None
    assert ref.biblio.contrib_raw_names is not None
    assert ref.biblio.contrib_raw_names[0] == "K Tasa"
    assert ref.biblio.container_name == "Quality Management in Health Care"
    assert ref.biblio.title == "Using patient feedback for quality improvement"
    assert ref.biblio.year == 1996
    assert ref.biblio.pages == "206-225"
    assert ref.biblio.volume == "8"
    assert (
        ref.biblio.unstructured
        == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )
Example #8
import pytest
import xml.etree.ElementTree

from grobid_tei_xml import parse_document_xml


def test_invalid_xml():
    """Malformed XML raises an ElementTree ParseError; well-formed XML that is not TEI raises ValueError."""
    with pytest.raises(xml.etree.ElementTree.ParseError):
        parse_document_xml("this is not XML")
    with pytest.raises(ValueError):
        parse_document_xml("<xml></xml>")
Example #9
def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]:
    """
    Current behavior is to return *both* fatcat refs and GROBID refs if
    available.
    """

    if heavy.doc_type != DocType.work:
        return []

    assert heavy.biblio_release_ident
    primary_release = [
        r for r in heavy.releases if r.ident == heavy.biblio_release_ident
    ][0]

    refs: List[RefStructured] = []

    fatcat_refs: List[RefStructured] = []
    if primary_release.refs:
        fatcat_refs = refs_from_release_refs(primary_release)
    else:
        # if there are no refs for the "primary" release, take any other refs we can find
        for release in heavy.releases:
            if release.refs:
                fatcat_refs = refs_from_release_refs(release)
                break

    fulltext_refs: List[RefStructured] = []
    # TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+
    if (
        heavy.grobid_fulltext
        and heavy.grobid_fulltext.get("file_ident") != "gbbvrg2tpzan5hl3qcsfzh4vfq"
    ):
        fulltext_release = [
            r
            for r in heavy.releases
            if r.ident == heavy.grobid_fulltext["release_ident"]
        ][0]
        tei_doc = parse_document_xml(heavy.grobid_fulltext["tei_xml"])
        fulltext_refs = refs_from_grobid(fulltext_release, tei_doc)

    crossref_refs: List[RefStructured] = []
    if heavy.crossref:
        crossref_release = [
            r for r in heavy.releases if r.ident == heavy.crossref["release_ident"]
        ][0]
        crossref_refs = refs_from_crossref(crossref_release, heavy.crossref)

    # TODO: better logic for prioritizing/combining references from multiple sources?
    # TODO: test coverage
    if (
        fatcat_refs
        and crossref_refs
        and all([r.ref_source in ["crossref", "fatcat-crossref"] for r in fatcat_refs])
    ):
        # prioritize recent crossref refs over old fatcat-imported-from-crossref refs (?)
        fatcat_refs = []
    elif (
        fatcat_refs
        and fulltext_refs
        and all([r.ref_source in ["grobid", "fatcat-grobid"] for r in fatcat_refs])
    ):
        # prioritize newer GROBID fulltext extraction (?)
        fatcat_refs = []

    refs.extend(fatcat_refs)
    refs.extend(crossref_refs)

    # include fulltext refs only if there are more of them than either the crossref or the fatcat refs
    if len(fulltext_refs) > len(fatcat_refs) and len(fulltext_refs) > len(
        crossref_refs
    ):
        refs.extend(fulltext_refs)

    # TODO: use GROBID to parse any refs which only have 'unstructured' (if they don't already come from GROBID)
    return refs
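
The prioritization rules amount to a small decision table over ref_source values. A hypothetical pure-function refactor of just that logic, using the same source labels seen above:

from typing import List

def pick_fatcat_refs(
    fatcat_refs: List["RefStructured"],
    crossref_refs: List["RefStructured"],
    fulltext_refs: List["RefStructured"],
) -> List["RefStructured"]:
    # fresher crossref refs beat fatcat refs that were themselves
    # imported from crossref
    if crossref_refs and fatcat_refs and all(
        r.ref_source in ("crossref", "fatcat-crossref") for r in fatcat_refs
    ):
        return []
    # likewise a fresh GROBID extraction beats fatcat-imported GROBID refs
    if fulltext_refs and fatcat_refs and all(
        r.ref_source in ("grobid", "fatcat-grobid") for r in fatcat_refs
    ):
        return []
    return fatcat_refs
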
Example #10
def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
    """
    Transform an IntermediateBundle into a ScholarDoc for indexing, or return
    None when a valid document key can not be constructed.
    """

    tags: List[str] = []
    work_ident: Optional[str] = None
    sim_issue: Optional[str] = None
    abstracts: List[ScholarAbstract] = []
    fulltext: Optional[ScholarFulltext] = None
    primary_release: Optional[ReleaseEntity] = None
    exclude_web_fulltext: bool = False

    ia_sim: Optional[ScholarSim] = None
    if heavy.sim_fulltext is not None:
        ia_sim = es_sim_from_sim(heavy.sim_fulltext)
        fulltext = es_fulltext_from_sim(heavy.sim_fulltext)

    if heavy.doc_type == DocType.sim_page:
        assert ia_sim is not None
        assert heavy.sim_fulltext is not None
        if not ia_sim.first_page or not ia_sim.issue_item:
            # can't create a valid key if we don't have these fields, so shouldn't index
            return None
        key = f"page_{ia_sim.issue_item}_{ia_sim.first_page}"
        sim_issue = ia_sim.issue_item
        biblio = es_biblio_from_sim(heavy.sim_fulltext)
        # fulltext extracted from heavy.sim_fulltext above
    elif heavy.doc_type == DocType.work:
        work_ident = heavy.releases[0].work_id
        key = f"work_{work_ident}"
        assert heavy.biblio_release_ident
        primary_release = [
            r for r in heavy.releases if r.ident == heavy.biblio_release_ident
        ][0]
        biblio = es_biblio_from_release(primary_release)
        biblio = biblio_metadata_hacks(biblio)
        exclude_web_fulltext = check_exclude_web(biblio)
        abstracts = es_abstracts_from_release(primary_release)

        # if no abstract from primary_release, try all the other releases
        for release in heavy.releases:
            if not abstracts:
                abstracts = es_abstracts_from_release(release)
    else:
        raise NotImplementedError(f"doc_type: {heavy.doc_type}")

    # TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+
    if (
        heavy.grobid_fulltext
        and heavy.grobid_fulltext.get("file_ident") != "gbbvrg2tpzan5hl3qcsfzh4vfq"
    ):
        fulltext_release = [
            r
            for r in heavy.releases
            if r.ident == heavy.grobid_fulltext["release_ident"]
        ][0]
        fulltext_file = [
            f
            for f in fulltext_release.files
            if f.ident == heavy.grobid_fulltext["file_ident"]
        ][0]
        try:
            tei_doc: Optional[GrobidDocument] = parse_document_xml(
                heavy.grobid_fulltext["tei_xml"]
            )
        except xml.etree.ElementTree.ParseError:
            tei_doc = None
        if tei_doc:
            if not abstracts:
                abstracts = es_abstracts_from_grobid(tei_doc)
            grobid_fulltext = es_fulltext_from_grobid(
                tei_doc, heavy.pdf_meta, fulltext_release, fulltext_file
            )
            if exclude_web_fulltext and grobid_fulltext:
                if not fulltext:
                    # include only partial fulltext object, with no access
                    fulltext = grobid_fulltext.remove_access()
            else:
                fulltext = grobid_fulltext

    if not fulltext and heavy.pdftotext_fulltext:
        fulltext_release = [
            r
            for r in heavy.releases
            if r.ident == heavy.pdftotext_fulltext["release_ident"]
        ][0]
        fulltext_file = [
            f
            for f in fulltext_release.files
            if f.ident == heavy.pdftotext_fulltext["file_ident"]
        ][0]
        pdftotext_fulltext = es_fulltext_from_pdftotext(
            heavy.pdftotext_fulltext["raw_text"],
            heavy.pdf_meta,
            fulltext_release,
            fulltext_file,
        )
        if exclude_web_fulltext and pdftotext_fulltext:
            fulltext = pdftotext_fulltext.remove_access()
        else:
            fulltext = pdftotext_fulltext

    if not fulltext and heavy.html_fulltext:
        fulltext_release = [
            r for r in heavy.releases if r.ident == heavy.html_fulltext["release_ident"]
        ][0]
        fulltext_webcapture = [
            f
            for f in fulltext_release.webcaptures
            if f.ident == heavy.html_fulltext["webcapture_ident"]
        ][0]
        html_fulltext = es_fulltext_from_html(
            heavy.html_fulltext,
            fulltext_release,
            fulltext_webcapture,
        )
        if exclude_web_fulltext and html_fulltext:
            fulltext = html_fulltext.remove_access()
        else:
            fulltext = html_fulltext

    # TODO: additional access list (eg, HTML if only PDF currently)
    access_dict = dict()
    if fulltext and fulltext.access_type:
        access_dict[fulltext.access_type] = ScholarAccess(
            access_type=fulltext.access_type,
            access_url=fulltext.access_url,
            mimetype=fulltext.file_mimetype,
            file_ident=fulltext.file_ident,
            release_ident=fulltext.release_ident,
        )
    if ia_sim and AccessType.ia_sim not in access_dict:
        access_dict[AccessType.ia_sim] = ScholarAccess(
            access_type=AccessType.ia_sim,
            access_url=f"https://archive.org/details/{ia_sim.issue_item}/page/{ia_sim.first_page}",
            # TODO: release_ident
        )

    # TODO: additional abstracts (?)

    tags = generate_tags(biblio, primary_release)

    # biorxiv/medrxiv hacks
    if (
        biblio.doi_prefix == "10.1101"
        and biblio.container_name in (None, "biorxiv/medrxiv")
        and biblio.release_stage != "published"
    ):
        for _, acc in access_dict.items():
            if "://www.medrxiv.org/" in acc.access_url:
                biblio.container_name = "medRxiv"
                if biblio.release_stage is None:
                    biblio.release_stage = "submitted"
            elif "://www.biorxiv.org/" in acc.access_url:
                biblio.container_name = "bioRxiv"
                if biblio.release_stage is None:
                    biblio.release_stage = "submitted"

    return ScholarDoc(
        key=key,
        collapse_key=sim_issue or work_ident,
        doc_type=heavy.doc_type.value,
        doc_index_ts=datetime.datetime.utcnow(),
        work_ident=work_ident,
        tags=tags,
        biblio=biblio,
        fulltext=fulltext,
        ia_sim=ia_sim,
        abstracts=abstracts,
        releases=[es_release_from_release(r) for r in heavy.releases],
        access=list(access_dict.values()),
    )