def test_grobid_parse() -> None:
    """
    Equivalent to test_grobid_parse_legacy(), but using the GrobidDocument
    type directly
    """
    with open("tests/files/example_grobid.tei.xml", "r") as f:
        blob = f.read()

    doc = parse_document_xml(blob)

    assert (
        doc.header.title
        == "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"
    )

    assert doc.citations is not None
    ref = [c for c in doc.citations if c.id == "b12"][0]
    assert ref.authors[0].given_name == "K"
    assert ref.authors[0].full_name == "K Tasa"
    assert ref.authors[0].surname == "Tasa"
    assert ref.journal == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    assert ref.date == "1996"
    assert ref.pages == "206-225"
    assert ref.volume == "8"
    assert (
        ref.unstructured
        == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )

def test_grobid_parse_legacy() -> None:
    """
    This function formerly tested the grobid2json file in this project. Now
    it tests backwards-compatibility of the grobid_tei_xml library.
    """
    with open("tests/files/example_grobid.tei.xml", "r") as f:
        blob = f.read()

    doc = parse_document_xml(blob)
    obj = doc.to_legacy_dict()

    assert (
        obj["title"]
        == "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"
    )

    ref = [c for c in obj["citations"] if c["id"] == "b12"][0]
    assert ref["authors"][0] == {
        "given_name": "K",
        "name": "K Tasa",
        "surname": "Tasa",
    }
    assert ref["journal"] == "Quality Management in Health Care"
    assert ref["title"] == "Using patient feedback for quality improvement"
    assert ref["date"] == "1996"
    assert ref["pages"] == "206-225"
    assert ref["volume"] == "8"
    assert (
        ref["unstructured"]
        == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )

def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    if result["status"] != "success":
        return None
    try:
        tei_doc = parse_document_xml(result["tei_xml"])
    except xml.etree.ElementTree.ParseError as pe:
        result["status"] = "bad-grobid-xml"
        return dict(error_msg=str(pe)[:1000])
    tei_doc.remove_encumbered()
    tei_json = tei_doc.to_legacy_dict()
    meta = dict()
    biblio = dict()
    for k in (
        "title",
        "authors",
        "journal",
        "date",
        "doi",
    ):
        if tei_json.get(k):
            biblio[k] = tei_json[k]
    meta["biblio"] = biblio
    for k in ("grobid_version", "grobid_timestamp", "fatcat_release", "language_code"):
        if tei_json.get(k):
            meta[k] = tei_json[k]
    return meta

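# A minimal usage sketch for GrobidClient.metadata() (the class appears in
# run_transform() below). It assumes only what the method itself consumes: a
# `result` dict with "status" and "tei_xml" keys; the dict shape beyond that
# is an assumption, not a documented API.
client = GrobidClient()
with open("tests/files/example_grobid.tei.xml", "r") as f:
    result = {"status": "success", "tei_xml": f.read()}
meta = client.metadata(result)
if meta:
    print(meta["biblio"].get("title"))
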
def test_small_xml():
    """
    This used to be a test of grobid2json; now it is a compatibility test for
    the to_legacy_dict() feature of grobid_tei_xml.
    """
    with open("tests/files/small.xml", "r") as f:
        tei_xml = f.read()
    with open("tests/files/small.json", "r") as f:
        json_form = json.loads(f.read())

    tei_doc = parse_document_xml(tei_xml)
    assert tei_doc.to_legacy_dict() == json_form

def run_transform(args):
    grobid_client = GrobidClient()
    for line in args.json_file:
        if not line.strip():
            continue
        line = json.loads(line)
        if args.metadata_only:
            out = grobid_client.metadata(line)
        else:
            tei_doc = parse_document_xml(line["tei_xml"])
            out = tei_doc.to_legacy_dict()
        if out:
            if "source" in line:
                out["source"] = line["source"]
            print(json.dumps(out))

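# Hypothetical CLI wiring for run_transform(); the argument names are
# assumptions inferred from the attributes the function reads
# (args.json_file, args.metadata_only), not the project's actual entrypoint.
import argparse
import sys

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "json_file", nargs="?", type=argparse.FileType("r"), default=sys.stdin
    )
    parser.add_argument("--metadata-only", action="store_true")
    run_transform(parser.parse_args())
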
def run(mode="hbase"): for line in sys.stdin: if mode == "hbase": sha1hex, tei_xml = parse_hbase(line) elif mode == "pg": sha1hex, tei_xml = parse_pg(line) else: raise NotImplementedError("parse mode: {}".format(mode)) tei_doc = parse_document_xml(tei_xml) tei_doc.remove_encumbered() obj = tei_doc.to_legacy_dict() affiliations = [] for author in obj["authors"]: if author.get("affiliation"): affiliations.append(author["affiliation"]) if affiliations: # don't duplicate affiliations; only the unique ones affiliations = list(set([json.dumps(a) for a in affiliations])) affiliations = [json.loads(a) for a in affiliations] print("\t".join([sha1hex, json.dumps(affiliations)]))
def test_transform_refs_grobid() -> None:
    with open("tests/files/example_grobid.tei.xml", "r") as f:
        blob = f.read()

    dummy_release = ReleaseEntity(
        ident="releasedummy22222222222222",
        work_id="workdummy22222222222222222",
        release_year=1234,
        release_stage="accepted",
        ext_ids={},
    )

    tei_doc = parse_document_xml(blob)
    refs = refs_from_grobid(dummy_release, tei_doc)

    ref = refs[12]
    assert ref.release_ident == "releasedummy22222222222222"
    assert ref.work_ident == "workdummy22222222222222222"
    assert ref.release_stage == "accepted"
    assert ref.release_year == 1234
    assert ref.ref_source == "grobid"
    assert ref.key == "b12"
    assert ref.index == 13
    assert ref.locator is None
    assert ref.biblio.contrib_raw_names is not None
    assert ref.biblio.contrib_raw_names[0] == "K Tasa"
    assert ref.biblio.container_name == "Quality Management in Health Care"
    assert ref.biblio.title == "Using patient feedback for quality improvement"
    assert ref.biblio.year == 1996
    assert ref.biblio.pages == "206-225"
    assert ref.biblio.volume == "8"
    assert (
        ref.biblio.unstructured
        == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )

def test_invalid_xml():
    with pytest.raises(xml.etree.ElementTree.ParseError):
        parse_document_xml("this is not XML")
    with pytest.raises(ValueError):
        parse_document_xml("<xml></xml>")

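# As the test above shows, callers face two failure modes: ParseError for
# malformed XML, and ValueError for well-formed XML that is not a TEI
# document. A defensive sketch using a hypothetical helper, mirroring the
# try/except pattern in metadata() above:
def parse_or_none(blob: str) -> Optional[GrobidDocument]:
    try:
        return parse_document_xml(blob)
    except (xml.etree.ElementTree.ParseError, ValueError):
        return None
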
def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]:
    """
    Current behavior is to return *both* fatcat refs and GROBID refs if
    available.
    """
    if heavy.doc_type != DocType.work:
        return []

    assert heavy.biblio_release_ident
    primary_release = [
        r for r in heavy.releases if r.ident == heavy.biblio_release_ident
    ][0]

    refs: List[RefStructured] = []
    fatcat_refs: List[RefStructured] = []
    if primary_release.refs:
        fatcat_refs = refs_from_release_refs(primary_release)
    else:
        # if there are no refs for the "primary" release, take any other refs we can find
        for release in heavy.releases:
            if release.refs:
                fatcat_refs = refs_from_release_refs(release)
                break

    fulltext_refs: List[RefStructured] = []
    # TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+
    if (
        heavy.grobid_fulltext
        and heavy.grobid_fulltext.get("file_ident") != "gbbvrg2tpzan5hl3qcsfzh4vfq"
    ):
        fulltext_release = [
            r
            for r in heavy.releases
            if r.ident == heavy.grobid_fulltext["release_ident"]
        ][0]
        tei_doc = parse_document_xml(heavy.grobid_fulltext["tei_xml"])
        fulltext_refs = refs_from_grobid(fulltext_release, tei_doc)

    crossref_refs: List[RefStructured] = []
    if heavy.crossref:
        crossref_release = [
            r for r in heavy.releases if r.ident == heavy.crossref["release_ident"]
        ][0]
        crossref_refs = refs_from_crossref(crossref_release, heavy.crossref)

    # TODO: better logic for prioritizing/combining references from multiple sources?
    # TODO: test coverage
    if (
        fatcat_refs
        and crossref_refs
        and all([r.ref_source in ["crossref", "fatcat-crossref"] for r in fatcat_refs])
    ):
        # prioritize recent crossref over old-fatcat-imported-from-crossref (?)
        fatcat_refs = []
    elif (
        fatcat_refs
        and fulltext_refs
        and all([r.ref_source in ["grobid", "fatcat-grobid"] for r in fatcat_refs])
    ):
        # prioritize newer GROBID fulltext extraction (?)
        fatcat_refs = []

    refs.extend(fatcat_refs)
    refs.extend(crossref_refs)

    # include fulltext refs only if there are more of them than in both the crossref and fatcat refs
    if len(fulltext_refs) > len(fatcat_refs) and len(fulltext_refs) > len(
        crossref_refs
    ):
        refs.extend(fulltext_refs)

    # TODO: use GROBID to parse any refs which only have 'unstructured' (if they don't already come from GROBID)
    return refs

def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:

    tags: List[str] = []
    work_ident: Optional[str] = None
    sim_issue: Optional[str] = None
    abstracts: List[ScholarAbstract] = []
    fulltext: Optional[ScholarFulltext] = None
    primary_release: Optional[ReleaseEntity] = None
    exclude_web_fulltext: bool = False

    ia_sim: Optional[ScholarSim] = None
    if heavy.sim_fulltext is not None:
        ia_sim = es_sim_from_sim(heavy.sim_fulltext)
        fulltext = es_fulltext_from_sim(heavy.sim_fulltext)

    if heavy.doc_type == DocType.sim_page:
        assert ia_sim is not None
        assert heavy.sim_fulltext is not None
        if not ia_sim.first_page or not ia_sim.issue_item:
            # can't create a valid key if we don't have these fields, so shouldn't index
            return None
        key = f"page_{ia_sim.issue_item}_{ia_sim.first_page}"
        sim_issue = ia_sim.issue_item
        biblio = es_biblio_from_sim(heavy.sim_fulltext)
        # fulltext extracted from heavy.sim_fulltext above
    elif heavy.doc_type == DocType.work:
        work_ident = heavy.releases[0].work_id
        key = f"work_{work_ident}"
        assert heavy.biblio_release_ident
        primary_release = [
            r for r in heavy.releases if r.ident == heavy.biblio_release_ident
        ][0]
        biblio = es_biblio_from_release(primary_release)
        biblio = biblio_metadata_hacks(biblio)
        exclude_web_fulltext = check_exclude_web(biblio)
        abstracts = es_abstracts_from_release(primary_release)
        # if no abstract from primary_release, try all the other releases
        for release in heavy.releases:
            if not abstracts:
                abstracts = es_abstracts_from_release(release)
    else:
        raise NotImplementedError(f"doc_type: {heavy.doc_type}")

    # TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+
    if (
        heavy.grobid_fulltext
        and heavy.grobid_fulltext.get("file_ident") != "gbbvrg2tpzan5hl3qcsfzh4vfq"
    ):
        fulltext_release = [
            r
            for r in heavy.releases
            if r.ident == heavy.grobid_fulltext["release_ident"]
        ][0]
        fulltext_file = [
            f
            for f in fulltext_release.files
            if f.ident == heavy.grobid_fulltext["file_ident"]
        ][0]
        try:
            tei_doc: Optional[GrobidDocument] = parse_document_xml(
                heavy.grobid_fulltext["tei_xml"]
            )
        except xml.etree.ElementTree.ParseError:
            tei_doc = None
        if tei_doc:
            if not abstracts:
                abstracts = es_abstracts_from_grobid(tei_doc)
            grobid_fulltext = es_fulltext_from_grobid(
                tei_doc, heavy.pdf_meta, fulltext_release, fulltext_file
            )
            if exclude_web_fulltext and grobid_fulltext:
                if not fulltext:
                    # include only partial fulltext object, with no access
                    fulltext = grobid_fulltext.remove_access()
            else:
                fulltext = grobid_fulltext

    if not fulltext and heavy.pdftotext_fulltext:
        fulltext_release = [
            r
            for r in heavy.releases
            if r.ident == heavy.pdftotext_fulltext["release_ident"]
        ][0]
        fulltext_file = [
            f
            for f in fulltext_release.files
            if f.ident == heavy.pdftotext_fulltext["file_ident"]
        ][0]
        pdftotext_fulltext = es_fulltext_from_pdftotext(
            heavy.pdftotext_fulltext["raw_text"],
            heavy.pdf_meta,
            fulltext_release,
            fulltext_file,
        )
        if exclude_web_fulltext and pdftotext_fulltext:
            fulltext = pdftotext_fulltext.remove_access()
        else:
            fulltext = pdftotext_fulltext

    if not fulltext and heavy.html_fulltext:
        fulltext_release = [
            r
            for r in heavy.releases
            if r.ident == heavy.html_fulltext["release_ident"]
        ][0]
        fulltext_webcapture = [
            f
            for f in fulltext_release.webcaptures
            if f.ident == heavy.html_fulltext["webcapture_ident"]
        ][0]
        html_fulltext = es_fulltext_from_html(
            heavy.html_fulltext,
            fulltext_release,
            fulltext_webcapture,
        )
        if exclude_web_fulltext and html_fulltext:
            fulltext = html_fulltext.remove_access()
        else:
            fulltext = html_fulltext

    # TODO: additional access list (eg, HTML if only PDF currently)
    access_dict = dict()
    if fulltext and fulltext.access_type:
        access_dict[fulltext.access_type] = ScholarAccess(
            access_type=fulltext.access_type,
            access_url=fulltext.access_url,
            mimetype=fulltext.file_mimetype,
            file_ident=fulltext.file_ident,
            release_ident=fulltext.release_ident,
        )
    if ia_sim and AccessType.ia_sim not in access_dict:
        access_dict[AccessType.ia_sim] = ScholarAccess(
            access_type=AccessType.ia_sim,
            access_url=f"https://archive.org/details/{ia_sim.issue_item}/page/{ia_sim.first_page}",
            # TODO: release_ident
        )

    # TODO: additional abstracts (?)

    tags = generate_tags(biblio, primary_release)

    # biorxiv/medrxiv hacks
    if (
        biblio.doi_prefix == "10.1101"
        and biblio.container_name in (None, "biorxiv/medrxiv")
        and biblio.release_stage != "published"
    ):
        for _, acc in access_dict.items():
            if "://www.medrxiv.org/" in acc.access_url:
                biblio.container_name = "medRxiv"
                if biblio.release_stage is None:
                    biblio.release_stage = "submitted"
            elif "://www.biorxiv.org/" in acc.access_url:
                biblio.container_name = "bioRxiv"
                if biblio.release_stage is None:
                    biblio.release_stage = "submitted"

    return ScholarDoc(
        key=key,
        collapse_key=sim_issue or work_ident,
        doc_type=heavy.doc_type.value,
        doc_index_ts=datetime.datetime.utcnow(),
        work_ident=work_ident,
        tags=tags,
        biblio=biblio,
        fulltext=fulltext,
        ia_sim=ia_sim,
        abstracts=abstracts,
        releases=[es_release_from_release(r) for r in heavy.releases],
        access=list(access_dict.values()),
    )