def grobid_ref_to_release(ref: dict) -> ReleaseEntity:
    """
    Takes the dict returned by transform_grobid_ref_xml() and returns a
    partial ReleaseEntity object (for use with fuzzycat)
    """
    contribs = [
        ReleaseContrib(
            raw_name=author.get("name"),
            given_name=author.get("given_name"),
            surname=author.get("surname"),
        )
        for author in (ref.get("authors") or [])
    ]
    release = ReleaseEntity(
        title=ref.get("title"),
        contribs=contribs,
        volume=ref.get("volume"),
        issue=ref.get("issue"),
        pages=ref.get("pages"),
        ext_ids=ReleaseExtIds(
            doi=clean_doi(ref.get("doi")),
            pmid=ref.get("pmid"),
            pmcid=ref.get("pmcid"),
            arxiv=ref.get("arxiv_id"),
        ),
    )
    journal = ref.get("journal")
    if journal:
        release.extra = {"container_name": journal}
    date = ref.get("date")
    # accept either a bare year ("1996") or a date starting with one
    if date and len(date) >= 4 and date[0:4].isdigit():
        release.release_year = int(date[0:4])
    # TODO: try to parse 'date' into an ISO date format, and assign to release_date?
    return release
def ref_to_release(ref: dict) -> ReleaseEntity:
    """
    Convert a citation dict (keys like 'title', 'authors', 'doi', 'date')
    into a partial ReleaseEntity.

    Fix/generalization: previously a release_year was only extracted from
    exactly-4-digit 'date' strings; now any date starting with a 4-digit
    year (eg "2001-05-10") is handled, consistent with sibling helpers.
    """
    contribs = []
    for author in ref.get("authors") or []:
        contribs.append(
            ReleaseContrib(
                raw_name=author.get("name"),
                given_name=author.get("given_name"),
                surname=author.get("surname"),
            ))
    release = ReleaseEntity(
        title=ref.get("title"),
        contribs=contribs,
        volume=ref.get("volume"),
        issue=ref.get("issue"),
        pages=ref.get("pages"),
        ext_ids=ReleaseExtIds(
            doi=ref.get("doi"),
            pmid=ref.get("pmid"),
            pmcid=ref.get("pmcid"),
            arxiv=ref.get("arxiv_id"),
        ),
    )
    if ref.get("journal"):
        release.extra = {"container_name": ref.get("journal")}
    if ref.get("date"):
        # accept a bare year or a longer date string with a year prefix
        if len(ref["date"]) >= 4 and ref["date"][0:4].isdigit():
            release.release_year = int(ref["date"][0:4])
    return release
def ref_to_release(ref: GrobidBiblio) -> ReleaseEntity:
    """
    Convert a GrobidBiblio citation into a partial ReleaseEntity.

    Fix/generalization: previously a release_year was only extracted when
    `ref.date` was exactly a 4-digit string; now any date beginning with a
    4-digit year (eg "2001-05-10") is handled, consistent with the dict
    variant of this helper. (GrobidBiblio.date is presumably a string;
    year-only values behave exactly as before.)
    """
    contribs = []
    for author in ref.authors or []:
        contribs.append(
            ReleaseContrib(
                raw_name=author.full_name,
                given_name=author.given_name,
                surname=author.surname,
            )
        )
    release = ReleaseEntity(
        title=ref.title,
        contribs=contribs,
        volume=ref.volume,
        issue=ref.issue,
        pages=ref.pages,
        ext_ids=ReleaseExtIds(
            doi=ref.doi,
            pmid=ref.pmid,
            pmcid=ref.pmcid,
            arxiv=ref.arxiv_id,
        ),
    )
    if ref.journal:
        release.extra = {"container_name": ref.journal}
    if ref.date:
        if len(ref.date) >= 4 and ref.date[0:4].isdigit():
            release.release_year = int(ref.date[0:4])
    return release
def test_elasticsearch_release_kbart_year():
    """
    Check KBART preservation logic in the elasticsearch transform around the
    "current year" boundary: a span ending two years ago does not count as
    preserved, while a span ending last year does (preservation "dark").
    """
    this_year = datetime.date.today().year
    r = ReleaseEntity(
        title="something",
        release_year=this_year,
        license_slug="CC-BY-NC",
        ext_ids=ReleaseExtIds(),
        refs=[
            ReleaseRef(),
            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
        ],
    )
    r.state = "active"
    # KBART span ends two years back: this_year release is NOT covered
    r.container = ContainerEntity(
        name="dummy journal",
        extra={
            "kbart": {
                "lockss": {
                    "year_spans": [[1900, this_year - 2]],
                },
            },
        },
    )
    es = release_to_elasticsearch(r)
    assert es["release_year"] == this_year
    assert es["preservation"] == "none"
    assert es["is_oa"] is True
    assert es["is_longtail_oa"] is False
    assert es["is_preserved"] is False
    assert es["in_web"] is False
    assert es["in_dweb"] is False
    assert es["in_ia"] is False
    assert es["in_ia_sim"] is False
    assert es["in_kbart"] is False
    assert es["in_jstor"] is False
    # KBART span ends last year: release now counts as KBART-preserved
    r.container = ContainerEntity(
        name="dummy journal",
        extra={
            "kbart": {
                "lockss": {
                    "year_spans": [[1900, this_year - 1]],
                },
            },
        },
    )
    es = release_to_elasticsearch(r)
    assert es["release_year"] == this_year
    assert es["preservation"] == "dark"
    assert es["is_oa"] is True
    assert es["is_longtail_oa"] is False
    assert es["is_preserved"] is True
    assert es["in_web"] is False
    assert es["in_dweb"] is False
    assert es["in_ia"] is False
    assert es["in_ia_sim"] is False
    assert es["in_kbart"] is True
    assert es["in_jstor"] is False
def test_transform_refs_grobid() -> None:
    """
    Spot-check refs_from_grobid() against a known GROBID TEI-XML fixture,
    verifying biblio fields of the reference at index 12.
    """
    with open("tests/files/example_grobid.tei.xml", "r") as f:
        blob = f.read()

    # minimal release stub; only identifiers/year are carried into refs
    dummy_release = ReleaseEntity(
        ident="releasedummy22222222222222",
        work_id="workdummy22222222222222222",
        release_year=1234,
        ext_ids={},
    )

    tei_dict = teixml2json(blob, True)
    refs = refs_from_grobid(dummy_release, tei_dict)

    ref = refs[12].biblio
    assert ref.contrib_raw_names is not None
    assert ref.contrib_raw_names[0] == "K Tasa"
    assert ref.container_name == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    assert ref.year == 1996
    assert ref.pages == "206-225"
    assert ref.volume == "8"
    # unstructured text is kept verbatim, including OCR hyphenation artifacts
    assert (
        ref.unstructured
        == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )
def enrich_release_from_crossref(release: ReleaseEntity, record: Dict[str, Any]) -> ReleaseEntity:
    """
    Hack to copy some SIM-relevant fields from Crossref record to release
    entity.

    We should really update fatcat catalog itself with these fields, instead
    of doing the update here in the scholar pipeline, but that is a more
    delicate update, and we expect this to help make SIM matches faster
    (late 2021/early 2022).

    Fix: the Crossref schema uses the singular "page" key; the old code
    guarded on `record.get("pages")`, which never matched, so pages were
    never copied.
    """
    if release.volume is None and record.get("volume"):
        release.volume = clean_str(record["volume"])
    if release.issue is None and record.get("issue"):
        release.issue = clean_str(record["issue"])
    if release.pages is None and record.get("page"):
        release.pages = clean_str(record["page"])
    return release
def update_entity(self, re: ReleaseEntity) -> None:
    """
    Mutates a release entity in place, updating fields with values from
    this form.

    Form must be validated *before* calling this function.
    """
    # copy simple scalar fields directly from the form
    for simple_attr in RELEASE_SIMPLE_ATTRS:
        a = getattr(self, simple_attr).data
        # special case blank strings
        if a == "":
            a = None
        setattr(re, simple_attr, a)
    # external identifiers live on the nested ext_ids object
    for extid_attr in RELEASE_EXTID_ATTRS:
        a = getattr(self, extid_attr).data
        # special case blank strings
        if a == "":
            a = None
        setattr(re.ext_ids, extid_attr, a)
    # keep release_year in sync with a full release_date, when given
    if self.release_date.data:
        re.release_year = self.release_date.data.year
    # bunch of complexity here to preserve old contrib metadata (eg,
    # affiliation and extra) not included in current forms
    # TODO: this may be broken; either way needs tests
    if re.contribs:
        old_contribs = re.contribs.copy()
        re.contribs = []
    else:
        old_contribs = []
        re.contribs = []
    for c in self.contribs:
        if c.prev_index.data not in ("", None):
            # contrib carried over from the existing entity: update only
            # role and raw_name, preserving other metadata
            rc = old_contribs[int(c.prev_index.data)]
            rc.role = c.role.data or None
            rc.raw_name = c.raw_name.data or None
        else:
            # contrib newly added via the form
            rc = ReleaseContrib(
                role=c.role.data or None,
                raw_name=c.raw_name.data or None,
            )
        re.contribs.append(rc)
    if self.edit_description.data:
        re.edit_extra = dict(description=self.edit_description.data)
def parse_record(self, row: str) -> ReleaseEntity:
    """
    Parse one whitespace-delimited input line into a stub ReleaseEntity.

    The first token on the line is the release ident (a 26-character
    fatcat identifier).
    """
    # bezerk mode doesn't make sense for this importer
    assert self.bezerk_mode is False
    tokens = row.strip().split()
    ident = tokens[0]
    assert len(ident) == 26
    return ReleaseEntity(ident=ident, ext_ids=ReleaseExtIds())
def test_transform_refs_crossref() -> None:
    """
    Spot-check refs_from_crossref() against a known Crossref record fixture:
    carried-over release metadata, key normalization, and a variety of
    structured/unstructured reference shapes.
    """
    with open("tests/files/example_crossref_record.json", "r") as f:
        record = json.loads(f.read())

    # minimal release stub; identifiers/stage/year are carried into refs
    dummy_release = ReleaseEntity(
        ident="releasedummy22222222222222",
        work_id="workdummy22222222222222222",
        release_year=1234,
        release_stage="accepted",
        ext_ids={},
    )

    refs = refs_from_crossref(dummy_release, record)

    # release-level fields copied onto each ref
    assert refs[0].release_ident == "releasedummy22222222222222"
    assert refs[0].work_ident == "workdummy22222222222222222"
    assert refs[0].release_stage == "accepted"
    assert refs[0].release_year == 1234
    assert refs[0].ref_source == "crossref"
    assert refs[0].key == "BIB0001|his12200-cit-0001"
    assert refs[0].index == 1
    assert refs[0].locator is None
    # fully-structured reference
    assert refs[0].biblio.contrib_raw_names is not None
    assert refs[0].biblio.contrib_raw_names[0] == "Churg"
    assert refs[0].biblio.container_name == "Arch. Pathol. Lab. Med."
    assert (
        refs[0].biblio.title
        == "The separation of benign and malignant mesothelial proliferations"
    )
    assert refs[0].biblio.year == 2012
    assert refs[0].biblio.pages == "1217"
    assert refs[0].biblio.volume == "136"
    assert refs[0].biblio.doi == "10.5858/arpa.2012-0112-ra"
    assert refs[0].biblio.unstructured is None
    # long title with unicode dash preserved
    assert (
        refs[6].biblio.title
        == "Advances in Laser Remote Sensing – Selected Papers Presented at the 20th International Laser Radar Conference"
    )
    assert refs[6].biblio.year == 2001
    # unstructured-only reference
    assert refs[7].key == "CIT0041"
    assert (
        refs[7].biblio.unstructured
        == "Linda Weiss,Creating Capitalism. Oxford: Blackwell, 1988. 272 pp. £29.95. ISBN 0 631 15733 6."
    )
    # mixed structured+unstructured reference, with a version
    assert refs[8].key == "576_CR3"
    assert refs[8].biblio.unstructured is not None
    assert refs[8].biblio.title == "The NURBS Book, Monographs in Visual Communication"
    assert refs[8].biblio.year == 1997
    assert refs[8].biblio.version == "2"
def test_fuzzy_match_different(entity_importer, mocker) -> None:
    """
    Simple fuzzycat-mocked test for "strong match" case

    Fix: compare against None with `is None` instead of `== None`
    (PEP 8 / flake8 E711).
    """
    r1 = ReleaseEntity(
        title="example title: novel work",
        contribs=[ReleaseContrib(raw_name="robin hood")],
        ext_ids=ReleaseExtIds(doi="10.1234/abcdefg"),
    )
    r2 = ReleaseEntity(
        title="Example Title: Novel Work?",
        contribs=[ReleaseContrib(raw_name="robin hood")],
        ext_ids=ReleaseExtIds(),
    )
    r3 = ReleaseEntity(
        title="entirely different",
        contribs=[ReleaseContrib(raw_name="king tut")],
        ext_ids=ReleaseExtIds(),
    )

    match_raw = mocker.patch(
        'fatcat_tools.importers.common.match_release_fuzzy')

    # close-but-not-identical candidate: STRONG match
    match_raw.side_effect = [[r3, r2, r3, r2]]
    resp = entity_importer.match_existing_release_fuzzy(r1)
    assert (resp[0], resp[2]) == ("STRONG", r2)

    # identical candidate present: EXACT match
    match_raw.side_effect = [[r2, r2, r3, r1]]
    resp = entity_importer.match_existing_release_fuzzy(r1)
    assert (resp[0], resp[2]) == ("EXACT", r1)

    # only an unrelated candidate: no match
    match_raw.side_effect = [[r3]]
    resp = entity_importer.match_existing_release_fuzzy(r1)
    assert resp is None

    # no candidates at all: no match
    match_raw.side_effect = [[]]
    resp = entity_importer.match_existing_release_fuzzy(r1)
    assert resp is None
def biblio_to_release(biblio: dict) -> ReleaseEntity:
    """
    Helper for close_fuzzy_biblio_matches() et al

    Transforms a loose biblio metadata dict into a partial ReleaseEntity.

    Fix: the contribs list was being built from 'authors'/'author_names'/
    'first_author' but never attached to the returned release (dead
    computation); it is now passed to the ReleaseEntity constructor, as in
    sibling helpers.
    """
    contribs = []
    if biblio.get('authors'):
        for a in biblio['authors']:
            contribs.append(
                ReleaseContrib(
                    raw_name=a.get('name'),
                    given_name=a.get('given_name'),
                    surname=a.get('surname'),
                ))
    elif biblio.get('author_names'):
        for a in biblio['author_names']:
            contribs.append(ReleaseContrib(raw_name=a))
    elif biblio.get('first_author'):
        contribs.append(ReleaseContrib(raw_name=biblio['first_author']))
    release = ReleaseEntity(
        title=biblio.get("title"),
        contribs=contribs,
        ext_ids=ReleaseExtIds(
            doi=clean_doi(biblio.get("doi")),
            pmid=biblio.get("pmid"),
            pmcid=biblio.get("pmcid"),
            arxiv=biblio.get("arxiv_id"),
        ),
        volume=biblio.get("volume"),
        issue=biblio.get("issue"),
        pages=biblio.get("pages") or biblio.get("first_page"),
        publisher=biblio.get("publisher"),
        release_stage=biblio.get("release_stage"),
        release_type=biblio.get("release_type"),
        extra=dict(),
    )
    if biblio.get('journal'):
        release.extra['container_name'] = biblio['journal']
    elif biblio.get('conference'):
        release.extra['container_name'] = biblio['conference']
    # prefer an explicit 'year' (str or int); fall back to a 'date' string
    # with a 4-digit year prefix
    if biblio.get('year'):
        year = biblio['year']
        if isinstance(year, str) and len(year) >= 4 and year[0:4].isdigit():
            release.release_year = int(year[0:4])
        elif isinstance(year, int):
            release.release_year = year
    elif biblio.get('date'):
        date = biblio['date']
        if isinstance(date, str) and len(date) >= 4 and date[0:4].isdigit():
            release.release_year = int(date[0:4])
    return release
def test_fuzzy_match_none(entity_importer, mocker) -> None:
    """
    Simple ES-mocked test for "no search results" case

    Fix: compare against None with `is None` instead of `== None`
    (PEP 8 / flake8 E711).
    """
    es_raw = mocker.patch(
        'elasticsearch.connection.Urllib3HttpConnection.perform_request')
    # two empty responses: one per ES query issued by the matcher
    es_raw.side_effect = [
        (200, {}, json.dumps(ES_RELEASE_EMPTY_RESP)),
        (200, {}, json.dumps(ES_RELEASE_EMPTY_RESP)),
    ]

    release = ReleaseEntity(
        title="some long title which should not match anything because it is for testing",
        ext_ids=ReleaseExtIds(),
    )

    resp = entity_importer.match_existing_release_fuzzy(release)
    assert resp is None
def generic_deleted_entity(entity_type: str, ident: str) -> Any:
    """
    Construct a stub entity of the given type, marked as deleted, with
    only the ident and state fields populated.
    """
    # release is special-cased: it requires an ext_ids argument
    if entity_type == "release":
        entity: Any = ReleaseEntity(ext_ids=ReleaseExtIds())
    else:
        constructors = {
            "container": ContainerEntity,
            "creator": CreatorEntity,
            "file": FileEntity,
            "fileset": FilesetEntity,
            "webcapture": WebcaptureEntity,
            "work": WorkEntity,
        }
        if entity_type not in constructors:
            raise NotImplementedError
        entity = constructors[entity_type]()
    entity.ident = ident
    entity.state = "deleted"
    return entity
def test_transform_refs_grobid() -> None:
    """
    Spot-check refs_from_grobid() against a known GROBID TEI-XML fixture,
    verifying both release-level carried-over fields and biblio fields of
    the reference at index 12.
    """
    with open("tests/files/example_grobid.tei.xml", "r") as f:
        blob = f.read()

    # minimal release stub; identifiers/stage/year are carried into refs
    dummy_release = ReleaseEntity(
        ident="releasedummy22222222222222",
        work_id="workdummy22222222222222222",
        release_year=1234,
        release_stage="accepted",
        ext_ids={},
    )

    tei_doc = parse_document_xml(blob)
    refs = refs_from_grobid(dummy_release, tei_doc)

    ref = refs[12]
    assert ref.release_ident == "releasedummy22222222222222"
    assert ref.work_ident == "workdummy22222222222222222"
    assert ref.release_stage == "accepted"
    assert ref.release_year == 1234
    assert ref.ref_source == "grobid"
    # key comes from the TEI xml:id; index is 1-based
    assert ref.key == "b12"
    assert ref.index == 13
    assert ref.locator is None
    assert ref.biblio.contrib_raw_names is not None
    assert ref.biblio.contrib_raw_names[0] == "K Tasa"
    assert ref.biblio.container_name == "Quality Management in Health Care"
    assert ref.biblio.title == "Using patient feedback for quality improvement"
    assert ref.biblio.year == 1996
    assert ref.biblio.pages == "206-225"
    assert ref.biblio.volume == "8"
    # unstructured text is kept verbatim, including OCR hyphenation artifacts
    assert (
        ref.biblio.unstructured
        == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )
def to_entity(self) -> ReleaseEntity:
    """
    Build a new ReleaseEntity from this (already validated) form.

    A non-empty title is required; all other fields are copied over by
    update_entity().
    """
    assert self.title.data
    entity = ReleaseEntity(
        title=self.title.data,
        ext_ids=ReleaseExtIds(),
    )
    self.update_entity(entity)
    return entity
def test_rich_elasticsearch_convert():
    """
    End-to-end check of release_to_elasticsearch() on a richly-populated
    release: a container with SIM/KBART/JSTOR year spans, one file with
    web, dweb, and webarchive URLs, and two refs (one linked).
    """
    r = ReleaseEntity(
        title="something",
        release_year=1234,
        license_slug="CC-BY-NC",
        ext_ids=ReleaseExtIds(),
        refs=[
            ReleaseRef(),
            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
        ],
    )
    r.state = "active"
    # release_year 1234 falls inside the KBART lockss and jstor spans, but
    # outside the SIM span
    r.container = ContainerEntity(
        name="dummy journal",
        extra={
            "ia": {
                "sim": {
                    "year_spans": [[1000, 1100]],
                },
            },
            "kbart": {
                "lockss": {
                    "year_spans": [[1200, 1300]],
                },
                "jstor": {
                    "year_spans": [[1000, 1300], [1950, 1960], [1980, 2005]],
                },
            },
            "sherpa_romeo": {
                "color": "blue"
            },
            "doaj": {
                "as_of": "2010-02-03"
            },
        },
    )
    # file has shadows metadata but also URLs, so it is kept
    r.files = [
        FileEntity(
            mimetype="application/pdf",
            urls=[
                FileUrl(rel="dweb", url="dat://a954329dlk/thingie"),
                FileUrl(
                    rel="webarchive",
                    url=
                    "https://web.archive.org/web/20001122030405/http://example.com",
                ),
                FileUrl(rel="web", url="https://archive.org/details/blah/file.pdf"),
            ],
            extra={
                "shadows": {},
            },
        )
    ]
    es = release_to_elasticsearch(r)
    assert es["release_year"] == r.release_year
    assert es["file_count"] == 1
    assert es["fileset_count"] == 0
    assert es["webcapture_count"] == 0
    assert es["ref_count"] == 2
    assert es["ref_linked_count"] == 1

    # archive.org URL + KBART coverage => "bright" preservation
    assert es["preservation"] == "bright"
    assert es["is_oa"] is True
    assert es["is_longtail_oa"] is False
    assert es["is_preserved"] is True
    assert es["in_web"] is True
    assert es["in_dweb"] is True
    assert es["in_ia"] is True
    assert es["in_ia_sim"] is False
    assert es["in_kbart"] is True
    assert es["in_jstor"] is True
def biblio_hacks(re: ReleaseEntity) -> ReleaseEntity:
    """
    This function handles known special cases. For example, publisher-specific
    or platform-specific workarounds.

    Fix: the institutional-repository prefix filter iterated over
    `IR_DOI_PREFIXES and re.extra and re.extra.get("datacite")`, which
    evaluates to the datacite dict (or a falsy value), so `prefix` looped
    over dict keys and the DOI-prefix check effectively never matched. The
    guard and the loop are now separated.
    """
    # only runs on datacite entities with a DOI
    assert re.ext_ids.doi

    # release_type exception: Global Biodiversity Information Facility
    # publishes highly interesting datasets, but titles are mostly the same
    # ("GBIF Occurrence Download" or "Occurrence Download"); set
    # release_type to "stub" (CSL/FC).
    if re.title == "GBIF Occurrence Download" and re.ext_ids.doi.startswith("10.15468/dl."):
        re.release_type = "stub"

    # release_type exception: lots of "Experimental Crystal Structure Determination"
    # publisher: "Cambridge Crystallographic Data Centre"
    if re.ext_ids.doi.startswith("10.5517/"):
        re.release_type = "entry"

    # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire."
    if re.title.lower().startswith("additional file") and re.release_type in (
        "article",
        "article-journal",
    ):
        re.release_type = "component"

    # figshare
    if re.ext_ids.doi.startswith("10.6084/") or re.ext_ids.doi.startswith("10.25384"):
        # set version if DOI ends with versioned suffix
        doi_suffix = re.ext_ids.doi.split(".")[-1]
        if doi_suffix and doi_suffix.startswith("v") and doi_suffix[1:].isdigit():
            re.version = doi_suffix
        # "Figure 123 from " -> component
        # "Table S1. ;Figure S1;Figure S2. ;Figure S3. ;Figure S4. from Use of organic exudates from two polar diatoms by bacterial isolates from the Arctic ocean"
        if " from " in re.title and re.release_type not in ("stub", "graphic"):
            if re.title.startswith("Figure "):
                re.release_type = "component"
            elif re.title.startswith("Table "):
                re.release_type = "component"

    # figshare.com
    if (re.ext_ids.doi.startswith("10.6084/m9.figshare.")
            and re.extra.get("container_name") is None):
        re.extra["container_name"] = "figshare.com"

    # Columbia Institutional Repository includes full bibliographic
    # metadata, which results in incorrect container_id matches. But this
    # DOI prefix also publishes actual journals!
    if (re.ext_ids.doi.startswith("10.7916/") and "-" in re.ext_ids.doi
            and re.publisher == "Columbia University" and re.extra
            and re.extra.get("datacite")):
        for relation in re.extra["datacite"].get("relations", []):
            if relation.get("relationType") == "IsVariantFormOf":
                re.container_id = None
                if re.release_stage in ("published", None):
                    re.release_stage = "submitted"

    # several institutional and other repositories (including "RWTH" and
    # "DESY") also results in incorrect container_id matches.
    # This probably doesn't filter out enough, but is a start.
    IR_DOI_PREFIXES = [
        "10.15495/epub_ubt_",
        "10.18154/rwth-20",
        "10.3204/pubdb-",
        "10.3204/phppubdb-",
        "10.26204/kluedo/",
    ]
    if re.extra and re.extra.get("datacite"):
        for prefix in IR_DOI_PREFIXES:
            if re.ext_ids.doi.startswith(prefix):
                for relation in re.extra["datacite"].get("relations", []):
                    if relation.get("relationType") == "IsVariantFormOf":
                        re.container_id = None

    return re
def parse_record(self, record: Any) -> Optional[List[ReleaseEntity]]:
    """
    Parse one arXivRaw OAI-PMH record into a list of ReleaseEntity objects,
    one per arXiv submission version (all marked "superceded" in extra
    except the most recent).

    Returns None for empty/missing records.
    """
    if not record:
        return None
    metadata = record.arXivRaw
    if not metadata:
        return None
    extra: Dict[str, Any] = dict()
    extra_arxiv: Dict[str, Any] = dict()

    # don't know!
    release_type = "article"

    base_id = metadata.id.string

    doi = None
    if metadata.doi and metadata.doi.string:
        doi = clean_doi(metadata.doi.string.lower().split()[0].strip())
        # sanity check: a DOI must be "10.<prefix>/<non-empty suffix>"
        if doi and not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]):
            sys.stderr.write("BOGUS DOI: {}\n".format(doi))
            doi = None

    title = latex_to_text(metadata.title.get_text().replace("\n", " "))
    authors = parse_arxiv_authors(metadata.authors.get_text().replace("\n", " "))
    contribs = [
        fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role="author")
        for i, a in enumerate(authors)
    ]

    lang: Optional[str] = "en"  # the vast majority in english
    if metadata.comments and metadata.comments.get_text():
        comments = metadata.comments.get_text().replace("\n", " ").strip()
        extra_arxiv["comments"] = comments
        # crude language detection based on author-supplied comments
        if "in french" in comments.lower():
            lang = "fr"
        elif "in spanish" in comments.lower():
            lang = "es"
        elif "in portuguese" in comments.lower():
            lang = "pt"
        elif "in hindi" in comments.lower():
            lang = "hi"
        elif "in japanese" in comments.lower():
            lang = "ja"
        elif "in german" in comments.lower():
            lang = "de"
        elif "simplified chinese" in comments.lower():
            lang = "zh"
        elif "in russian" in comments.lower():
            lang = "ru"
        # more languages?

    number = None
    if metadata.find("journal-ref") and metadata.find("journal-ref").get_text():
        journal_ref = metadata.find("journal-ref").get_text().replace("\n", " ").strip()
        extra_arxiv["journal_ref"] = journal_ref
        if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
            release_type = "paper-conference"
    if metadata.find("report-no") and metadata.find("report-no").string:
        number = metadata.find("report-no").string.strip()
        # at least some people plop extra metadata in here. hrmf!
        if "ISSN " in number or "ISBN " in number or len(number.split()) > 2:
            extra_arxiv["report-no"] = number
            number = None
        else:
            # a real report number implies this is a report
            release_type = "report"
    if metadata.find("acm-class") and metadata.find("acm-class").string:
        extra_arxiv["acm_class"] = metadata.find("acm-class").string.strip()
    if metadata.categories and metadata.categories.get_text():
        extra_arxiv["categories"] = metadata.categories.get_text().split()
    license_slug = None
    if metadata.license and metadata.license.get_text():
        license_slug = lookup_license_slug(metadata.license.get_text())
    abstracts = None
    if metadata.abstract:
        # TODO: test for this multi-abstract code path
        abstracts = []
        abst = metadata.abstract.get_text().strip()
        orig = None
        # "-----" separates a translated abstract from the original-language one
        if "-----" in abst:
            both = abst.split("-----")
            abst = both[0].strip()
            orig = both[1].strip()
        if "$" in abst or "{" in abst:
            # looks like LaTeX markup; store the plain-text rendering
            mime = "application/x-latex"
            abst_plain = latex_to_text(abst)
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(
                    content=abst_plain, mimetype="text/plain", lang="en"))
        else:
            mime = "text/plain"
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(content=abst,
                                                      mimetype=mime,
                                                      lang="en"))
        if orig:
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(content=orig, mimetype=mime))
            # indicates that fulltext probably isn't english either
            if lang == "en":
                lang = None

    # extra:
    #   withdrawn_date
    #   translation_of
    #   subtitle
    #   aliases
    #   container_name
    #   group-title
    #   arxiv: comments, categories, etc
    extra_arxiv["base_id"] = base_id
    extra["superceded"] = True
    extra["arxiv"] = extra_arxiv

    versions = []
    for version in metadata.find_all("version"):
        arxiv_id = base_id + version["version"]
        release_date = version.date.string.strip()
        release_date = datetime.datetime.strptime(
            release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
        # TODO: source_type?
        versions.append(
            ReleaseEntity(
                work_id=None,
                title=title,
                # original_title
                version=version["version"],
                release_type=release_type,
                release_stage="submitted",
                release_date=release_date.isoformat(),
                release_year=release_date.year,
                ext_ids=fatcat_openapi_client.ReleaseExtIds(arxiv=arxiv_id, ),
                number=number,
                language=lang,
                license_slug=license_slug,
                abstracts=abstracts,
                contribs=contribs,
                extra=extra.copy(),
            ))
    # TODO: assert that versions are actually in order?
    assert versions

    # most recent version is not "superceded"
    versions[-1].extra.pop("superceded")

    # only apply DOI to most recent version (HACK)
    if doi:
        versions[-1].ext_ids.doi = doi
        if len(versions) > 1:
            versions[-1].release_stage = "accepted"
    return versions
def enrich_release_entity(entity: ReleaseEntity) -> ReleaseEntity:
    """
    Mutates a release entity in place, attaching display-oriented private
    attributes (_es, _authors, _can_citeproc, etc) and applying various
    web-UI cleanup hacks. Redirect/deleted entities are returned untouched.
    """
    if entity.state in ("redirect", "deleted"):
        return entity
    if entity.state == "active":
        entity._es = release_to_elasticsearch(entity, force_bool=False)
    if entity.container and entity.container.state == "active":
        entity.container._es = container_to_elasticsearch(entity.container,
                                                          force_bool=False)
    if entity.files:
        # remove shadows-only files with no URLs
        entity.files = [
            f for f in entity.files
            if not (f.extra and f.extra.get("shadows") and not f.urls)
        ]
    if entity.filesets:
        for fs in entity.filesets:
            fs._total_size = sum([f.size for f in fs.manifest])
    if entity.webcaptures:
        for wc in entity.webcaptures:
            wc._wayback_suffix = wayback_suffix(wc)
    for ref in entity.refs:
        # this is a UI hack to get rid of XML crud in unstructured refs like:
        # LOCKSS (2014) Available: <ext-link
        # xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri"
        # xlink:href="http://lockss.org/"
        # xlink:type="simple">http://lockss.org/</ext-link>. Accessed: 2014
        # November 1.
        if ref.extra and ref.extra.get("unstructured"):
            ref.extra["unstructured"] = strip_extlink_xml(
                ref.extra["unstructured"])
    # for backwards compatibility, copy extra['subtitle'] to subtitle
    if not entity.subtitle and entity.extra and entity.extra.get("subtitle"):
        if isinstance(entity.extra["subtitle"], str):
            entity.subtitle = entity.extra["subtitle"]
        elif isinstance(entity.extra["subtitle"], list):
            entity.subtitle = entity.extra["subtitle"][0] or None
    # author list to display; ensure it's sorted by index (any authors with
    # index=None go to end of list)
    authors = [
        c for c in entity.contribs if c.role in ("author", None) and (
            c.surname or c.raw_name or (c.creator and c.creator.surname))
    ]
    entity._authors = sorted(authors,
                             key=lambda c: (c.index is None and 99999999) or c.index)
    # need authors, title for citeproc to work
    entity._can_citeproc = bool(entity._authors) and bool(entity.title)
    if entity.abstracts and entity.abstracts[0].mimetype:
        # hack to show plain text instead of latex abstracts
        if "latex" in entity.abstracts[0].mimetype:
            entity.abstracts.reverse()
        # hack to (partially) clean up common JATS abstract display case
        if entity.abstracts[0].mimetype == "application/xml+jats":
            for tag in ("p", "jats", "jats:p"):
                entity.abstracts[0].content = entity.abstracts[
                    0].content.replace("<{}>".format(tag), "")
                entity.abstracts[0].content = entity.abstracts[
                    0].content.replace("</{}>".format(tag), "")
                # ugh, double encoding happens
                # NOTE(review): these two lines repeat the replacements
                # above verbatim; upstream they presumably target the
                # HTML-escaped forms ("&lt;{}&gt;") and the escaping looks
                # to have been lost here — confirm against the original
                entity.abstracts[0].content = entity.abstracts[
                    0].content.replace("</{}>".format(tag), "")
                entity.abstracts[0].content = entity.abstracts[
                    0].content.replace("<{}>".format(tag), "")
    return entity
def try_update(self, re: ReleaseEntity) -> bool:
    """
    Decide whether an incoming pubmed release should be inserted, used to
    update an existing release, or skipped.

    Returns True when the caller should insert `re` as a new release;
    False when a matching release already exists (possibly after updating
    it in place).

    Fix: the update call was wrapped in `try/except/finally: return False`;
    a `return` in `finally` (flake8 B012) silently swallowed the re-raised
    ApiException for unexpected errors. The `return False` now follows the
    try/except, so unexpected API errors propagate while the
    skip-update-conflict path still returns False.
    """
    # first, lookup existing by PMID (which must be defined)
    existing = None
    try:
        existing = self.api.lookup_release(pmid=re.ext_ids.pmid)
    except fatcat_openapi_client.rest.ApiException as err:
        if err.status != 404:
            raise err

    # then try DOI lookup if there is one
    if not existing and re.ext_ids.doi:
        try:
            existing = self.api.lookup_release(doi=re.ext_ids.doi)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise err
        if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid:
            warn_str = "PMID/DOI mismatch: release {}, pmid {} != {}".format(
                existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid
            )
            warnings.warn(warn_str)
            self.counts["warn-pmid-doi-mismatch"] += 1
            # don't clobber DOI, but do group together
            re.ext_ids.doi = None
            re.work_id = existing.work_id

    if existing and not self.do_updates:
        self.counts["exists"] += 1
        return False

    if (
        existing
        and existing.ext_ids.pmid
        and (existing.ext_ids.pmcid or not re.ext_ids.pmcid)
        and (existing.refs or not re.refs)
    ):
        # TODO: any other reasons to do an update?
        # don't update if it already has PMID
        self.counts["exists"] += 1
        return False
    elif existing:
        # but do update if only DOI was set
        existing.ext_ids.doi = existing.ext_ids.doi or re.ext_ids.doi
        existing.ext_ids.pmid = existing.ext_ids.pmid or re.ext_ids.pmid
        existing.ext_ids.pmcid = existing.ext_ids.pmcid or re.ext_ids.pmcid

        existing.container_id = existing.container_id or re.container_id
        existing.refs = existing.refs or re.refs
        existing.abstracts = existing.abstracts or re.abstracts
        existing.extra["pubmed"] = re.extra["pubmed"]

        # fix stub titles
        if existing.title in [
            "OUP accepted manuscript",
        ]:
            existing.title = re.title

        existing.original_title = existing.original_title or re.original_title
        existing.release_type = existing.release_type or re.release_type
        existing.release_stage = existing.release_stage or re.release_stage
        existing.release_date = existing.release_date or re.release_date
        existing.release_year = existing.release_year or re.release_year
        existing.withdrawn_status = existing.withdrawn_status or re.withdrawn_status
        existing.volume = existing.volume or re.volume
        existing.issue = existing.issue or re.issue
        existing.pages = existing.pages or re.pages
        existing.language = existing.language or re.language

        # update subtitle in-place first
        if not existing.subtitle and existing.extra.get("subtitle"):
            subtitle = existing.extra.pop("subtitle")
            if type(subtitle) == list:
                subtitle = subtitle[0]
            if subtitle:
                existing.subtitle = subtitle
        if not existing.subtitle:
            existing.subtitle = re.subtitle

        try:
            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
            self.counts["update"] += 1
        except fatcat_openapi_client.rest.ApiException as err:
            # there is a code path where we try to update the same release
            # twice in a row; if that happens, just skip
            # NOTE: API behavior might change in the future?
            if "release_edit_editgroup_id_ident_id_key" in err.body:
                self.counts["skip-update-conflict"] += 1
            else:
                raise err
        # an update is never a new insert
        return False

    return True
def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]:
    """
    obj is a python dict (parsed from json). returns a ReleaseEntity, or
    None when the record is out of scope / unusable.

    Fix: the top-level extra-key copy loop read `for key in "group-title":`,
    which iterates over the *characters* of the string ("g", "r", "o", ...),
    so the group-title value was never copied into extra. It now iterates
    over a one-element tuple.
    """
    # Ways to be out of scope (provisionally)
    # journal-issue and journal-volume map to None, but allowed for now
    if obj.get("type") in (
        None,
        "journal",
        "proceedings",
        "standard-series",
        "report-series",
        "book-series",
        "book-set",
        "book-track",
        "proceedings-series",
    ):
        self.counts["skip-release-type"] += 1
        return None

    # Do require the 'title' keys to exist, as release entities do
    if ("title" not in obj) or (not obj["title"]):
        self.counts["skip-blank-title"] += 1
        return None

    release_type = self.map_release_type(obj["type"])

    # contribs
    def do_contribs(obj_list: List[Dict[str, Any]], ctype: str) -> List[ReleaseContrib]:
        # Transform one Crossref contributor list ("author"/"editor"/
        # "translator") into ReleaseContrib objects
        contribs = []
        for i, am in enumerate(obj_list):
            creator_id = None
            if "ORCID" in am.keys():
                creator_id = self.lookup_orcid(am["ORCID"].split("/")[-1])
            # Sorry humans :(
            if am.get("given") and am.get("family"):
                raw_name: Optional[str] = "{} {}".format(am["given"], am["family"])
            elif am.get("family"):
                raw_name = am["family"]
            else:
                # TODO: can end up empty
                raw_name = am.get("name") or am.get("given")
            extra: Dict[str, Any] = dict()
            # only authors get an ordering index
            if ctype == "author":
                index: Optional[int] = i
            else:
                index = None
            raw_affiliation = None
            affiliation_list = am.get("affiliation") or []
            # TODO: currently requiring a "name" in all affiliations. Could
            # add ROR support (via identifier) in the near future
            affiliation_list = [a for a in affiliation_list if "name" in a]
            if affiliation_list and len(affiliation_list) > 0:
                raw_affiliation = affiliation_list[0]["name"]
                if len(affiliation_list) > 1:
                    # note: affiliation => more_affiliations
                    extra["more_affiliations"] = [
                        clean_str(a["name"]) for a in affiliation_list[1:]
                    ]
            if am.get("sequence") and am.get("sequence") != "additional":
                extra["seq"] = clean_str(am.get("sequence"))
            assert ctype in ("author", "editor", "translator")
            raw_name = clean_str(raw_name)
            # TODO: what if 'raw_name' is None?
            contribs.append(
                ReleaseContrib(
                    creator_id=creator_id,
                    index=index,
                    raw_name=raw_name,
                    given_name=clean_str(am.get("given")),
                    surname=clean_str(am.get("family")),
                    raw_affiliation=clean_str(raw_affiliation),
                    role=ctype,
                    extra=extra or None,
                ))
        return contribs

    contribs = do_contribs(obj.get("author", []), "author")
    contribs.extend(do_contribs(obj.get("editor", []), "editor"))
    contribs.extend(do_contribs(obj.get("translator", []), "translator"))

    # container
    issn = obj.get("ISSN", [None])[0]
    issnl = self.issn2issnl(issn)
    container_id = None
    if issnl:
        container_id = self.lookup_issnl(issnl)
    publisher = clean_str(obj.get("publisher"))
    container_name = obj.get("container-title")
    if container_name:
        container_name = clean_str(container_name[0], force_xml=True)
    if not container_name:
        container_name = None
    if (container_id is None and self.create_containers and (issnl is not None)
            and container_name):
        ce = fatcat_openapi_client.ContainerEntity(
            issnl=issnl,
            publisher=publisher,
            container_type=self.map_container_type(release_type),
            name=container_name,
        )
        ce_edit = self.create_container(ce)
        container_id = ce_edit.ident
        self._issnl_id_map[issnl] = container_id

    # license slug
    license_slug = None
    license_extra = []
    for lic in obj.get("license", []):
        # only consider version-of-record (or unspecified) licenses
        if lic["content-version"] not in ("vor", "unspecified"):
            continue
        slug = lookup_license_slug(lic["URL"])
        if slug:
            license_slug = slug
        if "start" in lic:
            lic["start"] = lic["start"]["date-time"]
        license_extra.append(lic)

    # references
    refs = []
    for i, rm in enumerate(obj.get("reference", [])):
        try:
            year: Optional[int] = int(rm.get("year"))
            # TODO: will need to update/config in the future!
            # NOTE: are there crossref works with year < 100?
            if year is not None:
                if year > 2025 or year < 100:
                    year = None
        except (TypeError, ValueError):
            year = None
        ref_extra: Dict[str, Any] = dict()
        key = rm.get("key")
        # strip redundant DOI prefix from reference keys
        if key and key.startswith(obj["DOI"].upper()):
            key = key.replace(obj["DOI"].upper() + "-", "")
            key = key.replace(obj["DOI"].upper(), "")
        ref_container_name = rm.get("volume-title")
        if not ref_container_name:
            ref_container_name = rm.get("journal-title")
        elif rm.get("journal-title"):
            ref_extra["journal-title"] = rm["journal-title"]
        if rm.get("DOI"):
            ref_extra["doi"] = rm.get("DOI").lower()
        author = clean_str(rm.get("author"))
        if author:
            ref_extra["authors"] = [author]
        for k in (
            "editor",
            "edition",
            "authority",
            "version",
            "genre",
            "url",
            "event",
            "issue",
            "volume",
            "date",
            "accessed_date",
            "issued",
            "page",
            "medium",
            "collection_title",
            "chapter_number",
            "unstructured",
            "series-title",
            "volume-title",
        ):
            if clean_str(rm.get(k)):
                ref_extra[k] = clean_str(rm[k])
        refs.append(
            fatcat_openapi_client.ReleaseRef(
                index=i,
                # doing lookups would be a second import pass
                target_release_id=None,
                key=key,
                year=year,
                container_name=clean_str(ref_container_name),
                title=clean_str(rm.get("article-title")),
                locator=clean_str(rm.get("first-page")),
                # TODO: just dump JSON somewhere here?
                extra=ref_extra or None,
            ))

    # abstracts
    abstracts = []
    abstract = clean_str(obj.get("abstract"))
    if abstract and len(abstract) > 10:
        abstracts.append(
            fatcat_openapi_client.ReleaseAbstract(
                mimetype="application/xml+jats", content=abstract))

    # extra fields
    extra: Dict[str, Any] = dict()
    extra_crossref: Dict[str, Any] = dict()
    # top-level extra keys
    if not container_id:
        if obj.get("container-title"):
            extra["container_name"] = container_name
    # BUGFIX: was `for key in "group-title":` (iterating characters)
    for key in ("group-title",):
        val = obj.get(key)
        if val:
            if type(val) == list:
                val = val[0]
            if type(val) == str:
                val = clean_str(val)
                if val:
                    extra[key] = clean_str(val)
            else:
                extra[key] = val
    # crossref-nested extra keys
    for key in ("subject", "type", "alternative-id", "archive", "funder"):
        val = obj.get(key)
        if val:
            if type(val) == str:
                extra_crossref[key] = clean_str(val)
            else:
                extra_crossref[key] = val
    if license_extra:
        extra_crossref["license"] = license_extra

    # any titles beyond the first become aliases
    if len(obj["title"]) > 1:
        aliases = [clean_str(t) for t in obj["title"][1:]]
        aliases = [t for t in aliases if t]
        if aliases:
            extra["aliases"] = aliases

    # ISBN
    isbn13 = None
    for raw in obj.get("ISBN", []):
        # TODO: convert if not ISBN-13 format
        if len(raw) == 17:
            isbn13 = raw
            break

    # release status
    if obj["type"] in (
        "journal-article",
        "conference-proceeding",
        "book",
        "dissertation",
        "book-chapter",
    ):
        release_stage: Optional[str] = "published"
    else:
        # unknown
        release_stage = None

    # filter out unreasonably huge releases
    if len(abstracts) > 100:
        self.counts["skip-huge-abstracts"] += 1
        return None
    if len(contribs) > 2000:
        self.counts["skip-huge-contribs"] += 1
        return None
    if len(refs) > 5000:
        self.counts["skip-huge-refs"] += 1
        return None

    # release date parsing is amazingly complex
    raw_date = obj["issued"]["date-parts"][0]
    if not raw_date or not raw_date[0]:
        # got some NoneType, even though at least year is supposed to be set
        release_year = None
        release_date = None
    elif len(raw_date) == 3:
        release_year = raw_date[0]
        release_date = datetime.date(year=raw_date[0],
                                     month=raw_date[1],
                                     day=raw_date[2])
    else:
        # sometimes only the year is included, not the full date
        release_year = raw_date[0]
        release_date = None

    original_title: Optional[str] = None
    if obj.get("original-title"):
        ot = obj.get("original-title")
        if ot is not None:
            original_title = clean_str(ot[0], force_xml=True)

    title: Optional[str] = None
    if obj.get("title"):
        title = clean_str(obj["title"][0], force_xml=True)
        if not title or len(title) <= 1:
            # title can't be just a single character
            self.counts["skip-blank-title"] += 1
            return None

    doi = clean_doi(obj["DOI"].lower())
    if not doi:
        self.counts["skip-bad-doi"] += 1
        return None

    subtitle = None
    if obj.get("subtitle"):
        subtitle = clean_str(obj["subtitle"][0], force_xml=True)
        if not subtitle or len(subtitle) <= 1:
            # subtitle can't be just a single character
            subtitle = None

    if extra_crossref:
        extra["crossref"] = extra_crossref

    re = ReleaseEntity(
        work_id=None,
        container_id=container_id,
        title=title,
        subtitle=subtitle,
        original_title=original_title,
        release_type=release_type,
        release_stage=release_stage,
        release_date=release_date,
        release_year=release_year,
        publisher=publisher,
        ext_ids=fatcat_openapi_client.ReleaseExtIds(
            doi=doi,
            isbn13=isbn13,
        ),
        volume=clean_str(obj.get("volume")),
        issue=clean_str(obj.get("issue")),
        pages=clean_str(obj.get("page")),
        language=clean_str(obj.get("language")),
        license_slug=license_slug,
        extra=extra or None,
        abstracts=abstracts or None,
        contribs=contribs or None,
        refs=refs or None,
    )
    return re
def parse_record(self, record: Any) -> Optional[ReleaseEntity]:
    """
    Transform a single JALC XML record (a BeautifulSoup tag) into a partial
    ReleaseEntity, or return None if the record is unusable (missing title,
    missing/bogus DOI, or title that cleans to empty).

    In JALC metadata, both English and Japanese values are given for most
    fields; where two values are present, this tries to put the non-CJK
    (presumed English) value first. May create a container entity as a side
    effect when `self.create_containers` is enabled.
    """

    extra: Dict[str, Any] = dict()
    # always emitted (even empty) under extra["jalc"] to mark the registrar
    extra_jalc: Dict[str, Any] = dict()

    # titles: first is primary; second (if present) is the original-language
    # title. Trailing periods are stripped from both.
    titles = record.find_all("title")
    if not titles:
        return None
    title = titles[0].get_text().replace("\n", " ").strip()
    original_title = None
    if title.endswith("."):
        title = title[:-1]
    if len(titles) > 1:
        original_title = titles[1].get_text().replace("\n", " ").strip()
        if original_title.endswith("."):
            original_title = original_title[:-1]

    doi = None
    if record.doi:
        doi = clean_doi(record.doi.string.strip().lower())
        # TODO: following code is redundant with clean_doi()
        if not doi:
            return None
        # strip any resolver URL prefix (at most one can match)
        for resolver_prefix in (
                "http://dx.doi.org/",
                "https://dx.doi.org/",
                "http://doi.org/",
                "https://doi.org/",
        ):
            if doi.startswith(resolver_prefix):
                doi = doi.replace(resolver_prefix, "")
                break
        if not (doi.startswith("10.") and "/" in doi):
            sys.stderr.write("bogus JALC DOI: {}\n".format(doi))
            doi = None
    if not doi:
        return None

    people = record.find_all("Person")
    contribs = parse_jalc_persons(people)

    # assign author ordering, but never give an "et al." placeholder a slot
    for i, contrib in enumerate(contribs):
        if contrib.raw_name != "et al.":
            contrib.index = i

    release_year = None
    release_date = None
    date_tag = record.date or None
    if date_tag:
        # .string can be None for an empty/compound tag; guard before len()
        date_str = date_tag.string
        if date_str and len(date_str) == 10:
            # full "YYYY-MM-DD" date. BUGFIX: previous code indexed the
            # already-extracted string (date["completed-date"]), which raises
            # TypeError (str indices must be integers); parse the string
            # itself against DATE_FMT instead.
            release_date_date = datetime.datetime.strptime(date_str,
                                                           DATE_FMT).date()
            release_year = release_date_date.year
            release_date = release_date_date.isoformat()
        elif date_str and len(date_str) == 4 and date_str.isdigit():
            # year-only precision
            release_year = int(date_str)

    pages = None
    if (record.startingPage and record.startingPage.string
            and record.startingPage.string.strip()):
        pages = record.startingPage.string.strip()
        if (record.endingPage and record.endingPage.string
                and record.endingPage.string.strip()):
            pages = "{}-{}".format(pages, record.endingPage.string.strip())
    # double check to prevent "-" as pages
    if pages and pages.strip() == "-":
        pages = None

    volume = None
    if record.volume:
        volume = record.volume.string
    issue = None
    if record.number:
        # note: JALC <number> maps to fatcat "issue"
        issue = record.number.string

    # container (journal) lookup via ISSN -> ISSN-L
    issn = None
    issn_list = record.find_all("issn")
    if issn_list:
        # if we wanted the other ISSNs, would also need to uniq the list.
        # But we only need one to lookup ISSN-L/container
        issn = issn_list[0].string
    issnl = self.issn2issnl(issn) if issn else None
    container_id = None
    if issnl:
        container_id = self.lookup_issnl(issnl)

    publisher = None
    container_name = None
    container_extra: Dict[str, Any] = dict()

    if record.publicationName:
        pubs = [
            p.get_text().replace("\n", " ").strip()
            for p in record.find_all("publicationName") if p.get_text()
        ]
        pubs = [clean_str(p) for p in pubs if p]
        # record.publicationName was truthy, so at least one name survives
        assert pubs
        if len(pubs) > 1 and pubs[0] == pubs[1]:
            pubs = [pubs[0]]
        if len(pubs) > 1 and is_cjk(pubs[0]):
            # eng/jpn ordering is not reliable
            pubs = [pubs[1], pubs[0]]
        container_name = clean_str(pubs[0])
        if len(pubs) > 1:
            container_extra["original_name"] = clean_str(pubs[1])

    if record.publisher:
        pubs = [
            p.get_text().replace("\n", " ").strip()
            for p in record.find_all("publisher") if p.get_text()
        ]
        pubs = [p for p in pubs if p]
        if len(pubs) > 1 and pubs[0] == pubs[1]:
            pubs = [pubs[0]]
        if len(pubs) > 1 and is_cjk(pubs[0]):
            # ordering is not reliable
            pubs = [pubs[1], pubs[0]]
        if pubs:
            publisher = clean_str(pubs[0])
            if len(pubs) > 1:
                container_extra["publisher_aliases"] = pubs[1:]

    if (container_id is None and self.create_containers and (issnl is not None)
            and container_name):
        # name, type, publisher, issnl
        # extra: issnp, issne, original_name, languages, country
        container_extra["country"] = "jp"
        container_extra["languages"] = ["ja"]
        ce = fatcat_openapi_client.ContainerEntity(
            name=container_name,
            container_type="journal",
            publisher=publisher,
            issnl=issnl,
            extra=(container_extra or None),
        )
        ce_edit = self.create_container(ce)
        container_id = ce_edit.ident
        # short-cut future imports in same batch
        self._issnl_id_map[issnl] = container_id

    # the vast majority of works are in japanese
    # TODO: any indication when *not* in japanese?
    lang = "ja"  # reasonable default for this collection
    release_type = "article-journal"

    # extra:
    #   translation_of
    #   aliases
    #   container_name
    #   group-title
    # always put at least an empty dict here to indicate the DOI registrar
    # (informally)
    extra["jalc"] = extra_jalc

    title = clean_str(title)
    if not title:
        return None

    # note: renamed from `re` to avoid shadowing the `re` module name
    release = ReleaseEntity(
        work_id=None,
        title=title,
        original_title=clean_str(original_title),
        release_type=release_type,
        release_stage="published",
        release_date=release_date,
        release_year=release_year,
        ext_ids=fatcat_openapi_client.ReleaseExtIds(doi=doi),
        volume=volume,
        issue=issue,
        pages=pages,
        publisher=publisher,
        language=lang,
        # license_slug
        container_id=container_id,
        contribs=contribs,
        extra=extra,
    )
    return release