def update_entity(self, fe: FileEntity) -> None:
    """
    Mutates in place, updating fields with values from this form.

    Form must be validated *before* calling this function.
    """
    for simple_attr in FILE_SIMPLE_ATTRS:
        a = getattr(self, simple_attr).data
        # be flexible about hash capitalization
        if simple_attr in ("md5", "sha1", "sha256"):
            a = a.lower()
        # special case blank strings
        if a == "":
            a = None
        setattr(fe, simple_attr, a)
    fe.urls = []
    for u in self.urls:
        fe.urls.append(
            FileUrl(
                rel=u.rel.data or None,
                url=u.url.data or None,
            )
        )
    fe.release_ids = []
    for ri in self.release_ids:
        fe.release_ids.append(ri.data)
    if self.edit_description.data:
        fe.edit_extra = dict(description=self.edit_description.data)
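# Illustration (not part of the form code): a minimal sketch of the per-field
# normalization that update_entity() applies, using a hypothetical dict of raw
# form values in place of real WTForms field objects.
raw_form_values = {
    "md5": "",
    "sha1": "0000045687DAD717ED6512E395B04EC9C00995B7",
    "size": 372121,
}
normalized = {}
for attr, value in raw_form_values.items():
    # be flexible about hash capitalization
    if attr in ("md5", "sha1", "sha256"):
        value = value.lower()
    # blank strings become explicit None on the entity
    if value == "":
        value = None
    normalized[attr] = value
assert normalized == {
    "md5": None,
    "sha1": "0000045687dad717ed6512e395b04ec9c00995b7",
    "size": 372121,
}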
def clean_entity(self, entity: FileEntity) -> FileEntity:
    """
    TODO: clean up bogus mimetype values (eg, "(???)")
    """
    # URL has ://web.archive.org/web/None/ link => delete URL
    entity.urls = [u for u in entity.urls if "://web.archive.org/web/None/" not in u.url]

    # URL has ://archive.org/ link with rel=repository => rel=archive
    for u in entity.urls:
        if "://archive.org/" in u.url and u.rel == "repository":
            u.rel = "archive"

    # URL has short wayback date ("2017") and another URL with that as prefix => delete URL
    stub_wayback_urls = []
    full_wayback_urls = []
    for u in entity.urls:
        if "://web.archive.org/web/" in u.url:
            if len(u.url.split("/")[4]) <= 8:
                stub_wayback_urls.append(u.url)
            else:
                full_wayback_urls.append("/".join(u.url.split("/")[5:]))
    for stub in stub_wayback_urls:
        target = "/".join(stub.split("/")[5:])
        if target in full_wayback_urls:
            entity.urls = [u for u in entity.urls if u.url != stub]

    return entity
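# Illustration (assumes `cleaner` is an instance of the class defining
# clean_entity() above): a stub wayback URL (bare "2017" timestamp) is dropped
# when a fully-timestamped capture of the same target URL is also present.
entity = FileEntity(
    sha1="0000045687dad717ed6512e395b04ec9c00995b7",
    urls=[
        FileUrl(rel="webarchive", url="https://web.archive.org/web/2017/https://example.com/paper.pdf"),
        FileUrl(rel="webarchive", url="https://web.archive.org/web/20170615010101/https://example.com/paper.pdf"),
    ],
)
entity = cleaner.clean_entity(entity)
assert [u.url for u in entity.urls] == [
    "https://web.archive.org/web/20170615010101/https://example.com/paper.pdf"
]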
def generic_file_cleanups(existing: FileEntity) -> FileEntity:
    """
    Conservative cleanup of existing file entities.

    Intended to be used in most bulk cleanups and other file entity updates,
    to reduce edit volume for catalog size/churn efficiency.

    Note: the former check for 'None' as a wayback datetime has been removed,
    as those URLs have been fully cleaned up.
    """

    # update old/deprecated 'rel' values on URLs
    for u in existing.urls:
        if u.rel == "repository" and "://archive.org/download/" in u.url:
            u.rel = "archive"
        if u.rel == "social":
            u.rel = "academicsocial"

    # remove exact URL duplicates, preserving order and removing "later"
    # copies, not "first" copies. this is sensitive to the url.url and url.rel
    # combination!
    dedupe_urls = []
    for url_pair in existing.urls:
        if url_pair not in dedupe_urls:
            dedupe_urls.append(url_pair)
    existing.urls = dedupe_urls

    # remove URLs which are near-duplicates
    redundant_urls = []
    all_urls = [u.url for u in existing.urls]
    all_wayback_urls = [u.url for u in existing.urls if "://web.archive.org/web/" in u.url]
    for url in all_urls:
        # https/http redundancy
        if url.startswith("http://") and url.replace("http://", "https://", 1) in all_urls:
            redundant_urls.append(url)
            continue
        # default HTTP port included and not included
        if ":80/" in url and url.replace(":80", "", 1) in all_urls:
            redundant_urls.append(url)
            continue
        # partial and complete wayback timestamps
        if "://web.archive.org/web/2017/" in url:
            original_url = "/".join(url.split("/")[5:])
            assert len(original_url) > 5
            for wb_url in all_wayback_urls:
                alt_timestamp = wb_url.split("/")[4]
                if len(alt_timestamp) >= 10 and original_url in wb_url:
                    redundant_urls.append(url)
                    break

    existing.urls = [u for u in existing.urls if u.url not in redundant_urls]
    return existing
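# Illustration of the near-duplicate handling above (a minimal sketch, assuming
# FileEntity/FileUrl are imported from fatcat_openapi_client as elsewhere in
# this module): the plain-HTTP copy of an HTTPS URL is dropped, and the
# deprecated "repository" rel on an archive.org/download/ link becomes
# "archive".
fe = generic_file_cleanups(
    FileEntity(
        urls=[
            FileUrl(rel="web", url="https://example.com/paper.pdf"),
            FileUrl(rel="web", url="http://example.com/paper.pdf"),
            FileUrl(rel="repository", url="https://archive.org/download/item/paper.pdf"),
        ],
    )
)
assert [(u.rel, u.url) for u in fe.urls] == [
    ("web", "https://example.com/paper.pdf"),
    ("archive", "https://archive.org/download/item/paper.pdf"),
]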
def parse_record(self, row: Dict[str, Any]) -> Optional[FileEntity]:

    # bezerk mode doesn't make sense for this importer
    assert self.bezerk_mode is False

    file_ident = uuid2fcid(row["file_ident"])
    wrong_release_ident = uuid2fcid(row["wrong_release_ident"])
    edit_extra = row["edit_extra"]
    assert edit_extra["link_source"] in ["unpaywall", "doi"]
    file_edit_doi = clean_doi(edit_extra["link_source_id"])

    if not file_edit_doi:
        self.counts["skip-bad-doi"] += 1
        return None

    # check that the "wrong" release exists and doesn't have the DOI
    wrong_release = None
    try:
        wrong_release = self.api.get_release(wrong_release_ident)
    except fatcat_openapi_client.rest.ApiException as err:
        if err.status != 404:
            raise err
    if not wrong_release:
        self.counts["skip-wrong-release-missing"] += 1
        return None
    if clean_doi(wrong_release.ext_ids.doi) == file_edit_doi:
        self.counts["skip-wrong-release-is-ok"] += 1
        return None

    # fetch the "correct" release, if any
    fixed_release_ids = []
    correct_release = None
    try:
        correct_release = self.api.lookup_release(doi=file_edit_doi)
    except fatcat_openapi_client.rest.ApiException as err:
        if err.status != 404:
            raise err
    if correct_release:
        fixed_release_ids.append(correct_release.ident)

    fe = FileEntity(
        ident=file_ident,
        release_ids=fixed_release_ids,
        edit_extra=edit_extra,
    )
    fe._wrong_release_ident = wrong_release_ident
    return fe
def parse_record(self, row: Dict[str, Any]) -> Optional[FileEntity]:

    request = row["request"]
    file_meta = row["file_meta"]

    # double check that want() filtered request correctly (eg, old requests)
    if request.get("ingest_type") not in ("pdf", "xml"):
        self.counts["skip-ingest-type"] += 1
        return None
    assert (request["ingest_type"], file_meta["mimetype"]) in [
        ("pdf", "application/pdf"),
        ("xml", "application/xml"),
        ("xml", "application/jats+xml"),
        ("xml", "application/tei+xml"),
        ("xml", "text/xml"),
    ]

    # identify release by fatcat ident, or extid lookup, or biblio-glutton match
    release_ident = self.parse_ingest_release_ident(row)
    if not release_ident:
        self.counts["skip-release-not-found"] += 1
        return None

    terminal = self.parse_terminal(row)
    if not terminal:
        # TODO: support archive.org hits?
        self.counts["skip-no-terminal"] += 1
        return None

    urls = self.parse_urls(row, terminal)

    fe = FileEntity(
        md5=file_meta["md5hex"],
        sha1=file_meta["sha1hex"],
        sha256=file_meta["sha256hex"],
        size=file_meta["size_bytes"],
        mimetype=file_meta["mimetype"],
        release_ids=[release_ident],
        urls=urls,
    )

    edit_extra = self.parse_edit_extra(row)
    if edit_extra:
        fe.edit_extra = edit_extra
    return fe
def test_file_meta_importer_basic(file_meta_importer):

    # insert two file entities
    api = file_meta_importer.api
    eg = quick_eg(file_meta_importer.api)
    # with full metadata
    f1edit = api.create_file(
        eg.editgroup_id,
        FileEntity(
            size=372121,
            md5="e1fd97475c8aa102568f5d70a1bd0c07",
            sha1="0000045687dad717ed6512e395b04ec9c00995b7",
            sha256="51bdc9e40cc175089fcb60b0b188e6cbcdcddb1ff8acbe6b039b8f8fff0afff0",
            mimetype="application/pdf",
        ),
    )
    # partial/stub metadata
    f2edit = api.create_file(
        eg.editgroup_id,
        FileEntity(
            sha1="00000376ad49f56145721503f1eb5e6e49e779fd",
            mimetype="application/pdf",
        ),
    )
    api.accept_editgroup(eg.editgroup_id)

    last_index = file_meta_importer.api.get_changelog(limit=1)[0].index

    with open('tests/files/example_file_meta.json', 'r') as f:
        counts = JsonLinePusher(file_meta_importer, f).run()

    assert counts['insert'] == 0
    assert counts['exists'] == 0
    assert counts['update'] == 1
    assert counts['skip-no-match'] == 4
    assert counts['skip-missing-field'] == 1
    assert counts['skip-existing-complete'] == 1

    # cleanup file entities
    eg = quick_eg(file_meta_importer.api)
    api.delete_file(eg.editgroup_id, f1edit.ident)
    api.delete_file(eg.editgroup_id, f2edit.ident)
    api.accept_editgroup(eg.editgroup_id)
def generic_deleted_entity(entity_type: str, ident: str) -> Any:
    if entity_type == "container":
        entity: Any = ContainerEntity()
    elif entity_type == "creator":
        entity = CreatorEntity()
    elif entity_type == "file":
        entity = FileEntity()
    elif entity_type == "fileset":
        entity = FilesetEntity()
    elif entity_type == "webcapture":
        entity = WebcaptureEntity()
    elif entity_type == "release":
        entity = ReleaseEntity(ext_ids=ReleaseExtIds())
    elif entity_type == "work":
        entity = WorkEntity()
    else:
        raise NotImplementedError
    entity.ident = ident
    entity.state = "deleted"
    return entity
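# Usage sketch for the helper above: build a deleted-state placeholder for a
# file entity (the ident here is an arbitrary example value).
tombstone = generic_deleted_entity("file", "aaaasb5apzfhbbxxc7rgu2yw6m")
assert isinstance(tombstone, FileEntity)
assert tombstone.ident == "aaaasb5apzfhbbxxc7rgu2yw6m"
assert tombstone.state == "deleted"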
def parse_record(self, obj: Dict[str, Any]) -> Optional[FileEntity]:
    dois = [d.lower() for d in obj.get("dois", [])]

    # lookup dois
    re_list = set()
    for doi in dois:
        doi = clean_doi(doi)
        if not doi:
            self.counts["skip-bad-doi"] += 1
            return None
        try:
            re = self.api.lookup_release(doi=doi)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise err
            re = None
        if re is None:
            # print("DOI not found: {}".format(doi))
            pass
        else:
            re_list.add(re.ident)

    # look up other external ids
    for extid_type in (
        "arxiv",
        "pmid",
        "pmcid",
        "jstor",
        "wikidata_qid",
        "core",
        "isbn13",
        "ark",
    ):
        extid = obj.get(extid_type)
        if extid:
            try:
                re = self.api.lookup_release(**{extid_type: extid})
            except fatcat_openapi_client.rest.ApiException as err:
                if err.status != 404:
                    raise err
                re = None
            if re is None:
                pass
            else:
                re_list.add(re.ident)

    release_ids = list(re_list)
    if len(release_ids) == 0:
        self.counts["skip-no-releases"] += 1
        return None
    if len(release_ids) > SANE_MAX_RELEASES:
        self.counts["skip-too-many-releases"] += 1
        return None

    # parse URLs and CDX
    urls_set = set()
    for url in obj.get("urls", []):
        url = make_rel_url(url, default_link_rel=self.default_link_rel)
        if url is not None:
            urls_set.add(url)
    for cdx in obj.get("cdx", []):
        original = cdx["url"]
        if cdx.get("dt"):
            wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original)
            urls_set.add(("webarchive", wayback))
        url = make_rel_url(original, default_link_rel=self.default_link_rel)
        if url is not None:
            urls_set.add(url)
    urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls_set]
    if len(urls) == 0:
        self.counts["skip-no-urls"] += 1
        return None
    if len(urls) > SANE_MAX_URLS:
        self.counts["skip-too-many-urls"] += 1
        return None

    size = obj.get("size")
    if size:
        size = int(size)

    mimetype = obj.get("mimetype", self.default_mimetype)
    if not mimetype and urls:
        if urls[0].url.endswith(".pdf"):
            mimetype = "application/pdf"

    fe = FileEntity(
        md5=obj.get("md5"),
        sha1=obj["sha1"],
        sha256=obj.get("sha256"),
        size=size,
        mimetype=mimetype,
        release_ids=release_ids,
        urls=urls,
    )
    return fe
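# Standalone sketch of the CDX handling above, with made-up capture rows: each
# capture with a "dt" timestamp yields a wayback URL, and the set de-duplicates
# repeated (rel, url) pairs before FileUrl objects are constructed.
cdx_rows = [
    {"url": "http://example.com/paper.pdf", "dt": "20170615010101"},
    {"url": "http://example.com/paper.pdf", "dt": "20170615010101"},
]
urls_set = set()
for cdx in cdx_rows:
    if cdx.get("dt"):
        wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], cdx["url"])
        urls_set.add(("webarchive", wayback))
assert urls_set == {
    ("webarchive", "https://web.archive.org/web/20170615010101/http://example.com/paper.pdf"),
}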
def to_entity(self) -> FileEntity:
    assert self.sha1.data
    entity = FileEntity()
    self.update_entity(entity)
    return entity
def test_rich_elasticsearch_convert():
    r = ReleaseEntity(
        title="something",
        release_year=1234,
        license_slug="CC-BY-NC",
        ext_ids=ReleaseExtIds(),
        refs=[
            ReleaseRef(),
            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
        ],
    )
    r.state = "active"
    r.container = ContainerEntity(
        name="dummy journal",
        extra={
            "ia": {
                "sim": {
                    "year_spans": [[1000, 1100]],
                },
            },
            "kbart": {
                "lockss": {
                    "year_spans": [[1200, 1300]],
                },
                "jstor": {
                    "year_spans": [[1000, 1300], [1950, 1960], [1980, 2005]],
                },
            },
            "sherpa_romeo": {"color": "blue"},
            "doaj": {"as_of": "2010-02-03"},
        },
    )
    r.files = [
        FileEntity(
            mimetype="application/pdf",
            urls=[
                FileUrl(rel="dweb", url="dat://a954329dlk/thingie"),
                FileUrl(
                    rel="webarchive",
                    url="https://web.archive.org/web/20001122030405/http://example.com",
                ),
                FileUrl(rel="web", url="https://archive.org/details/blah/file.pdf"),
            ],
            extra={
                "shadows": {},
            },
        )
    ]
    es = release_to_elasticsearch(r)
    assert es["release_year"] == r.release_year
    assert es["file_count"] == 1
    assert es["fileset_count"] == 0
    assert es["webcapture_count"] == 0
    assert es["ref_count"] == 2
    assert es["ref_linked_count"] == 1
    assert es["preservation"] == "bright"
    assert es["is_oa"] is True
    assert es["is_longtail_oa"] is False
    assert es["is_preserved"] is True
    assert es["in_web"] is True
    assert es["in_dweb"] is True
    assert es["in_ia"] is True
    assert es["in_ia_sim"] is False
    assert es["in_kbart"] is True
    assert es["in_jstor"] is True
def enrich_file_entity(entity: FileEntity) -> FileEntity:
    if entity.state == "active":
        entity._es = file_to_elasticsearch(entity)
    return entity
def to_entity(self):
    assert self.sha1.data
    entity = FileEntity()
    self.update_entity(entity)
    return entity
def parse_record(self, row: Dict[str, Any]) -> Optional[FileEntity]:

    request = row["request"]

    # double check that want() filtered request correctly
    if request.get("ingest_type") not in [
        "dataset",
    ]:
        self.counts["skip-ingest-type"] += 1
        return None

    # identify release by fatcat ident, or extid lookup
    release_ident = self.parse_ingest_release_ident(row)
    if not release_ident:
        self.counts["skip-release-not-found"] += 1
        return None

    assert row["file_count"] == len(row["manifest"]) == 1
    file_meta = row["manifest"][0]
    # print(file_meta)
    assert file_meta["status"] == "success"

    # add file-level access URLs
    entity_urls = []
    if file_meta.get("platform_url"):
        entity_urls.append(FileUrl(rel="web", url=file_meta["platform_url"]))
    if file_meta.get("terminal_url") and file_meta.get("terminal_dt"):
        entity_urls.append(
            FileUrl(
                rel="webarchive",
                url=f"https://web.archive.org/web/{file_meta['terminal_dt']}/{file_meta['terminal_url']}",
            )
        )
    if row["ingest_strategy"] == "archiveorg-file":
        entity_urls.append(
            FileUrl(
                rel="archive",
                url=f"https://archive.org/download/{row['archiveorg_item_name']}/{file_meta['path']}",
            )
        )
    if not entity_urls:
        self.counts["skip-no-access-url"] += 1
        return None

    entity_extra: Dict[str, Any] = dict()
    entity_extra["path"] = file_meta["path"]

    # this is to work around a bug in old sandcrawler ingest code
    if file_meta["md5"] == file_meta["sha1"]:
        self.counts["skip-bad-hashes"] += 1
        return None

    fe = FileEntity(
        md5=file_meta["md5"],
        sha1=file_meta["sha1"],
        sha256=file_meta["sha256"],
        size=file_meta["size"],
        mimetype=file_meta["mimetype"],
        release_ids=[release_ident],
        urls=entity_urls,
        extra=entity_extra or None,
    )
    if not (fe.md5 and fe.sha1 and fe.sha256 and (fe.size is not None) and fe.mimetype):
        self.counts["skip-partial-file-info"] += 1
        return None

    edit_extra = self.parse_edit_extra(row)
    if edit_extra:
        fe.edit_extra = edit_extra
    return fe
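# Sketch of the terminal-URL handling above, with a made-up file_meta record:
# the webarchive access URL embeds both the capture datetime and the terminal
# URL of the ingest.
file_meta = {
    "terminal_url": "https://example.com/dataset/data.csv",
    "terminal_dt": "20220101123456",
}
access = FileUrl(
    rel="webarchive",
    url=f"https://web.archive.org/web/{file_meta['terminal_dt']}/{file_meta['terminal_url']}",
)
assert access.url == "https://web.archive.org/web/20220101123456/https://example.com/dataset/data.csv"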
def test_merge_file_metadata_from(api) -> None:
    fm = FileMerger(api=api)
    fe_partial = FileEntity(
        ident="aaaasb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3aaaaaa",
    )
    fe_norelease = FileEntity(
        ident="bbbbsb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3bbbbbb",
        md5="d2c7318315bfc7d3aab0db933e95e632",
        sha256="528064c7664a96c79c80c423210f6f9f4fafe949dd59dfd1572a04b906d5e163",
        size=60719,
        mimetype="application/pdf",
    )
    fe_nourls = FileEntity(
        ident="ccccsb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3bbbbbb",
        md5="d2c7318315bfc7d3aab0db933e95e632",
        sha256="528064c7664a96c79c80c423210f6f9f4fafe949dd59dfd1572a04b906d5e163",
        size=60719,
        mimetype="application/pdf",
        release_ids=["dlrxjg7mxrayxfltget7fqcrjy"],
    )
    fe_complete = FileEntity(
        ident="ddddsb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3bbbbbb",
        md5="ddddddd315bfc7d3aab0db933e95e632",
        sha256="528064c7664a96c79c80c423210f6f9f4fafe949dd59dfd1572a04b906d5e163",
        size=60719,
        mimetype="application/pdf",
        release_ids=["dlrxjg7mxrayxfltget7fqcrjy"],
        urls=[
            FileUrl(rel="web", url="http://aughty.org/pdf/future_open.pdf"),
        ],
        extra=dict(asdf=123),
    )
    fe_pseudo_complete = FileEntity(
        ident="eeeesb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3bbbbbb",
        sha256="528064c7664a96c79c80c423210f6f9f4fafe949dd59dfd1572a04b906d5e163",
        size=60719,
        mimetype="application/pdf",
        release_ids=["dlrxjg7mxrayxfltget7fqcrjy"],
        urls=[
            FileUrl(rel="web", url="http://aughty.org/pdf/future_open.pdf"),
        ],
        extra=dict(asdf=123),
    )
    fe_another_release_id = FileEntity(
        ident="fffffffapzfhbbxxc7rgu2yw6m",
        release_ids=["qqqqqg7mxrayxfltget7fqcrjy"],
    )
    fe_another_url = FileEntity(
        ident="zzzzzzzapzfhbbxxc7rgu2yw6m",
        urls=[
            FileUrl(rel="repository", url="http://someuni.edu/repo/file.pdf"),
        ],
    )
    fe_more_extra = FileEntity(
        ident="fffffffapzfhbbxxc7rgu2yw6m",
        release_ids=["qqqqqg7mxrayxfltget7fqcrjy"],
        extra=dict(thang=456),
    )

    assert fm.merge_file_metadata_from(fe_nourls, fe_partial) is False
    assert fm.merge_file_metadata_from(fe_complete, fe_pseudo_complete) is False
    assert fm.merge_file_metadata_from(fe_complete, fe_complete) is False
    assert fm.merge_file_metadata_from(fe_partial, fe_norelease) is True
    assert fe_partial.md5 == fe_norelease.md5
    assert fe_partial.size == fe_norelease.size
    assert fm.merge_file_metadata_from(fe_partial, fe_complete) is True
    assert fe_partial.md5 != fe_complete.md5
    assert fe_partial.extra == fe_complete.extra
    assert set([(u.rel, u.url) for u in fe_partial.urls or []]) == set(
        [(u.rel, u.url) for u in fe_complete.urls or []]
    )
    assert fe_partial.release_ids == fe_complete.release_ids
    assert fm.merge_file_metadata_from(fe_partial, fe_another_release_id) is True
    assert fe_partial.release_ids == [
        "dlrxjg7mxrayxfltget7fqcrjy",
        "qqqqqg7mxrayxfltget7fqcrjy",
    ]
    assert fm.merge_file_metadata_from(fe_partial, fe_another_release_id) is False
    assert fm.merge_file_metadata_from(fe_partial, fe_more_extra) is True
    assert fe_partial.extra == dict(asdf=123, thang=456)
    assert fm.merge_file_metadata_from(fe_partial, fe_more_extra) is False
    assert fm.merge_file_metadata_from(fe_partial, fe_another_url) is True
    assert fe_partial.urls[-1].url == "http://someuni.edu/repo/file.pdf"
    assert fm.merge_file_metadata_from(fe_partial, fe_another_url) is False
def test_choose_primary_file(api) -> None:
    fm = FileMerger(api=api)
    fe_partial = FileEntity(
        ident="aaaasb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3aaaaaa",
    )
    fe_norelease = FileEntity(
        ident="bbbbsb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3bbbbbb",
        md5="d2c7318315bfc7d3aab0db933e95e632",
        sha256="528064c7664a96c79c80c423210f6f9f4fafe949dd59dfd1572a04b906d5e163",
        size=60719,
        mimetype="application/pdf",
    )
    fe_nourls = FileEntity(
        ident="ccccsb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3bbbbbb",
        md5="d2c7318315bfc7d3aab0db933e95e632",
        sha256="528064c7664a96c79c80c423210f6f9f4fafe949dd59dfd1572a04b906d5e163",
        size=60719,
        mimetype="application/pdf",
        release_ids=["dlrxjg7mxrayxfltget7fqcrjy"],
    )
    fe_complete = FileEntity(
        ident="ddddsb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3bbbbbb",
        md5="d2c7318315bfc7d3aab0db933e95e632",
        sha256="528064c7664a96c79c80c423210f6f9f4fafe949dd59dfd1572a04b906d5e163",
        size=60719,
        mimetype="application/pdf",
        release_ids=["dlrxjg7mxrayxfltget7fqcrjy"],
        urls=[
            FileUrl(rel="web", url="http://aughty.org/pdf/future_open.pdf"),
        ],
        extra=dict(asdf=123),
    )
    fe_pseudo_complete = FileEntity(
        ident="eeeesb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3bbbbbb",
        sha256="528064c7664a96c79c80c423210f6f9f4fafe949dd59dfd1572a04b906d5e163",
        size=60719,
        mimetype="application/pdf",
        release_ids=["dlrxjg7mxrayxfltget7fqcrjy"],
        urls=[
            FileUrl(rel="web", url="http://aughty.org/pdf/future_open.pdf"),
        ],
        extra=dict(asdf=123),
    )

    assert fm.choose_primary_file([fe_partial, fe_norelease]) == "bbbbsb5apzfhbbxxc7rgu2yw6m"
    assert (
        fm.choose_primary_file([fe_partial, fe_nourls, fe_norelease])
        == "ccccsb5apzfhbbxxc7rgu2yw6m"
    )
    assert (
        fm.choose_primary_file([fe_partial, fe_complete, fe_nourls, fe_norelease])
        == "ddddsb5apzfhbbxxc7rgu2yw6m"
    )
    assert (
        fm.choose_primary_file([fe_partial, fe_pseudo_complete, fe_nourls, fe_norelease])
        == "ccccsb5apzfhbbxxc7rgu2yw6m"
    )