def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):

    sha1 = base64.b16encode(
        base64.b32decode(sha1_key.replace('sha1:', ''))
    ).decode('ascii').lower()

    fe = fatcat_openapi_client.FileEntity(
        sha1=sha1,
        size=int(file_size),
        mimetype=mimetype,
        release_ids=[],
        urls=[],
    )

    # parse URLs and CDX
    original = cdx['url']
    assert len(cdx['dt']) >= 8
    wayback = "https://web.archive.org/web/{}/{}".format(cdx['dt'], original)
    fe.urls.append(
        fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive"))

    original_url = make_rel_url(original, default_link_rel=self.default_link_rel)
    if original_url is not None:
        fe.urls.append(
            fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1]))

    return fe
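# --- Hedged example (not part of the importer): a minimal sketch of the SHA-1
# re-encoding that parse_file_metadata() performs inline, pulled out as a
# standalone helper for illustration. The helper name is ours; the sample key
# below is made up, but carries a valid 32-character base32 SHA-1 payload.
import base64

def b32_to_hex_sha1(sha1_key):
    # strip the "sha1:" prefix, base32-decode, then hex-encode lowercase
    return base64.b16encode(
        base64.b32decode(sha1_key.replace("sha1:", ""))
    ).decode("ascii").lower()

print(b32_to_hex_sha1("sha1:TPFXBVYUXWCXZGEUCIZTIQQT2DRE26VN"))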
def parse_record(self, row):

    extid = row['identifier'].strip()

    # check/cleanup DOI
    if self.extid_type == 'doi':
        extid = extid.lower()
        # str.replace() returns a new string; re-assign to actually strip prefixes
        extid = extid.replace('http://doi.org/', '')
        extid = extid.replace('https://doi.org/', '')
        if extid.startswith('doi:'):
            extid = extid[4:]
        if not extid.startswith('10.'):
            self.counts['skip-extid-invalid'] += 1
            return None

    # lookup extid
    try:
        re = self.api.lookup_release(**{self.extid_type: extid})
    except fatcat_openapi_client.rest.ApiException as err:
        if err.status == 404:
            # bail on 404 (release not in DB)
            self.counts['skip-extid-not-found'] += 1
            return None
        elif err.status == 400:
            self.counts['skip-extid-invalid'] += 1
            return None
        else:
            raise err

    url = make_rel_url(row['final_url'], self.default_link_rel)
    if not url:
        self.counts['skip-url'] += 1
        return None
    if not row['final_timestamp']:
        self.counts['skip-missing-timestamp'] += 1
        return None
    wayback = "https://web.archive.org/web/{}/{}".format(
        row['final_timestamp'], row['final_url'])
    urls = [url, ("webarchive", wayback)]

    urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
    if len(urls) > SANE_MAX_URLS:
        self.counts['skip-too-many-url'] += 1
        return None

    fe = fatcat_openapi_client.FileEntity(
        sha1=b32_hex(row['final_sha1']),
        mimetype=row['final_mimetype'] or self.default_mimetype,
        release_ids=[re.ident],
        urls=urls,
    )
    return fe
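# --- Hedged example (not part of the importer): the DOI cleanup above,
# restated as a small pure function for illustration. The helper name and the
# sample DOIs are ours, not from the source.
def normalize_doi_example(raw):
    doi = raw.strip().lower()
    doi = doi.replace("http://doi.org/", "")
    doi = doi.replace("https://doi.org/", "")
    if doi.startswith("doi:"):
        doi = doi[4:]
    return doi

assert normalize_doi_example("https://doi.org/10.1234/ABC") == "10.1234/abc"
assert normalize_doi_example("doi:10.1234/abc") == "10.1234/abc"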
def parse_record(self, row):

    # bezerk mode doesn't make sense for this importer
    assert self.bezerk_mode == False

    file_meta = row
    fe = fatcat_openapi_client.FileEntity(
        md5=file_meta['md5hex'],
        sha1=file_meta['sha1hex'],
        sha256=file_meta['sha256hex'],
        size=file_meta['size_bytes'],
        mimetype=file_meta['mimetype'],
    )
    return fe
def parse_record(self, row: Dict[str, Any]) -> FileEntity:

    # bezerk mode doesn't make sense for this importer
    assert self.bezerk_mode is False

    file_meta = row
    fe = fatcat_openapi_client.FileEntity(
        md5=file_meta["md5hex"],
        sha1=file_meta["sha1hex"],
        sha256=file_meta["sha256hex"],
        size=file_meta["size_bytes"],
        mimetype=file_meta["mimetype"],
    )
    return fe
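# --- Hedged example (not part of the importer): a hypothetical file_meta row
# of the shape these two parse_record() variants expect. All hash and size
# values below are made up.
example_file_meta_row = {
    "md5hex": "1b39813549077b2347c0f370c3864b40",
    "sha1hex": "ec8daf27df080d9b1c3ed38e4953ab7d0b2c1f5e",
    "sha256hex": "3e9a6a2d3f0d7a9a4a2f8c1b5d6e7f8091a2b3c4d5e6f708192a3b4c5d6e7f80",
    "size_bytes": 261204,
    "mimetype": "application/pdf",
}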
def parse_record(self, row):

    request = row['request']
    file_meta = row['file_meta']

    # double check that want() filtered request correctly (eg, old requests)
    if request.get('ingest_type') not in ('pdf', 'xml'):
        self.counts['skip-ingest-type'] += 1
        return None
    assert (request['ingest_type'], file_meta['mimetype']) in [
        ("pdf", "application/pdf"),
        ("xml", "application/xml"),
        ("xml", "application/jats+xml"),
        ("xml", "application/tei+xml"),
        ("xml", "text/xml"),
    ]

    # identify release by fatcat ident, or extid lookup, or biblio-glutton match
    release_ident = self.parse_ingest_release_ident(row)

    if not release_ident:
        self.counts['skip-release-not-found'] += 1
        return None

    terminal = self.parse_terminal(row)
    if not terminal:
        # TODO: support archive.org hits?
        self.counts['skip-no-terminal'] += 1
        return None

    urls = self.parse_urls(row, terminal)

    fe = fatcat_openapi_client.FileEntity(
        md5=file_meta['md5hex'],
        sha1=file_meta['sha1hex'],
        sha256=file_meta['sha256hex'],
        size=file_meta['size_bytes'],
        mimetype=file_meta['mimetype'],
        release_ids=[release_ident],
        urls=urls,
    )

    edit_extra = self.parse_edit_extra(row)
    if edit_extra:
        fe.edit_extra = edit_extra
    return fe
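# --- Hedged example (not part of the importer): the (ingest_type, mimetype)
# allow-list asserted above, restated as a reusable predicate. The constant
# and function names are ours.
ACCEPTED_TYPE_PAIRS = {
    ("pdf", "application/pdf"),
    ("xml", "application/xml"),
    ("xml", "application/jats+xml"),
    ("xml", "application/tei+xml"),
    ("xml", "text/xml"),
}

def is_accepted_type_pair(ingest_type, mimetype):
    return (ingest_type, mimetype) in ACCEPTED_TYPE_PAIRS

assert is_accepted_type_pair("xml", "application/jats+xml")
assert not is_accepted_type_pair("pdf", "text/html")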
def parse_record(self, obj):
    """
    We do the release lookup in this method. Try DOI, then PMID, last ISBN13.
    """

    shadow_corpus = obj['shadow']['shadow_corpus']
    assert shadow_corpus == shadow_corpus.strip().lower()
    doi = clean_doi(obj['shadow'].get('doi'))
    pmid = clean_pmid(obj['shadow'].get('pmid'))
    isbn13 = clean_isbn13(obj['shadow'].get('isbn13'))
    shadow_id = obj['shadow'].get('shadow_id').strip()
    assert shadow_id

    extra = {'{}_id'.format(shadow_corpus): shadow_id}
    for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
        if not ext_id:
            continue
        extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id

    # lookup release via several idents
    re = None
    for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
        if not ext_id:
            continue
        try:
            re = self.api.lookup_release(**{ext_type: ext_id})
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status not in (404, 400):
                raise err
            re = None
        if re:
            break

    if not re:
        self.counts['skip-release-not-found'] += 1
        return None

    release_ids = [re.ident]

    # parse single CDX into URLs (if exists)
    urls = []
    if obj.get('cdx'):
        url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel)
        if url is not None:
            urls.append(url)
        wayback = "https://web.archive.org/web/{}/{}".format(
            obj['cdx']['datetime'], obj['cdx']['url'])
        urls.append(("webarchive", wayback))
    urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]

    fe = fatcat_openapi_client.FileEntity(
        md5=obj['file_meta']['md5hex'],
        sha1=obj['file_meta']['sha1hex'],
        sha256=obj['file_meta']['sha256hex'],
        size=int(obj['file_meta']['size_bytes']),
        mimetype=obj['file_meta']['mimetype'] or None,
        release_ids=release_ids,
        urls=urls,
        extra=dict(shadows=extra),
    )
    return fe
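# --- Hedged example (not part of the importer): a hypothetical shape of the
# extra=dict(shadows=...) payload built above, for a record with
# shadow_corpus="scimag", a shadow_id, and a DOI but no PMID or ISBN13. All
# identifier values are made up.
example_shadow_extra = {
    "shadows": {
        "scimag_id": "12345678",
        "scimag_doi": "10.1234/example.doi",
    }
}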
def parse_record(self, obj):
    dois = [d.lower() for d in obj.get('dois', [])]

    # lookup dois
    re_list = set()
    for doi in dois:
        doi = clean_doi(doi)
        if not doi:
            self.counts['skip-bad-doi'] += 1
            return None
        try:
            re = self.api.lookup_release(doi=doi)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise err
            re = None
        if re is None:
            #print("DOI not found: {}".format(doi))
            pass
        else:
            re_list.add(re.ident)

    # look up other external ids
    for extid_type in ('arxiv', 'pmid', 'pmcid', 'jstor', 'wikidata_qid', 'core',
                       'isbn13', 'ark'):
        extid = obj.get(extid_type)
        if extid:
            try:
                re = self.api.lookup_release(**{extid_type: extid})
            except fatcat_openapi_client.rest.ApiException as err:
                if err.status != 404:
                    raise err
                re = None
            if re is None:
                pass
            else:
                re_list.add(re.ident)

    release_ids = list(re_list)
    if len(release_ids) == 0:
        self.counts['skip-no-releases'] += 1
        return None
    if len(release_ids) > SANE_MAX_RELEASES:
        self.counts['skip-too-many-releases'] += 1
        return None

    # parse URLs and CDX
    urls = set()
    for url in obj.get('urls', []):
        url = make_rel_url(url, default_link_rel=self.default_link_rel)
        if url is not None:
            urls.add(url)
    for cdx in obj.get('cdx', []):
        original = cdx['url']
        if cdx.get('dt'):
            wayback = "https://web.archive.org/web/{}/{}".format(cdx['dt'], original)
            urls.add(("webarchive", wayback))
        url = make_rel_url(original, default_link_rel=self.default_link_rel)
        if url is not None:
            urls.add(url)
    urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
    if len(urls) == 0:
        self.counts['skip-no-urls'] += 1
        return None
    if len(urls) > SANE_MAX_URLS:
        self.counts['skip-too-many-urls'] += 1
        return None

    size = obj.get('size')
    if size:
        size = int(size)

    mimetype = obj.get('mimetype', self.default_mimetype)
    if not mimetype and urls:
        if urls[0].url.endswith('.pdf'):
            mimetype = 'application/pdf'

    fe = fatcat_openapi_client.FileEntity(
        md5=obj.get('md5'),
        sha1=obj['sha1'],
        sha256=obj.get('sha256'),
        size=size,
        mimetype=mimetype,
        release_ids=release_ids,
        urls=urls,
    )
    return fe
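# --- Hedged example (not part of the importer): because URLs are collected as
# a set of (rel, url) tuples before conversion to FileUrl, duplicates across
# the 'urls' and 'cdx' inputs collapse automatically. Values below are made up.
dedup_urls = set()
dedup_urls.add(("web", "https://example.com/paper.pdf"))
dedup_urls.add(("web", "https://example.com/paper.pdf"))  # duplicate, collapses
dedup_urls.add(
    ("webarchive",
     "https://web.archive.org/web/20200101000000/https://example.com/paper.pdf"))
assert len(dedup_urls) == 2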
def parse_record(self, row):

    request = row['request']
    fatcat = request.get('fatcat')
    file_meta = row['file_meta']

    # identify release by fatcat ident, or extid lookup, or biblio-glutton match
    release_ident = None
    if fatcat and fatcat.get('release_ident'):
        release_ident = fatcat.get('release_ident')
    elif request.get('ext_ids'):
        # if no fatcat ident, try extids
        for extid_type in ('doi', 'pmid', 'pmcid', 'arxiv'):
            extid = request['ext_ids'].get(extid_type)
            if not extid:
                continue
            try:
                release = self.api.lookup_release(**{extid_type: extid})
            except fatcat_openapi_client.rest.ApiException as err:
                if err.status == 404:
                    continue
                elif err.status == 400:
                    self.counts['warn-extid-invalid'] += 1
                    continue
                else:
                    # don't silently swallow other API errors
                    raise err
            release_ident = release.ident
            break

    if not release_ident and row.get('grobid'):
        # try biblio-glutton extracted hit
        if row['grobid'].get('fatcat_release'):
            release_ident = row['grobid']['fatcat_release'].split('_')[-1]
            self.counts['glutton-match'] += 1

    if not release_ident:
        self.counts['skip-release-not-found'] += 1
        return None

    terminal = row.get('terminal')
    if not terminal:
        # support old cdx-only ingest results
        cdx = row.get('cdx')
        if not cdx:
            # TODO: support archive.org hits?
            self.counts['skip-no-terminal'] += 1
            return None
        else:
            terminal = {
                'terminal_url': cdx['url'],
                'terminal_dt': cdx['datetime'],
                'terminal_status_code': cdx.get('status_code') or cdx.get('http_status'),
            }

    # work around old schema
    if 'terminal_url' not in terminal:
        terminal['terminal_url'] = terminal['url']
    if 'terminal_dt' not in terminal:
        terminal['terminal_dt'] = terminal['dt']
    assert len(terminal['terminal_dt']) == 14

    default_rel = self.default_link_rel
    if request.get('link_source') == 'doi':
        default_rel = 'publisher'
    default_rel = request.get('rel', default_rel)
    url = make_rel_url(terminal['terminal_url'], default_rel)

    if not url:
        self.counts['skip-url'] += 1
        return None
    wayback = "https://web.archive.org/web/{}/{}".format(
        terminal['terminal_dt'], terminal['terminal_url'])
    urls = [url, ("webarchive", wayback)]

    urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]

    fe = fatcat_openapi_client.FileEntity(
        md5=file_meta['md5hex'],
        sha1=file_meta['sha1hex'],
        sha256=file_meta['sha256hex'],
        size=file_meta['size_bytes'],
        mimetype=file_meta['mimetype'],
        release_ids=[release_ident],
        urls=urls,
    )
    if request.get('edit_extra'):
        fe.edit_extra = request['edit_extra']
    else:
        fe.edit_extra = dict()
    if request.get('ingest_request_source'):
        fe.edit_extra['ingest_request_source'] = request['ingest_request_source']
    if request.get('link_source') and request.get('link_source_id'):
        fe.edit_extra['link_source'] = request['link_source']
        fe.edit_extra['link_source_id'] = request['link_source_id']
    if not fe.edit_extra:
        fe.edit_extra = None

    return fe
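# --- Hedged example (not part of the importer): the biblio-glutton fallback
# above takes the last underscore-separated token of 'fatcat_release' as the
# bare release ident. The value below is made up.
glutton_fatcat_release = "release_hsmo6p4smrganpb3fndaj2lon4"
assert glutton_fatcat_release.split("_")[-1] == "hsmo6p4smrganpb3fndaj2lon4"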
def test_access_redirect_fallback(client: Any, mocker: Any) -> None:

    with open("tests/files/elastic_fulltext_get.json") as f:
        elastic_resp = json.loads(f.read())

    es_raw = mocker.patch(
        "elasticsearch.connection.Urllib3HttpConnection.perform_request"
    )
    es_raw.side_effect = [
        (200, {}, json.dumps(elastic_resp)),
        (200, {}, json.dumps(elastic_resp)),
        (200, {}, json.dumps(elastic_resp)),
        (200, {}, json.dumps(elastic_resp)),
    ]
    fatcat_get_work_raw = mocker.patch("fatcat_openapi_client.DefaultApi.get_work")
    fatcat_get_work_raw.side_effect = [
        fatcat_openapi_client.WorkEntity(
            state="active",
            ident="wwwwwwwwwwwwwwwwwwwwwwwwww",
        )
    ] * 4
    fatcat_get_work_releases_raw = mocker.patch(
        "fatcat_openapi_client.DefaultApi.get_work_releases"
    )
    fatcat_get_work_releases_raw.side_effect = [
        [
            fatcat_openapi_client.ReleaseEntity(
                ident="rrrrrrrrrrrrrrrrrrrrrrrrrr",
                ext_ids=fatcat_openapi_client.ReleaseExtIds(),
            ),
        ]
    ] * 4
    fatcat_get_release_raw = mocker.patch(
        "fatcat_openapi_client.DefaultApi.get_release"
    )
    fatcat_get_release_raw.side_effect = [
        fatcat_openapi_client.ReleaseEntity(
            state="active",
            ident="rrrrrrrrrrrrrrrrrrrrrrrrrr",
            ext_ids=fatcat_openapi_client.ReleaseExtIds(),
            files=[
                fatcat_openapi_client.FileEntity(
                    ident="ffffffffffffffffffffffffff",
                    urls=[
                        fatcat_openapi_client.FileUrl(
                            rel="web",
                            url="https://blarg.example.com",
                        ),
                        fatcat_openapi_client.FileUrl(
                            rel="webarchive",
                            url="https://web.archive.org/web/12345/https://example.com",
                        ),
                        fatcat_openapi_client.FileUrl(
                            rel="archive",
                            url="https://archive.org/download/some/thing.pdf",
                        ),
                    ],
                ),
            ],
        )
    ] * 4

    # redirects should work after API lookup, for both wayback and archive.org
    rv = client.get(
        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://example.com",
        allow_redirects=False,
    )
    assert rv.status_code == 302
    assert (
        rv.headers["Location"]
        == "https://web.archive.org/web/12345id_/https://example.com"
    )

    rv = client.get(
        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.pdf",
        allow_redirects=False,
    )
    assert rv.status_code == 302
    assert rv.headers["Location"] == "https://archive.org/download/some/thing.pdf"

    # wrong URLs should still not work, but display a page with helpful links
    rv = client.get(
        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY",
        allow_redirects=False,
    )
    assert rv.status_code == 404
    assert b"Access Location Not Found" in rv.content
    assert b"web.archive.org/web/*/https://www.federalreserve.gov" in rv.content

    rv = client.get(
        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.else.pdf",
        allow_redirects=False,
    )
    assert rv.status_code == 404
    assert b"Access Location Not Found" in rv.content
    assert b"archive.org/download/some/thing.else.pdf" in rv.content
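# --- Hedged example (not part of the test suite): the first redirect above
# targets a Wayback URL with the "id_" modifier, which requests the original
# capture bytes rather than the rewritten page. A sketch of assembling such a
# URL from a stored "webarchive" FileUrl; the helper name is ours.
def wayback_id_url(stored_url, original_url):
    prefix = "https://web.archive.org/web/"
    assert stored_url.startswith(prefix)
    dt = stored_url[len(prefix):].split("/", 1)[0]
    return "{}{}id_/{}".format(prefix, dt, original_url)

assert (
    wayback_id_url(
        "https://web.archive.org/web/12345/https://example.com",
        "https://example.com",
    )
    == "https://web.archive.org/web/12345id_/https://example.com"
)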