def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):

    sha1 = base64.b16encode(base64.b32decode(sha1_key.replace(
        'sha1:', ''))).decode('ascii').lower()

    fe = fatcat_client.FileEntity(
        sha1=sha1,
        size=int(file_size),
        mimetype=mimetype,
        release_ids=[],
        urls=[],
    )

    # parse URLs and CDX
    original = cdx['url']
    assert len(cdx['dt']) >= 8
    wayback = "https://web.archive.org/web/{}/{}".format(
        cdx['dt'], original)
    fe.urls.append(
        fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))

    original_url = make_rel_url(original, default_link_rel=self.default_link_rel)
    if original_url is not None:
        fe.urls.append(
            fatcat_client.FileEntityUrls(rel=original_url[0], url=original_url[1]))

    return fe

def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):

    sha1 = base64.b16encode(base64.b32decode(sha1_key.replace(
        'sha1:', ''))).decode('ascii').lower()

    # lookup existing SHA1, or create new entity
    try:
        existing_file = self.api.lookup_file(sha1=sha1)
    except fatcat_client.rest.ApiException as err:
        if err.status != 404:
            raise err
        existing_file = None

    if existing_file:
        # if file is already in here, presumably not actually long-tail
        return None

    fe = fatcat_client.FileEntity(
        sha1=sha1,
        size=int(file_size),
        mimetype=mimetype,
        releases=[],
        urls=[],
    )

    # parse URLs and CDX
    original = cdx['url']
    wayback = "https://web.archive.org/web/{}/{}".format(
        cdx['dt'], original)
    fe.urls.append(
        fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))
    original_url = self.make_url(original)
    if original_url is not None:
        fe.urls.append(original_url)

    return fe

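# A minimal sketch of the sha1_key decoding used in both parse_file_metadata()
# variants above, pulled out as a standalone helper for illustration
# (hypothetical; not part of the importer). The incoming keys are
# base32-encoded with a "sha1:" prefix, and the file entity wants a
# lower-case hex digest.
def b32_sha1_to_hex(sha1_key):
    import base64
    b32 = sha1_key.replace('sha1:', '')
    return base64.b16encode(base64.b32decode(b32)).decode('ascii').lower()

# Illustrative only (the key below is made up):
#   b32_sha1_to_hex("sha1:TXGCPQUJREOOMFQVB557QBDPHC7X6V5E")
#   => a 40-character lower-case hex string
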
def parse_record(self, obj):

    dois = [d.lower() for d in obj.get('dois', [])]

    # lookup dois
    re_list = set()
    for doi in dois:
        try:
            re = self.api.lookup_release(doi=doi)
        except fatcat_client.rest.ApiException as err:
            if err.status != 404:
                raise err
            re = None
        if re is None:
            #print("DOI not found: {}".format(doi))
            pass
        else:
            re_list.add(re.ident)
    release_ids = list(re_list)
    if len(release_ids) == 0:
        self.counts['skip-no-doi'] += 1
        return None

    # parse URLs and CDX
    urls = set()
    for url in obj.get('url', []):
        url = make_rel_url(url, default_link_rel=self.default_link_rel)
        if url is not None:
            urls.add(url)
    for cdx in obj.get('cdx', []):
        original = cdx['url']
        wayback = "https://web.archive.org/web/{}/{}".format(
            cdx['dt'], original)
        urls.add(("webarchive", wayback))
        url = make_rel_url(original, default_link_rel=self.default_link_rel)
        if url is not None:
            urls.add(url)
    urls = [fatcat_client.FileEntityUrls(rel=rel, url=url)
            for (rel, url) in urls]
    if len(urls) == 0:
        return None

    size = obj.get('size')
    if size:
        size = int(size)

    fe = fatcat_client.FileEntity(
        md5=obj.get('md5'),
        sha1=obj['sha1'],
        sha256=obj.get('sha256'),
        size=size,
        mimetype=obj.get('mimetype'),
        release_ids=release_ids,
        urls=urls,
    )
    return fe

def make_url(self, raw):
    rel = self.default_link_rel
    # TODO: this is where we could map specific domains to rel types,
    # and also filter out bad domains, invalid URLs, etc
    if "//archive.org/" in raw or "//arxiv.org/" in raw:
        # TODO: special-case the arxiv.org bulk mirror?
        rel = "repository"
    elif "//web.archive.org/" in raw or "//archive.is/" in raw:
        rel = "webarchive"
    return fatcat_client.FileEntityUrls(url=raw, rel=rel)

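# make_rel_url() (called from parse_file_metadata and parse_record above) is
# defined elsewhere; judging from its call sites it applies the same kind of
# domain-to-rel mapping as make_url() but returns a (rel, url) tuple, or None
# for URLs that should be dropped. A rough sketch under that assumption, with
# the helper name and the "web" default chosen here for illustration:
def make_rel_url_sketch(raw_url, default_link_rel="web"):
    if "//web.archive.org/" in raw_url or "//archive.is/" in raw_url:
        return ("webarchive", raw_url)
    if "//archive.org/" in raw_url or "//arxiv.org/" in raw_url:
        return ("repository", raw_url)
    if not raw_url.startswith("http"):
        # drop obviously unusable URLs
        return None
    return (default_link_rel, raw_url)
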
def try_update(self, fe):

    # lookup sha1, or create new entity
    existing = None
    try:
        existing = self.api.lookup_file(sha1=fe.sha1)
    except fatcat_client.rest.ApiException as err:
        if err.status != 404:
            raise err

    if not existing:
        return True

    fe.release_ids = list(set(fe.release_ids + existing.release_ids))
    if set(fe.release_ids) == set(existing.release_ids) and len(existing.urls) > 0:
        # no new release matches *and* there are already existing URLs
        self.counts['exists'] += 1
        return False

    # merge the existing into this one and update
    existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
    existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url)
                     for (rel, url) in existing.urls]
    existing.release_ids = list(set(fe.release_ids + existing.release_ids))
    existing.mimetype = existing.mimetype or fe.mimetype
    existing.size = existing.size or fe.size
    existing.md5 = existing.md5 or fe.md5
    existing.sha256 = existing.sha256 or fe.sha256
    self.api.update_file(existing.ident, existing,
        editgroup_id=self.get_editgroup_id())
    self.counts['update'] += 1
    return False

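# The merge step in try_update() deduplicates URLs by round-tripping through
# (rel, url) tuples, presumably because the generated FileEntityUrls objects
# are not directly usable as set members. The same idea as a tiny standalone
# sketch (hypothetical helper, plain tuples in and out):
def merge_url_pairs(new_pairs, existing_pairs):
    """Union two lists of (rel, url) tuples, keeping one copy of each pair."""
    return sorted(set(new_pairs) | set(existing_pairs))

# e.g. merging [("web", "http://example.com/a.pdf")] with a list that already
# contains that same pair plus a "webarchive" pair yields two entries, not three.
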
def parse_matched_dict(self, obj):
    sha1 = obj['sha1']
    dois = [d.lower() for d in obj.get('dois', [])]

    # lookup sha1, or create new entity
    fe = None
    if not self.skip_file_update:
        try:
            fe = self.api.lookup_file(sha1=sha1)
        except fatcat_client.rest.ApiException as err:
            if err.status != 404:
                raise err
    if fe is None:
        fe = fatcat_client.FileEntity(
            sha1=sha1,
            releases=[],
            urls=[],
        )

    # lookup dois
    re_list = set()
    for doi in dois:
        try:
            re = self.api.lookup_release(doi=doi)
        except fatcat_client.rest.ApiException as err:
            if err.status != 404:
                raise err
            re = None
        if re is None:
            print("DOI not found: {}".format(doi))
        else:
            re_list.add(re.ident)
    if len(re_list) == 0:
        return None
    if set(fe.releases) == re_list:
        # no new release matches; nothing to add
        return None
    re_list.update(fe.releases)
    fe.releases = list(re_list)

    # parse URLs and CDX
    existing_urls = [feu.url for feu in fe.urls]
    for url in obj.get('url', []):
        if url not in existing_urls:
            url = self.make_url(url)
            if url is not None:
                fe.urls.append(url)
    for cdx in obj.get('cdx', []):
        original = cdx['url']
        wayback = "https://web.archive.org/web/{}/{}".format(
            cdx['dt'], original)
        if wayback not in existing_urls:
            fe.urls.append(
                fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))
        if original not in existing_urls:
            url = self.make_url(original)
            if url is not None:
                fe.urls.append(url)

    if obj.get('size') is not None:
        fe.size = int(obj['size'])
    fe.sha256 = obj.get('sha256', fe.sha256)
    # fall back to any existing md5 (not sha256) when the match object has none
    fe.md5 = obj.get('md5', fe.md5)
    if obj.get('mimetype') is None:
        if fe.mimetype is None:
            fe.mimetype = self.default_mime
    else:
        fe.mimetype = obj.get('mimetype')
    return fe

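# For reference, the shape of the "matched" JSON object that
# parse_matched_dict() expects, inferred from the keys it reads; every value
# below is illustrative only:
EXAMPLE_MATCHED_OBJ = {
    "sha1": "0000045b4b46d4d07b56b05a3dfd3a5f4e969ef4",  # required
    "dois": ["10.123/abc"],
    "url": ["http://example.com/fulltext.pdf"],
    "cdx": [{"url": "http://example.com/fulltext.pdf", "dt": "20180812220054"}],
    "size": 12345,
    "md5": None,
    "sha256": None,
    "mimetype": "application/pdf",
}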