예제 #1
0
    def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):
        """Build a FileEntity from a SHA-1 key, one CDX row and basic file info.

        ``sha1_key`` is base32 text, optionally carrying a ``sha1:`` prefix;
        it is re-encoded as lowercase hex. ``cdx`` must provide ``'url'`` and
        ``'dt'`` (asserted to be at least 8 characters long -- presumably a
        wayback timestamp; confirm against the caller).

        The returned entity always carries a "webarchive" URL built from the
        CDX row, plus the original URL when ``make_rel_url`` yields a
        ``(rel, url)`` pair for it.
        """
        b32_digest = sha1_key.replace('sha1:', '')
        raw_digest = base64.b32decode(b32_digest)
        hex_digest = base64.b16encode(raw_digest).decode('ascii').lower()

        entity = fatcat_client.FileEntity(
            sha1=hex_digest,
            size=int(file_size),
            mimetype=mimetype,
            release_ids=[],
            urls=[],
        )

        # Wayback snapshot link derived from the CDX row.
        source_url = cdx['url']
        assert len(cdx['dt']) >= 8
        snapshot_url = "https://web.archive.org/web/{}/{}".format(
            cdx['dt'], source_url)
        entity.urls.append(
            fatcat_client.FileEntityUrls(url=snapshot_url, rel="webarchive"))

        # Original location, only when make_rel_url accepts it.
        rel_and_url = make_rel_url(source_url,
                                   default_link_rel=self.default_link_rel)
        if rel_and_url is None:
            return entity

        entity.urls.append(
            fatcat_client.FileEntityUrls(rel=rel_and_url[0],
                                         url=rel_and_url[1]))
        return entity
예제 #2
0
    def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):
        """Build a new FileEntity for a file that is not yet in the catalog.

        ``sha1_key`` is base32 text, optionally carrying a ``sha1:`` prefix;
        it is re-encoded as lowercase hex and looked up through
        ``self.api.lookup_file``.

        Returns ``None`` when the lookup finds an existing file. Otherwise
        returns a FileEntity carrying a "webarchive" URL built from
        ``cdx['dt']`` and ``cdx['url']`` plus the entry produced by
        ``self.make_url`` for the original URL.

        Raises ``fatcat_client.rest.ApiException`` for any lookup failure
        other than HTTP 404.
        """
        sha1 = base64.b16encode(base64.b32decode(sha1_key.replace(
            'sha1:', ''))).decode('ascii').lower()

        # lookup existing SHA1, or create new entity
        try:
            existing_file = self.api.lookup_file(sha1=sha1)
        except fatcat_client.rest.ApiException as err:
            if err.status != 404:
                # Bare raise re-raises the active exception unchanged.
                raise
            existing_file = None

        if existing_file:
            # if file is already in here, presumably not actually long-tail
            return None

        fe = fatcat_client.FileEntity(
            sha1=sha1,
            size=int(file_size),
            mimetype=mimetype,
            releases=[],
            urls=[],
        )

        # parse URLs and CDX
        original = cdx['url']
        wayback = "https://web.archive.org/web/{}/{}".format(
            cdx['dt'], original)
        fe.urls.append(
            fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))
        original_url = self.make_url(original)
        if original_url is not None:
            fe.urls.append(original_url)

        return fe
예제 #3
0
    def parse_record(self, obj):
        """Convert one matched-file record (a dict) into a FileEntity.

        Each DOI under ``obj['dois']`` is lower-cased and resolved through
        ``self.api.lookup_release``; DOIs that return 404 are ignored.

        Returns ``None`` (and bumps ``self.counts['skip-no-doi']``) when no
        DOI resolves to a release, and ``None`` when no URL could be derived
        from ``obj['url']`` / ``obj['cdx']``. Otherwise returns a FileEntity
        with the resolved release idents and the de-duplicated URL list.

        A key that is present with a ``None`` value (``dois``, ``url``,
        ``cdx``) is treated the same as a missing key.

        Raises ``fatcat_client.rest.ApiException`` for lookup failures other
        than HTTP 404, and ``KeyError`` when ``obj`` has no ``'sha1'``.
        """
        dois = [d.lower() for d in (obj.get('dois') or [])]

        # lookup dois
        release_idents = set()
        for doi in dois:
            try:
                release = self.api.lookup_release(doi=doi)
            except fatcat_client.rest.ApiException as err:
                if err.status != 404:
                    raise
                release = None
            if release is not None:
                release_idents.add(release.ident)
        release_ids = list(release_idents)
        if not release_ids:
            self.counts['skip-no-doi'] += 1
            return None

        # parse URLs and CDX; collected as (rel, url) tuples so the set
        # removes duplicates before entities are built
        url_pairs = set()
        for raw_url in (obj.get('url') or []):
            pair = make_rel_url(raw_url,
                                default_link_rel=self.default_link_rel)
            if pair is not None:
                url_pairs.add(pair)
        for cdx in (obj.get('cdx') or []):
            original = cdx['url']
            wayback = "https://web.archive.org/web/{}/{}".format(
                cdx['dt'], original)
            url_pairs.add(("webarchive", wayback))
            pair = make_rel_url(original,
                                default_link_rel=self.default_link_rel)
            if pair is not None:
                url_pairs.add(pair)
        urls = [
            fatcat_client.FileEntityUrls(rel=rel, url=url)
            for (rel, url) in url_pairs
        ]
        if not urls:
            return None

        # a falsy size (missing, None, 0, "") is passed through unconverted
        size = obj.get('size')
        if size:
            size = int(size)

        fe = fatcat_client.FileEntity(
            md5=obj.get('md5'),
            sha1=obj['sha1'],
            sha256=obj.get('sha256'),
            size=size,
            mimetype=obj.get('mimetype'),
            release_ids=release_ids,
            urls=urls,
        )
        return fe
예제 #4
0
 def make_url(self, raw):
     """Wrap ``raw`` in a FileEntityUrls, picking ``rel`` by substring match.

     URLs containing ``//archive.org/`` or ``//arxiv.org/`` get
     ``"repository"``; otherwise those containing ``//web.archive.org/`` or
     ``//archive.is/`` get ``"webarchive"``; everything else gets
     ``self.default_link_rel``.
     """
     fallback_rel = self.default_link_rel
     # TODO: this is where we could map specific domains to rel types,
     # and also filter out bad domains, invalid URLs, etc
     # TODO: special-case the arxiv.org bulk mirror?
     repository_markers = ("//archive.org/", "//arxiv.org/")
     webarchive_markers = ("//web.archive.org/", "//archive.is/")

     if any(marker in raw for marker in repository_markers):
         link_rel = "repository"
     elif any(marker in raw for marker in webarchive_markers):
         link_rel = "webarchive"
     else:
         link_rel = fallback_rel
     return fatcat_client.FileEntityUrls(url=raw, rel=link_rel)
예제 #5
0
    def try_update(self, fe):
        """Merge ``fe`` into an already-stored file with the same SHA-1, if any.

        Returns ``True`` when ``self.api.lookup_file`` finds nothing (404 or a
        falsy result); ``fe`` is left untouched in that case. Returns
        ``False`` otherwise, after one of:

        * counting ``'exists'`` when ``fe`` adds no release ids and the stored
          file already has at least one URL, or
        * merging URLs, release ids and any missing mimetype/size/md5/sha256
          into the stored file, sending it via ``self.api.update_file`` and
          counting ``'update'``.

        On both ``False`` paths ``fe.release_ids`` is overwritten with the
        union of its own and the stored file's release ids.

        Raises ``fatcat_client.rest.ApiException`` for lookup failures other
        than HTTP 404.
        """
        # lookup sha1
        try:
            stored = self.api.lookup_file(sha1=fe.sha1)
        except fatcat_client.rest.ApiException as err:
            if err.status != 404:
                raise err
            stored = None

        if not stored:
            return True

        fe.release_ids = list(set(fe.release_ids + stored.release_ids))
        adds_no_release = set(fe.release_ids) == set(stored.release_ids)
        if adds_no_release and len(stored.urls) > 0:
            # no new release matches *and* there are already existing URLs
            self.counts['exists'] += 1
            return False

        # fold the incoming entity into the stored one; (rel, url) tuples in
        # a set drop duplicate links before the entities are rebuilt
        unique_links = {(u.rel, u.url) for u in fe.urls + stored.urls}
        stored.urls = [
            fatcat_client.FileEntityUrls(rel=link_rel, url=link_url)
            for (link_rel, link_url) in unique_links
        ]
        stored.release_ids = list(set(fe.release_ids + stored.release_ids))

        # stored values win; the incoming entity only fills in gaps
        stored.mimetype = stored.mimetype or fe.mimetype
        stored.size = stored.size or fe.size
        stored.md5 = stored.md5 or fe.md5
        stored.sha256 = stored.sha256 or fe.sha256

        self.api.update_file(stored.ident,
                             stored,
                             editgroup_id=self.get_editgroup_id())
        self.counts['update'] += 1
        return False
예제 #6
0
    def parse_matched_dict(self, obj):
        """Turn one matched-file record (a dict) into a FileEntity.

        Unless ``self.skip_file_update`` is set, the file is first looked up
        by ``obj['sha1']`` so an existing entity can be extended; otherwise
        (or on 404) a fresh entity is started.

        Returns ``None`` when none of ``obj['dois']`` resolves to a release,
        or when the resolved releases are exactly the ones the entity already
        has. Otherwise returns the entity with releases, URLs, size, hashes
        and mimetype merged in from ``obj``.

        Raises ``fatcat_client.rest.ApiException`` for lookup failures other
        than HTTP 404, and ``KeyError`` when ``obj`` has no ``'sha1'``.
        """
        sha1 = obj['sha1']
        dois = [d.lower() for d in obj.get('dois', [])]

        # lookup sha1, or create new entity
        fe = None
        if not self.skip_file_update:
            try:
                fe = self.api.lookup_file(sha1=sha1)
            except fatcat_client.rest.ApiException as err:
                if err.status != 404:
                    raise
        if fe is None:
            fe = fatcat_client.FileEntity(
                sha1=sha1,
                releases=[],
                urls=[],
            )

        # lookup dois
        release_idents = set()
        for doi in dois:
            try:
                release = self.api.lookup_release(doi=doi)
            except fatcat_client.rest.ApiException as err:
                if err.status != 404:
                    raise
                release = None
            if release is None:
                print("DOI not found: {}".format(doi))
            else:
                release_idents.add(release.ident)
        if not release_idents:
            return None
        # Compare set to set: a list never equals a set, so comparing
        # fe.releases directly would make this early exit unreachable.
        if set(fe.releases) == release_idents:
            return None
        release_idents.update(fe.releases)
        fe.releases = list(release_idents)

        # parse URLs and CDX; seen_urls also tracks URLs appended below so a
        # URL repeated within this record is only added once
        seen_urls = {feu.url for feu in fe.urls}
        for raw_url in obj.get('url', []):
            if raw_url not in seen_urls:
                entry = self.make_url(raw_url)
                if entry is not None:
                    fe.urls.append(entry)
                    seen_urls.add(raw_url)
        for cdx in obj.get('cdx', []):
            original = cdx['url']
            wayback = "https://web.archive.org/web/{}/{}".format(
                cdx['dt'], original)
            if wayback not in seen_urls:
                fe.urls.append(
                    fatcat_client.FileEntityUrls(url=wayback,
                                                 rel="webarchive"))
                seen_urls.add(wayback)
            if original not in seen_urls:
                entry = self.make_url(original)
                if entry is not None:
                    fe.urls.append(entry)
                    seen_urls.add(original)

        if obj.get('size') is not None:
            fe.size = int(obj['size'])
        fe.sha256 = obj.get('sha256', fe.sha256)
        # Default to the entity's own md5 (not its sha256) when obj has none.
        fe.md5 = obj.get('md5', fe.md5)
        if obj.get('mimetype') is None:
            if fe.mimetype is None:
                fe.mimetype = self.default_mime
        else:
            fe.mimetype = obj.get('mimetype')
        return fe