Пример #1
0
    async def _fetch_download_url(self):
        """Provider needs a WaterButler URL to download and get metadata.  If ``url`` is already
        a WaterButler url, return that.  If not, then the url points to an OSF endpoint that will
        redirect to WB.  Issue a GET request against it, then return the WB url stored in the
        Location header.
        """
        if not self.download_url:
            # v1 Waterbutler url provided
            path = urlparse(self.url).path
            if path.startswith('/v1/resources'):
                self.download_url = self.url
                self.metrics.add('download_url.orig_type', 'wb_v1')
            else:
                self.metrics.add('download_url.orig_type', 'osf')
                # make request to osf, don't follow, store waterbutler download url
                request = await self._make_request(
                    'GET',
                    self.url,
                    allow_redirects=False,
                    headers={'Content-Type': 'application/json'})
                await request.release()

                if request.status != 302:
                    raise exceptions.MetadataError(
                        request.reason,
                        metadata_url=self.url,
                        provider=self.NAME,
                        code=request.status,
                    )
                self.download_url = request.headers['location']

            self.metrics.add('download_url.derived_url',
                             str(self.download_url))

        return self.download_url
Пример #2
0
    async def metadata(self):
        """Fetch metadata about the file from WaterButler and build the provider metadata.

        The download url is resolved via ``_fetch_download_url``.  WaterButler v0 urls (those
        containing ``/file?``) are queried with a GET against the matching ``/data?`` endpoint.
        All other urls are treated as v1: a HEAD request is issued and the metadata is read
        from the ``x-waterbutler-metadata`` response header.

        :return: a ``provider.ProviderMetadata`` built from the file name, extension, content
            type, unique key, download url and stable id
        :raises exceptions.MetadataError: if the v1 HEAD request does not return a 200, or if
            its response carries no parseable ``x-waterbutler-metadata`` header
        :raises TooBigToRenderError: if the reported size exceeds the limit configured for the
            file's extension in ``MAX_FILE_SIZE_TO_RENDER``
        """
        download_url = await self._fetch_download_url()
        logger.debug('download_url::{}'.format(download_url))
        if '/file?' in download_url:
            # URL is for WaterButler v0 API
            # TODO Remove this when API v0 is officially deprecated
            self.metrics.add('metadata.wb_api', 'v0')
            metadata_url = download_url.replace('/file?', '/data?', 1)
            metadata_response = await self._make_request('GET', metadata_url)
            metadata = await metadata_response.json()
        else:
            # URL is for WaterButler v1 API
            self.metrics.add('metadata.wb_api', 'v1')
            metadata_response = await self._make_request(
                'HEAD',
                download_url,
                headers={settings.MFR_ACTION_HEADER: self.action or ''}
            )
            # copy what is needed off the response before handing the connection back
            response_code = metadata_response.status
            response_reason = metadata_response.reason
            response_headers = metadata_response.headers
            await metadata_response.release()
            if response_code != 200:
                # One message string: the two literals are concatenated before ``format`` runs,
                # so the status code and reason actually end up in the error message.
                raise exceptions.MetadataError(
                    'Failed to fetch file metadata from WaterButler. Received response: '
                    'code {} {}'.format(response_code, response_reason),
                    metadata_url=download_url,
                    response=response_reason,
                    provider=self.NAME,
                    code=400
                )

            try:
                metadata = {'data': json.loads(response_headers['x-waterbutler-metadata'])['attributes']}
            except (ContentEncodingError, KeyError, TypeError, ValueError) as err:
                # Without usable metadata nothing below can run.  Fail with an explicit
                # provider error rather than continuing with ``metadata`` unbound.
                raise exceptions.MetadataError(
                    'WaterButler response did not include usable file metadata in the '
                    'x-waterbutler-metadata header.',
                    metadata_url=download_url,
                    response=response_reason,
                    provider=self.NAME,
                    code=400
                ) from err

        self.metrics.add('metadata.raw', metadata)

        # e.g.,
        # metadata = {'data': {
        #     'name': 'blah.png',
        #     'contentType': 'image/png',
        #     'etag': 'ABCD123456...',
        #     'extra': {
        #         ...
        #     },
        # }}

        meta = metadata['data']
        name, ext = os.path.splitext(meta['name'])
        size = meta['size']

        # extensions without an entry in MAX_FILE_SIZE_TO_RENDER have no size limit
        max_file_size = MAX_FILE_SIZE_TO_RENDER.get(ext)
        if max_file_size and size and int(size) > max_file_size:
            raise TooBigToRenderError(
                "This file with extension '{ext}' exceeds the size limit of {max_size} and will not "
                "be rendered. To view this file download it and view it "
                "offline.".format(ext=ext, max_size=sizeof_fmt(max_file_size)),
                requested_size=int(size), maximum_size=max_file_size,
            )

        # fall back to guessing from the file name when WaterButler reports no content type
        content_type = meta['contentType'] or mimetypes.guess_type(meta['name'])[0]

        # strip the params listed in UNNEEDED_URL_PARAMS before hashing the url into the key
        cleaned_url = furl.furl(download_url)
        for unneeded in OsfProvider.UNNEEDED_URL_PARAMS:
            cleaned_url.args.pop(unneeded, None)
        self.metrics.add('metadata.clean_url_args', str(cleaned_url))

        unique_key = hashlib.sha256((meta['etag'] + cleaned_url.url).encode('utf-8')).hexdigest()
        stable_str = '/{}/{}{}'.format(meta['resource'], meta['provider'], meta['path'])
        stable_id = hashlib.sha256(stable_str.encode('utf-8')).hexdigest()
        logger.debug('stable_identifier: str({}) hash({})'.format(stable_str, stable_id))

        return provider.ProviderMetadata(name, ext, content_type, unique_key, download_url, stable_id)