Example #1
    def upload(self, stream, path, conflict='replace', **kwargs):
        """Uploads the given stream to S3

        :param waterbutler.core.streams.RequestWrapper stream: The stream to put to S3
        :param str path: The full path of the key to upload to/into

        :rtype: dict, bool
        """
        path, exists = yield from self.handle_name_conflict(path, conflict=conflict)
        stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5))

        resp = yield from self.make_request(
            'PUT',
            self.bucket.new_key(path.path).generate_url(
                settings.TEMP_URL_SECS,
                'PUT',
                encrypt_key=self.encrypt_uploads
            ),
            data=stream,
            headers={'Content-Length': str(stream.size)},
            expects=(200, 201, ),
            throws=exceptions.UploadError,
        )
        # md5 is returned as ETag header as long as server side encryption is not used.
        # TODO: nice assertion error goes here
        assert resp.headers['ETag'].replace('"', '') == stream.writers['md5'].hexdigest

        return (yield from self.metadata(path, **kwargs)), not exists
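Every example on this page follows the same verification pattern: attach a streams.HashStreamWriter to the upload stream before making the request, let the HTTP client drain the stream (each chunk is hashed as it passes through the writer), then compare the writer's hexdigest with the checksum the provider reports. A minimal, provider-agnostic sketch of that pattern in the newer async/await style; _upload_with_md5_check is a hypothetical helper name, and make_request, add_writer, writers, and exceptions.UploadChecksumMismatchError are assumed to behave as they do in the examples on this page:

    async def _upload_with_md5_check(self, stream, url):
        # Hash every chunk as the request consumes the stream.
        stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5))

        resp = await self.make_request(
            'PUT',
            url,
            data=stream,
            headers={'Content-Length': str(stream.size)},
            expects=(200, 201),
            throws=exceptions.UploadError,
        )
        await resp.release()

        # Compare the locally computed MD5 against the checksum echoed by the
        # provider (e.g. S3's ETag when server-side encryption is off).
        if stream.writers['md5'].hexdigest != resp.headers['ETag'].replace('"', ''):
            raise exceptions.UploadChecksumMismatchError()
        return resp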
Example #2
    def upload(self, stream, path, check_created=True, fetch_metadata=True, **kwargs):
        """Uploads the given stream to CloudFiles
        :param ResponseStreamReader stream: The stream to put to CloudFiles
        :param str path: The full path of the object to upload to/into
        :rtype: dict, bool
        """
        if check_created:
            created = not (yield from self.exists(path))
        else:
            created = None

        stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5))
        url = self.sign_url(path, 'PUT')
        resp = yield from self.make_request(
            'PUT',
            url,
            data=stream,
            headers={'Content-Length': str(stream.size)},
            expects=(200, 201),
            throws=exceptions.UploadError,
        )
        # md5 is returned as ETag header as long as server side encryption is not used.
        # TODO: nice assertion error goes here
        assert resp.headers['ETag'].replace('"', '') == stream.writers['md5'].hexdigest

        if fetch_metadata:
            metadata = yield from self.metadata(path)
        else:
            metadata = None

        return metadata, created
Example #3
    async def _contiguous_upload(self, stream, path):
        """Uploads the given stream in one request.
        """

        stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5))

        headers = {'Content-Length': str(stream.size)}
        # this is usually set in boto.s3.key.generate_url, but do it here
        # to be explicit about our header payloads for signing purposes
        if self.encrypt_uploads:
            headers['x-amz-server-side-encryption'] = 'AES256'
        upload_url = functools.partial(
            self.bucket.new_key(path.path).generate_url,
            settings.TEMP_URL_SECS,
            'PUT',
            headers=headers,
        )

        resp = await self.make_request(
            'PUT',
            upload_url,
            data=stream,
            skip_auto_headers={'CONTENT-TYPE'},
            headers=headers,
            expects=(200, 201, ),
            throws=exceptions.UploadError,
        )
        await resp.release()

        # md5 is returned as ETag header as long as server side encryption is not used.
        if stream.writers['md5'].hexdigest != resp.headers['ETag'].replace('"', ''):
            raise exceptions.UploadChecksumMismatchError()
Example #4
    async def upload(self, stream, path, conflict='replace', **kwargs):
        """Upload a file to provider root or to an article whose defined_type is
        configured to represent a folder.

        :param asyncio.StreamReader stream: stream to upload
        :param FigsharePath path: FigsharePath to upload the file to.
        :param dict \*\*kwargs: Will be passed to returned metadata object
        """
        path, exists = await self.handle_name_conflict(path, conflict=conflict)
        if not path.parent.is_root:
            parent_resp = await self.make_request(
                'GET',
                self.build_url(False, *self.root_path_parts, 'articles',
                               path.parent.identifier),
                expects=(200, ),
            )
            parent_json = await parent_resp.json()
            if parent_json['defined_type'] not in settings.FOLDER_TYPES:
                del path._parts[1]

        stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5))
        file_id = await self._upload_file(self.container_id, path.name, stream)

        # Build new file path and return metadata
        path = FigsharePath('/' + file_id,
                            _ids=('', file_id),
                            folder=False,
                            is_public=False)
        metadata = await self.metadata(path, **kwargs)
        if stream.writers['md5'].hexdigest != metadata.extra['hashes']['md5']:
            raise exceptions.UploadChecksumMismatchError()

        return metadata, True
Example #5
    async def _create_blob(self, stream):
        blob_stream = streams.JSONStream({
            'encoding': 'base64',
            'content': streams.Base64EncodeStream(stream),
        })

        sha1_calculator = streams.HashStreamWriter(hashlib.sha1)
        stream.add_writer('sha1', sha1_calculator)
        git_blob_header = 'blob {}\0'.format(str(stream.size))
        sha1_calculator.write(git_blob_header.encode('utf-8'))

        resp = await self.make_request(
            'POST',
            self.build_repo_url('git', 'blobs'),
            data=blob_stream,
            headers={
                'Content-Type': 'application/json',
                'Content-Length': str(blob_stream.size),
            },
            expects=(201, ),
            throws=exceptions.UploadError,
        )

        blob_metadata = await resp.json()
        if stream.writers['sha1'].hexdigest != blob_metadata['sha']:
            raise exceptions.UploadChecksumMismatchError()

        return blob_metadata
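The 'blob {size}\0' header written into the SHA-1 calculator above reproduces git's object hashing: a blob's SHA-1 is computed over the header "blob <byte length>\0" followed by the raw content, which is why the writer's hexdigest can be compared directly with the 'sha' GitHub returns. A standalone check with plain hashlib, using an illustrative payload:

    import hashlib

    content = b'hello\n'
    header = 'blob {}\0'.format(len(content)).encode('utf-8')
    sha = hashlib.sha1(header + content).hexdigest()
    # 'ce013625030ba8dba906f756967f9e9ca394464a' -- the same value produced by
    # `echo 'hello' | git hash-object --stdin`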
Example #6
async def _upload_parity(path, credentials, settings):
    _, name = os.path.split(path)
    provider_name = settings.get('provider')
    provider = make_provider(provider_name, {}, credentials, settings)
    with open(path, 'rb') as file_pointer:
        stream = streams.FileStreamReader(file_pointer)
        stream.add_writer('sha256', streams.HashStreamWriter(hashlib.sha256))
        await provider.upload(stream,
                              (await provider.validate_path('/' + name)))
    return (name, stream.writers['sha256'].hexdigest)
Example #7
    async def upload(self, stream, path, **kwargs):
        """Zips the given stream then uploads to Dataverse.
        This will delete existing draft files with the same name.

        :param waterbutler.core.streams.RequestWrapper stream: The stream to put to Dataverse
        :param str path: The filename prepended with '/'

        :rtype: dict, bool
        """

        stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5))

        zip_stream = streams.ZipStreamReader(
            AsyncIterator([(path.name, stream)]))

        # Write stream to disk (Necessary to find zip file size)
        f = tempfile.TemporaryFile()
        chunk = await zip_stream.read()
        while chunk:
            f.write(chunk)
            chunk = await zip_stream.read()
        file_stream = streams.FileStreamReader(f)

        dv_headers = {
            "Content-Disposition": "filename=temp.zip",
            "Content-Type": "application/zip",
            "Packaging": "http://purl.org/net/sword/package/SimpleZip",
            "Content-Length": str(file_stream.size),
        }

        # Delete old file if it exists
        if path.identifier:
            await self.delete(path)

        resp = await self.make_request('POST',
                                       self.build_url(
                                           settings.EDIT_MEDIA_BASE_URL,
                                           'study', self.doi),
                                       headers=dv_headers,
                                       auth=(self.token, ),
                                       data=file_stream,
                                       expects=(201, ),
                                       throws=exceptions.UploadError)
        await resp.release()

        # Find appropriate version of file
        metadata = await self._get_data('latest')
        files = metadata if isinstance(metadata, list) else []
        file_metadata = next(file for file in files if file.name == path.name)

        if stream.writers['md5'].hexdigest != file_metadata.extra['hashes'][
                'md5']:
            raise exceptions.UploadChecksumMismatchError()

        return file_metadata, path.identifier is None
Example #8
    async def upload(self, stream, path, conflict='replace', **kwargs):
        """Upload a file to provider root or to an article whose defined_type is
        configured to represent a folder.

        :param asyncio.StreamReader stream: stream to upload
        :param FigsharePath path: FigsharePath to upload the file to.
        :param dict \*\*kwargs: Will be passed to returned metadata object
        """
        if path.identifier and conflict == 'replace':
            raise exceptions.UnsupportedOperationError(
                'Files in Figshare cannot be updated')

        path, exists = await self.handle_name_conflict(path, conflict=conflict)
        if not path.parent.is_root:
            parent_resp = await self.make_request(
                'GET',
                self.build_url(False, *self.root_path_parts, 'articles',
                               path.parent.identifier),
                expects=(200, ),
            )
            parent_json = await parent_resp.json()
            if parent_json['defined_type'] not in settings.FOLDER_TYPES:
                del path._parts[1]

        # Create article or retrieve article_id from existing article
        if not path.parent.is_root:
            article_id = path.parent.identifier
        else:
            article_name = json.dumps({'title': path.name})
            if self.container_type == 'project':
                article_id = await self._create_article(article_name)
            elif self.container_type == 'collection':
                # TODO don't think this is correct.  Probably should POST to /accounts/articles
                article_id = await self._create_article(article_name)
                article_list = json.dumps({'articles': [article_id]})
                await self.make_request(
                    'POST',
                    self.build_url(False, *self.root_path_parts, 'articles'),
                    data=article_list,
                    expects=(201, ),
                )

        stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5))
        file_id = await self._upload_file(article_id, path.name, stream)

        # Build new file path and return metadata
        path = FigsharePath('/' + article_id + '/' + file_id,
                            _ids=(self.container_id, article_id, file_id),
                            folder=False,
                            is_public=False)
        metadata = await self.metadata(path, **kwargs)
        if stream.writers['md5'].hexdigest != metadata.extra['hashes']['md5']:
            raise exceptions.UploadChecksumMismatchError()

        return metadata, True
Example #9
    async def _upload_part(self, stream: streams.BaseStream, part_id: str,
                           part_size: int, start_offset: int,
                           session_id: str) -> dict:
        """Upload one part/chunk of the given stream to Box.

        Box requires that the sha of the part be sent along in the headers of the request.  To do
        this WB must write the stream segment to disk before uploading.  The part sha is calculated
        as the tempfile is written.

        API Docs: https://developer.box.com/reference#upload-part
        """

        cutoff_stream = streams.CutoffStream(stream, cutoff=part_size)
        part_hasher_name = 'part-{}-sha1'.format(part_id)
        stream.add_writer(part_hasher_name,
                          streams.HashStreamWriter(hashlib.sha1))

        f = tempfile.TemporaryFile()
        chunk = await cutoff_stream.read(self.TEMP_CHUNK_SIZE)
        while chunk:
            f.write(chunk)
            chunk = await cutoff_stream.read(self.TEMP_CHUNK_SIZE)
        file_stream = streams.FileStreamReader(f)

        part_sha = stream.writers[part_hasher_name].digest
        part_sha_b64 = base64.standard_b64encode(part_sha).decode()
        stream.remove_writer(part_hasher_name)

        byte_range = self._build_range_header(
            (start_offset, start_offset + part_size - 1))
        content_range = str(byte_range).replace('=', ' ') + '/{}'.format(
            stream.size)

        async with self.request(
                'PUT',
                self._build_upload_url('files', 'upload_sessions', session_id),
                headers={
                    # ``Content-Length`` is required for ``asyncio`` to use inner chunked stream read
                    'Content-Length': str(part_size),
                    'Content-Range': content_range,
                    'Content-Type': 'application/octet-stream',
                    'Digest': 'sha={}'.format(part_sha_b64),
                },
                data=file_stream,
                expects=(201, 200),
                throws=exceptions.UploadError,
        ) as resp:
            data = await resp.json()

        f.close()
        return data['part']
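The Digest header assembled above carries the base64 encoding of the raw SHA-1 digest of the part's bytes, which is why the code reads .digest rather than .hexdigest from the writer. The same value built with plain hashlib and base64, using a stand-in payload:

    import base64
    import hashlib

    part_bytes = b'example part payload'  # stand-in for the bytes spooled to the tempfile
    digest_header = 'sha={}'.format(
        base64.standard_b64encode(hashlib.sha1(part_bytes).digest()).decode())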
Example #10
    async def _send_to_storage_provider(self, stream, path, **kwargs):
        """Send uploaded file data to the storage provider, where it will be stored w/o metadata
        in a content-addressable format.

        :return: metadata of the file as it exists on the storage provider
        """

        pending_name = str(uuid.uuid4())
        provider = self.make_provider(self.settings)
        remote_pending_path = await provider.validate_path('/' + pending_name)
        logger.debug(
            'upload: remote_pending_path::{}'.format(remote_pending_path))

        stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5))
        stream.add_writer('sha1', streams.HashStreamWriter(hashlib.sha1))
        stream.add_writer('sha256', streams.HashStreamWriter(hashlib.sha256))

        await provider.upload(stream,
                              remote_pending_path,
                              check_created=False,
                              fetch_metadata=False,
                              **kwargs)

        complete_name = stream.writers['sha256'].hexdigest
        remote_complete_path = await provider.validate_path('/' +
                                                            complete_name)

        try:
            metadata = await provider.metadata(remote_complete_path)
        except exceptions.MetadataError as e:
            if e.code != 404:
                raise
            metadata, _ = await provider.move(provider, remote_pending_path,
                                              remote_complete_path)
        else:
            await provider.delete(remote_pending_path)

        return metadata
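The "content-addressable format" mentioned in the docstring means the object's final name is simply the SHA-256 of its bytes: the upload first lands under a random pending name, and once the full stream has been hashed it is moved to (or found already present at) '/' plus the sha256 hexdigest. A minimal illustration of the two names, with a stand-in payload:

    import hashlib
    import uuid

    payload = b'example file contents'
    pending_name = str(uuid.uuid4())                     # random name used while uploading
    complete_name = hashlib.sha256(payload).hexdigest()  # final, content-derived name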
Example #11
    async def _chunked_upload(self, path: WaterButlerPath,
                              stream: streams.BaseStream) -> dict:
        """Upload a large file to Box over multiple requests. This method will be used if the
        file to upload is larger than ``NONCHUNKED_UPLOAD_LIMIT``.  Checksum verification is built
        into this process, so manual verification is not needed.

        API Docs: https://developer.box.com/reference#chunked-upload
        """

        # Step 1: Add a sha1 calculator. The final sha1 will be needed to complete the session
        stream.add_writer('sha1', streams.HashStreamWriter(hashlib.sha1))

        # Step 2: Create an upload session with Box and receive the session id.
        session_data = await self._create_chunked_upload_session(path, stream)
        logger.debug('chunked upload session data: {}'.format(
            json.dumps(session_data)))

        metadata = None

        try:
            # Step 3. Split the data into parts and upload them to box.
            parts_manifest = await self._upload_parts(stream, session_data)
            logger.debug('chunked upload parts manifest: {}'.format(
                json.dumps(parts_manifest)))

            data_sha = base64.standard_b64encode(
                stream.writers['sha1'].digest).decode()

            # Step 4. Complete the session and return the uploaded file's metadata.
            retry = self.UPLOAD_COMMIT_RETRIES
            while retry > 0:
                retry -= 1
                try:
                    metadata = await self._complete_chunked_upload_session(
                        session_data, parts_manifest, data_sha)
                    break
                except RetryChunkedUploadCommit:
                    continue

        except Exception as err:
            msg = 'An unexpected error has occurred during the multi-part upload.'
            logger.error('{} upload_id={} error={!r}'.format(
                msg, session_data, err))
            aborted = await self._abort_chunked_upload(session_data, data_sha)
            if not aborted:
                msg += '  The abort action failed to clean up the temporary file parts generated ' \
                    'during the upload process.  Please manually remove them.'
            raise exceptions.UploadError(msg)

        return metadata
Example #12
    async def upload(self, stream, path, conflict='replace', **kwargs):
        """Uploads the given stream to S3

        :param waterbutler.core.streams.RequestWrapper stream: The stream to put to S3
        :param str path: The full path of the key to upload to/into

        :rtype: dict, bool
        """
        await self._check_region()

        path, exists = await self.handle_name_conflict(path, conflict=conflict)
        stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5))

        headers = {'Content-Length': str(stream.size)}

        # this is usually set in boto.s3.key.generate_url, but do it here
        # to be explicit about our header payloads for signing purposes
        if self.encrypt_uploads:
            headers['x-amz-server-side-encryption'] = 'AES256'

        upload_url = functools.partial(
            self.bucket.new_key(path.path).generate_url,
            settings.TEMP_URL_SECS,
            'PUT',
            headers=headers,
        )
        resp = await self.make_request(
            'PUT',
            upload_url,
            data=stream,
            skip_auto_headers={'CONTENT-TYPE'},
            headers=headers,
            expects=(
                200,
                201,
            ),
            throws=exceptions.UploadError,
        )
        # md5 is returned as ETag header as long as server side encryption is not used.
        if stream.writers['md5'].hexdigest != resp.headers['ETag'].replace(
                '"', ''):
            raise exceptions.UploadChecksumMismatchError()

        await resp.release()
        return (await self.metadata(path, **kwargs)), not exists
Example #13
    async def upload(
            self,  # type: ignore
            stream: streams.BaseStream,
            path: WaterButlerPath,
            conflict: str = 'replace',
            **kwargs) -> Tuple[BoxFileMetadata, bool]:
        if path.identifier and conflict == 'keep':
            path, _ = await self.handle_name_conflict(path,
                                                      conflict=conflict,
                                                      kind='folder')
            path._parts[-1]._id = None

        stream.add_writer('sha1', streams.HashStreamWriter(hashlib.sha1))

        data_stream = streams.FormDataStream(
            attributes=json.dumps({
                'name': path.name,
                'parent': {
                    'id': path.parent.identifier
                }
            }))
        data_stream.add_file('file',
                             stream,
                             path.name,
                             disposition='form-data')

        async with self.request(
                'POST',
                self._build_upload_url(
                    *filter(lambda x: x is not None, ('files', path.identifier,
                                                      'content'))),
                data=data_stream,
                headers=data_stream.headers,
                expects=(201, ),
                throws=exceptions.UploadError,
        ) as resp:
            data = await resp.json()

        entry = data['entries'][0]
        if stream.writers['sha1'].hexdigest != entry['sha1']:
            raise exceptions.UploadChecksumMismatchError()

        created = path.identifier is None
        path._parts[-1]._id = entry['id']
        return BoxFileMetadata(entry, path), created
Example #14
    async def _contiguous_upload(self, path: WaterButlerPath,
                                 stream: streams.BaseStream) -> dict:
        """Upload a file to Box using a single request. This will only be called if the file is
        smaller than the ``NONCHUNKED_UPLOAD_LIMIT``.

        API Docs: https://developer.box.com/reference#upload-a-file
        """
        assert stream.size <= self.NONCHUNKED_UPLOAD_LIMIT
        stream.add_writer('sha1', streams.HashStreamWriter(hashlib.sha1))

        data_stream = streams.FormDataStream(
            attributes=json.dumps({
                'name': path.name,
                'parent': {
                    'id': path.parent.identifier
                }
            }))
        data_stream.add_file('file',
                             stream,
                             path.name,
                             disposition='form-data')

        if path.identifier is not None:
            segments = ['files', path.identifier, 'content']
        else:
            segments = ['files', 'content']

        response = await self.make_request(
            'POST',
            self._build_upload_url(*segments),
            data=data_stream,
            headers=data_stream.headers,
            expects=(201, ),
            throws=exceptions.UploadError,
        )
        data = await response.json()

        entry = data['entries'][0]
        if stream.writers['sha1'].hexdigest != entry['sha1']:
            raise exceptions.UploadChecksumMismatchError()

        return entry
Example #15
    async def upload(self, stream, path: wb_path.WaterButlerPath, *args, **kwargs) \
            -> typing.Tuple[GoogleDriveFileMetadata, bool]:
        assert path.is_file

        if path.identifier:
            segments = [path.identifier]
        else:
            segments = []

        stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5))

        upload_metadata = self._build_upload_metadata(path.parent.identifier, path.name)
        upload_id = await self._start_resumable_upload(not path.identifier, segments, stream.size,
                                                       upload_metadata)
        data = await self._finish_resumable_upload(segments, stream, upload_id)

        if data['md5Checksum'] != stream.writers['md5'].hexdigest:
            raise exceptions.UploadChecksumMismatchError()

        return GoogleDriveFileMetadata(data, path), path.identifier is None
Example #16
    async def upload(self,
                     stream,
                     path,
                     check_created=True,
                     fetch_metadata=True,
                     **kwargs):
        """Uploads the given stream to CloudFiles
        :param ResponseStreamReader stream: The stream to put to CloudFiles
        :param str path: The full path of the object to upload to/into
        :rtype: dict, bool
        """
        if check_created:
            created = not (await self.exists(path))
        else:
            created = None
        self.metrics.add('upload.check_created', check_created)

        stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5))
        resp = await self.make_request(
            'PUT',
            functools.partial(self.sign_url, path, 'PUT'),
            data=stream,
            headers={'Content-Length': str(stream.size)},
            expects=(200, 201),
            throws=exceptions.UploadError,
        )
        await resp.release()
        # md5 is returned as ETag header as long as server side encryption is not used.
        if stream.writers['md5'].hexdigest != resp.headers['ETag'].replace(
                '"', ''):
            raise exceptions.UploadChecksumMismatchError()

        if fetch_metadata:
            metadata = await self.metadata(path)
        else:
            metadata = None
        self.metrics.add('upload.fetch_metadata', fetch_metadata)

        return metadata, created
Example #17
    def upload(self, stream, path, **kwargs):
        self._create_paths()

        pending_name = str(uuid.uuid4())
        provider = self.make_provider(self.settings)
        local_pending_path = os.path.join(settings.FILE_PATH_PENDING, pending_name)
        remote_pending_path = yield from provider.validate_path('/' + pending_name)

        stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5))
        stream.add_writer('sha1', streams.HashStreamWriter(hashlib.sha1))
        stream.add_writer('sha256', streams.HashStreamWriter(hashlib.sha256))

        with open(local_pending_path, 'wb') as file_pointer:
            stream.add_writer('file', file_pointer)
            yield from provider.upload(stream, remote_pending_path, check_created=False, fetch_metadata=False, **kwargs)

        complete_name = stream.writers['sha256'].hexdigest
        local_complete_path = os.path.join(settings.FILE_PATH_COMPLETE, complete_name)
        remote_complete_path = yield from provider.validate_path('/' + complete_name)

        try:
            metadata = yield from provider.metadata(remote_complete_path)
        except exceptions.MetadataError as e:
            if e.code != 404:
                raise
            metadata, _ = yield from provider.move(provider, remote_pending_path, remote_complete_path)
        else:
            yield from provider.delete(remote_pending_path)

        metadata = metadata.serialized()

        # Due to cross volume movement in unix we leverage shutil.move which properly handles this case.
        # http://bytes.com/topic/python/answers/41652-errno-18-invalid-cross-device-link-using-os-rename#post157964
        shutil.move(local_pending_path, local_complete_path)

        response = yield from self.make_signed_request(
            'POST',
            self.build_url(path.parent.identifier, 'children'),
            expects=(200, 201),
            data=json.dumps({
                'name': path.name,
                'user': self.auth['id'],
                'settings': self.settings['storage'],
                'metadata': metadata,
                'hashes': {
                    'md5': stream.writers['md5'].hexdigest,
                    'sha1': stream.writers['sha1'].hexdigest,
                    'sha256': stream.writers['sha256'].hexdigest,
                },
                'worker': {
                    'host': os.uname()[1],
                    # TODO: Include additional information
                    'address': None,
                    'version': self.__version__,
                },
            }),
            headers={'Content-Type': 'application/json'},
        )

        created = response.status == 201
        data = yield from response.json()

        if settings.RUN_TASKS and data.pop('archive', True):
            parity.main(
                local_complete_path,
                self.parity_credentials,
                self.parity_settings,
            )
            backup.main(
                local_complete_path,
                data['version'],
                self.build_url('hooks', 'metadata') + '/',
                self.archive_credentials,
                self.archive_settings,
            )

        name = path.name

        metadata.update({
            'name': name,
            'path': data['data']['path'],
            'version': data['data']['version'],
            'downloads': data['data']['downloads']
        })

        return OsfStorageFileMetadata(metadata, str(path)), created
Example #18
    async def upload(self, stream, path, **kwargs):
        """Upload a new file to osfstorage

        When a file is uploaded to osfstorage, WB does a bit of a dance to make sure it gets there
        reliably.  First we take the stream and add several hash calculators that can determine the
        hash of the file as it streams through.  We then tee the file so that it's written to a
        "pending" directory on both local disk and the remote storage provider.  Once that's
        complete, we determine the file's final location, which will be in another directory (by
        default called 'complete'), and renamed to its sha256 hash.  We then check to see if a
        file already exists at that path on the remote storage provider.  If it does, we can skip
        moving the file (since it's already been uploaded) and instead delete the pending file.  If
        it does not, we move the file on the remote storage provider from the pending path to its
        final path.

        Once this is done the local copy of the file is moved from the pending directory to the
        complete directory.  The file metadata is sent back to the metadata provider to be recorded.
        Finally, we schedule two futures to archive the locally complete file.  One copies the file
        into Amazon Glacier; the other calculates a parity archive, so that the file can be
        reconstructed if any on-disk corruption happens.  These tasks are scheduled via celery and
        don't need to complete for the request to finish.

        Finally, WB constructs its metadata response and sends that back to the original request
        issuer.

        The local file sitting in complete will be archived by the celery tasks at some point in
        the future.  The archivers do not signal when they have finished their task, so for the time
        being the local complete files are allowed to accumulate and must be deleted by some
        external process.  COS currently uses a cron job to delete files older than X days.  If the
        system is being heavily used, it's possible that the files may be deleted before the
        archivers are able to run.  To get around this we have another script in the osf.io
        repository that can audit our files on the remote storage and initiate any missing archives.

        """
        self._create_paths()

        pending_name = str(uuid.uuid4())
        provider = self.make_provider(self.settings)
        local_pending_path = os.path.join(settings.FILE_PATH_PENDING,
                                          pending_name)
        remote_pending_path = await provider.validate_path('/' + pending_name)
        logger.debug(
            'upload: local_pending_path::{}'.format(local_pending_path))
        logger.debug(
            'upload: remote_pending_path::{}'.format(remote_pending_path))

        stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5))
        stream.add_writer('sha1', streams.HashStreamWriter(hashlib.sha1))
        stream.add_writer('sha256', streams.HashStreamWriter(hashlib.sha256))

        try:
            with open(local_pending_path, 'wb') as file_pointer:
                stream.add_writer('file', file_pointer)
                await provider.upload(stream,
                                      remote_pending_path,
                                      check_created=False,
                                      fetch_metadata=False,
                                      **kwargs)
        except Exception as exc:
            # If we fail to upload to the remote storage provider, then delete the copy of the file
            # from the local provider, too.  The user will have to reupload the file to local
            # anyway, and this will avoid filling up the local disk with unused pending files.
            try:
                os.remove(local_pending_path)
            except OSError as os_exc:
                raise exceptions.UploadFailedError(
                    'Upload failed, please try again.') from os_exc
            raise exceptions.UploadFailedError(
                'Upload failed, please try again.') from exc

        complete_name = stream.writers['sha256'].hexdigest
        local_complete_path = os.path.join(settings.FILE_PATH_COMPLETE,
                                           complete_name)
        remote_complete_path = await provider.validate_path('/' +
                                                            complete_name)

        try:
            metadata = await provider.metadata(remote_complete_path)
        except exceptions.MetadataError as e:
            if e.code != 404:
                raise
            metadata, _ = await provider.move(provider, remote_pending_path,
                                              remote_complete_path)
        else:
            await provider.delete(remote_pending_path)

        metadata = metadata.serialized()

        # Due to cross volume movement in unix we leverage shutil.move which properly handles this case.
        # http://bytes.com/topic/python/answers/41652-errno-18-invalid-cross-device-link-using-os-rename#post157964
        shutil.move(local_pending_path, local_complete_path)

        async with self.signed_request(
                'POST',
                self.build_url(path.parent.identifier, 'children'),
                expects=(200, 201),
                data=json.dumps({
                    'name': path.name,
                    'user': self.auth['id'],
                    'settings': self.settings['storage'],
                    'metadata': metadata,
                    'hashes': {
                        'md5': stream.writers['md5'].hexdigest,
                        'sha1': stream.writers['sha1'].hexdigest,
                        'sha256': stream.writers['sha256'].hexdigest,
                    },
                    'worker': {
                        'host': os.uname()[1],
                        # TODO: Include additional information
                        'address': None,
                        'version': self.__version__,
                    },
                }),
                headers={'Content-Type': 'application/json'},
        ) as response:
            created = response.status == 201
            data = await response.json()

        if settings.RUN_TASKS and data.pop('archive', True):
            parity.main(
                local_complete_path,
                data['version'],
                self.build_url('hooks', 'metadata') + '/',
                self.parity_credentials,
                self.parity_settings,
            )
            backup.main(
                local_complete_path,
                data['version'],
                self.build_url('hooks', 'metadata') + '/',
                self.archive_credentials,
                self.archive_settings,
            )

        name = path.name

        metadata.update({
            'name': name,
            'md5': data['data']['md5'],
            'path': data['data']['path'],
            'sha256': data['data']['sha256'],
            'version': data['data']['version'],
            'downloads': data['data']['downloads'],
            'checkout': data['data']['checkout'],
            'modified': data['data']['modified'],
            'modified_utc': utils.normalize_datetime(data['data']['modified']),
        })

        path._parts[-1]._id = metadata['path'].strip('/')
        return OsfStorageFileMetadata(metadata, str(path)), created