Example #1
def test_upload_dir(tmpdir, nasa_item):
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, S3_URL_RE, adding_headers=EXPECTED_S3_HEADERS)

        tmpdir.mkdir('dir_test')
        with open(os.path.join(str(tmpdir), 'dir_test', 'foo.txt'), 'w') as fh:
            fh.write('hi')
        with open(os.path.join(str(tmpdir), 'dir_test', 'foo2.txt'),
                  'w') as fh:
            fh.write('hi 2')

        # Test trailing-slash upload, dir is not in key name.
        _responses = nasa_item.upload(os.path.join(str(tmpdir), 'dir_test') +
                                      '/',
                                      access_key='a',
                                      secret_key='b')
        expected_eps = [
            f'{S3_URL}nasa/foo.txt',
            f'{S3_URL}nasa/foo2.txt',
        ]
        for resp in _responses:
            assert resp.request.url in expected_eps

        # Test no-slash upload, dir is in key name.
        _responses = nasa_item.upload(os.path.join(str(tmpdir), 'dir_test'),
                                      access_key='a',
                                      secret_key='b')
        tmp_path = norm_filepath(str(tmpdir))
        expected_eps = [
            f'{S3_URL}nasa{tmp_path}/dir_test/foo.txt',
            f'{S3_URL}nasa{tmp_path}/dir_test/foo2.txt',
        ]
        for resp in _responses:
            assert resp.request.url in expected_eps
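
Note: Examples #1 through #3 are variants of the same test. The assertions pin down the key-naming rule: a trailing slash on the directory argument drops the local directory from the remote key, while omitting the slash keeps the full local path in the key. A minimal standalone sketch of that rule (build_keys is a hypothetical helper, not part of the library):

def build_keys(dirpath, filenames):
    # A trailing slash drops the local directory from the remote key;
    # no trailing slash keeps the full local path in the key.
    keys = []
    for name in filenames:
        if dirpath.endswith('/'):
            keys.append(name)
        else:
            keys.append('{0}/{1}'.format(dirpath.lstrip('/'), name))
    return keys

print(build_keys('/tmp/xyz/dir_test/', ['foo.txt', 'foo2.txt']))
# ['foo.txt', 'foo2.txt']
print(build_keys('/tmp/xyz/dir_test', ['foo.txt', 'foo2.txt']))
# ['tmp/xyz/dir_test/foo.txt', 'tmp/xyz/dir_test/foo2.txt']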
Example #2
def test_upload_dir(tmpdir, nasa_item):
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, S3_URL_RE, adding_headers=EXPECTED_S3_HEADERS)

        tmpdir.mkdir("dir_test")
        with open(os.path.join(str(tmpdir), "dir_test", "foo.txt"), "w") as fh:
            fh.write("hi")
        with open(os.path.join(str(tmpdir), "dir_test", "foo2.txt"), "w") as fh:
            fh.write("hi 2")

        # Test trailing-slash upload, dir is not in key name.
        _responses = nasa_item.upload(os.path.join(str(tmpdir), "dir_test") + "/", access_key="a", secret_key="b")
        expected_eps = ["{0}nasa/foo.txt".format(S3_URL), "{0}nasa/foo2.txt".format(S3_URL)]
        for resp in _responses:
            assert resp.request.url in expected_eps

        # Test no-slash upload, dir is in key name.
        _responses = nasa_item.upload(os.path.join(str(tmpdir), "dir_test"), access_key="a", secret_key="b")
        tmp_path = norm_filepath(str(tmpdir))
        expected_eps = [
            "{0}nasa{1}/dir_test/{2}".format(S3_URL, tmp_path, "foo.txt"),
            "{0}nasa{1}/dir_test/{2}".format(S3_URL, tmp_path, "foo2.txt"),
        ]
        for resp in _responses:
            assert resp.request.url in expected_eps
Example #3
def test_upload_dir(tmpdir, nasa_item):
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, S3_URL_RE,
                 adding_headers=EXPECTED_S3_HEADERS)

        tmpdir.mkdir('dir_test')
        with open(os.path.join(str(tmpdir), 'dir_test', 'foo.txt'), 'w') as fh:
            fh.write('hi')
        with open(os.path.join(str(tmpdir), 'dir_test', 'foo2.txt'), 'w') as fh:
            fh.write('hi 2')

        # Test trailing-slash upload, dir is not in key name.
        _responses = nasa_item.upload(os.path.join(str(tmpdir), 'dir_test') + '/',
                                      access_key='a',
                                      secret_key='b')
        expected_eps = [
            '{0}nasa/foo.txt'.format(S3_URL),
            '{0}nasa/foo2.txt'.format(S3_URL),
        ]
        for resp in _responses:
            assert resp.request.url in expected_eps

        # Test no-slash upload, dir is in key name.
        _responses = nasa_item.upload(os.path.join(str(tmpdir), 'dir_test'),
                                      access_key='a',
                                      secret_key='b')
        tmp_path = norm_filepath(str(tmpdir))
        expected_eps = [
            '{0}nasa{1}/dir_test/{2}'.format(S3_URL, tmp_path, 'foo.txt'),
            '{0}nasa{1}/dir_test/{2}'.format(S3_URL, tmp_path, 'foo2.txt'),
        ]
        for resp in _responses:
            assert resp.request.url in expected_eps
Example #4
    def upload(self, files,
               metadata=None,
               headers=None,
               access_key=None,
               secret_key=None,
               queue_derive=None,
               verbose=None,
               verify=None,
               checksum=None,
               delete=None,
               retries=None,
               retries_sleep=None,
               debug=None,
               request_kwargs=None):
        """Upload files to an item. The item will be created if it
        does not exist.

        :type files: str, file, list, tuple, dict
        :param files: The filepaths or file-like objects to upload.

        :param \*\*kwargs: Optional arguments that :func:`Item.upload_file()` takes.

        Usage::

            >>> import internetarchive
            >>> item = internetarchive.Item('identifier')
            >>> md = dict(mediatype='image', creator='Jake Johnson')
            >>> item.upload('/path/to/image.jpg', metadata=md, queue_derive=False)
            [<Response [200]>]

        :rtype: list
        :returns: A list of requests.Response objects.
        """
        queue_derive = True if queue_derive is None else queue_derive
        remote_dir_name = None
        if isinstance(files, dict):
            files = list(files.items())
        if not isinstance(files, (list, tuple)):
            files = [files]

        responses = []
        file_index = 0
        if checksum:
            total_files = recursive_file_count(files, item=self, checksum=True)
        else:
            total_files = recursive_file_count(files, item=self, checksum=False)
        for f in files:
            if (isinstance(f, string_types) and is_dir(f)) \
                    or (isinstance(f, tuple) and is_dir(f[-1])):
                if isinstance(f, tuple):
                    remote_dir_name = f[0].strip('/')
                    f = f[-1]
                for filepath, key in iter_directory(f):
                    file_index += 1
                    # Set derive header if queue_derive is True,
                    # and this is the last request being made.
                    if queue_derive is True and file_index >= total_files:
                        _queue_derive = True
                    else:
                        _queue_derive = False
                    if not f.endswith('/'):
                        if remote_dir_name:
                            key = '{0}{1}/{2}'.format(remote_dir_name, f, key)
                        else:
                            key = '{0}/{1}'.format(f, key)
                    elif remote_dir_name:
                        key = '{0}/{1}'.format(remote_dir_name, key)
                    key = norm_filepath(key)
                    resp = self.upload_file(filepath,
                                            key=key,
                                            metadata=metadata,
                                            headers=headers,
                                            access_key=access_key,
                                            secret_key=secret_key,
                                            queue_derive=_queue_derive,
                                            verbose=verbose,
                                            verify=verify,
                                            checksum=checksum,
                                            delete=delete,
                                            retries=retries,
                                            retries_sleep=retries_sleep,
                                            debug=debug,
                                            request_kwargs=request_kwargs)
                    responses.append(resp)
            else:
                file_index += 1
                # Set derive header if queue_derive is True,
                # and this is the last request being made.
                # if queue_derive is True and file_index >= len(files):
                if queue_derive is True and file_index >= total_files:
                    _queue_derive = True
                else:
                    _queue_derive = False

                if not isinstance(f, (list, tuple)):
                    key, body = (None, f)
                else:
                    key, body = f
                if key and not isinstance(key, string_types):
                    key = str(key)
                resp = self.upload_file(body,
                                        key=key,
                                        metadata=metadata,
                                        headers=headers,
                                        access_key=access_key,
                                        secret_key=secret_key,
                                        queue_derive=_queue_derive,
                                        verbose=verbose,
                                        verify=verify,
                                        checksum=checksum,
                                        delete=delete,
                                        retries=retries,
                                        retries_sleep=retries_sleep,
                                        debug=debug,
                                        request_kwargs=request_kwargs)
                responses.append(resp)
        return responses
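
A usage sketch for the directory branch above (identifier and paths are hypothetical): passing a (remote_dir, local_dir) tuple stores the directory's files under the given remote prefix, per the remote_dir_name handling in the loop.

import internetarchive

item = internetarchive.Item('my-identifier')  # hypothetical identifier

# Tuple form: files under /local/reports/ get keys like 'docs/<filename>'.
responses = item.upload([('docs', '/local/reports/')])

# Plain string form, following the trailing-slash rule:
responses = item.upload('/local/reports/')  # keys: '<filename>'
responses = item.upload('/local/reports')   # keys: '/local/reports/<filename>'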
Example #5
    def upload_file(self, body,
                    key=None,
                    metadata=None,
                    headers=None,
                    access_key=None,
                    secret_key=None,
                    queue_derive=None,
                    verbose=None,
                    verify=None,
                    checksum=None,
                    delete=None,
                    retries=None,
                    retries_sleep=None,
                    debug=None,
                    request_kwargs=None):
        """Upload a single file to an item. The item will be created
        if it does not exist.

        :type body: Filepath or file-like object.
        :param body: File or data to be uploaded.

        :type key: str
        :param key: (optional) Remote filename.

        :type metadata: dict
        :param metadata: (optional) Metadata used to create a new item.

        :type headers: dict
        :param headers: (optional) Add additional IA-S3 headers to request.

        :type queue_derive: bool
        :param queue_derive: (optional) Set to False to prevent an item from
                             being derived after upload.

        :type verify: bool
        :param verify: (optional) Verify local MD5 checksum matches the MD5
                       checksum of the file received by IAS3.

        :type checksum: bool
        :param checksum: (optional) Skip based on checksum.

        :type delete: bool
        :param delete: (optional) Delete local file after the upload has been
                       successfully verified.

        :type retries: int
        :param retries: (optional) Number of times to retry the given request
                        if S3 returns a 503 SlowDown error.

        :type retries_sleep: int
        :param retries_sleep: (optional) Amount of time to sleep between
                              ``retries``.

        :type verbose: bool
        :param verbose: (optional) Print progress to stdout.

        :type debug: bool
        :param debug: (optional) Set to True to return the prepared request
                      without sending it (no data is uploaded).

        Usage::

            >>> import internetarchive
            >>> item = internetarchive.Item('identifier')
            >>> item.upload_file('/path/to/image.jpg',
            ...                  key='photos/image1.jpg')
            <Response [200]>
        """
        # Set defaults.
        headers = {} if headers is None else headers
        metadata = {} if metadata is None else metadata
        access_key = self.session.access_key if access_key is None else access_key
        secret_key = self.session.secret_key if secret_key is None else secret_key
        queue_derive = True if queue_derive is None else queue_derive
        verbose = False if verbose is None else verbose
        verify = True if verify is None else verify
        delete = False if delete is None else delete
        # Set checksum after delete.
        checksum = True if delete else checksum
        retries = 0 if retries is None else retries
        retries_sleep = 30 if retries_sleep is None else retries_sleep
        debug = False if debug is None else debug
        request_kwargs = {} if request_kwargs is None else request_kwargs
        if 'timeout' not in request_kwargs:
            request_kwargs['timeout'] = 120
        md5_sum = None

        if not hasattr(body, 'read'):
            filename = body
            body = open(body, 'rb')
        else:
            if key:
                filename = key
            else:
                filename = body.name

        size = get_file_size(body)

        # Support for uploading empty files.
        if size == 0:
            headers['Content-Length'] = '0'

        if not headers.get('x-archive-size-hint'):
            headers['x-archive-size-hint'] = str(size)

        # Build IA-S3 URL.
        key = norm_filepath(filename).split('/')[-1] if key is None else key
        base_url = '{0.session.protocol}//s3.us.archive.org/{0.identifier}'.format(self)
        url = '{0}/{1}'.format(
            base_url, urllib.parse.quote(norm_filepath(key).lstrip('/').encode('utf-8')))

        # Skip based on checksum.
        if checksum:
            md5_sum = get_md5(body)
            ia_file = self.get_file(key)
            if (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
                log.info('{f} already exists: {u}'.format(f=key, u=url))
                if verbose:
                    print(' {f} already exists, skipping.'.format(f=key))
                if delete:
                    log.info(
                        '{f} successfully uploaded to '
                        'https://archive.org/download/{i}/{f} '
                        'and verified, deleting '
                        'local copy'.format(i=self.identifier,
                                            f=key))
                    body.close()
                    os.remove(filename)
                # Return an empty response object if checksums match.
                # TODO: Is there a better way to handle this?
                body.close()
                return Response()

        # Require the Content-MD5 header when verify or delete is True.
        if verify or delete:
            if not md5_sum:
                md5_sum = get_md5(body)
            headers['Content-MD5'] = md5_sum

        def _build_request():
            body.seek(0, os.SEEK_SET)
            if verbose:
                try:
                    # hack to raise exception so we get some output for
                    # empty files.
                    if size == 0:
                        raise Exception

                    chunk_size = 1048576
                    expected_size = size / chunk_size + 1
                    chunks = chunk_generator(body, chunk_size)
                    progress_generator = progress.bar(
                        chunks,
                        expected_size=expected_size,
                        label=' uploading {f}: '.format(f=key))
                    data = IterableToFileAdapter(progress_generator, size)
                except:
                    print(' uploading {f}'.format(f=key))
                    data = body
            else:
                data = body

            headers.update(self.session.headers)
            request = S3Request(method='PUT',
                                url=url,
                                headers=headers,
                                data=data,
                                metadata=metadata,
                                access_key=access_key,
                                secret_key=secret_key,
                                queue_derive=queue_derive)
            return request

        if debug:
            prepared_request = self.session.prepare_request(_build_request())
            body.close()
            return prepared_request
        else:
            try:
                error_msg = ('s3 is overloaded, sleeping for '
                             '{0} seconds and retrying. '
                             '{1} retries left.'.format(retries_sleep, retries))
                while True:
                    if retries > 0:
                        if self.session.s3_is_overloaded(access_key):
                            sleep(retries_sleep)
                            log.info(error_msg)
                            if verbose:
                                print(' warning: {0}'.format(error_msg), file=sys.stderr)
                            retries -= 1
                            continue
                    request = _build_request()
                    prepared_request = request.prepare()

                    # chunked transfer-encoding is NOT supported by IA-S3.
                    # It should NEVER be set. Requests adds it in certain
                    # scenarios (e.g. if content-length is 0). Stop it.
                    if prepared_request.headers.get('transfer-encoding') == 'chunked':
                        del prepared_request.headers['transfer-encoding']

                    response = self.session.send(prepared_request,
                                                 stream=True,
                                                 **request_kwargs)
                    if (response.status_code == 503) and (retries > 0):
                        log.info(error_msg)
                        if verbose:
                            print(' warning: {0}'.format(error_msg), file=sys.stderr)
                        sleep(retries_sleep)
                        retries -= 1
                        continue
                    else:
                        if response.status_code == 503:
                            log.info('maximum retries exceeded, upload failed.')
                        break
                response.raise_for_status()
                log.info(u'uploaded {f} to {u}'.format(f=key, u=url))
                if delete and response.status_code == 200:
                    log.info(
                        '{f} successfully uploaded to '
                        'https://archive.org/download/{i}/{f} and verified, deleting '
                        'local copy'.format(i=self.identifier, f=key))
                    body.close()
                    os.remove(filename)
                body.close()
                return response
            except HTTPError as exc:
                body.close()
                msg = get_s3_xml_text(exc.response.content)
                error_msg = (' error uploading {0} to {1}, '
                             '{2}'.format(key, self.identifier, msg))
                log.error(error_msg)
                if verbose:
                    print(' error uploading {0}: {1}'.format(key, msg), file=sys.stderr)
                # Raise HTTPError with error message.
                raise type(exc)(error_msg, response=exc.response, request=exc.request)
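
A sketch of the debug path above (identifier and path are hypothetical): with debug=True the method prepares the S3Request and returns it without sending anything, which makes it easy to inspect the IA-S3 headers that would be sent.

import internetarchive

item = internetarchive.Item('my-identifier')  # hypothetical identifier
prepared = item.upload_file('/path/to/image.jpg',
                            key='photos/image1.jpg',
                            debug=True)
# Nothing was sent; inspect what would have gone over the wire.
print(prepared.method, prepared.url)
for name, value in prepared.headers.items():
    print('{0}: {1}'.format(name, value))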
Example #6
    def upload(self,
               files,
               metadata=None,
               headers=None,
               access_key=None,
               secret_key=None,
               queue_derive=None,
               verbose=None,
               verify=None,
               checksum=None,
               delete=None,
               retries=None,
               retries_sleep=None,
               debug=None,
               request_kwargs=None):
        """Upload files to an item. The item will be created if it
        does not exist.

        :type files: str, file, list, tuple, dict
        :param files: The filepaths or file-like objects to upload.

        :param \*\*kwargs: Optional arguments that :func:`Item.upload_file()` takes.

        Usage::

            >>> import internetarchive
            >>> item = internetarchive.Item('identifier')
            >>> md = dict(mediatype='image', creator='Jake Johnson')
            >>> item.upload('/path/to/image.jpg', metadata=md, queue_derive=False)
            [<Response [200]>]

        Uploading multiple files::

            >>> r = item.upload(['file1.txt', 'file2.txt'])
            >>> r = item.upload([fileobj, fileobj2])
            >>> r = item.upload(('file1.txt', 'file2.txt'))

        Uploading file objects::

            >>> import io
            >>> f = io.BytesIO(b"some initial binary data: \\x00\\x01")
            >>> r = item.upload({'remote-name.txt': f})
            >>> f = io.BytesIO(b"some more binary data: \\x00\\x01")
            >>> f.name = 'remote-name.txt'
            >>> r = item.upload(f)

            *Note: file objects must either have a name attribute, or be uploaded in a
            dict where the key is the remote-name*

        Setting the remote filename with a dict::

            >>> r = item.upload({'remote-name.txt': '/path/to/local/file.txt'})

        :rtype: list
        :returns: A list of :class:`requests.Response` objects.
        """
        queue_derive = True if queue_derive is None else queue_derive
        remote_dir_name = None
        if isinstance(files, dict):
            files = list(files.items())
        if not isinstance(files, (list, tuple)):
            files = [files]

        responses = []
        file_index = 0
        if checksum:
            total_files = recursive_file_count(files, item=self, checksum=True)
        else:
            total_files = recursive_file_count(files,
                                               item=self,
                                               checksum=False)
        for f in files:
            if (isinstance(f, string_types) and is_dir(f)) \
                    or (isinstance(f, tuple) and is_dir(f[-1])):
                if isinstance(f, tuple):
                    remote_dir_name = f[0].strip('/')
                    f = f[-1]
                for filepath, key in iter_directory(f):
                    file_index += 1
                    # Set derive header if queue_derive is True,
                    # and this is the last request being made.
                    if queue_derive is True and file_index >= total_files:
                        _queue_derive = True
                    else:
                        _queue_derive = False
                    if not f.endswith('/'):
                        if remote_dir_name:
                            key = '{0}{1}/{2}'.format(remote_dir_name, f, key)
                        else:
                            key = '{0}/{1}'.format(f, key)
                    elif remote_dir_name:
                        key = '{0}/{1}'.format(remote_dir_name, key)
                    key = norm_filepath(key)
                    resp = self.upload_file(filepath,
                                            key=key,
                                            metadata=metadata,
                                            headers=headers,
                                            access_key=access_key,
                                            secret_key=secret_key,
                                            queue_derive=_queue_derive,
                                            verbose=verbose,
                                            verify=verify,
                                            checksum=checksum,
                                            delete=delete,
                                            retries=retries,
                                            retries_sleep=retries_sleep,
                                            debug=debug,
                                            request_kwargs=request_kwargs)
                    responses.append(resp)
            else:
                file_index += 1
                # Set derive header if queue_derive is True,
                # and this is the last request being made.
                # if queue_derive is True and file_index >= len(files):
                if queue_derive is True and file_index >= total_files:
                    _queue_derive = True
                else:
                    _queue_derive = False

                if not isinstance(f, (list, tuple)):
                    key, body = (None, f)
                else:
                    key, body = f
                if key and not isinstance(key, string_types):
                    key = str(key)
                resp = self.upload_file(body,
                                        key=key,
                                        metadata=metadata,
                                        headers=headers,
                                        access_key=access_key,
                                        secret_key=secret_key,
                                        queue_derive=_queue_derive,
                                        verbose=verbose,
                                        verify=verify,
                                        checksum=checksum,
                                        delete=delete,
                                        retries=retries,
                                        retries_sleep=retries_sleep,
                                        debug=debug,
                                        request_kwargs=request_kwargs)
                responses.append(resp)
        return responses
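
The _queue_derive bookkeeping above ensures that only the last request of a batch asks IA to queue a derive task; recursive_file_count supplies the total up front so the rule also holds across directories. A minimal sketch of the rule in isolation (derive_flags is a hypothetical helper):

def derive_flags(total_files, queue_derive=True):
    # Mirrors the loop above: only the request whose 1-based index
    # reaches the precomputed total carries the derive flag.
    return [queue_derive and index >= total_files
            for index in range(1, total_files + 1)]

print(derive_flags(3))                      # [False, False, True]
print(derive_flags(3, queue_derive=False))  # [False, False, False]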
Example #7
    def upload(self, files,
               metadata=None,
               headers=None,
               access_key=None,
               secret_key=None,
               queue_derive=None,
               verbose=None,
               verify=None,
               checksum=None,
               delete=None,
               retries=None,
               retries_sleep=None,
               debug=None,
               request_kwargs=None):
        """Upload files to an item. The item will be created if it
        does not exist.

        :type files: str, file, list, tuple, dict
        :param files: The filepaths or file-like objects to upload.

        :param \*\*kwargs: Optional arguments that :func:`Item.upload_file()` takes.

        Usage::

            >>> import internetarchive
            >>> item = internetarchive.Item('identifier')
            >>> md = dict(mediatype='image', creator='Jake Johnson')
            >>> item.upload('/path/to/image.jpg', metadata=md, queue_derive=False)
            [<Response [200]>]

        Uploading multiple files::

            >>> r = item.upload(['file1.txt', 'file2.txt'])
            >>> r = item.upload([fileobj, fileobj2])
            >>> r = item.upload(('file1.txt', 'file2.txt'))

        Uploading file objects::

            >>> import io
            >>> f = io.BytesIO(b"some initial binary data: \\x00\\x01")
            >>> r = item.upload({'remote-name.txt': f})
            >>> f = io.BytesIO(b"some more binary data: \\x00\\x01")
            >>> f.name = 'remote-name.txt'
            >>> r = item.upload(f)

            *Note: file objects must either have a name attribute, or be uploaded in a
            dict where the key is the remote-name*

        Setting the remote filename with a dict::

            >>> r = item.upload({'remote-name.txt': '/path/to/local/file.txt'})

        :rtype: list
        :returns: A list of :class:`requests.Response` objects.
        """
        queue_derive = True if queue_derive is None else queue_derive
        remote_dir_name = None
        if isinstance(files, dict):
            files = list(files.items())
        if not isinstance(files, (list, tuple)):
            files = [files]

        responses = []
        file_index = 0
        if checksum:
            total_files = recursive_file_count(files, item=self, checksum=True)
        else:
            total_files = recursive_file_count(files, item=self, checksum=False)
        for f in files:
            if (isinstance(f, string_types) and is_dir(f)) \
                    or (isinstance(f, tuple) and is_dir(f[-1])):
                if isinstance(f, tuple):
                    remote_dir_name = f[0].strip('/')
                    f = f[-1]
                for filepath, key in iter_directory(f):
                    file_index += 1
                    # Set derive header if queue_derive is True,
                    # and this is the last request being made.
                    if queue_derive is True and file_index >= total_files:
                        _queue_derive = True
                    else:
                        _queue_derive = False
                    if not f.endswith('/'):
                        if remote_dir_name:
                            key = '{0}{1}/{2}'.format(remote_dir_name, f, key)
                        else:
                            key = '{0}/{1}'.format(f, key)
                    elif remote_dir_name:
                        key = '{0}/{1}'.format(remote_dir_name, key)
                    key = norm_filepath(key)
                    resp = self.upload_file(filepath,
                                            key=key,
                                            metadata=metadata,
                                            headers=headers,
                                            access_key=access_key,
                                            secret_key=secret_key,
                                            queue_derive=_queue_derive,
                                            verbose=verbose,
                                            verify=verify,
                                            checksum=checksum,
                                            delete=delete,
                                            retries=retries,
                                            retries_sleep=retries_sleep,
                                            debug=debug,
                                            request_kwargs=request_kwargs)
                    responses.append(resp)
            else:
                file_index += 1
                # Set derive header if queue_derive is True,
                # and this is the last request being made.
                # if queue_derive is True and file_index >= len(files):
                if queue_derive is True and file_index >= total_files:
                    _queue_derive = True
                else:
                    _queue_derive = False

                if not isinstance(f, (list, tuple)):
                    key, body = (None, f)
                else:
                    key, body = f
                if key and not isinstance(key, string_types):
                    key = str(key)
                resp = self.upload_file(body,
                                        key=key,
                                        metadata=metadata,
                                        headers=headers,
                                        access_key=access_key,
                                        secret_key=secret_key,
                                        queue_derive=_queue_derive,
                                        verbose=verbose,
                                        verify=verify,
                                        checksum=checksum,
                                        delete=delete,
                                        retries=retries,
                                        retries_sleep=retries_sleep,
                                        debug=debug,
                                        request_kwargs=request_kwargs)
                responses.append(resp)
        return responses
Example #8
    def upload_file(self, body,
                    key=None,
                    metadata=None,
                    headers=None,
                    access_key=None,
                    secret_key=None,
                    queue_derive=None,
                    verbose=None,
                    verify=None,
                    checksum=None,
                    delete=None,
                    retries=None,
                    retries_sleep=None,
                    debug=None,
                    request_kwargs=None):
        """Upload a single file to an item. The item will be created
        if it does not exist.

        :type body: Filepath or file-like object.
        :param body: File or data to be uploaded.

        :type key: str
        :param key: (optional) Remote filename.

        :type metadata: dict
        :param metadata: (optional) Metadata used to create a new item.

        :type headers: dict
        :param headers: (optional) Add additional IA-S3 headers to request.

        :type queue_derive: bool
        :param queue_derive: (optional) Set to False to prevent an item from
                             being derived after upload.

        :type verify: bool
        :param verify: (optional) Verify local MD5 checksum matches the MD5
                       checksum of the file received by IAS3.

        :type checksum: bool
        :param checksum: (optional) Skip based on checksum.

        :type delete: bool
        :param delete: (optional) Delete local file after the upload has been
                       successfully verified.

        :type retries: int
        :param retries: (optional) Number of times to retry the given request
                        if S3 returns a 503 SlowDown error.

        :type retries_sleep: int
        :param retries_sleep: (optional) Amount of time to sleep between
                              ``retries``.

        :type verbose: bool
        :param verbose: (optional) Print progress to stdout.

        :type debug: bool
        :param debug: (optional) Set to True to return the prepared request
                      without sending it (no data is uploaded).

        Usage::

            >>> import internetarchive
            >>> item = internetarchive.Item('identifier')
            >>> item.upload_file('/path/to/image.jpg',
            ...                  key='photos/image1.jpg')
            <Response [200]>
        """
        # Set defaults.
        headers = {} if headers is None else headers
        metadata = {} if metadata is None else metadata
        access_key = self.session.access_key if access_key is None else access_key
        secret_key = self.session.secret_key if secret_key is None else secret_key
        queue_derive = True if queue_derive is None else queue_derive
        verbose = False if verbose is None else verbose
        verify = True if verify is None else verify
        delete = False if delete is None else delete
        # Set checksum after delete.
        checksum = True if delete else checksum
        retries = 0 if retries is None else retries
        retries_sleep = 30 if retries_sleep is None else retries_sleep
        debug = False if debug is None else debug
        request_kwargs = {} if request_kwargs is None else request_kwargs
        if 'timeout' not in request_kwargs:
            request_kwargs['timeout'] = 120
        md5_sum = None

        if not hasattr(body, 'read'):
            filename = body
            body = open(body, 'rb')
        else:
            if key:
                filename = key
            else:
                filename = body.name

        size = get_file_size(body)

        # Support for uploading empty files.
        if size == 0:
            headers['Content-Length'] = '0'

        if not headers.get('x-archive-size-hint'):
            headers['x-archive-size-hint'] = str(size)

        # Build IA-S3 URL.
        key = norm_filepath(filename).split('/')[-1] if key is None else key
        base_url = '{0.session.protocol}//s3.us.archive.org/{0.identifier}'.format(self)
        url = '{0}/{1}'.format(
            base_url, urllib.parse.quote(norm_filepath(key).lstrip('/').encode('utf-8')))

        # Skip based on checksum.
        if checksum:
            md5_sum = get_md5(body)
            ia_file = self.get_file(key)
            if (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
                log.info('{f} already exists: {u}'.format(f=key, u=url))
                if verbose:
                    print(' {f} already exists, skipping.'.format(f=key))
                if delete:
                    log.info(
                        '{f} successfully uploaded to '
                        'https://archive.org/download/{i}/{f} '
                        'and verified, deleting '
                        'local copy'.format(i=self.identifier,
                                            f=key))
                    body.close()
                    os.remove(filename)
                # Return an empty response object if checksums match.
                # TODO: Is there a better way to handle this?
                body.close()
                return Response()

        # Require the Content-MD5 header when verify or delete is True.
        if verify or delete:
            if not md5_sum:
                md5_sum = get_md5(body)
            headers['Content-MD5'] = md5_sum

        def _build_request():
            body.seek(0, os.SEEK_SET)
            if verbose:
                try:
                    # hack to raise exception so we get some output for
                    # empty files.
                    if size == 0:
                        raise Exception

                    chunk_size = 1048576
                    expected_size = size / chunk_size + 1
                    chunks = chunk_generator(body, chunk_size)
                    progress_generator = progress.bar(
                        chunks,
                        expected_size=expected_size,
                        label=' uploading {f}: '.format(f=key))
                    data = IterableToFileAdapter(progress_generator, size)
                except:
                    print(' uploading {f}'.format(f=key))
                    data = body
            else:
                data = body

            headers.update(self.session.headers)
            request = S3Request(method='PUT',
                                url=url,
                                headers=headers,
                                data=data,
                                metadata=metadata,
                                access_key=access_key,
                                secret_key=secret_key,
                                queue_derive=queue_derive)
            return request

        if debug:
            prepared_request = self.session.prepare_request(_build_request())
            body.close()
            return prepared_request
        else:
            try:
                error_msg = ('s3 is overloaded, sleeping for '
                             '{0} seconds and retrying. '
                             '{1} retries left.'.format(retries_sleep, retries))
                while True:
                    if retries > 0:
                        if self.session.s3_is_overloaded(access_key):
                            sleep(retries_sleep)
                            log.info(error_msg)
                            if verbose:
                                print(' warning: {0}'.format(error_msg), file=sys.stderr)
                            retries -= 1
                            continue
                    request = _build_request()
                    prepared_request = request.prepare()

                    # chunked transfer-encoding is NOT supported by IA-S3.
                    # It should NEVER be set. Requests adds it in certain
                    # scenarios (e.g. if content-length is 0). Stop it.
                    if prepared_request.headers.get('transfer-encoding') == 'chunked':
                        del prepared_request.headers['transfer-encoding']

                    response = self.session.send(prepared_request,
                                                 stream=True,
                                                 **request_kwargs)
                    if (response.status_code == 503) and (retries > 0):
                        log.info(error_msg)
                        if verbose:
                            print(' warning: {0}'.format(error_msg), file=sys.stderr)
                        sleep(retries_sleep)
                        retries -= 1
                        continue
                    else:
                        if response.status_code == 503:
                            log.info('maximum retries exceeded, upload failed.')
                        break
                response.raise_for_status()
                log.info(u'uploaded {f} to {u}'.format(f=key, u=url))
                if delete and response.status_code == 200:
                    log.info(
                        '{f} successfully uploaded to '
                        'https://archive.org/download/{i}/{f} and verified, deleting '
                        'local copy'.format(i=self.identifier, f=key))
                    body.close()
                    os.remove(filename)
                body.close()
                return response
            except HTTPError as exc:
                body.close()
                msg = get_s3_xml_text(exc.response.content)
                error_msg = (' error uploading {0} to {1}, '
                             '{2}'.format(key, self.identifier, msg))
                log.error(error_msg)
                if verbose:
                    print(' error uploading {0}: {1}'.format(key, msg), file=sys.stderr)
                # Raise HTTPError with error message.
                raise type(exc)(error_msg, response=exc.response, request=exc.request)
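
A usage sketch for the retry loop above (identifier and values are illustrative): retries are consumed both when s3_is_overloaded() reports congestion before sending and when the server answers 503 SlowDown; once they are exhausted, the final 503 raises requests.HTTPError via raise_for_status().

import internetarchive

item = internetarchive.Item('my-identifier')  # hypothetical identifier
resp = item.upload_file('/path/to/large-file.bin',
                        retries=5,
                        retries_sleep=10)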
Example #9
    def upload_file(self,
                    body,
                    key=None,
                    metadata=None,
                    file_metadata=None,
                    headers=None,
                    access_key=None,
                    secret_key=None,
                    queue_derive=None,
                    verbose=None,
                    verify=None,
                    checksum=None,
                    delete=None,
                    retries=None,
                    retries_sleep=None,
                    debug=None,
                    validate_identifier=None,
                    request_kwargs=None):
        """Upload a single file to an item. The item will be created
        if it does not exist.

        :type body: Filepath or file-like object.
        :param body: File or data to be uploaded.

        :type key: str
        :param key: (optional) Remote filename.

        :type metadata: dict
        :param metadata: (optional) Metadata used to create a new item.

        :type file_metadata: dict
        :param file_metadata: (optional) File-level metadata to add to
                              the files.xml entry for the file being
                              uploaded.

        :type headers: dict
        :param headers: (optional) Add additional IA-S3 headers to request.

        :type queue_derive: bool
        :param queue_derive: (optional) Set to False to prevent an item from
                             being derived after upload.

        :type verify: bool
        :param verify: (optional) Verify local MD5 checksum matches the MD5
                       checksum of the file received by IAS3.

        :type checksum: bool
        :param checksum: (optional) Skip based on checksum.

        :type delete: bool
        :param delete: (optional) Delete local file after the upload has been
                       successfully verified.

        :type retries: int
        :param retries: (optional) Number of times to retry the given request
                        if S3 returns a 503 SlowDown error.

        :type retries_sleep: int
        :param retries_sleep: (optional) Amount of time to sleep between
                              ``retries``.

        :type verbose: bool
        :param verbose: (optional) Print progress to stdout.

        :type debug: bool
        :param debug: (optional) Set to True to return the prepared request
                      without sending it (no data is uploaded).

        :type validate_identifier: bool
        :param validate_identifier: (optional) Set to True to validate the identifier before
                                    uploading the file.

        Usage::

            >>> import internetarchive
            >>> item = internetarchive.Item('identifier')
            >>> item.upload_file('/path/to/image.jpg',
            ...                  key='photos/image1.jpg')
            <Response [200]>
        """
        # Set defaults.
        headers = headers or {}
        metadata = metadata or {}
        file_metadata = file_metadata or {}
        access_key = access_key or self.session.access_key
        secret_key = secret_key or self.session.secret_key
        queue_derive = bool(queue_derive)
        verbose = bool(verbose)
        verify = bool(verify)
        delete = bool(delete)
        # Set checksum after delete.
        checksum = delete or checksum
        retries = retries or 0
        retries_sleep = retries_sleep or 30
        debug = bool(debug)
        validate_identifier = bool(validate_identifier)
        request_kwargs = request_kwargs or {}
        if 'timeout' not in request_kwargs:
            request_kwargs['timeout'] = 120
        md5_sum = None

        _headers = headers.copy()

        if not hasattr(body, 'read'):
            filename = body
            body = open(body, 'rb')
        else:
            filename = key or body.name

        size = get_file_size(body)

        # Support for uploading empty files.
        if size == 0:
            _headers['Content-Length'] = '0'

        if not _headers.get('x-archive-size-hint'):
            _headers['x-archive-size-hint'] = str(size)

        # Build IA-S3 URL.
        if validate_identifier:
            validate_s3_identifier(self.identifier)
        key = norm_filepath(filename).split('/')[-1] if key is None else key
        base_url = f'{self.session.protocol}//s3.us.archive.org/{self.identifier}'
        url = f'{base_url}/{quote(norm_filepath(key).lstrip("/").encode("utf-8"))}'

        # Skip based on checksum.
        if checksum:
            md5_sum = get_md5(body)
            ia_file = self.get_file(key)
            if (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum):
                log.info(f'{key} already exists: {url}')
                if verbose:
                    print(f' {key} already exists, skipping.', file=sys.stderr)
                if delete:
                    log.info(
                        f'{key} successfully uploaded to '
                        f'https://archive.org/download/{self.identifier}/{key} '
                        'and verified, deleting local copy')
                    body.close()
                    os.remove(filename)
                # Return an empty response object if checksums match.
                # TODO: Is there a better way to handle this?
                body.close()
                return Response()

        # Require the Content-MD5 header when verify or delete is True.
        if verify or delete:
            if not md5_sum:
                md5_sum = get_md5(body)
            _headers['Content-MD5'] = md5_sum

        def _build_request():
            body.seek(0, os.SEEK_SET)
            if verbose:
                try:
                    # hack to raise exception so we get some output for
                    # empty files.
                    if size == 0:
                        raise Exception

                    chunk_size = 1048576
                    expected_size = math.ceil(size / chunk_size)
                    chunks = chunk_generator(body, chunk_size)
                    progress_generator = tqdm(chunks,
                                              desc=f' uploading {key}',
                                              dynamic_ncols=True,
                                              total=expected_size,
                                              unit='MiB')
                    data = IterableToFileAdapter(progress_generator, size)
                except:
                    print(f' uploading {key}', file=sys.stderr)
                    data = body
            else:
                data = body

            _headers.update(self.session.headers)
            request = S3Request(method='PUT',
                                url=url,
                                headers=_headers,
                                data=data,
                                metadata=metadata,
                                file_metadata=file_metadata,
                                access_key=access_key,
                                secret_key=secret_key,
                                queue_derive=queue_derive)
            return request

        if debug:
            prepared_request = self.session.prepare_request(_build_request())
            body.close()
            return prepared_request
        else:
            try:
                while True:
                    error_msg = ('s3 is overloaded, sleeping for '
                                 f'{retries_sleep} seconds and retrying. '
                                 f'{retries} retries left.')
                    if retries > 0:
                        if self.session.s3_is_overloaded(
                                access_key=access_key):
                            sleep(retries_sleep)
                            log.info(error_msg)
                            if verbose:
                                print(f' warning: {error_msg}',
                                      file=sys.stderr)
                            retries -= 1
                            continue
                    request = _build_request()
                    prepared_request = request.prepare()

                    # chunked transfer-encoding is NOT supported by IA-S3.
                    # It should NEVER be set. Requests adds it in certain
                    # scenarios (e.g. if content-length is 0). Stop it.
                    if prepared_request.headers.get(
                            'transfer-encoding') == 'chunked':
                        del prepared_request.headers['transfer-encoding']

                    response = self.session.send(prepared_request,
                                                 stream=True,
                                                 **request_kwargs)
                    if (response.status_code == 503) and (retries > 0):
                        log.info(error_msg)
                        if verbose:
                            print(f' warning: {error_msg}', file=sys.stderr)
                        sleep(retries_sleep)
                        retries -= 1
                        continue
                    else:
                        if response.status_code == 503:
                            log.info(
                                'maximum retries exceeded, upload failed.')
                        break
                response.raise_for_status()
                log.info(f'uploaded {key} to {url}')
                if delete and response.status_code == 200:
                    log.info(
                        f'{key} successfully uploaded to '
                        f'https://archive.org/download/{self.identifier}/{key} and verified, '
                        'deleting local copy')
                    body.close()
                    os.remove(filename)
                response.close()
                return response
            except HTTPError as exc:
                try:
                    msg = get_s3_xml_text(exc.response.content)
                except ExpatError:  # probably HTTP 500 error and response is invalid XML
                    msg = (
                        'IA S3 returned invalid XML '
                        f'(HTTP status code {exc.response.status_code}). '
                        'This is a server side error which is either temporary, '
                        'or requires the intervention of IA admins.')

                error_msg = f' error uploading {key} to {self.identifier}, {msg}'
                log.error(error_msg)
                if verbose:
                    print(f' error uploading {key}: {msg}', file=sys.stderr)
                # Raise HTTPError with error message.
                raise type(exc)(error_msg,
                                response=exc.response,
                                request=exc.request)
            finally:
                body.close()
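
Example #9 adds the file_metadata parameter, which is forwarded to S3Request and recorded in the item's files.xml entry for the uploaded file. A usage sketch (identifier, path, and field names are hypothetical):

import internetarchive

item = internetarchive.Item('my-identifier')  # hypothetical identifier
resp = item.upload_file('/path/to/photo.jpg',
                        key='photos/photo.jpg',
                        file_metadata={'title': 'A scanned photo'})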
Example #10
    def upload(
            self,
            files,
            metadata: Mapping | None = None,
            headers: dict | None = None,
            access_key: str | None = None,
            secret_key: str | None = None,
            queue_derive=None,  # TODO: True if None??
            verbose: bool = False,
            verify: bool = False,
            checksum: bool = False,
            delete: bool = False,
            retries: int | None = None,
            retries_sleep: int | None = None,
            debug: bool = False,
            validate_identifier: bool = False,
            request_kwargs: dict | None = None) -> list[Request | Response]:
        r"""Upload files to an item. The item will be created if it
        does not exist.

        :type files: str, file, list, tuple, dict
        :param files: The filepaths or file-like objects to upload.

        :param \*\*kwargs: Optional arguments that :func:`Item.upload_file()` takes.

        :returns: A list of :class:`requests.Response` objects.

        Usage::

            >>> import internetarchive
            >>> item = internetarchive.Item('identifier')
            >>> md = {'mediatype': 'image', 'creator': 'Jake Johnson'}
            >>> item.upload('/path/to/image.jpg', metadata=md, queue_derive=False)
            [<Response [200]>]

        Uploading multiple files::

            >>> r = item.upload(['file1.txt', 'file2.txt'])
            >>> r = item.upload([fileobj, fileobj2])
            >>> r = item.upload(('file1.txt', 'file2.txt'))

        Uploading file objects::

            >>> import io
            >>> f = io.BytesIO(b'some initial binary data: \x00\x01')
            >>> r = item.upload({'remote-name.txt': f})
            >>> f = io.BytesIO(b'some more binary data: \x00\x01')
            >>> f.name = 'remote-name.txt'
            >>> r = item.upload(f)

            *Note: file objects must either have a name attribute, or be uploaded in a
            dict where the key is the remote-name*

        Setting the remote filename with a dict::

            >>> r = item.upload({'remote-name.txt': '/path/to/local/file.txt'})
        """
        queue_derive = True if queue_derive is None else queue_derive
        remote_dir_name = None
        total_files = None
        if isinstance(files, dict):
            if files.get('name'):
                files = [files]
                total_files = 1
            else:
                files = list(files.items())
        if not isinstance(files, (list, tuple)):
            files = [files]
        if all(isinstance(f, dict) and f.get('name') for f in files):
            total_files = len(files)

        responses = []
        file_index = 0
        if queue_derive and total_files is None:
            if checksum:
                total_files = recursive_file_count(files,
                                                   item=self,
                                                   checksum=True)
            else:
                total_files = recursive_file_count(files,
                                                   item=self,
                                                   checksum=False)
        file_metadata = None
        for f in files:
            if isinstance(f, dict):
                if f.get('name'):
                    file_metadata = f.copy()
                    del file_metadata['name']
                    f = f['name']
            if ((isinstance(f, str) and is_dir(f))
                    or (isinstance(f, tuple) and is_dir(f[-1]))):
                if isinstance(f, tuple):
                    remote_dir_name = f[0].strip('/')
                    f = f[-1]
                for filepath, key in iter_directory(f):
                    file_index += 1
                    # Set derive header if queue_derive is True,
                    # and this is the last request being made.
                    if queue_derive is True and file_index >= total_files:
                        _queue_derive = True
                    else:
                        _queue_derive = False
                    if not f.endswith('/'):
                        if remote_dir_name:
                            key = f'{remote_dir_name}{f}/{key}'
                        else:
                            key = f'{f}/{key}'
                    elif remote_dir_name:
                        key = f'{remote_dir_name}/{key}'
                    key = norm_filepath(key)
                    resp = self.upload_file(
                        filepath,
                        key=key,
                        metadata=metadata,
                        file_metadata=file_metadata,
                        headers=headers,
                        access_key=access_key,
                        secret_key=secret_key,
                        queue_derive=_queue_derive,
                        verbose=verbose,
                        verify=verify,
                        checksum=checksum,
                        delete=delete,
                        retries=retries,
                        retries_sleep=retries_sleep,
                        debug=debug,
                        validate_identifier=validate_identifier,
                        request_kwargs=request_kwargs)
                    responses.append(resp)
            else:
                file_index += 1
                # Set derive header if queue_derive is True,
                # and this is the last request being made.
                # if queue_derive is True and file_index >= len(files):
                if queue_derive is True and file_index >= total_files:
                    _queue_derive = True
                else:
                    _queue_derive = False

                if not isinstance(f, (list, tuple)):
                    key, body = (None, f)
                else:
                    key, body = f
                if key and not isinstance(key, str):
                    key = str(key)
                resp = self.upload_file(
                    body,
                    key=key,
                    metadata=metadata,
                    file_metadata=file_metadata,
                    headers=headers,
                    access_key=access_key,
                    secret_key=secret_key,
                    queue_derive=_queue_derive,
                    verbose=verbose,
                    verify=verify,
                    checksum=checksum,
                    delete=delete,
                    retries=retries,
                    retries_sleep=retries_sleep,
                    debug=debug,
                    validate_identifier=validate_identifier,
                    request_kwargs=request_kwargs)
                responses.append(resp)
        return responses
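
Example #10's upload additionally accepts dicts carrying a 'name' key: the 'name' value is the file to upload and the remaining keys are passed through to upload_file as file_metadata. A usage sketch (identifier, paths, and fields are hypothetical):

import internetarchive

item = internetarchive.Item('my-identifier')  # hypothetical identifier
responses = item.upload([
    {'name': '/path/to/foo.txt', 'title': 'Foo'},
    {'name': '/path/to/bar.txt', 'title': 'Bar'},
])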