def __init__(
        self,
        bucket_name,
        blob_name,
        read_size=QuerybookSettings.STORE_READ_SIZE,
        max_read_size=QuerybookSettings.STORE_MAX_READ_SIZE,
    ):
        """Prepare a chunked download of ``bucket_name/blob_name`` from GCS.

        Raises:
            FileDoesNotExist: when the blob cannot be found in the bucket.
        """
        from google.auth.transport.requests import AuthorizedSession
        from google.cloud import storage
        from google.resumable_media.requests import ChunkedDownload

        # Verify the blob exists before building any transport machinery.
        credentials = get_google_credentials()
        gcs_client = storage.Client(
            project=credentials.project_id, credentials=credentials
        )
        target_blob = gcs_client.bucket(bucket_name).blob(blob_name)
        if not target_blob.exists():
            raise FileDoesNotExist(
                "{}/{} does not exist".format(bucket_name, blob_name)
            )

        # Authorized HTTP session plus the in-memory stream that
        # ChunkedDownload will write fetched bytes into.
        self._transport = AuthorizedSession(credentials=gcs_client._credentials)
        self._stream = BytesIO()

        media_url = (
            f"https://storage.googleapis.com/storage/v1/b/"
            f"{bucket_name}/o/{quote(blob_name, safe='')}?alt=media")

        self._download = ChunkedDownload(media_url, read_size, self._stream)

        super(GoogleDownloadClient, self).__init__(read_size, max_read_size)
# Example 2
 def _download_blob(self, blob):
     """Download one blob in 10MB chunks, updating status as chunks land."""
     target = self._destination_file(blob.name)
     session = _create_transport()
     ten_megabytes = 10 * 1024 * 1024
     chunked = ChunkedDownload(blob.media_link, ten_megabytes, target)
     # Remember the running total so per-chunk progress is cumulative.
     bytes_before = self._status.downloaded_bytes
     while not chunked.finished:
         chunked.consume_next_chunk(session)
         self._update_status(
             downloaded_bytes=bytes_before + chunked.bytes_downloaded)
     self._update_status(downloaded_files=self._status.downloaded_files + 1)
# Example 3
def main(args):
    """Download or upload a file to the hutoma-datasets GCS bucket.

    ``args.operation`` selects "download" or "upload"; ``args.file_location``
    is the local file path. Raises DataError when the source blob/file is
    missing or when an upload fails to initiate.
    """
    file_location = Path(args.file_location)
    file_name = file_location.name
    local_file = file_location
    client = storage.Client()
    blob_folder = "word2vec_service/v2"
    bucket_name = "hutoma-datasets"
    bucket = client.get_bucket(bucket_name)
    blob_path = "{}/{}".format(blob_folder, file_name)
    blob = bucket.blob(blob_path)
    bytes_in_1MB = 1024 * 1024

    print("Operation {}: blob is {}, local file is {}".format(
        args.operation, blob_path, local_file))
    transport = g_requests.AuthorizedSession(credentials=client._credentials)

    if args.operation == "download":
        if not blob.exists():
            raise DataError("Blob {} doesn't exist".format(blob_path))
        if local_file.exists():
            confirm_prompt("File {} exists, overwrite?".format(local_file))
        url = ("https://www.googleapis.com/download/storage/v1/b/"
               "{bucket}/o/{blob_name}?alt=media").format(
                   bucket=bucket_name,
                   blob_name=urllib.parse.quote_plus(blob_path))
        chunk_size = bytes_in_1MB * 5  # 5MB
        with local_file.open("wb") as file_stream:
            download = ChunkedDownload(url, chunk_size, file_stream)
            # FIX: removed a bare no-op `download.finished` expression that
            # evaluated the property and discarded the result.
            download.consume_next_chunk(transport)
            if not download.finished:
                process_operation(transport, download)

    elif args.operation == "upload":
        if not local_file.exists():
            # FIX: the message previously reported blob_path even though the
            # missing item is the local file.
            raise DataError("File {} doesn't exist".format(local_file))
        if blob.exists():
            # FIX: the prompt previously showed the local path instead of the
            # blob that would be overwritten.
            confirm_prompt("Blob {} exists, overwrite?".format(blob_path))

        url = ("https://www.googleapis.com/upload/storage/v1/b/{bucket}" +
               "/o?uploadType=resumable").format(bucket=bucket_name)
        chunk_size = bytes_in_1MB  # 1MB
        upload = ResumableUpload(url, chunk_size)
        metadata = {"name": blob_path}
        content_type = "application/octet-stream"

        with local_file.open("rb") as file_stream:
            response = upload.initiate(transport, file_stream, metadata,
                                       content_type)
            if response.status_code != 200:
                raise DataError("Failed to initiate upload")
            process_operation(transport, upload)
# Example 4
 def _make_download(self):
     """Build the download helper appropriate for the wrapped blob."""
     self.file_obj = io.BytesIO()
     url = self.blob._get_download_url()
     headers = storage.blob._get_encryption_headers(
         self.blob._encryption_key)
     headers['accept-encoding'] = 'gzip'
     if self.blob.chunk_size is not None:
         # Chunked transfer: honor the blob's chunk size and start offset.
         begin = self.start if self.start else 0
         self.download = ChunkedDownload(url,
                                         self.blob.chunk_size,
                                         self.file_obj,
                                         headers=headers,
                                         start=begin,
                                         end=self.end)
     else:
         # Single-shot transfer of the whole requested range.
         self.download = Download(url,
                                  stream=self.file_obj,
                                  headers=headers,
                                  start=self.start,
                                  end=self.end)
# Example 5 — file: net.py, project: elastic/rally
def _download_from_gcs_bucket(bucket_name,
                              bucket_path,
                              local_path,
                              expected_size_in_bytes=None,
                              progress_indicator=None):
    """Stream a GCS object to ``local_path``, optionally reporting progress."""
    # pylint: disable=import-outside-toplevel
    # lazily initialize Google Cloud Storage support - we might not need it
    import google.auth
    import google.auth.transport.requests as tr_requests
    import google.oauth2.credentials

    # Using Google Resumable Media as the standard storage library doesn't support progress
    # (https://github.com/googleapis/python-storage/issues/27)
    from google.resumable_media.requests import ChunkedDownload

    read_only_scope = "https://www.googleapis.com/auth/devstorage.read_only"

    token = os.environ.get("GOOGLE_AUTH_TOKEN")
    if token:
        # An explicit token wins over application-default credentials.
        credentials = google.oauth2.credentials.Credentials(
            token=token, scopes=(read_only_scope, ))
    else:
        # https://google-auth.readthedocs.io/en/latest/user-guide.html
        credentials, _ = google.auth.default(scopes=(read_only_scope, ))

    session = tr_requests.AuthorizedSession(credentials)
    fifty_megabytes = 50 * 1024 * 1024

    with open(local_path, "wb") as sink:
        object_url = _build_gcs_object_url(bucket_name, bucket_path)
        chunked = ChunkedDownload(object_url, fifty_megabytes, sink)
        # Fetch the first chunk eagerly so total_bytes becomes available.
        chunked.consume_next_chunk(session)
        if not expected_size_in_bytes:
            expected_size_in_bytes = chunked.total_bytes
        while not chunked.finished:
            if progress_indicator and chunked.bytes_downloaded and chunked.total_bytes:
                progress_indicator(chunked.bytes_downloaded,
                                   expected_size_in_bytes)
            chunked.consume_next_chunk(session)
# Example 6
# Download a blob, replace commas with pipes, and re-upload the result.
# URL templates for the GCS JSON API media download / resumable upload.
url_template = (u'https://www.googleapis.com/download/storage/v1/b/'
                u'{bucket}/o/{blob_name}?alt=media')

url_template_upload = (
    u'https://www.googleapis.com/upload/storage/v1/b/{bucket}/o?'
    u'uploadType=resumable')

upload_url = url_template_upload.format(bucket=bucket_upload)

media_url = url_template.format(bucket=bucket, blob_name=blob_name)

chunk_size = 1 * 1024 * 1024  # 1MB chunks for both directions
stream = io.BytesIO()

download = ChunkedDownload(media_url, chunk_size, stream)
upload = ResumableUpload(upload_url, chunk_size)

# FIX: loop conditions used `... != True` (PEP 8 E712); compare truthiness.
data = []
while not download.finished:
    response = download.consume_next_chunk(transport)
    data.append(response.content.decode("utf-8").replace(',', '|'))

new_data = ''.join(data)
stream_upload = io.BytesIO(bytes(new_data, 'UTF-8'))
metadata = {u'name': blob_name_upload}
# FIX: renamed misspelled local `reponse_upload` (not referenced afterwards).
response_upload = upload.initiate(transport, stream_upload, metadata,
                                  content_type)

while not upload.finished:
    upload.transmit_next_chunk(transport)
# Example 7
class BlobReader(io.IOBase):
    """Read-only, seekable, file-like view over a cloud storage blob.

    Bytes are fetched lazily into an in-memory buffer via ``Download``
    (single request) or ``ChunkedDownload`` (when the blob defines a
    ``chunk_size``).
    """

    def __init__(self, blob, start=0, end=None, client=None):
        self.blob = blob        # the storage blob to read from
        self.start = start      # absolute offset where the buffer begins
        self.end = end          # optional end offset of the readable range
        self.client = client    # optional client used to obtain a transport
        self.download = None    # lazily-created download helper
        self.file_obj = None    # io.BytesIO buffer of fetched bytes

    def _make_download(self):
        """(Re)build the download helper for the current start/end window."""
        self.file_obj = io.BytesIO()
        download_url = self.blob._get_download_url()
        headers = storage.blob._get_encryption_headers(
            self.blob._encryption_key)
        headers['accept-encoding'] = 'gzip'
        if self.blob.chunk_size is None:
            self.download = Download(download_url,
                                     stream=self.file_obj,
                                     headers=headers,
                                     start=self.start,
                                     end=self.end)
        else:
            self.download = ChunkedDownload(
                download_url,
                self.blob.chunk_size,
                self.file_obj,
                headers=headers,
                start=self.start if self.start else 0,
                end=self.end)

    def read(self, size=-1):
        """Read up to ``size`` bytes (all remaining when ``size`` < 0).

        Serves from the local buffer first, then rebases the window and
        fetches more from the blob when the buffer is exhausted.
        """
        value = b''
        if self.file_obj.tell() < len(self.file_obj.getvalue()):
            value = self.file_obj.read(size)
        if len(value) < size or size < 0:
            self.start += self.file_obj.tell()
            self._make_download()
            transport = self.blob._get_transport(self.client)
            if self.blob.chunk_size is None:
                self.download.consume(transport)
            else:
                self.download.consume_next_chunk(transport)
            self.file_obj.seek(0)
            value += self.file_obj.read(size - len(value))
        return value

    def seekable(self):
        """This reader supports random access."""
        return True

    def seek(self, offset, whence=0):
        """Move the read position.

        Only ``whence`` 0 (absolute) and 1 (relative) are supported.

        Raises:
            ValueError: if ``whence`` is 2 (seek-from-end is unsupported).
        """
        if whence == 0:
            self.start = offset or 0
            self._make_download()
            return self.start
        elif whence == 1:
            if self.file_obj is None:
                self.start += offset
                return self.start
            pos = self.file_obj.tell() + offset
            if pos < 0 or pos > len(self.file_obj.getvalue()):
                # Target falls outside the buffered window: rebase + refetch.
                self.start += pos
                self._make_download()
                return self.start
            self.file_obj.seek(offset, 1)
            return self.start + self.file_obj.tell()
        else:
            # FIX: was `assert False, ...` — asserts are stripped under
            # `python -O`, silently allowing an unsupported seek. Raise
            # ValueError as file-like objects conventionally do.
            raise ValueError("whence == 2 is not supported")