示例#1
0
  def events_blobs_generator(self) -> Generator[blob.Blob, None, None]:
    """Generates pages of specified BigQuery table as blobs.

    The first page is fetched up front to make sure the table is accessible
    and to read its 'totalRows' value; any failure there aborts the generator.
    A failure on a later page does not abort iteration: it is reported as a
    blob with ERROR status and the generator moves on to the next page.

    Yields:
      blob: A blob object built from one page of the specified BigQuery table.
      Pages start at index 0 and advance by _DEFAULT_PAGE_SIZE. A page that
      fails with HttpError after the first one is yielded as a blob with no
      events, status BlobStatus.ERROR and the error text as status_desc.

    Raises:
      DataInConnectorError: Raised when the first page cannot be fetched
        (HttpError), when it is None, or when its 'totalRows' value cannot be
        converted to an int.
    """
    start_index = 0
    bq_cursor = self.get_conn().cursor()

    # Get the first page to ensure the accessibility.
    try:
      query_results = self._get_tabledata_with_retries(bq_cursor=bq_cursor,
                                                       start_index=start_index)
    except googleapiclient_errors.HttpError as error:
      raise errors.DataInConnectorError(error=error, msg=str(error)) from error

    if query_results is None:
      raise errors.DataInConnectorError(
          msg='Unable to get any blobs in {}.'.format(self.url))

    try:
      total_rows = int(query_results.get('totalRows'))
    except (AttributeError, TypeError, ValueError) as error:
      # Chain the conversion failure so the root cause stays in the traceback.
      raise errors.DataInConnectorError(
          msg='Unable to get total rows in {}.'.format(self.url)) from error

    yield self._query_results_to_blob(query_results, start_index)
    start_index += _DEFAULT_PAGE_SIZE

    # Get the remaining pages of the requested table.
    while start_index < total_rows:
      try:
        query_results = self._get_tabledata_with_retries(
            bq_cursor=bq_cursor, start_index=start_index)
      except googleapiclient_errors.HttpError as error:
        # Generate a blob with error status instead of stopping the iteration.
        blob_unique_id = '{}/{}'.format(self.url, start_index)
        page_blob = blob.Blob(events=[], blob_id=blob_unique_id,
                              platform=_PLATFORM, source=self.dataset_id,
                              location=self.table_id, position=start_index,
                              status=blob.BlobStatus.ERROR,
                              status_desc=str(error))
      else:
        page_blob = self._query_results_to_blob(query_results, start_index)

      # Yield outside the try statement so that only the fetch is guarded.
      yield page_blob
      start_index += _DEFAULT_PAGE_SIZE
示例#2
0
  def events_blobs_generator(self) -> Generator[blob.Blob, None, None]:
    """Generates all blobs from the bucket's prefix location.

    Names returned by the listing that end with '/' are treated as folders and
    skipped. Every other name produces exactly one Blob.

    Yields:
      Blob objects built from blob contents within a prefix location in the
      bucket. When reading the events of a blob raises
      DataInConnectorBlobParseError or DataInConnectorError, a Blob with no
      events, status BlobStatus.ERROR and the error text as status_desc is
      yielded instead, and iteration continues with the next name.

    Raises:
      DataInConnectorError: When listing blob in bucket returns a HttpError.
    """
    try:
      blob_names = self.list(bucket=self.bucket, prefix=self.prefix)
    except googleapiclient_errors.HttpError as error:
      # Chain the HttpError so the original failure stays in the traceback.
      raise errors.DataInConnectorError(
          error=error,
          msg='Failed to get list of blobs from bucket.') from error

    for blob_name in blob_names:
      # Exclude folders from uploading to Datastore.
      if blob_name.endswith('/'):
        continue

      url = 'gs://{}/{}'.format(self.bucket, blob_name)
      try:
        events = self.get_blob_events(blob_name)
        yield blob.Blob(events=events, blob_id=url, platform=_PLATFORM,
                        source=self.bucket, location=blob_name,
                        position=_START_POSITION_IN_BLOB)
      except (errors.DataInConnectorBlobParseError,
              errors.DataInConnectorError) as error:
        # Report the failure as an error blob rather than aborting the run.
        yield blob.Blob(events=[], blob_id=url, platform=_PLATFORM,
                        source=self.bucket, location=blob_name,
                        position=_START_POSITION_IN_BLOB,
                        status=blob.BlobStatus.ERROR, status_desc=str(error))
示例#3
0
    def _gcs_blob_chunk_generator(
            self, blob_name: Text) -> Generator[bytes, None, None]:
        """Downloads and generates chunks from given blob.

        The base GoogleCloudStorageHook only allows downloading an entire file.
        To enable handling large files this class provides a chunk-wise
        download of bytes within the blob.

        Args:
          blob_name: Unique location within the bucket for the target blob.

        Yields:
          Chunks of the given blob, formatted as bytes.

        Raises:
          DataInConnectorError: When download failed.
        """
        done = False
        outio = io.BytesIO()
        try:
            request = self.get_conn().objects().get_media(
                bucket=self.bucket,  # pytype: disable=attribute-error
                object=blob_name)
            downloader = http.MediaIoBaseDownload(outio, request)
        except googleapiclient_errors.HttpError as error:
            # Chain the HttpError so the root cause stays in the traceback.
            raise errors.DataInConnectorError(
                error=error, msg='Failed to download the blob.') from error

        while not done:
            # Reuse a single buffer: empty it and rewind before every chunk so
            # that getvalue() below returns only what this iteration wrote.
            outio.truncate(0)
            outio.seek(0)

            try:
                status, done = downloader.next_chunk()
            except googleapiclient_errors.HttpError as error:
                raise errors.DataInConnectorError(
                    error=error, msg='Failed to download the blob.') from error

            self.log.debug('Blob loading: {}%'.format(
                int(status.progress() * 100)))
            yield outio.getvalue()
示例#4
0
    def test_events_blobs_generator_raises_data_in_connector_error(self):
        """Advancing the generator raises DataInConnectorError.

        `self.mocked_list` is configured to raise DataInConnectorError; the
        first item requested from `events_blobs_generator()` must surface an
        error of that type.
        """
        self.mocked_list.side_effect = errors.DataInConnectorError()

        with self.assertRaises(errors.DataInConnectorError):
            # Use the built-in next() instead of calling __next__() directly.
            next(self.gcs_hook.events_blobs_generator())