def events_blobs_generator(self) -> Generator[blob.Blob, None, None]:
    """Generates pages of the specified BigQuery table as blobs.

    The first page is fetched before anything is yielded, so a table that
    cannot be read fails with an exception. Failures on later pages do not
    stop the iteration: a page whose fetch raises ``HttpError`` is reported
    as a blob with ``BlobStatus.ERROR`` and the generator moves on to the
    next page.

    Yields:
      blob: A blob object built from one page of the table. Consecutive
        pages start _DEFAULT_PAGE_SIZE rows apart.

    Raises:
      DataInConnectorError: Raised when the first page cannot be fetched,
        when it comes back as None, or when its 'totalRows' value cannot be
        converted to an int.
    """
    start_index = 0
    bq_cursor = self.get_conn().cursor()

    # Get the first page to ensure the accessibility.
    try:
        query_results = self._get_tabledata_with_retries(
            bq_cursor=bq_cursor, start_index=start_index)
    except googleapiclient_errors.HttpError as error:
        # Chain explicitly so the original HTTP failure is kept as the cause.
        raise errors.DataInConnectorError(
            error=error, msg=str(error)) from error

    if query_results is None:
        raise errors.DataInConnectorError(
            msg='Unable to get any blobs in {}.'.format(self.url))

    try:
        total_rows = int(query_results.get('totalRows'))
    except (AttributeError, TypeError, ValueError) as error:
        raise errors.DataInConnectorError(
            msg='Unable to get total rows in {}.'.format(self.url)) from error

    yield self._query_results_to_blob(query_results, start_index)
    start_index += _DEFAULT_PAGE_SIZE

    # Get the remaining pages of the requested table.
    while start_index < total_rows:
        try:
            query_results = self._get_tabledata_with_retries(
                bq_cursor=bq_cursor, start_index=start_index)
        except googleapiclient_errors.HttpError as error:
            # Generate a blob with error status instead of aborting, so the
            # consumer can still receive the pages that follow.
            blob_unique_id = '{}/{}'.format(self.url, start_index)
            yield blob.Blob(events=[],
                            blob_id=blob_unique_id,
                            platform=_PLATFORM,
                            source=self.dataset_id,
                            location=self.table_id,
                            position=start_index,
                            status=blob.BlobStatus.ERROR,
                            status_desc=str(error))
        else:
            yield self._query_results_to_blob(query_results, start_index)
        finally:
            # Advance whether the page succeeded or failed.
            start_index += _DEFAULT_PAGE_SIZE
def events_blobs_generator(self) -> Generator[blob.Blob, None, None]:
    """Generates all blobs from the bucket's prefix location.

    Blob names under ``self.prefix`` in ``self.bucket`` are listed once.
    Names ending in '/' (folders) are skipped. For every other name the
    events are read with ``get_blob_events`` and wrapped in a Blob. If that
    raises a DataInConnectorBlobParseError or DataInConnectorError, a Blob
    with ``BlobStatus.ERROR`` and no events is yielded instead, and the
    iteration continues with the next name.

    Yields:
      Blob objects built from blob contents within the prefix location in
      the bucket.

    Raises:
      DataInConnectorError: When listing blobs in the bucket raises an
        HttpError.
    """
    try:
        blob_names = self.list(bucket=self.bucket, prefix=self.prefix)
    except googleapiclient_errors.HttpError as error:
        # Chain explicitly so the original HTTP failure is kept as the cause.
        raise errors.DataInConnectorError(
            error=error,
            msg='Failed to get list of blobs from bucket.') from error

    for blob_name in blob_names:
        # Exclude folders from uploading to Datastore.
        if blob_name.endswith('/'):
            continue

        url = 'gs://{}/{}'.format(self.bucket, blob_name)
        try:
            events = self.get_blob_events(blob_name)
            yield blob.Blob(events=events,
                            blob_id=url,
                            platform=_PLATFORM,
                            source=self.bucket,
                            location=blob_name,
                            position=_START_POSITION_IN_BLOB)
        except (errors.DataInConnectorBlobParseError,
                errors.DataInConnectorError) as error:
            # Report the failure as an error blob rather than aborting the
            # whole listing.
            yield blob.Blob(events=[],
                            blob_id=url,
                            platform=_PLATFORM,
                            source=self.bucket,
                            location=blob_name,
                            position=_START_POSITION_IN_BLOB,
                            status=blob.BlobStatus.ERROR,
                            status_desc=str(error))
def _gcs_blob_chunk_generator(
        self, blob_name: Text) -> Generator[bytes, None, None]:
    """Downloads and generates chunks from given blob.

    The base GoogleCloudStorageHook only allows downloading an entire file.
    To enable handling large files this class provides a chunk-wise download
    of bytes within the blob.

    Args:
      blob_name: Unique location within the bucket for the target blob.

    Yields:
      Chunks of the given blob, formatted as bytes.

    Raises:
      DataInConnectorError: When building the download request or fetching
        a chunk raises an HttpError.
    """
    done = False
    outio = io.BytesIO()
    try:
        request = self.get_conn().objects().get_media(
            bucket=self.bucket,  # pytype: disable=attribute-error
            object=blob_name)
        downloader = http.MediaIoBaseDownload(outio, request)
    except googleapiclient_errors.HttpError as error:
        # Chain explicitly so the original HTTP failure is kept as the cause.
        raise errors.DataInConnectorError(
            error=error, msg='Failed to download the blob.') from error

    while not done:
        # Empty and rewind the shared buffer so that the value yielded below
        # holds only what this iteration's next_chunk() call produced.
        outio.truncate(0)
        outio.seek(0)
        try:
            status, done = downloader.next_chunk()
        except googleapiclient_errors.HttpError as error:
            raise errors.DataInConnectorError(
                error=error, msg='Failed to download the blob.') from error
        self.log.debug('Blob loading: {}%'.format(
            int(status.progress() * 100)))
        yield outio.getvalue()
def test_events_blobs_generator_raises_data_in_connector_error(self):
    """Checks that a DataInConnectorError from listing reaches the caller."""
    self.mocked_list.side_effect = errors.DataInConnectorError()
    blobs = self.gcs_hook.events_blobs_generator()

    # The generator body only runs once the first item is requested.
    with self.assertRaises(errors.DataInConnectorError):
        next(blobs)