def _test_download_file(self, mocker, type):
    """Drive download_table_as_file through mocked BQ/GCS clients and
    verify the extract call, staging path, format, and final download."""
    staging_path = "gs://temp/"
    staging_file_name = "temp_0"
    dst_path = "/tmp/myfile.csv"
    table_id = "project_id.dataset_id.table_id"

    downloader = TableDownloader()
    blob = _Blob()
    mocker.patch.object(blob, "download_to_filename")
    downloader._bq = _Mock_BQ_Client()
    mocker.patch.object(downloader._bq, "extract_table", return_value=_Job())
    downloader._gcs = _Mock_GCS_Client()
    mocker.patch.object(
        downloader._gcs, "get_bucket", return_value=_Bucket(blob))

    downloader.download_table_as_file(
        table_id, dst_path, staging_location=staging_path, file_type=type)

    expected_staging = os.path.join(staging_path, staging_file_name)
    extract_calls = downloader._bq.extract_table.call_args_list
    assert len(extract_calls) == 1
    args, kwargs = extract_calls[0]
    assert args[0].full_table_id == Table.from_string(table_id).full_table_id
    assert args[1] == expected_staging
    assert kwargs["job_config"].destination_format == str(type)
    blob.download_to_filename.assert_called_once_with(dst_path)
def test_download_table_as_df(self, mocker):
    """download_table_as_df should extract the table to a staging path
    and then hand that path to gcs_to_df."""
    self._stop_time(mocker)
    gcs_to_df_mock = mocker.patch(
        "feast.sdk.utils.bq_util.gcs_to_df", return_value=None)

    staging_path = "gs://temp/"
    table_id = "project_id.dataset_id.table_id"
    expected_staging = os.path.join(staging_path, "temp_0")

    downloader = TableDownloader()
    downloader._bq = _Mock_BQ_Client()
    mocker.patch.object(downloader._bq, "extract_table", return_value=_Job())

    downloader.download_table_as_df(table_id, staging_location=staging_path)

    extract_calls = downloader._bq.extract_table.call_args_list
    assert len(extract_calls) == 1
    args, kwargs = extract_calls[0]
    assert args[0].full_table_id == Table.from_string(table_id).full_table_id
    assert args[1] == expected_staging
    assert kwargs["job_config"].destination_format == "CSV"
    gcs_to_df_mock.assert_called_once_with(expected_staging)
def download_table_as_df(self, full_table_id, staging_location):
    """Download a BigQuery table as a pandas DataFrame.

    Args:
        full_table_id (str): fully qualified BigQuery table id
        staging_location (str): url to staging_location (currently
            support a folder in GCS)

    Returns:
        pandas.DataFrame: dataframe of the training dataset
    """
    if not is_gs_path(staging_location):
        raise ValueError("staging_uri must be a directory in GCS")

    # Millisecond-timestamped temp name so concurrent downloads
    # land in distinct staging files.
    stage_path = os.path.join(
        staging_location, "temp_{}".format(int(round(time.time() * 1000))))

    extract_config = ExtractJobConfig()
    extract_config.destination_format = DestinationFormat.CSV
    extract_job = self.bq.extract_table(
        Table.from_string(full_table_id), stage_path,
        job_config=extract_config)
    extract_job.result()  # block until the extract completes
    return gcs_to_df(stage_path)
def download_table_as_file(self, full_table_id, dest, staging_location, file_type):
    """Download a BigQuery table to a local file.

    Args:
        full_table_id (str): fully qualified BigQuery table id
        dest (str): destination filename
        staging_location (str): url to staging_location (currently
            support a folder in GCS)
        file_type (feast.sdk.resources.feature_set.FileType):
            (default: FileType.CSV) exported file format

    Returns:
        (str) path to the downloaded file
    """
    if not is_gs_path(staging_location):
        raise ValueError("staging_uri must be a directory in GCS")

    # Millisecond-timestamped temp name so concurrent downloads
    # land in distinct staging files.
    stage_path = os.path.join(
        staging_location, "temp_{}".format(int(round(time.time() * 1000))))

    extract_config = ExtractJobConfig()
    extract_config.destination_format = file_type
    extract_job = self.bq.extract_table(
        Table.from_string(full_table_id), stage_path,
        job_config=extract_config)
    extract_job.result()  # block until the extract completes

    # Fetch the staged object from GCS down to the local destination.
    bucket_name, blob_name = split_gs_path(stage_path)
    bucket = self.gcs.get_bucket(bucket_name)
    bucket.blob(blob_name).download_to_filename(dest)
    return dest
def _test_download_file(self, mocker, type):
    """Drive download_table_as_file through a mocked BQ client and verify
    the sharded extract destination plus the gcs_folder_to_file handoff."""
    gcs_folder_to_file_mock = mocker.patch(
        "feast.sdk.utils.bq_util.gcs_folder_to_file", return_value=None)

    staging_path = "gs://temp"
    full_table_id = "project_id.dataset_id.table_id"
    dst_path = "/tmp/myfile.csv"
    expected_folder = os.path.join(staging_path, "temp_0")
    expected_staging = os.path.join(expected_folder, "shard_*")

    downloader = TableDownloader()
    downloader._bqclient = _Mock_BQ_Client()
    mocker.patch.object(
        downloader._bqclient, "extract_table", return_value=_Job())

    downloader.download_table_as_file(
        full_table_id, dst_path, staging_location=staging_path,
        file_type=type)

    extract_calls = downloader._bqclient.extract_table.call_args_list
    assert len(extract_calls) == 1
    args, kwargs = extract_calls[0]
    assert args[0].full_table_id == Table.from_string(
        full_table_id).full_table_id
    assert args[1] == expected_staging
    assert kwargs["job_config"].destination_format == str(type)
    gcs_folder_to_file_mock.assert_called_once_with(expected_folder, dst_path)
def download_table_as_file(self, full_table_id, dest, file_type,
                           staging_location=None):
    """Download a BigQuery table to a local file.

    Args:
        full_table_id (str): fully qualified BigQuery table id
        dest (str): destination filename
        file_type (feast.sdk.resources.feature_set.FileType):
            (default: FileType.CSV) exported file format
        staging_location (str, optional): url to staging_location
            (currently support a folder in GCS)

    Returns:
        (str) path to the downloaded file

    Raises:
        ValueError: if file_type is not CSV/JSON on the direct-download
            path, or staging_location is not a GCS directory.
    """
    if not staging_location:
        # No staging bucket supplied: fetch the table as a DataFrame
        # and serialize it locally with pandas.
        df = self.download_table_as_df(full_table_id)
        if file_type == FileType.CSV:
            df.to_csv(dest, index=False)
        elif file_type == FileType.JSON:
            # BUG FIX: pandas to_json() raises ValueError for
            # index=False unless orient is "split"/"table"; the
            # "records" orient omits the index, matching the intent.
            df.to_json(dest, orient="records")
        else:
            raise ValueError(
                "Only FileType: CSV and JSON are supported for download_table_as_file without staging location"
            )
        return dest

    if not is_gs_path(staging_location):
        raise ValueError("staging_uri must be a directory in GCS")

    # Millisecond-timestamped temp name so concurrent downloads
    # land in distinct staging files.
    temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
    staging_file_path = os.path.join(staging_location, temp_file_name)

    job_config = ExtractJobConfig()
    job_config.destination_format = file_type
    src_table = Table.from_string(full_table_id)
    job = self.bqclient.extract_table(src_table, staging_file_path,
                                      job_config=job_config)
    # await completion
    job.result()

    # Fetch the staged object from GCS down to the local destination.
    bucket_name, blob_name = split_gs_path(staging_file_path)
    bucket = self.storageclient.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.download_to_filename(dest)
    return dest
def __extract_table_to_shard_folder(self, full_table_id, staging_location,
                                    file_type):
    """Extract a BigQuery table into sharded files under a temp GCS folder.

    Returns the staging folder (str) that holds the exported shards.
    """
    # Millisecond-timestamped folder keeps concurrent extracts apart;
    # the "shard_*" wildcard lets BigQuery write multiple shards.
    shard_folder = os.path.join(
        staging_location, 'temp_%d' % int(round(time.time() * 1000)))
    destination_uri = os.path.join(shard_folder, "shard_*")

    extract_config = ExtractJobConfig()
    extract_config.destination_format = file_type
    extract_job = self.bqclient.extract_table(
        Table.from_string(full_table_id), destination_uri,
        job_config=extract_config)
    extract_job.result()  # block until the extract completes
    return shard_folder