Пример #1
0
    def _test_download_file(self, mocker, type):
        """
        Shared check for ``TableDownloader.download_table_as_file``.

        Replaces the BigQuery and GCS clients with mocks, runs the download
        and verifies that exactly one extract job was requested with the
        expected table, staging path and destination format, and that the
        staged blob was downloaded to the destination path.

        Args:
            mocker: pytest-mock fixture used to patch the client methods
            type: file type forwarded as ``file_type``
        """
        table_ref = "project_id.dataset_id.table_id"
        local_file = "/tmp/myfile.csv"
        staging_root = "gs://temp/"
        # NOTE(review): the staged name "temp_0" implies time.time() is
        # frozen at 0 by the caller -- confirm.
        expected_uri = os.path.join(staging_root, "temp_0")

        blob = _Blob()
        mocker.patch.object(blob, "download_to_filename")

        downloader = TableDownloader()
        downloader._bq = _Mock_BQ_Client()
        mocker.patch.object(
            downloader._bq, "extract_table", return_value=_Job())
        downloader._gcs = _Mock_GCS_Client()
        mocker.patch.object(
            downloader._gcs, "get_bucket", return_value=_Bucket(blob))

        downloader.download_table_as_file(
            table_ref,
            local_file,
            staging_location=staging_root,
            file_type=type,
        )

        # Exactly one extract job must have been submitted.
        extract_mock = downloader._bq.extract_table
        assert extract_mock.call_count == 1
        call_args, call_kwargs = extract_mock.call_args_list[0]
        expected_table = Table.from_string(table_ref)
        assert call_args[0].full_table_id == expected_table.full_table_id
        assert call_args[1] == expected_uri
        assert call_kwargs["job_config"].destination_format == str(type)

        blob.download_to_filename.assert_called_once_with(local_file)
Пример #2
0
    def test_download_table_as_df(self, mocker):
        """
        ``download_table_as_df`` must submit a single CSV extract job to the
        staging path and pass that same path to ``gcs_to_df``.
        """
        self._stop_time(mocker)
        gcs_to_df_mock = mocker.patch(
            "feast.sdk.utils.bq_util.gcs_to_df", return_value=None)

        table_ref = "project_id.dataset_id.table_id"
        staging_root = "gs://temp/"
        # NOTE(review): "temp_0" presumably follows from _stop_time freezing
        # the clock at 0 -- confirm against that helper.
        expected_uri = os.path.join(staging_root, "temp_0")

        downloader = TableDownloader()
        downloader._bq = _Mock_BQ_Client()
        mocker.patch.object(
            downloader._bq, "extract_table", return_value=_Job())

        downloader.download_table_as_df(
            table_ref, staging_location=staging_root)

        # Exactly one extract job must have been submitted.
        extract_mock = downloader._bq.extract_table
        assert extract_mock.call_count == 1
        call_args, call_kwargs = extract_mock.call_args_list[0]
        expected_table = Table.from_string(table_ref)
        assert call_args[0].full_table_id == expected_table.full_table_id
        assert call_args[1] == expected_uri
        assert call_kwargs["job_config"].destination_format == "CSV"
        gcs_to_df_mock.assert_called_once_with(expected_uri)
Пример #3
0
    def download_table_as_df(self, full_table_id, staging_location):
        """
        Export a BigQuery table as CSV to a staging location and load it
        through ``gcs_to_df``.

        Args:
            full_table_id (str): fully qualified BigQuery table id
            staging_location (str): url to the staging location (currently
                only a folder in GCS is supported)

        Returns:
            the result of ``gcs_to_df`` for the staged file (expected to be
            a pandas.DataFrame)

        Raises:
            ValueError: if ``staging_location`` is not a GCS path
        """
        if not is_gs_path(staging_location):
            raise ValueError("staging_uri must be a directory in GCS")

        # The staged object is named after the current time in milliseconds.
        millis = int(round(time.time() * 1000))
        staged_uri = os.path.join(staging_location, "temp_{}".format(millis))

        extract_config = ExtractJobConfig()
        extract_config.destination_format = DestinationFormat.CSV

        extract_job = self.bq.extract_table(
            Table.from_string(full_table_id),
            staged_uri,
            job_config=extract_config,
        )
        # Block until the extract job has completed.
        extract_job.result()

        return gcs_to_df(staged_uri)
Пример #4
0
    def download_table_as_file(self, full_table_id, dest, staging_location,
                               file_type):
        """
        Export a BigQuery table to a staging location in GCS and download
        the staged object to a local file.

        Args:
            full_table_id (str): fully qualified BigQuery table id
            dest (str): destination filename
            staging_location (str): url to the staging location (currently
                only a folder in GCS is supported)
            file_type (feast.sdk.resources.feature_set.FileType): exported
                file format, used as the extract job's destination format

        Returns:
            (str) path to the downloaded file, i.e. ``dest``

        Raises:
            ValueError: if ``staging_location`` is not a GCS path
        """
        if not is_gs_path(staging_location):
            raise ValueError("staging_uri must be a directory in GCS")

        # The staged object is named after the current time in milliseconds.
        millis = int(round(time.time() * 1000))
        staged_uri = os.path.join(staging_location, "temp_{}".format(millis))

        extract_config = ExtractJobConfig()
        extract_config.destination_format = file_type
        table = Table.from_string(full_table_id)
        extract_job = self.bq.extract_table(
            table, staged_uri, job_config=extract_config)
        # Block until the extract job has completed.
        extract_job.result()

        # Fetch the staged object from its bucket into the local destination.
        bucket_name, blob_name = split_gs_path(staged_uri)
        staged_blob = self.gcs.get_bucket(bucket_name).blob(blob_name)
        staged_blob.download_to_filename(dest)
        return dest
Пример #5
0
    def _test_download_file(self, mocker, type):
        """
        Shared check for the shard-folder based ``download_table_as_file``.

        Verifies that one extract job is submitted for the ``shard_*``
        pattern inside the expected staging folder and that
        ``gcs_folder_to_file`` is invoked with that folder and the
        destination path.

        Args:
            mocker: pytest-mock fixture used to patch collaborators
            type: file type forwarded as ``file_type``
        """
        folder_to_file_mock = mocker.patch(
            "feast.sdk.utils.bq_util.gcs_folder_to_file", return_value=None)

        table_ref = "project_id.dataset_id.table_id"
        local_file = "/tmp/myfile.csv"
        staging_root = "gs://temp"
        # NOTE(review): the folder name "temp_0" implies time.time() is
        # frozen at 0 by the caller -- confirm.
        expected_folder = os.path.join(staging_root, "temp_0")
        expected_pattern = os.path.join(expected_folder, "shard_*")

        downloader = TableDownloader()
        downloader._bqclient = _Mock_BQ_Client()
        mocker.patch.object(
            downloader._bqclient, "extract_table", return_value=_Job())

        downloader.download_table_as_file(
            table_ref,
            local_file,
            staging_location=staging_root,
            file_type=type,
        )

        # Exactly one extract job must have been submitted.
        extract_mock = downloader._bqclient.extract_table
        assert extract_mock.call_count == 1
        call_args, call_kwargs = extract_mock.call_args_list[0]
        expected_table = Table.from_string(table_ref)
        assert call_args[0].full_table_id == expected_table.full_table_id
        assert call_args[1] == expected_pattern
        assert call_kwargs["job_config"].destination_format == str(type)
        folder_to_file_mock.assert_called_once_with(
            expected_folder, local_file)
Пример #6
0
    def download_table_as_file(self,
                               full_table_id,
                               dest,
                               file_type,
                               staging_location=None):
        """
        Download a bigquery table as file

        Without a staging location the table is loaded through
        ``download_table_as_df`` and written to ``dest`` locally (CSV and
        JSON only). With a staging location the table is extracted to GCS
        and the staged object is downloaded to ``dest``.

        Args:
            full_table_id (str): fully qualified BigQuery table id
            dest (str): destination filename
            file_type (feast.sdk.resources.feature_set.FileType): exported
                file format
            staging_location (str, optional): url to staging_location
                (currently support a folder in GCS)

        Returns: (str) path to the downloaded file

        Raises:
            ValueError: if no staging location is given and ``file_type`` is
                neither CSV nor JSON, or if ``staging_location`` is given
                but is not a GCS path
        """
        if not staging_location:
            df = self.download_table_as_df(full_table_id)
            if file_type == FileType.CSV:
                df.to_csv(dest, index=False)
            elif file_type == FileType.JSON:
                # pandas raises ValueError for index=False with the default
                # orient, so that call could never succeed. Row records
                # written one per line carry no index.
                # NOTE(review): this presumably matches the newline-delimited
                # JSON a BigQuery extract produces -- confirm.
                df.to_json(dest, orient="records", lines=True)
            else:
                raise ValueError(
                    "Only FileType: CSV and JSON are supported for download_table_as_file without staging location"
                )
            return dest

        if not is_gs_path(staging_location):
            raise ValueError("staging_uri must be a directory in GCS")

        # The staged object is named after the current time in milliseconds.
        temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
        staging_file_path = os.path.join(staging_location, temp_file_name)

        job_config = ExtractJobConfig()
        job_config.destination_format = file_type
        src_table = Table.from_string(full_table_id)
        job = self.bqclient.extract_table(src_table,
                                          staging_file_path,
                                          job_config=job_config)

        # await completion
        job.result()

        # Fetch the staged object from its bucket into the local destination.
        bucket_name, blob_name = split_gs_path(staging_file_path)
        bucket = self.storageclient.get_bucket(bucket_name)
        blob = bucket.blob(blob_name)
        blob.download_to_filename(dest)
        return dest
Пример #7
0
    def __extract_table_to_shard_folder(self, full_table_id,
                                        staging_location, file_type):
        """
        Extract a BigQuery table into a fresh folder under the staging
        location, using the ``shard_*`` wildcard as the destination.

        Args:
            full_table_id (str): fully qualified BigQuery table id
            staging_location (str): base location the shard folder is
                created under
            file_type: destination format set on the extract job

        Returns:
            (str) path of the folder the extract job was pointed at
        """
        # The folder is named after the current time in milliseconds.
        millis = int(round(time.time() * 1000))
        shard_folder = os.path.join(staging_location, "temp_%d" % millis)

        extract_config = ExtractJobConfig()
        extract_config.destination_format = file_type

        extract_job = self.bqclient.extract_table(
            Table.from_string(full_table_id),
            os.path.join(shard_folder, "shard_*"),
            job_config=extract_config,
        )
        # Block until the extract job has completed.
        extract_job.result()
        return shard_folder