def download_table_as_df(self, full_table_id, staging_location):
    """Download a BigQuery table as a Pandas DataFrame.

    Args:
        full_table_id (str): fully qualified BigQuery table id
        staging_location (str): url to staging location (currently
            supports a folder in GCS)

    Returns:
        pandas.DataFrame: dataframe of the training dataset
    """
    if not is_gs_path(staging_location):
        raise ValueError("staging_uri must be a directory in GCS")

    temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
    staging_file_path = os.path.join(staging_location, temp_file_name)

    job_config = ExtractJobConfig()
    job_config.destination_format = DestinationFormat.CSV
    job = self.bq.extract_table(
        Table.from_string(full_table_id),
        staging_file_path,
        job_config=job_config)

    # await completion
    job.result()
    return gcs_to_df(staging_file_path)
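# A minimal usage sketch for download_table_as_df above. Illustrative only:
# `downloader` stands in for an instance of the enclosing class (not shown in
# this snippet), and the table id and GCS bucket are placeholder values.
#
#   downloader = ...  # instance of the class defining download_table_as_df
#   training_df = downloader.download_table_as_df(
#       "my-project.my_dataset.my_training_table",
#       staging_location="gs://my-bucket/staging",
#   )
#   print(training_df.head())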
def download_table_as_file(self, full_table_id, dest, staging_location, file_type):
    """Download a BigQuery table as a file.

    Args:
        full_table_id (str): fully qualified BigQuery table id
        dest (str): destination filename
        staging_location (str): url to staging location (currently
            supports a folder in GCS)
        file_type (feast.sdk.resources.feature_set.FileType): exported
            file format (default: FileType.CSV)

    Returns:
        str: path to the downloaded file
    """
    if not is_gs_path(staging_location):
        raise ValueError("staging_uri must be a directory in GCS")

    temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
    staging_file_path = os.path.join(staging_location, temp_file_name)

    job_config = ExtractJobConfig()
    job_config.destination_format = file_type
    src_table = Table.from_string(full_table_id)
    job = self.bq.extract_table(src_table, staging_file_path,
                                job_config=job_config)

    # await completion
    job.result()

    bucket_name, blob_name = split_gs_path(staging_file_path)
    bucket = self.gcs.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.download_to_filename(dest)
    return dest
def from_csv(cls, path, entity, granularity, owner, staging_location=None,
             id_column=None, feature_columns=None, timestamp_column=None,
             timestamp_value=None, serving_store=None, warehouse_store=None):
    """Creates an importer from a given csv dataset.

    The file can be either local or remote (in GCS). If it is a local file,
    staging_location must be provided.

    Args:
        path (str): path to csv file
        entity (str): entity id
        granularity (Granularity): granularity of data
        owner (str): owner
        staging_location (str, optional): Defaults to None. Staging location
            for ingesting a local csv file.
        id_column (str, optional): Defaults to None. Id column in the csv.
            If not set, will default to the `entity` argument.
        feature_columns ([str], optional): Defaults to None. Feature columns
            to ingest. If not set, the importer will by default ingest all
            available columns.
        timestamp_column (str, optional): Defaults to None. Timestamp column
            in the csv. If not set, defaults to timestamp value.
        timestamp_value (datetime, optional): Defaults to current datetime.
            Timestamp value to assign to all features in the dataset.
        serving_store (feast.sdk.resources.feature.DataStore): Defaults to
            None. Serving store to write the features in this instance to.
        warehouse_store (feast.sdk.resources.feature.DataStore): Defaults to
            None. Warehouse store to write the features in this instance to.

    Returns:
        Importer: the importer for the dataset provided.
    """
    import_spec_options = {"format": "csv"}
    import_spec_options["path"], require_staging = \
        _get_remote_location(path, staging_location)
    if is_gs_path(path):
        df = gcs_to_df(path)
    else:
        df = pd.read_csv(path)
    schema, features = \
        _detect_schema_and_feature(entity, granularity, owner, id_column,
                                   feature_columns, timestamp_column,
                                   timestamp_value, serving_store,
                                   warehouse_store, df)
    import_spec = _create_import("file", import_spec_options, entity, schema)
    props = _properties("csv", len(df.index), require_staging,
                        import_spec_options["path"])
    specs = _specs(import_spec, Entity(name=entity), features)
    return cls(specs, df, props)
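# Illustrative sketch of building an importer from a local csv with from_csv.
# The class name `Importer`, the Granularity member, the csv path, and the
# staging bucket are assumptions for the example, not values taken from this
# snippet.
#
#   importer = Importer.from_csv(
#       path="driver_features.csv",
#       entity="driver",
#       granularity=Granularity.DAY,
#       owner="data-team@example.com",
#       staging_location="gs://my-bucket/staging",
#       id_column="driver_id",
#       timestamp_column="event_timestamp",
#   )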
def download_table_as_file(
    self, full_table_id, dest, file_type, staging_location=None
):
    """Download a BigQuery table as a file.

    Args:
        full_table_id (str): fully qualified BigQuery table id
        dest (str): destination filename
        file_type (feast.sdk.resources.feature_set.FileType): exported
            file format (default: FileType.CSV)
        staging_location (str, optional): url to staging location
            (currently supports a folder in GCS)

    Returns:
        str: path to the downloaded file
    """
    if not staging_location:
        df = self.download_table_as_df(full_table_id)
        if file_type == FileType.CSV:
            df.to_csv(dest, index=False)
        elif file_type == FileType.JSON:
            # to_json does not accept index=False with the default orient;
            # use the records orient to omit the index
            df.to_json(dest, orient="records")
        else:
            raise ValueError(
                "Only FileType: CSV and JSON are supported for "
                "download_table_as_file without staging location")
        return dest

    if not is_gs_path(staging_location):
        raise ValueError("staging_uri must be a directory in GCS")

    shard_folder = self.__extract_table_to_shard_folder(
        full_table_id, staging_location, file_type)
    return gcs_folder_to_file(shard_folder, dest)
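# Sketch of the two download modes supported above. Illustrative only: the
# table id and paths are placeholders, and `downloader` is assumed to be an
# instance of the enclosing class.
#
#   # without a staging location: the table is pulled into memory, then written
#   downloader.download_table_as_file(
#       "my-project.my_dataset.my_table", "/tmp/table.csv", FileType.CSV)
#
#   # with a GCS staging location: the table is extracted to shards in GCS first
#   downloader.download_table_as_file(
#       "my-project.my_dataset.my_table", "/tmp/table.csv", FileType.CSV,
#       staging_location="gs://my-bucket/staging")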
def _get_remote_location(path, staging_location):
    """Get the remote location of the file.

    Args:
        path (str): raw path of the file
        staging_location (str): path to stage the file

    Returns:
        (str, bool): remote path of the file and whether staging is required
    """
    if is_gs_path(path):
        return path, False

    if staging_location is None:
        raise ValueError(
            "Specify staging_location for importing local file/dataframe")
    if not is_gs_path(staging_location):
        raise ValueError("Staging location must be in GCS")

    filename = ntpath.basename(path)
    return staging_location + "/" + filename, True
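# Small sketch of how _get_remote_location resolves paths (the bucket and
# file names are placeholders):
#
#   _get_remote_location("gs://bucket/data.csv", None)
#   # -> ("gs://bucket/data.csv", False)         already remote, no staging
#
#   _get_remote_location("data.csv", "gs://bucket/staging")
#   # -> ("gs://bucket/staging/data.csv", True)  local file, staging required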
def _validate_csv_importer(self, importer, csv_path, entity_name,
                           feature_granularity, owner, staging_location=None,
                           id_column=None, feature_columns=None,
                           timestamp_column=None, timestamp_value=None):
    df = pd.read_csv(csv_path)
    # staging is required exactly when the csv is not already in GCS
    assert importer.require_staging != is_gs_path(csv_path)
    if importer.require_staging:
        assert importer.remote_path == "{}/{}".format(
            staging_location, ntpath.basename(csv_path))

    # check features created
    for feature in importer.features.values():
        assert feature.name in df.columns
        assert feature.id == "{}.{}.{}".format(
            entity_name,
            Granularity_pb2.Enum.Name(feature_granularity.value).lower(),
            feature.name)

    import_spec = importer.spec
    assert import_spec.type == "file"
    path = importer.remote_path if importer.require_staging else csv_path
    assert import_spec.options == {"format": "csv", "path": path}
    assert import_spec.entities == [entity_name]

    schema = import_spec.schema
    expected_id_column = id_column if id_column is not None else entity_name
    assert schema.entityIdColumn == expected_id_column
    if timestamp_column is not None:
        assert schema.timestampColumn == timestamp_column
    elif timestamp_value is not None:
        assert schema.timestampValue == timestamp_value

    if feature_columns is None:
        feature_columns = list(df.columns.values)
        feature_columns.remove(id_column)
        feature_columns.remove(timestamp_column)

    # check schema fields
    for col, field in zip(df.columns.values, schema.fields):
        assert col == field.name
        if col in feature_columns:
            assert field.featureId == "{}.{}.{}".format(
                entity_name,
                Granularity_pb2.Enum.Name(feature_granularity.value).lower(),
                col)
def download_table_as_file(self, full_table_id, dest, file_type,
                           staging_location=None):
    """Download a BigQuery table as a file.

    Args:
        full_table_id (str): fully qualified BigQuery table id
        dest (str): destination filename
        file_type (feast.sdk.resources.feature_set.FileType): exported
            file format (default: FileType.CSV)
        staging_location (str, optional): url to staging location
            (currently supports a folder in GCS)

    Returns:
        str: path to the downloaded file
    """
    if not staging_location:
        df = self.download_table_as_df(full_table_id)
        if file_type == FileType.CSV:
            df.to_csv(dest, index=False)
        elif file_type == FileType.JSON:
            # to_json does not accept index=False with the default orient;
            # use the records orient to omit the index
            df.to_json(dest, orient="records")
        else:
            raise ValueError(
                "Only FileType: CSV and JSON are supported for "
                "download_table_as_file without staging location")
        return dest

    if not is_gs_path(staging_location):
        raise ValueError("staging_uri must be a directory in GCS")

    temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
    staging_file_path = os.path.join(staging_location, temp_file_name)

    job_config = ExtractJobConfig()
    job_config.destination_format = file_type
    src_table = Table.from_string(full_table_id)
    job = self.bqclient.extract_table(src_table, staging_file_path,
                                      job_config=job_config)

    # await completion
    job.result()

    bucket_name, blob_name = split_gs_path(staging_file_path)
    bucket = self.storageclient.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.download_to_filename(dest)
    return dest
def download_table_as_df(self, full_table_id, staging_location=None):
    """Download a BigQuery table as a Pandas DataFrame.

    Args:
        full_table_id (str): fully qualified BigQuery table id
        staging_location (str, optional): url to staging location (currently
            supports a folder in GCS)

    Returns:
        pandas.DataFrame: dataframe of the training dataset
    """
    if not staging_location:
        table = bigquery.TableReference.from_string(full_table_id)
        rows = self.bqclient.list_rows(table)
        return rows.to_dataframe(bqstorage_client=self.bqstorageclient)

    if not is_gs_path(staging_location):
        raise ValueError("staging_uri must be a directory in GCS")

    shard_folder = self.__extract_table_to_shard_folder(
        full_table_id, staging_location, DestinationFormat.CSV)
    return gcs_folder_to_df(shard_folder)
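# Sketch of the two paths above. Without a staging location, rows are read
# directly into a dataframe via the BigQuery Storage client held by the
# instance; with a GCS staging location, the table is extracted to CSV shards
# first. `downloader`, the table id, and the bucket are placeholders.
#
#   df = downloader.download_table_as_df("my-project.my_dataset.my_table")
#   df_staged = downloader.download_table_as_df(
#       "my-project.my_dataset.my_table",
#       staging_location="gs://my-bucket/staging")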
def test_is_gs_path():
    assert is_gs_path("gs://valid/gs/file.csv")
    assert not is_gs_path("local/path/file.csv")