def _test_download_file(self, mocker, type):
    """Verify download_table_as_file: it must run one BQ extract job to the
    staging path and then download the staged blob to the destination."""
    staging_path = "gs://temp/"
    staging_file_name = "temp_0"
    dst_path = "/tmp/myfile.csv"
    table_id = "project_id.dataset_id.table_id"
    expected_staging_path = os.path.join(staging_path, staging_file_name)

    # Wire up a downloader whose BigQuery and GCS clients are mocks.
    downloader = TableDownloader()
    blob = _Blob()
    mocker.patch.object(blob, "download_to_filename")
    downloader._bq = _Mock_BQ_Client()
    mocker.patch.object(downloader._bq, "extract_table", return_value=_Job())
    downloader._gcs = _Mock_GCS_Client()
    mocker.patch.object(downloader._gcs, "get_bucket",
                        return_value=_Bucket(blob))

    downloader.download_table_as_file(table_id, dst_path,
                                      staging_location=staging_path,
                                      file_type=type)

    # Exactly one extract job, targeting the right table, staging path
    # and export format.
    extract_calls = downloader._bq.extract_table.call_args_list
    assert len(extract_calls) == 1
    args, kwargs = extract_calls[0]
    assert args[0].full_table_id == Table.from_string(table_id).full_table_id
    assert args[1] == expected_staging_path
    assert kwargs["job_config"].destination_format == str(type)
    # The staged file must be pulled down to the requested destination.
    blob.download_to_filename.assert_called_once_with(dst_path)
def _test_download_file(self, mocker, type):
    """Verify download_table_as_file: it must extract the table to sharded
    files under a staging folder, then collapse that folder to the
    destination file via gcs_folder_to_file."""
    gcs_folder_to_file_mock = mocker.patch(
        "feast.sdk.utils.bq_util.gcs_folder_to_file", return_value=None)

    staging_path = "gs://temp"
    temp_folder = "temp_0"
    full_table_id = "project_id.dataset_id.table_id"
    dst_path = "/tmp/myfile.csv"
    expected_staging_folder = os.path.join(staging_path, temp_folder)
    # The extract job writes shards matching this wildcard.
    expected_staging_path = os.path.join(expected_staging_folder, "shard_*")

    downloader = TableDownloader()
    downloader._bqclient = _Mock_BQ_Client()
    mocker.patch.object(downloader._bqclient, "extract_table",
                        return_value=_Job())

    downloader.download_table_as_file(full_table_id, dst_path,
                                      staging_location=staging_path,
                                      file_type=type)

    # Exactly one extract job with the right table, shard path and format.
    extract_calls = downloader._bqclient.extract_table.call_args_list
    assert len(extract_calls) == 1
    args, kwargs = extract_calls[0]
    assert args[0].full_table_id == Table.from_string(
        full_table_id).full_table_id
    assert args[1] == expected_staging_path
    assert kwargs["job_config"].destination_format == str(type)
    # The shard folder must be merged into the destination file.
    gcs_folder_to_file_mock.assert_called_once_with(
        expected_staging_folder, dst_path)
def test_download_invalid_staging_url(self):
    """A local (non-GCS) staging location must be rejected by both the
    file and the dataframe download APIs."""
    table_id = "project_id.dataset_id.table_id"
    downloader = TableDownloader()
    expected_error = "staging_uri must be a directory in GCS"

    with pytest.raises(ValueError, match=expected_error):
        downloader.download_table_as_file(
            table_id, "/tmp/dst", "/local/directory", FileType.CSV)

    with pytest.raises(ValueError, match=expected_error):
        downloader.download_table_as_df(table_id, "/local/directory")
class Client:
    # Facade over Feast's gRPC core/serving endpoints plus BigQuery-backed
    # dataset downloads. Channels and stubs are created lazily on first use.

    def __init__(self, core_url=None, serving_url=None, verbose=False):
        """Create an instance of Feast client which is connected to feast
        endpoint specified in the parameter. If no url is provided, the
        client will default to the url specified in the environment
        variable FEAST_CORE_URL.

        Args:
            core_url (str, optional): feast's grpc endpoint URL
                (e.g.: "my.feast.com:8433")
            serving_url (str, optional): feast serving's grpc endpoint URL
                (e.g.: "my.feast.com:8433")
            verbose (bool, optional): print details of API calls when True.
        """
        if core_url is None:
            core_url = os.getenv(FEAST_CORE_URL_ENV_KEY)
        self._core_url = core_url

        if serving_url is None:
            serving_url = os.getenv(FEAST_SERVING_URL_ENV_KEY)
        self._serving_url = serving_url

        # gRPC plumbing; populated by _connect_core()/_connect_serving().
        self.__core_channel = None
        self.__serving_channel = None
        self._core_service_stub = None
        self._job_service_stub = None
        self._dataset_service_stub = None
        self._serving_service_stub = None

        self._verbose = verbose
        self._table_downloader = TableDownloader()

    @property
    def core_url(self):
        """str: core endpoint; re-reads the env var if unset, else raises."""
        if self._core_url is None:
            self._core_url = os.getenv(FEAST_CORE_URL_ENV_KEY)
        if self._core_url is None:
            raise ValueError(
                "Core API URL not set. Either set the " +
                "environment variable {} or set it explicitly.".format(
                    FEAST_CORE_URL_ENV_KEY))
        return self._core_url

    @core_url.setter
    def core_url(self, value):
        self._core_url = value

    @property
    def serving_url(self):
        """str: serving endpoint; re-reads the env var if unset, else raises."""
        if self._serving_url is None:
            self._serving_url = os.getenv(FEAST_SERVING_URL_ENV_KEY)
        if self._serving_url is None:
            raise ValueError(
                "Serving API URL not set. Either set the " +
                "environment variable {} or set it explicitly.".format(
                    FEAST_SERVING_URL_ENV_KEY))
        return self._serving_url

    @serving_url.setter
    def serving_url(self, value):
        self._serving_url = value

    @property
    def verbose(self):
        """bool: whether API calls print progress details."""
        return self._verbose

    @verbose.setter
    def verbose(self, val):
        if not isinstance(val, bool):
            raise TypeError("verbose should be a boolean value")
        self._verbose = val

    def apply(self, obj):
        """Create or update one or many feast's resource
        (feature, entity, importer, storage).

        Args:
            obj (object): a single resource or a list of resources.

        Returns:
            the applied resource id, or a list of ids when a list was given.
        """
        if isinstance(obj, list):
            ids = []
            for resource in obj:
                ids.append(self._apply(resource))
            return ids
        else:
            return self._apply(obj)

    def run(self, importer, name_override=None, apply_entity=False,
            apply_features=False):
        """Run an import job.

        Args:
            importer (feast.sdk.importer.Importer): importer instance
            name_override (str, optional): Job name override
            apply_entity (bool, optional): (default: False) create/update
                entity inside importer
            apply_features (bool, optional): (default: False) create/update
                features inside importer

        Returns:
            (str) job ID of the import job
        """
        request = JobServiceTypes.SubmitImportJobRequest(
            importSpec=importer.spec)
        if name_override is not None:
            request.name = name_override

        if apply_entity:
            self._apply_entity(importer.entity)
        if apply_features:
            # NOTE(review): importer.features appears to be a mapping keyed
            # by feature name; each value is applied — confirm against
            # Importer's definition.
            for feature in importer.features:
                self._apply_feature(importer.features[feature])

        if importer.require_staging:
            # Upload the local source file to the importer's remote path
            # before submitting the job.
            print("Staging file to remote path {}".format(
                importer.remote_path))
            importer.stage()
        print("Submitting job with spec:\n {}".format(
            spec_to_yaml(importer.spec)))
        self._connect_core()
        response = self._job_service_stub.SubmitJob(request)
        print("Submitted job with id: {}".format(response.jobId))
        return response.jobId

    def create_dataset(self, feature_set, start_date, end_date, limit=None,
                       name_prefix=None):
        """Create training dataset for a feature set.

        The training dataset will be bounded by event timestamp between
        start_date and end_date. Specify limit to limit number of row
        returned. The training dataset will reside in a bigquery table
        specified by destination.

        Args:
            feature_set (feast.sdk.resources.feature_set.FeatureSet):
                feature set representing the data wanted
            start_date (str): starting date of the training data in ISO 8601
                format (e.g.: "2018-12-31")
            end_date (str): end date of training data in ISO 8601 format
                (e.g.: "2018-12-31")
            limit (int, optional): (default: None) maximum number of row
                returned
            name_prefix (str, optional): (default: None) name prefix.

        :return: feast.resources.feature_set.DatasetInfo: DatasetInfo
            containing the information of training dataset
        """
        self._check_create_dataset_args(feature_set, start_date, end_date,
                                        limit)

        req = DatasetServiceTypes.CreateDatasetRequest(
            featureSet=feature_set.proto,
            startDate=_timestamp_from_datetime(_parse_date(start_date)),
            endDate=_timestamp_from_datetime(_parse_date(end_date)),
            limit=limit,
            namePrefix=name_prefix,
        )
        if self.verbose:
            print("creating training dataset for features: " +
                  str(feature_set.features))
        self._connect_core()
        resp = self._dataset_service_stub.CreateDataset(req)

        if self.verbose:
            print("created dataset {}: {}".format(resp.datasetInfo.name,
                                                  resp.datasetInfo.tableUrl))
        return DatasetInfo(resp.datasetInfo.name, resp.datasetInfo.tableUrl)

    def get_serving_data(self, feature_set, entity_keys, ts_range=None):
        """Get feature value from feast serving API.

        If server_url is not provided, the value stored in the environment
        variable FEAST_SERVING_URL is used to connect to the serving server
        instead.

        Args:
            feature_set (feast.sdk.resources.feature_set.FeatureSet):
                feature set representing the data wanted
            entity_keys (:obj: `list` of :obj: `str): list of entity keys
            ts_range (:obj: `list` of str, optional): size 2 list of start
                and end time, in datetime type. It will filter out any
                feature value having event timestamp outside of the
                ts_range.

        Returns:
            pandas.DataFrame: DataFrame of results
        """
        start = None
        end = None
        if ts_range is not None:
            if len(ts_range) != 2:
                raise ValueError("ts_range must have len 2")
            start = ts_range[0]
            end = ts_range[1]
            if type(start) is not datetime or type(end) is not datetime:
                raise TypeError("start and end must be datetime type")

        request = self._build_serving_request(feature_set, entity_keys)
        self._connect_serving()
        return self._response_to_df(
            feature_set, self._serving_service_stub.QueryFeatures(request),
            start, end)

    def download_dataset(self, dataset_info, dest, staging_location,
                         file_type=FileType.CSV):
        """Download training dataset as file.

        Args:
            dataset_info (feast.sdk.resources.feature_set.DatasetInfo):
                dataset_info to be downloaded
            dest (str): destination's file path
            staging_location (str): url to staging_location (currently
                support a folder in GCS)
            file_type (feast.sdk.resources.feature_set.FileType): (default:
                FileType.CSV) exported file format

        Returns:
            str: path to the downloaded file
        """
        return self._table_downloader.download_table_as_file(
            dataset_info.full_table_id, dest, staging_location, file_type)

    def download_dataset_to_df(self, dataset_info, staging_location):
        """Download training dataset as Pandas Dataframe.

        Args:
            dataset_info (feast.sdk.resources.feature_set.DatasetInfo):
                dataset_info to be downloaded
            staging_location: url to staging_location (currently
                support a folder in GCS)

        Returns:
            pandas.DataFrame: dataframe of the training dataset
        """
        return self._table_downloader.download_table_as_df(
            dataset_info.full_table_id, staging_location)

    def close(self):
        """Close underlying connection to Feast's core and serving end
        points.

        NOTE(review): raises AttributeError if a channel was never opened
        (channels start as None) — confirm callers always connect first.
        """
        self.__core_channel.close()
        self.__core_channel = None
        self.__serving_channel.close()
        self.__serving_channel = None

    def _connect_core(self):
        """Connect to core api (no-op if a channel already exists)."""
        if self.__core_channel is None:
            self.__core_channel = grpc.insecure_channel(self.core_url)
            self._core_service_stub = CoreServiceStub(self.__core_channel)
            self._job_service_stub = JobServiceStub(self.__core_channel)
            self._dataset_service_stub = DatasetServiceStub(
                self.__core_channel)

    def _connect_serving(self):
        """Connect to serving api (no-op if a channel already exists)."""
        if self.__serving_channel is None:
            self.__serving_channel = grpc.insecure_channel(self.serving_url)
            self._serving_service_stub = ServingAPIStub(
                self.__serving_channel)

    def _build_serving_request(self, feature_set, entity_keys):
        """Helper function to build serving service request."""
        return QueryFeaturesRequest(
            entityName=feature_set.entity,
            entityId=entity_keys,
            featureId=feature_set.features,
        )

    def _response_to_df(self, feature_set, response, start=None, end=None):
        # Flatten the serving response into one row per entity, one column
        # per feature; optionally filter values outside [start, end].
        is_filter_time = start is not None and end is not None
        df = pd.DataFrame(columns=[feature_set.entity] +
                          feature_set.features)
        dtypes = {}
        for entity_id in response.entities:
            feature_map = response.entities[entity_id].features
            row = {response.entityName: entity_id}
            for feature_id in feature_map:
                v = feature_map[feature_id].value
                if is_filter_time:
                    ts = feature_map[feature_id].timestamp.ToDatetime()
                    if ts < start or ts > end:
                        # Skip values whose event timestamp is outside the
                        # requested range.
                        continue
                feast_valuetype = v.WhichOneof("val")
                # NOTE(review): membership is tested on feast_valuetype but
                # the dict is keyed by feature_id, so a second feature
                # sharing a valuetype never gets a dtype entry — verify
                # whether this is intentional.
                if feast_valuetype not in dtypes:
                    dtypes[feature_id] = types.FEAST_VALUETYPE_TO_DTYPE[
                        feast_valuetype]
                v = getattr(v, v.WhichOneof("val"))
                row[feature_id] = v
            # NOTE(review): DataFrame.append is removed in pandas >= 2.0;
            # consider collecting rows and using pd.concat.
            df = df.append(row, ignore_index=True)
        return df.astype(dtypes).reset_index(drop=True)

    def _apply(self, obj):
        """Applies a single object to feast core.

        Args:
            obj (object): one of
                [Feature, Entity, FeatureGroup, Storage, Importer]
        """
        if isinstance(obj, Feature):
            return self._apply_feature(obj)
        elif isinstance(obj, Entity):
            return self._apply_entity(obj)
        elif isinstance(obj, FeatureGroup):
            return self._apply_feature_group(obj)
        elif isinstance(obj, Storage):
            return self._apply_storage(obj)
        else:
            raise TypeError("Apply can only be passed one of the following \
            types: [Feature, Entity, FeatureGroup, Storage, Importer]")

    def _apply_feature(self, feature):
        """Apply the feature to the core API

        Args:
            feature (feast.sdk.resources.feature.Feature): feature to apply
        """
        self._connect_core()
        response = self._core_service_stub.ApplyFeature(feature.spec)
        if self.verbose:
            print("Successfully applied feature with id: {}\n---\n{}".format(
                response.featureId, feature))
        return response.featureId

    def _apply_entity(self, entity):
        """Apply the entity to the core API

        Args:
            entity (feast.sdk.resources.entity.Entity): entity to apply
        """
        self._connect_core()
        response = self._core_service_stub.ApplyEntity(entity.spec)
        if self.verbose:
            print("Successfully applied entity with name: {}\n---\n{}".format(
                response.entityName, entity))
        return response.entityName

    def _apply_feature_group(self, feature_group):
        """Apply the feature group to the core API

        Args:
            feature_group (feast.sdk.resources.feature_group.FeatureGroup):
                feature group to apply
        """
        self._connect_core()
        response = self._core_service_stub.ApplyFeatureGroup(
            feature_group.spec)
        if self.verbose:
            print("Successfully applied feature group with id: " +
                  "{}\n---\n{}".format(response.featureGroupId,
                                       feature_group))
        return response.featureGroupId

    def _apply_storage(self, storage):
        """Apply the storage to the core API

        Args:
            storage (feast.sdk.resources.storage.Storage): storage to apply
        """
        self._connect_core()
        response = self._core_service_stub.ApplyStorage(storage.spec)
        if self.verbose:
            print("Successfully applied storage with id: " +
                  "{}\n{}".format(response.storageId, storage))
        return response.storageId

    def _check_create_dataset_args(self, feature_set, start_date, end_date,
                                   limit):
        # Validate create_dataset inputs before issuing the RPC.
        if len(feature_set.features) < 1:
            raise ValueError("feature set is empty")

        start = _parse_date(start_date)
        end = _parse_date(end_date)
        if end < start:
            raise ValueError("end_date is before start_date")

        if limit is not None and limit < 1:
            raise ValueError("limit is not a positive integer")
class Client:
    # Facade over Feast's gRPC core/serving endpoints plus BigQuery-backed
    # dataset downloads. Channels and stubs are created lazily on first use.

    def __init__(self, core_url=None, serving_url=None, verbose=False):
        """Create an instance of Feast client which is connected to feast
        endpoint specified in the parameter. If no url is provided, the
        client will default to the url specified in the environment
        variable FEAST_CORE_URL.

        Args:
            core_url (str, optional): feast's grpc endpoint URL
                (e.g.: "my.feast.com:8433")
            serving_url (str, optional): feast serving's grpc endpoint URL
                (e.g.: "my.feast.com:8433")
            verbose (bool, optional): print details of API calls when True.
        """
        if core_url is None:
            core_url = os.getenv(FEAST_CORE_URL_ENV_KEY)
        self._core_url = core_url

        if serving_url is None:
            serving_url = os.getenv(FEAST_SERVING_URL_ENV_KEY)
        self._serving_url = serving_url

        # gRPC plumbing; populated by _connect_core()/_connect_serving().
        self.__core_channel = None
        self.__serving_channel = None
        self._core_service_stub = None
        self._job_service_stub = None
        self._dataset_service_stub = None
        self._serving_service_stub = None

        self._verbose = verbose
        self._table_downloader = TableDownloader()

    @property
    def core_url(self):
        """str: core endpoint; re-reads the env var if unset, else raises."""
        if self._core_url is None:
            self._core_url = os.getenv(FEAST_CORE_URL_ENV_KEY)
        if self._core_url is None:
            raise ValueError(
                "Core API URL not set. Either set the " +
                "environment variable {} or set it explicitly.".format(
                    FEAST_CORE_URL_ENV_KEY))
        return self._core_url

    @core_url.setter
    def core_url(self, value):
        self._core_url = value

    @property
    def serving_url(self):
        """str: serving endpoint; re-reads the env var if unset, else raises."""
        if self._serving_url is None:
            self._serving_url = os.getenv(FEAST_SERVING_URL_ENV_KEY)
        if self._serving_url is None:
            raise ValueError(
                "Serving API URL not set. Either set the " +
                "environment variable {} or set it explicitly.".format(
                    FEAST_SERVING_URL_ENV_KEY))
        return self._serving_url

    @serving_url.setter
    def serving_url(self, value):
        self._serving_url = value

    @property
    def verbose(self):
        """bool: whether API calls print progress details."""
        return self._verbose

    @verbose.setter
    def verbose(self, val):
        if not isinstance(val, bool):
            raise TypeError("verbose should be a boolean value")
        self._verbose = val

    def apply(self, obj):
        """Create or update one or many feast's resource
        (feature, entity, importer, storage).

        Args:
            obj (object): a single resource or a list of resources.

        Returns:
            the applied resource id, or a list of ids when a list was given.
        """
        if isinstance(obj, list):
            ids = []
            for resource in obj:
                ids.append(self._apply(resource))
            return ids
        else:
            return self._apply(obj)

    def run(self, importer, name_override=None, apply_entity=False,
            apply_features=False):
        """Run an import job.

        Args:
            importer (feast.sdk.importer.Importer): importer instance
            name_override (str, optional): Job name override
            apply_entity (bool, optional): (default: False) create/update
                entity inside importer
            apply_features (bool, optional): (default: False) create/update
                features inside importer

        Returns:
            (str) job ID of the import job
        """
        request = JobServiceTypes.SubmitImportJobRequest(
            importSpec=importer.spec)
        if name_override is not None:
            request.name = name_override

        if apply_entity:
            self._apply_entity(importer.entity)
        if apply_features:
            for feature in importer.features:
                self._apply_feature(feature)

        if importer.require_staging:
            # Upload the local source file to the importer's remote path
            # before submitting the job.
            print("Staging file to remote path {}".format(
                importer.remote_path))
            importer.stage()
        print("Submitting job with spec:\n {}".format(
            spec_to_yaml(importer.spec)))
        self._connect_core()
        response = self._job_service_stub.SubmitJob(request)
        print("Submitted job with id: {}".format(response.jobId))
        return response.jobId

    def create_dataset(self, feature_set, start_date, end_date, limit=None,
                       name_prefix=None):
        """Create training dataset for a feature set.

        The training dataset will be bounded by event timestamp between
        start_date and end_date. Specify limit to limit number of row
        returned. The training dataset will reside in a bigquery table
        specified by destination.

        Args:
            feature_set (feast.sdk.resources.feature_set.FeatureSet):
                feature set representing the data wanted
            start_date (str): starting date of the training data in ISO 8601
                format (e.g.: "2018-12-31")
            end_date (str): end date of training data in ISO 8601 format
                (e.g.: "2018-12-31")
            limit (int, optional): (default: None) maximum number of row
                returned
            name_prefix (str, optional): (default: None) name prefix.

        :return: feast.resources.feature_set.DatasetInfo: DatasetInfo
            containing the information of training dataset
        """
        self._check_create_dataset_args(feature_set, start_date, end_date,
                                        limit)

        req = DatasetServiceTypes.CreateDatasetRequest(
            featureSet=feature_set.proto,
            startDate=_timestamp_from_datetime(_parse_date(start_date)),
            endDate=_timestamp_from_datetime(_parse_date(end_date)),
            limit=limit,
            namePrefix=name_prefix)
        if self.verbose:
            print("creating training dataset for features: " +
                  str(feature_set.features))
        self._connect_core()
        resp = self._dataset_service_stub.CreateDataset(req)

        if self.verbose:
            print("created dataset {}: {}".format(resp.datasetInfo.name,
                                                  resp.datasetInfo.tableUrl))
        return DatasetInfo(resp.datasetInfo.name, resp.datasetInfo.tableUrl)

    def get_serving_data(self, feature_set, entity_keys,
                         request_type=ServingRequestType.LAST,
                         ts_range=None, limit=10):
        """Get data from the feast serving layer. You can either retrieve
        the latest value, or a list of the latest values, up to a provided
        limit.

        If server_url is not provided, the value stored in the environment
        variable FEAST_SERVING_URL is used to connect to the serving server
        instead.

        Args:
            feature_set (feast.sdk.resources.feature_set.FeatureSet):
                feature set representing the data wanted
            entity_keys (:obj: `list` of :obj: `str): list of entity keys
            request_type (feast.sdk.utils.types.ServingRequestType):
                (default: feast.sdk.utils.types.ServingRequestType.LAST)
                type of request: one of [LIST, LAST]
            ts_range (:obj: `list` of str, optional): size 2 list of start
                timestamp and end timestamp, in ISO 8601 format. Only
                required if request_type is set to LIST. Defaults to no
                timestamp filter.
            limit (int, optional): (default: 10) number of values to get.
                Only required if request_type is set to LIST

        Returns:
            pandas.DataFrame: DataFrame of results
        """
        # FIX: the default used to be the mutable literal `ts_range=[]`,
        # which is shared across every call to this method. Using None and
        # substituting a fresh empty list preserves the old behavior for
        # callers that omit the argument.
        if ts_range is None:
            ts_range = []
        ts_range = [
            _timestamp_from_datetime(dateutil.parser.parse(dt))
            for dt in ts_range
        ]
        request = self._build_serving_request(feature_set, entity_keys,
                                              request_type, ts_range, limit)
        self._connect_serving()
        return self._response_to_df(
            feature_set, self._serving_service_stub.QueryFeatures(request))

    def download_dataset(self, dataset_info, dest, staging_location,
                         file_type=FileType.CSV):
        """Download training dataset as file.

        Args:
            dataset_info (feast.sdk.resources.feature_set.DatasetInfo):
                dataset_info to be downloaded
            dest (str): destination's file path
            staging_location (str): url to staging_location (currently
                support a folder in GCS)
            file_type (feast.sdk.resources.feature_set.FileType): (default:
                FileType.CSV) exported file format

        Returns:
            str: path to the downloaded file
        """
        return self._table_downloader.download_table_as_file(
            dataset_info.table_id, dest, staging_location, file_type)

    def download_dataset_to_df(self, dataset_info, staging_location):
        """Download training dataset as Pandas Dataframe.

        Args:
            dataset_info (feast.sdk.resources.feature_set.DatasetInfo):
                dataset_info to be downloaded
            staging_location: url to staging_location (currently
                support a folder in GCS)

        Returns:
            pandas.DataFrame: dataframe of the training dataset
        """
        return self._table_downloader.download_table_as_df(
            dataset_info.table_id, staging_location)

    def close(self):
        """Close underlying connection to Feast's core and serving end
        points.

        NOTE(review): raises AttributeError if a channel was never opened
        (channels start as None) — confirm callers always connect first.
        """
        self.__core_channel.close()
        self.__core_channel = None
        self.__serving_channel.close()
        self.__serving_channel = None

    def _connect_core(self):
        """Connect to core api (no-op if a channel already exists)."""
        if self.__core_channel is None:
            self.__core_channel = grpc.insecure_channel(self.core_url)
            self._core_service_stub = CoreServiceStub(self.__core_channel)
            self._job_service_stub = JobServiceStub(self.__core_channel)
            self._dataset_service_stub = DatasetServiceStub(
                self.__core_channel)

    def _connect_serving(self):
        """Connect to serving api (no-op if a channel already exists)."""
        if self.__serving_channel is None:
            self.__serving_channel = grpc.insecure_channel(self.serving_url)
            self._serving_service_stub = ServingAPIStub(
                self.__serving_channel)

    def _build_serving_request(self, feature_set, entity_keys, request_type,
                               ts_range, limit):
        """Helper function to build serving service request."""
        request = QueryFeatures.Request(entityName=feature_set.entity,
                                        entityId=entity_keys)
        features = [
            RequestDetail(featureId=feat_id, type=request_type.value)
            for feat_id in feature_set.features
        ]

        if request_type == ServingRequestType.LIST:
            # LIST requests carry a timestamp window and a per-feature limit.
            ts_range = TimestampRange(start=ts_range[0], end=ts_range[1])
            request.timestampRange.CopyFrom(ts_range)
            for feature in features:
                feature.limit = limit
        request.requestDetails.extend(features)
        return request

    def _response_to_df(self, feature_set, response):
        # Build one DataFrame per (entity, feature), outer-merge the feature
        # frames per entity, then concatenate all entities.
        entity_tables = []
        for entity_key in response.entities:
            feature_tables = []
            features = response.entities[entity_key].features
            for feature_name in features:
                rows = []
                v_list = features[feature_name].valueList
                # Unwrap the oneof to the concrete repeated value field.
                v_list = getattr(v_list, v_list.WhichOneof("valueList")).val
                for idx in range(len(v_list)):
                    row = {
                        response.entityName: entity_key,
                        feature_name: v_list[idx]
                    }
                    if features[feature_name].HasField("timestampList"):
                        # NOTE(review): fromtimestamp uses the local
                        # timezone — confirm whether UTC is expected here.
                        ts_seconds = \
                            features[feature_name].timestampList.val[
                                idx].seconds
                        row["timestamp"] = datetime.fromtimestamp(ts_seconds)
                    rows.append(row)
                feature_tables.append(pd.DataFrame(rows))
            # Merge on the shared entity (and timestamp) columns.
            entity_table = feature_tables[0]
            for idx in range(1, len(feature_tables)):
                entity_table = pd.merge(left=entity_table,
                                        right=feature_tables[idx],
                                        how='outer')
            entity_tables.append(entity_table)
        if len(entity_tables) == 0:
            # No entities returned: empty frame with the expected columns.
            return pd.DataFrame(columns=[feature_set.entity, "timestamp"] +
                                feature_set.features)
        df = pd.concat(entity_tables)
        return df.reset_index(drop=True)

    def _apply(self, obj):
        """Applies a single object to feast core.

        Args:
            obj (object): one of
                [Feature, Entity, FeatureGroup, Storage, Importer]
        """
        if isinstance(obj, Feature):
            return self._apply_feature(obj)
        elif isinstance(obj, Entity):
            return self._apply_entity(obj)
        elif isinstance(obj, FeatureGroup):
            return self._apply_feature_group(obj)
        elif isinstance(obj, Storage):
            return self._apply_storage(obj)
        else:
            raise TypeError('Apply can only be passed one of the following \
            types: [Feature, Entity, FeatureGroup, Storage, Importer]')

    def _apply_feature(self, feature):
        """Apply the feature to the core API

        Args:
            feature (feast.sdk.resources.feature.Feature): feature to apply
        """
        self._connect_core()
        response = self._core_service_stub.ApplyFeature(feature.spec)
        if self.verbose:
            print("Successfully applied feature with id: {}\n---\n{}".format(
                response.featureId, feature))
        return response.featureId

    def _apply_entity(self, entity):
        """Apply the entity to the core API

        Args:
            entity (feast.sdk.resources.entity.Entity): entity to apply
        """
        self._connect_core()
        response = self._core_service_stub.ApplyEntity(entity.spec)
        if self.verbose:
            print("Successfully applied entity with name: {}\n---\n{}".format(
                response.entityName, entity))
        return response.entityName

    def _apply_feature_group(self, feature_group):
        """Apply the feature group to the core API

        Args:
            feature_group (feast.sdk.resources.feature_group.FeatureGroup):
                feature group to apply
        """
        self._connect_core()
        response = self._core_service_stub.ApplyFeatureGroup(
            feature_group.spec)
        if self.verbose:
            print("Successfully applied feature group with id: " +
                  "{}\n---\n{}".format(response.featureGroupId,
                                       feature_group))
        return response.featureGroupId

    def _apply_storage(self, storage):
        """Apply the storage to the core API

        Args:
            storage (feast.sdk.resources.storage.Storage): storage to apply
        """
        self._connect_core()
        response = self._core_service_stub.ApplyStorage(storage.spec)
        if self.verbose:
            print("Successfully applied storage with id: " +
                  "{}\n{}".format(response.storageId, storage))
        return response.storageId

    def _check_create_dataset_args(self, feature_set, start_date, end_date,
                                   limit):
        # Validate create_dataset inputs before issuing the RPC.
        if len(feature_set.features) < 1:
            raise ValueError("feature set is empty")

        start = _parse_date(start_date)
        end = _parse_date(end_date)
        if end < start:
            raise ValueError("end_date is before start_date")

        if limit is not None and limit < 1:
            raise ValueError("limit is not a positive integer")