def __init__(self, feature_store_id, features=None, training_dataset_version=None):
    """Initialize the serving engine for a feature store.

    # Parameters
        feature_store_id: id of the feature store the served entities belong to.
        features: optional list of features to serve; defaults to an empty list.
        training_dataset_version: optional training dataset version used for serving.
    """
    self._training_dataset_version = training_dataset_version
    # Fix: the original used `features=[]`, a mutable default shared across
    # every instance; use a None sentinel and create a fresh list per call.
    self._features = features if features is not None else []
    # Lazy serving state — populated later when prepared statements are set up.
    self._prepared_statement_engine = None
    self._prepared_statements = None
    self._serving_keys = None
    self._pkname_by_serving_index = None
    self._prefix_by_serving_index = None
    # presumably toggled elsewhere for internal/external connections — TODO confirm
    self._external = True
    self._feature_store_id = feature_store_id
    # REST clients and engines, all scoped to the same feature store.
    self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
        feature_store_id
    )
    self._feature_view_api = feature_view_api.FeatureViewApi(feature_store_id)
    self._storage_connector_api = storage_connector_api.StorageConnectorApi(
        feature_store_id
    )
    self._transformation_function_engine = (
        transformation_function_engine.TransformationFunctionEngine(
            feature_store_id
        )
    )
    self._feature_view_engine = feature_view_engine.FeatureViewEngine(
        feature_store_id
    )
def __init__(
    self,
    name,
    version,
    description,
    data_format,
    location,
    featurestore_id,
    storage_connector=None,
    splits=None,
    seed=None,
    cluster_analysis=None,
    created=None,
    creator=None,
    descriptive_statistics=None,
    feature_correlation_matrix=None,
    features=None,
    features_histogram=None,
    featurestore_name=None,
    id=None,
    jobs=None,
    inode_id=None,
    storage_connector_name=None,
    storage_connector_id=None,
    storage_connector_type=None,
    training_dataset_type=None,
):
    """Training dataset metadata object.

    Accepts both user-supplied arguments and fields from a backend response;
    which branch runs is decided by `training_dataset_type` below. Several
    parameters (e.g. `cluster_analysis`, `descriptive_statistics`, `jobs`)
    are accepted but not stored — presumably extra fields of the backend
    payload that are ignored here; verify against the REST schema.
    """
    self._id = id
    self._name = name
    self._version = version
    self._description = description
    self._data_format = data_format
    self._seed = seed
    self._location = location
    # REST clients / engine, all scoped to the owning feature store.
    self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
        featurestore_id
    )
    self._training_dataset_engine = training_dataset_engine.TrainingDatasetEngine(
        featurestore_id
    )
    self._storage_connector_api = storage_connector_api.StorageConnectorApi(
        featurestore_id
    )
    # set up depending on user initialized or coming from backend response
    if training_dataset_type is None:
        # no type -> user init: store arguments as-is (property setters
        # handle storage_connector and splits).
        self._features = features
        self.storage_connector = storage_connector
        self.splits = splits
    else:
        # type available -> init from backend response
        # make rest call to get all connector information, description etc.
        self._storage_connector = self._storage_connector_api.get_by_id(
            storage_connector_id, storage_connector_type
        )
        # Backend sends features as JSON dicts; deserialize each one.
        self._features = [
            feature.Feature.from_response_json(feat) for feat in features
        ]
        self._splits = splits
        self._training_dataset_type = training_dataset_type
def __init__(self, feature_store_id):
    """Wire up the REST clients this engine delegates to, all scoped
    to the given feature store id."""
    self._tags_api = tags_api.TagsApi(feature_store_id, self.ENTITY_TYPE)
    self._storage_connector_api = storage_connector_api.StorageConnectorApi(
        feature_store_id
    )
    self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
        feature_store_id
    )
def __init__(self, feature_store_id):
    """Keep the feature store id and build the API clients and the
    transformation-function engine used by this engine."""
    self._feature_store_id = feature_store_id
    self._tags_api = tags_api.TagsApi(feature_store_id, self.ENTITY_TYPE)
    self._storage_connector_api = storage_connector_api.StorageConnectorApi(
        feature_store_id
    )
    self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
        feature_store_id
    )
    self._transformation_function_engine = (
        transformation_function_engine.TransformationFunctionEngine(
            feature_store_id
        )
    )
def __init__(
    self,
    featurestore_id,
    featurestore_name,
    created,
    hdfs_store_path,
    project_name,
    project_id,
    featurestore_description,
    inode_id,
    offline_featurestore_name,
    hive_endpoint,
    online_enabled,
    num_feature_groups=None,
    num_training_datasets=None,
    num_storage_connectors=None,
    online_featurestore_name=None,
    mysql_server_endpoint=None,
    online_featurestore_size=None,
):
    """Feature store metadata object.

    Stores the backend-provided attributes, then constructs the API clients
    and engines scoped to this store's id. Note the id assignment must
    happen before the clients below are built from `self._id`.
    """
    self._id = featurestore_id
    self._name = featurestore_name
    self._created = created
    self._hdfs_store_path = hdfs_store_path
    self._project_name = project_name
    self._project_id = project_id
    self._description = featurestore_description
    self._inode_id = inode_id
    # Online-store attributes are optional (None when online serving is absent).
    self._online_feature_store_name = online_featurestore_name
    self._online_feature_store_size = online_featurestore_size
    self._offline_feature_store_name = offline_featurestore_name
    self._hive_endpoint = hive_endpoint
    self._mysql_server_endpoint = mysql_server_endpoint
    self._online_enabled = online_enabled
    # Optional usage counters — presumably only set on detailed backend
    # responses; verify against the REST schema.
    self._num_feature_groups = num_feature_groups
    self._num_training_datasets = num_training_datasets
    self._num_storage_connectors = num_storage_connectors
    # API clients and engines, all keyed on this store's id.
    self._feature_group_api = feature_group_api.FeatureGroupApi(self._id)
    self._storage_connector_api = storage_connector_api.StorageConnectorApi(
        self._id
    )
    self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
        self._id
    )
    self._expectations_api = expectations_api.ExpectationsApi(self._id)
    self._feature_group_engine = feature_group_engine.FeatureGroupEngine(
        self._id
    )
    self._transformation_function_engine = (
        transformation_function_engine.TransformationFunctionEngine(
            self._id
        )
    )
    self._feature_view_engine = feature_view_engine.FeatureViewEngine(
        self._id
    )
def write_training_dataset(
    self,
    training_dataset,
    dataset,
    user_write_options,
    save_mode,
    feature_view_obj=None,
    to_df=False,
):
    """Launch a backend job that materializes a training dataset.

    # Parameters
        training_dataset: training dataset metadata object to materialize.
        dataset: the `query.Query` the training data is built from.
        user_write_options: dict of write options; may carry a Spark job
            configuration under the key "spark" and a "wait_for_job" flag.
        save_mode: "overwrite" triggers an overwrite of existing data.
        feature_view_obj: when given, the job is launched through the
            feature-view API instead of the training-dataset API.
        to_df: accepted for interface compatibility; not used here.

    # Returns
        The job object returned by the backend.

    # Raises
        Exception: if no feature view is given and `dataset` is not a Query.
    """
    if not feature_view_obj and not isinstance(dataset, query.Query):
        raise Exception(
            "Currently only query based training datasets are supported by the Python engine"
        )

    # As for creating a feature group, users have the possibility of passing
    # a spark_job_configuration object as part of the user_write_options with the key "spark".
    # Fix: pop from a shallow copy so the caller's dict is not mutated.
    write_options = dict(user_write_options)
    spark_job_configuration = write_options.pop("spark", None)

    td_app_conf = training_dataset_job_conf.TrainingDatsetJobConf(
        query=dataset,
        overwrite=(save_mode == "overwrite"),
        write_options=write_options,
        spark_job_configuration=spark_job_configuration,
    )

    if feature_view_obj:
        fv_api = feature_view_api.FeatureViewApi(feature_view_obj.featurestore_id)
        td_job = fv_api.compute_training_dataset(
            feature_view_obj.name,
            feature_view_obj.version,
            training_dataset.version,
            td_app_conf,
        )
    else:
        td_api = training_dataset_api.TrainingDatasetApi(
            training_dataset.feature_store_id
        )
        td_job = td_api.compute(training_dataset, td_app_conf)
    print(
        "Training dataset job started successfully, you can follow the progress at \n{}".format(
            self._get_job_url(td_job.href)
        )
    )

    # If the user passed the wait_for_job option consider it,
    # otherwise use the default True
    self._wait_for_job(td_job, write_options)

    return td_job
def write_training_dataset(self, training_dataset, features, user_write_options, save_mode):
    """Launch a backend job that materializes a query-based training dataset.

    # Parameters
        training_dataset: training dataset metadata object to materialize.
        features: the `query.Query` the training data is built from.
        user_write_options: dict of write options; may carry a Spark job
            configuration under the key "spark".
        save_mode: "overwrite" triggers an overwrite of existing data.

    # Raises
        Exception: if `features` is not a Query.
    """
    if not isinstance(features, query.Query):
        # Fix: the original did `raise "..."`, which raises a string —
        # a TypeError in Python 3 ("exceptions must derive from
        # BaseException"). Raise a real exception with the same message.
        raise Exception(
            "Currently only query based training datasets are supported by the Python engine"
        )

    # As for creating a feature group, users have the possibility of passing
    # a spark_job_configuration object as part of the user_write_options with the key "spark".
    # Fix: pop from a shallow copy so the caller's dict is not mutated.
    write_options = dict(user_write_options)
    spark_job_configuration = write_options.pop("spark", None)

    td_app_conf = training_dataset_job_conf.TrainingDatsetJobConf(
        query=features,
        overwrite=(save_mode == "overwrite"),
        write_options=write_options,
        spark_job_configuration=spark_job_configuration,
    )

    td_api = training_dataset_api.TrainingDatasetApi(
        training_dataset.feature_store_id
    )
    td_job = td_api.compute(training_dataset, td_app_conf)
    print(
        "Training dataset job started successfully, you can follow the progress at {}"
        .format(self._get_job_url(td_job.href))
    )
def __init__(
    self,
    featurestore_id,
    featurestore_name,
    created,
    hdfs_store_path,
    project_name,
    project_id,
    featurestore_description,
    inode_id,
    offline_featurestore_name,
    hive_endpoint,
    online_enabled,
    online_featurestore_name=None,
    mysql_server_endpoint=None,
    online_featurestore_size=None,
):
    """Feature store metadata object.

    Stores the backend-provided attributes, then constructs the API clients
    and the feature-group engine from `self._id`, so the id assignment must
    come first.
    """
    self._id = featurestore_id
    self._name = featurestore_name
    self._created = created
    self._hdfs_store_path = hdfs_store_path
    self._project_name = project_name
    self._project_id = project_id
    self._description = featurestore_description
    self._inode_id = inode_id
    # Online-store attributes are optional (None when online serving is absent).
    self._online_feature_store_name = online_featurestore_name
    self._online_feature_store_size = online_featurestore_size
    self._offline_feature_store_name = offline_featurestore_name
    self._hive_endpoint = hive_endpoint
    self._mysql_server_endpoint = mysql_server_endpoint
    self._online_enabled = online_enabled
    # API clients and engine, all keyed on this store's id.
    self._feature_group_api = feature_group_api.FeatureGroupApi(self._id)
    self._storage_connector_api = storage_connector_api.StorageConnectorApi(
        self._id
    )
    self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
        self._id
    )
    self._feature_group_engine = feature_group_engine.FeatureGroupEngine(
        self._id
    )
def __init__(
    self,
    name,
    version,
    data_format,
    featurestore_id,
    location="",
    event_start_time=None,
    event_end_time=None,
    coalesce=False,
    description=None,
    storage_connector=None,
    splits=None,
    validation_size=None,
    test_size=None,
    train_start=None,
    train_end=None,
    validation_start=None,
    validation_end=None,
    test_start=None,
    test_end=None,
    seed=None,
    created=None,
    creator=None,
    features=None,
    statistics_config=None,
    featurestore_name=None,
    id=None,
    inode_id=None,
    training_dataset_type=None,
    from_query=None,
    querydto=None,
    label=None,
    transformation_functions=None,
    train_split=None,
):
    """Training dataset metadata object.

    Handles both user-supplied construction and deserialization of a
    backend response; the two cases are distinguished by `created` below
    (only backend responses carry a creation timestamp — presumably;
    verify against the REST schema). User construction additionally
    supports random (size-based) and time-based train/validation/test
    splitting.
    """
    self._id = id
    self._name = name
    self._version = version
    self._description = description
    self._data_format = data_format
    # Event times are normalized to timestamps by a helper on the class.
    self._start_time = self._convert_event_time_to_timestamp(
        event_start_time
    )
    self._end_time = self._convert_event_time_to_timestamp(event_end_time)
    # Random-split sizes and time-split boundaries (all optional).
    self._validation_size = validation_size
    self._test_size = test_size
    self._train_start = train_start
    self._train_end = train_end
    self._validation_start = validation_start
    self._validation_end = validation_end
    self._test_start = test_start
    self._test_end = test_end
    self._coalesce = coalesce
    self._seed = seed
    self._location = location
    self._from_query = from_query
    self._querydto = querydto
    self._feature_store_id = featurestore_id
    self._transformation_functions = transformation_functions
    self._train_split = train_split
    # REST clients / engines, all scoped to the owning feature store.
    self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
        featurestore_id
    )
    self._training_dataset_engine = training_dataset_engine.TrainingDatasetEngine(
        featurestore_id
    )
    self._statistics_engine = statistics_engine.StatisticsEngine(
        featurestore_id, self.ENTITY_TYPE
    )
    self._code_engine = code_engine.CodeEngine(featurestore_id, self.ENTITY_TYPE)
    self._transformation_function_engine = (
        transformation_function_engine.TransformationFunctionEngine(
            featurestore_id
        )
    )
    # Route through the property setter when a type is given (presumably
    # for validation); otherwise leave the backing field unset.
    if training_dataset_type:
        self.training_dataset_type = training_dataset_type
    else:
        self._training_dataset_type = None
    # set up depending on user initialized or coming from backend response
    if created is None:
        # no type -> user init
        self._features = features
        self.storage_connector = storage_connector
        self.splits = splits
        self.statistics_config = statistics_config
        self._label = label
        if validation_size or test_size:
            # Size-based splitting: derive the train fraction as the
            # remainder after validation and test shares.
            self._train_split = TrainingDatasetSplit.TRAIN
            self.splits = {
                TrainingDatasetSplit.TRAIN: 1
                - (validation_size or 0)
                - (test_size or 0),
                TrainingDatasetSplit.VALIDATION: validation_size,
                TrainingDatasetSplit.TEST: test_size,
            }
            self._set_time_splits(
                train_start,
                train_end,
                validation_start,
                validation_end,
                test_start,
                test_end,
            )
    else:
        # type available -> init from backend response
        # make rest call to get all connector information, description etc.
        self._storage_connector = StorageConnector.from_response_json(
            storage_connector
        )
        if features is None:
            features = []
        # Deserialize backend JSON into feature/split/config objects.
        self._features = [
            training_dataset_feature.TrainingDatasetFeature.from_response_json(feat)
            for feat in features
        ]
        self._splits = [
            TrainingDatasetSplit.from_response_json(split) for split in splits
        ]
        self._statistics_config = StatisticsConfig.from_response_json(
            statistics_config
        )
        # Labels are the features flagged as label, lower-cased by name.
        self._label = [feat.name.lower() for feat in self._features if feat.label]
    self._vector_server = vector_server.VectorServer(
        featurestore_id, features=self._features
    )
def __init__(
    self,
    name,
    version,
    data_format,
    location,
    featurestore_id,
    coalesce=False,
    description=None,
    storage_connector=None,
    splits=None,
    seed=None,
    created=None,
    creator=None,
    features=None,
    statistics_config=None,
    featurestore_name=None,
    id=None,
    inode_id=None,
    training_dataset_type=None,
    from_query=None,
    querydto=None,
    label=None,
):
    """Training dataset metadata object.

    Handles both user-supplied construction and deserialization of a
    backend response, distinguished by `training_dataset_type` below.
    Also holds lazy online-serving state (connection, prepared statements,
    serving keys) populated elsewhere.
    """
    self._id = id
    self._name = name
    self._version = version
    self._description = description
    self._data_format = data_format
    self._coalesce = coalesce
    self._seed = seed
    self._location = location
    self._from_query = from_query
    self._querydto = querydto
    self._feature_store_id = featurestore_id
    # Lazy online-serving state — populated when serving is initialized.
    self._prepared_statement_connection = None
    self._prepared_statements = None
    self._serving_keys = None
    # REST clients / engines, all scoped to the owning feature store.
    self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
        featurestore_id
    )
    self._training_dataset_engine = training_dataset_engine.TrainingDatasetEngine(
        featurestore_id
    )
    self._statistics_engine = statistics_engine.StatisticsEngine(
        featurestore_id, self.ENTITY_TYPE
    )
    # set up depending on user initialized or coming from backend response
    if training_dataset_type is None:
        # no type -> user init: store arguments as-is (property setters
        # handle storage_connector, splits and statistics_config).
        self._features = features
        self.storage_connector = storage_connector
        self.splits = splits
        self.statistics_config = statistics_config
        self._label = label
    else:
        # type available -> init from backend response
        # make rest call to get all connector information, description etc.
        self._storage_connector = StorageConnector.from_response_json(
            storage_connector
        )
        # Deserialize backend JSON into feature objects.
        self._features = [
            training_dataset_feature.TrainingDatasetFeature.from_response_json(feat)
            for feat in features
        ]
        self._splits = splits
        self._training_dataset_type = training_dataset_type
        self._statistics_config = StatisticsConfig.from_response_json(
            statistics_config
        )
        # Labels are the features flagged as label, lower-cased by name.
        self._label = [feat.name.lower() for feat in self._features if feat.label]
def __init__(self, feature_store_id):
    """Build the REST clients used by this engine, scoped to the
    given feature store id."""
    self._tags_api = tags_api.TagsApi(feature_store_id, self.ENTITY_TYPE)
    self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
        feature_store_id
    )
def __init__(
    self,
    name,
    version,
    data_format,
    location,
    featurestore_id,
    description=None,
    storage_connector=None,
    splits=None,
    seed=None,
    created=None,
    creator=None,
    features=None,
    statistics_config=None,
    featurestore_name=None,
    id=None,
    jobs=None,
    inode_id=None,
    storage_connector_name=None,
    storage_connector_id=None,
    storage_connector_type=None,
    training_dataset_type=None,
    from_query=None,
    querydto=None,
):
    """Training dataset metadata object.

    Handles both user-supplied construction and deserialization of a
    backend response, distinguished by `training_dataset_type` below.
    Several parameters (e.g. `created`, `creator`, `jobs`) are accepted
    but not stored — presumably extra fields of the backend payload that
    are ignored here; verify against the REST schema.
    """
    self._id = id
    self._name = name
    self._version = version
    self._description = description
    self._data_format = data_format
    self._seed = seed
    self._location = location
    self._from_query = from_query
    self._querydto = querydto
    # REST clients / engines, all scoped to the owning feature store.
    self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
        featurestore_id
    )
    self._training_dataset_engine = training_dataset_engine.TrainingDatasetEngine(
        featurestore_id
    )
    self._storage_connector_api = storage_connector_api.StorageConnectorApi(
        featurestore_id
    )
    self._statistics_engine = statistics_engine.StatisticsEngine(
        featurestore_id, self.ENTITY_TYPE
    )
    # set up depending on user initialized or coming from backend response
    if training_dataset_type is None:
        # no type -> user init: store arguments as-is (property setters
        # handle storage_connector, splits and statistics_config).
        self._features = features
        self.storage_connector = storage_connector
        self.splits = splits
        self.statistics_config = statistics_config
    else:
        # type available -> init from backend response
        # make rest call to get all connector information, description etc.
        self._storage_connector = self._storage_connector_api.get_by_id(
            storage_connector_id, storage_connector_type
        )
        # Deserialize backend JSON into feature objects.
        self._features = [
            training_dataset_feature.TrainingDatasetFeature.from_response_json(feat)
            for feat in features
        ]
        self._splits = splits
        self._training_dataset_type = training_dataset_type
        # Backend responses get no statistics config here — note this goes
        # through the property setter with None.
        self.statistics_config = None
def __init__(self, feature_store_id):
    """Build the REST clients used by this engine; tags are registered
    under the "trainingdatasets" entity."""
    self._tags_api = tags_api.TagsApi(feature_store_id, "trainingdatasets")
    self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
        feature_store_id
    )