Example #1
    def __init__(self, feature_store_id, features=None, training_dataset_version=None):
        self._training_dataset_version = training_dataset_version
        # Avoid a mutable default argument: a shared `features=[]` list would
        # leak state across instances, so default to a fresh list per instance.
        self._features = features if features is not None else []
        self._prepared_statement_engine = None
        self._prepared_statements = None
        self._serving_keys = None
        self._pkname_by_serving_index = None
        self._prefix_by_serving_index = None
        self._external = True
        self._feature_store_id = feature_store_id
        self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
            feature_store_id
        )
        self._feature_view_api = feature_view_api.FeatureViewApi(feature_store_id)
        self._storage_connector_api = storage_connector_api.StorageConnectorApi(
            feature_store_id
        )
        self._transformation_function_engine = (
            transformation_function_engine.TransformationFunctionEngine(
                feature_store_id
            )
        )
        self._feature_view_engine = feature_view_engine.FeatureViewEngine(
            feature_store_id
        )
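Note the `features=None` default above: a mutable default such as `features=[]` is evaluated once and shared by every call that omits the argument. A minimal, self-contained sketch of that pitfall (the `broken` function is hypothetical, for illustration only):

    def broken(features=[]):
        # The same list object is reused on every call without an argument.
        features.append("x")
        return features

    broken()  # ["x"]
    broken()  # ["x", "x"] -- state leaks between unrelated calls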
Example #2
    def __init__(
        self,
        name,
        version,
        description,
        data_format,
        location,
        featurestore_id,
        storage_connector=None,
        splits=None,
        seed=None,
        cluster_analysis=None,
        created=None,
        creator=None,
        descriptive_statistics=None,
        feature_correlation_matrix=None,
        features=None,
        features_histogram=None,
        featurestore_name=None,
        id=None,
        jobs=None,
        inode_id=None,
        storage_connector_name=None,
        storage_connector_id=None,
        storage_connector_type=None,
        training_dataset_type=None,
    ):
        self._id = id
        self._name = name
        self._version = version
        self._description = description
        self._data_format = data_format
        self._seed = seed
        self._location = location

        self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
            featurestore_id)

        self._training_dataset_engine = training_dataset_engine.TrainingDatasetEngine(
            featurestore_id)

        self._storage_connector_api = storage_connector_api.StorageConnectorApi(
            featurestore_id)

        # set up depending on whether the object was user-initialized or comes from a backend response
        if training_dataset_type is None:
            # no type -> user init
            self._features = features
            self.storage_connector = storage_connector
            self.splits = splits
        else:
            # type available -> init from backend response
            # make rest call to get all connector information, description etc.
            self._storage_connector = self._storage_connector_api.get_by_id(
                storage_connector_id, storage_connector_type)
            self._features = [
                feature.Feature.from_response_json(feat) for feat in features
            ]
            self._splits = splits
            self._training_dataset_type = training_dataset_type
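The `training_dataset_type is None` check above separates user construction from rehydration of a backend response. A hedged sketch of the deserialization entry point that would take the second branch; the classmethod name and the `humps.decamelize` step are assumptions modeled on how hsfs-style clients typically parse REST payloads:

    # `humps` is assumed to be imported at module level.
    @classmethod
    def from_response_json(cls, json_dict):
        # Backend payloads are camelCase; decamelize, then **-expand into
        # __init__, where the presence of training_dataset_type routes the
        # object down the backend-response branch.
        json_decamelized = humps.decamelize(json_dict)
        return cls(**json_decamelized)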
Example #3
    def __init__(self, feature_store_id):
        self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
            feature_store_id
        )
        self._tags_api = tags_api.TagsApi(feature_store_id, self.ENTITY_TYPE)
        self._storage_connector_api = storage_connector_api.StorageConnectorApi(
            feature_store_id
        )
Example #4
    def __init__(self, feature_store_id):
        self._feature_store_id = feature_store_id

        self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
            feature_store_id)
        self._tags_api = tags_api.TagsApi(feature_store_id, self.ENTITY_TYPE)
        self._storage_connector_api = storage_connector_api.StorageConnectorApi(
            feature_store_id)
        self._transformation_function_engine = (
            transformation_function_engine.TransformationFunctionEngine(
                feature_store_id))
Example #5
    def __init__(
        self,
        featurestore_id,
        featurestore_name,
        created,
        hdfs_store_path,
        project_name,
        project_id,
        featurestore_description,
        inode_id,
        offline_featurestore_name,
        hive_endpoint,
        online_enabled,
        num_feature_groups=None,
        num_training_datasets=None,
        num_storage_connectors=None,
        online_featurestore_name=None,
        mysql_server_endpoint=None,
        online_featurestore_size=None,
    ):
        self._id = featurestore_id
        self._name = featurestore_name
        self._created = created
        self._hdfs_store_path = hdfs_store_path
        self._project_name = project_name
        self._project_id = project_id
        self._description = featurestore_description
        self._inode_id = inode_id
        self._online_feature_store_name = online_featurestore_name
        self._online_feature_store_size = online_featurestore_size
        self._offline_feature_store_name = offline_featurestore_name
        self._hive_endpoint = hive_endpoint
        self._mysql_server_endpoint = mysql_server_endpoint
        self._online_enabled = online_enabled
        self._num_feature_groups = num_feature_groups
        self._num_training_datasets = num_training_datasets
        self._num_storage_connectors = num_storage_connectors

        self._feature_group_api = feature_group_api.FeatureGroupApi(self._id)
        self._storage_connector_api = storage_connector_api.StorageConnectorApi(
            self._id)
        self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
            self._id)
        self._expectations_api = expectations_api.ExpectationsApi(self._id)

        self._feature_group_engine = feature_group_engine.FeatureGroupEngine(
            self._id)

        self._transformation_function_engine = (
            transformation_function_engine.TransformationFunctionEngine(
                self._id))
        self._feature_view_engine = feature_view_engine.FeatureViewEngine(
            self._id)
Example #6
    def write_training_dataset(
        self,
        training_dataset,
        dataset,
        user_write_options,
        save_mode,
        feature_view_obj=None,
        to_df=False,
    ):
        if not feature_view_obj and not isinstance(dataset, query.Query):
            raise Exception(
                "Currently only query based training datasets are supported by the Python engine"
            )

        # As when creating a feature group, users can pass a
        # spark_job_configuration object in user_write_options under the key "spark"
        spark_job_configuration = user_write_options.pop("spark", None)
        td_app_conf = training_dataset_job_conf.TrainingDatsetJobConf(
            query=dataset,
            overwrite=(save_mode == "overwrite"),
            write_options=user_write_options,
            spark_job_configuration=spark_job_configuration,
        )

        if feature_view_obj:
            fv_api = feature_view_api.FeatureViewApi(feature_view_obj.featurestore_id)
            td_job = fv_api.compute_training_dataset(
                feature_view_obj.name,
                feature_view_obj.version,
                training_dataset.version,
                td_app_conf,
            )
        else:
            td_api = training_dataset_api.TrainingDatasetApi(
                training_dataset.feature_store_id
            )
            td_job = td_api.compute(training_dataset, td_app_conf)
        print(
            "Training dataset job started successfully, you can follow the progress at \n{}".format(
                self._get_job_url(td_job.href)
            )
        )

        # Honor the wait_for_job option if the user passed one;
        # otherwise default to True
        self._wait_for_job(td_job, user_write_options)

        return td_job
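A hedged usage sketch of the method above; `engine`, `td`, and `td_query` are hypothetical, and the option keys follow the code's own behavior ("spark" is popped into the job configuration, "wait_for_job" is read by `_wait_for_job`):

    job = engine.write_training_dataset(
        training_dataset=td,
        dataset=td_query,  # must be a query.Query when no feature view is given
        user_write_options={
            "spark": {"spark.executor.memory": "4g"},  # popped into the job conf
            "wait_for_job": False,                     # honored by _wait_for_job
        },
        save_mode="overwrite",
    )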
Example #7
    def write_training_dataset(self, training_dataset, features,
                               user_write_options, save_mode):
        if not isinstance(features, query.Query):
            raise Exception(
                "Currently only query based training datasets are supported by the Python engine"
            )

        # As when creating a feature group, users can pass a
        # spark_job_configuration object in user_write_options under the key "spark"
        spark_job_configuration = user_write_options.pop("spark", None)
        td_app_conf = training_dataset_job_conf.TrainingDatsetJobConf(
            query=features,
            overwrite=(save_mode == "overwrite"),
            write_options=user_write_options,
            spark_job_configuration=spark_job_configuration,
        )

        td_api = training_dataset_api.TrainingDatasetApi(
            training_dataset.feature_store_id)
        td_job = td_api.compute(training_dataset, td_app_conf)
        print(
            "Training dataset job started successfully, you can follow the progress at {}"
            .format(self._get_job_url(td_job.href)))
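The guard clause above must wrap its message in an exception class: in Python 3, raising a non-exception object is itself a `TypeError`, which would mask the intended error. A minimal demonstration:

    try:
        raise "just a string"
    except TypeError as err:
        print(err)  # exceptions must derive from BaseException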
Example #8
    def __init__(
        self,
        featurestore_id,
        featurestore_name,
        created,
        hdfs_store_path,
        project_name,
        project_id,
        featurestore_description,
        inode_id,
        offline_featurestore_name,
        hive_endpoint,
        online_enabled,
        online_featurestore_name=None,
        mysql_server_endpoint=None,
        online_featurestore_size=None,
    ):
        self._id = featurestore_id
        self._name = featurestore_name
        self._created = created
        self._hdfs_store_path = hdfs_store_path
        self._project_name = project_name
        self._project_id = project_id
        self._description = featurestore_description
        self._inode_id = inode_id
        self._online_feature_store_name = online_featurestore_name
        self._online_feature_store_size = online_featurestore_size
        self._offline_feature_store_name = offline_featurestore_name
        self._hive_endpoint = hive_endpoint
        self._mysql_server_endpoint = mysql_server_endpoint
        self._online_enabled = online_enabled

        self._feature_group_api = feature_group_api.FeatureGroupApi(self._id)
        self._storage_connector_api = storage_connector_api.StorageConnectorApi(
            self._id)
        self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
            self._id)

        self._feature_group_engine = feature_group_engine.FeatureGroupEngine(
            self._id)
Example #9
    def __init__(
        self,
        name,
        version,
        data_format,
        featurestore_id,
        location="",
        event_start_time=None,
        event_end_time=None,
        coalesce=False,
        description=None,
        storage_connector=None,
        splits=None,
        validation_size=None,
        test_size=None,
        train_start=None,
        train_end=None,
        validation_start=None,
        validation_end=None,
        test_start=None,
        test_end=None,
        seed=None,
        created=None,
        creator=None,
        features=None,
        statistics_config=None,
        featurestore_name=None,
        id=None,
        inode_id=None,
        training_dataset_type=None,
        from_query=None,
        querydto=None,
        label=None,
        transformation_functions=None,
        train_split=None,
    ):
        self._id = id
        self._name = name
        self._version = version
        self._description = description
        self._data_format = data_format
        self._start_time = self._convert_event_time_to_timestamp(
            event_start_time)
        self._end_time = self._convert_event_time_to_timestamp(event_end_time)
        self._validation_size = validation_size
        self._test_size = test_size
        self._train_start = train_start
        self._train_end = train_end
        self._validation_start = validation_start
        self._validation_end = validation_end
        self._test_start = test_start
        self._test_end = test_end
        self._coalesce = coalesce
        self._seed = seed
        self._location = location
        self._from_query = from_query
        self._querydto = querydto
        self._feature_store_id = featurestore_id
        self._transformation_functions = transformation_functions
        self._train_split = train_split

        self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
            featurestore_id)

        self._training_dataset_engine = training_dataset_engine.TrainingDatasetEngine(
            featurestore_id)

        self._statistics_engine = statistics_engine.StatisticsEngine(
            featurestore_id, self.ENTITY_TYPE)

        self._code_engine = code_engine.CodeEngine(featurestore_id,
                                                   self.ENTITY_TYPE)

        self._transformation_function_engine = (
            transformation_function_engine.TransformationFunctionEngine(
                featurestore_id))
        if training_dataset_type:
            self.training_dataset_type = training_dataset_type
        else:
            self._training_dataset_type = None
        # set up depending on whether the object was user-initialized or comes from a backend response
        if created is None:
            # no type -> user init
            self._features = features
            self.storage_connector = storage_connector
            self.splits = splits
            self.statistics_config = statistics_config
            self._label = label
            if validation_size or test_size:
                self._train_split = TrainingDatasetSplit.TRAIN
                self.splits = {
                    TrainingDatasetSplit.TRAIN: 1 - (validation_size or 0) - (test_size or 0),
                    TrainingDatasetSplit.VALIDATION: validation_size,
                    TrainingDatasetSplit.TEST: test_size,
                }
            self._set_time_splits(
                train_start,
                train_end,
                validation_start,
                validation_end,
                test_start,
                test_end,
            )
        else:
            # type available -> init from backend response
            # make rest call to get all connector information, description etc.
            self._storage_connector = StorageConnector.from_response_json(
                storage_connector)

            if features is None:
                features = []
            self._features = [
                training_dataset_feature.TrainingDatasetFeature.from_response_json(feat)
                for feat in features
            ]
            self._splits = [
                TrainingDatasetSplit.from_response_json(split)
                for split in splits
            ]
            self._statistics_config = StatisticsConfig.from_response_json(
                statistics_config)
            self._label = [
                feat.name.lower() for feat in self._features if feat.label
            ]

        self._vector_server = vector_server.VectorServer(
            featurestore_id, features=self._features)
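When `validation_size` or `test_size` is given, the train fraction above is whatever remains after the other two splits. A quick worked example with illustrative values:

    validation_size, test_size = 0.2, 0.1
    train_size = 1 - (validation_size or 0) - (test_size or 0)  # 0.7
    assert abs(train_size + validation_size + test_size - 1.0) < 1e-9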
Example #10
    def __init__(
        self,
        name,
        version,
        data_format,
        location,
        featurestore_id,
        coalesce=False,
        description=None,
        storage_connector=None,
        splits=None,
        seed=None,
        created=None,
        creator=None,
        features=None,
        statistics_config=None,
        featurestore_name=None,
        id=None,
        inode_id=None,
        training_dataset_type=None,
        from_query=None,
        querydto=None,
        label=None,
    ):
        self._id = id
        self._name = name
        self._version = version
        self._description = description
        self._data_format = data_format
        self._coalesce = coalesce
        self._seed = seed
        self._location = location
        self._from_query = from_query
        self._querydto = querydto
        self._feature_store_id = featurestore_id
        self._prepared_statement_connection = None
        self._prepared_statements = None
        self._serving_keys = None

        self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
            featurestore_id)

        self._training_dataset_engine = training_dataset_engine.TrainingDatasetEngine(
            featurestore_id)

        self._statistics_engine = statistics_engine.StatisticsEngine(
            featurestore_id, self.ENTITY_TYPE)

        # set up depending on whether the object was user-initialized or comes from a backend response
        if training_dataset_type is None:
            # no type -> user init
            self._features = features
            self.storage_connector = storage_connector
            self.splits = splits
            self.statistics_config = statistics_config
            self._label = label
        else:
            # type available -> init from backend response
            # make rest call to get all connector information, description etc.
            self._storage_connector = StorageConnector.from_response_json(
                storage_connector)

            self._features = [
                training_dataset_feature.TrainingDatasetFeature.from_response_json(feat)
                for feat in features
            ]
            self._splits = splits
            self._training_dataset_type = training_dataset_type
            self._statistics_config = StatisticsConfig.from_response_json(
                statistics_config)
            self._label = [
                feat.name.lower() for feat in self._features if feat.label
            ]
Example #11
    def __init__(self, feature_store_id):
        self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
            feature_store_id)
        self._tags_api = tags_api.TagsApi(feature_store_id, self.ENTITY_TYPE)
Example #12
    def __init__(
        self,
        name,
        version,
        data_format,
        location,
        featurestore_id,
        description=None,
        storage_connector=None,
        splits=None,
        seed=None,
        created=None,
        creator=None,
        features=None,
        statistics_config=None,
        featurestore_name=None,
        id=None,
        jobs=None,
        inode_id=None,
        storage_connector_name=None,
        storage_connector_id=None,
        storage_connector_type=None,
        training_dataset_type=None,
        from_query=None,
        querydto=None,
    ):
        self._id = id
        self._name = name
        self._version = version
        self._description = description
        self._data_format = data_format
        self._seed = seed
        self._location = location
        self._from_query = from_query
        self._querydto = querydto

        self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
            featurestore_id)

        self._training_dataset_engine = training_dataset_engine.TrainingDatasetEngine(
            featurestore_id)

        self._storage_connector_api = storage_connector_api.StorageConnectorApi(
            featurestore_id)

        self._statistics_engine = statistics_engine.StatisticsEngine(
            featurestore_id, self.ENTITY_TYPE)

        # set up depending on whether the object was user-initialized or comes from a backend response
        if training_dataset_type is None:
            # no type -> user init
            self._features = features
            self.storage_connector = storage_connector
            self.splits = splits
            self.statistics_config = statistics_config
        else:
            # type available -> init from backend response
            # make rest call to get all connector information, description etc.
            self._storage_connector = self._storage_connector_api.get_by_id(
                storage_connector_id, storage_connector_type)
            self._features = [
                training_dataset_feature.TrainingDatasetFeature.from_response_json(feat)
                for feat in features
            ]
            self._splits = splits
            self._training_dataset_type = training_dataset_type
            self.statistics_config = None
Example #13
    def __init__(self, feature_store_id):
        self._training_dataset_api = training_dataset_api.TrainingDatasetApi(
            feature_store_id)
        self._tags_api = tags_api.TagsApi(feature_store_id, "trainingdatasets")