def insert(
        self,
        feature_group,
        feature_dataframe,
        overwrite,
        operation,
        storage,
        write_options,
    ):
        validation_id = None
        if feature_group.validation_type != "NONE":
            validation = feature_group.validate(feature_dataframe)
            validation_id = validation.validation_id

        offline_write_options = write_options
        online_write_options = self.get_kafka_config(write_options)

        if not feature_group.online_enabled and storage == "online":
            raise exceptions.FeatureStoreException(
                "Online storage is not enabled for this feature group.")

        if overwrite:
            self._feature_group_api.delete_content(feature_group)

        engine.get_instance().save_dataframe(
            feature_group,
            feature_dataframe,
            "bulk_insert" if overwrite else operation,
            feature_group.online_enabled,
            storage,
            offline_write_options,
            online_write_options,
            validation_id,
        )
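
A minimal call sketch for the engine-level insert above. The engine instance `fg_engine`, the feature group `fg`, and the dataframe `df` are illustrative names, not part of the snippet:

# Hypothetical names: `fg_engine` is an instance of the class that owns `insert`,
# `fg` is an already-created feature group, `df` a Spark dataframe.
fg_engine.insert(
    feature_group=fg,
    feature_dataframe=df,
    overwrite=False,       # True deletes existing content and forces "bulk_insert"
    operation="upsert",    # ignored when overwrite=True
    storage=None,          # let the engine decide between offline and online storage
    write_options={},      # also used to derive the Kafka config for online writes
)
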
    def commit_details(self, feature_group, wallclock_time, limit):
        if (
            feature_group._time_travel_format is None
            or feature_group._time_travel_format.upper() != "HUDI"
        ):
            raise exceptions.FeatureStoreException(
                "commit_details can only be used on time travel enabled feature groups"
            )

        wallclock_timestamp = (
            util.get_timestamp_from_date_string(wallclock_time)
            if wallclock_time is not None
            else None
        )
        feature_group_commits = self._feature_group_api.get_commit_details(
            feature_group, wallclock_timestamp, limit
        )
        commit_details = {}
        for feature_group_commit in feature_group_commits:
            commit_details[feature_group_commit.commitid] = {
                "committedOn": util.get_hudi_datestr_from_timestamp(
                    feature_group_commit.commitid
                ),
                "rowsUpdated": feature_group_commit.rows_updated,
                "rowsInserted": feature_group_commit.rows_inserted,
                "rowsDeleted": feature_group_commit.rows_deleted,
            }
        return commit_details
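
A sketch of calling commit_details and iterating over the returned mapping; `fg_engine` and `fg` are again illustrative:

# Hypothetical names: `fg_engine` owns `commit_details`, `fg` is a HUDI-backed feature group.
details = fg_engine.commit_details(fg, wallclock_time=None, limit=5)

# Each entry is keyed by commit id and holds "committedOn", "rowsUpdated",
# "rowsInserted" and "rowsDeleted", as built in the loop above.
for commit_id, info in details.items():
    print(commit_id, info["committedOn"], info["rowsInserted"])
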
    def insert_stream(
        self,
        feature_group,
        dataframe,
        query_name,
        output_mode,
        await_termination,
        timeout,
        write_options,
    ):
        if not feature_group.online_enabled:
            raise exceptions.FeatureStoreException(
                "Online storage is not enabled for this feature group. "
                "It is currently only possible to stream to the online storage."
            )

        if feature_group.validation_type != "NONE":
            warnings.warn(
                "Stream ingestion for feature group `{}`, with version `{}` will not perform validation."
                .format(feature_group.name, feature_group.version),
                util.ValidationWarning,
            )

        return engine.get_instance().save_stream_dataframe(
            feature_group,
            dataframe,
            query_name,
            output_mode,
            await_termination,
            timeout,
            self.get_kafka_config(write_options),
        )
    def compute_statistics(self, metadata_instance, feature_dataframe=None):
        """Compute statistics for a dataframe and send the result json to Hopsworks."""
        if engine.get_type() == "spark":
            # If the feature dataframe is None, then trigger a read on the metadata instance
            # We do it here to avoid making a useless request when using the Hive engine
            # and calling compute_statistics
            feature_dataframe = (
                feature_dataframe
                if feature_dataframe is not None
                else metadata_instance.read()
            )

            commit_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            if len(feature_dataframe.head(1)) == 0:
                raise exceptions.FeatureStoreException(
                    "There is no data in the entity that you are trying to compute "
                    "statistics for. A possible cause might be that you inserted only data "
                    "to the online storage of a feature group.")
            content_str = engine.get_instance().profile(
                feature_dataframe,
                metadata_instance.statistics_config.columns,
                metadata_instance.statistics_config.correlations,
                metadata_instance.statistics_config.histograms,
            )
            stats = statistics.Statistics(commit_str, content_str)
            self._statistics_api.post(metadata_instance, stats)
            return stats

        else:
            # Hive engine
            engine.get_instance().profile(metadata_instance)
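
A usage sketch for compute_statistics, assuming a hypothetical statistics-engine instance `stats_engine` and a feature group `fg` whose statistics_config is already set:

# Hypothetical names: `stats_engine` owns `compute_statistics`, `fg` is the metadata instance.
# On the Spark engine this profiles the dataframe (or reads `fg` when none is passed)
# and posts a Statistics object; on the Hive engine it hands `fg` to the engine's
# profile call and returns nothing.
stats = stats_engine.compute_statistics(fg)        # reads fg itself before profiling
stats = stats_engine.compute_statistics(fg, df)    # profiles an explicit dataframe `df`
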
Example #5
    def _wait_for_job(self, job, user_write_options=None):
        # If the user passed the wait_for_job option consider it,
        # otherwise use the default True
        while user_write_options is None or user_write_options.get(
            "wait_for_job", True
        ):
            executions = self._job_api.last_execution(job)
            if len(executions) > 0:
                execution = executions[0]
            else:
                return

            if execution.final_status.lower() == "succeeded":
                return
            elif execution.final_status.lower() == "failed":
                raise exceptions.FeatureStoreException(
                    "The Hopsworks Job failed, use the Hopsworks UI to access the job logs"
                )
            elif execution.final_status.lower() == "killed":
                raise exceptions.FeatureStoreException("The Hopsworks Job was stopped")

            time.sleep(3)
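
A sketch of how the wait_for_job flag in the write options interacts with this polling loop; `fg_engine` and `job` are illustrative names:

# Hypothetical names: `fg_engine` owns `_wait_for_job`, `job` is a Hopsworks job handle.
fg_engine._wait_for_job(job)                           # waits (default behaviour)
fg_engine._wait_for_job(job, {"wait_for_job": True})   # waits, polling every 3 seconds
fg_engine._wait_for_job(job, {"wait_for_job": False})  # returns immediately
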
    def insert(
        self,
        feature_group,
        feature_dataframe,
        overwrite,
        operation,
        storage,
        write_options,
        validation_options,
    ):

        if not feature_group._id:
            self._save_feature_group_metadata(feature_group, feature_dataframe,
                                              write_options)

        # Deequ validation (runs only on the Spark engine)
        validation = feature_group._data_validation_engine.ingest_validate(
            feature_group, feature_dataframe)
        validation_id = validation.validation_id if validation is not None else None

        # Great Expectations validation (Python engine, and non-stream feature groups on Spark)
        ge_report = feature_group._great_expectation_engine.validate(
            feature_group, feature_dataframe, True, validation_options)

        if ge_report is not None and ge_report.ingestion_result == "REJECTED":
            return None, ge_report

        offline_write_options = write_options
        online_write_options = self.get_kafka_config(write_options)

        if not feature_group.online_enabled and storage == "online":
            raise exceptions.FeatureStoreException(
                "Online storage is not enabled for this feature group.")

        if overwrite:
            self._feature_group_api.delete_content(feature_group)

        return (
            engine.get_instance().save_dataframe(
                feature_group,
                feature_dataframe,
                "bulk_insert" if overwrite else operation,
                feature_group.online_enabled,
                storage,
                offline_write_options,
                online_write_options,
                validation_id,
            ),
            ge_report,
        )
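
A sketch of consuming the (ingestion result, Great Expectations report) tuple returned by this variant of insert; all surrounding names are illustrative:

# Hypothetical names as before; `validation_options` is passed through to the GE engine.
result, ge_report = fg_engine.insert(
    fg, df, overwrite=False, operation="upsert",
    storage=None, write_options={}, validation_options={},
)
if ge_report is not None and ge_report.ingestion_result == "REJECTED":
    # nothing was written: the method returned early with (None, ge_report)
    print("validation failed, inspect the report")
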
    def _set_label_features(self):
        for f_name in self._label:
            found = False
            for f in self._features:
                if f_name == f.name:
                    f.label = True
                    found = True
                    break
            if not found:
                raise exceptions.FeatureStoreException(
                    "The specified label `{}` could not be found among the features: {}.".format(
                        f_name, [feat.name for feat in self._features]
                    )
                )
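
The label-matching logic above can be exercised in isolation; here is a small self-contained sketch of the same idea using stand-in feature objects rather than the library's classes:

# Stand-alone illustration of the matching rule, not library code.
class _Feature:
    def __init__(self, name):
        self.name = name
        self.label = False

features = [_Feature("id"), _Feature("amount"), _Feature("is_fraud")]
labels = ["is_fraud"]

for f_name in labels:
    match = next((f for f in features if f.name == f_name), None)
    if match is None:
        raise ValueError(f"label `{f_name}` not found among {[f.name for f in features]}")
    match.label = True
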
Example #8
    def profile_transformation_fn_statistics(
        self, feature_dataframe, columns, label_encoder_features
    ):
        if (
            engine.get_type() == "spark"
            and len(feature_dataframe.select(*columns).head(1)) == 0
        ) or (
            engine.get_type() in ("hive", "python")
            and len(feature_dataframe.head()) == 0
        ):
            raise exceptions.FeatureStoreException(
                "There is no data in the entity that you are trying to compute "
                "statistics for. A possible cause might be that you inserted only data "
                "to the online storage of a feature group.")
        content_str = engine.get_instance().profile(
            feature_dataframe, columns, False, True, False
        )

        # add unique value profile to String type columns
        return self.profile_unique_values(
            feature_dataframe, label_encoder_features, content_str
        )
    def insert(
        self,
        feature_group,
        feature_dataframe,
        overwrite,
        operation,
        storage,
        write_options,
    ):
        validation_id = None
        if feature_group.validation_type != "NONE":
            validation = feature_group.validate(feature_dataframe)
            validation_id = validation.validation_id

        offline_write_options = write_options
        online_write_options = write_options

        if not feature_group.online_enabled and storage == "online":
            raise exceptions.FeatureStoreException(
                "Online storage is not enabled for this feature group.")
        elif (feature_group.online_enabled
              and storage != "offline") or storage == "online":
            # Add JDBC connection configuration in case of online feature group
            online_conn = self._storage_connector_api.get_online_connector()

            jdbc_options = online_conn.spark_options()
            jdbc_options["dbtable"] = self._get_online_table_name(
                feature_group)

            online_write_options = {**jdbc_options, **online_write_options}

        if overwrite:
            self._feature_group_api.delete_content(feature_group)

        engine.get_instance().save_dataframe(
            self._get_table_name(feature_group),
            feature_group,
            feature_dataframe,
            self.APPEND,
            "bulk_insert" if overwrite else operation,
            feature_group.online_enabled,
            storage,
            offline_write_options,
            online_write_options,
            validation_id,
        )
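
The `{**jdbc_options, **online_write_options}` merge above means user-supplied online write options take precedence over the generated JDBC defaults; a tiny sketch of that precedence rule (plain Python, illustrative values):

# Plain-Python illustration of the dict merge used above: the later unpacking wins.
jdbc_options = {"url": "jdbc://...", "batchsize": "500"}
online_write_options = {"batchsize": "10000"}

merged = {**jdbc_options, **online_write_options}
print(merged["batchsize"])  # "10000" -- the user option overrides the JDBC default
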
Example #10
    def compute_statistics(self, metadata_instance, feature_dataframe):
        """Compute statistics for a dataframe and send the result json to Hopsworks."""
        commit_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        if len(feature_dataframe.head(1)) == 0:
            raise exceptions.FeatureStoreException(
                "There is no data in the entity that you are trying to compute "
                "statistics for. A possible cause might be that you inserted only data "
                "to the online storage of a feature group.")
        content_str = engine.get_instance().profile(
            feature_dataframe,
            metadata_instance.statistics_config.columns,
            metadata_instance.statistics_config.correlations,
            metadata_instance.statistics_config.histograms,
        )
        stats = statistics.Statistics(commit_str, content_str)
        self._statistics_api.post(metadata_instance, stats)
        return stats
Example #11
def init(engine_type):
    global _engine_type
    global _engine
    if not _engine:
        if engine_type == "spark":
            _engine_type = "spark"
            _engine = spark.Engine()
        elif engine_type == "hive":
            try:
                from hsfs.engine import hive
            except ImportError:
                raise exceptions.FeatureStoreException(
                    "Trying to instantiate Hive as engine, but 'hive' extras are "
                    "missing in HSFS installation. Install with `pip install "
                    "hsfs[hive]`.")
            _engine_type = "hive"
            _engine = hive.Engine()
Example #12
def init(engine_type):
    global _engine_type
    global _engine
    if not _engine:
        if engine_type == "spark":
            _engine_type = "spark"
            _engine = spark.Engine()
        elif engine_type in ["hive", "python", "training"]:
            try:
                from hsfs.engine import python
            except ImportError:
                raise exceptions.FeatureStoreException(
                    "Trying to instantiate Python as engine, but 'python' extras are "
                    "missing in HSFS installation. Install with `pip install "
                    "hsfs[python]`.")
            _engine_type = "python"
            _engine = python.Engine()
        elif engine_type == "training":
            # Note: unreachable as written, because "training" is already matched by the
            # branch above and mapped to the Python engine.
            _engine = "training"
Example #13
    def compute_statistics(
        self, metadata_instance, feature_dataframe=None, feature_group_commit_id=None
    ):
        """Compute statistics for a dataframe and send the result json to Hopsworks."""
        if engine.get_type() == "spark":
            # If the feature dataframe is None, then trigger a read on the metadata instance
            # We do it here to avoid making a useless request when using the Hive engine
            # and calling compute_statistics
            if feature_dataframe is None:
                if feature_group_commit_id is not None:
                    feature_dataframe = (
                        metadata_instance.select_all()
                        .as_of(
                            util.get_hudi_datestr_from_timestamp(
                                feature_group_commit_id
                            )
                        )
                        .read(online=False, dataframe_type="default", read_options={})
                    )
                else:
                    feature_dataframe = metadata_instance.read()

            commit_time = int(datetime.datetime.now().timestamp() * 1000)
            if len(feature_dataframe.head(1)) == 0:
                raise exceptions.FeatureStoreException(
                    "There is no data in the entity that you are trying to compute "
                    "statistics for. A possible cause might be that you inserted only data "
                    "to the online storage of a feature group.")
            content_str = engine.get_instance().profile(
                feature_dataframe,
                metadata_instance.statistics_config.columns,
                metadata_instance.statistics_config.correlations,
                metadata_instance.statistics_config.histograms,
            )
            stats = statistics.Statistics(commit_time, feature_group_commit_id,
                                          content_str)
            self._statistics_api.post(metadata_instance, stats)
            return stats

        else:
            # Hive engine
            engine.get_instance().profile(metadata_instance)
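
A sketch of the time-travel path of this variant: passing a Hudi commit id makes the method read the feature group as of that commit before profiling. Names are illustrative:

# Hypothetical names: `stats_engine` owns `compute_statistics`, `fg` is a HUDI feature group.
stats = stats_engine.compute_statistics(
    fg,
    feature_dataframe=None,             # forces a read via select_all().as_of(...)
    feature_group_commit_id=commit_id,  # a commit id, e.g. one of the keys from commit_details()
)
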
    def insert_stream(
        self,
        feature_group,
        dataframe,
        query_name,
        output_mode,
        await_termination,
        timeout,
        checkpoint_dir,
        write_options,
    ):

        if not feature_group.online_enabled and not feature_group.stream:
            raise exceptions.FeatureStoreException(
                "Online storage is not enabled for this feature group. "
                "It is currently only possible to stream to the online storage."
            )

        if not feature_group._id:
            self._save_feature_group_metadata(feature_group, dataframe,
                                              write_options)

            if not feature_group.stream:
                # insert_stream was called on a non-stream feature group object that has
                # not been saved yet; call save_dataframe on an empty dataframe to create
                # the directory structure.
                offline_write_options = write_options
                online_write_options = self.get_kafka_config(write_options)
                engine.get_instance().save_dataframe(
                    feature_group,
                    engine.get_instance().create_empty_df(dataframe),
                    hudi_engine.HudiEngine.HUDI_BULK_INSERT
                    if feature_group.time_travel_format == "HUDI"
                    else None,
                    feature_group.online_enabled,
                    None,
                    offline_write_options,
                    online_write_options,
                )

        if not feature_group.stream:
            warnings.warn(
                "In the next release, `insert_stream` will be available only for feature "
                "groups created with `stream=True`."
            )

        if feature_group.validation_type != "NONE":
            warnings.warn(
                "Stream ingestion for feature group `{}`, with version `{}` will not perform validation."
                .format(feature_group.name, feature_group.version),
                util.ValidationWarning,
            )

        streaming_query = engine.get_instance().save_stream_dataframe(
            feature_group,
            dataframe,
            query_name,
            output_mode,
            await_termination,
            timeout,
            checkpoint_dir,
            self.get_kafka_config(write_options),
        )

        return streaming_query
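
A call sketch for this streaming insert, again with illustrative names; the parameters mirror the signature above:

# Hypothetical names: `fg_engine` owns `insert_stream`, `fg` is a stream-enabled
# feature group, `streaming_df` a streaming Spark dataframe.
query = fg_engine.insert_stream(
    fg,
    streaming_df,
    query_name="fg_ingestion_query",
    output_mode="update",
    await_termination=False,
    timeout=None,
    checkpoint_dir=None,
    write_options={},
)
# `query` is whatever save_stream_dataframe returns, typically a streaming query handle.
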
Example #15
    def validate(self, feature_group, feature_dataframe):
        """Perform data validation for a dataframe and send the result json to Hopsworks."""
        validation_time = int(round(time.time() * 1000))
        if len(feature_dataframe.head(1)) == 0:
            raise exceptions.FeatureStoreException(
                "There is no data in the entity that you are trying to validate data "
                "for. A possible cause might be that you inserted only data "
                "to the online storage of a feature group.")

        expectations = self._expectations_api.get(feature_group=feature_group)

        expectation_results_java = engine.get_instance().validate(
            feature_dataframe, expectations)
        # Loop through Java object and convert to Python
        expectation_results = []
        for exp_res in expectation_results_java:
            # Create the Expectation
            exp = exp_res.getExpectation()
            rules_python = []
            for exp_rule in exp.getRules():
                legal_values = []
                if exp_rule.getLegalValues() is not None:
                    for legal_value in exp_rule.getLegalValues():
                        legal_values.append(legal_value)
                rules_python.append(
                    rule.Rule(
                        name=exp_rule.getName().name(),
                        level=exp_rule.getLevel().name(),
                        min=exp_rule.getMin(),
                        max=exp_rule.getMax(),
                        value=exp_rule.getValue(),
                        pattern=exp_rule.getPattern(),
                        accepted_type=exp_rule.getAcceptedType().name()
                        if exp_rule.getAcceptedType() is not None else None,
                        legal_values=legal_values,
                    ))

            features_python = []
            for feature in exp.getFeatures():
                features_python.append(feature)
            expectation_python = expectation.Expectation(
                name=exp.getName(),
                description=exp.getDescription(),
                features=features_python,
                rules=rules_python,
            )
            # Create the ValidationResult
            validation_results_python = []
            for validation_result_java in exp_res.getResults():
                # Convert the Java rule of this result into a Python Rule
                java_rule = validation_result_java.getRule()
                legal_values = []
                if java_rule.getLegalValues() is not None:
                    for legal_value in java_rule.getLegalValues():
                        legal_values.append(legal_value)
                validation_rule_python = rule.Rule(
                    name=java_rule.getName().name(),
                    level=java_rule.getLevel().name(),
                    min=java_rule.getMin(),
                    max=java_rule.getMax(),
                    value=java_rule.getValue(),
                    pattern=java_rule.getPattern(),
                    accepted_type=java_rule.getAcceptedType().name()
                    if java_rule.getAcceptedType() is not None
                    else None,
                    legal_values=legal_values,
                )

                features = list(validation_result_java.getFeatures())

                validation_results_python.append(
                    validation_result.ValidationResult(
                        status=validation_result_java.getStatus().name(),
                        message=validation_result_java.getMessage(),
                        value=validation_result_java.getValue(),
                        features=features,
                        rule=validation_rule_python,
                    ))

            expectation_result_python = expectation_result.ExpectationResult(
                expectation=expectation_python,
                results=validation_results_python)
            expectation_results.append(expectation_result_python)
        validation_python = feature_group_validation.FeatureGroupValidation(
            validation_time=validation_time,
            expectation_results=expectation_results,
        )
        return self._feature_group_validation_api.put(
            feature_group, validation_python
        )
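
A minimal call sketch for this validation path; the engine instance and inputs are illustrative, and the returned object is the FeatureGroupValidation persisted at the end of the method:

# Hypothetical names: `dv_engine` owns `validate`, `fg` has expectations attached,
# `df` is a non-empty Spark dataframe.
validation = dv_engine.validate(fg, df)
# validation_id is what the insert snippets above pass on to save_dataframe.
print(validation.validation_id)
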