def insert(
    self,
    feature_group,
    feature_dataframe,
    overwrite,
    operation,
    storage,
    write_options,
):
    validation_id = None
    if feature_group.validation_type != "NONE":
        validation = feature_group.validate(feature_dataframe)
        validation_id = validation.validation_id

    offline_write_options = write_options
    online_write_options = self.get_kafka_config(write_options)

    if not feature_group.online_enabled and storage == "online":
        raise exceptions.FeatureStoreException(
            "Online storage is not enabled for this feature group."
        )

    if overwrite:
        self._feature_group_api.delete_content(feature_group)

    engine.get_instance().save_dataframe(
        feature_group,
        feature_dataframe,
        "bulk_insert" if overwrite else operation,
        feature_group.online_enabled,
        storage,
        offline_write_options,
        online_write_options,
        validation_id,
    )
def commit_details(self, feature_group, wallclock_time, limit):
    if (
        feature_group._time_travel_format is None
        or feature_group._time_travel_format.upper() != "HUDI"
    ):
        raise exceptions.FeatureStoreException(
            "commit_details can only be used on time travel enabled feature groups"
        )

    wallclock_timestamp = (
        util.get_timestamp_from_date_string(wallclock_time)
        if wallclock_time is not None
        else None
    )
    feature_group_commits = self._feature_group_api.get_commit_details(
        feature_group, wallclock_timestamp, limit
    )
    commit_details = {}
    for feature_group_commit in feature_group_commits:
        commit_details[feature_group_commit.commitid] = {
            "committedOn": util.get_hudi_datestr_from_timestamp(
                feature_group_commit.commitid
            ),
            "rowsUpdated": feature_group_commit.rows_updated,
            "rowsInserted": feature_group_commit.rows_inserted,
            "rowsDeleted": feature_group_commit.rows_deleted,
        }
    return commit_details
def insert_stream(
    self,
    feature_group,
    dataframe,
    query_name,
    output_mode,
    await_termination,
    timeout,
    write_options,
):
    if not feature_group.online_enabled:
        raise exceptions.FeatureStoreException(
            "Online storage is not enabled for this feature group. "
            "It is currently only possible to stream to the online storage."
        )

    if feature_group.validation_type != "NONE":
        warnings.warn(
            "Stream ingestion for feature group `{}`, with version `{}` will not perform validation.".format(
                feature_group.name, feature_group.version
            ),
            util.ValidationWarning,
        )

    return engine.get_instance().save_stream_dataframe(
        feature_group,
        dataframe,
        query_name,
        output_mode,
        await_termination,
        timeout,
        self.get_kafka_config(write_options),
    )
def compute_statistics(self, metadata_instance, feature_dataframe=None):
    """Compute statistics for a dataframe and send the result json to Hopsworks."""
    if engine.get_type() == "spark":
        # If the feature dataframe is None, then trigger a read on the metadata instance
        # We do it here to avoid making a useless request when using the Hive engine
        # and calling compute_statistics
        feature_dataframe = (
            feature_dataframe if feature_dataframe else metadata_instance.read()
        )

        commit_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

        if len(feature_dataframe.head(1)) == 0:
            raise exceptions.FeatureStoreException(
                "There is no data in the entity that you are trying to compute "
                "statistics for. A possible cause might be that you inserted only data "
                "to the online storage of a feature group."
            )

        content_str = engine.get_instance().profile(
            feature_dataframe,
            metadata_instance.statistics_config.columns,
            metadata_instance.statistics_config.correlations,
            metadata_instance.statistics_config.histograms,
        )
        stats = statistics.Statistics(commit_str, content_str)
        self._statistics_api.post(metadata_instance, stats)
        return stats
    else:
        # Hive engine
        engine.get_instance().profile(metadata_instance)
def _wait_for_job(self, job, user_write_options=None):
    # If the user passed the wait_for_job option consider it,
    # otherwise use the default True
    while user_write_options is None or user_write_options.get("wait_for_job", True):
        executions = self._job_api.last_execution(job)
        if len(executions) > 0:
            execution = executions[0]
        else:
            return

        if execution.final_status.lower() == "succeeded":
            return
        elif execution.final_status.lower() == "failed":
            raise exceptions.FeatureStoreException(
                "The Hopsworks Job failed, use the Hopsworks UI to access the job logs"
            )
        elif execution.final_status.lower() == "killed":
            raise exceptions.FeatureStoreException("The Hopsworks Job was stopped")

        time.sleep(3)
def insert(
    self,
    feature_group,
    feature_dataframe,
    overwrite,
    operation,
    storage,
    write_options,
    validation_options,
):
    if not feature_group._id:
        self._save_feature_group_metadata(
            feature_group, feature_dataframe, write_options
        )

    # deequ validation only on spark
    validation = feature_group._data_validation_engine.ingest_validate(
        feature_group, feature_dataframe
    )
    validation_id = validation.validation_id if validation is not None else None

    # ge validation on python and non stream feature groups on spark
    ge_report = feature_group._great_expectation_engine.validate(
        feature_group, feature_dataframe, True, validation_options
    )
    if ge_report is not None and ge_report.ingestion_result == "REJECTED":
        return None, ge_report

    offline_write_options = write_options
    online_write_options = self.get_kafka_config(write_options)

    if not feature_group.online_enabled and storage == "online":
        raise exceptions.FeatureStoreException(
            "Online storage is not enabled for this feature group."
        )

    if overwrite:
        self._feature_group_api.delete_content(feature_group)

    return (
        engine.get_instance().save_dataframe(
            feature_group,
            feature_dataframe,
            "bulk_insert" if overwrite else operation,
            feature_group.online_enabled,
            storage,
            offline_write_options,
            online_write_options,
            validation_id,
        ),
        ge_report,
    )
def _set_label_features(self):
    for f_name in self._label:
        found = False
        for f in self._features:
            if f_name == f.name:
                f.label = True
                found = True
                break
        if not found:
            raise exceptions.FeatureStoreException(
                "The specified label `{}` could not be found among the features: {}.".format(
                    f_name, [feat.name for feat in self._features]
                )
            )
def profile_transformation_fn_statistics(
    self, feature_dataframe, columns, label_encoder_features
):
    if (
        engine.get_type() == "spark"
        and len(feature_dataframe.select(*columns).head(1)) == 0
    ) or (
        (engine.get_type() == "hive" or engine.get_type() == "python")
        and len(feature_dataframe.head()) == 0
    ):
        raise exceptions.FeatureStoreException(
            "There is no data in the entity that you are trying to compute "
            "statistics for. A possible cause might be that you inserted only data "
            "to the online storage of a feature group."
        )

    content_str = engine.get_instance().profile(
        feature_dataframe, columns, False, True, False
    )
    # add unique value profile to String type columns
    return self.profile_unique_values(
        feature_dataframe, label_encoder_features, content_str
    )
def insert(
    self,
    feature_group,
    feature_dataframe,
    overwrite,
    operation,
    storage,
    write_options,
):
    validation_id = None
    if feature_group.validation_type != "NONE":
        validation = feature_group.validate(feature_dataframe)
        validation_id = validation.validation_id

    offline_write_options = write_options
    online_write_options = write_options

    if not feature_group.online_enabled and storage == "online":
        raise exceptions.FeatureStoreException(
            "Online storage is not enabled for this feature group."
        )
    elif (
        feature_group.online_enabled and storage != "offline"
    ) or storage == "online":
        # Add JDBC connection configuration in case of online feature group
        online_conn = self._storage_connector_api.get_online_connector()

        jdbc_options = online_conn.spark_options()
        jdbc_options["dbtable"] = self._get_online_table_name(feature_group)

        online_write_options = {**jdbc_options, **online_write_options}

    if overwrite:
        self._feature_group_api.delete_content(feature_group)

    engine.get_instance().save_dataframe(
        self._get_table_name(feature_group),
        feature_group,
        feature_dataframe,
        self.APPEND,
        "bulk_insert" if overwrite else operation,
        feature_group.online_enabled,
        storage,
        offline_write_options,
        online_write_options,
        validation_id,
    )
def compute_statistics(self, metadata_instance, feature_dataframe):
    """Compute statistics for a dataframe and send the result json to Hopsworks."""
    commit_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

    if len(feature_dataframe.head(1)) == 0:
        raise exceptions.FeatureStoreException(
            "There is no data in the entity that you are trying to compute "
            "statistics for. A possible cause might be that you inserted only data "
            "to the online storage of a feature group."
        )

    content_str = engine.get_instance().profile(
        feature_dataframe,
        metadata_instance.statistics_config.columns,
        metadata_instance.statistics_config.correlations,
        metadata_instance.statistics_config.histograms,
    )
    stats = statistics.Statistics(commit_str, content_str)
    self._statistics_api.post(metadata_instance, stats)
    return stats
def init(engine_type): global _engine_type global _engine if not _engine: if engine_type == "spark": _engine_type = "spark" _engine = spark.Engine() elif engine_type == "hive": try: from hsfs.engine import hive except ImportError: raise exceptions.FeatureStoreException( "Trying to instantiate Hive as engine, but 'hive' extras are " "missing in HSFS installation. Install with `pip install " "hsfs[hive]`.") _engine_type = "hive" _engine = hive.Engine()
def init(engine_type): global _engine_type global _engine if not _engine: if engine_type == "spark": _engine_type = "spark" _engine = spark.Engine() elif engine_type in ["hive", "python", "training"]: try: from hsfs.engine import python except ImportError: raise exceptions.FeatureStoreException( "Trying to instantiate Python as engine, but 'python' extras are " "missing in HSFS installation. Install with `pip install " "hsfs[python]`.") _engine_type = "python" _engine = python.Engine() elif engine_type == "training": _engine = "training"
def compute_statistics(
    self, metadata_instance, feature_dataframe=None, feature_group_commit_id=None
):
    """Compute statistics for a dataframe and send the result json to Hopsworks."""
    if engine.get_type() == "spark":
        # If the feature dataframe is None, then trigger a read on the metadata instance
        # We do it here to avoid making a useless request when using the Hive engine
        # and calling compute_statistics
        if feature_dataframe is None:
            if feature_group_commit_id is not None:
                feature_dataframe = (
                    metadata_instance.select_all()
                    .as_of(
                        util.get_hudi_datestr_from_timestamp(feature_group_commit_id)
                    )
                    .read(online=False, dataframe_type="default", read_options={})
                )
            else:
                feature_dataframe = metadata_instance.read()

        commit_time = int(float(datetime.datetime.now().timestamp()) * 1000)

        if len(feature_dataframe.head(1)) == 0:
            raise exceptions.FeatureStoreException(
                "There is no data in the entity that you are trying to compute "
                "statistics for. A possible cause might be that you inserted only data "
                "to the online storage of a feature group."
            )

        content_str = engine.get_instance().profile(
            feature_dataframe,
            metadata_instance.statistics_config.columns,
            metadata_instance.statistics_config.correlations,
            metadata_instance.statistics_config.histograms,
        )
        stats = statistics.Statistics(
            commit_time, feature_group_commit_id, content_str
        )
        self._statistics_api.post(metadata_instance, stats)
        return stats
    else:
        # Hive engine
        engine.get_instance().profile(metadata_instance)
def insert_stream(
    self,
    feature_group,
    dataframe,
    query_name,
    output_mode,
    await_termination,
    timeout,
    checkpoint_dir,
    write_options,
):
    if not feature_group.online_enabled and not feature_group.stream:
        raise exceptions.FeatureStoreException(
            "Online storage is not enabled for this feature group. "
            "It is currently only possible to stream to the online storage."
        )

    if not feature_group._id:
        self._save_feature_group_metadata(feature_group, dataframe, write_options)

        if not feature_group.stream:
            # insert_stream was called on a non-stream feature group object that has not
            # been saved yet; call save_dataframe on an empty dataframe to create the
            # directory structure.
            offline_write_options = write_options
            online_write_options = self.get_kafka_config(write_options)
            engine.get_instance().save_dataframe(
                feature_group,
                engine.get_instance().create_empty_df(dataframe),
                hudi_engine.HudiEngine.HUDI_BULK_INSERT
                if feature_group.time_travel_format == "HUDI"
                else None,
                feature_group.online_enabled,
                None,
                offline_write_options,
                online_write_options,
            )

    if not feature_group.stream:
        warnings.warn(
            "`insert_stream` method in the next release will be available only for feature groups created with "
            "`stream=True`."
        )

    if feature_group.validation_type != "NONE":
        warnings.warn(
            "Stream ingestion for feature group `{}`, with version `{}` will not perform validation.".format(
                feature_group.name, feature_group.version
            ),
            util.ValidationWarning,
        )

    streaming_query = engine.get_instance().save_stream_dataframe(
        feature_group,
        dataframe,
        query_name,
        output_mode,
        await_termination,
        timeout,
        checkpoint_dir,
        self.get_kafka_config(write_options),
    )

    return streaming_query
def validate(self, feature_group, feature_dataframe):
    """Perform data validation for a dataframe and send the result json to Hopsworks."""
    validation_time = int(round(time.time() * 1000))
    if len(feature_dataframe.head(1)) == 0:
        raise exceptions.FeatureStoreException(
            "There is no data in the entity that you are trying to validate data "
            "for. A possible cause might be that you inserted only data "
            "to the online storage of a feature group."
        )

    expectations = self._expectations_api.get(feature_group=feature_group)
    expectation_results_java = engine.get_instance().validate(
        feature_dataframe, expectations
    )

    # Loop through the Java objects and convert them to Python
    expectation_results = []
    for exp_res in expectation_results_java:
        # Create the Expectation
        exp = exp_res.getExpectation()
        rules_python = []
        for exp_rule in exp.getRules():
            legal_values = []
            if exp_rule.getLegalValues() is not None:
                for legal_value in exp_rule.getLegalValues():
                    legal_values.append(legal_value)

            rules_python.append(
                rule.Rule(
                    name=exp_rule.getName().name(),
                    level=exp_rule.getLevel().name(),
                    min=exp_rule.getMin(),
                    max=exp_rule.getMax(),
                    value=exp_rule.getValue(),
                    pattern=exp_rule.getPattern(),
                    accepted_type=exp_rule.getAcceptedType().name()
                    if exp_rule.getAcceptedType() is not None
                    else None,
                    legal_values=legal_values,
                )
            )

        features_python = []
        for feature in exp.getFeatures():
            features_python.append(feature)

        expectation_python = expectation.Expectation(
            name=exp.getName(),
            description=exp.getDescription(),
            features=features_python,
            rules=rules_python,
        )

        # Create the ValidationResult
        validation_results_python = []
        for validation_result_java in exp_res.getResults():
            # Create the Python rule
            legal_values = []
            if validation_result_java.getRule().getLegalValues() is not None:
                for legal_value in validation_result_java.getRule().getLegalValues():
                    legal_values.append(legal_value)

            validation_rule_python = rule.Rule(
                name=validation_result_java.getRule().getName().name(),
                level=validation_result_java.getRule().getLevel().name(),
                min=validation_result_java.getRule().getMin(),
                max=validation_result_java.getRule().getMax(),
                value=validation_result_java.getRule().getValue(),
                pattern=validation_result_java.getRule().getPattern(),
                accepted_type=validation_result_java.getRule().getAcceptedType().name()
                if validation_result_java.getRule().getAcceptedType() is not None
                else None,
                legal_values=legal_values,
            )

            features = [feature for feature in validation_result_java.getFeatures()]

            validation_results_python.append(
                validation_result.ValidationResult(
                    status=validation_result_java.getStatus().name(),
                    message=validation_result_java.getMessage(),
                    value=validation_result_java.getValue(),
                    features=features,
                    rule=validation_rule_python,
                )
            )

        expectation_result_python = expectation_result.ExpectationResult(
            expectation=expectation_python, results=validation_results_python
        )
        expectation_results.append(expectation_result_python)

    validation_python = feature_group_validation.FeatureGroupValidation(
        validation_time=validation_time,
        expectation_results=expectation_results,
    )
    return self._feature_group_validation_api.put(feature_group, validation_python)