def _setup_hudi_read_opts(self, start_timestamp, end_timestamp, read_options):
    """Build the Spark read options for an incremental Hudi query between two commit timestamps."""
    _hudi_commit_start_time = util.get_hudi_datestr_from_timestamp(start_timestamp)
    _hudi_commit_end_time = util.get_hudi_datestr_from_timestamp(end_timestamp)

    hudi_options = {
        self.HUDI_QUERY_TYPE_OPT_KEY: self.HUDI_QUERY_TYPE_INCREMENTAL_OPT_VAL,
        self.HUDI_BEGIN_INSTANTTIME_OPT_KEY: _hudi_commit_start_time,
        self.HUDI_END_INSTANTTIME_OPT_KEY: _hudi_commit_end_time,
    }

    # Caller-supplied options extend or override the incremental defaults.
    if read_options:
        hudi_options.update(read_options)

    return hudi_options
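# Hedged usage sketch (not part of the original source): the dict returned by
# _setup_hudi_read_opts is meant to be passed as Spark datasource options for an
# incremental Hudi read. The literal option keys below are what the HUDI_* class
# constants are assumed to resolve to; `spark`, `base_path`, and the instant times
# are hypothetical illustration values.
def _example_incremental_hudi_read(spark, base_path):
    hudi_options = {
        "hoodie.datasource.query.type": "incremental",
        "hoodie.datasource.read.begin.instanttime": "20240101000000",
        "hoodie.datasource.read.end.instanttime": "20240201000000",
    }
    # Load only the rows written by commits between the two instant times.
    return spark.read.format("hudi").options(**hudi_options).load(base_path)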
def commit_details(self, feature_group, wallclock_time, limit):
    """Return commit metadata for a Hudi time-travel enabled feature group, keyed by commit id."""
    if (
        feature_group._time_travel_format is None
        or feature_group._time_travel_format.upper() != "HUDI"
    ):
        raise exceptions.FeatureStoreException(
            "commit_details can only be used on time travel enabled feature groups"
        )

    wallclock_timestamp = (
        util.get_timestamp_from_date_string(wallclock_time)
        if wallclock_time is not None
        else None
    )
    feature_group_commits = self._feature_group_api.get_commit_details(
        feature_group, wallclock_timestamp, limit
    )
    commit_details = {}
    for feature_group_commit in feature_group_commits:
        commit_details[feature_group_commit.commitid] = {
            "committedOn": util.get_hudi_datestr_from_timestamp(
                feature_group_commit.commitid
            ),
            "rowsUpdated": feature_group_commit.rows_updated,
            "rowsInserted": feature_group_commit.rows_inserted,
            "rowsDeleted": feature_group_commit.rows_deleted,
        }
    return commit_details
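# Hedged usage sketch (assumption): shape of the mapping returned by commit_details.
# `fg_engine` and `fg` stand in for an instance of the engine class above and a
# Hudi-backed feature group; the commit ids and "committedOn" strings are illustrative.
def _example_print_commit_details(fg_engine, fg):
    details = fg_engine.commit_details(fg, wallclock_time=None, limit=3)
    # details resembles:
    # {
    #     1689345678000: {
    #         "committedOn": "20230714143438",
    #         "rowsUpdated": 10,
    #         "rowsInserted": 120,
    #         "rowsDeleted": 0,
    #     },
    #     ...
    # }
    for commit_id, info in details.items():
        print(commit_id, info["committedOn"], info["rowsInserted"])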
def compute_statistics(
    self, metadata_instance, feature_dataframe=None, feature_group_commit_id=None
):
    """Compute statistics for a dataframe and send the result json to Hopsworks."""
    if engine.get_type() == "spark":
        # If the feature dataframe is None, then trigger a read on the metadata instance
        # We do it here to avoid making a useless request when using the Hive engine
        # and calling compute_statistics
        if feature_dataframe is None:
            if feature_group_commit_id is not None:
                feature_dataframe = (
                    metadata_instance.select_all()
                    .as_of(
                        util.get_hudi_datestr_from_timestamp(feature_group_commit_id)
                    )
                    .read(online=False, dataframe_type="default", read_options={})
                )
            else:
                feature_dataframe = metadata_instance.read()

        commit_time = int(float(datetime.datetime.now().timestamp()) * 1000)
        if len(feature_dataframe.head(1)) == 0:
            raise exceptions.FeatureStoreException(
                "There is no data in the entity that you are trying to compute "
                "statistics for. A possible cause might be that you inserted only data "
                "to the online storage of a feature group."
            )

        content_str = engine.get_instance().profile(
            feature_dataframe,
            metadata_instance.statistics_config.columns,
            metadata_instance.statistics_config.correlations,
            metadata_instance.statistics_config.histograms,
        )
        stats = statistics.Statistics(
            commit_time, feature_group_commit_id, content_str
        )
        self._statistics_api.post(metadata_instance, stats)
        return stats
    else:
        # Hive engine
        engine.get_instance().profile(metadata_instance)
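# Minimal sketch of the emptiness check used above: Spark's DataFrame.head(1) returns a
# Python list with at most one Row, so len(df.head(1)) == 0 detects an empty dataframe
# without triggering a full count over the offline storage. The helper name is hypothetical.
def _example_is_empty(df):
    return len(df.head(1)) == 0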
def compute_statistics(
    self,
    metadata_instance,
    feature_dataframe=None,
    feature_group_commit_id=None,
    feature_view_obj=None,
):
    """Compute statistics for a dataframe and send the result json to Hopsworks."""
    if engine.get_type() == "spark" or feature_view_obj is not None:
        # If the feature dataframe is None, then trigger a read on the metadata instance
        # We do it here to avoid making a useless request when using the Python engine
        # and calling compute_statistics
        if feature_dataframe is None:
            if feature_group_commit_id is not None:
                feature_dataframe = (
                    metadata_instance.select_all()
                    .as_of(
                        util.get_hudi_datestr_from_timestamp(feature_group_commit_id)
                    )
                    .read(online=False, dataframe_type="default", read_options={})
                )
            else:
                feature_dataframe = metadata_instance.read()

        commit_time = int(float(datetime.datetime.now().timestamp()) * 1000)
        content_str = self.profile_statistics(metadata_instance, feature_dataframe)
        if content_str:
            stats = statistics.Statistics(
                commit_time=commit_time,
                content=content_str,
                feature_group_commit_id=feature_group_commit_id,
            )
            self._save_statistics(stats, metadata_instance, feature_view_obj)
    else:
        # Python engine
        engine.get_instance().profile_by_spark(metadata_instance)
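# Hedged usage sketch (assumption): how the two branches of the second compute_statistics
# variant are exercised. `stats_engine` and `fg` are hypothetical instances of the engine
# above and a feature group; the commit id is an illustrative value.
def _example_compute_statistics(stats_engine, fg):
    # Spark engine (or a feature view passed in): the dataframe is read as of the given
    # commit, profiled via profile_statistics, and persisted through _save_statistics.
    stats_engine.compute_statistics(fg, feature_group_commit_id=1689345678000)

    # Python engine without a feature view: profiling is delegated to the backend through
    # engine.get_instance().profile_by_spark(fg).
    stats_engine.compute_statistics(fg)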