def pull_changes(self, wallclock_start_time, wallclock_end_time):
    """Restrict this query to changes committed inside a wallclock window.

    # Arguments
        wallclock_start_time: Datetime string marking the start of the window.
        wallclock_end_time: Datetime string marking the end of the window.

    # Returns
        `Query`. This query object with the window bounds applied (fluent API).
    """
    start_ts = util.get_timestamp_from_date_string(wallclock_start_time)
    end_ts = util.get_timestamp_from_date_string(wallclock_end_time)
    self.left_feature_group_start_time = start_ts
    self.left_feature_group_end_time = end_ts
    return self
def commit_details(self, feature_group, wallclock_time, limit):
    """Return commit metadata for a HUDI time-travel enabled feature group.

    # Arguments
        feature_group: Feature group whose commit history is fetched.
        wallclock_time: Optional datetime string; when not `None` it is
            converted to a timestamp and passed to the backend lookup.
        limit: Maximum number of commits to fetch.

    # Returns
        `dict`. Maps each commit id to its commit date string and the
        inserted/updated/deleted row counts.

    # Raises
        `FeatureStoreException`: If the feature group is not HUDI-backed.
    """
    time_travel_format = feature_group._time_travel_format
    if time_travel_format is None or time_travel_format.upper() != "HUDI":
        raise exceptions.FeatureStoreException(
            "commit_details can only be used on time travel enabled feature groups"
        )

    wallclock_timestamp = None
    if wallclock_time is not None:
        wallclock_timestamp = util.get_timestamp_from_date_string(wallclock_time)

    commits = self._feature_group_api.get_commit_details(
        feature_group, wallclock_timestamp, limit
    )
    # The commit id doubles as the commit timestamp, hence the datestr
    # conversion below uses commitid directly.
    return {
        commit.commitid: {
            "committedOn": util.get_hudi_datestr_from_timestamp(commit.commitid),
            "rowsUpdated": commit.rows_updated,
            "rowsInserted": commit.rows_inserted,
            "rowsDeleted": commit.rows_deleted,
        }
        for commit in commits
    }
def as_of(self, wallclock_time):
    """Pin this query (and the queries of all its joins) to a point in time.

    # Arguments
        wallclock_time: Datetime string identifying the time-travel target.

    # Returns
        `Query`. This query with the time-travel condition applied (fluent API).
    """
    timestamp = util.get_timestamp_from_date_string(wallclock_time)
    self.left_feature_group_end_time = timestamp
    for join in self._joins:
        join.query.left_feature_group_end_time = timestamp
    return self
def _get_last_commit_metadata(spark_context, base_path):
    """Read the most recent completed Hudi commit at `base_path` via the JVM.

    # Arguments
        spark_context: Active SparkContext with Hudi classes on the JVM classpath.
        base_path: Filesystem path of the Hudi table.

    # Returns
        `FeatureGroupCommit`. Carries the commit date string/timestamp and the
        insert/update/delete row counts of the last commit.
    """
    jvm = spark_context._jvm
    fs_conf = jvm.org.apache.hadoop.fs.FileSystem.get(
        spark_context._jsc.hadoopConfiguration()
    )
    timeline = jvm.org.apache.hudi.HoodieDataSourceHelpers.allCompletedCommitsCompactions(
        fs_conf, base_path
    )
    # NOTE(review): assumes the timeline is non-empty — lastInstant().get()
    # would fail otherwise; confirm callers invoke this only after a write.
    last_instant = timeline.lastInstant().get()
    commit_bytes = timeline.getInstantDetails(last_instant).get()
    metadata_cls = jvm.org.apache.hudi.common.model.HoodieCommitMetadata
    metadata = metadata_cls.fromBytes(commit_bytes, metadata_cls().getClass())
    commit_datestr = last_instant.getTimestamp()
    return feature_group_commit.FeatureGroupCommit(
        commitid=None,
        commit_date_string=commit_datestr,
        commit_time=util.get_timestamp_from_date_string(commit_datestr),
        rows_inserted=metadata.fetchTotalInsertRecordsWritten(),
        rows_updated=metadata.fetchTotalUpdateRecordsWritten(),
        rows_deleted=metadata.getTotalRecordsDeleted(),
    )
def _convert_event_time_to_timestamp(self, event_time): if not event_time: return None if isinstance(event_time, str): return util.get_timestamp_from_date_string(event_time, timezone.utc) elif isinstance(event_time, int): if event_time < 1000: raise ValueError( "Timestamp should be greater than or equal to 1000 ms") return event_time else: raise ValueError( "Given event time should be in `str` or `int` type")
def get(
    self,
    metadata_instance,
    commit_time,
    for_transformation=False,
    training_dataset_version=None,
):
    """Get Statistics with the specified commit time of an entity.

    # Arguments
        metadata_instance: Entity whose statistics are fetched.
        commit_time: Datetime string identifying the statistics commit.
        for_transformation: Whether to fetch transformation-function statistics.
        training_dataset_version: Optional training dataset version to scope
            the lookup.
    """
    timestamp = util.get_timestamp_from_date_string(commit_time)
    return self._statistics_api.get(
        metadata_instance,
        timestamp,
        for_transformation,
        training_dataset_version,
    )
def as_of(self, wallclock_time):
    """Perform time travel on the given Query.

    Returns the same Query pinned to the specified point in time, which can
    then be read into a Dataframe, joined further, or used to construct a
    training dataset. The condition is propagated to the query of every join.

    # Arguments
        wallclock_time: Datetime string. The String should be formatted in one
            of the following formats `%Y%m%d`, `%Y%m%d%H`, `%Y%m%d%H%M`, or
            `%Y%m%d%H%M%S`.

    # Returns
        `Query`. The query object with the applied time travel condition.
    """
    timestamp = util.get_timestamp_from_date_string(wallclock_time)
    self.left_feature_group_end_time = timestamp
    for join in self._joins:
        join.query.left_feature_group_end_time = timestamp
    return self
def get(self, metadata_instance, commit_time):
    """Get Statistics with the specified commit time of an entity.

    # Arguments
        metadata_instance: Entity whose statistics are fetched.
        commit_time: Datetime string identifying the statistics commit.
    """
    return self._statistics_api.get(
        metadata_instance, util.get_timestamp_from_date_string(commit_time)
    )