Example #1
def pull_changes(self, wallclock_start_time, wallclock_end_time):
    # Store the parsed wallclock boundaries as the time travel window of
    # the left feature group for this query.
    self.left_feature_group_start_time = util.get_timestamp_from_date_string(
        wallclock_start_time
    )
    self.left_feature_group_end_time = util.get_timestamp_from_date_string(
        wallclock_end_time
    )
    return self
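A brief usage sketch may help: assuming an hsfs-style `Query` built from a feature group `fg` (both `fg` and the chained `read()` call are assumptions, not shown above), the changes between two wallclock times could be pulled like this:

# Minimal usage sketch; `fg` and `read()` are assumed hsfs-style APIs.
changes_df = fg.select_all().pull_changes("20210101", "20210131").read()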
Example #2
def commit_details(self, feature_group, wallclock_time, limit):
    # Commit details are only available for Hudi time-travel-enabled
    # feature groups.
    if (
        feature_group._time_travel_format is None
        or feature_group._time_travel_format.upper() != "HUDI"
    ):
        raise exceptions.FeatureStoreException(
            "commit_details can only be used on time travel enabled feature groups"
        )

    # The wallclock time is optional; without it the API returns the most
    # recent commits up to `limit`.
    wallclock_timestamp = (
        util.get_timestamp_from_date_string(wallclock_time)
        if wallclock_time is not None
        else None
    )
    feature_group_commits = self._feature_group_api.get_commit_details(
        feature_group, wallclock_timestamp, limit
    )
    # Key each commit by its id and expose human-readable metadata.
    commit_details = {}
    for feature_group_commit in feature_group_commits:
        commit_details[feature_group_commit.commitid] = {
            "committedOn": util.get_hudi_datestr_from_timestamp(
                feature_group_commit.commitid
            ),
            "rowsUpdated": feature_group_commit.rows_updated,
            "rowsInserted": feature_group_commit.rows_inserted,
            "rowsDeleted": feature_group_commit.rows_deleted,
        }
    return commit_details
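A hedged usage sketch, assuming `engine` is an instance of the class defining `commit_details` and `fg` is a Hudi-enabled feature group (both names are assumptions):

# Usage sketch; `engine` and `fg` are assumed names, not from the example.
details = engine.commit_details(fg, wallclock_time=None, limit=5)
for commit_id, info in details.items():
    print(commit_id, info["committedOn"], info["rowsInserted"], info["rowsDeleted"])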
Example #3
def as_of(self, wallclock_time):
    # Apply the point-in-time filter to every joined query as well as to
    # the left feature group of this query.
    wallclock_timestamp = util.get_timestamp_from_date_string(wallclock_time)
    for join in self._joins:
        join.query.left_feature_group_end_time = wallclock_timestamp
    self.left_feature_group_end_time = wallclock_timestamp
    return self
Example #4
def _get_last_commit_metadata(spark_context, base_path):
    # Get a Hadoop FileSystem handle through the Spark JVM gateway.
    hopsfs_conf = spark_context._jvm.org.apache.hadoop.fs.FileSystem.get(
        spark_context._jsc.hadoopConfiguration()
    )
    # Read the Hudi timeline of all completed commits and compactions
    # under the table's base path.
    commit_timeline = spark_context._jvm.org.apache.hudi.HoodieDataSourceHelpers.allCompletedCommitsCompactions(
        hopsfs_conf, base_path
    )

    # Deserialize the metadata of the most recent instant on the timeline.
    commits_to_return = commit_timeline.getInstantDetails(
        commit_timeline.lastInstant().get()
    ).get()
    commit_metadata = spark_context._jvm.org.apache.hudi.common.model.HoodieCommitMetadata.fromBytes(
        commits_to_return,
        spark_context._jvm.org.apache.hudi.common.model.HoodieCommitMetadata().getClass(),
    )
    # Wrap the commit timestamp and row counts in a FeatureGroupCommit DTO.
    return feature_group_commit.FeatureGroupCommit(
        commitid=None,
        commit_date_string=commit_timeline.lastInstant().get().getTimestamp(),
        commit_time=util.get_timestamp_from_date_string(
            commit_timeline.lastInstant().get().getTimestamp()
        ),
        rows_inserted=commit_metadata.fetchTotalInsertRecordsWritten(),
        rows_updated=commit_metadata.fetchTotalUpdateRecordsWritten(),
        rows_deleted=commit_metadata.getTotalRecordsDeleted(),
    )
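Since the helper only needs a SparkContext and the base path of a Hudi table, a minimal call sketch looks like this (`sc` and the table path are assumptions; the method is private, so a direct call is for illustration only):

# Usage sketch; `sc` is an active SparkContext, the path is hypothetical.
last = _get_last_commit_metadata(sc, "hdfs:///projects/demo/fg_hudi")
print(last.rows_inserted, last.rows_updated, last.rows_deleted)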
Example #5
def _convert_event_time_to_timestamp(self, event_time):
    # Missing event times pass through as None.
    if not event_time:
        return None
    if isinstance(event_time, str):
        # Date strings are parsed as UTC.
        return util.get_timestamp_from_date_string(event_time, timezone.utc)
    elif isinstance(event_time, int):
        # Integers are assumed to already be millisecond timestamps.
        if event_time < 1000:
            raise ValueError("Timestamp should be greater than or equal to 1000 ms")
        return event_time
    else:
        raise ValueError("Given event time should be in `str` or `int` type")
Example #6
def get(
    self,
    metadata_instance,
    commit_time,
    for_transformation=False,
    training_dataset_version=None,
):
    """Get Statistics with the specified commit time of an entity."""
    commit_timestamp = util.get_timestamp_from_date_string(commit_time)
    return self._statistics_api.get(
        metadata_instance,
        commit_timestamp,
        for_transformation,
        training_dataset_version,
    )
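A call sketch, assuming `statistics_engine` is an instance of the class above and `fg` is the entity whose statistics were committed (both names are assumptions):

# Usage sketch; `statistics_engine` and `fg` are assumed names.
stats = statistics_engine.get(fg, "20220101")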
Example #7
def as_of(self, wallclock_time):
    """Perform time travel on the given Query.

    This method returns a new Query object at the specified point in time.
    This can then either be read into a Dataframe or used further to perform joins
    or construct a training dataset.

    # Arguments
        wallclock_time: Datetime string. The string should be formatted in one of the
            following formats `%Y%m%d`, `%Y%m%d%H`, `%Y%m%d%H%M`, or `%Y%m%d%H%M%S`.

    # Returns
        `Query`. The query object with the applied time travel condition.
    """
    wallclock_timestamp = util.get_timestamp_from_date_string(wallclock_time)
    for join in self._joins:
        join.query.left_feature_group_end_time = wallclock_timestamp
    self.left_feature_group_end_time = wallclock_timestamp
    return self
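A usage sketch following the docstring's accepted formats, assuming a feature group `fg` exposing a `select_all()` query builder (an assumption, not shown above):

# Usage sketch; `fg` is an assumed hsfs-style feature group.
df = fg.select_all().as_of("20201231235959").read()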
Example #8
def get(self, metadata_instance, commit_time):
    """Get Statistics with the specified commit time of an entity."""
    commit_timestamp = util.get_timestamp_from_date_string(commit_time)
    return self._statistics_api.get(metadata_instance, commit_timestamp)