Example #1
    def _extract_ts_features(self, ts_data, static_features, locations,
                             training_window_size):
        """Creates time-series feature dictionaries from data frame.

    This is an internal function to allow for feature engineering using both
    static and time series features.

    Args:
      ts_data: Time series DataFrame with columns constants.FEATURE_NAME_COLUMN,
        `ModelDefinition._LOCATION_COLUMN_NAME`, constants.DATE_COLUMN, and
        constants.FEATURE_VALUE_COLUMN.
      static_features: Static features.
      locations: Locations to be extracted.
      training_window_size: Time-series data points to use for training.

    Returns:
      The time series dictionary mapping features to values where the values are
        a mapping of locations to time series values, and the fitted scalers.
    """
        all_dates = preprocessing.get_all_valid_dates(ts_data)

        ts_features = preprocessing.ts_feature_df_to_nested_dict(
            ts_data,
            locations,
            all_dates,
            self.get_ts_features(),
            self._LOCATION_COLUMN_NAME,
        )
        proc_features, feature_scalers = self.transform_ts_features(
            ts_features=ts_features,
            static_features=static_features,
            initial_train_window_size=training_window_size)
        return proc_features, feature_scalers
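# A minimal, self-contained sketch (not part of the library) of the nested
# structure that ts_feature_df_to_nested_dict is expected to produce above:
# each feature name maps to a dict of locations, and each location maps to a
# float32 array with one entry per valid date. The feature and location names
# here are purely illustrative.
import numpy as np

num_dates = 5
ts_features_sketch = {
    "mobility": {
        "US-NY": np.zeros(num_dates, dtype="float32"),
        "US-CA": np.zeros(num_dates, dtype="float32"),
    },
}
# transform_ts_features then processes each per-location array (presumably
# fitting the scalers on the first `training_window_size` points, given the
# `initial_train_window_size` argument) and returns the processed features
# together with the fitted scalers.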
Example #2
def extract_ts_overrides(ts_data, locations, ts_categorical_features):
    """Extract time-series overrides.

  Args:
    ts_data: A dataframe that contains all time-series overrides.
    locations: A list of locations for which the features are needed.
    ts_categorical_features: A list of names of categorical features.

  Returns:
    A mapping from the feature name to its value, the value of each feature
    is map from location to np.ndarray.
  """

    all_dates = preprocessing.get_all_valid_dates(ts_data)
    all_feature_names = ts_data["feature_name"].unique().tolist()
    ts_features = collections.defaultdict(
        functools.partial(
            collections.defaultdict,
            functools.partial(np.zeros,
                              shape=(len(all_dates)),
                              dtype="float32")))

    # 3-level dictionary with defaults, keyed by feature_name, location, and
    # date_index.
    # pylint: disable=g-long-lambda
    dct = collections.defaultdict(lambda: collections.defaultdict(
        lambda: collections.defaultdict(lambda: 1.0)))
    # Default value for ts categorical overrides must be -1.0 to be a no-op.
    for feature_name in ts_categorical_features:
        for location in locations:
            for date_index, _ in enumerate(all_dates):
                dct[feature_name][location][date_index] = -1.0
    dt_index = {
        pd.Timestamp(dt).to_pydatetime(): idx
        for idx, dt in enumerate(all_dates)
    }
    for _, row in ts_data.iterrows():
        if row[constants.DATE_COLUMN] not in dt_index:
            continue
        dct[row[constants.FEATURE_NAME_COLUMN]][row[constants.GEO_ID_COLUMN]][
            dt_index[row[constants.DATE_COLUMN]]] = row[
                constants.FEATURE_VALUE_COLUMN]

    for feature_key in all_feature_names:
        for location in locations:
            for date_index, _ in enumerate(all_dates):
                ts_features[feature_key][location][date_index] = dct[
                    feature_key][location][date_index]

            key_list = list(ts_features.keys())
            if key_list:
                # pylint: disable=protected-access
                # noinspection PyProtectedMember
                preprocessing._assert_feature_lengths_for_location(
                    ts_features, location, reference_key=key_list[0])
                # pylint: enable=protected-access

    return ts_features
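# A self-contained sketch (toy data, not from the library) of the override
# pattern used above: a three-level defaultdict keyed by
# feature_name -> location -> date_index holds a no-op default, gets
# overwritten from the override rows, and is then densified into per-location
# float32 arrays. The column, feature, and location names are illustrative
# stand-ins for the constants used above.
import collections

import numpy as np
import pandas as pd

all_dates = list(pd.date_range("2020-03-01", periods=3))
dt_index = {
    pd.Timestamp(dt).to_pydatetime(): idx for idx, dt in enumerate(all_dates)
}

# Non-categorical overrides default to 1.0 so that missing entries are no-ops.
dct = collections.defaultdict(lambda: collections.defaultdict(
    lambda: collections.defaultdict(lambda: 1.0)))

overrides = pd.DataFrame({
    "feature_name": ["mobility"],
    "geo_id": ["06001"],
    "dt": [pd.Timestamp("2020-03-02")],
    "value": [0.5],
})
for _, row in overrides.iterrows():
    if row["dt"].to_pydatetime() not in dt_index:
        continue
    dct[row["feature_name"]][row["geo_id"]][
        dt_index[row["dt"].to_pydatetime()]] = row["value"]

ts_features_sketch = {
    "mobility": {
        "06001": np.array(
            [dct["mobility"]["06001"][i] for i in range(len(all_dates))],
            dtype="float32"),
    },
}
# ts_features_sketch["mobility"]["06001"] -> array([1., 0.5, 1.], dtype=float32)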
Example #3
    def extract_all_features(
        self,
        static_data,
        ts_data,
        locations,
        training_window_size,
    ):
        """Extracts all static and time-series features for the given locations.

        When generic_seir_specs_county.INCLUDE_AGE_AS_TIME_COVARIATE is set,
        the county population age-range static features are also converted
        into constant-valued time-series covariates.
        """
        (static_features, static_scalers), (ts_features, ts_scalers) = (
            super().extract_all_features(static_data, ts_data, locations,
                                         training_window_size))
        if generic_seir_specs_county.INCLUDE_AGE_AS_TIME_COVARIATE:
            if static_data is None or ts_data is None:
                raise ValueError(
                    "Both static and time series data must be provided.")
            all_dates = preprocessing.get_all_valid_dates(ts_data)
            preprocessing.convert_static_features_to_constant_ts(
                static_features, static_scalers, ts_features, ts_scalers,
                constants.COUNTY_POP_AGE_RANGES.keys(), len(all_dates))
        return (static_features, static_scalers), (ts_features, ts_scalers)
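# A rough sketch of the idea behind convert_static_features_to_constant_ts as
# used above, under the assumption that each listed static value is tiled into
# a constant time series of length len(all_dates); the real helper lives in
# the project's preprocessing module and also handles the scalers. Feature and
# location names are illustrative only.
import numpy as np

num_dates = 4
static_features_sketch = {"county_pop_age_0_9": {"06001": 120000.0}}
ts_features_sketch = {}

for feature_name, per_location in static_features_sketch.items():
    ts_features_sketch[feature_name] = {
        location: np.full(num_dates, value, dtype="float32")
        for location, value in per_location.items()
    }
# ts_features_sketch["county_pop_age_0_9"]["06001"] is now a constant series
# of length num_dates.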