def transform(self, X):
    """Returns a copy of ``X`` in which outliers are replaced by NaN.

    A value counts as an outlier when its absolute deviation from the
    column mean is strictly greater than ``self.z_cutoff`` times the
    column standard deviation. The mean and standard deviation come from
    ``self.mean`` / ``self.std`` when ``self.use_fit_baseline`` is truthy,
    otherwise they are computed from ``X`` itself.

    Parameters
    ----------
    X : `pandas.DataFrame`
        Data to transform, e.g. each column is a timeseries.
        Columns are expected to be numeric.

    Returns
    -------
    X_outlier : `pandas.DataFrame`
        A copy of ``X`` with outliers masked to NaN. If ``self.z_cutoff``
        is None, a plain copy of ``X``.

    Raises
    ------
    NotFittedError
        If ``self.use_fit_baseline`` is truthy but ``self._is_fitted`` is None.
    """
    assert isinstance(X, pd.DataFrame)

    # Outlier detection is disabled: hand back an untouched copy.
    if self.z_cutoff is None:
        return X.copy()

    if self.use_fit_baseline:
        if self._is_fitted is None:
            raise NotFittedError(
                "This instance is not fitted yet. Call 'fit' with appropriate arguments "
                "before calling 'transform'.")
        center = self.mean
        spread = self.std
    else:
        center = X.mean()
        spread = X.std()

    cleaned = X.copy()
    is_outlier = np.abs(X - center) > spread * self.z_cutoff
    if np.any(is_outlier):
        # Summing twice collapses rows and then columns into a single count.
        n_outliers = is_outlier.sum().sum()
        log_message(f"Detected {n_outliers} outlier(s).", LoggingLevelEnum.INFO)
        cleaned = cleaned.mask(is_outlier)
    return cleaned
def summary(self):
    """Logs a human readable description of the model configuration.

    The description is meant to cover details that cannot be recovered
    from the forecast alone. Child classes should extend this method to
    also report their trained model parameters.

    The instance itself is passed as the log message, at DEBUG level.
    """
    # The instance is the message (intended to show the model input parameters).
    log_message(message=self, level=LoggingLevelEnum.DEBUG)
def summary(self):
    """Logs the input parameters and the Prophet model parameters.

    First delegates to the parent ``summary()``. Then, if ``self.model``
    is set, logs a pretty-formatted rendering of ``self.model.params``
    at INFO level.

    Returns
    -------
    None
        The information is emitted through ``log_message``; nothing is returned.
    """
    # Function-scope import: only `pprint` is known to be imported by the file.
    from pprint import pformat

    super().summary()
    if self.model is not None:
        # `pprint()` writes to stdout and returns None, which would log "None".
        # `pformat()` returns the formatted text so it reaches the logger.
        log_message(pformat(self.model.params), LoggingLevelEnum.INFO)
def transform(self, X):
    """Fills in missing values of the input time series.

    Records the fraction of nulls per column in ``self.null_frac``, warns
    if any column exceeds ``self.max_frac``, and logs an INFO message if
    any nulls are present. Then imputes according to
    ``self.impute_algorithm``:

        - ``"interpolate"`` : `pandas.DataFrame.interpolate` with ``self.impute_params``
        - ``"ts_interpolate"`` : ``impute_with_lags_multi`` with ``self.impute_params``;
          also stores its ``"missing_info"`` in ``self.missing_info``
        - None : no imputation

    If ``self.impute_all`` is truthy (and an algorithm is set), a second
    interpolation pass with ``DEFAULT_PARAMS["interpolate"]`` is applied.

    Parameters
    ----------
    X : `pandas.DataFrame`
        Data to transform, e.g. each column is a timeseries.
        Columns are expected to be numeric.

    Returns
    -------
    X_imputed : `pandas.DataFrame`
        A copy of the data frame with original values and missing values imputed.

    Raises
    ------
    NotFittedError
        If ``self._is_fitted`` is None.
    ValueError
        If ``self.impute_algorithm`` is not None and not a recognized name.
    """
    if self._is_fitted is None:
        raise NotFittedError(
            "This instance is not fitted yet. Call 'fit' with appropriate arguments "
            "before calling 'transform'.")
    assert isinstance(X, pd.DataFrame)

    # Fraction of NaNs in each column.
    self.null_frac = X.isna().mean()
    if np.any(self.null_frac > self.max_frac):
        warnings.warn(
            f"Input data has many null values. Missing {self.null_frac.max():.2%} of one input.",
            RuntimeWarning)
    if any(self.null_frac > 0.0):
        log_message(
            f"Missing data detected: {self.null_frac.mean():.2%} of all input values "
            f"are null. (If future external regressor(s) are used, some missing values in "
            f"`value_col` are expected.)",
            LoggingLevelEnum.INFO)

    algorithm = self.impute_algorithm
    if algorithm is None:
        # Imputation is switched off: return an unmodified copy.
        return X.copy()

    if algorithm == "interpolate":
        X_imputed = X.interpolate(**self.impute_params)
    elif algorithm == "ts_interpolate":
        lag_result = impute_with_lags_multi(df=X, **self.impute_params)
        X_imputed = lag_result["df"]
        self.missing_info = lag_result["missing_info"]
    else:
        raise ValueError(f"`impute_algorithm` '{self.impute_algorithm}' is not recognized."
                         f"Must be one of 'ts_interpolate', 'interpolate'")

    if self.impute_all:
        # Second pass, intended to remove any NaNs left by the first one.
        X_imputed = X_imputed.interpolate(**DEFAULT_PARAMS["interpolate"])
    return X_imputed
def fill_missing_dates(df, time_col=TIME_COL, freq=None):
    """Inserts rows for timestamps missing from ``df[time_col]``.

    Builds the complete date range between the smallest and largest
    timestamp at frequency ``freq`` and left-joins ``df`` onto it.

    Warning: if ``freq`` doesn't match the intended frequency, rows whose
    timestamps do not fall on the generated range are removed.

    Parameters
    ----------
    df : `pandas.DataFrame`
        Dataframe with column ``time_col``.
    time_col : `str`
        Time column name, default TIME_COL.
    freq : `str` or None, default None
        Timeseries frequency, DateOffset alias.
        If None, inferred with `pandas.infer_freq`.

    Returns
    -------
    full_df : `pandas.DataFrame`
        ``df`` with rows added for missing timestamps.
    added_timepoints : `int`
        The number of distinct timestamps added to ``df``.
    dropped_timepoints : `int`
        The number of distinct timestamps removed from ``df``.
        If the timestamps in ``df`` are not evenly spaced,
        irregular timestamps may be removed.
    """
    if freq is None:
        freq = pd.infer_freq(df[time_col])

    df = df.reset_index(drop=True)
    timestamps = df[time_col]
    full_range = pd.date_range(
        start=min(timestamps),
        end=max(timestamps),
        freq=freq)
    complete_dates = pd.DataFrame({time_col: full_range})
    full_df = complete_dates.merge(df, how="left", on=time_col)

    # Compares the distinct timestamps before and after filling.
    before = set(timestamps.values)
    after = set(full_df[time_col].values)
    added_timepoints = len(after - before)
    dropped_timepoints = len(before - after)

    if added_timepoints > 0:
        log_message(
            f"Added {added_timepoints} missing dates. There were {len(before)} values originally.",
            LoggingLevelEnum.INFO)
    if dropped_timepoints > 0:
        warnings.warn(
            f"Dropped {dropped_timepoints} dates when filling gaps in input data. Provide data frequency"
            f" and make sure data points are evenly spaced.")

    return full_df, added_timepoints, dropped_timepoints
def aggregate_array(ts_values, agg_periods=7, agg_func=np.sum):
    """Aggregates input array.

    Divides the array from left to right into bins of size ``agg_periods``
    and applies ``agg_func`` to each bin. Drops records from the left if
    needed to ensure all bins are full. If there are fewer than
    ``agg_periods`` records, nothing is dropped and all records form a
    single bin (a warning is issued).

    Parameters
    ----------
    ts_values : `list`, `numpy.array`, or `pandas.Series`
        Values to aggregate.
    agg_periods : `int`, default 7
        Number of periods to combine in aggregation.
    agg_func : callable, default ``np.sum``
        Aggregation function, e.g. np.max, np.sum.
        Must take an array and return a number.
        It is called with a `pandas.Series` holding one bin.

    Returns
    -------
    aggregated_array : `numpy.array`
        Array aggregated so that every ``agg_periods`` periods are
        combined into one.

    Examples
    --------
    >>> aggregate_array([1.0, 2.0, 3.0, 4.0], 2, np.sum)
    array([3., 7.])
    >>> aggregate_array(pd.Series([1.0, 2.0, 3.0, 4.0, 5.0]), 2, np.sum)
    array([5., 9.])
    >>> aggregate_array(np.array([1.0, 2.0, 3.0, 4.0, 5.0]), 2, np.max)
    array([3., 5.])
    """
    ts_values = np.array(ts_values)
    n_periods = len(ts_values)
    # Drop these periods from the front, to ensure all bins are full.
    drop_first_periods = n_periods % agg_periods
    if drop_first_periods == n_periods:
        # Fewer records than one bin: keep everything as a single bin.
        drop_first_periods = 0
        warnings.warn(
            f"Requested agg_periods={agg_periods}, but there are only {n_periods}. Using all for aggregation"
        )
    elif drop_first_periods > 0:
        log_message(
            f"Requested agg_periods={agg_periods} for data of length {n_periods}. Dropping first"
            f" {drop_first_periods} records before aggregation",
            LoggingLevelEnum.INFO)

    # Bins are labelled by integer position rather than by a dummy daily
    # DatetimeIndex: a daily index starting in 2018 runs past the largest
    # representable `pandas.Timestamp` (year 2262) once there are more than
    # roughly 89,000 records, which raised `OutOfBoundsDatetime`.
    kept = pd.Series(ts_values[drop_first_periods:])
    bin_ids = np.arange(len(kept)) // agg_periods
    # The lambda keeps pandas from substituting its own implementation
    # for known functions such as `np.sum`.
    aggregated_array = kept.groupby(bin_ids).agg(lambda x: agg_func(x)).values
    return aggregated_array
def predict(self, X, y=None):
    """Serves cached predictions for ``X`` when available.

    To enable caching, every subclass must call this at the beginning
    of its ``.predict()``. Before returning its result, the subclass
    ``.predict()`` must set ``self.cached_predictions_`` to the return value.

    Parameters
    ----------
    X : `pandas.DataFrame`
        Input timeseries with timestamp column and any additional regressors.
        Timestamps are the dates for prediction.
        Value column, if provided in X, is ignored.
    y : ignored

    Returns
    -------
    predictions : `pandas.DataFrame` or None
        ``self.cached_predictions_`` if it is set and ``X`` equals the
        previously predicted input; otherwise None, after recording ``X``
        in ``self.last_predicted_X_``.

        The cached frame holds the forecasted values for the dates in X. Columns:

            - TIME_COL dates
            - PREDICTED_COL predictions
            - PREDICTED_LOWER_COL lower bound of predictions, optional
            - PREDICTED_UPPER_COL upper bound of predictions, optional
            - [other columns], optional

        ``PREDICTED_LOWER_COL`` and ``PREDICTED_UPPER_COL`` are present
        if ``self.coverage`` is not None.
    """
    cache_hit = (
        self.cached_predictions_ is not None
        and X.equals(self.last_predicted_X_))
    if not cache_hit:
        # Remembers the new input. The subclass is responsible for
        # storing its result in `self.cached_predictions_`.
        self.last_predicted_X_ = X
        return None

    log_message("Returning cached predictions.", LoggingLevelEnum.DEBUG)
    return self.cached_predictions_
def get_hyperparameter_searcher(
        hyperparameter_grid,
        model,
        cv=None,
        hyperparameter_budget=None,
        n_jobs=1,
        verbose=1,
        **kwargs) -> RandomizedSearchCV:
    """Builds the RandomizedSearchCV object used for hyperparameter tuning via cross validation.

    `sklearn.model_selection.RandomizedSearchCV` runs a full grid search if
    ``hyperparameter_budget`` is sufficient to exhaust the full
    ``hyperparameter_grid``, otherwise it samples uniformly at random from the space.

    Parameters
    ----------
    hyperparameter_grid : `dict` or `list` [`dict`]
        Dictionary with parameters names (string) as keys and distributions
        or lists of parameters to try. Distributions must provide a ``rvs``
        method for sampling (such as those from scipy.stats.distributions).
        Lists of parameters are sampled uniformly.

        May also be a list of such dictionaries to avoid undesired combinations of parameters.
        Passed as ``param_distributions`` to `sklearn.model_selection.RandomizedSearchCV`,
        see docs for more info.
    model : estimator object
        An object of that type is instantiated for each grid point. This is assumed to implement
        the scikit-learn estimator interface.
    cv : `int`, cross-validation generator, iterable, or None, default None
        Determines the cross-validation splitting strategy.
        See `sklearn.model_selection.RandomizedSearchCV`.
    hyperparameter_budget : `int` or None, default None
        Max number of hyperparameter sets to try within the hyperparameter_grid search space.
        If None, uses defaults:

            * exhaustive grid search if all values are constant
            * 10 if any value is a distribution to sample from

    n_jobs : `int` or None, default 1
        Number of jobs to run in parallel
        (the maximum number of concurrently running workers).
        ``-1`` uses all CPUs. ``-2`` uses all CPUs but one.
        ``None`` is treated as 1 unless in a `joblib.Parallel` backend context
        that specifies otherwise.
    verbose : `int`, default 1
        Verbosity level during CV.

            * if > 0, prints number of fits
            * if > 1, prints fit parameters, total score + fit time
            * if > 2, prints train/test scores

    kwargs : additional parameters
        Keyword arguments to pass to
        `~greykite.framework.pipeline.utils.get_scoring_and_refit`.
        Accepts the following parameters:

            - ``"score_func"``
            - ``"score_func_greater_is_better"``
            - ``"cv_report_metrics"``
            - ``"agg_periods"``
            - ``"agg_func"``
            - ``"relative_error_tolerance"``

    Returns
    -------
    grid_search : `sklearn.model_selection.RandomizedSearchCV`
        Object that can run randomized search on hyper parameters.
    """
    if hyperparameter_budget is None:
        # No budget given: choose a reasonable default.
        try:
            # Explicit value lists can be counted, so the search is exhaustive.
            hyperparameter_budget = len(ParameterGrid(hyperparameter_grid))
            log_message(
                f"Setting hyperparameter_budget to {hyperparameter_budget} for full grid search.",
                LoggingLevelEnum.DEBUG)
        except TypeError:
            # A parameter value is not iterable (a distribution): sample 10 sets.
            hyperparameter_budget = 10
            log_message(
                f"Setting hyperparameter_budget to {hyperparameter_budget} to sample from"
                f" provided distributions (and lists).",
                LoggingLevelEnum.WARNING)

    scoring, refit = get_scoring_and_refit(**kwargs)

    # Note: RandomizedSearchCV operates like GridSearchCV when
    # `hyperparameter_grid` contains no distributions.
    search_settings = {
        "estimator": model,
        # A fixed list or distribution to sample from.
        "param_distributions": hyperparameter_grid,
        # Samples uniformly, up to `hyperparameter_budget`.
        "n_iter": hyperparameter_budget,
        # Model evaluation criteria (if None, uses the score function of the estimator).
        "scoring": scoring,
        "n_jobs": n_jobs,
        # Selects the best model.
        "refit": refit,
        "cv": cv,
        "verbose": verbose,
        # Controls memory consumption.
        "pre_dispatch": "2*n_jobs",
        # NB: could be False for speedup.
        "return_train_score": True,
    }
    return RandomizedSearchCV(**search_settings)
def _get_estimators(self):
    """Initializes the estimators for forecast one-by-one.

    If the given parameters indicate that multiple estimators are needed for
    the forecast one-by-one algorithm, one estimator per entry of the
    estimator map is created, each with its own forecast horizon.
    Otherwise a single estimator covering the full horizon is created.

    Sets ``self.estimator_class``, ``self.estimator_params``,
    ``self.estimators`` and ``self.estimator_map_list``.
    ``self.pred_indices`` is set only when forecast one-by-one is activated.

    Raises
    ------
    ValueError
        If ``self.estimator`` is not in ``ONEBYONE_ESTIMATORS``, or if a
        list-like ``self.estimator_map`` does not sum to ``self.forecast_horizon``.
    """
    # Only estimators in ``ONEBYONE_ESTIMATORS`` support forecast one-by-one.
    if self.estimator not in ONEBYONE_ESTIMATORS:
        raise ValueError(
            f"Estimator {self.estimator} does not support forecast"
            f" one-by-one.")
    spec = ONEBYONE_ESTIMATORS[self.estimator]
    self.estimator_class = spec["class"]

    if self.estimator_params is None:
        self.estimator_params = {}
    params = self.estimator_params
    # Fills in the estimator base parameters that were not provided,
    # so the prediction confidence intervals can be pulled.
    for base_param in ("score_func", "coverage", "null_model_params"):
        if base_param not in params:
            params[base_param] = getattr(self, base_param)

    # Checks if any provided parameter takes a value that depends on forecast horizon.
    horizon_dependent_values = spec["params_depending_on_horizon"]
    depending_on_horizon = False
    if horizon_dependent_values is not None:
        # Evaluated for every provided parameter (no short-circuit).
        hits = [
            params.get(name) in values
            for name, values in horizon_dependent_values.items()
            if name in params
        ]
        depending_on_horizon = any(hits)
    if not depending_on_horizon:
        log_message(
            message="No parameters depending on forecast horizon found. "
                    "Forecast one-by-one is not activated.",
            level=LoggingLevelEnum.INFO)

    # If forecast horizon is itself an estimator parameter, it is removed here,
    # because it must differ between estimators. It is added back below.
    forecast_horizon_param = spec["forecast_horizon_param"]
    if depending_on_horizon:
        params.pop(forecast_horizon_param, None)

    if not (depending_on_horizon and self.estimator_map is not False):
        # Single estimator for the whole horizon.
        self.estimator_map_list = [self.forecast_horizon]
        if forecast_horizon_param is not None:
            params[forecast_horizon_param] = self.forecast_horizon
        self.estimators = [deepcopy(self.estimator_class(**params))]
        return

    # Resolves the estimator map into a list of segment lengths.
    if self.estimator_map is None or self.estimator_map is True:
        self.estimator_map_list = [1] * self.forecast_horizon
    elif isinstance(self.estimator_map, int):
        full_segments, remainder = divmod(self.forecast_horizon, self.estimator_map)
        segments = [self.estimator_map] * full_segments
        if remainder:
            # The last segment covers whatever is left of the horizon.
            segments.append(remainder)
        self.estimator_map_list = segments
    else:
        if sum(self.estimator_map) != self.forecast_horizon:
            raise ValueError(
                "Sum of forecast one by one estimator map must equal to forecast horizon."
            )
        self.estimator_map_list = deepcopy(self.estimator_map)

    # One estimator per segment; its horizon is the cumulative end of the segment.
    self.estimators = []
    self.pred_indices = [0]
    segment_end = 0
    for segment_length in self.estimator_map_list:
        segment_end += segment_length
        params[forecast_horizon_param] = segment_end
        self.estimators.append(deepcopy(self.estimator_class(**params)))
        self.pred_indices.append(segment_end)
def __get_template_class(
        self,
        config: Optional[ForecastConfig] = None) -> Type[TemplateInterface]:
    """Determines the template class (e.g. `SimpleSilverkiteTemplate`) for the config.

    Parameters
    ----------
    config : :class:`~greykite.framework.templates.model_templates.ForecastConfig` or None
        Config object for template class to use.
        See :class:`~greykite.framework.templates.model_templates.ForecastConfig`.

    Returns
    -------
    template_class : Type[`~greykite.framework.templates.template_interface.TemplateInterface`]
        An implementation of `~greykite.framework.templates.template_interface.TemplateInterface`.

    Raises
    ------
    ValueError
        If a list of model templates maps to more than one template class,
        if the template class does not allow a list of model templates or of
        model components, or if the model template is not recognized.
    """
    config = self.__get_config_with_default_model_template_and_components(config)
    model_template = config.model_template

    if isinstance(model_template, list):
        # Resolves each entry on its own; all must agree on one template class.
        template_classes = [
            self.__get_template_class(config=ForecastConfig(model_template=mt))
            for mt in model_template
        ]
        first_class = template_classes[0]
        if any(tc != first_class for tc in template_classes):
            raise ValueError(
                "All model templates must use the same template class. "
                f"Found {template_classes}")
        template_class = first_class
        if not template_class().allow_model_template_list:
            raise ValueError(
                f"The template class {template_class} does not allow `model_template` to be a list. "
                f"Pass a string instead.")
    else:
        # Handles other situations (string, data class).
        try:
            # Direct lookup in `self.model_template_enum`.
            template_class = self.model_template_enum[model_template].value.template_class
        except (KeyError, TypeError):
            # Template is not found in the enum.
            # NB: this fallback is written for the default `self.model_template_enum`,
            # which contains only one template class that is a subclass of
            # `SimpleSilverkiteTemplate`. If a custom `self.model_template_enum`
            # is provided it may be useful to override this logic.
            valid_names = ", ".join(self.model_template_enum.__dict__["_member_names_"])
            # Enum members whose template class supports generic naming,
            # i.e. is a subclass of `SimpleSilverkiteTemplate`.
            silverkite_members = [
                member for member in self.model_template_enum
                if issubclass(member.value.template_class, SimpleSilverkiteTemplate)
            ]
            if not silverkite_members:
                raise ValueError(
                    f"Model Template '{config.model_template}' is not recognized! Must be one of: {valid_names}."
                )
            try:
                log_message(
                    f"Model template {config.model_template} is not found in the template enum. "
                    f"Checking if model template is suitable for `SimpleSilverkiteTemplate`.",
                    LoggingLevelEnum.DEBUG)
                SimpleSilverkiteTemplate().check_template_type(model_template)
                possible_template_classes = unique_elements_in_list(
                    [member.value.template_class for member in silverkite_members])
                if len(possible_template_classes) > 1:
                    log_message(
                        f"Multiple template classes could be used for the model "
                        f"template {config.model_template}: {possible_template_classes}",
                        LoggingLevelEnum.DEBUG)
                # Arbitrarily takes a class that supports generic naming.
                template_class = silverkite_members[0].value.template_class
                log_message(
                    f"Using template class {template_class} for the model "
                    f"template {config.model_template}",
                    LoggingLevelEnum.DEBUG)
            except ValueError:
                raise ValueError(
                    f"Model Template '{config.model_template}' is not recognized! Must be one of: {valid_names}"
                    " or satisfy the `SimpleSilverkiteTemplate` rules."
                )

    # Validates `model_components_param` compatibility with the template.
    if (not template_class().allow_model_components_param_list
            and isinstance(config.model_components_param, list)):
        raise ValueError(
            f"Model template {config.model_template} does not support a list of `ModelComponentsParam`."
        )
    return template_class
def pipeline_wrapper(
        # The arguments to this wrapper must be identical to forecast_pipeline() function.
        # **kwargs is not used, because it's easier to check parameters directly.
        # input
        df: pd.DataFrame,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        date_format=None,
        tz=None,
        freq=None,
        train_end_date=None,
        anomaly_info=None,
        # model
        pipeline=None,
        regressor_cols=None,
        lagged_regressor_cols=None,
        estimator=SimpleSilverkiteEstimator(),
        hyperparameter_grid=None,
        hyperparameter_budget=None,
        n_jobs=COMPUTATION_N_JOBS,
        verbose=1,
        # forecast
        forecast_horizon=None,
        coverage=0.95,
        test_horizon=None,
        periods_between_train_test=None,
        agg_periods=None,
        agg_func=None,
        # evaluation
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
        score_func_greater_is_better=False,
        cv_report_metrics=None,
        null_model_params=None,
        relative_error_tolerance=None,
        # CV
        cv_horizon=None,
        cv_min_train_periods=None,
        cv_expanding_window=False,
        cv_use_most_recent_splits=False,
        cv_periods_between_splits=None,
        cv_periods_between_train_test=0,
        cv_max_splits=3):
    """Validates the inputs, fills in default time parameters, then calls ``pipeline_function``.

    Steps:

        1. Checks ``coverage`` and ``relative_error_tolerance`` ranges.
        2. Replaces the forecast/test/CV horizon parameters by the values
           returned from ``get_default_time_parameters``.
        3. Converts ``hyperparameter_budget`` with ``get_integer`` (minimum 1) if provided.
        4. Raises if both CV and backtest are disabled; warns on questionable
           horizon sizes relative to the number of observations.
        5. Logs the resulting horizons and forwards everything to ``pipeline_function``.

    The parameters mirror those of ``forecast_pipeline()``.

    Returns
    -------
    The return value of ``pipeline_function``.

    Raises
    ------
    ValueError
        If ``coverage`` is outside [0, 1], ``relative_error_tolerance`` is negative,
        both CV and backtest are disabled, or ``test_horizon`` exceeds the
        number of observations.
    """
    if coverage is not None and (coverage < 0 or coverage > 1):
        raise ValueError(f"coverage must be between 0 and 1, found {coverage}")
    if relative_error_tolerance is not None and relative_error_tolerance < 0:
        raise ValueError(f"relative_error_tolerance must non-negative, found {relative_error_tolerance}")

    # Default values for forecast horizon, test, and cross-validation parameters.
    period = min_gap_in_seconds(df=df, time_col=time_col)
    num_observations = df.shape[0]
    time_defaults = get_default_time_parameters(
        period=period,
        num_observations=num_observations,
        forecast_horizon=forecast_horizon,
        test_horizon=test_horizon,
        periods_between_train_test=periods_between_train_test,
        cv_horizon=cv_horizon,
        cv_min_train_periods=cv_min_train_periods,
        cv_periods_between_train_test=cv_periods_between_train_test)
    (forecast_horizon,
     test_horizon,
     periods_between_train_test,
     cv_horizon,
     cv_min_train_periods,
     cv_periods_between_train_test) = [
        time_defaults.get(key) for key in (
            "forecast_horizon",
            "test_horizon",
            "periods_between_train_test",
            "cv_horizon",
            "cv_min_train_periods",
            "cv_periods_between_train_test")
    ]

    # Ensures the budget is an integer in the proper domain.
    if hyperparameter_budget is not None:
        hyperparameter_budget = get_integer(
            hyperparameter_budget,
            "hyperparameter_budget",
            min_value=1)

    cv_disabled = cv_horizon == 0 or cv_max_splits == 0
    if cv_disabled and test_horizon == 0:
        raise ValueError("Either CV or backtest must be enabled."
                         " Set cv_horizon and cv_max_splits to nonzero values to enable CV."
                         " Set test_horizon to nonzero value to enable backtest."
                         " It's important to check model"
                         " performance on historical data.")

    if test_horizon == 0:
        warnings.warn("No data selected for test (test_horizon=0). "
                      "It is important to check out of sample performance")

    # Checks horizon against data size.
    if num_observations < forecast_horizon * 2:
        warnings.warn("Not enough training data to forecast the full forecast_horizon."
                      " Exercise extra caution with"
                      f" forecasted values after {num_observations // 2} periods.")

    if test_horizon > num_observations:
        raise ValueError(f"test_horizon ({test_horizon}) is too large."
                         " Must be less than the number "
                         f"of input data points: {num_observations})")

    if test_horizon > forecast_horizon:
        warnings.warn("test_horizon should never be larger than forecast_horizon.")

    one_third = num_observations // 3
    if test_horizon > one_third:
        warnings.warn("test_horizon should be <= than 1/3 of the data set size to allow enough data to train"
                      f" a backtest model. Consider reducing to {one_third}. If this is smaller"
                      " than the forecast_horizon, you will need to make a trade-off between setting"
                      " test_horizon=forecast_horizon and having enough data left over to properly"
                      " train a realistic backtest model.")

    log_message(f"forecast_horizon: {forecast_horizon}", LoggingLevelEnum.INFO)
    log_message(f"test_horizon: {test_horizon}", LoggingLevelEnum.INFO)
    log_message(f"cv_horizon: {cv_horizon}", LoggingLevelEnum.INFO)

    # Every argument except `df` is forwarded by keyword.
    forwarded = dict(
        time_col=time_col,
        value_col=value_col,
        date_format=date_format,
        tz=tz,
        freq=freq,
        train_end_date=train_end_date,
        anomaly_info=anomaly_info,
        pipeline=pipeline,
        regressor_cols=regressor_cols,
        lagged_regressor_cols=lagged_regressor_cols,
        estimator=estimator,
        hyperparameter_grid=hyperparameter_grid,
        hyperparameter_budget=hyperparameter_budget,
        n_jobs=n_jobs,
        verbose=verbose,
        forecast_horizon=forecast_horizon,
        coverage=coverage,
        test_horizon=test_horizon,
        periods_between_train_test=periods_between_train_test,
        agg_periods=agg_periods,
        agg_func=agg_func,
        score_func=score_func,
        score_func_greater_is_better=score_func_greater_is_better,
        cv_report_metrics=cv_report_metrics,
        null_model_params=null_model_params,
        relative_error_tolerance=relative_error_tolerance,
        cv_horizon=cv_horizon,
        cv_min_train_periods=cv_min_train_periods,
        cv_expanding_window=cv_expanding_window,
        cv_use_most_recent_splits=cv_use_most_recent_splits,
        cv_periods_between_splits=cv_periods_between_splits,
        cv_periods_between_train_test=cv_periods_between_train_test,
        cv_max_splits=cv_max_splits,
    )
    return pipeline_function(df, **forwarded)
def forecast_pipeline( # input df: pd.DataFrame, time_col=TIME_COL, value_col=VALUE_COL, date_format=None, tz=None, freq=None, train_end_date=None, anomaly_info=None, # model pipeline=None, regressor_cols=None, lagged_regressor_cols=None, estimator=SimpleSilverkiteEstimator(), hyperparameter_grid=None, hyperparameter_budget=None, n_jobs=COMPUTATION_N_JOBS, verbose=1, # forecast forecast_horizon=None, coverage=0.95, test_horizon=None, periods_between_train_test=None, agg_periods=None, agg_func=None, # evaluation score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name, score_func_greater_is_better=False, cv_report_metrics=CV_REPORT_METRICS_ALL, null_model_params=None, relative_error_tolerance=None, # CV cv_horizon=None, cv_min_train_periods=None, cv_expanding_window=False, cv_use_most_recent_splits=False, cv_periods_between_splits=None, cv_periods_between_train_test=None, cv_max_splits=3): """Computation pipeline for end-to-end forecasting. Trains a forecast model end-to-end: 1. checks input data 2. runs cross-validation to select optimal hyperparameters e.g. best model 3. evaluates best model on test set 4. provides forecast of best model (re-trained on all data) into the future Returns forecasts with methods to plot and see diagnostics. Also returns the fitted pipeline and CV results. Provides a high degree of customization over training and evaluation parameters: 1. model 2. cross validation 3. evaluation 4. forecast horizon See test cases for examples. Parameters ---------- df : `pandas.DataFrame` Timeseries data to forecast. Contains columns [`time_col`, `value_col`], and optional regressor columns Regressor columns should include future values for prediction time_col : `str`, default TIME_COL in constants.py name of timestamp column in df value_col : `str`, default VALUE_COL in constants.py name of value column in df (the values to forecast) date_format : `str` or None, default None strftime format to parse time column, eg ``%m/%d/%Y``. 
Note that ``%f`` will parse all the way up to nanoseconds. If None (recommended), inferred by `pandas.to_datetime`. tz : `str` or None, default None Passed to `pandas.tz_localize` to localize the timestamp freq : `str` or None, default None Frequency of input data. Used to generate future dates for prediction. Frequency strings can have multiples, e.g. '5H'. See https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases for a list of frequency aliases. If None, inferred by `pandas.infer_freq`. Provide this parameter if ``df`` has missing timepoints. train_end_date : `datetime.datetime`, optional, default None Last date to use for fitting the model. Forecasts are generated after this date. If None, it is set to the last date with a non-null value in ``value_col`` of ``df``. anomaly_info : `dict` or `list` [`dict`] or None, default None Anomaly adjustment info. Anomalies in ``df`` are corrected before any forecasting is done. If None, no adjustments are made. A dictionary containing the parameters to `~greykite.common.features.adjust_anomalous_data.adjust_anomalous_data`. See that function for details. The possible keys are: ``"value_col"`` : `str` The name of the column in ``df`` to adjust. You may adjust the value to forecast as well as any numeric regressors. ``"anomaly_df"`` : `pandas.DataFrame` Adjustments to correct the anomalies. ``"start_date_col"``: `str`, default START_DATE_COL Start date column in ``anomaly_df``. ``"end_date_col"``: `str`, default END_DATE_COL End date column in ``anomaly_df``. ``"adjustment_delta_col"``: `str` or None, default None Impact column in ``anomaly_df``. ``"filter_by_dict"``: `dict` or None, default None Used to filter ``anomaly_df`` to the relevant anomalies for the ``value_col`` in this dictionary. Key specifies the column name, value specifies the filter value. 
``"filter_by_value_col""``: `str` or None, default None Adds ``{filter_by_value_col: value_col}`` to ``filter_by_dict`` if not None, for the ``value_col`` in this dictionary. ``"adjustment_method"`` : `str` ("add" or "subtract"), default "add" How to make the adjustment, if ``adjustment_delta_col`` is provided. Accepts a list of such dictionaries to adjust multiple columns in ``df``. pipeline : `sklearn.pipeline.Pipeline` or None, default None Pipeline to fit. The final named step must be called "estimator". If None, will use the default Pipeline from `~greykite.framework.pipeline.utils.get_basic_pipeline`. regressor_cols : `list` [`str`] or None, default None A list of regressor columns used in the training and prediction DataFrames. It should contain only the regressors that are being used in the grid search. If None, no regressor columns are used. Regressor columns that are unavailable in ``df`` are dropped. lagged_regressor_cols : `list` [`str`] or None, default None A list of additional columns needed for lagged regressors in the training and prediction DataFrames. This list can have overlap with ``regressor_cols``. If None, no additional columns are added to the DataFrame. Lagged regressor columns that are unavailable in ``df`` are dropped. estimator : instance of an estimator that implements `greykite.algo.models.base_forecast_estimator.BaseForecastEstimator` Estimator to use as the final step in the pipeline. Ignored if ``pipeline`` is provided. forecast_horizon : `int` or None, default None Number of periods to forecast into the future. Must be > 0. If None, default is determined from input data frequency coverage : `float` or None, default=0.95 Intended coverage of the prediction bands (0.0 to 1.0) If None, the upper/lower predictions are not returned Ignored if `pipeline` is provided. Uses coverage of the ``pipeline`` estimator instead. test_horizon : `int` or None, default None Numbers of periods held back from end of df for test. 
The rest is used for cross validation. If None, default is forecast_horizon. Set to 0 to skip backtest. periods_between_train_test : `int` or None, default None Number of periods for the gap between train and test data. If None, default is 0. agg_periods : `int` or None, default None Number of periods to aggregate before evaluation. Model is fit and forecasted on the dataset's original frequency. Before evaluation, the actual and forecasted values are aggregated, using rolling windows of size ``agg_periods`` and the function ``agg_func``. (e.g. if the dataset is hourly, use ``agg_periods=24, agg_func=np.sum``, to evaluate performance on the daily totals). If None, does not aggregate before evaluation. Currently, this is only used when calculating CV metrics and the R2_null_model_score metric in backtest/forecast. No pre-aggregation is applied for the other backtest/forecast evaluation metrics. agg_func : callable or None, default None Takes an array and returns a number, e.g. np.max, np.sum. Defines how to aggregate rolling windows of actual and predicted values before evaluation. Ignored if ``agg_periods`` is None. Currently, this is only used when calculating CV metrics and the R2_null_model_score metric in backtest/forecast. No pre-aggregation is applied for the other backtest/forecast evaluation metrics. score_func : `str` or callable, default ``EvaluationMetricEnum.MeanAbsolutePercentError.name`` Score function used to select optimal model in CV. If a callable, takes arrays ``y_true``, ``y_pred`` and returns a float. If a string, must be either a `~greykite.common.evaluation.EvaluationMetricEnum` member name or `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE`. score_func_greater_is_better : `bool`, default False True if ``score_func`` is a score function, meaning higher is better, and False if it is a loss function, meaning lower is better. Must be provided if ``score_func`` is a callable (custom function). 
Ignored if ``score_func`` is a string, because the direction is known. cv_report_metrics : `str`, or `list` [`str`], or None, default `~greykite.common.constants.CV_REPORT_METRICS_ALL` Additional metrics to compute during CV, besides the one specified by ``score_func``. - If the string constant `greykite.framework.constants.CV_REPORT_METRICS_ALL`, computes all metrics in ``EvaluationMetricEnum``. Also computes ``FRACTION_OUTSIDE_TOLERANCE`` if ``relative_error_tolerance`` is not None. The results are reported by the short name (``.get_metric_name()``) for ``EvaluationMetricEnum`` members and ``FRACTION_OUTSIDE_TOLERANCE_NAME`` for ``FRACTION_OUTSIDE_TOLERANCE``. These names appear in the keys of ``forecast_result.grid_search.cv_results_`` returned by this function. - If a list of strings, each of the listed metrics is computed. Valid strings are `~greykite.common.evaluation.EvaluationMetricEnum` member names and `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE`. For example:: ["MeanSquaredError", "MeanAbsoluteError", "MeanAbsolutePercentError", "MedianAbsolutePercentError", "FractionOutsideTolerance2"] - If None, no additional metrics are computed. null_model_params : `dict` or None, default None Defines baseline model to compute ``R2_null_model_score`` evaluation metric. ``R2_null_model_score`` is the improvement in the loss function relative to a null model. It can be used to evaluate model quality with respect to a simple baseline. For details, see `~greykite.common.evaluation.r2_null_model_score`. The null model is a `~sklearn.dummy.DummyRegressor`, which returns constant predictions. Valid keys are "strategy", "constant", "quantile". See `~sklearn.dummy.DummyRegressor`. For example:: null_model_params = { "strategy": "mean", } null_model_params = { "strategy": "median", } null_model_params = { "strategy": "quantile", "quantile": 0.8, } null_model_params = { "strategy": "constant", "constant": 2.0, } If None, ``R2_null_model_score`` is not calculated. 
Note: CV model selection always optimizes ``score_func`, not the ``R2_null_model_score``. relative_error_tolerance : `float` or None, default None Threshold to compute the ``Outside Tolerance`` metric, defined as the fraction of forecasted values whose relative error is strictly greater than ``relative_error_tolerance``. For example, 0.05 allows for 5% relative error. If `None`, the metric is not computed. hyperparameter_grid : `dict`, `list` [`dict`] or None, default None Sets properties of the steps in the pipeline, and specifies combinations to search over. Should be valid input to `sklearn.model_selection.GridSearchCV` (param_grid) or `sklearn.model_selection.RandomizedSearchCV` (param_distributions). Prefix transform/estimator attributes by the name of the step in the pipeline. See details at: https://scikit-learn.org/stable/modules/compose.html#nested-parameters If None, uses the default pipeline parameters. hyperparameter_budget : `int` or None, default None Max number of hyperparameter sets to try within the ``hyperparameter_grid`` search space Runs a full grid search if ``hyperparameter_budget`` is sufficient to exhaust full ``hyperparameter_grid``, otherwise samples uniformly at random from the space. If None, uses defaults: * full grid search if all values are constant * 10 if any value is a distribution to sample from n_jobs : `int` or None, default `~greykite.framework.constants.COMPUTATION_N_JOBS` Number of jobs to run in parallel (the maximum number of concurrently running workers). ``-1`` uses all CPUs. ``-2`` uses all CPUs but one. ``None`` is treated as 1 unless in a `joblib.Parallel` backend context that specifies otherwise. verbose : `int`, default 1 Verbosity level during CV. if > 0, prints number of fits if > 1, prints fit parameters, total score + fit time if > 2, prints train/test scores cv_horizon : `int` or None, default None Number of periods in each CV test set If None, default is ``forecast_horizon``. 
Set either ``cv_horizon`` or ``cv_max_splits`` to 0 to skip CV. cv_min_train_periods : `int` or None, default None Minimum number of periods for training each CV fold. If cv_expanding_window is False, every training period is this size If None, default is 2 * ``cv_horizon`` cv_expanding_window : `bool`, default False If True, training window for each CV split is fixed to the first available date. Otherwise, train start date is sliding, determined by ``cv_min_train_periods``. cv_use_most_recent_splits: `bool`, default False If True, splits from the end of the dataset are used. Else a sampling strategy is applied. Check `~greykite.sklearn.cross_validation.RollingTimeSeriesSplit._sample_splits` for details. cv_periods_between_splits : `int` or None, default None Number of periods to slide the test window between CV splits If None, default is ``cv_horizon`` cv_periods_between_train_test : `int` or None, default None Number of periods for the gap between train and test in a CV split. If None, default is ``periods_between_train_test``. cv_max_splits : `int` or None, default 3 Maximum number of CV splits. Given the above configuration, samples up to max_splits train/test splits, preferring splits toward the end of available data. If None, uses all splits. Set either ``cv_horizon`` or ``cv_max_splits`` to 0 to skip CV. Returns ------- forecast_result : :class:`~greykite.framework.pipeline.pipeline.ForecastResult` Forecast result. See :class:`~greykite.framework.pipeline.pipeline.ForecastResult` for details. * If ``cv_horizon=0``, ``forecast_result.grid_search.best_estimator_`` and ``forecast_result.grid_search.best_params_`` attributes are defined according to the provided single set of parameters. There must be a single set of parameters to skip cross-validation. * If ``test_horizon=0``, ``forecast_result.backtest`` is None. 
""" if hyperparameter_grid is None or hyperparameter_grid == []: hyperparameter_grid = {} # When hyperparameter_grid is a singleton list, unlist it if isinstance(hyperparameter_grid, list) and len(hyperparameter_grid) == 1: hyperparameter_grid = hyperparameter_grid[0] # Loads full dataset ts = UnivariateTimeSeries() ts.load_data( df=df, time_col=time_col, value_col=value_col, freq=freq, date_format=date_format, tz=tz, train_end_date=train_end_date, regressor_cols=regressor_cols, lagged_regressor_cols=lagged_regressor_cols, anomaly_info=anomaly_info) # Splits data into training and test sets. ts.df uses standardized column names if test_horizon == 0: train_df = ts.fit_df train_y = ts.fit_y test_df = pd.DataFrame(columns=list(df.columns)) else: # Make sure to refit best_pipeline appropriately train_df, test_df, train_y, test_y = train_test_split( ts.fit_df, ts.fit_y, train_size=ts.fit_df.shape[0] - test_horizon - periods_between_train_test, test_size=test_horizon + periods_between_train_test, shuffle=False) # this is important since this is timeseries forecasting! log_message(f"Train size: {train_df.shape[0]}. Test size: {test_df.shape[0]}", LoggingLevelEnum.INFO) # Defines default training pipeline if pipeline is None: pipeline = get_basic_pipeline( estimator=estimator, score_func=score_func, score_func_greater_is_better=score_func_greater_is_better, agg_periods=agg_periods, agg_func=agg_func, relative_error_tolerance=relative_error_tolerance, coverage=coverage, null_model_params=null_model_params, regressor_cols=ts.regressor_cols, lagged_regressor_cols=ts.lagged_regressor_cols) # Searches for the best parameters, and refits model with selected parameters on the entire training set if cv_horizon == 0 or cv_max_splits == 0: # No cross-validation. Only one set of hyperparameters is allowed. try: if len(ParameterGrid(hyperparameter_grid)) > 1: raise ValueError( "CV is required to identify the best model because there are multiple options " "in `hyperparameter_grid`. 
Either provide a single option or set `cv_horizon` and `cv_max_splits` " "to nonzero values.") except TypeError: # Parameter value is not iterable raise ValueError( "CV is required to identify the best model because `hyperparameter_grid` contains " "a distribution. Either remove the distribution or set `cv_horizon` and `cv_max_splits` " "to nonzero values.") # Fits model to entire train set. Params must be set manually since it's not done by grid search params = {k: v[0] for k, v in hyperparameter_grid.items()} # unpack lists, `v` is a singleton list with the parameter value best_estimator = pipeline.set_params(**params).fit(train_df, train_y) # Wraps this model in a dummy RandomizedSearchCV object to return the backtest model grid_search = get_hyperparameter_searcher( hyperparameter_grid=hyperparameter_grid, model=pipeline, cv=None, # no cross-validation hyperparameter_budget=hyperparameter_budget, n_jobs=n_jobs, verbose=verbose, score_func=score_func, score_func_greater_is_better=score_func_greater_is_better, cv_report_metrics=cv_report_metrics, agg_periods=agg_periods, agg_func=agg_func, relative_error_tolerance=relative_error_tolerance) # Sets relevant attributes. 
Others are undefined (cv_results_, best_score_, best_index_, scorer_, refit_time_) grid_search.best_estimator_ = best_estimator grid_search.best_params_ = params grid_search.n_splits_ = 0 else: # Defines cross-validation splitter cv = RollingTimeSeriesSplit( forecast_horizon=cv_horizon, min_train_periods=cv_min_train_periods, expanding_window=cv_expanding_window, use_most_recent_splits=cv_use_most_recent_splits, periods_between_splits=cv_periods_between_splits, periods_between_train_test=cv_periods_between_train_test, max_splits=cv_max_splits) # Defines grid search approach for CV grid_search = get_hyperparameter_searcher( hyperparameter_grid=hyperparameter_grid, model=pipeline, cv=cv, hyperparameter_budget=hyperparameter_budget, n_jobs=n_jobs, verbose=verbose, score_func=score_func, score_func_greater_is_better=score_func_greater_is_better, cv_report_metrics=cv_report_metrics, agg_periods=agg_periods, agg_func=agg_func, relative_error_tolerance=relative_error_tolerance) grid_search.fit(train_df, train_y) best_estimator = grid_search.best_estimator_ # Evaluates historical performance, fits model to all data (train+test) if test_horizon > 0: backtest_train_end_date = train_df[TIME_COL].max() # Uses pd.date_range because pd.Timedelta does not work for complicated frequencies e.g. 
"W-MON" backtest_test_start_date = pd.date_range( start=backtest_train_end_date, periods=periods_between_train_test + 2, # Adds 2 as start parameter is inclusive freq=ts.freq)[-1] backtest = get_forecast( df=ts.fit_df, # Backtest needs to happen on fit_df, not on the entire df trained_model=best_estimator, train_end_date=backtest_train_end_date, test_start_date=backtest_test_start_date, forecast_horizon=test_horizon, xlabel=time_col, ylabel=value_col, relative_error_tolerance=relative_error_tolerance) best_pipeline = clone(best_estimator) # Copies optimal parameters best_pipeline.fit(ts.fit_df, ts.y) # Refits this model on entire training dataset else: backtest = None # Backtest training metrics are the same as forecast training metrics best_pipeline = best_estimator # best_model is already fit to all data # Makes future predictions periods = forecast_horizon + periods_between_train_test future_df = ts.make_future_dataframe( periods=periods, include_history=True) forecast_train_end_date = ts.train_end_date # Uses pd.date_range because pd.Timedelta does not work for complicated frequencies e.g. "W-MON" forecast_test_start_date = pd.date_range( start=forecast_train_end_date, periods=periods_between_train_test + 2, # Adds 2 as start parameter is inclusive freq=ts.freq)[-1] forecast = get_forecast( df=future_df, trained_model=best_pipeline, train_end_date=forecast_train_end_date, test_start_date=forecast_test_start_date, forecast_horizon=forecast_horizon, xlabel=time_col, ylabel=value_col, relative_error_tolerance=relative_error_tolerance) result = ForecastResult( timeseries=ts, grid_search=grid_search, model=best_pipeline, backtest=backtest, forecast=forecast ) return result
def forecast_pipeline_rolling_evaluation(
        pipeline_params: Dict,
        tscv: RollingTimeSeriesSplit):
    """Runs ``forecast_pipeline`` on a rolling window basis.

    For every train/test split produced by ``tscv``, ``forecast_pipeline``
    is called with ``train_end_date`` set to the timestamp of the last
    training row of that split and with backtest disabled
    (``test_horizon=0``).

    The per-split overrides are applied to a shallow copy of
    ``pipeline_params``; the dictionary passed by the caller is left untouched.

    Parameters
    ----------
    pipeline_params : `Dict`
        A dictionary containing the input to the
        :func:`~greykite.framework.pipeline.pipeline.forecast_pipeline`.
        Must contain the keys "df", "forecast_horizon" and
        "periods_between_train_test". "time_col" (default ``TIME_COL``) and
        "date_format" (default None) are read if present.
    tscv : `~greykite.sklearn.cross_validation.RollingTimeSeriesSplit`
        Cross-validation object that determines the rolling window evaluation.
        See :class:`~greykite.sklearn.cross_validation.RollingTimeSeriesSplit` for details.

    Returns
    -------
    rolling_evaluation : `dict`
        Stores benchmarking results for each split, e.g. split_0 contains result for first split,
        split_1 contains result for second split and so on.
        Number of splits is determined by the input parameters.
        Every split is a dictionary with keys "runtime_sec" and "pipeline_result".

    Raises
    ------
    ValueError
        If "forecast_horizon" or "periods_between_train_test" in
        ``pipeline_params`` differs from the corresponding attribute of ``tscv``.
    """
    if pipeline_params["forecast_horizon"] != tscv.forecast_horizon:
        raise ValueError(
            "Forecast horizon in 'pipeline_params' does not match that of the 'tscv'."
        )

    if pipeline_params["periods_between_train_test"] != tscv.periods_between_train_test:
        raise ValueError(
            "'periods_between_train_test' in 'pipeline_params' does not match that of the 'tscv'."
        )

    # Works on a shallow copy so the overrides below ("test_horizon",
    # "train_end_date") do not leak into the caller's dictionary.
    pipeline_params = dict(pipeline_params)

    df = pipeline_params["df"]
    time_col = pipeline_params.get("time_col", TIME_COL)
    date_format = pipeline_params.get("date_format")
    # Disables backtest. For rolling evaluation we know the actual values in forecast period.
    # So out of sample performance can be calculated using pipeline_result.forecast
    pipeline_params["test_horizon"] = 0

    rolling_evaluation = {}
    # The splits are materialized in a list so that tqdm knows the total count.
    with tqdm(list(tscv.split(X=df)), ncols=800, leave=True) as progress_bar:
        for split_num, (train, _test) in enumerate(progress_bar):
            # Description will be displayed on the left of progress bar
            progress_bar.set_description(f"Split '{split_num}' ")
            # The last training index defines where this split's training data ends.
            train_end_date = pd.to_datetime(
                df.iloc[train[-1]][time_col],
                format=date_format,
                infer_datetime_format=True)
            pipeline_params["train_end_date"] = train_end_date

            start_time = timeit.default_timer()
            pipeline_result = forecast_pipeline(**pipeline_params)
            runtime = timeit.default_timer() - start_time

            rolling_evaluation[f"split_{split_num}"] = dict(
                runtime_sec=round(runtime, 3),
                pipeline_result=pipeline_result)
            log_message(f"Completed evaluation for split {split_num}.", LoggingLevelEnum.DEBUG)

    return rolling_evaluation
def predict(self, X, y=None):
    """Creates forecast for dates specified in X.

    Each estimator is responsible for a slice of the forecast horizon,
    delimited by consecutive entries of ``self.pred_indices``. When the
    number of future rows in ``X`` differs from ``self.forecast_horizon``,
    only the last estimator (described in the log message as the one with
    the longest forecast horizon) is used.

    Parameters
    ----------
    X : `pandas.DataFrame`
        Input timeseries with timestamp column and any additional regressors.
        Timestamps are the dates for prediction.
        Value column, if provided in X, is ignored.
    y : ignored

    Returns
    -------
    predictions : `pandas.DataFrame`
        Forecasted values for the dates in X. Columns:

            - TIME_COL dates
            - PREDICTED_COL predictions
            - PREDICTED_LOWER_COL lower bound of predictions, optional
            - PREDICTED_UPPER_COL upper bound of predictions, optional
            - [other columns], optional

        ``PREDICTED_LOWER_COL`` and ``PREDICTED_UPPER_COL`` are present
        if ``self.coverage`` is not None.
    """
    # The parent implementation returns a non-None value when a cached
    # prediction can be reused.
    cached = super().predict(X=X)
    if cached is not None:
        return cached

    # With a single estimator there is nothing to dispatch.
    if len(self.estimators) == 1:
        return self.estimators[0].predict(X=X)

    # Rows strictly after the training end date are "future" rows.
    future_mask = pd.to_datetime(X[self.time_col_]) > self.train_end_date
    future_rows = X.loc[future_mask]
    n_future = len(future_rows)

    # The estimator-to-horizon mapping only applies when the future part
    # has exactly the trained forecast horizon; otherwise fall back.
    if n_future != self.forecast_horizon:
        warning_text = (
            f"The future x length is {n_future}, "
            f"which doesn't match the model forecast horizon {self.forecast_horizon}, "
            f"using only the model with the longest forecast horizon for prediction.")
        log_message(message=warning_text, level=LoggingLevelEnum.WARNING)
        return self.estimators[-1].predict(X)

    # Each estimator forecasts its own contiguous slice of the future rows.
    pieces = []
    for position, estimator in enumerate(self.estimators):
        begin = self.pred_indices[position]
        end = self.pred_indices[position + 1]
        pieces.append(estimator.predict(future_rows.iloc[begin:end]))

    # Historical rows, if any, are always predicted by the last estimator
    # and placed before the future predictions.
    if not future_mask.all():
        history_piece = self.estimators[-1].predict(X.loc[~future_mask])
        pieces.insert(0, history_piece)

    return pd.concat(pieces)
def test_log_message():
    """Checks that ``log_message`` emits one CRITICAL record on ``LOGGER_NAME``."""
    message = "Test log message."
    expected_record = (LOGGER_NAME, "CRITICAL", message)
    with LogCapture(LOGGER_NAME) as captured:
        log_message(message, LoggingLevelEnum.CRITICAL)
        # The captured log must contain exactly the expected record.
        captured.check(expected_record)
def summary(self):
    """Logs a warning that the benchmark summary is not available."""
    not_implemented_notice = "Benchmark summary is not implemented yet."
    log_message(not_implemented_notice, LoggingLevelEnum.WARNING)
def flexible_grouping_evaluation(
        df,
        map_func_dict=None,
        groupby_col=None,
        agg_kwargs=None,
        extend_col_names=True,
        unpack_list=True,
        list_names_dict=None):
    """Flexible aggregation. Generates additional columns for evaluation via
    ``map_func_dict``, groups by ``groupby_col``, then aggregates according
    to ``agg_kwargs``.

    This function calls `pandas.DataFrame.apply` and
    `pandas.core.groupby.DataFrameGroupBy.agg` internally.
    The input ``df`` is copied and never modified.

    Parameters
    ----------
    df : `pandas.DataFrame`
        DataFrame to transform / aggregate
    map_func_dict : `dict` [`str`, `callable`] or None, default None
        Row-wise transformation functions to create new columns.
        If None, no new columns are added.

            key: new column name
            value: row-wise function to apply to ``df`` to generate the column value.
                   Signature (row: `pandas.Series`) -> transformed value: `float`.

        For example::

            map_func_dict = {
                "residual": lambda row: row["predicted"] - row["actual"],
                "squared_error": lambda row: (row["predicted"] - row["actual"])**2
            }

    groupby_col : `str` or None, default None
        Which column to group by.
        Can be in ``df`` or generated by ``map_func_dict``.
        If None, no grouping or aggregation is done.
        Any value other than None requests grouping, including falsy
        column labels such as ``0``.
    agg_kwargs : `dict` or None, default None
        Passed as keyword args to `pandas.core.groupby.DataFrameGroupBy.aggregate` after creating
        new columns and grouping by ``groupby_col``. Must be provided if ``groupby_col is not None``.
        To fully customize output column names, pass a dictionary as shown below.

        For example::

            # Example 1, named aggregation to explicitly name output columns.
            # Assume ``df`` contains ``abs_percent_err``, ``abs_err`` columns.
            # Output columns are "MedAPE", "MAPE", "MAE", etc. in a single level index.
            from functools import partial
            agg_kwargs = {
                # output column name: (column to aggregate, aggregation function)
                "MedAPE": pd.NamedAgg(column="abs_percent_err", aggfunc=np.nanmedian),
                "MAPE": pd.NamedAgg(column="abs_percent_err", aggfunc=np.nanmean),
                "MAE": pd.NamedAgg(column="abs_err", aggfunc=np.nanmean),
                "q95_abs_err": pd.NamedAgg(column="abs_err", aggfunc=partial(np.nanquantile, q=0.95)),
                "q05_abs_err": pd.NamedAgg(column="abs_err", aggfunc=partial(np.nanquantile, q=0.05)),
            }

            # Example 2, multi-level aggregation using `func` parameter
            # to `pandas.core.groupby.DataFrameGroupBy.aggregate`.
            # Assume ``df`` contains ``y1``, ``y2`` columns.
            agg_kwargs = {
                "func": {
                    "y1": [np.nanmedian, np.nanmean],
                    "y2": [np.nanmedian, np.nanmax],
                }
            }
            # `extend_col_names` controls the output column names
            extend_col_names = True  # output columns are "y1_nanmedian", "y1_nanmean", "y2_nanmedian", "y2_nanmax"
            extend_col_names = False  # output columns are "nanmedian", "nanmean", "nanmedian", "nanmax"

    extend_col_names : `bool` or None, default True
        How to flatten index after aggregation.
        In some cases, the column index after aggregation is a multi-index.
        This parameter controls how to flatten an index with 2 levels to 1 level.

            - If None, the index is not flattened.
            - If True, column name is a composite: ``{index0}_{index1}``
              Use this option if index1 is not unique.
            - If False, column name is simply ``{index1}``

        Ignored if the ColumnIndex after aggregation has only one level (e.g.
        if named aggregation is used in ``agg_kwargs``).
    unpack_list : `bool`, default True
        Whether to unpack (flatten) columns that contain list/tuple after aggregation,
        to create one column per element of the list/tuple.
        Whether a column holds lists is decided from its value in the first row.
        If True, ``list_names_dict`` can be used to rename the unpacked columns.
    list_names_dict : `dict` [`str`, `list` [`str`]] or None, default None
        If ``unpack_list`` is True, this dictionary can optionally be
        used to rename the unpacked columns.

            - Key = column name after aggregation, before unpacking.
              E.g. ``{index0}_{index1}`` or ``{index1}`` depending on ``extend_col_names``.
            - Value = list of names to use for the unpacked columns. Length must match
              the length of the lists contained in the column.

        If a particular list/tuple column is not found in this dictionary, appends
        0, 1, 2, ..., n-1 to the original column name, where n = list length.

        For example, if the column contains a tuple of length 4 corresponding to
        quantiles 0.1, 0.25, 0.75, 0.9, then the following would be appropriate::

            aggfunc = lambda grp: partial(np.nanquantile, q=[0.1, 0.25, 0.75, 0.9])(grp).tolist()
            agg_kwargs = {
                "value_Q": pd.NamedAgg(column="value", aggfunc=aggfunc)
            }
            list_names_dict = {
                # the key is the name of the unpacked column
                "value_Q" : ["Q0.10", "Q0.25", "Q0.75", "Q0.90"]
            }
            # Output columns are "Q0.10", "Q0.25", "Q0.75", "Q0.90"

            # In this example, if list_names_dict=None, the default output column names
            # would be: "value_Q0", "value_Q1", "value_Q2", "value_Q3"

    Returns
    -------
    df_transformed : `pandas.DataFrame`
        df after transformation and optional aggregation.

        If ``groupby_col`` is None, returns ``df`` with additional columns as the keys in ``map_func_dict``.
        Otherwise, ``df`` is grouped by ``groupby_col`` and this becomes the index. Columns
        are determined by ``agg_kwargs`` and ``extend_col_names``.

    Raises
    ------
    ValueError
        If ``groupby_col`` is not None but ``agg_kwargs`` is missing or empty,
        or if an entry of ``list_names_dict`` does not have as many names as
        there are unpacked columns.
    """
    # Grouping is requested by any non-None key. Truthiness must not be used
    # here: a valid column label can be falsy (e.g. the integer label 0).
    grouping_requested = groupby_col is not None
    if grouping_requested and not agg_kwargs:
        raise ValueError(
            "Must specify `agg_kwargs` if grouping is requested via `groupby_col`."
        )
    if agg_kwargs and not grouping_requested:
        log_message(
            "`agg_kwargs` is ignored because `groupby_col` is None. "
            "Specify `groupby_col` to allow aggregation.",
            LoggingLevelEnum.WARNING)

    # Copies so that the new columns do not appear in the caller's dataframe.
    df = df.copy()
    if map_func_dict is not None:
        for col_name, func in map_func_dict.items():
            df[col_name] = df.apply(func, axis=1)

    if grouping_requested:
        groups = df.groupby(groupby_col)
        with warnings.catch_warnings():
            # Ignores pandas FutureWarning. Use NamedAgg in pandas 0.25.+
            warnings.filterwarnings(
                "ignore",
                message="using a dict with renaming is deprecated",
                category=FutureWarning)
            df_transformed = groups.agg(**agg_kwargs)
        if extend_col_names is not None and df_transformed.columns.nlevels > 1:
            # Flattens multi-level column index
            if extend_col_names:
                # By concatenating names
                df_transformed.columns = [
                    "_".join(col).strip("_") for col in df_transformed.columns
                ]
            else:
                # By using level 1 names
                df_transformed.columns = list(
                    df_transformed.columns.get_level_values(1))
                if np.any(df_transformed.columns.duplicated()):
                    warnings.warn(
                        "Column names are not unique. Use `extend_col_names=True` "
                        "to uniquely identify every column.")
    else:
        # No grouping is requested
        df_transformed = df

    if unpack_list and df_transformed.shape[0] > 0:
        # Identifies the columns that contain list elements,
        # judged by the values in the first row.
        which_list_cols = df_transformed.iloc[0].apply(
            lambda x: isinstance(x, (list, tuple)))
        list_cols = list(which_list_cols[which_list_cols].index)
        for col in list_cols:
            # Selecting a duplicated column name yields a DataFrame, not a Series.
            if isinstance(df_transformed[col], pd.DataFrame):
                warnings.warn(
                    f"Skipping list unpacking for `{col}`. There are multiple columns "
                    f"with this name. Make sure column names are unique to enable unpacking."
                )
                continue
            # Unpacks the column, creating one column for each list entry
            list_df = pd.DataFrame(df_transformed[col].to_list())
            n_cols = list_df.shape[1]
            # Adds column names
            if list_names_dict is not None and col in list_names_dict:
                new_names = list_names_dict[col]
                found_length = len(new_names)
                if found_length != n_cols:
                    raise ValueError(
                        f"list_names_dict['{col}'] has length {found_length}, "
                        f"but there are {n_cols} columns to name. Example row(s):\n"
                        f"{list_df.head(2)}")
                list_df.columns = [f"{new_names[i]}" for i in range(n_cols)]
            else:
                list_df.columns = [f"{col}{i}" for i in range(n_cols)]
            # replaces original column with new ones
            list_df.index = df_transformed.index
            del df_transformed[col]
            df_transformed = pd.concat([df_transformed, list_df], axis=1)

        if list_names_dict:
            # Reports keys that did not match any list-valued column.
            unused_names = sorted(set(list_names_dict) - set(list_cols))
            if len(unused_names) > 0:
                warnings.warn(
                    "These names from `list_names_dict` are not used, because the "
                    "column (key) is not found in the dataframe after aggregation:\n"
                    f"{unused_names}.\nAvailable columns are:\n"
                    f"{list_cols}.")

    return df_transformed
def summary(self):
    """Logs this estimator and its ``model`` attribute at DEBUG level.

    The original documentation describes the output as the input parameters
    and the DummyRegressor model parameters.
    """
    debug_level = LoggingLevelEnum.DEBUG
    # First the estimator itself, then the wrapped model.
    log_message(self, debug_level)
    log_message(self.model, debug_level)
def split(self, X, y=None, groups=None):
    """Generates indices to split data into training and test CV folds
    according to rolling window time series cross validation.

    If no split is possible under the configured settings, a single default
    split is yielded with the first 90% of the samples for training and the
    remainder for testing.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
        Must have `shape` method.
    y : array-like, shape (n_samples,), optional
        The target variable for supervised learning problems.
        Always ignored, exists for compatibility.
    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set. Always ignored, exists for compatibility.

    Yields
    ------
    train : `numpy.array`
        The training set indices for that split.
    test : `numpy.array`
        The testing set indices for that split.
    """
    sample_count = X.shape[0]
    all_indices = np.arange(sample_count)
    uncapped_split_count = self.get_n_splits_without_capping(X=X)
    split_count = self.get_n_splits(X=X)
    nothing_to_split = uncapped_split_count == 0

    # Warns about degenerate or potentially slow configurations.
    if nothing_to_split:
        warnings.warn(
            "There are no CV splits under the requested settings. Decrease `forecast_horizon` and/or"
            " `min_train_periods`. Using default 90/10 CV split")
    elif split_count == 1:
        warnings.warn("There is only one CV split")
    elif split_count >= 10:
        warnings.warn(
            f"There is a high number of CV splits ({split_count}). If training is slow, increase "
            f"`periods_between_splits` or `min_train_periods`, or decrease `max_splits`"
        )

    log_message(f"There are {split_count} CV splits.", LoggingLevelEnum.INFO)

    if nothing_to_split:
        # Falls back to the default split.
        train_fraction = 0.9
        cutoff = int(round(sample_count * train_fraction))
        yield all_indices[:cutoff], all_indices[cutoff:]
        return

    # Determines which splits to keep so that up to max_splits are returned.
    kept_splits = self._sample_splits(uncapped_split_count)
    final_index = sample_count - 1
    # NOTE: ``self.__starting_test_index`` relies on Python name mangling,
    # so this method must stay inside the class that defines the attribute.
    test_stop = self.__starting_test_index + self._get_offset(X=X)
    split_position = 0
    while test_stop <= final_index:
        # All bounds below are inclusive indices.
        test_start = test_stop - self.forecast_horizon + 1
        train_stop = test_start - self.periods_between_train_test - 1
        if self.expanding_window:
            train_start = 0
        else:
            train_start = train_stop - self.min_train_periods + 1
        assert train_start >= 0  # guaranteed by n_splits > 0

        if split_position in kept_splits:
            log_message(
                f"CV split: Train {train_start} to {train_stop}. "
                f"Test {test_start} to {test_stop}.",
                LoggingLevelEnum.DEBUG)
            train_indices = all_indices[train_start:train_stop + 1]
            test_indices = all_indices[test_start:test_stop + 1]
            yield train_indices, test_indices

        # Slides the test window forward; skipped splits still count.
        test_stop += self.periods_between_splits
        split_position += 1