Example #1
def create_fitted_model(
    specs: ModelSpecs,
    version: str,  # TODO: throw out?
    regression_frame: Optional[pd.DataFrame] = None,
) -> MODEL_TYPES:
    """
    Create a new fitted model with the given specs.
    """
    if regression_frame is None:
        regression_frame = construct_features(time_range="train", specs=specs)

    # Remove any observation where data is missing.
    # Other parts of the workflow cannot handle missing data, so it all has to be caught here.
    regression_frame = regression_frame.dropna(axis=0)
    if regression_frame.empty:
        raise MissingData(
            "Missing data (probably one of the regressors contains no data)")

    x_train = regression_frame.drop(columns=[specs.outcome_var.name])
    y_train = regression_frame[specs.outcome_var.name]

    package_str = specs.model_type.__module__.split(".")[0]
    if package_str == "statsmodels":
        model = specs.model_type(y_train, x_train, **specs.model_params)
        fitted_model = model.fit()
        fitted_model.get_params = fitted_model.params  # mirror sklearn's attribute name
    elif package_str == "sklearn":
        model = specs.model_type(**specs.model_params)
        fitted_model = model.fit(X=x_train, y=y_train)
        fitted_model.params = fitted_model.get_params  # mirror statsmodels' attribute name
    else:
        raise UnsupportedModel("Unknown model type: %s" % specs.model_type)

    return fitted_model
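
For context, a call might look as follows. This is a minimal sketch only: the ModelSpecs constructor arguments are assumptions inferred from the attributes the function reads (model_type, model_params, outcome_var.name), not a confirmed API.

# Minimal usage sketch -- the ModelSpecs constructor arguments below are
# assumptions inferred from the attribute accesses above.
from sklearn.linear_model import LinearRegression

specs = ModelSpecs(                        # hypothetical constructor signature
    model_type=LinearRegression,           # a class, not an instance; the function instantiates it
    model_params={"fit_intercept": True},  # passed as **kwargs to the constructor
    outcome_var=outcome_var,               # hypothetical object exposing a .name attribute
)
fitted = create_fitted_model(specs, version="1")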
Example #2
def create_fitted_model(
    specs: ModelSpecs,
    version: str,  # TODO: throw out?
    regression_frame: Optional[pd.DataFrame] = None,
) -> MODEL_TYPES:
    """
    Create a new fitted model with the given specs.
    """
    if regression_frame is None:
        regression_frame = construct_features(time_range="train", specs=specs)

    x_train = regression_frame.drop(columns=[specs.outcome_var.name])
    y_train = regression_frame[specs.outcome_var.name]

    if specs.library_name == "statsmodels":
        model = specs.model_type(y_train, x_train, **specs.model_params)
        fitted_model = model.fit()
        fitted_model.get_params = fitted_model.params  # mirror sklearn's attribute name
    elif specs.library_name == "sklearn":
        model = specs.model_type(**specs.model_params)
        fitted_model = model.fit(X=x_train, y=y_train)
        fitted_model.params = fitted_model.get_params  # mirror statsmodels' attribute name
    else:
        raise UnsupportedModel(
            "Not sure which library your model is based on: %s."
            " See ModelSpecs.set_model." % specs.model_type)

    return fitted_model
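
Note the difference from Example #1: the library is read from specs.library_name instead of being parsed from the model class. Example #1's parsing approach can be reproduced in isolation (Ridge is an arbitrary illustration):

# Deriving the library name from a model class, as Example #1 does.
from sklearn.linear_model import Ridge

package_str = Ridge.__module__.split(".")[0]
print(package_str)  # "sklearn"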
Example #3
def make_rolling_forecasts(
    start: datetime,  # Start of forecast period
    end: datetime,  # End of forecast period
    model_specs: ModelSpecs,
) -> Tuple[pd.Series, ModelState]:
    """
    Repeatedly call make_forecast - for all time steps the desired time window
    (end is excluding).
    The time window of the specs (training + test data) is moved forwards also step by step.
    Will fail if series specs do not allocate enough data.
    May create a model whenever the previous one is outdated.
    Return Pandas.Series as result, as well as the last ModelState.
    """

    # Prepare time range (datetime.replace returns a new object, so re-assign)
    if start.tzinfo is None:
        start = start.replace(tzinfo=pytz.utc)
    if end.tzinfo is None:
        end = end.replace(tzinfo=pytz.utc)

    # First, compute one big feature frame, once.
    feature_frame: pd.DataFrame = construct_features(
        (model_specs.start_of_training, end), model_specs)

    pd_frequency = timedelta_to_pandas_freq_str(model_specs.frequency)
    # closed="left" excludes `end`; pandas >= 1.4 renames this parameter to inclusive="left"
    values = pd.Series(index=pd.date_range(
        start, end, freq=pd_frequency, closed="left", tz=start.tzinfo))
    time_step = start
    model = None
    logger.info("Forecasting from %s to %s" % (start, end))
    while time_step < end:
        # Carry the updated specs forward, so the moved time window is used
        # in the next iteration and in the returned ModelState.
        model, model_specs = update_model(time_step,
                                          model,
                                          model_specs,
                                          feature_frame=feature_frame).split()
        # Select this time step's feature row (first column is the outcome variable).
        features = feature_frame.loc[time_step:time_step].iloc[:, 1:]
        values[time_step] = make_forecast_for(model_specs, features, model)
        time_step = time_step + model_specs.frequency

    return values, ModelState(model, model_specs)
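
A call might look like this. The construction of model_specs is not shown, since the ModelSpecs API is not part of this snippet; everything else follows the signature above.

# Hypothetical call -- model_specs is assumed to be an already configured ModelSpecs.
from datetime import datetime
import pytz

start = datetime(2020, 1, 1, tzinfo=pytz.utc)
end = datetime(2020, 1, 2, tzinfo=pytz.utc)
forecasts, last_state = make_rolling_forecasts(start, end, model_specs)
print(forecasts.head())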
Example #4
def evaluate_models(m1: ModelState,
                    m2: Optional[ModelState] = None,
                    plot_path: Optional[str] = None):
    """
    Run a model or two against test data and plot results.
    Useful to judge model performance or compare two models.
    Shows RMSE values, plots error distributions and prints the time it took to forecast.

    TODO: support testing m2 next to m1
    """
    fitted_m1, m1_specs = m1.split()

    regression_frame = construct_features(time_range="test", specs=m1_specs)

    # First column is the outcome variable; the remaining columns are regressors.
    x_test = regression_frame.iloc[:, 1:]
    y_test = np.array(regression_frame.iloc[:, 0])

    try:
        y_hat_test = fitted_m1.predict(x_test)
    except TypeError:
        # statsmodels results predict by index range (with exogenous data),
        # while sklearn predicts directly from X
        y_hat_test = fitted_m1.predict(start=x_test.index[0],
                                       end=x_test.index[-1],
                                       exog=x_test)

    # Back-transform if the data was transformed
    if m1_specs.outcome_var.feature_transformation is not None:
        y_test = m1_specs.outcome_var.feature_transformation.back_transform_value(
            y_test)
        y_hat_test = m1_specs.outcome_var.feature_transformation.back_transform_value(
            y_hat_test)

    print("rmse = %s" % (str(
        round(sm.tools.eval_measures.rmse(y_test, y_hat_test, axis=0), 4))))

    plot_true_versus_predicted(regression_frame.index, y_test, y_hat_test,
                               None, None, plot_path)

    plot_error_graph(y_test, y_hat_test, plot_path=plot_path)
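
The back-transform step assumes the outcome variable may have been transformed before fitting. A hypothetical transformation object satisfying the back_transform_value interface used above could look like this (only back_transform_value is referenced by the snippet; the forward method is an assumed counterpart):

# Hypothetical transformation exposing the back_transform_value interface
# referenced above; a log transform is a common choice for positive data.
import numpy as np

class LogTransformation:
    def transform_value(self, x):       # assumed forward counterpart
        return np.log(x)

    def back_transform_value(self, x):  # the method evaluate_models calls
        return np.exp(x)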