import pandas as pd
from skgarden import RandomForestQuantileRegressor  # quantile-capable forest from scikit-garden

# QuantileRegression is the project's own base class and is assumed to be in scope.
class RandomForestRegression(QuantileRegression):
    def __init__(self, qt, x, y, params={}):
        """
        Parameters
        ----------
        qt: float
            the quantile we want to estimate
        x: DataFrame
            feature dataset
        y: DataFrame
            target dataset
        params: dictionary
            a dictionary containing hyper-parameter key-value pairs of the model

        Internal Attributes
        -------------------
        self.random_forest: RandomForestQuantileRegressor object or None
            the fitted model
        """
        super(RandomForestRegression, self).__init__(qt, x, y, params)
        self.random_forest = None
        self.fit_model()

    def fit_model(self):
        """
        fit the random forest quantile regression model using the training dataset

        Returns
        -------
        output: RandomForestQuantileRegressor object
            the random forest quantile regression model
        """
        x_train_dummy = pd.get_dummies(self.x)
        self.random_forest = RandomForestQuantileRegressor()
        self.random_forest.set_params(**self.params)
        self.random_forest = self.random_forest.fit(x_train_dummy, self.y)
        return self.random_forest

    def feature_importance(self):
        """
        Sort the features from the most important to the least important

        Returns
        -------
        output: Series
            the features sorted from most important to least important, with their importance values
        """
        feature_importances = self.random_forest.feature_importances_
        feature_importances = pd.Series(feature_importances, index=pd.get_dummies(self.x).columns)
        return feature_importances.sort_values(ascending=False)

    def predict(self, data):
        """
        predict the qt-th quantile for new data

        Parameters
        ----------
        data: DataFrame
             new data

        Returns
        -------
        output: numpy.ndarray
            predicted quantile for new data
        """
        data_dummy = pd.get_dummies(data)
        return self.random_forest.predict(data_dummy, quantile=self.qt * 100)
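
A minimal usage sketch (not part of the original class), assuming the QuantileRegression base class is importable from the surrounding codebase; the data and hyper-parameters below are hypothetical.

# Hypothetical training data: one categorical and one numeric feature.
x = pd.DataFrame({"store": ["a", "b", "a", "b"], "day": [1, 2, 3, 4]})
y = pd.DataFrame({"sales": [10.0, 12.0, 11.0, 15.0]})

# Fit the 0.9-quantile model; params are forwarded to the underlying forest.
model = RandomForestRegression(qt=0.9, x=x, y=y,
                               params={"n_estimators": 100, "random_state": 0})
print(model.feature_importance())   # features sorted by importance
print(model.predict(x))             # 0.9-quantile predictions for the same rows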
Example #2
import numpy as np
from sklearn.model_selection import KFold
from skgarden import RandomForestQuantileRegressor

# X and y are assumed to be NumPy arrays (features and target) defined earlier.
y = y.reshape(y.shape[0], )

kf = KFold(n_splits=6, shuffle=True, random_state=0)  # shuffle is required when random_state is set
rfqr = RandomForestQuantileRegressor(random_state=0,
                                     min_samples_split=10,
                                     n_estimators=1000)

y_true_all = []
lower = []
upper = []

for train_index, test_index in kf.split(X):
    X_train, X_test, y_train, y_test = (X[train_index], X[test_index],
                                        y[train_index], y[test_index])

    rfqr.set_params(max_features=X_train.shape[1] // 3)
    rfqr.fit(X_train, y_train)
    y_true_all = np.concatenate((y_true_all, y_test))
    upper = np.concatenate((upper, rfqr.predict(X_test, quantile=98.5)))
    lower = np.concatenate((lower, rfqr.predict(X_test, quantile=2.5)))

interval = upper - lower
sort_ind = np.argsort(interval)
y_true_all = y_true_all[sort_ind]
upper = upper[sort_ind]
lower = lower[sort_ind]
mean = (upper + lower) / 2

# Center such that the mean of the prediction interval is at 0.0
y_true_all -= mean
upper -= mean
lower -= mean
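
To visualize the result, a short matplotlib sketch (not part of the original snippet) can plot the centered intervals against the centered observations:

import matplotlib.pyplot as plt

# Sorted, centered 96% prediction intervals (2.5th to 98.5th percentile)
# with the centered observations drawn on top.
plt.fill_between(np.arange(len(upper)), lower, upper, alpha=0.2, color="r",
                 label="predicted interval")
plt.plot(y_true_all, "o", markersize=2, label="observations")
plt.xlabel("ordered samples")
plt.ylabel("centered target value")
plt.legend()
plt.show()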
Example #3
import datetime

import numpy
import pandas
from skgarden import RandomForestQuantileRegressor

# parse_data_for_training and parse_data_for_forecast are helpers from the
# surrounding codebase and are assumed to be importable here.


class ComponentForecast:
    def __init__(self,
                 dependent_var_str: str,
                 len_of_lag=48,
                 len_of_forecast=48,
                 min_samples_split=2,
                 len_of_test=48,
                 n_estimators=1000,
                 n_jobs=4):
        """
        initializing class
        :param dependent_var_str:  sets variable to be fit
        :param min_samples_split:  minimum number of samples needed to generate a new branch
        :param n_estimators:       number of estimators used
        """
        self.model = RandomForestQuantileRegressor(
            min_samples_split=min_samples_split,
            n_estimators=n_estimators,
            bootstrap=True,
            # min_weight_fraction_leaf=0.01, max_leaf_nodes=1000,
            n_jobs=n_jobs)
        self.dependent_var = dependent_var_str
        self.length_of_lag = len_of_lag
        self.length_of_test = len_of_test
        self.length_of_forecast = len_of_forecast

    def train(self, df: pandas.DataFrame):
        x, y = parse_data_for_training(df,
                                       self.dependent_var,
                                       length_of_lag=self.length_of_lag,
                                       length_of_test=self.length_of_test)
        self.model.set_params(max_features=x.shape[1])
        self.model.fit(x, y)

    def test(self, df: pandas.DataFrame):
        #x = parse_data_for_forecast(df[:df.index[-self.length_of_test]], self.dependent_var,
        #                            length_of_lag=self.length_of_lag, length_of_forecast=self.length_of_forecast)
        #values = self.model.predict(x)
        #fcst = pandas.Series(values, df.index[-self.length_of_test:])
        fcst = self.predict(df[:df.index[-self.length_of_test]])
        diff = (fcst - df.loc[df.index[-self.length_of_test]:, self.dependent_var]) / \
               df.loc[df.index[-self.length_of_test]:, self.dependent_var]
        rms_err = numpy.sqrt(numpy.nanmean(diff**2))
        print(' RMS error: {}'.format(rms_err))
        return rms_err

    def predict(self, df: pandas.DataFrame, quantile=None):
        x = parse_data_for_forecast(df,
                                    self.dependent_var,
                                    length_of_lag=self.length_of_lag,
                                    length_of_forecast=self.length_of_forecast)
        values = self.model.predict(x, quantile=quantile)
        index = pandas.date_range(
            start=df.index[-1] -
            datetime.timedelta(minutes=15 * self.length_of_lag),
            periods=self.length_of_forecast + self.length_of_lag,
            freq='15T')
        fcast = pandas.Series(values[1:], index=index)
        if numpy.nansum(fcast) == 0:
            scale = 1
        else:
            scale = numpy.nansum(df.loc[index[0]:,
                                        self.dependent_var]) / numpy.nansum(
                                            fcast[:df.index[-1]])
        return scale * fcast[df.index[-1] + datetime.timedelta(minutes=15):]
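
A hedged usage sketch (not part of the original class), assuming a 15-minute-resolution DataFrame and that the parse_data_* helpers behave as the class expects; the column name "load" is hypothetical.

# Hypothetical 15-minute time series with a single 'load' column.
index = pandas.date_range("2021-01-01", periods=4 * 24 * 14, freq="15T")
df = pandas.DataFrame({"load": numpy.random.rand(len(index))}, index=index)

forecaster = ComponentForecast("load", len_of_lag=48, len_of_forecast=48,
                               len_of_test=48, n_estimators=200)
forecaster.train(df)                                  # fit on lagged features
forecaster.test(df)                                   # prints and returns the relative RMS error
median_fcst = forecaster.predict(df, quantile=50)     # 50th-percentile forecast
upper_fcst = forecaster.predict(df, quantile=97.5)    # upper bound of a prediction interval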