Пример #1
0
    def __modeling(self, data):
        '''
        Fit Lasso models on min-max scaled data and rank feature influence.

        Every column of ``data`` is rescaled to [0, 1]. The ``OP_TIME`` column
        and the target column (``self.__class__.__target[0]``) are then removed
        from the feature matrix. One Lasso model is trained per
        ``TimeSeriesSplit`` fold, with alpha chosen by ``LassoCV`` on the
        training part of that fold. The model with the highest score on its
        own test part supplies the coefficients.

        :param data: DataFrame containing ``OP_TIME``, the target column and
            the feature columns; every column is passed to ``MinMaxScaler``
        :return: dict mapping each feature name to its influence factor, a
            string with two decimals giving that feature's share of the summed
            absolute coefficients (``'0.00'`` for every feature when all
            coefficients are zero)
        '''
        heads = data.columns
        # (0,1) transformation; fit_transform returns a bare array, so the
        # column names are restored afterwards
        scaler = MinMaxScaler(feature_range=(0, 1))
        data = pd.DataFrame(scaler.fit_transform(data))
        data.columns = heads
        # X,y
        target = self.__class__.__target[0]
        poly_X = data.drop(['OP_TIME', target], axis=1)
        y = data[target]
        kf = TimeSeriesSplit(n_splits=3)
        print("start trainning model...")
        # nested CV: 3 time-series folds outside, LassoCV's 10 folds inside
        scores = []
        lasso_models = []
        for train_index, test_index in kf.split(poly_X):
            print("finding relatively better alpha...")
            X_train, X_test = poly_X.iloc[train_index], poly_X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            lassocv = linear_model.LassoCV(cv=10, max_iter=1500)
            lassocv.fit(X_train, y_train)
            # refit a plain Lasso with the alpha that LassoCV selected
            lasso = linear_model.Lasso(alpha=lassocv.alpha_).fit(
                X_train, y_train)
            lasso_models.append(lasso)
            scores.append(lasso.score(X_test, y_test))
        # keep the model that scored best on its own held-out fold
        best_model = lasso_models[np.asarray(scores).argmax()]
        cv_result = model_selection.cross_val_score(
            best_model, poly_X, y, cv=kf, scoring='neg_mean_squared_error')
        print('the mean neg_mse_score for LassoRegression is %s' %
              (np.mean(np.asarray(cv_result))))
        # absolute coefficients of the selected model
        factors = np.abs(np.asarray(best_model.coef_))
        # influence factor = share of the total absolute weight; Lasso can
        # shrink every coefficient to zero, in which case the shares are
        # reported as zero instead of dividing by zero
        total = np.sum(factors)
        if total > 0:
            influence = factors / total
        else:
            influence = np.zeros_like(factors)

        # format each share with two decimals
        return {
            name: '%.2f' % share
            for name, share in zip(poly_X.columns, influence)
        }
Пример #2
0
    def predict_ahead(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Make a single forecast with a Lasso Regression model

        The pipeline is refitted only when no model is stored yet or when the
        call counter has reached ``train_frequency``; otherwise the stored
        model is reused for the forecast.

        Parameters
        ----------
        df : pandas DataFrame
            the training (streamed) data to model

        Returns
        -------
        predictions : pandas DataFrame
            the forecast -> (1 row, W columns) where W is the forecast_window
        """
        # preprocess the data for supervised machine learning
        X, Y, X_new = self.preprocessing(df, binary=True)

        if self._counter >= self.train_frequency or self._model is None:
            # attributes are written through object.__setattr__ throughout
            # this method (presumably the class blocks plain assignment -
            # TODO confirm)
            object.__setattr__(self, "_counter", 0)

            # set up the machine learning model
            if self.tune_model:
                # Hand the splitter itself to LassoCV: an integer cv selects
                # plain K-fold, which validates on rows that precede part of
                # the training rows.
                tscv = TimeSeriesSplit(n_splits=3)
                model = LassoCV(cv=tscv, eps=1e-9, n_alphas=16, n_jobs=N_JOBS)
            else:
                model = Lasso(alpha=0.1, warm_start=True)
            if MULTI:
                # parallelise across outputs only when LassoCV is not already
                # using N_JOBS itself
                model = MultiOutputRegressor(
                    model, n_jobs=1 if self.tune_model else N_JOBS)

            # set up a machine learning pipeline
            pipeline = Pipeline([
                ("var", VarianceThreshold()),
                # ('poly', PolynomialFeatures(2)),  # longer run time, potentially more accurate
                # ('var2', VarianceThreshold()),  # use this if 'poly' is used
                # ('shape', QuantileTransformer(output_distribution="normal")),  # make input variables normally distributed
                ("scale", MinMaxScaler()),
                ("model", model),
            ])

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")  # ignore common warning
                object.__setattr__(
                    self,
                    "_model",
                    pipeline.fit(X, Y)  # train the model
                )

        predictions = self._model.predict(X_new)  # forecast
        predictions = pd.DataFrame(predictions)
        # count this forecast towards the next scheduled refit
        object.__setattr__(self, "_counter", self._counter + 1)
        return predictions
Пример #3
0
def test_time_series_cv():
    """Check TimeSeriesSplit's fold-count validation, ordering and split count."""
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]]

    # Requesting 7 splits of 7 samples must be rejected on the first draw
    oversized = TimeSeriesSplit(n_splits=7).split(X)
    assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
                         next, oversized)

    tscv = TimeSeriesSplit(2)

    # Expected (train, test) indices per fold: training data always precedes
    # the test data and grows from fold to fold.
    folds_for_six_samples = [
        ([0, 1], [2, 3]),
        ([0, 1, 2, 3], [4, 5]),
    ]
    folds_for_seven_samples = [
        ([0, 1, 2], [3, 4]),
        ([0, 1, 2, 3, 4], [5, 6]),
    ]

    splits = tscv.split(X[:-1])
    for expected_train, expected_test in folds_for_six_samples:
        train, test = next(splits)
        assert_array_equal(train, expected_train)
        assert_array_equal(test, expected_test)

    splits = TimeSeriesSplit(2).split(X)
    for expected_train, expected_test in folds_for_seven_samples:
        train, test = next(splits)
        assert_array_equal(train, expected_train)
        assert_array_equal(test, expected_test)

    # The number of generated splits must match get_n_splits()
    n_splits_actual = sum(1 for _ in TimeSeriesSplit(2).split(X))
    assert_equal(n_splits_actual, tscv.get_n_splits())
    assert_equal(n_splits_actual, 2)
Пример #4
0
def test_time_series_cv():
    """Exercise TimeSeriesSplit on a 7-sample toy dataset."""
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]]

    def check_next_fold(fold_iter, expected_train, expected_test):
        # Draw one fold and compare its index arrays with the expectation
        train, test = next(fold_iter)
        assert_array_equal(train, expected_train)
        assert_array_equal(test, expected_test)

    # 7 splits cannot be cut out of 7 samples: the first draw must raise
    assert_raises_regexp(
        ValueError,
        "Cannot have number of folds.*greater",
        next,
        TimeSeriesSplit(n_splits=7).split(X),
    )

    tscv = TimeSeriesSplit(2)

    # Six samples: ordering is preserved and the training window expands
    six_sample_folds = tscv.split(X[:-1])
    check_next_fold(six_sample_folds, [0, 1], [2, 3])
    check_next_fold(six_sample_folds, [0, 1, 2, 3], [4, 5])

    # Seven samples: the extra sample lands in the first training window
    seven_sample_folds = TimeSeriesSplit(2).split(X)
    check_next_fold(seven_sample_folds, [0, 1, 2], [3, 4])
    check_next_fold(seven_sample_folds, [0, 1, 2, 3, 4], [5, 6])

    # get_n_splits must agree with the number of folds actually produced
    produced = list(TimeSeriesSplit(2).split(X))
    n_splits_actual = len(produced)
    assert_equal(n_splits_actual, tscv.get_n_splits())
    assert_equal(n_splits_actual, 2)
Пример #5
0
    def predict_ahead(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Make a single forecast with a Neural Network model

        The pipeline is refitted only when no model is stored yet or when the
        call counter has reached ``train_frequency``; otherwise the stored
        model is reused for the forecast.

        Parameters
        ----------
        df : pandas DataFrame
            the training (streamed) data to model

        Returns
        -------
        predictions : pandas DataFrame
            the forecast -> (1 row, W columns) where W is the forecast_window
        """
        # preprocess the data for supervised machine learning
        X, Y, X_new = self.preprocessing(df, binary=False)

        if self._counter >= self.train_frequency or self._model is None:
            # attributes are written through object.__setattr__ throughout
            # this method (presumably the class blocks plain assignment -
            # TODO confirm)
            object.__setattr__(self, "_counter", 0)

            # set up a machine learning pipeline
            model = MLPRegressor(
                max_iter=25,
                hidden_layer_sizes=(64, 64),
                learning_rate_init=0.001,
                batch_size=16,
                alpha=0,
                learning_rate="adaptive",
                activation="relu",
                solver="adam",
                warm_start=True,
                shuffle=False,
                random_state=42,
                verbose=False,
            )
            if MULTI:
                model = MultiOutputRegressor(
                    model,
                    n_jobs=N_JOBS,
                )
            pipeline = Pipeline(
                [
                    ("var", VarianceThreshold()),
                    ("scale", MinMaxScaler()),
                    ("model", model),
                ]
            )

            if self.tune_model:
                # Hand the splitter itself to the search: an integer cv
                # selects plain K-fold, which validates on rows that precede
                # part of the training rows.
                tscv = TimeSeriesSplit(n_splits=3)

                # set up the tuner; when the network is wrapped in
                # MultiOutputRegressor its parameters sit one level deeper
                str_ = ""
                if MULTI:
                    str_ = "estimator__"
                parameters = {
                    f"model__{str_}hidden_layer_sizes": (
                        (32, 32),
                        (64, 64),
                        (128, 128),
                    ),
                    f"model__{str_}batch_size": (16, 32),
                    f"model__{str_}learning_rate_init": (0.0001, 0.001, 0.01),
                }
                grid = RandomizedSearchCV(
                    pipeline,
                    parameters,
                    n_iter=16,
                    cv=tscv,
                    random_state=0,
                    n_jobs=1 if MULTI else N_JOBS,
                )

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")  # ignore common warning
                    object.__setattr__(
                        self,
                        "_model",
                        grid.fit(X, Y).best_estimator_,  # search for the best model
                    )
            else:
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")  # ignore common warning
                    object.__setattr__(
                        self, "_model", pipeline.fit(X, Y)  # train the model
                    )

        predictions = self._model.predict(X_new)  # forecast
        predictions = pd.DataFrame(predictions)
        # count this forecast towards the next scheduled refit
        object.__setattr__(self, "_counter", self._counter + 1)
        return predictions
    def foward_chain_cv(self, scoring_metric, greater_is_better=False):
        """
        Evaluate ``self.regressor`` with forward-chaining (time series) CV.

        For each ``TimeSeriesSplit`` fold the data is optionally scaled with
        ``self.scalar``, hyper-parameters are tuned on the training part via
        ``self.find_optimal_paramters``, the regressor is refitted, and its
        predictions on the test part are scored (MAE, MSE, explained variance,
        r^2). One chart per split and one continuous chart are drawn, the
        metrics are printed, and nothing is returned.

        NOTE(review): ``X`` and ``y`` are neither parameters nor locals of
        this method, so they must resolve from an enclosing scope. They are
        indexed with ``.iloc`` and the index labels of ``X`` are parsed as
        '%m/%d/%Y %H:%M' strings - confirm where they are defined.

        Parameters
        ----------
        scoring_metric :
            forwarded unchanged to ``self.find_optimal_paramters``
        greater_is_better : bool
            forwarded unchanged to ``self.find_optimal_paramters``
        """
        MAE = []
        Exp_var = []
        MSE = []
        r_squared = []
        params_used = {}

        y_pred_cont = []
        y_test_cont = []
        y_pred_cont_index = []
        split_dates = []

        fig = plt.figure()

        # Smallest near-square grid with one cell per split. The former
        # int(sqrt(n)) x int(sqrt(n) + 1) grid had too few cells for some
        # split counts (e.g. 3, 7, 8), which made add_subplot raise.
        n_cols = int(sqrt(self.no_splits))
        if n_cols * n_cols < self.no_splits:
            n_cols += 1
        n_rows = -(-self.no_splits // n_cols)  # ceiling division

        tscv = TimeSeriesSplit(n_splits=self.no_splits)
        folds = enumerate(tqdm(tscv.split(X)), start=1)
        for i, (train_index, test_index) in folds:
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            X_test_index = X_test.index.values.tolist()

            if self.scalar is not None:
                # Scale Data: scalers are fitted on the training part only
                scaler_X = self.scalar()
                scaler_y = self.scalar()
                scaler_X.fit(X_train)
                scaler_y.fit(y_train)
                X_train, X_test = scaler_X.transform(
                    X_train), scaler_X.transform(X_test)
                y_train, y_test = scaler_y.transform(
                    y_train), scaler_y.transform(y_test)
            else:
                X_train, X_test = np.asarray(X_train), np.asarray(X_test)
                y_train, y_test = np.asarray(y_train), np.asarray(y_test)

            # Find Best Params
            _, best_params = self.find_optimal_paramters(
                X_train, y_train, self.regressor, self.parameters,
                scoring_metric, greater_is_better)

            self.regressor.set_params(**best_params)
            self.regressor.fit(X_train, y_train.ravel())

            # predict y values
            y_pred = self.regressor.predict(X_test)

            if self.scalar is not None:
                # transform y values back to real scale for assessment;
                # inverse_transform needs a 2-D array, while predict() may
                # return 1-D because the regressor was fitted on a raveled y
                y_pred = scaler_y.inverse_transform(
                    np.asarray(y_pred).reshape(-1, 1))
                y_test = scaler_y.inverse_transform(y_test)

            # compute error metrics
            params_used[i] = best_params
            MAE.append(metrics.mean_absolute_error(y_test, y_pred))
            Exp_var.append(metrics.explained_variance_score(y_test, y_pred))
            MSE.append(metrics.mean_squared_error(y_test, y_pred))
            r_squared.append(metrics.r2_score(y_test, y_pred))

            # plot y_pred vs y_test (flattened to 1-D columns)
            y_df = pd.DataFrame(index=pd.to_datetime(X_test_index))
            y_pred = y_pred.reshape(len(y_pred), )
            y_test = y_test.reshape(len(y_test), )
            y_df['y_pred'] = y_pred
            y_df['y_test'] = y_test

            # plot the subplots
            ax = fig.add_subplot(n_rows, n_cols, i)
            ax.xaxis.set_major_formatter(DateFormatter('%m-%y'))
            y_df.plot(title='Split{}'.format(i), ax=ax, legend=False)
            ax.tick_params(axis='x', rotation=45, labelsize=8)
            if i == 1:
                # one shared legend, taken from the first subplot
                fig.legend(loc=4)

            # append this split to the continuous y_pred vs y_test series
            y_pred_cont_index.extend(X_test_index)
            split_dates.append(y_pred_cont_index[-1])
            y_pred_cont.extend(y_pred.tolist())
            y_test_cont.extend(y_test.tolist())

        # Plot the continuous chart
        y_continuous_df = pd.DataFrame(index=pd.to_datetime(y_pred_cont_index))
        y_continuous_df['Model'] = np.asarray(y_pred_cont)
        y_continuous_df['Actual'] = np.asarray(y_test_cont)
        y_continuous_df.plot(title='Running Performance')

        # add vertical lines at the split boundaries; the last recorded date
        # is the end of the data, not a boundary between splits
        for date in split_dates[:-1]:
            date = datetime.strptime(date, '%m/%d/%Y %H:%M')
            plt.axvline(x=date,
                        linestyle=':',
                        color='red',
                        linewidth=1,
                        alpha=.8)

        # Calculate average metrics
        no_splits = tscv.get_n_splits()
        avg_mae = sum(MAE) / no_splits
        avg_exp_var = sum(Exp_var) / no_splits
        avg_mse = sum(MSE) / no_splits
        avg_rsquared = sum(r_squared) / no_splits

        print('\nMAE:{} \nMSE:{} \nExp Var Explained: {}\nr^2: {}\nParams:{}'.
              format(MAE, MSE, Exp_var, r_squared, params_used))
        print('\nAvg MAE:', avg_mae, '\nAverage Explained Variance:',
              avg_exp_var, '\nAvg MSE:', avg_mse, '\nAvg r^2:', avg_rsquared)
        print('end')
        fig.tight_layout()
        plt.show()
Пример #7
0
    def predict_ahead(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Make a single forecast with a Decision Tree model

        The pipeline is refitted only when no model is stored yet or when the
        call counter has reached ``train_frequency``; otherwise the stored
        model is reused for the forecast.

        Parameters
        ----------
        df : pandas DataFrame
            the training (streamed) data to model

        Returns
        -------
        predictions : pandas DataFrame
            the forecast -> (1 row, W columns) where W is the forecast_window
        """
        # preprocess the data for supervised machine learning
        X, Y, X_new = self.preprocessing(df, binary=False)

        if self._counter >= self.train_frequency or self._model is None:
            # attributes are written through object.__setattr__ throughout
            # this method (presumably the class blocks plain assignment -
            # TODO confirm)
            object.__setattr__(self, "_counter", 0)

            # set up a machine learning pipeline
            # NOTE(review): scikit-learn's DecisionTreeRegressor accepts
            # neither n_jobs nor warm_start; confirm which class this name is
            # bound to before relying on these two arguments.
            model = DecisionTreeRegressor(
                max_depth=12,
                min_samples_leaf=1,
                max_features="sqrt",
                random_state=42,
                n_jobs=N_JOBS,
                warm_start=True,
            )
            if MULTI:
                model = MultiOutputRegressor(model, n_jobs=1)
            pipeline = Pipeline(
                [
                    ("var", VarianceThreshold()),
                    ("model", model),
                ]
            )

            if self.tune_model:
                # Hand the splitter itself to the search: an integer cv
                # selects plain K-fold, which validates on rows that precede
                # part of the training rows.
                tscv = TimeSeriesSplit(n_splits=3)

                # set up the tuner; when the tree is wrapped in
                # MultiOutputRegressor its parameters sit one level deeper
                str_ = ""
                if MULTI:
                    str_ = "estimator__"
                parameters = {
                    f"model__{str_}max_depth": [6, 10, 14, 18],
                    f"model__{str_}min_samples_leaf": [1, 3, 5, 10],
                }
                grid = RandomizedSearchCV(
                    pipeline,
                    parameters,
                    n_iter=16,
                    cv=tscv,
                    random_state=0,
                    n_jobs=1,
                )

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")  # ignore common warning
                    object.__setattr__(
                        self,
                        "_model",
                        grid.fit(X, Y).best_estimator_,  # search for the best model
                    )
            else:
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")  # ignore common warning
                    object.__setattr__(
                        self, "_model", pipeline.fit(X, Y)  # train the model
                    )

        predictions = self._model.predict(X_new)  # forecast
        predictions = pd.DataFrame(predictions)
        # count this forecast towards the next scheduled refit
        object.__setattr__(self, "_counter", self._counter + 1)
        return predictions
Пример #8
0
    def predict_ahead(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Make a single forecast with a Extreme Gradient Boosting Tree model

        The pipeline is refitted only when no model is stored yet or when the
        call counter has reached ``train_frequency``; otherwise the stored
        model is reused for the forecast.

        Parameters
        ----------
        df : pandas DataFrame
            the training (streamed) data to model

        Returns
        -------
        predictions : pandas DataFrame
            the forecast -> (1 row, W columns) where W is the forecast_window
        """
        # preprocess the data for supervised machine learning
        X, Y, X_new = self.preprocessing(df, binary=False)

        if self._counter >= self.train_frequency or self._model is None:
            # attributes are written through object.__setattr__ throughout
            # this method (presumably the class blocks plain assignment -
            # TODO confirm)
            object.__setattr__(self, "_counter", 0)

            # set up a machine learning pipeline
            model = XGBRegressor(
                booster="gbtree",
                n_estimators=25,
                learning_rate=0.1,
                max_depth=7,
                min_child_weight=1,
                colsample_bytree=0.8,
                subsample=0.8,
                random_state=42,
                n_jobs=N_JOBS,
            )
            # one booster per output column; the boosters already use N_JOBS,
            # so the wrapper itself stays single-process
            model = MultiOutputRegressor(model, n_jobs=1)
            pipeline = Pipeline([
                ("var", VarianceThreshold()),
                ("model", model),
            ])

            if self.tune_model:
                # Hand the splitter itself to the search: an integer cv
                # selects plain K-fold, which validates on rows that precede
                # part of the training rows.
                tscv = TimeSeriesSplit(n_splits=3)

                # set up the tuner
                parameters = {
                    "model__estimator__n_estimators": [25, 50, 100],
                    "model__estimator__learning_rate": [0.001, 0.01, 0.1, 1],
                    "model__estimator__max_depth": [3, 6, 9, 12],
                    "model__estimator__min_child_weight": [1, 3, 5],
                    "model__estimator__colsample_bytree": [0.8],
                    "model__estimator__subsample": [0.8],
                }
                grid = RandomizedSearchCV(
                    pipeline,
                    parameters,
                    n_iter=16,
                    cv=tscv,
                    random_state=0,
                    n_jobs=1,
                )

                object.__setattr__(
                    self,
                    "_model",
                    grid.fit(X,
                             Y).best_estimator_,  # search for the best model
                )
            else:
                object.__setattr__(
                    self,
                    "_model",
                    pipeline.fit(X, Y)  # train the model
                )

        predictions = self._model.predict(X_new)  # forecast
        predictions = pd.DataFrame(predictions)
        # count this forecast towards the next scheduled refit
        object.__setattr__(self, "_counter", self._counter + 1)
        return predictions
Пример #9
0
    def predict_ahead(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Make a single forecast with a Partial Least Squares Regression model

        The pipeline is refitted only when no model is stored yet or when the
        call counter has reached ``train_frequency``; otherwise the stored
        model is reused for the forecast.

        Parameters
        ----------
        df : pandas DataFrame
            the training (streamed) data to model

        Returns
        -------
        predictions : pandas DataFrame
            the forecast -> (1 row, W columns) where W is the forecast_window
        """
        # preprocess the data for supervised machine learning
        X, Y, X_new = self.preprocessing(df, binary=False)

        if self._counter >= self.train_frequency or self._model is None:
            # attributes are written through object.__setattr__ throughout
            # this method (presumably the class blocks plain assignment -
            # TODO confirm)
            object.__setattr__(self, "_counter", 0)

            # set up a machine learning pipeline; the component count is
            # clamped to at least 1 so a single-column X stays valid
            model = PLSRegression(
                n_components=max(
                    1, min(X.shape[1] - 1, int(X.shape[0] / 2))),
                scale=False,
            )
            pipeline = Pipeline([
                ("var", VarianceThreshold()),
                # ('poly', PolynomialFeatures(2)),  # longer run time, potentially more accurate
                # ('var2', VarianceThreshold()),  # use this if 'poly' is used
                # ('shape', QuantileTransformer(output_distribution="normal")),  # make input variables normally distributed
                ("scale", MinMaxScaler()),
                ("model", model),
            ])

            if self.tune_model:
                # Hand the splitter itself to the search: an integer cv
                # selects plain K-fold, which validates on rows that precede
                # part of the training rows.
                tscv = TimeSeriesSplit(n_splits=3)

                # set up the tuner
                max_components = min(X.shape[1] - 1, int(X.shape[0] * 0.75))
                n_models = 16  # number of models to search for
                # a step of 0 (max_components < n_models) makes np.arange
                # fail, so the step is clamped to at least 1
                step = max(1, max_components // n_models)
                # fall back to a single component when the range is empty
                candidates = np.arange(
                    1, max_components, step=step).tolist() or [1]
                parameters = {
                    "model__n_components": candidates,
                }
                grid = RandomizedSearchCV(
                    pipeline,
                    parameters,
                    # never request more draws than there are candidates
                    n_iter=min(n_models, len(candidates)),
                    cv=tscv,
                    random_state=0,
                    n_jobs=N_JOBS,
                )

                object.__setattr__(
                    self,
                    "_model",
                    grid.fit(X,
                             Y).best_estimator_,  # search for the best model
                )
            else:
                object.__setattr__(
                    self,
                    "_model",
                    pipeline.fit(X, Y)  # train the model
                )

        predictions = self._model.predict(X_new)  # forecast
        predictions = pd.DataFrame(predictions)
        # count this forecast towards the next scheduled refit
        object.__setattr__(self, "_counter", self._counter + 1)
        return predictions
Пример #10
0
# treat the season and weather codes as categories rather than numbers
X["season"] = X["season"].astype(str)
X["weather"] = X["weather"].astype(str)

# determine which columns are strings (for X)
x_columns = X.columns
x_dtypes = X.dtypes
x_str = np.where(x_dtypes == "object")[0]

# convert any string columns to binary columns
X = pd.get_dummies(X, columns=x_columns[x_str])

# In[2]: Model the data

# set up cross validation for time series
tscv = TimeSeriesSplit(n_splits=5)
folds = tscv.get_n_splits(X)  # number of splits only; not a splitter

# set up a machine learning pipeline
pipeline = Pipeline(
    [
        ("var1", VarianceThreshold()),
        # ('poly', PolynomialFeatures(2)),
        # ('var2', VarianceThreshold()),
        # ('shape', QuantileTransformer(output_distribution="normal"))
        ("scale", MinMaxScaler()),
        # pass the splitter, not the fold count: an integer cv makes LassoCV
        # use plain K-fold instead of the time-ordered splits
        ("model", LassoCV(cv=tscv, eps=1e-9, n_alphas=16, n_jobs=-1)),
    ]
)

# train a model
pipeline.fit(X.iloc[train_idx, :], Y.iloc[train_idx, :])
Пример #11
0
        features.index = stocks[i].index
        features = features.dropna()
        #features = features.iloc[np.where(features.index=='1998-5-5')[0][0]:np.where(features.index=='2015-5-5')[0][0]]
        stocks_indicators[i] = features
    return stocks_indicators


#create model


# Build the indicator table for every stock, then train and evaluate one
# feed-forward network per stock and per time-series fold.
stocks_indicators = get_indicators(stocks,5)
for j in stocks:
    # every column except the last is a feature; the last column is the label
    X = stocks_indicators[j].iloc[:, :-1].astype('float')
    y = stocks_indicators[j].iloc[:, -1].astype('float')
    tscv = TimeSeriesSplit(n_splits=2)
    tscv.get_n_splits(X)  # return value is unused
    for train_index, test_index in tscv.split(X):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # standardise with statistics from the training part only
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        # a fresh network per fold: two 128-unit ReLU layers and a single
        # sigmoid output trained with binary cross-entropy
        classifier = Sequential()
        classifier.add(Dense(units=128, kernel_initializer='uniform', activation='relu', input_dim=X.shape[1]))
        classifier.add(Dense(units=128, kernel_initializer='uniform', activation='relu'))
        classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
        classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        classifier.fit(X_train, y_train, batch_size = 10, epochs = 100)
        y_pred = classifier.predict(X_test)
        # outputs above 0.5 become 1; other values are left unchanged here
        y_pred[y_pred > 0.5] = 1
Пример #12
0
    def predict_ahead(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Make a single forecast with a Bayesian Ridge Regression model

        The pipeline is refitted only when no model is stored yet or when the
        call counter has reached ``train_frequency``; otherwise the stored
        model is reused for the forecast.

        Parameters
        ----------
        df : pandas DataFrame
            the training (streamed) data to model

        Returns
        -------
        predictions : pandas DataFrame
            the forecast -> (1 row, W columns) where W is the forecast_window
        """
        # preprocess the data for supervised machine learning
        X, Y, X_new = self.preprocessing(df, binary=False)

        if self._counter >= self.train_frequency or self._model is None:
            # attributes are written through object.__setattr__ throughout
            # this method (presumably the class blocks plain assignment -
            # TODO confirm)
            object.__setattr__(self, "_counter", 0)

            # set up a machine learning pipeline
            model = MultiOutputRegressor(BayesianRidge(), n_jobs=N_JOBS)
            pipeline = Pipeline(
                [
                    ("var", VarianceThreshold()),
                    # ('poly', PolynomialFeatures(2)),  # longer run time, potentially more accurate
                    # ('var2', VarianceThreshold()),  # use this if 'poly' is used
                    # ('shape', QuantileTransformer(output_distribution="normal")),  # make input variables normally distributed
                    ("scale", MinMaxScaler()),
                    ("model", model),
                ]
            )

            if self.tune_model:
                # Hand the splitter itself to the search: an integer cv
                # selects plain K-fold, which validates on rows that precede
                # part of the training rows.
                tscv = TimeSeriesSplit(n_splits=3)

                # set up the tuner
                # NOTE(review): newer scikit-learn releases rename
                # BayesianRidge's n_iter to max_iter; check the installed
                # version before upgrading.
                parameters = {
                    "model__estimator__n_iter": [300],
                    "model__estimator__tol": [1e-3],
                    "model__estimator__alpha_1": [1e-2, 1e-6, 1e-10],
                    "model__estimator__lambda_1": [1e-2, 1e-6, 1e-10],
                    "model__estimator__alpha_2": [1e-2, 1e-6, 1e-10],
                    "model__estimator__lambda_2": [1e-2, 1e-6, 1e-10],
                }
                grid = RandomizedSearchCV(
                    pipeline,
                    parameters,
                    n_iter=16,
                    cv=tscv,
                    random_state=0,
                    n_jobs=1,
                )

                object.__setattr__(
                    self,
                    "_model",
                    grid.fit(X, Y).best_estimator_,  # search for the best model
                )
            else:
                object.__setattr__(
                    self, "_model", pipeline.fit(X, Y)  # train the model
                )

        predictions = self._model.predict(X_new)  # forecast
        predictions = pd.DataFrame(predictions)
        # count this forecast towards the next scheduled refit
        object.__setattr__(self, "_counter", self._counter + 1)
        return predictions
Пример #13
0
# (0,1) transformation of every column; fit_transform returns a bare array,
# so the column names are restored from `heads` (defined outside this excerpt)
scaler = MinMaxScaler(feature_range=(0, 1))
raw_data = pd.DataFrame(scaler.fit_transform(raw_data))
raw_data.columns = heads
# X,y: features are all columns except OP_TIME and the BILL_USER target
X = raw_data.drop(['OP_TIME', 'BILL_USER'], axis=1)
# poly = PolynomialFeatures(degree=2)

# poly_X = pd.DataFrame(poly.fit_transform(X))
# print(poly.get_feature_names(X.columns))
# polynomial expansion is disabled above, so the features are used as they are
poly_X = X
y = raw_data['BILL_USER']
# kf = KFold(n_splits=10)
kf = TimeSeriesSplit(n_splits=3)

kf.get_n_splits(poly_X)  # return value is unused
regressions = ['Lasso', 'Ridge', 'GradientBoostingRegression']

print("start trainning %s model..." % (regressions[0]))
# nested cross-validation: 3 time-series folds outside, LassoCV's own
# 10-fold search for alpha inside
scores = []
lasso_models = []
mean_performance = []
for train_index, test_index in kf.split(poly_X):
    # print("TRAIN:", train_index, "TEST:", test_index)
    print("start training...")
    X_train, X_test = poly_X.iloc[train_index], poly_X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # choose alpha on the training part of this fold only
    lassocv = linear_model.LassoCV(cv=10, max_iter=1500)
    lassocv.fit(X_train, y_train)
    print("alpha is %s" % (lassocv.alpha_))