Example #1
def logit_skl(y_ts, df_norm, keys=None, kwrgs_model=None):

    #%%
    '''
    X contains all precursor data, incl. train and test.
    X_train, y_train are split up by the TrainIsTrue mask.
    Prediction is made for the whole timeseries.
    '''

    if keys is None:
        no_data_col = ['TrainIsTrue', 'RV_mask', 'fit_model_mask']
        keys = df_norm.columns
        keys = [k for k in keys if k not in no_data_col]
    import warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)
#    warnings.filterwarnings("ignore", category=FutureWarning)

    if kwrgs_model is None:
        # use Bram settings
        kwrgs_model = {'class_weight': {0: 1, 1: 1},
                       'scoring': 'neg_brier_score',
                       'penalty': 'l2',
                       'solver': 'lbfgs'}


    # find parameters for gridsearch optimization
    kwrgs_gridsearch = {k: i for k, i in kwrgs_model.items()
                        if isinstance(i, list)}
    # only the constant parameters are kept
    kwrgs = kwrgs_model.copy()
    for k in kwrgs_gridsearch:
        kwrgs.pop(k)
    if 'feat_sel' in kwrgs:
        feat_sel = kwrgs.pop('feat_sel')
    else:
        feat_sel = None

    # Get training years
    x_fit_mask, y_fit_mask, x_pred_mask, y_pred_mask = utils.get_masks(df_norm)

    X = df_norm[keys]
    # X = add_constant(X)
    X_train = X[x_fit_mask.values]
    X_pred  = X[x_pred_mask.values]

    RV_bin_fit = y_ts['bin']
    # y_ts dates may no longer align with the x_fit / y_fit masks
    y_fit_mask = df_norm['TrainIsTrue'].loc[y_fit_mask.index].values == 1
    y_train = RV_bin_fit[y_fit_mask].squeeze()

    # if y_pred_mask is not None:
    #     y_dates = RV_bin_fit[y_pred_mask.values].index
    # else:
    y_dates = RV_bin_fit.index

    X = X_train

    # Create stratified random shuffle which keeps years together as blocks.
    kwrgs_cv = ['kfold', 'seed']
    kwrgs_cv = {k: i for k, i in kwrgs.items() if k in kwrgs_cv}
    for k in kwrgs_cv:
        kwrgs.pop(k)

    cv = utils.get_cv_accounting_for_years(y_train, **kwrgs_cv)
    model = LogisticRegressionCV(fit_intercept=True,
                                 cv=cv,
                                 n_jobs=1,
                                 **kwrgs)
    if feat_sel is not None:
        if feat_sel['model'] is None:
            feat_sel['model'] = model
        model, new_features, rfecv = utils.feature_selection(X_train, y_train.values, **feat_sel)
        X_pred = X_pred[new_features]
    else:
        model.fit(X_train, y_train)


    y_pred = model.predict_proba(X_pred)[:,1]

    prediction = pd.DataFrame(y_pred, index=y_dates, columns=[0])
    model.X_pred = X_pred
    #%%
    return prediction, model
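
A minimal, self-contained sketch of the same pattern on toy weekly data: LogisticRegressionCV is fit on the training subset only and then predicts probabilities for the full series. sklearn's GroupKFold (grouping by year) stands in here for the project's utils.get_cv_accounting_for_years, which keeps years together as CV blocks; all data and column names below are illustrative.

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GroupKFold

rng = np.random.default_rng(0)
dates = pd.date_range('2000-01-01', periods=208, freq='W')  # 4 years, weekly
df_norm = pd.DataFrame({'prec1': rng.normal(size=208),
                        'prec2': rng.normal(size=208),
                        'TrainIsTrue': dates.year < 2003},  # first 3 years train
                       index=dates)
RV_bin = pd.Series((rng.random(208) > 0.7).astype(int), index=dates)

train = df_norm['TrainIsTrue'].values
X_train = df_norm.loc[train, ['prec1', 'prec2']]
y_train = RV_bin[train]

# year-blocked CV: all samples of a year end up in the same fold
cv = list(GroupKFold(n_splits=3).split(X_train, y_train,
                                       groups=X_train.index.year))
model = LogisticRegressionCV(fit_intercept=True, cv=cv,
                             penalty='l2', solver='lbfgs')
model.fit(X_train, y_train)

# prediction for the whole timeseries (train and test), as in logit_skl
y_pred = model.predict_proba(df_norm[['prec1', 'prec2']])[:, 1]
prediction = pd.DataFrame(y_pred, index=dates, columns=[0])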
Example #2
def logit(y_ts, df_norm, keys):
    #%%
    '''
    X contains all precursor data, incl. train and test.
    X_train, y_train are split up by the TrainIsTrue mask.
    Prediction is made for the whole timeseries.
    '''
    import warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)

    if keys is None:
        no_data_col = ['TrainIsTrue', 'RV_mask', 'fit_model_mask']
        keys = df_norm.columns
        keys = [k for k in keys if k not in no_data_col]

    # Get training years
    x_fit_mask, y_fit_mask, x_pred_mask, y_pred_mask = utils.get_masks(df_norm)

    X = df_norm[keys]
    X = add_constant(X)
    X_train = X[x_fit_mask.values]
    X_pred  = X[x_pred_mask.values]

    RV_bin_fit = y_ts['bin']
    # y_ts dates may no longer align with the x_fit / y_fit masks
    y_fit_mask = df_norm['TrainIsTrue'].loc[y_fit_mask.index].values == 1
    y_train = RV_bin_fit[y_fit_mask].squeeze()


    # if y_pred_mask is not None:
    #     y_dates = RV_bin_fit[y_pred_mask.values].index
    # else:
    y_dates = RV_bin_fit.index

    # Statsmodels wants DataFrames with aligned indices,
    # therefore build a new DataFrame for X_train.
    try:
        model_set = sm.Logit(y_train,
                             pd.DataFrame(X_train.values, index=y_train.index))
    except Exception:
        print(x_fit_mask)
        print(X_train)
        print(y_train)
        raise  # model_set would be undefined below

    try:
        model = model_set.fit(disp=0, maxfun=60)
        prediction = model.predict(X_pred)
    except np.linalg.LinAlgError as err:
        if 'Singular matrix' in str(err):
            model = model_set.fit(method='bfgs', disp=0)
            prediction = model.predict(X_pred)
        else:
            raise
    except Exception as e:
        print(e)
        model = model_set.fit(method='bfgs', disp=0)
        prediction = model.predict(X_pred)

    prediction = pd.DataFrame(prediction.values, index=y_dates, columns=[0])
    model.X_pred = X_pred
    #%%
    return prediction, model
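
For reference, a minimal standalone sketch of the statsmodels pattern above, on synthetic data: fit a Logit with the default Newton solver and fall back to the 'bfgs' optimizer when the Hessian is singular. In logit() the design matrices come from the TrainIsTrue masks; here everything is toy.

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(1)
X = pd.DataFrame({'prec1': rng.normal(size=200)})
X = sm.add_constant(X)  # intercept column, as in logit()
y = (X['prec1'] + rng.normal(size=200) > 0).astype(int)

model_set = sm.Logit(y, X)
try:
    model = model_set.fit(disp=0)
except np.linalg.LinAlgError as err:
    if 'Singular matrix' in str(err):
        # Newton failed on a singular Hessian; bfgs avoids the inversion
        model = model_set.fit(method='bfgs', disp=0)
    else:
        raise

prediction = model.predict(X)  # in-sample event probabilities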
Example #3
def ridgeCV(y_ts, df_norm, keys=None, kwrgs_model=None):
    '''
    X contains all precursor data, incl. train and test.
    X_train, y_train are split up by the TrainIsTrue mask.
    Prediction is made for the whole timeseries.
    '''
    #%%
    if keys is None:
        no_data_col = ['TrainIsTrue', 'RV_mask', 'fit_model_mask']
        keys = df_norm.columns
        keys = [k for k in keys if k not in no_data_col]
    import warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    # warnings.filterwarnings("ignore", category=FutureWarning)

    if kwrgs_model is None:
        # use Bram settings
        kwrgs_model = {'fit_intercept': True, 'alphas': (.01, .1, 1.0, 10.0)}

    # find parameters for gridsearch optimization
    kwrgs_gridsearch = {k: i for k, i in kwrgs_model.items()
                        if isinstance(i, list)}
    # only the constant parameters are kept
    kwrgs = kwrgs_model.copy()
    for k in kwrgs_gridsearch:
        kwrgs.pop(k)
    if 'feat_sel' in kwrgs:
        feat_sel = kwrgs.pop('feat_sel')
    else:
        feat_sel = None

    # Get training years
    x_fit_mask, y_fit_mask, x_pred_mask, y_pred_mask = utils.get_masks(df_norm)

    X = df_norm[keys]
    X = X.dropna(axis='columns')  # drop columns containing NaNs
    # X = add_constant(X)
    X_train = X[x_fit_mask.values]
    X_pred = X[x_pred_mask.values]

    RV_fit = y_ts['ts'].loc[y_fit_mask.index]  # y_fit may be shortened
    # because, due to the lag, X_test samples would otherwise be used to
    # predict y_train values (train-test leakage).

    # y_ts dates may no longer align with the x_fit / y_fit masks
    y_fit_mask = df_norm['TrainIsTrue'].loc[y_fit_mask.index].values
    y_train = RV_fit[y_fit_mask].squeeze()

    # if y_pred_mask is not None:
    #     y_dates = RV_fit[y_pred_mask.values].index
    # else:
    # y_dates = RV_fit.index

    X = X_train

    # Create stratified random shuffle which keeps years together as blocks.
    kwrgs_cv = ['kfold', 'seed']
    kwrgs_cv = {k: i for k, i in kwrgs.items() if k in kwrgs_cv}
    for k in kwrgs_cv:
        kwrgs.pop(k)
    if len(kwrgs_cv) >= 1:
        cv = utils.get_cv_accounting_for_years(y_train, **kwrgs_cv)
        kwrgs['store_cv_values'] = False
    else:
        cv = None
        kwrgs['store_cv_values'] = True
    model = RidgeCV(cv=cv, **kwrgs)

    if feat_sel is not None:
        if feat_sel['model'] is None:
            feat_sel['model'] = model
        model, new_features, rfecv = utils.feature_selection(
            X_train, y_train.values, **feat_sel)
        X_pred = X_pred[new_features]
    else:
        model.fit(X_train, y_train)

    y_pred = model.predict(X_pred)

    prediction = pd.DataFrame(y_pred, index=y_pred_mask.index, columns=[0])
    model.X_pred = X_pred
    model.name = 'Ridge Regression'
    #%%
    return prediction, model
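
A minimal sketch of the RidgeCV pattern on toy data: the alphas tuple is searched by built-in cross-validation. With cv=None, RidgeCV uses efficient leave-one-out CV (the store_cv_values branch above); passing an explicit splitter, as ridgeCV() does via utils.get_cv_accounting_for_years, switches to ordinary k-fold search. GroupKFold by year is an illustrative stand-in.

import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import GroupKFold

rng = np.random.default_rng(2)
dates = pd.date_range('2000-01-01', periods=208, freq='W')  # 4 years, weekly
X = pd.DataFrame({'prec1': rng.normal(size=208),
                  'prec2': rng.normal(size=208)}, index=dates)
y = 0.5 * X['prec1'] + rng.normal(scale=0.1, size=208)

# year-blocked splitter, standing in for get_cv_accounting_for_years
cv = list(GroupKFold(n_splits=4).split(X, y, groups=X.index.year))
model = RidgeCV(fit_intercept=True, alphas=(.01, .1, 1.0, 10.0), cv=cv)
model.fit(X, y)
print(model.alpha_)  # regularisation strength chosen by the year-blocked CV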
Example #4
def GBC(y_ts, df_norm, keys, kwrgs_GBM=None, verbosity=0):
    #%%
    '''
    X contains all precursor data, incl. train and test.
    X_train, y_train are split up by the TrainIsTrue mask.
    Prediction is made for the whole timeseries.
    '''
    import warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    if kwrgs_GBM is None:
        # use Bram settings
        kwrgs_GBM = {'max_depth': 3,
                     'learning_rate': 0.001,
                     'n_estimators': 1250,
                     'max_features': 'sqrt',
                     'subsample': 0.5,
                     'min_samples_split': .15}

    # find parameters for gridsearch optimization
    kwrgs_gridsearch = {k: i for k, i in kwrgs_GBM.items()
                        if isinstance(i, list)}
    # only the constant parameters are kept
    kwrgs = kwrgs_GBM.copy()
    for k in kwrgs_gridsearch:
        kwrgs.pop(k)
    if 'scoringCV' in kwrgs:
        scoring = kwrgs.pop('scoringCV')
        # sorted(sklearn.metrics.SCORERS.keys())
        # e.g. scoring = 'neg_mean_squared_error' or scoring = 'roc_auc'
    else:
        scoring = None  # GridSearchCV falls back to the estimator's scorer
    if 'feat_sel' in kwrgs:
        feat_sel = kwrgs.pop('feat_sel')
    else:
        feat_sel = None

    # Get training years
    x_fit_mask, y_fit_mask, x_pred_mask, y_pred_mask = utils.get_masks(df_norm)

    X = df_norm[keys]
    X_train = X[x_fit_mask.values]
    X_pred  = X[x_pred_mask.values]



    RV_bin_fit = y_ts['bin']
    # y_ts dates may no longer align with the x_fit / y_fit masks
    y_fit_mask = df_norm['TrainIsTrue'].loc[y_fit_mask.index].values == 1
    y_train = RV_bin_fit[y_fit_mask].squeeze()

    # if y_pred_mask is not None:
    #     y_dates = RV_bin_fit[y_pred_mask.values].index
    # else:
    y_dates = RV_bin_fit.index


    model = GradientBoostingClassifier(**kwrgs)

    if feat_sel is not None:
        if feat_sel['model'] is None:
            feat_sel['model'] = model
        model, new_features, rfecv = utils.feature_selection(X_train, y_train.values.ravel(), **feat_sel)
        X_pred = X_pred[new_features] # subset predictors
        X_train = X_train[new_features] # subset predictors
    # else:
        # model.fit(X_train, y_train.values.ravel())



    if len(kwrgs_gridsearch) != 0:
        # get cross-validation splitter
        if 'kfold' in kwrgs:
            kfold = kwrgs.pop('kfold')
        else:
            kfold = 5
        cv = utils.get_cv_accounting_for_years(len(y_train), kfold, seed=1)

        # note: the iid argument was removed in scikit-learn 0.24
        model = GridSearchCV(model,
                             param_grid=kwrgs_gridsearch,
                             scoring=scoring, cv=cv, refit=True,
                             return_train_score=True)
        model = model.fit(X_train, y_train.values.ravel())
        if verbosity == 1:
            results = model.cv_results_
            scores = results['mean_test_score']
            greaterisbetter = model.scorer_._sign
            improv = int(100 * greaterisbetter * (max(scores) - min(scores))
                         / max(scores))
            print("Hyperparam tuning led to {:}% improvement, best {:.2f}, "
                  "best params {}".format(improv, model.best_score_,
                                          model.best_params_))
    else:
        model.fit(X_train, y_train.values.ravel())

    if len(kwrgs_gridsearch) != 0:
        prediction = pd.DataFrame(model.best_estimator_.predict_proba(X_pred)[:,1],
                              index=y_dates, columns=['GBR'])
    else:
        prediction = pd.DataFrame(model.predict_proba(X_pred)[:,1],
                              index=y_dates, columns=['GBR'])

    model.X_pred = X_pred


#    logit_pred.plot() ; plt.plot(RV.RV_bin)
#    plt.figure()
#    prediction.plot() ; plt.plot(RV.RV_ts)
#    metrics_sklearn(RV.RV_bin, logit_pred.values, y_pred_c)
    #%%
    return prediction, model
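
A minimal sketch of the grid-search split used in GBC(), with hypothetical toy data: list-valued entries of kwrgs_GBM become the GridSearchCV param_grid, while scalar entries stay fixed on the estimator.

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

kwrgs_GBM = {'max_depth': [2, 3], 'learning_rate': 0.05,
             'n_estimators': [100, 250], 'subsample': 0.5}
param_grid = {k: v for k, v in kwrgs_GBM.items() if isinstance(v, list)}
fixed = {k: v for k, v in kwrgs_GBM.items() if not isinstance(v, list)}

rng = np.random.default_rng(3)
X = rng.normal(size=(200, 4))
y = (X[:, 0] + rng.normal(size=200) > 0).astype(int)

model = GridSearchCV(GradientBoostingClassifier(**fixed),
                     param_grid=param_grid, scoring='roc_auc',
                     cv=5, refit=True, return_train_score=True)
model.fit(X, y)
print(model.best_params_, round(model.best_score_, 3))
y_prob = model.best_estimator_.predict_proba(X)[:, 1]  # event probabilities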
Example #5
    def fit_wrapper(self, y_ts, df_norm, keys=None, kwrgs_model=None):
        '''
        X contains all precursor data, incl. train and test.
        X_train, y_train are split up by the TrainIsTrue mask.
        Prediction is made for the whole timeseries.
        '''
        #%%

        scikitmodel = self.scikitmodel

        if keys is None:
            no_data_col = ['TrainIsTrue', 'RV_mask', 'fit_model_mask']
            keys = df_norm.columns
            keys = [k for k in keys if k not in no_data_col]
        import warnings
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        # warnings.filterwarnings("ignore", category=FutureWarning)

        if kwrgs_model is None:
            # use Bram settings
            kwrgs_model = {'fit_intercept': True,
                           'alphas': (.01, .1, 1.0, 10.0)}

        # find parameters for gridsearch optimization
        kwrgs_gridsearch = {k: i for k, i in kwrgs_model.items()
                            if isinstance(i, list)}
        # only the constant parameters are kept
        kwrgs = kwrgs_model.copy()
        for k in kwrgs_gridsearch:
            kwrgs.pop(k)
        if 'scoringCV' in kwrgs:
            scoring = kwrgs.pop('scoringCV')
        else:
            scoring = None
        # Get training years
        x_fit_mask, y_fit_mask, x_pred_mask, y_pred_mask = utils.get_masks(
            df_norm)

        X = df_norm[keys]
        X = X.dropna(axis='columns')  # drop columns containing NaNs
        # X = add_constant(X)
        X_train = X[x_fit_mask.values]
        X_pred = X[x_pred_mask.values]

        RV_fit = y_ts['ts'].loc[y_fit_mask.index]  # y_fit may be shortened
        # because, due to the lag, X_test samples would otherwise be used to
        # predict y_train values (train-test leakage).

        # y_ts dates may no longer align with the x_fit / y_fit masks
        y_fit_mask = df_norm['TrainIsTrue'].loc[y_fit_mask.index].values
        y_train = RV_fit[y_fit_mask].squeeze()

        # if y_pred_mask is not None:
        #     y_dates = RV_fit[y_pred_mask.values].index
        # else:
        # y_dates = RV_fit.index

        X = X_train

        # Create stratified random shuffle which keeps years together as blocks.
        kwrgs_cv = ['kfold', 'seed']
        kwrgs_cv = {k: i for k, i in kwrgs.items() if k in kwrgs_cv}
        for k in kwrgs_cv:
            kwrgs.pop(k)
        if len(kwrgs_cv) >= 1:
            cv = utils.get_cv_accounting_for_years(y_train, **kwrgs_cv)
        else:
            cv = None
        try:
            model = scikitmodel(cv=cv, **kwrgs)
        except TypeError:
            # estimator does not accept a cv keyword
            model = scikitmodel(**kwrgs)

        if len(kwrgs_gridsearch) != 0:
            # get cross-validation splitter
            # if 'kfold' in kwrgs.keys():
            #     kfold = kwrgs.pop('kfold')
            # else:
            #     kfold = 5
            # cv = utils.get_cv_accounting_for_years(y_train, kfold, seed=1)

            model = GridSearchCV(model,
                                 param_grid=kwrgs_gridsearch,
                                 scoring=scoring,
                                 cv=cv,
                                 refit=True,
                                 return_train_score=True,
                                 verbose=self.verbosity,
                                 n_jobs=3)
            model.fit(X_train, y_train.values.ravel())
            model.best_estimator_.X_pred = X_pred  # add X_pred to model
            # if self.verbosity == 1:
            #     results = model.cv_results_
            #     scores = results['mean_test_score']
            #     greaterisbetter = model.scorer_._sign
            #     improv = int(100* greaterisbetter*(max(scores)- min(scores)) / max(scores))
            #     print("Hyperparam tuning led to {:}% improvement, best {:.2f}, "
            #           "best params {}".format(
            #             improv, model.best_score_, model.best_params_))
        else:
            model.fit(X_train, y_train.values.ravel())
            model.X_pred = X_pred  # add X_pred to model

        if np.unique(y_train).size < 5:
            y_pred = model.predict_proba(X_pred)[:, 1]  # prob. event prediction
        else:
            y_pred = model.predict(X_pred)

        prediction = pd.DataFrame(y_pred, index=y_pred_mask.index, columns=[0])
        #%%
        return prediction, model
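
A minimal sketch of fit_wrapper's generic dispatch, with illustrative names and toy data: instantiate the given estimator with cv when its signature accepts one, and return event probabilities for classification targets (few unique values) but raw predictions otherwise.

import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegressionCV

def fit_sketch(scikitmodel, X_train, y_train, X_pred, cv=None, **kwrgs):
    try:
        model = scikitmodel(cv=cv, **kwrgs)  # estimators with built-in CV
    except TypeError:
        model = scikitmodel(**kwrgs)         # estimators without a cv argument
    model.fit(X_train, y_train)
    if np.unique(y_train).size < 5:          # heuristic: classification target
        return model.predict_proba(X_pred)[:, 1]  # probability of the event
    return model.predict(X_pred)

rng = np.random.default_rng(4)
X = rng.normal(size=(150, 3))
y_cont = X[:, 0] + 0.1 * rng.normal(size=150)
y_bin = (y_cont > 0).astype(int)

print(fit_sketch(LogisticRegressionCV, X, y_bin, X[:5]))  # cv accepted
print(fit_sketch(LinearRegression, X, y_cont, X[:5]))     # TypeError path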