import warnings

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.api import add_constant
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegressionCV, RidgeCV
from sklearn.model_selection import GridSearchCV

# `utils` is the project-local helper module providing get_masks,
# get_cv_accounting_for_years and feature_selection (module name assumed).
import utils


def logit_skl(y_ts, df_norm, keys=None, kwrgs_model=None):
    #%%
    '''
    X contains all precursor data, incl train and test
    X_train, y_train are split up by TrainIsTrue
    Prediction is made for whole timeseries
    '''
    if keys is None:
        no_data_col = ['TrainIsTrue', 'RV_mask', 'fit_model_mask']
        keys = df_norm.columns
        keys = [k for k in keys if k not in no_data_col]
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    # warnings.filterwarnings("ignore", category=FutureWarning)

    if kwrgs_model is None:
        # use Bram's settings
        kwrgs_model = {'class_weight': {0: 1, 1: 1},
                       'scoring': 'neg_brier_score',
                       'penalty': 'l2',
                       'solver': 'lbfgs'}

    # list-valued parameters are tuned via gridsearch optimization
    kwrgs_gridsearch = {k: i for k, i in kwrgs_model.items()
                        if isinstance(i, list)}
    # only the constant parameters are kept
    kwrgs = kwrgs_model.copy()
    for k in kwrgs_gridsearch:
        kwrgs.pop(k)
    if 'feat_sel' in kwrgs:
        feat_sel = kwrgs.pop('feat_sel')
    else:
        feat_sel = None

    # Get training years
    x_fit_mask, y_fit_mask, x_pred_mask, y_pred_mask = utils.get_masks(df_norm)

    X = df_norm[keys]
    # X = add_constant(X)
    X_train = X[x_fit_mask.values]
    X_pred = X[x_pred_mask.values]

    RV_bin_fit = y_ts['bin']
    # y_ts dates no longer align with the x_fit / y_fit masks
    y_fit_mask = df_norm['TrainIsTrue'].loc[y_fit_mask.index].values == 1
    y_train = RV_bin_fit[y_fit_mask].squeeze()

    # if y_pred_mask is not None:
    #     y_dates = RV_bin_fit[y_pred_mask.values].index
    # else:
    y_dates = RV_bin_fit.index

    X = X_train

    # Create stratified random shuffle which keeps years together as blocks.
    kwrgs_cv = ['kfold', 'seed']
    kwrgs_cv = {k: i for k, i in kwrgs.items() if k in kwrgs_cv}
    for k in kwrgs_cv:
        kwrgs.pop(k)
    cv = utils.get_cv_accounting_for_years(y_train, **kwrgs_cv)
    model = LogisticRegressionCV(fit_intercept=True,
                                 cv=cv,
                                 n_jobs=1,
                                 **kwrgs)
    if feat_sel is not None:
        if feat_sel['model'] is None:
            feat_sel['model'] = model
        model, new_features, rfecv = utils.feature_selection(X_train,
                                                             y_train.values,
                                                             **feat_sel)
        X_pred = X_pred[new_features]
    else:
        model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_pred)[:, 1]

    prediction = pd.DataFrame(y_pred, index=y_dates, columns=[0])
    model.X_pred = X_pred
    #%%
    return prediction, model
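
# Minimal usage sketch (illustrative, not part of the source module): all
# functions in this module expect `df_norm` to carry boolean bookkeeping
# columns ('TrainIsTrue', 'RV_mask', 'fit_model_mask') next to the precursor
# columns, from which utils.get_masks derives the fit/predict masks. The toy
# frame, column names and helper name below are assumptions for demonstration.
def _example_df_norm_layout():
    import numpy as np
    import pandas as pd
    rng = np.random.default_rng(0)
    dates = pd.date_range('2000-01-01', periods=8, freq='D')
    df_norm = pd.DataFrame({'precursor1': rng.standard_normal(8),
                            'precursor2': rng.standard_normal(8),
                            'TrainIsTrue': [True] * 6 + [False] * 2,
                            'RV_mask': [True] * 8}, index=dates)
    # X_train / X_pred are sliced with boolean masks, as in logit_skl above
    keys = [k for k in df_norm.columns
            if k not in ['TrainIsTrue', 'RV_mask', 'fit_model_mask']]
    X = df_norm[keys]
    X_train = X[df_norm['TrainIsTrue'].values]  # fit on training rows only
    X_pred = X                                  # predict whole timeseries
    return X_train, X_pred
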
def logit(y_ts, df_norm, keys):
    #%%
    '''
    X contains all precursor data, incl train and test
    X_train, y_train are split up by TrainIsTrue
    Prediction is made for whole timeseries
    '''
    warnings.simplefilter(action='ignore', category=FutureWarning)
    if keys is None:
        no_data_col = ['TrainIsTrue', 'RV_mask', 'fit_model_mask']
        keys = df_norm.columns
        keys = [k for k in keys if k not in no_data_col]
    # Get training years
    x_fit_mask, y_fit_mask, x_pred_mask, y_pred_mask = utils.get_masks(df_norm)

    X = df_norm[keys]
    X = add_constant(X)
    X_train = X[x_fit_mask.values]
    X_pred = X[x_pred_mask.values]

    RV_bin_fit = y_ts['bin']
    # y_ts dates no longer align with the x_fit / y_fit masks
    y_fit_mask = df_norm['TrainIsTrue'].loc[y_fit_mask.index].values == 1
    y_train = RV_bin_fit[y_fit_mask].squeeze()

    # if y_pred_mask is not None:
    #     y_dates = RV_bin_fit[y_pred_mask.values].index
    # else:
    y_dates = RV_bin_fit.index

    # statsmodels requires that the endog and exog DataFrames have aligned
    # indices, therefore a new DataFrame is built for X_train.
    try:
        model_set = sm.Logit(y_train,
                             pd.DataFrame(X_train.values,
                                          index=y_train.index))
    except Exception:
        print(x_fit_mask)
        print(X_train)
        print(y_train)
        raise
    try:
        model = model_set.fit(disp=0, maxfun=60)
        prediction = model.predict(X_pred)
    except np.linalg.LinAlgError as err:
        if 'Singular matrix' in str(err):
            # Newton step failed on a (near-)singular design matrix;
            # refit with the more robust bfgs optimizer.
            model = model_set.fit(method='bfgs', disp=0)
            prediction = model.predict(X_pred)
        else:
            raise
    except Exception as e:
        print(e)
        model = model_set.fit(method='bfgs', disp=0)
        prediction = model.predict(X_pred)

    prediction = pd.DataFrame(prediction.values, index=y_dates, columns=[0])
    model.X_pred = X_pred
    #%%
    return prediction, model
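
# Minimal sketch of the fit-then-fallback pattern used in logit() above:
# statsmodels' default Newton solver can fail on (near-)singular design
# matrices, in which case refitting with 'bfgs' usually converges. The data
# below is synthetic and for demonstration only.
def _example_logit_fallback():
    import numpy as np
    import pandas as pd
    import statsmodels.api as sm
    rng = np.random.default_rng(0)
    X = sm.add_constant(pd.DataFrame(rng.standard_normal((100, 2)),
                                     columns=['p1', 'p2']))
    y = (X['p1'] + 0.5 * rng.standard_normal(100) > 0).astype(int)
    model_set = sm.Logit(y, X)
    try:
        res = model_set.fit(disp=0)
    except np.linalg.LinAlgError as err:
        if 'Singular matrix' in str(err):
            res = model_set.fit(method='bfgs', disp=0)  # robust fallback
        else:
            raise
    return res.predict(X)  # in-sample predicted probabilities
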
def ridgeCV(y_ts, df_norm, keys=None, kwrgs_model=None):
    '''
    X contains all precursor data, incl train and test
    X_train, y_train are split up by TrainIsTrue
    Prediction is made for whole timeseries
    '''
    #%%
    if keys is None:
        no_data_col = ['TrainIsTrue', 'RV_mask', 'fit_model_mask']
        keys = df_norm.columns
        keys = [k for k in keys if k not in no_data_col]
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    # warnings.filterwarnings("ignore", category=FutureWarning)

    if kwrgs_model is None:
        # use Bram's settings
        kwrgs_model = {'fit_intercept': True,
                       'alphas': (.01, .1, 1.0, 10.0)}

    # list-valued parameters are tuned via gridsearch optimization
    kwrgs_gridsearch = {k: i for k, i in kwrgs_model.items()
                        if isinstance(i, list)}
    # only the constant parameters are kept
    kwrgs = kwrgs_model.copy()
    for k in kwrgs_gridsearch:
        kwrgs.pop(k)
    if 'feat_sel' in kwrgs:
        feat_sel = kwrgs.pop('feat_sel')
    else:
        feat_sel = None

    # Get training years
    x_fit_mask, y_fit_mask, x_pred_mask, y_pred_mask = utils.get_masks(df_norm)

    X = df_norm[keys]
    X = X.dropna(axis='columns')  # drop columns containing NaNs
    # X = add_constant(X)
    X_train = X[x_fit_mask.values]
    X_pred = X[x_pred_mask.values]

    RV_fit = y_ts['ts'].loc[y_fit_mask.index]  # y_fit may be shortened
    # because X_test was used to predict y_train due to lag, hence
    # train-test leakage.

    # y_ts dates may no longer align with the x_fit / y_fit masks
    y_fit_mask = df_norm['TrainIsTrue'].loc[y_fit_mask.index].values
    y_train = RV_fit[y_fit_mask].squeeze()

    # if y_pred_mask is not None:
    #     y_dates = RV_fit[y_pred_mask.values].index
    # else:
    #     y_dates = RV_fit.index

    X = X_train

    # Create stratified random shuffle which keeps years together as blocks.
    kwrgs_cv = ['kfold', 'seed']
    kwrgs_cv = {k: i for k, i in kwrgs.items() if k in kwrgs_cv}
    for k in kwrgs_cv:
        kwrgs.pop(k)
    if len(kwrgs_cv) >= 1:
        cv = utils.get_cv_accounting_for_years(y_train, **kwrgs_cv)
        kwrgs['store_cv_values'] = False
    else:
        cv = None
        kwrgs['store_cv_values'] = True
    model = RidgeCV(cv=cv, **kwrgs)

    if feat_sel is not None:
        if feat_sel['model'] is None:
            feat_sel['model'] = model
        model, new_features, rfecv = utils.feature_selection(X_train,
                                                             y_train.values,
                                                             **feat_sel)
        X_pred = X_pred[new_features]
    else:
        model.fit(X_train, y_train)

    y_pred = model.predict(X_pred)
    prediction = pd.DataFrame(y_pred, index=y_pred_mask.index, columns=[0])
    model.X_pred = X_pred
    model.name = 'Ridge Regression'
    #%%
    return prediction, model
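
# Minimal sketch of the cv handling in ridgeCV() above, under the assumption
# that utils.get_cv_accounting_for_years returns (train, test) index splits
# that keep all samples of one year together. GroupKFold grouped by year gives
# the same effect and is used here as a stand-in; RidgeCV accepts precomputed
# splits via its `cv` argument. Note that store_cv_values is only allowed when
# cv is None (leave-one-out), which is why the function disables it otherwise.
def _example_ridgecv_year_blocks():
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import RidgeCV
    from sklearn.model_selection import GroupKFold
    rng = np.random.default_rng(0)
    dates = pd.date_range('2000-01-01', periods=120, freq='10D')
    X = pd.DataFrame(rng.standard_normal((120, 3)), index=dates)
    y = X[0] + 0.1 * rng.standard_normal(120)
    groups = dates.year  # samples from the same year stay in one fold
    splits = list(GroupKFold(n_splits=3).split(X, y, groups=groups))
    model = RidgeCV(alphas=(.01, .1, 1.0, 10.0), cv=splits)
    model.fit(X, y)
    return model.alpha_  # selected regularization strength
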
def GBC(y_ts, df_norm, keys, kwrgs_GBM=None, verbosity=0):
    #%%
    '''
    X contains all precursor data, incl train and test
    X_train, y_train are split up by TrainIsTrue
    Prediction is made for whole timeseries
    '''
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    if kwrgs_GBM is None:
        # use Bram's settings
        kwrgs_GBM = {'max_depth': 3,
                     'learning_rate': 0.001,
                     'n_estimators': 1250,
                     'max_features': 'sqrt',
                     'subsample': 0.5,
                     'min_samples_split': .15}

    # list-valued parameters are tuned via gridsearch optimization
    kwrgs_gridsearch = {k: i for k, i in kwrgs_GBM.items()
                        if isinstance(i, list)}
    # only the constant parameters are kept
    kwrgs = kwrgs_GBM.copy()
    for k in kwrgs_gridsearch:
        kwrgs.pop(k)
    if 'scoringCV' in kwrgs:
        scoring = kwrgs.pop('scoringCV')
    else:
        # fallback default; the original left `scoring` undefined when
        # 'scoringCV' was absent, which broke the grid search below.
        scoring = 'neg_brier_score'
    # sorted(sklearn.metrics.SCORERS.keys())
    # scoring = 'neg_mean_squared_error'
    # scoring = 'roc_auc'
    if 'feat_sel' in kwrgs:
        feat_sel = kwrgs.pop('feat_sel')
    else:
        feat_sel = None

    # Get training years
    x_fit_mask, y_fit_mask, x_pred_mask, y_pred_mask = utils.get_masks(df_norm)

    X = df_norm[keys]
    X_train = X[x_fit_mask.values]
    X_pred = X[x_pred_mask.values]

    RV_bin_fit = y_ts['bin']
    # y_ts dates no longer align with the x_fit / y_fit masks
    y_fit_mask = df_norm['TrainIsTrue'].loc[y_fit_mask.index].values == 1
    y_train = RV_bin_fit[y_fit_mask].squeeze()

    # if y_pred_mask is not None:
    #     y_dates = RV_bin_fit[y_pred_mask.values].index
    # else:
    y_dates = RV_bin_fit.index

    model = GradientBoostingClassifier(**kwrgs)

    if feat_sel is not None:
        if feat_sel['model'] is None:
            feat_sel['model'] = model
        model, new_features, rfecv = utils.feature_selection(X_train,
                                                             y_train.values.ravel(),
                                                             **feat_sel)
        X_pred = X_pred[new_features]    # subset predictors
        X_train = X_train[new_features]  # subset predictors
    # else:
    #     model.fit(X_train, y_train.values.ravel())

    if len(kwrgs_gridsearch) != 0:
        # get cross-validation splitter
        kfold = kwrgs.pop('kfold', 5)
        cv = utils.get_cv_accounting_for_years(y_train, kfold, seed=1)
        model = GridSearchCV(model,
                             param_grid=kwrgs_gridsearch,
                             scoring=scoring, cv=cv, refit=True,
                             return_train_score=True)
        model = model.fit(X_train, y_train.values.ravel())
        if verbosity == 1:
            results = model.cv_results_
            scores = results['mean_test_score']
            greaterisbetter = model.scorer_._sign
            improv = int(100 * greaterisbetter * (max(scores) - min(scores))
                         / max(scores))
            print("Hyperparam tuning led to {:}% improvement, best {:.2f}, "
                  "best params {}".format(
                      improv, model.best_score_, model.best_params_))
    else:
        model.fit(X_train, y_train.values.ravel())

    if len(kwrgs_gridsearch) != 0:
        prediction = pd.DataFrame(model.best_estimator_.predict_proba(X_pred)[:, 1],
                                  index=y_dates, columns=['GBR'])
    else:
        prediction = pd.DataFrame(model.predict_proba(X_pred)[:, 1],
                                  index=y_dates, columns=['GBR'])
    model.X_pred = X_pred
    # logit_pred.plot(); plt.plot(RV.RV_bin)
    # plt.figure()
    # prediction.plot(); plt.plot(RV.RV_ts)
    # metrics_sklearn(RV.RV_bin, logit_pred.values, y_pred_c)
    #%%
    return prediction, model
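
# Minimal sketch of the "list values trigger a grid search" convention used
# by GBC() and fit_wrapper() below: list-valued hyperparameters are split off
# into a param_grid, scalars stay fixed on the estimator, and GridSearchCV
# tunes the rest. Data and the kwrgs values are illustrative only.
def _example_gbc_gridsearch():
    import numpy as np
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import GridSearchCV
    rng = np.random.default_rng(0)
    X = rng.standard_normal((200, 4))
    y = (X[:, 0] > 0).astype(int)
    kwrgs_GBM = {'max_depth': [2, 3],        # list -> tuned
                 'learning_rate': 0.005,     # scalar -> kept fixed
                 'n_estimators': [50, 100]}  # list -> tuned
    param_grid = {k: v for k, v in kwrgs_GBM.items() if isinstance(v, list)}
    fixed = {k: v for k, v in kwrgs_GBM.items() if not isinstance(v, list)}
    model = GridSearchCV(GradientBoostingClassifier(**fixed),
                         param_grid=param_grid, scoring='roc_auc',
                         cv=5, refit=True, return_train_score=True)
    model.fit(X, y)
    return model.best_params_, model.best_estimator_.predict_proba(X)[:, 1]
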
def fit_wrapper(self, y_ts, df_norm, keys=None, kwrgs_model=None):
    '''
    X contains all precursor data, incl train and test
    X_train, y_train are split up by TrainIsTrue
    Prediction is made for whole timeseries
    '''
    #%%
    scikitmodel = self.scikitmodel
    if keys is None:
        no_data_col = ['TrainIsTrue', 'RV_mask', 'fit_model_mask']
        keys = df_norm.columns
        keys = [k for k in keys if k not in no_data_col]
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    # warnings.filterwarnings("ignore", category=FutureWarning)

    if kwrgs_model is None:
        # use Bram's settings
        kwrgs_model = {'fit_intercept': True,
                       'alphas': (.01, .1, 1.0, 10.0)}

    # list-valued parameters are tuned via gridsearch optimization
    kwrgs_gridsearch = {k: i for k, i in kwrgs_model.items()
                        if isinstance(i, list)}
    # only the constant parameters are kept
    kwrgs = kwrgs_model.copy()
    for k in kwrgs_gridsearch:
        kwrgs.pop(k)
    if 'scoringCV' in kwrgs:
        scoring = kwrgs.pop('scoringCV')
    else:
        scoring = None

    # Get training years
    x_fit_mask, y_fit_mask, x_pred_mask, y_pred_mask = utils.get_masks(df_norm)

    X = df_norm[keys]
    X = X.dropna(axis='columns')  # drop columns containing NaNs
    # X = add_constant(X)
    X_train = X[x_fit_mask.values]
    X_pred = X[x_pred_mask.values]

    RV_fit = y_ts['ts'].loc[y_fit_mask.index]  # y_fit may be shortened
    # because X_test was used to predict y_train due to lag, hence
    # train-test leakage.

    # y_ts dates may no longer align with the x_fit / y_fit masks
    y_fit_mask = df_norm['TrainIsTrue'].loc[y_fit_mask.index].values
    y_train = RV_fit[y_fit_mask].squeeze()

    # if y_pred_mask is not None:
    #     y_dates = RV_fit[y_pred_mask.values].index
    # else:
    #     y_dates = RV_fit.index

    X = X_train

    # Create stratified random shuffle which keeps years together as blocks.
    kwrgs_cv = ['kfold', 'seed']
    kwrgs_cv = {k: i for k, i in kwrgs.items() if k in kwrgs_cv}
    for k in kwrgs_cv:
        kwrgs.pop(k)
    if len(kwrgs_cv) >= 1:
        cv = utils.get_cv_accounting_for_years(y_train, **kwrgs_cv)
    else:
        cv = None
    try:
        model = scikitmodel(cv=cv, **kwrgs)
    except TypeError:  # estimator does not accept a cv argument
        model = scikitmodel(**kwrgs)

    if len(kwrgs_gridsearch) != 0:
        # the cv splitter from get_cv_accounting_for_years is reused here
        model = GridSearchCV(model,
                             param_grid=kwrgs_gridsearch,
                             scoring=scoring, cv=cv, refit=True,
                             return_train_score=True,
                             verbose=self.verbosity,
                             n_jobs=3)
        model.fit(X_train, y_train.values.ravel())
        model.best_estimator_.X_pred = X_pred  # add X_pred to model
        # (verbosity reporting on tuning improvement as in GBC, commented out)
    else:
        model.fit(X_train, y_train.values.ravel())
        model.X_pred = X_pred  # add X_pred to model

    if np.unique(y_train).size < 5:  # few unique values -> treat as classes
        y_pred = model.predict_proba(X_pred)[:, 1]  # prob. event prediction
    else:
        y_pred = model.predict(X_pred)

    prediction = pd.DataFrame(y_pred, index=y_pred_mask.index, columns=[0])
    #%%
    return prediction, model
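
# Minimal sketch of the output heuristic at the end of fit_wrapper() above:
# targets with fewer than 5 unique values are treated as class labels, so the
# wrapper returns event probabilities; continuous targets get point
# predictions. The function name and arguments are illustrative only.
def _example_output_heuristic(model, X_pred, y_train):
    import numpy as np
    if np.unique(y_train).size < 5:  # binary/categorical target
        return model.predict_proba(X_pred)[:, 1]  # probability of event
    return model.predict(X_pred)  # continuous target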