import pandas as pd
from sklearn.ensemble import (RandomForestClassifier,
                              GradientBoostingClassifier, AdaBoostClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from data_prep import training_data_Xy
# NOTE: column_combinations and cv are repo-internal helpers used below;
# their home module is not shown in this excerpt.


def test_different_models_scale_within(base_list, considered_columns, r=[1]):
    """Tests feature combinations w/ sklearn cv to get brier/roc scores for
    each model, for each combination of columns.

    base_list columns are always included. Will get every possible
    combination of 'considered_columns' given the values in 'r'. For more
    details refer to the 'column_combinations' function and the
    'combinations' function from itertools.

    Args:
        base_list (list): columns to always be included in the test
        considered_columns (list): columns that will possibly be included
        r (list, optional): sizes of the combinations made with
            'considered_columns'. Defaults to [1], meaning every
            'considered_column' will be included once.

    Returns:
        DataFrame: DataFrame with 4 columns: the name of the model, the
            columns (besides the base columns) considered, the brier score,
            and the roc auc score
    """
    X, y = training_data_Xy(subset=False)
    result = pd.DataFrame(columns=['model', 'columns', 'brier', 'roc_auc'])
    col_lst = column_combinations(base_list, considered_columns, r)
    # dictionary to get the names of the models
    model_dic = {
        0: 'RandomForest',
        1: 'GradientBoost',
        2: 'AdaBoost',
        3: 'KNN',
        4: 'logistic'
    }
    # models to be considered
    model_lst = [
        RandomForestClassifier(),
        GradientBoostingClassifier(),
        AdaBoostClassifier(),
        KNeighborsClassifier(),
        LogisticRegression()
    ]
    count = 0
    for idx, model in enumerate(model_lst):  # iterate through the model list
        print(idx)
        for cols in col_lst:  # iterate through the column list
            # reduce X to just the columns that ought to be considered
            X_subset = X[cols]
            X_cols = list(X_subset.columns)
            # scale X within the current column subset
            SS = StandardScaler()
            X_subset = pd.DataFrame(data=SS.fit_transform(X_subset),
                                    columns=X_cols)
            # get scores using the cv function
            roc_auc, brier = cv(model, X_subset, y)
            # drop the base columns from X_cols so the result DataFrame only
            # records the columns that vary between combinations
            for col in base_list:
                X_cols.remove(col)
            # insert the results into the DataFrame
            result.loc[count] = [model_dic[idx], X_cols, brier, roc_auc]
            count += 1
    return result
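# Hedged usage sketch for test_different_models_scale_within. The column
# names below are hypothetical stand-ins, not names confirmed by data_prep;
# swap in real feature names from training_data_Xy before running.
if __name__ == '__main__':
    combo_results = test_different_models_scale_within(
        base_list=['age'],
        considered_columns=['bmi', 'smoker', 'children'],
        r=[1, 2])  # every single column and every pair of columns
    # lowest brier / highest roc_auc combinations first
    print(combo_results.sort_values(['brier', 'roc_auc'],
                                    ascending=[True, False]).head())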
from sklearn.ensemble import GradientBoostingClassifier
# read_in_holdout_Xy is assumed to live in data_prep alongside
# training_data_Xy; only the latter import is confirmed elsewhere in the repo
from data_prep import training_data_Xy, read_in_holdout_Xy


def create_model_load_data():
    """Loads data and creates a model fitted w/ the data.

    Returns:
        model: fitted model
        X_hold: holdout data features
        y_hold: holdout data targets
    """
    model = GradientBoostingClassifier(learning_rate=.01,
                                       n_estimators=90,
                                       min_samples_leaf=6,
                                       min_samples_split=4,
                                       max_features=3,
                                       max_depth=5,
                                       subsample=.6)
    X, y = training_data_Xy()
    X_hold, y_hold = read_in_holdout_Xy()
    model.fit(X, y)
    return model, X_hold, y_hold
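# Hedged usage sketch: score the fitted model on the holdout set with the
# same metrics used in cross validation (assumes a binary 0/1 target).
if __name__ == '__main__':
    from sklearn.metrics import roc_auc_score, brier_score_loss
    model, X_hold, y_hold = create_model_load_data()
    # positive-class probabilities on the holdout features
    hold_probs = model.predict_proba(X_hold)[:, 1]
    print('holdout roc_auc:', roc_auc_score(y_hold, hold_probs))
    print('holdout brier:', brier_score_loss(y_hold, hold_probs))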
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from grid_and_thresh_funcs import threshold_testing, confusion, confusion_ratios
from data_prep import training_data_Xy

gb_final = GradientBoostingClassifier(learning_rate=.01,
                                      n_estimators=90,
                                      min_samples_leaf=6,
                                      min_samples_split=4,
                                      max_features=3,
                                      max_depth=5,
                                      subsample=.6)
X, y = training_data_Xy()


def test_prediction_results(X, y, model, num_folds=5):
    """Does cv to get predictions for the entire dataframe.

    Args:
        X (array): features
        y (array): target
        model (sklearn model): model to be tested
        num_folds (int, optional): number of folds in the cross val.
            Defaults to 5.

    Returns:
        DataFrame: contains 2 columns, one for the prediction, which is
            actually the predict_proba, so a float between 0 and 1, with 1
            being the most positive and 0 being the least. The other column
            is the true value.
    """
    kf = KFold(n_splits=num_folds, shuffle=True)
    results = pd.DataFrame(columns=['prediction', 'truth'], index=X.index)
    # loop body completed to match the docstring: fit on each training fold,
    # then record the out-of-fold positive-class probability and true label
    # (assumes a binary target; np.asarray guards against label-based indexing)
    y_arr = np.asarray(y)
    for train, test in kf.split(X):
        model.fit(X.iloc[train], y_arr[train])
        results.iloc[test, 0] = model.predict_proba(X.iloc[test])[:, 1]
        results.iloc[test, 1] = y_arr[test]
    return results
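# Hedged usage sketch: run the cross-validated prediction pass with the tuned
# gb_final model and sanity-check how the out-of-fold probabilities separate
# the two classes (uses only pandas; no repo-internal helpers assumed).
if __name__ == '__main__':
    preds = test_prediction_results(X, y, gb_final)
    preds['prediction'] = preds['prediction'].astype(float)
    # mean predicted probability per true class; a wide gap means the model
    # is separating positives from negatives
    print(preds.groupby('truth')['prediction'].mean())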
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.metrics import Precision, AUC
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import f1_score
from sklearn.ensemble import GradientBoostingClassifier
from data_prep import training_data_Xy

# load dataset
X, y = training_data_Xy(scale=True)


# baseline model
def create_baseline():
    # create model
    model = Sequential()
    # model.add(Dropout(0.1, input_shape=(10,)))
    model.add(Dense(11, input_dim=11, activation='relu'))
    model.add(Dense(22, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model (completed: binary cross-entropy fits the sigmoid output;
    # the Adam optimizer and AUC/Precision metrics reuse this file's imports,
    # though the exact metric choice here is an assumption)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(),
                  metrics=[AUC(), Precision()])
    return model
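# Hedged usage sketch: wrap the baseline network in KerasClassifier so it can
# run through sklearn's cross_val_score, mirroring the CV used for the sklearn
# models above. epochs and batch_size are illustrative guesses, not tuned.
if __name__ == '__main__':
    estimator = KerasClassifier(build_fn=create_baseline,
                                epochs=50, batch_size=32, verbose=0)
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    scores = cross_val_score(estimator, X, y, cv=skf, scoring='roc_auc')
    print('cv roc_auc: %.3f (+/- %.3f)' % (scores.mean(), scores.std()))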