Example #1
import pandas as pd
from sklearn.ensemble import (RandomForestClassifier,
                              GradientBoostingClassifier, AdaBoostClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from data_prep import training_data_Xy
# column_combinations and cv are project helpers; the module name below is an
# assumption made to keep the snippet self-contained
from helpers import column_combinations, cv


def test_different_models_scale_within(base_list, considered_columns, r=[1]):
    """tests feature combinations w/ sklearn cv to get brier/roc scores for each model, for each combination of columns
    base_list columns are always included. Will get every possible combination of 'considered_column' possible given values in 'r'
    for more details refer to 'column_combinations' function and the 'Combinations' function from itertools

    Args:
        base_list (list): columns to always be included in test
        considered_columns (list): columns that will possibly be included
        r (list, optional): Size of the combinations made with 'considered_columns'. Defaults to [1], meaning ever 'considered_column' will be included once

    Returns:
        DataFrame: Dataframe with 4 columns: the name of the model, the columns (besides the base columns) considered, brier score, roc auc score
    """

    X, y = training_data_Xy(subset=False)

    result = pd.DataFrame(columns=['model', 'columns', 'brier', 'roc_auc'])

    col_lst = column_combinations(base_list, considered_columns, r)
    model_dic = {
        0: 'RandomForest',
        1: 'GradientBoost',
        2: 'AdaBoost',
        3: 'KNN',
        4: 'logistic'
    }  # dictionary to get names of models
    model_lst = [
        RandomForestClassifier(),
        GradientBoostingClassifier(),
        AdaBoostClassifier(),
        KNeighborsClassifier(),
        LogisticRegression()
    ]  # models to be considered
    count = 0
    for idx, model in enumerate(model_lst):  # iterate through model list
        print(model_dic[idx])  # progress indicator: which model is running
        for cols in col_lst:  # iterate through column list
            X_subset = X[cols]  # reduce X to the columns under consideration
            X_cols = list(X_subset.columns)
            SS = StandardScaler()
            X_subset = pd.DataFrame(data=SS.fit_transform(X_subset),
                                    columns=X_cols)  # scale X

            roc_auc, brier = cv(model, X_subset, y)  # get scores using the cv helper

            for col in base_list:  # drop base columns from X_cols so the results only list the varying columns
                X_cols.remove(col)

            result.loc[count] = [model_dic[idx], X_cols, brier, roc_auc]  # insert results into DataFrame
            count += 1

    return result
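
# Minimal usage sketch (the column names below are hypothetical placeholders;
# substitute columns that actually exist in the training data):
# results = test_different_models_scale_within(
#     base_list=['base_col_1', 'base_col_2'],
#     considered_columns=['cand_col_1', 'cand_col_2', 'cand_col_3'],
#     r=[1, 2])
# results.sort_values('roc_auc', ascending=False).head()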
Example #2
from sklearn.ensemble import GradientBoostingClassifier

# training_data_Xy lives in data_prep (as in the other snippets);
# read_in_holdout_Xy is a project helper whose home module is assumed here
from data_prep import training_data_Xy, read_in_holdout_Xy


def create_model_load_data():
    """loads data and creates model fitted w/ data

    Returns:
        model: fitted model
        X_hold: Holdout Data features
        y_hold: Holdout data targets
    """
    model = GradientBoostingClassifier(learning_rate=.01,
                                       n_estimators=90,
                                       min_samples_leaf=6,
                                       min_samples_split=4,
                                       max_features=3,
                                       max_depth=5,
                                       subsample=.6)
    X, y = training_data_Xy()
    X_hold, y_hold = read_in_holdout_Xy()
    model.fit(X, y)
    return model, X_hold, y_hold
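
# Minimal usage sketch (the holdout scoring below is illustrative, using
# standard sklearn metrics; it is not part of the original script):
from sklearn.metrics import roc_auc_score, brier_score_loss

model, X_hold, y_hold = create_model_load_data()
hold_probs = model.predict_proba(X_hold)[:, 1]  # positive-class probabilities
print('holdout ROC AUC:', roc_auc_score(y_hold, hold_probs))
print('holdout Brier score:', brier_score_loss(y_hold, hold_probs))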
Example #3
import pandas as pd 
import numpy as np 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold

from grid_and_thresh_funcs import threshold_testing, confusion, confusion_ratios
from data_prep import training_data_Xy



gb_final = GradientBoostingClassifier(learning_rate=.01,
                                      n_estimators=90,
                                      min_samples_leaf=6,
                                      min_samples_split=4,
                                      max_features=3,
                                      max_depth=5,
                                      subsample=.6)


X, y = training_data_Xy()


def test_prediction_results(X, y, model, num_folds=5):
    """does cv to get predictions for entire dataframe

    Args:
        X (array): features
        y (array): target
        model (sklearn model): model to be tested
        num_folds (int, optional): number of folds in cross val. Defaults to 5.

    Returns:
        DataFrame: contains 2 columns, one for prediction, which is actual the predict_proba, so a float between 0-1, with 1 being the most positive and 0 being the least. The other column is the true value
    """    
    kf = KFold(n_splits=num_folds, shuffle=True)
    results = pd.DataFrame(columns=['prediction', 'truth'], index=X.index)
    # the loop body was truncated in the original; the completion below
    # follows the docstring: fit on each training fold, then store the
    # held-out fold's positive-class probabilities and true labels
    for train, test in kf.split(X):
        model.fit(X.iloc[train], np.asarray(y)[train])
        results.iloc[test, 0] = model.predict_proba(X.iloc[test])[:, 1]
        results.iloc[test, 1] = np.asarray(y)[test]
    return results
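
# Minimal usage sketch (the ROC AUC scoring below is illustrative):
from sklearn.metrics import roc_auc_score

preds = test_prediction_results(X, y, gb_final)
print('out-of-fold ROC AUC:',
      roc_auc_score(preds['truth'].astype(int),
                    preds['prediction'].astype(float)))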
Example #4
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.metrics import Precision, AUC
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import f1_score
from sklearn.ensemble import GradientBoostingClassifier

from data_prep import training_data_Xy

# load dataset
X, y = training_data_Xy(scale=True)


# baseline model
def create_baseline():
    # create model
    model = Sequential()
    # model.add(Dropout(0.1, input_shape=(10,)))

    model.add(Dense(11, input_dim=11, activation='relu'))
    model.add(Dense(22, activation='relu'))

    model.add(Dense(1, activation='sigmoid'))
    # Compile model (the original snippet was truncated here; the loss,
    # optimizer, and metrics below are assumptions consistent with the
    # imports above)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(),
                  metrics=[Precision(), AUC()])
    return model
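
# Minimal usage sketch: evaluate the baseline net with stratified CV through
# the scikit-learn wrapper (epochs and batch_size are illustrative choices):
estimator = KerasClassifier(build_fn=create_baseline, epochs=50,
                            batch_size=32, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True)
scores = cross_val_score(estimator, X, y, cv=kfold)
print('Baseline accuracy: %.2f%% (%.2f%%)' % (scores.mean() * 100,
                                              scores.std() * 100))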