from datetime import datetime

import pandas as pd
from keras.layers import Activation, Dense, Dropout
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, Imputer, OneHotEncoder

# BUG FIX: get_config, get_logger and load_file were called below but never
# imported, which would raise NameError at runtime. The sibling scripts in
# this project import load_file from lib.data and get_logger from lib.logger.
from lib.data import load_file
from lib.logger import get_logger
from lib.config import get_config  # NOTE(review): module path assumed -- confirm
from lib.porto.feature_type import get_cat_features_idx
from lib.scoring.gini import gini_normalized

# Wall-clock start time, for timing the run.
start = datetime.now()

cfg = get_config()
logger = get_logger()

logger.info("Loading training data into X and y...")
train = load_file()
X = train.drop(['target'], axis=1)
X['bias'] = 1  # constant bias column for the network input
y = train.target

# Column indices of the categorical features.
cat_columns = get_cat_features_idx(X)

logger.info("Preprocessing Data (Impute, Encode)...")
# -1 marks missing values in this data set: impute them with the most
# frequent value, one-hot encode the categorical columns, then densify the
# sparse matrix so Keras can consume it.
pipe = Pipeline([
    ('impute', Imputer(missing_values=-1, strategy="most_frequent")),
    ('encode', OneHotEncoder(categorical_features=cat_columns,
                             handle_unknown='ignore')),
    ('dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
])
from lib.data import load_file, convert_columns_to_int, make_missing_zero
from lib.submit import write_submission_file
from lib.logger import get_logger
from lib.porto.feature_type import get_bin_cat_features, get_cat_features_idx
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from lib.scoring.gini import gini_normalized

logger = get_logger()

# Training data: features in X, labels in y.
train = load_file()
X = train.drop(['target'], axis=1)
y = train.target

# Bump all categorical values up by one so "missing" becomes zero, a value
# the one-hot encoder can treat as an ordinary category.
cat_columns = get_cat_features_idx(X)
X = make_missing_zero(X, cat_columns)

# Pipeline: one-hot encode the categoricals, densify the sparse matrix,
# then hand off to the classifier in the final step.
densify = FunctionTransformer(lambda x: x.todense(), accept_sparse=True)
pipe = Pipeline([
    ('encode', OneHotEncoder(categorical_features=cat_columns,
                             handle_unknown='ignore')),
    ('to_dense', densify),
    ('model', LogisticRegression()),
])

# Grid-search swaps only the final estimator.
param_grid = {'model': [GaussianNB(), LogisticRegression()]}
from lib.data import load_file, convert_columns_to_int
from lib.submit import write_submission_file
from lib.logger import get_logger
from lib.porto.feature_type import get_bin_cat_features
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

logger = get_logger()


def n_best(chdf, n=15):
    """Return the names of the n features with the highest chi2 score.

    Parameters
    ----------
    chdf : DataFrame with 'feature' and 'chi2' columns.
    n : int, number of top features to return (default 15).

    Returns
    -------
    The first n entries of the 'feature' column, ordered by descending chi2.
    """
    # BUG FIX: the original bound this to a local named `sorted`, shadowing
    # the builtin of the same name.
    ranked = chdf.sort_values('chi2', axis=0, ascending=False)
    return ranked['feature'][:n]


# Feature selection: keep the 20 highest-scoring chi2 features.
chi2_df = load_file("chi2")
columns = n_best(chi2_df, n=20)

# Training data: cast the binary/categorical columns (and the target) to int.
train = load_file()
bit_columns = get_bin_cat_features(train)
bit_columns.append('target')
train = convert_columns_to_int(train, bit_columns)
X = train[columns]
y = train.target

# Standardise the selected features, then fit a logistic regression.
pipe = make_pipeline(StandardScaler(), LogisticRegression())
from lib.data import load_file, convert_columns_to_int
from lib.submit import write_submission_file
from lib.logger import get_logger
from lib.porto.feature_type import get_bin_cat_features
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_predict, GridSearchCV
from lib.scoring.gini import gini_normalized

logger = get_logger()

# Training data: cast the binary/categorical columns (and the target) to int.
train = load_file()
bit_columns = get_bin_cat_features(train)
bit_columns.append('target')
train = convert_columns_to_int(train, bit_columns)

X = train.drop(['target'], axis=1)
y = train.target

# Standardise the features, then fit a Gaussian naive Bayes model.
steps = [('transform', StandardScaler()), ('model', GaussianNB())]
pipe = Pipeline(steps)

# Empty grid: GridSearchCV is used here purely as a cross-validated
# scoring wrapper around the single pipeline configuration.
param_grid = {}
model = GridSearchCV(pipe, param_grid, scoring='roc_auc')
model.fit(X, y)
logger.info("Best Params: {}".format(model.best_params_))

# Out-of-fold predicted probabilities for the positive class.
results = cross_val_predict(model, X, y, method='predict_proba')[:, 1]