def main(c_runtime, c_transformer, c_model, c_trainer, c_log):
    """Run the preprocess -> tune -> train -> predict pipeline.

    Args:
        c_runtime: Runtime config; ``out_sub_path`` is where the submission
            CSV is written.
        c_transformer: Transformer config; its ``__dict__`` is expanded into
            keyword arguments for ``Transformer.run``.
        c_model: Initial model config handed to ``ModelFactory.create``.
        c_trainer: Trainer config; ``n_splits`` is forwarded to
            ``tune_gbdt_params``.
        c_log: Not used by this function.
    """
    with blocktimer('Preprocess'):
        transformer_kwargs = c_transformer.__dict__
        train, test = Transformer.run(**transformer_kwargs)
        X_train, y_train, X_test = split_X_y(train, test)

    with blocktimer('Tune & Train'):
        factory = ModelFactory()

        # First pass: search for a better model config.
        baseline_model = factory.create(c_model)
        tuned_config = tune_gbdt_params(
            baseline_model, X_train, y_train, c_trainer.n_splits)

        # Second pass: fit on the full training data with the tuned config.
        final_model = factory.create(tuned_config).train(X_train, y_train)

    with blocktimer('Predict'):
        submission = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        submission['TransactionID'] = test['TransactionID']
        submission['isFraud'] = final_model.predict(X_test)

        out_path = c_runtime.out_sub_path
        submission.to_csv(out_path, index=False)
        logger.info(f'Saved {out_path}')
def main(c):
    """Run preprocess, tuning, training, artifact export and prediction.

    Args:
        c: Config object. Reads ``c.transformer`` (expanded into
            ``Transformer.run`` keyword arguments), ``c.model`` (``dir``,
            ``TYPE``), ``c.trainer.n_splits`` and ``c.runtime`` (``VERSION``,
            ``ROOTDIR``, ``out_sub_path``). ``c.model.dir`` and
            ``c.runtime.ROOTDIR`` are joined with ``/``, so they are
            presumably ``pathlib.Path`` objects - confirm against the caller.
    """
    with blocktimer('Preprocess'):
        transformer_kwargs = c.transformer.__dict__
        train, test = Transformer.run(**transformer_kwargs)
        X_train, y_train, X_test = split_X_y(train, test)
        # NOTE(review): ``test`` is re-ordered only after ``X_test`` was
        # derived from it. If ``model.predict`` returns a plain array, the
        # predictions below are paired with ``TransactionID`` positionally -
        # confirm the two orders agree.
        test = test.sort_values('TransactionDT')

    with blocktimer('Tune & Train'):
        factory = ModelFactory()

        # Search for a better model config starting from the configured one.
        baseline_model = factory.create(c.model)
        tuned_config = tune_gbdt_params(
            baseline_model, X_train, y_train, c.trainer.n_splits)

        # Refit on the full training data with the tuned config.
        final_model = factory.create(tuned_config).train(X_train, y_train)

        # Persist the fitted model and its feature importances.
        version = c.runtime.VERSION
        final_model.save(c.model.dir / f'model_{version}_{c.model.TYPE}.pkl')

        importance_path = (
            c.runtime.ROOTDIR / 'feature/importance' / f'importance_{version}.csv'
        )
        importance_frame = pd.DataFrame(
            final_model.feature_importance,
            index=X_train.columns,
            columns=['importance'],
        )
        importance_frame.to_csv(importance_path)
        logger.info(f'Saved {str(importance_path)}')

    with blocktimer('Predict'):
        submission = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        submission['TransactionID'] = test['TransactionID']
        submission['isFraud'] = final_model.predict(X_test)

        out_path = c.runtime.out_sub_path
        submission.to_csv(out_path, index=False)
        logger.debug(f'Saved {out_path}')
def _suggest_params(trial, model_type):
    """Return the hyper-parameters suggested by ``trial`` for ``model_type``.

    Raises:
        ValueError: If ``model_type`` is not 'lightgbm', 'xgboost' or
            'catboost'.
    """
    if model_type == 'lightgbm':
        max_depth = trial.suggest_int('max_depth', 3, 12)
        return {
            # num_leaves should be smaller than approximately 2^max_depth*0.75
            'num_leaves': 2**max_depth * 3 // 4,
            'max_depth': max_depth,
            'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-3, 1e0),
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-2, 1e0),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-2, 1e0),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 200),
            # NOTE(review): the lower bound 0 looks to be outside the range
            # LightGBM documents for these fractions - confirm and consider
            # raising the floor.
            'feature_fraction': trial.suggest_uniform('feature_fraction', 0, 1),
            'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0, 1),
        }

    if model_type == 'xgboost':
        return {
            'min_split_loss': trial.suggest_loguniform('min_split_loss', 1e-3, 1e0),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-3, 1e0),
            'subsample': trial.suggest_uniform('subsample', 0, 1),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.0, 1.0),
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 1e0),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 1e0),
        }

    if model_type == 'catboost':
        return {
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-2, 1e0),
        }

    raise ValueError(
        f'Unsupported model type for tuning: {model_type!r} '
        "(expected 'lightgbm', 'xgboost' or 'catboost')")


def objective(trial, X_train, y_train, X_test, cols, c):
    """Optuna objective: out-of-fold AUC of a 6-fold grouped CV run.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X_train: Training frame; must contain a 'DT_M' column, which is used
            as the CV group key.
        y_train: Training target, indexed positionally with ``.iloc``.
        X_test: Test frame; predicted on in every fold.
        cols: Feature columns passed to the model.
        c: Config. Reads ``c.model`` (``type``, ``params``), ``c.train``
            (``num_boost_round``, ``early_stopping_rounds``) and ``c.runtime``
            (``version``, ``dsize``).

    Returns:
        ROC AUC of the out-of-fold predictions over all of ``y_train``.

    Raises:
        ValueError: If ``c.model.type`` is not a supported model type.
    """
    # Validate the model type and sample params before doing any other work,
    # so an unsupported type fails immediately with a clear message.
    params_to_tune = _suggest_params(trial, c.model.type)

    # apply suggested params on top of a copy of the configured ones
    params = c.model.params.copy()
    params.update(params_to_tune)

    modelfactory = ModelFactory()

    # Train by 6-fold CV
    oof = np.zeros(len(X_train))
    # Averaged test predictions; accumulated but not used by the returned
    # score.
    preds = np.zeros(len(X_test))
    skf = GroupKFold(n_splits=6)
    folds = skf.split(X_train, y_train, groups=X_train['DT_M'])
    for fold, (idxT, idxV) in enumerate(folds, start=1):
        month = X_train.iloc[idxV]['DT_M'].iloc[0]
        model_fold_path = (
            f'data/model/model_{c.runtime.version}_opt_fold{fold}'
            f'{c.runtime.dsize}.pkl'
        )
        model = modelfactory.create(c.model)
        logger.info(f'Fold {fold} withholding month {month}')
        logger.info(
            f'rows of train= {len(idxT)}, rows of holdout= {len(idxV)}')
        model = model.train(
            X_train[cols].iloc[idxT],
            y_train.iloc[idxT],
            X_train[cols].iloc[idxV],
            y_train.iloc[idxV],
            params=params,
            num_boost_round=c.train.num_boost_round,
            early_stopping_rounds=c.train.early_stopping_rounds,
            fold=fold)
        oof[idxV] = model.predict(X_train[cols].iloc[idxV])
        preds += model.predict(X_test[cols]) / skf.n_splits
        # NOTE(review): ``r`` is not a parameter or local of this function; it
        # is presumably a module-level result object - confirm it exists.
        r.paths.update({f'model_fold_{fold}_path': model_fold_path})
        model.save(r.paths[f'model_fold_{fold}_path'])
        del model

    score = roc_auc_score(y_train, oof)
    logger.info(f'Fold {fold} OOF cv= {score}')
    mlflow.log_metric('oof_cv_score', score, step=trial.number)
    return score
def main(c):
    """Load features, optionally tune boosting rounds, run grouped CV, predict.

    Args:
        c: Config. Reads ``c.runtime`` (``use_small_data``, ``version``),
            ``c.features[0]`` (feature-set name used in the pickle paths),
            ``c.cols`` (feature columns), ``c.model`` and ``c.train``
            (``optimize_num_boost_round``, ``n_splits``, ``num_boost_round``,
            ``early_stopping_rounds``).

    Returns:
        EasyDict holding a copy of ``c`` plus ``scores`` and ``paths``.
    """
    # Function-local imports: these names are imported here rather than
    # relied on from module scope.
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import GroupKFold

    # Evaluate the flag once so the '.small' output suffix and the actual
    # down-sampling can never disagree (a truthy non-``True`` value used to
    # sample the data but still write to the full-data output paths).
    use_small_data = bool(c.runtime.use_small_data)
    dsize = '.small' if use_small_data else ''
    paths = EasyDict()
    scores = EasyDict()
    modelfactory = ModelFactory()

    with blocktimer('Preprocess', level=INFO):
        paths.in_train_path = f'data/feature/{c.features[0]}_train.pkl'
        paths.in_test_path = f'data/feature/{c.features[0]}_test.pkl'
        train = pd.read_pickle(paths.in_train_path)
        test = pd.read_pickle(paths.in_test_path)
        logger.debug(f'Loaded feature {c.features[0]}')

        if use_small_data:
            frac = 0.001
            train = train.sample(frac=frac, random_state=42)
            test = test.sample(frac=frac, random_state=42)
        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')

        # Split into X, y
        X_train = train.drop('isFraud', axis=1)
        X_test = test
        y_train = train['isFraud'].copy(deep=True)
        del train, test

    with blocktimer('Optimize', level=INFO):
        if c.train.optimize_num_boost_round is True:
            # tune the number of boosting rounds
            model = modelfactory.create(c.model)
            best_iteration = optimize_num_boost_round(
                model, X_train[c.cols], y_train, c.train.n_splits,
                dsize, paths, scores)
        else:
            logger.debug('Skip optimization')
            best_iteration = c.train.num_boost_round

    with blocktimer('Train', level=INFO):
        logger.debug(f'Now using the following {len(c.cols)} features.')
        logger.debug(f'{np.array(c.cols)}')

        # 6-fold CV grouped by the 'DT_M' column: out-of-fold predictions go
        # into ``oof``; test predictions are averaged over the folds.
        oof = np.zeros(len(X_train))
        preds = np.zeros(len(X_test))
        skf = GroupKFold(n_splits=6)
        folds = skf.split(X_train, y_train, groups=X_train['DT_M'])
        for fold, (idxT, idxV) in enumerate(folds, start=1):
            month = X_train.iloc[idxV]['DT_M'].iloc[0]
            logger.info(f'Fold {fold} withholding month {month}')
            logger.info(
                f'rows of train ={len(idxT)}, rows of holdout ={len(idxV)}')

            model = modelfactory.create(c.model)
            model = model.train(
                X_train[c.cols].iloc[idxT],
                y_train.iloc[idxT],
                X_train[c.cols].iloc[idxV],
                y_train.iloc[idxV],
                num_boost_round=best_iteration,
                early_stopping_rounds=c.train.early_stopping_rounds,
                fold=fold)
            # Each row is written once; ``oof`` starts at zero.
            oof[idxV] = model.predict(X_train[c.cols].iloc[idxV])
            preds += model.predict(X_test[c.cols]) / skf.n_splits
            del model

        logger.info(f'OOF cv= {roc_auc_score(y_train, oof)}')
        # The path is recorded in the result, but nothing in this function
        # writes an importance file to it.
        paths.importance_path = (
            f'feature/importance/importance_{c.runtime.version}{dsize}.csv'
        )

    with blocktimer('Predict', level=INFO):
        sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        sub['TransactionID'] = X_test.reset_index()['TransactionID']
        sub['isFraud'] = preds
        paths.out_sub_path = (
            f'data/submission/submission_{c.runtime.version}{dsize}.csv'
        )
        sub.to_csv(paths.out_sub_path, index=False)

    result = EasyDict()
    result.update(c)
    result.scores = scores
    result.paths = paths
    return result
def main(c):
    """Load features, optionally tune, run grouped CV training, and predict.

    Stages, each gated by config flags:
      1. Preprocess: load the pickled feature frames (optionally sampled).
      2. Optimize num_boost_round (``c.train.optimize_num_boost_round``).
      3. Optimize model params with Optuna (``c.train.optimize_model_params``).
      4. Train with 6-fold grouped CV (``c.train.train_model``).
      5. Predict: write the submission CSV (``c.train.predict``).

    Args:
        c: Config. Reads ``c.runtime`` (``use_small_data``, ``version``),
            ``c.features[0]``, ``c.cols``, ``c.model`` (``type``; ``params``
            is overwritten during parameter optimisation) and ``c.train``.

    Returns:
        EasyDict holding a copy of ``c`` plus ``scores``, ``paths`` and, when
        parameter optimisation ran, ``optimize.best_params``.

    Raises:
        RuntimeError: If ``c.train.predict`` is set but ``c.train.train_model``
            is not, so there are no test predictions to write.
    """
    # Evaluate the flag once so the '.small' output suffix and the actual
    # down-sampling can never disagree (a truthy non-``True`` value used to
    # sample the data but still write to the full-data output paths).
    use_small_data = bool(c.runtime.use_small_data)
    dsize = '.small' if use_small_data else ''
    paths = EasyDict()
    scores = EasyDict()
    result = EasyDict()
    result.update(c)
    modelfactory = ModelFactory()

    with blocktimer('Preprocess', level=INFO):
        paths.in_train_path = f'data/feature/{c.features[0]}_train.pkl'
        paths.in_test_path = f'data/feature/{c.features[0]}_test.pkl'
        train = pd.read_pickle(paths.in_train_path)
        test = pd.read_pickle(paths.in_test_path)
        logger.debug(f'Loaded feature {c.features[0]}')

        if use_small_data:
            frac = 0.001
            train = train.sample(frac=frac, random_state=42)
            test = test.sample(frac=frac, random_state=42)
        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')

        # Split into X, y
        X_train = train.drop('isFraud', axis=1)
        X_test = test
        y_train = train['isFraud'].copy(deep=True)
        del train, test

    with blocktimer('Optimize num_boost_round', level=INFO):
        if c.train.optimize_num_boost_round is True:
            # tune the number of boosting rounds
            model = modelfactory.create(c.model)
            best_iteration = optimize_num_boost_round(
                model, X_train[c.cols], y_train, c.train.n_splits,
                dsize, paths, scores)
        else:
            logger.debug('Skip optimization')
            best_iteration = c.train.num_boost_round

    with blocktimer('Optimize model params', level=INFO):
        if c.train.optimize_model_params is True:

            # define objective for optuna
            def objectives(trial):
                max_depth = trial.suggest_int('max_depth', 3, 12)
                params = {
                    'boosting_type': 'gbdt',
                    # num_leaves should be smaller than approximately
                    # 2^max_depth*0.75
                    'num_leaves': 2**max_depth * 3 // 4,
                    'max_depth': max_depth,
                    'learning_rate': 0.05,
                    'objective': 'binary',
                    'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-3, 1e0),
                    'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-2, 1e0),
                    'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-2, 1e0),
                    'random_state': 42,
                    'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 200),
                    'metric': 'auc',
                    'max_bin': 255,
                }
                # NOTE(review): this mutates the shared config. After the
                # study, ``c.model.params`` holds the params of the last trial
                # evaluated, not the best one, and the Train stage builds its
                # models from ``c.model`` - confirm this is intended.
                c.model.params = params

                # Train by 6-fold CV
                oof = np.zeros(len(X_train))
                # Averaged test predictions; accumulated but not used by the
                # returned score.
                preds = np.zeros(len(X_test))
                skf = GroupKFold(n_splits=6)
                folds = skf.split(X_train, y_train, groups=X_train['DT_M'])
                for fold, (idxT, idxV) in enumerate(folds, start=1):
                    month = X_train.iloc[idxV]['DT_M'].iloc[0]
                    model_fold_path = f'data/model/model_{c.runtime.version}_{c.model.type}_opt_fold{fold}{dsize}.pkl'
                    model = modelfactory.create(c.model)
                    logger.info(f'Fold {fold} withholding month {month}')
                    logger.info(
                        f'rows of train= {len(idxT)}, rows of holdout= {len(idxV)}'
                    )
                    model = model.train(
                        X_train[c.cols].iloc[idxT],
                        y_train.iloc[idxT],
                        X_train[c.cols].iloc[idxV],
                        y_train.iloc[idxV],
                        num_boost_round=best_iteration,
                        early_stopping_rounds=c.train.early_stopping_rounds,
                        fold=fold)
                    oof[idxV] = model.predict(X_train[c.cols].iloc[idxV])
                    preds += model.predict(X_test[c.cols]) / skf.n_splits
                    # The per-fold path does not depend on the trial, so each
                    # trial overwrites the previous trial's saved models.
                    paths.update({f'model_fold_{fold}_path': model_fold_path})
                    model.save(paths[f'model_fold_{fold}_path'])
                    del model

                score = roc_auc_score(y_train, oof)
                logger.info(f'Fold {fold} OOF cv= {score}')
                return score

            # run optimization
            opt = optuna.create_study(
                direction='maximize',
                study_name=f'parameter_study_0016{dsize}',
                storage=f'sqlite:///data/optimization/parameter_study_0016{dsize}.db',
                load_if_exists=True)
            opt.optimize(objectives, n_trials=20)
            trial = opt.best_trial
            logger.debug(f'Best trial: {trial.value}')
            logger.debug(f'Best params: {trial.params}')
            scores.best_trial = trial.value
            # Assigned as a plain dict, then given an attribute on the next
            # line; this relies on ``result`` wrapping the dict on assignment.
            result.optimize = {}
            result.optimize.best_params = trial.params
        else:
            logger.debug('Skip optimization')

    # Stays None unless the Train stage runs; checked by the Predict stage.
    preds = None

    with blocktimer('Train', level=INFO):
        if c.train.train_model:
            logger.debug(f'Now using the following {len(c.cols)} features.')
            logger.debug(f'{np.array(c.cols)}')

            oof = np.zeros(len(X_train))
            preds = np.zeros(len(X_test))
            skf = GroupKFold(n_splits=6)
            folds = skf.split(X_train, y_train, groups=X_train['DT_M'])
            for fold, (idxT, idxV) in enumerate(folds, start=1):
                month = X_train.iloc[idxV]['DT_M'].iloc[0]
                logger.info(f'Fold {fold} withholding month {month}')
                logger.info(
                    f'rows of train ={len(idxT)}, rows of holdout ={len(idxV)}'
                )

                model = modelfactory.create(c.model)
                model = model.train(
                    X_train[c.cols].iloc[idxT],
                    y_train.iloc[idxT],
                    X_train[c.cols].iloc[idxV],
                    y_train.iloc[idxV],
                    num_boost_round=best_iteration,
                    early_stopping_rounds=c.train.early_stopping_rounds,
                    fold=fold)
                oof[idxV] = model.predict(X_train[c.cols].iloc[idxV])
                preds += model.predict(X_test[c.cols]) / skf.n_splits
                del model

            logger.info(f'OOF cv= {roc_auc_score(y_train, oof)}')
            # The path is recorded in the result, but nothing in this
            # function writes an importance file to it.
            paths.importance_path = f'feature/importance/importance_{c.runtime.version}{dsize}.csv'

    with blocktimer('Predict', level=INFO):
        if c.train.predict:
            if preds is None:
                # Previously surfaced as a NameError on ``preds``.
                raise RuntimeError(
                    'c.train.predict is set but c.train.train_model is not: '
                    'no test predictions are available to write')
            sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
            sub['TransactionID'] = X_test.reset_index()['TransactionID']
            sub['isFraud'] = preds
            paths.out_sub_path = f'data/submission/submission_{c.runtime.version}{dsize}.csv'
            sub.to_csv(paths.out_sub_path, index=False)

    result.scores = scores
    result.paths = paths
    return result