def main():
    """Tune the candidate models on the preprocessed training data.

    Steps, in order:
      1. Resolve the trial folder through ``utl.get_path_trial_folder_ulid``
         and derive the per-trial artifact paths from it.
      2. Load the preprocessed training frame with ``joblib`` and split it
         into the target column ``y`` and the remaining columns ``X``.
      3. When ``c.USE_SELECTED_FEATURES`` is truthy, keep only the columns
         named in the header of the selected-features CSV.
      4. Call ``tuneup`` and plot a cross-validated ROC curve for the model
         it returns.

    Raises:
        FileNotFoundError: ``c.USE_SELECTED_FEATURES`` is set but the
            selected-features file is missing from the trial folder.
    """
    path_trial_folder_ulid = utl.get_path_trial_folder_ulid(c.CREATE_NEW)
    print('Trial:', path_trial_folder_ulid)

    # Per-trial artifact paths; all of them live under the trial folder.
    path_selected_features = path_trial_folder_ulid + c.SELECTED_FEATURES
    path_log_tuneup = path_trial_folder_ulid + c.LOG_TUNEUP_FOR_SLC
    path_model = path_trial_folder_ulid + c.MODEL_SLC
    path_model_params = path_trial_folder_ulid + c.MODEL_PARAMS_SLC
    path_roc_curve = path_trial_folder_ulid + c.ROC_CURVE_SLC

    # Load the preprocessed data and separate the target from the features.
    train = joblib.load(c.PATH_TRAIN_PRP)
    y = train[c.TARGET_COLUMN]
    X = utl.except_for(train, c.TARGET_COLUMN)

    if c.USE_SELECTED_FEATURES:
        if not utl.exists(path_selected_features):
            # Name the missing file so the failure is actionable.
            # FileNotFoundError is still an Exception subclass, so callers
            # that caught the previous generic Exception keep working.
            raise FileNotFoundError(
                'Selected-features file not found: {}'.format(
                    path_selected_features))
        # Iterating a DataFrame yields its column labels, so this is the
        # CSV header row, i.e. the names of the features to keep.
        selected_features = list(pd.read_csv(path_selected_features))
        X = X[selected_features]

    with utl.timer('tuneup'):
        best_model = tuneup(
            models=c.MODEL_LIST,
            params=c.MODEL_PARAMS,
            X=X,
            y=y,
            scoring=c.SCORING,
            direction=c.DIRECTION,
            cv=c.CV,
            n_splits=c.N_CV_SPLITS,
            random_state=c.SEED,
            n_trials=c.N_TRIALS_TUNE,
            timeout=c.TIMEOUT,
            n_jobs=-1,
            path_model=path_model,
            path_model_params=path_model_params,
            path_log_tuneup=path_log_tuneup)

    # Plot ROC curve
    with utl.timer('Plot ROC curve'):
        viz.plot_roc_curve_with_cv(
            best_model,
            X,
            y,
            cv=c.CV,
            n_splits=c.N_CV_SPLITS,
            test_size_ratio=c.TEST_SIZE_RATIO,
            savepath=path_roc_curve)
def main():
    """Preprocess the raw train and test tables and profile each stage.

    Reads the train and test CSV files, prints the class counts of the
    target column, runs the ``Preprocessor`` on both tables, and prints the
    shapes of the preprocessed results. A profile report is created for the
    raw and the preprocessed version of each table, but only when no report
    exists yet at the configured path.
    """

    def _profile_if_missing(frame, report_path, timer_label):
        # An existing report is left untouched; profiling is skipped.
        if utl.exists(report_path):
            return
        with utl.timer(timer_label):
            viz.create_profile(frame, savepath=report_path)

    # Raw inputs.
    raw_train = pd.read_csv(c.PATH_TRAIN)
    raw_features = utl.except_for(raw_train, c.TARGET_COLUMN)
    raw_test = pd.read_csv(c.PATH_TEST)

    # Print how many rows each class has, to spot an imbalanced target.
    target = raw_train[c.TARGET_COLUMN]
    print('class balance\n', target.value_counts())

    # Profiles of the untouched data.
    _profile_if_missing(
        raw_features, c.PATH_PROFILE_REPORT_TRAIN, 'Create report of train')
    _profile_if_missing(
        raw_test, c.PATH_PROFILE_REPORT_TEST, 'Create report of test')

    preprocessor = Preprocessor()
    preprocessor.set_scaler(method=c.SCALING)

    # Train side.
    with utl.timer('Preprocessing train'):
        processed_train = preprocessor.exe(
            df=raw_train,
            encoder=c.CAT_ENCODER,
            exclusive_features=c.EXCLUSIVE_FEATURES,
            dropped_features=c.DISCARDED_FEATURES,
            thresh_nan_ratio_per_col=c.THRESH_NAN_RATIO_PER_COL,
            thresh_corr=c.THRESH_CORR,
            alt_num=c.ALT_NUM,
            alt_cat=c.ALT_CAT,
            path_train_prp=c.PATH_TRAIN_PRP,
        )

    processed_target = processed_train[c.TARGET_COLUMN]
    processed_features = utl.except_for(processed_train, c.TARGET_COLUMN)
    print('X_prp.shape:', processed_features.shape)
    print('y_prp.shape:', processed_target.shape)

    _profile_if_missing(
        processed_features,
        c.PATH_PROFILE_REPORT_TRAIN_PRP,
        'Create report of train_prp',
    )

    # Test side: reuses the same preprocessor instance as the train step.
    with utl.timer('Preprocessing test'):
        processed_test = preprocessor.exe_test(
            X=raw_test,
            y=None,
            exclusive_features=c.EXCLUSIVE_FEATURES,
            alt_num=c.ALT_NUM,
            alt_cat=c.ALT_CAT,
            path_test_prp=c.PATH_TEST_PRP,
        )
    print('X_test_prp.shape:', processed_test.shape)

    _profile_if_missing(
        processed_test,
        c.PATH_PROFILE_REPORT_TEST_PRP,
        'Create report of test_prp',
    )
params=c.MODEL_PARAMS, X=X, y=y, scoring=c.SCORING, direction=c.DIRECTION, cv=c.CV, n_splits=c.N_CV_SPLITS, random_state=c.SEED, n_trials=c.N_TRIALS_TUNE, timeout=c.TIMEOUT, n_jobs=-1, path_model=path_model, path_model_params=path_model_params, path_log_tuneup=path_log_tuneup) """ Plot ROC curve """ with utl.timer('Plot ROC curve'): viz.plot_roc_curve_with_cv(best_model, X, y, cv=c.CV, n_splits=c.N_CV_SPLITS, test_size_ratio=c.TEST_SIZE_RATIO, savepath=path_roc_curve) if __name__ == '__main__': with utl.timer('Train'): main()
X = utl.except_for(train, c.TARGET_COLUMN) X_test = joblib.load(c.PATH_TEST_PRP) if c.USE_SELECTED_FEATURES: selected_features = list(pd.read_csv(path_selected_features)) X = X[selected_features] X_test = X_test[selected_features] """ 2. Predict """ model.fit(X, y) y_pred = model.predict_proba(X_test) if c.USE_PREDICT_PROBA\ else model.predict(X_test) df_prediction = pd.DataFrame({ c.TEST_ID: X_test[c.TEST_ID], c.TARGET_COLUMN: y_pred }) print('Prediction\n', df_prediction.head()) with open(path_submit, 'w', encoding='utf-8-sig') as f: df_prediction.to_csv(f, index=False) # !kaggle competitions submit -c titanic -f ML/Kaggle/Titanic/trial_models/01DXQSYFD4W2NX064GHV38N53Z/submit.csv -m "LogisticRegression" if __name__ == '__main__': with utl.timer('Test'): main()
y_prp = train_prp[c.TARGET_COLUMN] X_prp = utl.except_for(train_prp, c.TARGET_COLUMN) print('X_prp.shape:', X_prp.shape) print('y_prp.shape:', y_prp.shape) if not utl.exists(c.PATH_PROFILE_REPORT_TRAIN_PRP): with utl.timer('Create report of train_prp'): viz.create_profile(X_prp, savepath=c.PATH_PROFILE_REPORT_TRAIN_PRP) with utl.timer('Preprocessing test'): X_test_prp = prp.exe_test( X=X_test, y=None, exclusive_features=c.EXCLUSIVE_FEATURES, alt_num=c.ALT_NUM, alt_cat=c.ALT_CAT, path_test_prp=c.PATH_TEST_PRP) print('X_test_prp.shape:', X_test_prp.shape) if not utl.exists(c.PATH_PROFILE_REPORT_TEST_PRP): with utl.timer('Create report of test_prp'): viz.create_profile( X_test_prp, savepath=c.PATH_PROFILE_REPORT_TEST_PRP) if __name__ == '__main__': with utl.timer('Preprocess'): main()
path_trial_folder_ulid = utl.get_path_trial_folder_ulid(c.CREATE_NEW) print('Trial:', path_trial_folder_ulid) # Mutable Paths path_selected_features = path_trial_folder_ulid + c.SELECTED_FEATURES # Loading preprocessed data train = joblib.load(c.PATH_TRAIN_PRP) y = train[c.TARGET_COLUMN] X = utl.except_for(train, c.TARGET_COLUMN) if utl.exists_dir(c.PATH_LOG_FOLDER): utl.mkdir(c.PATH_LOG_FOLDER) model = ModelFactory(name=c.MODEL_NAME, params=c.MODEL_PARAMS).model selected_features\ = select_features_by_rfe( model=model, X=X, y=y, ratio_max_n_features=c.RATIO_MAX_N_FEATURES, path_selected_features=path_selected_features, path_study_name_opt_features=c.STUDY_NAME_OPT_FIEATURES, path_optuna_storage_opt_features=c.PATH_OPTUNA_STORAGE_OPT_FIEATURES) print(selected_features) if __name__ == '__main__': with utl.timer('Select feature'): main()