Пример #1
0
def main():
    """Tune the configured models on the preprocessed training data.

    Steps, in order:

    1. Resolve the trial folder and derive the per-trial output paths.
    2. Load the preprocessed training frame from ``c.PATH_TRAIN_PRP`` and
       split it into the target ``y`` and the features ``X``.
    3. When ``c.USE_SELECTED_FEATURES`` is set, keep only the columns named
       in the header of the selected-features CSV.
    4. Run ``tuneup`` and plot a cross-validated ROC curve for the model it
       returns.

    Raises:
        FileNotFoundError: If ``c.USE_SELECTED_FEATURES`` is set but the
            selected-features file is missing from the trial folder.
    """
    path_trial_folder_ulid = utl.get_path_trial_folder_ulid(c.CREATE_NEW)
    print('Trial:', path_trial_folder_ulid)

    # Mutable Paths (all of them live under the trial folder)
    path_selected_features = path_trial_folder_ulid + c.SELECTED_FEATURES
    path_log_tuneup = path_trial_folder_ulid + c.LOG_TUNEUP_FOR_SLC
    path_model = path_trial_folder_ulid + c.MODEL_SLC
    path_model_params = path_trial_folder_ulid + c.MODEL_PARAMS_SLC
    path_roc_curve = path_trial_folder_ulid + c.ROC_CURVE_SLC

    # Loading preprocessed data
    train = joblib.load(c.PATH_TRAIN_PRP)
    y = train[c.TARGET_COLUMN]
    X = utl.except_for(train, c.TARGET_COLUMN)

    if c.USE_SELECTED_FEATURES:
        if not utl.exists(path_selected_features):
            # Name the missing file so the failure is actionable.
            # FileNotFoundError is still an Exception subclass, so callers
            # that catch Exception keep working.
            raise FileNotFoundError(
                "The file doesn't exist: {}".format(path_selected_features))

        # Iterating a DataFrame yields its column labels, so this is the list
        # of feature names stored in the CSV header.
        selected_features = list(pd.read_csv(path_selected_features))
        X = X[selected_features]

    with utl.timer('tuneup'):
        best_model = tuneup(models=c.MODEL_LIST,
                            params=c.MODEL_PARAMS,
                            X=X,
                            y=y,
                            scoring=c.SCORING,
                            direction=c.DIRECTION,
                            cv=c.CV,
                            n_splits=c.N_CV_SPLITS,
                            random_state=c.SEED,
                            n_trials=c.N_TRIALS_TUNE,
                            timeout=c.TIMEOUT,
                            n_jobs=-1,
                            path_model=path_model,
                            path_model_params=path_model_params,
                            path_log_tuneup=path_log_tuneup)

    # Plot ROC curve
    with utl.timer('Plot ROC curve'):
        viz.plot_roc_curve_with_cv(best_model,
                                   X,
                                   y,
                                   cv=c.CV,
                                   n_splits=c.N_CV_SPLITS,
                                   test_size_ratio=c.TEST_SIZE_RATIO,
                                   savepath=path_roc_curve)
Пример #2
0
def main():
    """Profile the raw data, preprocess train and test, then profile again.

    Reads the raw CSVs from ``c.PATH_TRAIN`` / ``c.PATH_TEST``, prints the
    class counts of the target column, runs ``Preprocessor.exe`` on the
    training frame and ``Preprocessor.exe_test`` on the test frame, and
    creates a profile report for each of the four frames whose report path
    ``utl.exists`` does not already report.
    """
    def _profile_unless_present(frame, label, savepath):
        # Skip the work when utl.exists already reports the report path.
        if utl.exists(savepath):
            return
        with utl.timer(label):
            viz.create_profile(frame, savepath=savepath)

    # Loading data
    raw_train = pd.read_csv(c.PATH_TRAIN)
    raw_features = utl.except_for(raw_train, c.TARGET_COLUMN)
    raw_test = pd.read_csv(c.PATH_TEST)

    # Print the per-class counts to reveal an imbalanced target
    target = raw_train[c.TARGET_COLUMN]
    print('class balance\n', target.value_counts())

    # Profiles of the raw data
    _profile_unless_present(
        raw_features, 'Create report of train', c.PATH_PROFILE_REPORT_TRAIN)
    _profile_unless_present(
        raw_test, 'Create report of test', c.PATH_PROFILE_REPORT_TEST)

    preprocessor = Preprocessor()
    preprocessor.set_scaler(method=c.SCALING)

    with utl.timer('Preprocessing train'):
        processed_train = preprocessor.exe(
            df=raw_train,
            encoder=c.CAT_ENCODER,
            exclusive_features=c.EXCLUSIVE_FEATURES,
            dropped_features=c.DISCARDED_FEATURES,
            thresh_nan_ratio_per_col=c.THRESH_NAN_RATIO_PER_COL,
            thresh_corr=c.THRESH_CORR,
            alt_num=c.ALT_NUM,
            alt_cat=c.ALT_CAT,
            path_train_prp=c.PATH_TRAIN_PRP)

        processed_target = processed_train[c.TARGET_COLUMN]
        processed_features = utl.except_for(processed_train, c.TARGET_COLUMN)

        print('X_prp.shape:', processed_features.shape)
        print('y_prp.shape:', processed_target.shape)

    _profile_unless_present(
        processed_features,
        'Create report of train_prp',
        c.PATH_PROFILE_REPORT_TRAIN_PRP)

    with utl.timer('Preprocessing test'):
        processed_test = preprocessor.exe_test(
            X=raw_test,
            y=None,
            exclusive_features=c.EXCLUSIVE_FEATURES,
            alt_num=c.ALT_NUM,
            alt_cat=c.ALT_CAT,
            path_test_prp=c.PATH_TEST_PRP)

        print('X_test_prp.shape:', processed_test.shape)

    _profile_unless_present(
        processed_test,
        'Create report of test_prp',
        c.PATH_PROFILE_REPORT_TEST_PRP)
Пример #3
0
                            params=c.MODEL_PARAMS,
                            X=X,
                            y=y,
                            scoring=c.SCORING,
                            direction=c.DIRECTION,
                            cv=c.CV,
                            n_splits=c.N_CV_SPLITS,
                            random_state=c.SEED,
                            n_trials=c.N_TRIALS_TUNE,
                            timeout=c.TIMEOUT,
                            n_jobs=-1,
                            path_model=path_model,
                            path_model_params=path_model_params,
                            path_log_tuneup=path_log_tuneup)
    """
    Plot ROC curve
    """
    with utl.timer('Plot ROC curve'):
        viz.plot_roc_curve_with_cv(best_model,
                                   X,
                                   y,
                                   cv=c.CV,
                                   n_splits=c.N_CV_SPLITS,
                                   test_size_ratio=c.TEST_SIZE_RATIO,
                                   savepath=path_roc_curve)


# Script entry point: run main() inside the utl.timer('Train') context
# (presumably reports the elapsed time under that label — confirm in utl).
if __name__ == '__main__':
    with utl.timer('Train'):
        main()
Пример #4
0
    # Features are every column of `train` except the target; the
    # preprocessed test set is loaded from c.PATH_TEST_PRP.
    X = utl.except_for(train, c.TARGET_COLUMN)
    X_test = joblib.load(c.PATH_TEST_PRP)

    if c.USE_SELECTED_FEATURES:
        # Iterating a DataFrame yields its column labels, so this is the list
        # of feature names stored in the CSV header.
        selected_features = list(pd.read_csv(path_selected_features))
        X = X[selected_features]
        X_test = X_test[selected_features]
    """
    2. Predict
    """
    model.fit(X, y)

    # NOTE(review): predict_proba conventionally returns one column per class;
    # confirm y_pred is 1-D before it is used as a single DataFrame column.
    y_pred = model.predict_proba(X_test) if c.USE_PREDICT_PROBA\
        else model.predict(X_test)

    # NOTE(review): when X_test was restricted to selected_features above,
    # c.TEST_ID must be one of those columns for this lookup to work — confirm.
    df_prediction = pd.DataFrame({
        c.TEST_ID: X_test[c.TEST_ID],
        c.TARGET_COLUMN: y_pred
    })
    print('Prediction\n', df_prediction.head())

    # 'utf-8-sig' writes a UTF-8 BOM at the start of the file.
    # NOTE(review): pandas recommends opening a handle passed to to_csv with
    # newline='' — confirm the line endings of the output on Windows.
    with open(path_submit, 'w', encoding='utf-8-sig') as f:
        df_prediction.to_csv(f, index=False)


# !kaggle competitions submit -c titanic -f ML/Kaggle/Titanic/trial_models/01DXQSYFD4W2NX064GHV38N53Z/submit.csv -m "LogisticRegression"

# Script entry point: run main() inside the utl.timer('Test') context
# (presumably reports the elapsed time under that label — confirm in utl).
if __name__ == '__main__':
    with utl.timer('Test'):
        main()
Пример #5
0
        # Split the preprocessed training frame into target and features.
        y_prp = train_prp[c.TARGET_COLUMN]
        X_prp = utl.except_for(train_prp, c.TARGET_COLUMN)

        print('X_prp.shape:', X_prp.shape)
        print('y_prp.shape:', y_prp.shape)

    # Create the profile of the preprocessed training features only when
    # utl.exists does not already report the report path.
    if not utl.exists(c.PATH_PROFILE_REPORT_TRAIN_PRP):
        with utl.timer('Create report of train_prp'):
            viz.create_profile(X_prp, savepath=c.PATH_PROFILE_REPORT_TRAIN_PRP)

    # The test set goes through prp.exe_test with no target (y=None).
    with utl.timer('Preprocessing test'):
        X_test_prp = prp.exe_test(
            X=X_test,
            y=None,
            exclusive_features=c.EXCLUSIVE_FEATURES,
            alt_num=c.ALT_NUM, alt_cat=c.ALT_CAT,
            path_test_prp=c.PATH_TEST_PRP)

        print('X_test_prp.shape:', X_test_prp.shape)

    # Same skip-if-present rule for the profile of the preprocessed test set.
    if not utl.exists(c.PATH_PROFILE_REPORT_TEST_PRP):
        with utl.timer('Create report of test_prp'):
            viz.create_profile(
                X_test_prp, savepath=c.PATH_PROFILE_REPORT_TEST_PRP)


# Script entry point: run main() inside the utl.timer('Preprocess') context
# (presumably reports the elapsed time under that label — confirm in utl).
if __name__ == '__main__':
    with utl.timer('Preprocess'):
        main()
Пример #6
0
    # Resolve the trial folder; c.CREATE_NEW is forwarded to the helper.
    path_trial_folder_ulid = utl.get_path_trial_folder_ulid(c.CREATE_NEW)
    print('Trial:', path_trial_folder_ulid)

    # Mutable Paths
    path_selected_features = path_trial_folder_ulid + c.SELECTED_FEATURES

    # Loading preprocessed data
    train = joblib.load(c.PATH_TRAIN_PRP)
    y = train[c.TARGET_COLUMN]
    X = utl.except_for(train, c.TARGET_COLUMN)

    # NOTE(review): mkdir runs only when exists_dir is truthy. If exists_dir
    # returns True for an existing directory, this condition looks inverted
    # (expected `if not utl.exists_dir(...)`) — confirm against utl.
    if utl.exists_dir(c.PATH_LOG_FOLDER):
        utl.mkdir(c.PATH_LOG_FOLDER)

    model = ModelFactory(name=c.MODEL_NAME, params=c.MODEL_PARAMS).model

    # The selected features are returned here and also handed a save path
    # inside the trial folder. The "FIEATURES" spelling matches the constant
    # names referenced on `c`.
    selected_features\
        = select_features_by_rfe(
            model=model, X=X, y=y,
            ratio_max_n_features=c.RATIO_MAX_N_FEATURES,
            path_selected_features=path_selected_features,
            path_study_name_opt_features=c.STUDY_NAME_OPT_FIEATURES,
            path_optuna_storage_opt_features=c.PATH_OPTUNA_STORAGE_OPT_FIEATURES)

    print(selected_features)


# Script entry point: run main() inside the utl.timer('Select feature')
# context (presumably reports the elapsed time under that label — confirm).
if __name__ == '__main__':
    with utl.timer('Select feature'):
        main()