Example #1
def do_generate_logistic_simple_model(X_train, y_train, parameters):
    model = LogisticRegression(random_state=my_constants.RANDOM_VALUE)
    model_grid = GridSearchCV(model,
                              param_grid=parameters,
                              cv=3,
                              verbose=3,
                              n_jobs=3)
    with ignore_warnings(category=ConvergenceWarning):
        model_grid.fit(X_train, y_train)
    file_operations.write_logs(FILENAME,
                               "Calculate logistic simple model " + str(model_grid))

    return model_grid
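# A minimal usage sketch: `parameters` is the grid supplied by the caller and is not
# shown in this excerpt; the keys below are ordinary LogisticRegression constructor
# arguments, chosen here for illustration only (hypothetical values).
logistic_parameters = {
    'C': [0.01, 0.1, 1.0, 10.0],
    'penalty': ['l2'],
    'solver': ['liblinear', 'lbfgs'],
}
# grid = do_generate_logistic_simple_model(X_train, y_train, logistic_parameters)
# print(grid.best_params_, grid.best_score_)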
Example #2
def do_generate_rf_optimazed_model(X_train, y_train, parameters):
    file_operations.write_logs(
        FILENAME, 'Starting RF Grid Search with parameters:' + str(parameters))
    model = RandomForestClassifier(random_state=my_constants.RANDOM_VALUE,
                                   oob_score=True)
    model_grid = GridSearchCV(model,
                              param_grid=parameters,
                              cv=3,
                              verbose=3,
                              n_jobs=3)
    with ignore_warnings(category=ConvergenceWarning):
        model_grid.fit(X_train, y_train)

    file_operations.write_logs(FILENAME, 'RF Grid search completed')

    return model_grid
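# A minimal usage sketch for the RF search; the real grid comes from the caller. The
# keys below are standard RandomForestClassifier arguments, picked here for
# illustration only (hypothetical values).
rf_parameters = {
    'n_estimators': [100, 300],
    'max_depth': [3, 5, None],
    'min_samples_leaf': [1, 5],
}
# rf_grid = do_generate_rf_optimazed_model(X_train, y_train, rf_parameters)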
Example #3
def do_generate_metrics_logistic_simple_model(X_train, y_train, X_test, y_test,
                                              grid):
    file_operations.write_logs(FILENAME,
                               "do_generate_metrics_logistic_simple_model")
    model = LogisticRegression(random_state=my_constants.RANDOM_VALUE)
    file_operations.write_logs(FILENAME, "grid Best params")
    file_operations.write_logs(FILENAME, str(grid.best_params_))

    model.set_params(**grid.best_params_)
    model.fit(X_train, y_train)
    metrics = calculate_metrics(model, X_test, y_test)
    file_operations.write_logs(
        FILENAME, 'model params:' + str(model.get_params()) + " model score:" +
        str(model.score(X_test, y_test)))
    file_operations.write_logs(
        FILENAME, 'grid.best_params_:' + str(grid.best_params_) +
        " grid.best_score_:" + str(grid.best_score_))

    return model, metrics
def do_generate_lgbm_optimazed_model(X_train, y_train, parameters):
    file_operations.write_logs(FILENAME,
                               'Starting LGBM Grid Search with parameters:')
    file_operations.write_logs(FILENAME, str(parameters))
    model = LGBMClassifier(random_state=0)
    model_grid = GridSearchCV(model,
                              param_grid=parameters,
                              cv=3,
                              verbose=3,
                              n_jobs=3)
    model_grid.fit(X_train, y_train)
    file_operations.write_logs(FILENAME, "LGBM grid search completed")
    return model_grid
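# A minimal usage sketch for the LGBM search; the real grid comes from the caller. The
# keys below are standard LGBMClassifier arguments, picked here for illustration only
# (hypothetical values).
lgbm_parameters = {
    'n_estimators': [100, 300],
    'num_leaves': [15, 31],
    'learning_rate': [0.05, 0.1],
}
# lgbm_grid = do_generate_lgbm_optimazed_model(X_train, y_train, lgbm_parameters)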
def do_generate_metrics_lgbm_optimazed_model(X_train, y_train, X_test, y_test,
                                             grid):
    file_operations.write_logs(FILENAME, "LGBM metrics calculation\n")
    model = LGBMClassifier(random_state=0)
    model.set_params(**grid.best_params_)
    model.fit(X_train, y_train)
    metrics = calculate_metrics(model, X_test, y_test)
    file_operations.write_logs(
        FILENAME, "Generated model params and results\n params:" +
        str(model.get_params()) + "\nscore " +
        str(model.score(X_test, y_test)))
    file_operations.write_logs(
        FILENAME, "Search grid best params and results\n params:" +
        str(grid.best_params_) + "\nscore " + str(grid.best_score_))

    return model, metrics
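# calculate_metrics() is defined elsewhere in the project and is not shown in this
# excerpt. Below is a hypothetical sketch of what such a helper might compute,
# assuming standard sklearn.metrics and a fitted classifier with a predict() method;
# the name and returned keys are assumptions, not the project's own implementation.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


def calculate_metrics_sketch(model, X_test, y_test):
    # Predict on the held-out split and collect the usual binary-classification scores.
    y_pred = model.predict(X_test)
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
    }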
Example #6
def do_generate_metrics_logistic_simple_model(X_train, y_train, X_test, y_test,
                                              grid):
    model = LogisticRegression(random_state=my_constants.RANDOM_VALUE)
    file_operations.write_logs(
        FILENAME,
        "Calculate logistic simple model best params " + str(grid.best_params_))
    model.set_params(**grid.best_params_)
    model.fit(X_train, y_train)
    metrics = calculate_metrics(model, X_test, y_test)
    file_operations.write_logs(
        FILENAME, "model params" + str(model.get_params()) + " scores:" +
        str(model.score(X_test, y_test)))
    file_operations.write_logs(
        FILENAME, "Grid params" + str(grid.best_params_) + " scores:" +
        str(grid.best_score_))

    return model, metrics
Example #7
def do_generate_metrics_rf_optimazed_model(X_train, y_train, X_test, y_test,
                                           grid):
    file_operations.write_logs(FILENAME, 'Starting metrics calculation')
    model = RandomForestClassifier(random_state=my_constants.RANDOM_VALUE,
                                   oob_score=True)
    model.set_params(**grid.best_params_)
    model.fit(X_train, y_train)
    metrics = calculate_metrics(model, X_test, y_test)
    file_operations.write_logs(
        FILENAME, "Generated model params and results\n params:" +
        str(model.get_params()) + "\nscore " +
        str(model.score(X_test, y_test)))
    file_operations.write_logs(
        FILENAME, "Search grid best params and results\n params:" +
        str(grid.best_params_) + "\nscore " + str(grid.best_score_))

    return model, metrics
Example #8
def predictions():
    train_df = file_operations.read_data('processed', 'train.csv',
                                         'PassengerId')
    competition_df = file_operations.read_data('processed', 'test.csv',
                                               'PassengerId')

    X = train_df.loc[:, 'Age':].values.astype('float')
    y = train_df['Survived'].ravel()
    shape = X.shape
    if shape[0] == 891 and shape[1] > 36:
        file_operations.write_logs(
            FILENAME,
            "Dataset has " + str(shape[1]) + " columns and the right number of rows")

    file_operations.write_logs(FILENAME, 'Creating test and train dataset')
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=my_constants.TEST_SIZE,
        random_state=my_constants.RANDOM_VALUE)
    # Linear base dummy model
    file_operations.write_logs(FILENAME, 'Creating linear model')
    base_model = create_base_model(X_train, y_train, X_test, y_test)
    file_operations.write_logs(FILENAME,
                               "Metrics base_model: " + str(base_model['metrics']))
    file_operations.get_submission_file(base_model['model'],
                                        '01_base_model.csv', competition_df)

    # Logistic regression model
    file_operations.write_logs(FILENAME, 'Creating logistic simple model')
    lg_simple_model = create_logistic_simple_model(X_train, y_train, X_test,
                                                   y_test)
    file_operations.write_logs(
        FILENAME, "Metrics lg_simple_model: " + str(lg_simple_model['metrics']))
    file_operations.get_submission_file(lg_simple_model['model'],
                                        '02_lg_model.csv', competition_df)

    # Logistic regression model with hyp optimization
    file_operations.write_logs(FILENAME, 'Creating logistic optimized model')
    lg_optimazed_model = create_logistic_optimazed_model(
        X_train, y_train, X_test, y_test)
    file_operations.write_logs(
        FILENAME,
        "Metrics lg_optimazed_model: " + str(lg_optimazed_model['metrics']))
    file_operations.get_submission_file(lg_optimazed_model['model'],
                                        '03_lg_model_optimized.csv',
                                        competition_df)

    # print('Creating rf  model')
    # rf_model_scaled = create_rf_optimized_model(X_train, y_train, X_test, y_test)
    # print("Metrics rf_model_scaled: ", rf_model_scaled['metrics'])
    # file_operations.get_submission_file(rf_model_scaled['model'], '04_rf_model_optimized.csv', competition_df)
    #
    # print('Creating lgbm  model')
    # lgbm_model = create_lgbm_optimized_model(X_train, y_train, X_test, y_test)
    # print("Metrics lgbm_model: ", lgbm_model['metrics'])
    # file_operations.get_submission_file(lgbm_model['model'], '04_lgbm_model_optimized.csv', competition_df)
    #
    # print('Creating SVC scaled model')
    # svc_model = create_svc_optimized_model(X_train, y_train, X_test, y_test)
    # file_operations.get_submission_file(svc_model['model'], '05_svc_model_optimized_scaled.csv', competition_df)

    # Feature standardization
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # linear base dummy model
    file_operations.write_logs(FILENAME, 'Creating dummy scaled model')
    base_model_scaled = create_base_model(X_train_scaled, y_train,
                                          X_test_scaled, y_test)
    file_operations.write_logs(
        FILENAME, "Metrics base_model_scaled: " + base_model_scaled['metrics'])
    file_operations.get_submission_file_with_standardization(
        base_model_scaled['model'], '01_base_model_scaled.csv', scaler,
        competition_df)

    # Logistic regression model
    file_operations.write_logs(FILENAME,
                               'Creating logistic simple scaled model')
    lg_simple_model_scaled = create_logistic_simple_model(
        X_train_scaled, y_train, X_test_scaled, y_test)
    file_operations.write_logs(
        FILENAME,
        "Metrics lg_simple_model_scaled: " + lg_simple_model_scaled['metrics'])
    file_operations.get_submission_file_with_standardization(
        lg_simple_model_scaled['model'], '02_lg_model_scaled.csv', scaler,
        competition_df)

    # Logistic regression model with hyp optimization
    file_operations.write_logs(FILENAME,
                               'Creating logistic optimized scaled model')
    lg_optimazed_model_scaled = create_logistic_optimazed_model(
        X_train_scaled, y_train, X_test_scaled, y_test)
    file_operations.write_logs(
        FILENAME, "Metrics lg_optimazed_model_scaled: " +
        str(lg_optimazed_model_scaled['metrics']))
    file_operations.get_submission_file_with_standardization(
        lg_optimazed_model_scaled['model'], '03_lg_model_optimized_scaled.csv',
        scaler, competition_df)
def predictions():
    train_df = file_operations.read_data('processed', 'train.csv',
                                         'PassengerId')
    competition_df = file_operations.read_data('processed', 'test.csv',
                                               'PassengerId')

    X = train_df.loc[:, 'Age':].values.astype('float')
    y = train_df['Survived'].ravel()
    shape = X.shape
    if shape[0] == 891 and shape[1] > 36:
        file_operations.write_logs(
            FILENAME,
            "Dataset has " + str(shape[1]) + " columns and the right number of rows")

    file_operations.write_logs(FILENAME, 'Creating test and train dataset')
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=my_constants.TEST_SIZE,
        random_state=my_constants.RANDOM_VALUE)
    # Linear base dummy model
    file_operations.write_logs(FILENAME, 'Creating linear model')
    base_model = create_base_model(X_train, y_train, X_test, y_test)
    file_operations.write_logs(FILENAME, "Metrics base_model: ")
    file_operations.write_logs(FILENAME, str(base_model['metrics']))
    file_operations.get_submission_file(base_model['model'],
                                        '01_base_model.csv', competition_df)

    file_operations.write_logs(FILENAME, 'Creating lgbm model')
    lgbm_model = create_lgbm_optimized_model(X_train, y_train, X_test, y_test)
    file_operations.write_logs(FILENAME, "Metrics lgbm_model: ")
    file_operations.write_logs(FILENAME, str(lgbm_model['metrics']))
    file_operations.get_submission_file(lgbm_model['model'],
                                        '04_lgbm_model_optimized.csv',
                                        competition_df)

    # Feature standardization
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    file_operations.write_logs(FILENAME, 'Creating lgbm scaled model')
    lgbm_model_scaled = create_lgbm_optimized_model(X_train_scaled, y_train,
                                                    X_test_scaled, y_test)
    file_operations.write_logs(FILENAME, "Metrics lgbm_model_scaled: ")
    file_operations.write_logs(FILENAME, str(lgbm_model_scaled['metrics']))
    file_operations.get_submission_file(lgbm_model_scaled['model'],
                                        '04_lgbm_model_optimized_scaled.csv',
                                        competition_df)
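# A minimal entry-point sketch (assumption: FILENAME, my_constants, file_operations and
# the create_* helpers are imported or defined elsewhere in this module):
if __name__ == '__main__':
    predictions()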