예제 #1
0
def main():
    """Train a soft-voting ensemble and write its evaluation predictions.

    Steps, in order:

    1. Load the training and evaluation frames with ``read_challenge_data``
       and pass each through its processing helper.
    2. Split the processed training frame with ``split_data``.
    3. Run the baseline, then the gradient-boosting, random-forest and
       XGBoost runners, keeping the estimator each one returns.
    4. Fit a soft ``VotingClassifier`` over the three estimators and log its
       mean 10-fold stratified cross-validation score on the training split.
    5. If ``config["test"]`` is truthy, print a classification report for
       the held-out split.
    6. Predict on the processed evaluation frame and write the predictions
       to ``data/101617.txt`` with no header and no index.

    Settings come from the module-level ``config`` mapping; this function
    takes no arguments and returns ``None``.
    """
    df_train, df_evaluate = read_challenge_data()
    df_train = process_and_filter_data(df_train, config)
    df_evaluate = process_evaluate_data(df_evaluate, config)

    X_train, X_test, y_train, y_test = split_data(df_train, config)

    run_baseline(X_train, X_test, y_train, y_test, config)

    # Each runner returns a pair; only the estimator is needed here. The
    # second element (presumably that model's test predictions) is unused.
    gb_clf, _ = run_gradient_boosting_classifier(
        X_train, X_test, y_train, y_test, config)
    rf_clf, _ = run_random_forest(X_train, X_test, y_train, y_test, config)
    xgb_clf, _ = run_xgboost(X_train, X_test, y_train, y_test, config)

    # Voting classifier.
    voting_clf = VotingClassifier(
        estimators=[("rf", rf_clf), ("gb", gb_clf), ("xgb", xgb_clf)],
        voting="soft",
    ).fit(X_train, y_train)

    stratified_k_fold = StratifiedKFold(n_splits=10)
    mean_cv_score = cross_val_score(voting_clf,
                                    X_train,
                                    y_train,
                                    cv=stratified_k_fold).mean()
    LOGGER.info(
        f"Voting classifier cross validation score: {mean_cv_score}")

    if config["test"]:
        # classification_report expects (y_true, y_pred). Passing them the
        # other way round swaps each class's precision and recall and
        # computes support from the predictions instead of the true labels.
        print(classification_report(y_test, voting_clf.predict(X_test)))

    final_predictions = voting_clf.predict(df_evaluate)

    pd.DataFrame(final_predictions).to_csv("data/101617.txt",
                                           index=False,
                                           header=False)
def extract_weighted_columns(data):
    """Return a fixed subset of the columns of ``data``.

    ``data`` is indexed as ``data[:, columns]``, so it must support
    two-axis indexing with a list of column positions and have at least
    30 columns (the largest index used is 29). All rows are kept and the
    selected columns appear in ascending index order.

    NOTE(review): why these particular columns were chosen is not visible
    in this file.
    """
    selected_columns = [
        1, 2, 4, 6, 8, 9, 10,
        15, 17, 18,
        21, 25, 27, 28, 29,
    ]
    return data[:, selected_columns]


# Script entry point: load the training/testing CSVs, preprocess them, run a
# series of models on a fixed column subset, then run per-"jet_num" models and
# merge their predictions.
# NOTE(review): this block looks like an excerpt. Several names used below
# are never assigned in the visible code (see the notes further down).
if __name__ == "__main__":

    # Both files are read with dtype=str, so every field arrives as a string;
    # conversion is presumably done in helper.process_data. TODO confirm.
    training_data = genfromtxt('training.csv', dtype=str, delimiter=',')
    testing_data = genfromtxt('testing.csv', dtype=str, delimiter=',')

    # process_data returns three values, unpacked here as features, labels
    # and weights. train_weights is not used anywhere in the visible code.
    train_data, train_labels, train_weights = helper.process_data(
        training_data)
    test_data, test_labels, test_weights = helper.process_data(testing_data)

    # Same pipeline for both splits: select the fixed columns, replace
    # missing values, then normalize.
    # NOTE(review): each split is normalized by a separate call; whether the
    # test split should reuse the training statistics depends on
    # helper.normalize_data. Confirm.
    train_data_weighted = helper.normalize_data(
        helper.replace_missing_values(extract_weighted_columns(train_data)))
    test_data_weighted = helper.normalize_data(
        helper.replace_missing_values(extract_weighted_columns(test_data)))

    # Run each model on the column-subset data. The return values are
    # discarded, so any reporting must happen inside the run_* functions.
    models.run_lr(train_data_weighted, train_labels, test_data_weighted,
                  test_labels, test_weights)
    models.run_gnb(train_data_weighted, train_labels, test_data_weighted,
                   test_labels, test_weights)
    models.run_gradient_boosting(train_data_weighted, train_labels,
                                 test_data_weighted, test_labels, test_weights)
    models.run_decision_tree(train_data_weighted, train_labels,
                             test_data_weighted, test_labels, test_weights)
    models.run_xgboost(train_data_weighted, train_labels, test_data_weighted,
                       test_labels, test_weights)
    models.run_random_forest(train_data_weighted, train_labels,
                             test_data_weighted, test_labels, test_weights)
    # NOTE(review): the data_jet_num_* arrays used from here on are not
    # assigned anywhere in the visible code; the code that builds them
    # appears to be missing from this excerpt.
    preds_1 = models.run_xgboost(data_jet_num_1, data_jet_num_1_labels,
                                 data_jet_num_1_test,
                                 data_jet_num_1_test_labels,
                                 data_jet_num_1_test_weights)
    preds_2_3 = models.run_xgboost(data_jet_num_2_3, data_jet_num_2_3_labels,
                                   data_jet_num_2_3_test,
                                   data_jet_num_2_3_test_labels,
                                   data_jet_num_2_3_test_weights)
    # NOTE(review): preds_0 is not assigned in the visible code before this
    # call (its first visible assignment is the random-forest run below).
    # Unless it is defined elsewhere this raises NameError; an xgboost run
    # on the jet_num 0 subset looks to be missing. merge_jet_num is likewise
    # not defined in the visible code.
    preds_xgb = merge_jet_num(
        preds_0, preds_1, preds_2_3, data_jet_num_0_test_labels,
        data_jet_num_1_test_labels, data_jet_num_2_3_test_labels,
        data_jet_num_0_test_weights, data_jet_num_1_test_weights,
        data_jet_num_2_3_test_weights)

    # Random forest on each of the three subsets; the preds_* names are
    # reused from the xgboost runs above.
    preds_0 = models.run_random_forest(data_jet_num_0, data_jet_num_0_labels,
                                       data_jet_num_0_test,
                                       data_jet_num_0_test_labels,
                                       data_jet_num_0_test_weights)
    preds_1 = models.run_random_forest(data_jet_num_1, data_jet_num_1_labels,
                                       data_jet_num_1_test,
                                       data_jet_num_1_test_labels,
                                       data_jet_num_1_test_weights)
    preds_2_3 = models.run_random_forest(data_jet_num_2_3,
                                         data_jet_num_2_3_labels,
                                         data_jet_num_2_3_test,
                                         data_jet_num_2_3_test_labels,
                                         data_jet_num_2_3_test_weights)
    # Merge the three random-forest prediction sets with their matching test
    # labels and weights.
    preds_rf = merge_jet_num(
        preds_0, preds_1, preds_2_3, data_jet_num_0_test_labels,
        data_jet_num_1_test_labels, data_jet_num_2_3_test_labels,
        data_jet_num_0_test_weights, data_jet_num_1_test_weights,
        data_jet_num_2_3_test_weights)