def main():
    """Train the individual classifiers, ensemble them and write predictions.

    Pipeline, in execution order:

    1. Load the training and evaluation frames with ``read_challenge_data``
       and preprocess them with ``process_and_filter_data`` /
       ``process_evaluate_data``.
    2. Split the training frame with ``split_data``.
    3. Run the baseline, gradient boosting, random forest and XGBoost
       helpers on that split.
    4. Fit a soft-voting ensemble of the three classifiers on the training
       split and log its mean 10-fold stratified cross-validation score.
    5. When ``config["test"]`` is truthy, print a classification report for
       the held-out split.
    6. Predict on the evaluation frame and write the predictions to
       ``data/101617.txt`` (no index, no header).

    Reads the module-level ``config`` mapping and logs through ``LOGGER``.
    """
    df_train, df_evaluate = read_challenge_data()
    df_train = process_and_filter_data(df_train, config)
    df_evaluate = process_evaluate_data(df_evaluate, config)

    X_train, X_test, y_train, y_test = split_data(df_train, config)

    run_baseline(X_train, X_test, y_train, y_test, config)
    # Only the estimators are needed below; the per-model predictions
    # returned alongside them are discarded.
    gb_clf, _ = run_gradient_boosting_classifier(
        X_train, X_test, y_train, y_test, config)
    rf_clf, _ = run_random_forest(X_train, X_test, y_train, y_test, config)
    xgb_clf, _ = run_xgboost(X_train, X_test, y_train, y_test, config)

    # Voting classifier.
    voting_clf = VotingClassifier(
        estimators=[("rf", rf_clf), ("gb", gb_clf), ("xgb", xgb_clf)],
        voting="soft",
    ).fit(X_train, y_train)

    # cross_val_score works on clones of the estimator, so the ensemble
    # fitted above is the one used for the predictions further down.
    stratified_k_fold = StratifiedKFold(n_splits=10)
    cross_val_score_ = cross_val_score(
        voting_clf, X_train, y_train, cv=stratified_k_fold).mean()
    LOGGER.info(
        f"Voting classifier cross validation score: {cross_val_score_}")

    if config["test"]:
        # classification_report expects (y_true, y_pred); passing them the
        # other way round swaps precision and recall in the report.
        print(classification_report(y_test, voting_clf.predict(X_test)))

    final_predictions = voting_clf.predict(df_evaluate)
    pd.DataFrame(final_predictions).to_csv(
        "data/101617.txt", index=False, header=False)
def extract_weighted_columns(data):
    """Return every row of ``data`` restricted to a fixed set of columns.

    ``data`` is indexed as ``data[:, columns]``, so it must support
    two-axis fancy indexing. The result keeps the columns in the order
    listed below.
    """
    selected_columns = [1, 2, 4, 6, 8, 9, 10, 15, 17, 18, 21, 25, 27, 28, 29]
    return data[:, selected_columns]


if __name__ == "__main__":
    # Load both CSV files as string arrays and let the helper split them
    # into data, labels and weights.
    training_data = genfromtxt('training.csv', dtype=str, delimiter=',')
    testing_data = genfromtxt('testing.csv', dtype=str, delimiter=',')

    train_data, train_labels, train_weights = helper.process_data(
        training_data)
    test_data, test_labels, test_weights = helper.process_data(testing_data)

    # Same preprocessing chain for each set: select columns, fill missing
    # values, then normalize.
    train_selected = extract_weighted_columns(train_data)
    train_filled = helper.replace_missing_values(train_selected)
    train_data_weighted = helper.normalize_data(train_filled)

    test_selected = extract_weighted_columns(test_data)
    test_filled = helper.replace_missing_values(test_selected)
    test_data_weighted = helper.normalize_data(test_filled)

    # Every model runner receives exactly the same positional arguments.
    model_args = (
        train_data_weighted,
        train_labels,
        test_data_weighted,
        test_labels,
        test_weights,
    )

    models.run_lr(*model_args)
    models.run_gnb(*model_args)
    models.run_gradient_boosting(*model_args)
    models.run_decision_tree(*model_args)
    models.run_xgboost(*model_args)
    models.run_random_forest(*model_args)
# Positional argument bundles, one per jet-number subset, passed unchanged
# to each model runner below.
jet_0_args = (
    data_jet_num_0,
    data_jet_num_0_labels,
    data_jet_num_0_test,
    data_jet_num_0_test_labels,
    data_jet_num_0_test_weights,
)
jet_1_args = (
    data_jet_num_1,
    data_jet_num_1_labels,
    data_jet_num_1_test,
    data_jet_num_1_test_labels,
    data_jet_num_1_test_weights,
)
jet_2_3_args = (
    data_jet_num_2_3,
    data_jet_num_2_3_labels,
    data_jet_num_2_3_test,
    data_jet_num_2_3_test_labels,
    data_jet_num_2_3_test_weights,
)

# Test labels followed by test weights for the three subsets, in the order
# merge_jet_num receives them after the three prediction arguments.
merge_tail_args = (
    data_jet_num_0_test_labels,
    data_jet_num_1_test_labels,
    data_jet_num_2_3_test_labels,
    data_jet_num_0_test_weights,
    data_jet_num_1_test_weights,
    data_jet_num_2_3_test_weights,
)

# XGBoost on the remaining subsets, then merge. preds_0 is whatever was
# assigned before this section (presumably the jet_num_0 XGBoost run -
# confirm against the preceding code).
preds_1 = models.run_xgboost(*jet_1_args)
preds_2_3 = models.run_xgboost(*jet_2_3_args)
preds_xgb = merge_jet_num(preds_0, preds_1, preds_2_3, *merge_tail_args)

# Random forest on all three subsets; the preds_* names are reused, which
# is why the XGBoost merge above has to happen first.
preds_0 = models.run_random_forest(*jet_0_args)
preds_1 = models.run_random_forest(*jet_1_args)
preds_2_3 = models.run_random_forest(*jet_2_3_args)
preds_rf = merge_jet_num(preds_0, preds_1, preds_2_3, *merge_tail_args)