preds_array = np.array(p.to_dataframe(), dtype=float) return preds_array if __name__ == '__main__': train, labels, test, _, _ = utils.load_data() clf = XGBoost(max_iterations=4800, max_depth=12, min_child_weight=4.9208250938262745, row_subsample=.9134478530382129, min_loss_reduction=.5132278416508804, column_subsample=.730128689911957, step_size=.009) if MODE == 'cv': scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) print ('CV:', scores, 'Mean log loss:', np.mean(scores)) utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) elif MODE == 'submission': clf.fit(train, labels) predictions = clf.predict_proba(test) utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), predictions) elif MODE == 'holdout': train, labels, _, _ = utils.stratified_split(train, labels, test_size=.7) score = utils.hold_out_evaluation(clf, train, labels, calibrate=False) print ('Log loss:', score) elif MODE == 'tune': # Objective function def objective(args): max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample = args clf = XGBoost(max_depth=max_depth, min_child_weight=min_child_weight,
# Fetch the data set; the last two values returned by the loader are unused here.
train, labels, test, _, _ = utils.load_data()

# Re-weight the features with TF-IDF (the inputs are presumably raw counts --
# confirm against utils.load_data). IDF smoothing is disabled. The transformer
# is fitted on the training matrix only and then applied as-is to the test
# matrix; both results are densified with .toarray().
tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False)
train = tfidf.fit_transform(train).toarray()
test = tfidf.transform(test).toarray()

# Replace the original label values with the encoder's integer codes.
lbl_enc = preprocessing.LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# Extra-trees configuration, gathered in one place so it is easy to scan.
# random_state is fixed so repeated runs build the same forest.
extra_trees_params = dict(
    n_jobs=4,
    n_estimators=2000,
    max_features=20,
    min_samples_split=3,
    bootstrap=False,
    verbose=3,
    random_state=23,
)
clf = ensemble.ExtraTreesClassifier(**extra_trees_params)

# MODE selects exactly one of the mutually exclusive workflows below.
if MODE == 'holdout':
    # Single hold-out evaluation (no calibration); report the score.
    score = utils.hold_out_evaluation(clf, train, labels, calibrate=False)
    print('Log loss:', score)
elif MODE == 'submission':
    # Fit on the full training set and write class probabilities for the
    # test set next to the other ensemble members.
    clf.fit(train, labels)
    predictions = clf.predict_proba(test)
    submission_file = os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv')
    utils.save_submission(
        consts.DATA_SAMPLE_SUBMISSION_PATH,
        submission_file,
        predictions,
    )
elif MODE == 'cv':
    # Cross-validated run (no calibration); the returned predictions are
    # stored under the blend directory for later use.
    scores, predictions = utils.make_blender_cv(
        clf, train, labels, calibrate=False
    )
    mean_log_loss = np.mean(scores)
    print('CV:', scores, 'Mean log loss:', mean_log_loss)
    blend_file = MODEL_NAME + '.csv'
    utils.write_blender_data(consts.BLEND_PATH, blend_file, predictions)
else:
    print('Unknown mode')