def cv_model(model_list):
    """Cross-validate each (clf, name) pair, check the competition's
    agreement/correlation constraints, and dump CV / test / check
    predictions to CSV for later stacking.

    Parameters
    ----------
    model_list : iterable of (classifier, str)
        Fitted-API classifiers (must expose ``fit`` and ``predict_proba``)
        paired with a display name.  The name is only used for logging;
        output files are keyed by position ``j``.

    Side effects
    ------------
    Reads ``../data/check_agreement.csv`` and ``../data/check_correlation.csv``;
    writes up to four CSVs per model under ``../data/{cv,pred,agree,correlation}_folder/``.
    """
    print "generating cv csv files...."
    # gen_data() loads and feature-engineers train/test (defined elsewhere in this file).
    train, test = gen_data()
    label = train['signal']
    # Keep ids before delete_features() strips non-feature columns.
    train_id = train.id
    test_id = test.id
    train_del, test_del = delete_features(train), delete_features(test)
    check_agreement = pd.read_csv('../data/check_agreement.csv')
    check_correlation = pd.read_csv('../data/check_correlation.csv')
    # The check sets must get the same engineered features as train/test.
    check_agreement= add_features(check_agreement)
    check_correlation = add_features(check_correlation)
    # .as_matrix() is the (deprecated) pandas -> ndarray conversion.
    X, X_test = train_del.as_matrix(), test_del.as_matrix()
    print X.shape, X_test.shape
    # NOTE(review): old sklearn KFold signature is KFold(n, n_folds); passing the
    # label Series suggests KFold here is actually StratifiedKFold or a local
    # wrapper -- confirm against this file's imports.
    kf = KFold(label, n_folds = 4)
    for j, (clf, clf_name) in enumerate(model_list):
        print "modelling model %i ...."%j
        # Out-of-fold predictions, filled fold by fold.
        cv_train = np.zeros(len(label))
        for i, (train_fold, validate) in enumerate(kf):
            X_train, X_validate, label_train, label_validate = X[train_fold,:], X[validate,:], label[train_fold], label[validate]
            clf.fit(X_train,label_train)
            # Probability of the positive (signal) class for the held-out fold.
            cv_train[validate] = clf.predict_proba(X_validate)[:,1]
        # Score only events with min_ANNmuon > 0.4, mirroring the competition metric.
        auc_score = evaluation.roc_auc_truncated(label[train['min_ANNmuon'] > 0.4], pd.Series(cv_train)[train['min_ANNmuon'] > 0.4])
        print "the true roc_auc_truncated is %.6f"%auc_score
        # Refit on the full training set before predicting the test set.
        clf.fit(X, label)
        test_probs = clf.predict_proba(X_test)[:,1]
        # check if it passes the tests
        print "check if it passes the tests"
        agreement_probs = clf.predict_proba(delete_features(check_agreement).as_matrix())[:,1]
        # Kolmogorov-Smirnov distance between signal/background prediction
        # distributions; the competition requires ks <= 0.09.
        ks = evaluation.compute_ks(
            agreement_probs[check_agreement['signal'].values == 0],
            agreement_probs[check_agreement['signal'].values == 1],
            check_agreement[check_agreement['signal'] == 0]['weight'].values,
            check_agreement[check_agreement['signal'] == 1]['weight'].values)
        print ('KS metric', ks, ks <= 0.09)
        correlation_probs = clf.predict_proba(delete_features(check_correlation).as_matrix())[:,1]
        print ('Checking correlation...')
        # Cramer-von Mises test against mass; the competition requires cvm <= 0.002.
        cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
        print ('CvM metric', cvm, cvm <= 0.002)
        #if ks <= 0.09 and cvm <= 0.002 and auc_score > 0.975: # no need to check here
        # Only persist models clearing the minimum AUC bar; KS/CvM are logged
        # above but deliberately not enforced here.
        if auc_score > 0.965: # the minimum threshold
            # save the cv
            cv_sub = pd.DataFrame({"id": train_id, "prediction": cv_train, "label": label})
            cv_sub.to_csv("../data/cv_folder/xgb%i.csv"%j, index=False)
            # save the prediction
            submission = pd.DataFrame({"id": test_id, "prediction": test_probs})
            submission.to_csv("../data/pred_folder/xgb%i.csv"%j, index=False)
            # save agreement
            submission = pd.DataFrame({"id": check_agreement['id'], "prediction": agreement_probs})
            submission.to_csv("../data/agree_folder/xgb%i.csv"%j, index=False)
            # save correlation
            submission = pd.DataFrame({"id": check_correlation['id'], "prediction": correlation_probs})
            submission.to_csv("../data/correlation_folder/xgb%i.csv"%j, index=False)
def gen_data(): path = '../data/' print "loading data..." train = pd.read_csv(path + "training.csv") test = pd.read_csv(path + "test.csv") train, test = add_features(train), add_features(test) return train, test
print 'ANN Score= %s' % (score6) print 'LR + GB + ANN Score = %s' % (score7) print 'ADA Score = %s' % (score8) print 'GB + ANN + ADA Score = %s' % (score9) print 'LR + GB + ANN + ADA Score = %s' % (score10) return blend_train, Y_dev, blend_test, Y_test # average of ADA, ANN and GBM. return (Y_test_predict + Y_test_predict2 + Y_test_predict3 + Y_test_predict4) / 4.0 train = pd.read_csv('../input/training.csv') test = pd.read_csv('../input/test.csv') train = add_features(train) test = add_features(test) # add SPDHITS back... filter_out = [ 'id', 'min_ANNmuon', 'production', 'mass', 'signal', 'p0_eta', 'p1_eta', 'p2_eta', 'LifeTime', 'FlightDistanceError' ] #features = list(train.columns) features = list(f for f in train.columns if f not in filter_out) is_test = False res = stacked_models(train, features, test, is_test)
print 'Scipy Score = %s' % (score4) print 'LR + GB score = %s' % (score5) print 'ANN Score= %s' % (score6) print 'LR + GB + ANN Score = %s' % (score7) print 'ADA Score = %s' % (score8) print 'GB + ANN + ADA Score = %s' % (score9) print 'LR + GB + ANN + ADA Score = %s' % (score10) return blend_train, Y_dev, blend_test, Y_test # average of ADA, ANN and GBM. return (Y_test_predict + Y_test_predict2 + Y_test_predict3 + Y_test_predict4) / 4.0 train = pd.read_csv('../input/training.csv') test = pd.read_csv('../input/test.csv') train = add_features(train) test = add_features(test) # add SPDHITS back... filter_out = ['id', 'min_ANNmuon', 'production', 'mass', 'signal', 'p0_eta','p1_eta','p2_eta','LifeTime', 'FlightDistanceError'] #features = list(train.columns) features = list(f for f in train.columns if f not in filter_out) is_test = False res = stacked_models(train, features, test, is_test) if not is_test:
# Competition-provided scoring helpers and boosting models.
import evaluation
from sklearn.ensemble import GradientBoostingClassifier
from hep_ml.uboost import uBoostClassifier
from hep_ml.gradientboosting import UGradientBoostingClassifier,LogLossFunction
from hep_ml.losses import BinFlatnessLossFunction, KnnFlatnessLossFunction

print("Load the training/test data using pandas")
train = pd.read_csv("../input/training.csv")
test = pd.read_csv("../input/test.csv")
# Extra datasets used to validate the agreement / correlation constraints.
check_agreement = pd.read_csv('../input/check_agreement.csv')
check_correlation = pd.read_csv('../input/check_correlation.csv')

# Project-local feature engineering, applied uniformly to all four frames.
from feat import add_features
print("Adding features to both training and testing")
train = add_features(train)
test = add_features(test)
check_agreement = add_features(check_agreement)
check_correlation = add_features(check_correlation)

print("Eliminate SPDhits, which makes the agreement check fail")
# filter_out is the project-defined list of columns to exclude from modelling.
from feat import filter_out
features = list(f for f in train.columns if f not in filter_out)
# Evaluation subset: the competition metric only scores min_ANNmuon > 0.4.
train_eval = train[train['min_ANNmuon'] > 0.4]
print("features:",features)
# Min-max scaling was tried and left disabled.
#train[features] = train[features].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
#test[features] = test[features].apply(lambda x: (x - x.min()) / (x.max() - x.min()))