import numpy as np

import data_io
import utils
from sklearn import (linear_model, ensemble, preprocessing,
                     cross_validation, metrics)
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted regression trees (loss='ls' = least-squares loss).
gbr1 = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100,
                                 subsample=1.0, min_samples_split=1,
                                 min_samples_leaf=3, max_depth=15, init=None,
                                 random_state=None, max_features=None,
                                 alpha=0.9, verbose=2)

# Keep only the feature columns picked by greedy feature selection.
Xt_orig1 = Xt_orig1[:, [0, 1, 3, 5, 7, 9, 12, 13, 14, 16, 19, 21, 22, 23, 24,
                        25, 26, 30, 31, 32, 33, 34, 35, 36]]
# Alternative subset from an earlier greedy run:
#Xt_orig1 = Xt_orig1[:, [0, 1, 3, 5, 8, 9, 11, 12, 13, 16, 18, 21, 22, 23, 25,
#                        26, 27, 29, 30, 31, 32, 34, 35, 36, 37, 38]]

print "encoding original data..."
# utils.OneHotEncoder is this repo's own helper; it returns the encoded
# matrix together with the value -> column keymap.
Xt_orig, keymap = utils.OneHotEncoder(Xt_orig1)
train_orig = Xt_orig[:num_train]   # num_train is set earlier in this script
test_orig = Xt_orig[num_train:]

#print "use ACTION for data generation..."
y, X = data_io.load_data_pd('../data/train_orig.csv', use_labels=True)
_, X_test = data_io.load_data_pd('../data/test_orig.csv', use_labels=False)

# loadData is another repo helper that reads a CSV into a numpy array.
oldTest = loadData('../data/test_orig.csv')
oldTrain = loadData('../data/train_orig.csv')
oldTrain = oldTrain[1:, 1:]   # drop header row and id column
oldTest = oldTest[1:, 1:]

# print "Grouping Data"
# xd2 = utils.group_data2(oldTrain[:, :-1], degree=2)  # skip last column
# xd3 = utils.group_data2(oldTrain[:, :-1], degree=3)  # skip last column
# xtestd2 = utils.group_data2(oldTest[:, :-1], degree=2)
# xtestd3 = utils.group_data2(oldTest[:, :-1], degree=3)
# X_old = np.hstack((oldTrain, xd2, xd3))
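# The utils.OneHotEncoder used above is not sklearn's encoder: it returns
# both a sparse 0/1 matrix and a keymap. The sketch below is a hypothetical
# reconstruction of that behavior, assuming one binary indicator column per
# distinct value of each categorical column; the name _one_hot_sketch is an
# illustration, not a function from this repo.
def _one_hot_sketch(data):
    import scipy.sparse as sp
    keymap = []
    blocks = []
    for j in range(data.shape[1]):
        # Map each distinct value in column j to an indicator column index.
        uniques = sorted(set(data[:, j]))
        key = dict((v, i) for i, v in enumerate(uniques))
        keymap.append(key)
        block = sp.lil_matrix((data.shape[0], len(uniques)))
        for i, v in enumerate(data[:, j]):
            block[i, key[v]] = 1.0
        blocks.append(block)
    # One wide sparse matrix: all per-column indicator blocks side by side.
    return sp.hstack(blocks).tocsr(), keymap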
def doCV():
    SEED = 42
    rnd = np.random.RandomState(SEED)

    model_lr = linear_model.LogisticRegression(C=3)
    model_rf = ensemble.RandomForestClassifier(n_estimators=10,
                                               min_samples_split=10,
                                               compute_importances=False,
                                               n_jobs=2,
                                               random_state=rnd,
                                               verbose=2)

    print "loading data for random forest..."
    y, X = data_io.load_data_pd("train_orig.csv", use_labels=True)
    _, X_test = data_io.load_data_pd("test_orig.csv", use_labels=False)
    # getRFX / getRFX_test (defined elsewhere in this repo) build the
    # numeric feature matrices for the random forest.
    xtrain = getRFX(X)
    xtest = getRFX_test(X_test)
    xtrain = xtrain[:, 1:]   # drop the id column
    xtest = xtest[:, 1:]
    xtrain.dump("num_train.dat")
    xtest.dump("num_test.dat")
    print "dumped..!"

    print "loading data for logistic regression..."
    ysp, Xsp = data_io.load_data("train_orig.csv")
    y_testsp, X_testsp = data_io.load_data("test_orig.csv", use_labels=False)

    # === one-hot encoding === #
    # We want to encode the category IDs encountered in both the training
    # and the test set, so we fit the encoder on the two stacked together.
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((Xsp, X_testsp)))
    Xsp = encoder.transform(Xsp)            # returns a scipy.sparse matrix
    X_testsp = encoder.transform(X_testsp)

    print "starting cross validation..."
    nSamples = X.shape[0]   # number of training rows, not features
    niter = 10
    cv = cross_validation.ShuffleSplit(nSamples, n_iter=niter, test_size=0.2,
                                       random_state=rnd)
    mean_auc = 0.0
    i = 0
    for train, test in cv:
        xtrain = X.ix[train]
        ytrain = y[train]
        xtest = X.ix[test]
        ytest = y[test]

        # Both CV folds must be sliced from the encoded *training* matrix;
        # the original indexed X_testsp here, which holds rows of the test
        # file and has no labels aligned with ytest.
        xtrain_sp = Xsp[train]
        xtest_sp = Xsp[test]
        ytrainsp = ysp[train]

        xtrain = getRFX(xtrain)
        xtest = getRFX_test(xtest)
        xtrain = xtrain[:, 1:]
        xtest = xtest[:, 1:]

        print "fitting random forest...."
        model_rf.fit(xtrain, ytrain)
        preds_rf = model_rf.predict_proba(xtest)[:, 1]

        print "fitting logistic regression..."
        model_lr.fit(xtrain_sp, ytrainsp)
        preds_lr = model_lr.predict_proba(xtest_sp)[:, 1]

        # Blend the two models by averaging their fold probabilities.
        preds = [np.mean(x) for x in zip(preds_rf, preds_lr)]

        fpr, tpr, _ = metrics.roc_curve(ytest, preds)
        roc_auc = metrics.auc(fpr, tpr)
        print "AUC (fold %d/%d): %f" % (i + 1, niter, roc_auc)
        mean_auc += roc_auc
        i += 1
    print "Mean AUC: ", mean_auc / niter
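# This excerpt does not show how doCV is invoked; a conventional entry point
# (an addition for illustration, not part of the original script) would be:
if __name__ == "__main__":
    doCV()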