# Script fragment: shuffle the data, split it, train a boosted decision tree,
# report its accuracy on the held-out rows, then load the challenge data.
# X, y, idx and nTrain are defined outside this fragment.
# NOTE(review): nTrain is used as a slice bound, so it needs to be an integer
# on current NumPy -- confirm where it is computed.

# Reorder samples and labels with the same shuffled index array.
np.random.shuffle(idx)
X = X[idx]
y = y[idx]

# Split the data: the first nTrain rows are used for fitting, the rest are held out.
Xtrain = X[:nTrain, :]
ytrain = y[:nTrain]
Xtest = X[nTrain:, :]
ytest = y[nTrain:]

# Plain decision-tree baseline (disabled in this variant of the script).
#modelDT = DecisionTreeClassifier()
#modelDT.fit(Xtrain,ytrain)

# Train the boosted decision tree.
modelBoostedDT = BoostedDT(numBoostingIters=100, maxTreeDepth=2)
modelBoostedDT.fit(Xtrain, ytrain)

# Output predictions on the held-out data.
#ypred_DT = modelDT.predict(Xtest)
ypred_BoostedDT = modelBoostedDT.predict(Xtest)

# Compute the accuracy on the held-out split (ytest), not on the training rows.
#accuracyDT = accuracy_score(ytest, ypred_DT)
accuracyBoostedDT = accuracy_score(ytest, ypred_BoostedDT)

# A parenthesised single-argument print gives the same output on Python 2 and 3.
#print("Decision Tree Accuracy = " + str(accuracyDT))
print("Boosted Decision Tree Accuracy = " + str(accuracyBoostedDT))

# Challenge data.
challengeTrainingData = np.loadtxt('data/challengeTrainLabeled.dat', delimiter=',')
# Script fragment: seeded shuffle, split, train a decision tree and a boosted
# decision tree, then print predictions and held-out accuracy for both.
# X, y, idx and nTrain are defined outside this fragment.
# NOTE(review): nTrain is used as a slice bound, so it needs to be an integer
# on current NumPy -- confirm where it is computed.

# Fix the seed so the shuffle (and therefore the split) is reproducible.
np.random.seed(13)
np.random.shuffle(idx)
X = X[idx]
y = y[idx]

# Split the data: the first nTrain rows are used for fitting, the rest are held out.
Xtrain = X[:nTrain, :]
ytrain = y[:nTrain]
Xtest = X[nTrain:, :]
ytest = y[nTrain:]

# Debug output of the held-out labels.
# Parenthesised single-argument prints give the same output on Python 2 and 3.
print(ytest)
# print(Xtrain)

# Train the decision tree.
modelDT = DecisionTreeClassifier()
modelDT.fit(Xtrain, ytrain)

# Train the boosted decision tree.
modelBoostedDT = BoostedDT(numBoostingIters=100, maxTreeDepth=2)
modelBoostedDT.fit(Xtrain, ytrain)

# Output predictions on the held-out data.
ypred_DT = modelDT.predict(Xtest)
ypred_BoostedDT = modelBoostedDT.predict(Xtest)

# Compute the accuracy on the held-out split (ytest), not on the training rows.
accuracyDT = accuracy_score(ytest, ypred_DT)
accuracyBoostedDT = accuracy_score(ytest, ypred_BoostedDT)

# Boosted predictions as one comma-separated line of integers.
print(','.join(str(e) for e in ypred_BoostedDT.astype(int)))
print("Decision Tree Accuracy = " + str(accuracyDT))
print("Boosted Decision Tree Accuracy = " + str(accuracyBoostedDT))
# Script fragment: after reordering and splitting the data, fit several
# classifiers (decision tree, BoostedDT, AdaBoost, SVC) on the training rows,
# cross-validate AdaBoost on the full data and fit a larger BoostedDT on all rows.
# X, y, idx, n and nTrain are defined outside this fragment.

# Reorder samples and labels with the same index array.
X = X[idx]
y = y[idx]

# Split the data: the first nTrain rows are used for fitting, the rest are held out.
Xtrain = X[:nTrain, :]
ytrain = y[:nTrain]
Xtest = X[nTrain:, :]
ytest = y[nTrain:]

# Train the decision tree.
modelDT = DecisionTreeClassifier()
modelDT.fit(Xtrain, ytrain)
#print ypred_DT

# Build the boosted DT and an AdaBoost model over depth-2 trees.
modelBoostedDT = BoostedDT(numBoostingIters=100, maxTreeDepth=3)
model = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=2),
    n_estimators=100,
    random_state=13
)

# 2-fold cross-validation of the AdaBoost model on the full X, y.
kfold = cross_validation.KFold(n=n, n_folds=2, random_state=13)
results = cross_validation.cross_val_score(model, X, y, cv=kfold)

# Fit both boosted models on the training rows.
modelBoostedDT.fit(Xtrain, ytrain)
model.fit(Xtrain, ytrain)

# SVC with default settings, evaluated on the held-out rows.
clf = SVC()
clf.fit(Xtrain, ytrain)
y_pred_rbf1 = clf.predict(Xtest)

# Score accumulators, filled by code outside this fragment.
scores = []
scores_rbf = []

# Larger boosted DT fitted on every row (no held-out data).
k_model = BoostedDT(numBoostingIters=140, maxTreeDepth=5)
k_model.fit(X, y)
# Script fragment: seeded shuffle, split, train a decision tree and a boosted
# decision tree, then print the held-out accuracy of both.
# X, y, n and nTrain are defined outside this fragment.
# NOTE(review): nTrain is used as a slice bound, so it needs to be an integer
# on current NumPy -- confirm where it is computed.

# Build a row-index array and shuffle it with a fixed seed for reproducibility.
idx = np.arange(n)
np.random.seed(13)
np.random.shuffle(idx)
X = X[idx]
y = y[idx]

# Split the data: the first nTrain rows are used for fitting, the rest are held out.
Xtrain = X[:nTrain, :]
ytrain = y[:nTrain]
Xtest = X[nTrain:, :]
ytest = y[nTrain:]

# Train the decision tree.
modelDT = DecisionTreeClassifier()
modelDT.fit(Xtrain, ytrain)

# Train the boosted decision tree.
modelBoostedDT = BoostedDT(numBoostingIters=200, maxTreeDepth=3)
modelBoostedDT.fit(Xtrain, ytrain)

# Output predictions on the held-out data.
ypred_DT = modelDT.predict(Xtest)
ypred_BoostedDT = modelBoostedDT.predict(Xtest)

# Compute the accuracy on the held-out split (ytest), not on the training rows.
accuracyDT = accuracy_score(ytest, ypred_DT)
accuracyBoostedDT = accuracy_score(ytest, ypred_BoostedDT)

# Parenthesised single-argument prints give the same output on Python 2 and 3.
print("Decision Tree Accuracy = " + str(accuracyDT))
print("Boosted Decision Tree Accuracy = " + str(accuracyBoostedDT))
# Script fragment: loads the unlabeled challenge set from
# 'data/challengeTestUnlabeled.dat', fits BoostedDT(numBoostingIters=500,
# maxTreeDepth=6) on the full X, y (both defined outside this fragment; the
# train/test split is commented out), predicts labels for the unlabeled rows and
# opens 'data/predictions-BoostedDT.dat' for writing.
# NOTE(review): the statements below have lost their line breaks, and the trailing
# `for ...: if i == 0:` loop is cut off here, so the code is left untouched.
# NOTE(review): `f` is opened without a `with` block -- confirm it is closed later.
# NOTE(review): `xrange` and the commented-out `print` statement are Python 2 only.
filename = 'data/challengeTestUnlabeled.dat' X2 = np.loadtxt(filename, delimiter=',') #n,d = X.shape #nTrain = 0.50*n #training on 50% of the data # split the data #Xtrain = X[:nTrain,:] #ytrain = y[:nTrain] #Xtest = X[nTrain:,:] #ytest = y[nTrain:] Xtest_unlabeled = X2 # train the boosted DT modelBoostedDT = BoostedDT(numBoostingIters=500, maxTreeDepth=6) modelBoostedDT.fit(X,y) # output predictions on the remaining data #ypred_BoostedDT = modelBoostedDT.predict(X) #accuracyBoostedDT = accuracy_score(y, ypred_BoostedDT) #print 'Calculated boostedDT accuracy: ',accuracyBoostedDT #output predictions for unlabeled data ypred_unlabeled_BoostedDT = modelBoostedDT.predict(Xtest_unlabeled) f = open('data/predictions-BoostedDT.dat', 'w') predict_string = '' for i in xrange(len(ypred_unlabeled_BoostedDT)): if i == 0:
# Script fragment: grid search over BoostedDT hyper-parameters. After a seeded
# shuffle of Xdata/ydata it loops over numBoostingIters in [100, 1000, 10000] and
# maxTreeDepth in [1, 2, 3]; for each pair it fits on every KFold training split,
# scores the matching test split with accuracy_score and stores the mean, giving
# one inner list per iteration count in boost_iter_list.
# NOTE(review): the line breaks and indentation were lost; the loop nesting
# described above is reconstructed from the statement order -- confirm.
# NOTE(review): KFold is given a hard-coded 2000 rather than n -- confirm that
# Xdata always has exactly 2000 rows.
# NOTE(review): nTrain = 0.5*n is a float and is not used in this fragment.
# NOTE(review): the values recorded in the trailing string look like error rates
# rather than accuracies -- confirm which run produced them. The string is not
# closed within this fragment, so the code is left untouched.
n,d = Xdata.shape nTrain = 0.5*n idx = np.arange(n) np.random.seed(22) np.random.shuffle(idx) Xdata = Xdata[idx] ydata = ydata[idx] boost_iter_list = [] for i in [100,1000,10000]: depth_list = [] for j in [1,2,3]: test_accuracy_list = [] modelBoostedDT = BoostedDT(numBoostingIters=i, maxTreeDepth=j) kf = KFold(2000, n_folds=10) for train_index, test_index in kf: Xtrain, Xtest = Xdata[train_index], Xdata[test_index] ytrain, ytest = ydata[train_index], ydata[test_index] modelBoostedDT.fit(Xtrain,ytrain) test_ypred_BoostedDT = modelBoostedDT.predict(Xtest) test_accuracyBoostedDT = accuracy_score(ytest, test_ypred_BoostedDT) test_accuracy_list.append(test_accuracyBoostedDT) depth_list.append(np.mean(test_accuracy_list)) boost_iter_list.append(depth_list) ''' output [[0.031000000000000007, 0.016499999999999997, 0.017000000000000005], [0.027000000000000003, 0.019000000000000003, 0.013500000000000002], [0.026500000000000003, 0.020500000000000001, 0.013000000000000001]]
# Script fragment: shuffle, split, train a decision tree and a boosted decision
# tree, print the held-out accuracy of both, then create a fresh BoostedDT for the
# challenge data. X, y, idx and nTrain are defined outside this fragment.
# NOTE(review): nTrain is used as a slice bound, so it needs to be an integer
# on current NumPy -- confirm where it is computed.

# Reorder samples and labels with the same shuffled index array.
np.random.shuffle(idx)
X = X[idx]
y = y[idx]

# Split the data: the first nTrain rows are used for fitting, the rest are held out.
Xtrain = X[:nTrain, :]
ytrain = y[:nTrain]
Xtest = X[nTrain:, :]
ytest = y[nTrain:]

# Train the decision tree.
modelDT = DecisionTreeClassifier()
modelDT.fit(Xtrain, ytrain)

# Train the boosted decision tree.
modelBoostedDT = BoostedDT(numBoostingIters=100, maxTreeDepth=2)
modelBoostedDT.fit(Xtrain, ytrain)

# Output predictions on the held-out data.
ypred_DT = modelDT.predict(Xtest)
ypred_BoostedDT = modelBoostedDT.predict(Xtest)

# Compute the accuracy on the held-out split (ytest), not on the training rows.
accuracyDT = accuracy_score(ytest, ypred_DT)
accuracyBoostedDT = accuracy_score(ytest, ypred_BoostedDT)

# Parenthesised single-argument prints give the same output on Python 2 and 3.
print("Decision Tree Accuracy = " + str(accuracyDT))
print("Boosted Decision Tree Accuracy = " + str(accuracyBoostedDT))

# predict data/challengeTrainLabeled.dat
newBoosted = BoostedDT(numBoostingIters=100, maxTreeDepth=2)