# Fit the training data to the Survived labels and create the decision trees forest = forest.fit(train_x,train_y) #find training and cv error trainpred = forest.predict(train_x).astype(int) cvpred = forest.predict(cv_x).astype(int) terr = 1-np.sum(trainpred == train_y)/trainpred.shape[0] cverr = 1-np.sum(cvpred == cv_y)/cvpred.shape[0] # Take the same decision trees and run it on the test data output = forest.predict(test_x).astype(int) return terr,cverr,output #load data and seperate into train and cv data_x, data_y, test_x, headings, submission = LoadData.loadcleandata() fraction = 0.66 ###MAKE PREDICTIONS FOR SUBMISSION############### ##nummodels = 100 ##predictions = np.zeros((test_x.shape[0],nummodels)) ##for i in range(nummodels): ## rseed = np.random.randint(1) ## train_x,cv_x,train_y,cv_y = sklearn.cross_validation.train_test_split(data_x,data_y,train_size=int(fraction*data_x.shape[0]),random_state=rseed) ## #select important features using randomized logreg #### rlrtrain_x,rlrcv_x,rlrtest_x = randomlr(train_x,train_y,cv_x,test_x,regp=1,alpha=0.5) #### terr,cverr,testpred = forestit(rlrtrain_x,train_y,rlrcv_x,cv_y,rlrtest_x,n_est=50) ## #train and predict ## terr,cverr,testpred = forestit(train_x,train_y,cv_x,cv_y,test_x,n_est=100) ## predictions[:,i] = testpred