import sys

from scipy.sparse import hstack

# Build one feature matrix per column listed in varToClean, for both the
# training and the test frame, by passing the column's entry in `decoder`
# together with the column data through bowConverter.
resultTrain = []
resultTest = []
for col in varToClean:
    resultTrain.append(bowConverter(decoder[col], TrainData[col]))
    resultTest.append(bowConverter(decoder[col], TestData[col]))
    # Show the shape of the matrices just produced for this column.
    print(resultTrain[-1].shape)
    print(resultTest[-1].shape)

# Author's note: rf with 500 trees default setting get 60.7%

# Stack the per-column training matrices horizontally and convert the
# result to a dense array.
X = hstack(resultTrain).toarray()
CST_1 = np.array(TrainData.CST_1.tolist())
CST_2 = np.array(TrainData.CST_2.tolist())

# The model module is loaded from the HierarchicalModel/ folder under
# `filepath`, so that folder has to be on sys.path before the import.
sys.path.insert(0, filepath + 'HierarchicalModel/')
import hierarchicalModel_rfrf as myfunc

# Model: fit on the full training data.
result = myfunc.hierarchicalModel()
result.fit(X, CST_1, CST_2)

# The pickled model is 50+GB, too large to write to Dropbox (i.e. to
# filepath + 'HierarchicalModel/HierarchicalModel_rfrf.pickle'), so it is
# written to the desktop instead. The `with` block guarantees the file
# handle is flushed and closed even if pickling fails part-way.
with open('C:/Users/vichan/Desktop/HierarchicalModel_rfrf.pickle', 'wb') as modelFile:
    pickle.dump(result, modelFile)
# Stratified k-fold evaluation of the hierarchical model.
#
# Despite the "estimatedError" names, every value collected below is the
# percentage of predictions that MATCH the truth (np.mean(a == b) * 100):
#   estimatedError1 - per fold, match rate of the predicted CST_1 label
#   estimatedError2 - per fold, match rate of the combined label
#   estimatedError3 - per fold, a list with the combined-label match rate
#                     restricted to the rows predicted as each CST_1 class
sys.path.insert(0, filepath + 'HierarchicalModel/')
import hierarchicalModel_rfrf as myfunc
from sklearn.cross_validation import StratifiedKFold

estimatedError1 = []
estimatedError2 = []
estimatedError3 = []

# The set of CST_1 classes does not change between folds, so compute it once.
cst1Classes = TrainData.CST_1.unique().tolist()

# NOTE(review): no shuffling is requested here, so random_state probably has
# no effect on the folds - confirm against the installed scikit-learn version.
skf = StratifiedKFold(y=TrainData.Y, n_folds=nfold, random_state=987654)
for train, test in skf:
    trainX = X[train, :]
    trainCST_1 = CST_1[train]
    trainCST_2 = CST_2[train]
    testX = X[test, :]
    # NOTE(review): Y is not defined in this section; presumably it is the
    # array form of TrainData.Y holding labels in the same
    # 'CST_1::$!^!$::CST_2' format built below - confirm.
    testY = Y[test]

    # fit model
    clf = myfunc.hierarchicalModel()
    clf.fit(trainX, trainCST_1, trainCST_2)
    CST_1_Pred, CST_2_Pred = clf.predict(testX)
    # Combined label: the two predicted levels joined by the separator.
    predictedY = np.array(['%s::$!^!$::%s' % t
                           for t in zip(CST_1_Pred.tolist(), CST_2_Pred.tolist())])
    # Work with an ndarray from here on so the element-wise comparisons
    # below behave the same way everywhere.
    CST_1_Pred = np.asarray(CST_1_Pred)

    # test model
    error1 = np.mean(CST_1[test] == CST_1_Pred) * 100
    error2 = np.mean(testY == predictedY) * 100
    estimatedError1.append(error1)
    estimatedError2.append(error2)

    temp = []
    for k in cst1Classes:
        classMask = CST_1_Pred == k
        if classMask.any():
            temp.append(np.mean(testY[classMask] == predictedY[classMask]) * 100)
        else:
            # Class k was never predicted in this fold. Record NaN directly
            # (the value np.mean would give for an empty selection) without
            # triggering numpy's empty-slice RuntimeWarning.
            temp.append(np.nan)
    estimatedError3.append(temp)