def main(P, src, model):
    """Score a stored DT model on payload samples read from *src*.

    Every usable line of *src* is split on whitespace.  Field 1 is the
    protocol name ("TCP" maps to 1, anything else to 2), field 2 is the port
    and the last field is the ground-truth label.  The fields in between are
    cut into an upstream part followed by a downstream part, and only the
    first ``P.i`` values of each part are kept.

    Lines that are blank, or whose first character is not a digit, are
    skipped.  Records whose first upstream or downstream value is -1 are
    skipped as well.

    Args:
        P: options object; only ``P.i`` is read here.
        src: iterable of text lines.
        model: passed unchanged to ``DT.load``.

    Returns:
        None.  One summary line is printed.
    """
    samples = []

    # read data
    for line in src:
        line = line.strip()
        # A blank line used to raise IndexError on line[0]; skip it like any
        # other non-record line.
        if not line or not line[0].isdigit():
            continue
        d = line.split()
        proto = 1 if d[1] == "TCP" else 2
        port = int(d[2])
        gt = d[-1]
        b = int((len(d) - 4) / 2)
        pl_up = [int(x) for x in d[3:3 + b]][0:P.i]
        pl_down = [int(x) for x in d[3 + b:-1]][0:P.i]
        if pl_up[0] == -1 or pl_down[0] == -1:
            continue
        samples.append(([proto, port] + pl_up + pl_down, gt))

    # load model
    cls = DT(verb=True)
    cls.load(model)

    # test
    features = [sample[0] for sample in samples]
    labels = [sample[1] for sample in samples]
    (acc, ratio, err) = cls.score(features, labels)
    print("ok %.3f%%\tin %.3f%%\tof %d K total (%d errors)" % (acc * 100.0, ratio * 100.0, len(samples)/1000.0, err))
def main(P, src, model):
    """Score a stored DT model on size samples read from *src*.

    Every usable line of *src* is split on whitespace.  Field 1 is the
    protocol name ("TCP" maps to 1, anything else to 2), field 2 is the port
    and the last field is the ground-truth label.  The fields in between are
    cut into an upstream part followed by a downstream part, and only the
    first ``P.i`` values of each part are kept.

    Lines that are blank, or whose first character is not a digit, are
    skipped.  Records whose first upstream or downstream value is 0 are
    skipped as well.

    Args:
        P: options object; only ``P.i`` is read here.
        src: iterable of text lines.
        model: passed unchanged to ``DT.load``.

    Returns:
        None.  One summary line is printed.
    """
    samples = []

    # read data
    for line in src:
        line = line.strip()
        # A blank line used to raise IndexError on line[0]; skip it like any
        # other non-record line.
        if not line or not line[0].isdigit():
            continue
        d = line.split()
        proto = 1 if d[1] == "TCP" else 2
        port = int(d[2])
        gt = d[-1]
        b = int((len(d) - 4) / 2)
        szup = [int(x) for x in d[3:3 + b]][0:P.i]
        szdown = [int(x) for x in d[3 + b:-1]][0:P.i]
        if szup[0] == 0 or szdown[0] == 0:
            continue
        samples.append(([proto, port] + szup + szdown, gt))

    # load model
    #cls = kNN(k=P.k, verb=True)
    #cls = kNN(k=P.k)
    cls = DT()
    cls.load(model)
    cls.algo.set_params(n_jobs=-1)

    # test
    features = [sample[0] for sample in samples]
    labels = [sample[1] for sample in samples]
    (acc, ratio, err) = cls.score(features, labels)
    print("ok %.3f%%\tin %.3f%%\tof %d K total (%d errors)" % (acc * 100.0, ratio * 100.0, len(samples)/1000.0, err))
def main(P, src, model):
    """Score a stored DT model on per-flow statistics read from *src*.

    Every usable line of *src* is split on whitespace.  Field 1 is the
    protocol name ("TCP" maps to 1, anything else to 2), field 2 is the port,
    the last field is the ground-truth label and every field in between is
    parsed as an integer statistic.

    Lines that are blank, or whose first character is not a digit, are
    skipped.  Records whose statistic at index 4 or index 12 is 0 are
    skipped as well.

    Args:
        P: options object; not read by this function.
        src: iterable of text lines.
        model: passed unchanged to ``DT.load``.

    Returns:
        None.  One summary line is printed.
    """
    samples = []

    # read data
    for line in src:
        line = line.strip()
        # A blank line used to raise IndexError on line[0]; skip it like any
        # other non-record line.
        if not line or not line[0].isdigit():
            continue
        d = line.split()
        proto = 1 if d[1] == "TCP" else 2
        port = int(d[2])
        gt = d[-1]
        stats = [int(x) for x in d[3:-1]]
        # NOTE(review): indices 4 and 12 presumably hold per-direction
        # counters -- confirm against the producer of this file format.
        if stats[4] == 0 or stats[12] == 0:
            continue
        samples.append(([proto, port] + stats, gt))

    # load model
    cls = DT()
    cls.load(model)
    cls.algo.set_params(n_jobs=-1)

    # test
    features = [sample[0] for sample in samples]
    labels = [sample[1] for sample in samples]
    (acc, ratio, err) = cls.score(features, labels)
    print("ok %.3f%%\tin %.3f%%\tof %d K total (%d errors)" % (acc * 100.0, ratio * 100.0, len(samples)/1000.0, err))
def _dtCompileFunc(name, data):
    """Compile template text *data* under *name* with ``tagRegistry``.

    When ``Configuration.noTagDebug`` is true, ``DTCompilerUtil.tagDebug`` is
    replaced by ``dt_no_tag_debug`` for the duration of the compile and put
    back afterwards.

    Returns:
        Whatever ``DT.compileTemplate(data, name, tagRegistry)`` returns.
    """
    if not Configuration.noTagDebug:
        return DT.compileTemplate(data, name, tagRegistry)

    otagdbg = DTCompilerUtil.tagDebug
    DTCompilerUtil.tagDebug = dt_no_tag_debug
    try:
        return DT.compileTemplate(data, name, tagRegistry)
    finally:
        # Restore the hook even when compilation raises, so a failing
        # template cannot leave the replacement installed for later compiles.
        DTCompilerUtil.tagDebug = otagdbg
def _dtCompileFunc(name, data):
    """Compile template text *data* under *name* with ``tagRegistry``.

    When ``Configuration.noTagDebug`` is true, ``DTCompilerUtil.tagDebug`` is
    replaced by ``dt_no_tag_debug`` for the duration of the compile and put
    back afterwards.

    Returns:
        Whatever ``DT.compileTemplate(data, name, tagRegistry)`` returns.
    """
    if not Configuration.noTagDebug:
        return DT.compileTemplate(data, name, tagRegistry)

    otagdbg = DTCompilerUtil.tagDebug
    DTCompilerUtil.tagDebug = dt_no_tag_debug
    try:
        return DT.compileTemplate(data, name, tagRegistry)
    finally:
        # Restore the hook even when compilation raises, so a failing
        # template cannot leave the replacement installed for later compiles.
        DTCompilerUtil.tagDebug = otagdbg
def buildtree(x,y, samples, min_node=1, result_cur = None): if type(x) != np.ndarray: x = np.array(x) if type(y) != np.ndarray: y = np.array(y) if type(samples) != np.ndarray: samples = np.array(samples) if len(samples) == 0: return DTme.decisionnode() ## transform old rank to new rank form if y.ndim == 2: # rank_old form # y = y.tolist() temp = map(rankO2New, y) y = np.array(temp) if result_cur is None: result_cur = MM(y[samples]) if len(samples)<= min_node: return DTme.decisionnode(result=result_cur[1]) # find best split best_gain = 0.0 best_split = [] best_sets = [] best_sets_result = [] N_feature = x.shape[1] start = datetime.now() ### test for feature in range(N_feature): # nlogn selection min_var, split, sets, sets_result = bestSplit(x,y,samples,feature) if min_var is None: continue gain = result_cur[0] - min_var # print "feature: ", feature, "gain: ", gain, "result_cur: ", result_cur, "min_var: ", min_var ### test if gain > best_gain and len(sets[0]) * len(sets[1]) > 0: best_gain = gain best_split = split best_sets = sets best_sets_result = sets_result duration = datetime.now() - start ### test print "Nsamps: ", len(samples) print "duration: ", duration.total_seconds() if best_gain > 0: tb = buildtree(x,y, best_sets[0], min_node = min_node, result_cur = best_sets_result[0]) fb = buildtree(x,y, best_sets[1], min_node = min_node, result_cur = best_sets_result[1]) return DTme.decisionnode(feature = best_split[0], value = best_split[1], result = result_cur[1], tb = tb, fb = fb, gain = (tb.gain+fb.gain+best_gain), size_subtree = (tb.size+fb.size)) else: return DTme.decisionnode(result = result_cur[1])
def crossval(X,Y,size,k=10):
    """Score trees trained on *X* with one block of rows held out per round.

    Round ``i`` drops rows ``size*i`` up to ``size*i + size`` (exclusive)
    from *X*, trains with ``learn`` on the rest and evaluates on *Y* with
    ``dt.test`` / ``dt.accr``.

    Args:
        X: array whose first axis indexes rows.
        Y: evaluation data, passed unchanged to ``dt.test`` in every round.
        size: number of rows held out per round.
        k: rounds run for ``i`` in ``range(0, k-1)``, i.e. ``k - 1`` of them.

    Returns:
        List with one ``dt.accr`` result per round.
    """
    # NOTE(review): only k-1 rounds run -- confirm this is intended.
    scores = []
    for fold in range(0, k-1):
        held_out = set(range(size*fold, size*fold+size))
        n_rows = X.shape[0]
        # Kept as a set difference: it determines the row order handed to
        # np.take and therefore to learn().
        kept = list(set(range(0, n_rows)) - held_out)
        tree = learn(np.take(X, kept, axis=0))
        predicted, actual = dt.test(tree, Y)
        scores.append(dt.accr(predicted, actual))
    return scores
def DecisionTree(words):
    """Return the label a freshly loaded ``DT.Tree`` predicts for *words*.

    The tree is loaded from its root with the fixed parameters 150 and 15
    before predicting.
    """
    alpha, beta = 150, 15
    tree = DT.Tree()
    tree.load(tree.root, alpha, beta)
    return tree.predict(tree.root, words)
def DecisionTreeTest(pca_option):
    """Run the decision-tree simulation and score it on the final validation set.

    ``processing.final_validation`` is converted to a numpy array in place.
    The predictions are also stored in the module global
    ``DT_final_predictions``.

    Args:
        pca_option: when 'yes' or 'both', the validation features are passed
            through ``processing.linear_pca.transform`` before prediction;
            any other value predicts on the untransformed features.

    Returns:
        Tuple ``(accuracy, precision, recall)``; precision and recall use
        micro averaging.
    """
    global DT_final_predictions
    import DT

    DT.DecisionTreeSimulation(
        DT.dt, processing.linear_pca, processing.overall_training_data,
        pca_option)

    processing.final_validation = np.array(processing.final_validation)
    features, labels = processing.createFeatures_Labels(
        processing.final_validation)
    feature_frame, _label_frame = processing.convertToDataFrame(
        features, labels, processing.column_titles)

    # Both modes share everything except the input given to predict().
    if pca_option in ('yes', 'both'):
        model_input = processing.linear_pca.transform(feature_frame)
    else:
        model_input = feature_frame

    predicted = DT.dt.predict(model_input)
    DT_final_predictions = predicted

    accuracy = metrics.accuracy_score(predicted, labels)
    precision = metrics.precision_score(labels, predicted, average='micro')
    recall = metrics.recall_score(labels, predicted, average='micro')

    print('DECISION TREE MODEL FINAL TEST DATA ACCURACY: ', 100 * accuracy)
    print('DECISION TREE MODEL FINAL TEST DATA PRECISION: ', 100 * precision)
    print('DECISION TREE MODEL FINAL TEST DATA RECALL: ', 100 * recall)
    print()
    return accuracy, precision, recall
def running_dt(data, drop_g1g2=0):
    """Train and evaluate ``DT.dt`` on *data*, printing the three scores.

    Args:
        data: passed to ``pre_process`` together with *drop_g1g2*.
        drop_g1g2: forwarded unchanged to ``pre_process``.

    Returns:
        Tuple ``(P, R, F1)`` as produced by ``evaluate_result``.
    """
    train_x, test_x, train_y, test_y = pre_process(data, drop_g1g2)
    predicted = DT.dt(train_x, train_y, test_x)
    precision, recall, f1 = evaluate_result(test_y, predicted)

    report = (
        ("Precise:", precision),
        ("Recall:", recall),
        ("F1 Score:", f1),
    )
    for caption, value in report:
        print(caption + str(value))
    print()
    return precision, recall, f1
def run_test(trX, trY,res_file): desired_dt20 = 0.78 desired_dt50 = 0.78 desired_knn1 = 0.70 desired_knn3 = 0.73 print '\n\nFirst, we run DT and KNN on the training/development data to ' print 'ensure that we are getting roughly the right accuracies.' print 'We use the first 80% of the data as training, and the last' print '20% as test.' decTree = DT.DT() res = 1 print '\nDT (cutoff=20)...' sizeX = trX.shape end = int(np.round(sizeX[0]*0.80,decimals=0)) testRun = tt.TrainTest(decTree, trX[:end, :], trY[:end], trX[end:, :], trY[end:], 20) acc = testRun.run_tt() res += testRun.verifyAcc(acc['acc'], desired_dt20) print'\nTrainTime, TestTime', acc['trainTime'], acc['testTime'] res_file.write('\nDT (cutoff=20)') res_file.write('\nTrainTime, TestTime ' + str(acc['trainTime']) + ', ' + str(acc['testTime'])) print '\nDT (cutoff=50)...' testRun = tt.TrainTest(decTree, trX[:end, :], trY[:end], trX[end:sizeX[0], :], trY[end:sizeX[0]], 50) acc = testRun.run_tt() res += testRun.verifyAcc(acc['acc'], desired_dt50) print'\nTrainTime, TestTime', acc['trainTime'], acc['testTime'] res_file.write('\nDT (cutoff=50)') res_file.write('\nTrainTime, TestTime ' + str(acc['trainTime']) + ', ' + str(acc['testTime'])) knnModel = KNN.KNN() print '\nKNN (K=1)' max_size = sizeX[0] if sizeX[0] < 10001 else 10000 end = int(np.round(max_size*0.80,decimals=0)) testRun = tt.TrainTest(knnModel, trX[:end, :], trY[:end], trX[end:sizeX[0], :], trY[end:sizeX[0]], 1) acc = testRun.run_tt() res += testRun.verifyAcc(acc['acc'], desired_knn1) print'\nTrainTime, TestTime', acc['trainTime'], acc['testTime'] res_file.write('\nKNN (K=1)') res_file.write('\nTrainTime, TestTime ' + str(acc['trainTime']) + ', ' + str(acc['testTime'])) print '\nKNN (K=3)' testRun = tt.TrainTest(knnModel, trX[:end, :], trY[:end], trX[end:sizeX[0], :], trY[end:sizeX[0]], 3) acc = testRun.run_tt() res += testRun.verifyAcc(acc['acc'], desired_knn3) print'\nTrainTime, TestTime', acc['trainTime'], acc['testTime'] res_file.write('\nKNN 
(K=3)') res_file.write('\nTrainTime, TestTime ' + str(acc['trainTime']) + ', ' + str(acc['testTime'])) raw_input('\nPress enter to continue...') return
def find_best_model(df, tgt):
    """Run the four learners 100 times with random seeds and print summaries.

    Each round draws one seed with ``random.randrange(100)`` and passes the
    same seed to ``LR.linreg``, ``LS.lasso``, ``DT.dectree`` and ``DNN.nn``.
    Afterwards ``DataFrame.describe()`` is printed for each learner's
    results, in that order.

    Args:
        df: data passed unchanged to every learner.
        tgt: target passed unchanged to every learner.
    """
    learners = (
        ('lr', LR.linreg),
        ('ls', LS.lasso),
        ('dt', DT.dectree),
        ('dnn', DNN.nn),
    )
    outcomes = dict((label, []) for label, _ in learners)

    for _ in range(100):
        seed = random.randrange(100)
        for label, run in learners:
            outcomes[label].append(run(df, tgt, seed))

    for label, _ in learners:
        print(pd.DataFrame({label: outcomes[label]}).describe())
def learningcurve(df,p=10,n=100):
    """Measure train and test accuracy for growing training-set sizes.

    For ``i`` in ``1 .. n-1`` the data is split with
    ``dt.split(df, size*i, int(0.3*m))`` where ``m`` is the row count and
    ``size = int(0.7*m/n)``.  A tree is learned on the training part and
    scored on both parts.  The per-step durations are printed at the end.

    Args:
        df: 2-D data set (``df.shape`` must unpack into two values).
        p: accepted for interface compatibility; not used.
        n: number of size steps; ``n - 1`` trees are trained.

    Returns:
        Tuple ``(sizes, testa, traina)`` of equally long lists: training-set
        sizes, test accuracies and training accuracies.
    """
    # time.clock() was removed in Python 3.8.  Prefer perf_counter() and fall
    # back to clock() only on interpreters that predate it.
    # NOTE: perf_counter() measures elapsed wall time, whereas clock()
    # reported CPU time on Unix.
    timer = getattr(time, "perf_counter", None) or time.clock

    m, _ = df.shape
    size = int(0.7*m/n)
    sizes = []
    traina = []
    testa = []
    times = []
    for i in range(1, n):
        train, trial = dt.split(df, size*i, int(0.3*m))
        s = timer()
        tree = learn(train)
        a, b = dt.test(tree, trial)
        score = dt.accr(a, b)
        c, d = dt.test(tree, train)
        scoret = dt.accr(c, d)
        e = timer()
        sizes.append(size*i)
        traina.append(scoret)
        testa.append(score)
        times.append(e-s)
    print("Trial Times")
    print(times)
    return sizes,testa,traina
def crossValidate(x,y, method = "dT",cv=5, alpha = None, min_node = 1): # error measure results = [] if method == "logReg": results = {"perf":[], "coef":[], "interc":[]} elif method == "dT": results = {"alpha": [], "perf":[]} # cross validation # np.random.seed(1100) kf = KFold(n_splits = cv, shuffle = True, random_state = 0) ## for testing fixing random_state for train,test in kf.split(x): x_train = x[train,:] y_train = y[train,:] x_test = x[test,:] y_test = y[test,:] # training and predict if alpha == None: ## nested select validate and test ## # print "start searching alpha:", datetime.now() ### test alpha_sel, perf = DTme.hyperParometer(x_train,y_train) # print "finish searching alpha:", datetime.now(), alpha ### test else: alpha_sel = alpha result = decisionTree(x_train, y_train, x_test, alpha = alpha_sel, min_node = min_node) # performance measure alpha_sel, y_pred = result results["perf"].append(perfMeasure(y_pred,y_test,rankopt=True)) results["alpha"].append(alpha_sel) print alpha_sel, "alpha" for key in results.keys(): item = np.array(results[key]) mean = np.nanmean(item, axis = 0) std = np.nanstd(item, axis = 0) results[key] = [mean, std] return results
def main(P, src, dst):
    """Train a DT model on payload samples from *src* and optionally store it.

    Every usable line of *src* is split on whitespace.  Field 1 is the
    protocol name ("TCP" maps to 1, anything else to 2), field 2 is the port
    and the last field is the ground-truth label.  The fields in between are
    cut into an upstream part followed by a downstream part, and only the
    first ``P.i`` values of each part are kept.  Records whose first upstream
    or downstream value is -1 are counted but not used.

    Args:
        P: options object.  ``P.i`` limits the values kept per direction.
            When ``P.t`` is positive, ``P.t + P.T`` samples are drawn at
            random, the first ``P.t`` for training and the rest for testing;
            otherwise all samples are used for training and none for testing.
        src: iterable of text lines.
        dst: destination passed to the model's ``store``; nothing is stored
            when it is falsy.

    Raises:
        ValueError: from ``random.sample`` when fewer than ``P.t + P.T``
            samples were read.
    """
    samples = []
    total = 0

    # read data
    for line in src:
        line = line.strip()
        # A blank line used to raise IndexError on line[0]; skip it like any
        # other non-record line.
        if not line or not line[0].isdigit():
            continue
        total += 1
        d = line.split()
        proto = 1 if d[1] == "TCP" else 2
        port = int(d[2])
        gt = d[-1]
        b = int((len(d) - 4) / 2)
        pl_up = [int(x) for x in d[3:3 + b]][0:P.i]
        pl_down = [int(x) for x in d[3 + b:-1]][0:P.i]
        if pl_up[0] == -1 or pl_down[0] == -1:
            continue
        samples.append(([proto, port] + pl_up + pl_down, gt))

    # An input without any record used to divide by zero here.
    share = 100.0 * len(samples) / total if total else 0.0
    print("read %d samples out of %d total (%.2f%%)" % (len(samples), total, share))

    # take random samples
    if P.t > 0:
        samples = random.sample(samples, P.t+P.T)
        train = samples[:P.t]
        test = samples[P.t:]
    else:
        train = samples
        test = []

    # train
    knc = DT()
    knc.fit([x[0] for x in train], [x[1] for x in train])

    # test
    if len(test) > 0:
        (acc, ratio, err) = knc.score([x[0] for x in test], [x[1] for x in test])
        print("ok %.3f%%\tin %.3f%%\tof %d K total (%d errors)" % (acc * 100.0, ratio * 100.0, len(test)/1000.0, err))

    # store model
    if dst:
        knc.store(dst)
def repeatTheLearningProcess(self, bestGridSearched, set):
    """Rebuild the best grid-searched learner and rerun its curves.

    The best parameters are taken from ``getBestGridSearchedModel``.  When
    the learner type is one of 'KNN', 'DT', 'SVM', 'Boosting' or 'ANN', a new
    learner of that kind is created from those parameters and
    ``set.datasetNo``; any other type keeps the object that was passed in.
    The learner is then handed to ``getLearningCurve`` and
    ``getComplexityCurve``.

    Returns:
        The learner that was used for the two curve calls.
    """
    _, tuned_params = self.getBestGridSearchedModel(bestGridSearched, set)

    # The class lookups are wrapped in lambdas so that only the module of the
    # selected learner is touched.
    builders = {
        'KNN': lambda: KNN.KNNLearner,
        'DT': lambda: DT.DTLearner,
        'SVM': lambda: SVM.SVMLearner,
        'Boosting': lambda: Boosting.BoostingLearner,
        'ANN': lambda: ANN.ANNLearner,
    }
    pick = builders.get(bestGridSearched.learnerType)
    if pick is not None:
        bestGridSearched = pick()(**tuned_params, datasetNo=set.datasetNo)

    self.getLearningCurve(bestGridSearched, set)
    self.getComplexityCurve(bestGridSearched, set)
    return bestGridSearched
#CopaLeche.updateCopa() #MENUUU opcion = int( input( "MENU\n 1-ORGANIZACIONES \n 2-COPAS \n 3-PAISES \n 4-LIGAS \n 5-EQUIPOS \n 6-JUGADOR \n 7- DTs \n 8- Salir \n OPCION: " )) ORG = Organizacion() COPA = Copa() PAIS = Pais() Ligue = Liga() Team = Equipo() Player = Jugador() DeTe = DT() while (opcion != 8): #opcion = int(input("\n MENU\n 1-ORGANIZACIONES \n 2-COPAS \n 3-PAISES \n 4-LIGAS \n 5-EQUIPOS \n 6-JUGADOR \n 7- DTs \n 8- Salir \n OPCION: ")) if (opcion == 1): opcionOrg = int( input( "\n 1-CREAR ORGANIZACION \n 2-INSERTAR ORG EN BASE \n 3-VER ORGANIZACIONES DE LA BASE \n 4-MODIFICAR UNA ORGANIZACION \n 5-ELIMINAR UNA ORGANIZACION \n 6-VOLVER AL 1ER MENU \n OPCION: " )) if (opcionOrg == 1): nombre_org = input("Escriba el nombre de la organizacion: ")
''' import numpy as np import DT as dt import KNN as knn if __name__ == '__main__': print 'running tests on DT and KNN' #This is the class example [mathy, test >= 80, project >= 80, early] #with a slight change so that non-mathy first splits on early. trX=np.array([[1,1,1,1],[1,1,1,0],[0,1,0,1],[0,0,1,1],[0,0,1,1],[0,0,0,0],[0,0,0,0],[1,0,1,1],[1,0,0,1],[0,0,1,1],[1,0,0,0],[0,0,1,1],[0,1,0,1],[0,0,1,0]]) trY=np.array([[1],[1],[0],[0],[0],[1],[0],[1],[0],[0],[0],[0],[0],[1]]) deX = np.array([[0,1,0,0],[0,0,1,0],[0,1,1,1]]) deY = np.array([[0],[1],[0]]) decTree = dt.DT() print 'DT, cutoff=0' trainModel = decTree.res('train',X=trX,Y=trY,h_param=0) decTree.DTdraw(trainModel) output = decTree.res('predict',model=trainModel,test_case=deX) print output knnMode = knn.KNN() print 'KNN, k=1' trainModel = knnMode.res('train',X=trX,Y=trY,h_param=1) output = knnMode.res('predict',model=trainModel,test_case=deX) print output print 'Done'
import marshal
import os
import stat
import sys
import time


def phfunc(name, obj):
    """Serialize *obj* with ``marshal`` into the file called *name*."""
    # marshal data is binary: write in 'wb' mode and close the handle
    # deterministically instead of leaving it to the garbage collector.
    with open(name, 'wb') as out:
        marshal.dump(obj, out)


if __name__=='__main__':
    # Render the template named on the command line, reusing the compiled
    # form cached next to it as <file>.dtcc when it can be read.
    bt = time.time()
    fname=sys.argv[1]
    mtime=os.stat(fname)[stat.ST_MTIME]
    cform=sys.argv[1]+'.dtcc'
    try:
        cmtime=os.stat(cform)[stat.ST_MTIME]
        with open(cform, 'rb') as cached:
            comp_form=marshal.load(cached)
    except (OSError, IOError, EOFError, ValueError, TypeError):
        # Missing or unreadable cache: fall back to "no compiled form".
        comp_form=None
        cmtime=-1
    with open(fname) as template:
        source = template.read()
    d=DT.DT(source, fname, comp_form, mtime, cmtime,
            lambda x, y=cform: phfunc(y, x))
    class dumb: pass
    ns=dumb()
    text = d(ns)
    et = time.time()
    print(text)
    print('elapsed time: %s' % (et - bt))
res_file.write('\n' + disp + '\ndone') base = 'baseline'+data_types[i] if base not in results.keys(): print "Lets run some baseline measures..." res = run_test(trX,trY,res_file) res_file.write('\n' + base + '\n') res_file.write(str(res)) raw_input('Press enter to continue...') dec = 'dt'+data_types[i] if dec not in results.keys(): print '\nNow we vary the cutoff for the decision tree and see how it affects accuracy...' thresh = [5,10,20,40,80,160] decTree = DT.DT() res = run_comps(decTree, thresh, trX[0:4800, :], trY[0:4800], trX[4801:6000, :], trY[4801:6000],"Figure 2: DT cutoff versus accuracy (MNIST)","DT cutoff","../figure2.png") results[dec] = res res_file.write('\n' + dec + '\n') res_file.write(str(res)) raw_input('Press enter to continue...') neigh = 'knn'+data_types[i] if neigh not in results.keys(): print '\nNow we vary the k for the KNN classifier and see how it affects accuracy...' allK = [1,8,16,32,64,128] knnModel = KNN.KNN() res = run_comps(knnModel, allK, trX[0:2000, :], trY[0:2000], trX[2001:2501, :], trY[2001:2501],"Figure 3: KNN count versus accuracy (MNIST)","KNN count","../figure3.png") results[neigh] = res
lr.append(LR.linreg(df, tgt, seed)) ls.append(LS.lasso(df, tgt, seed)) dt.append(DT.dectree(df, tgt, seed)) dnn.append(DNN.nn(df, tgt, seed)) print(pd.DataFrame({'lr': lr}).describe()) print(pd.DataFrame({'ls': ls}).describe()) print(pd.DataFrame({'dt': dt}).describe()) print(pd.DataFrame({'dnn': dnn}).describe()) tgt = 'medv' df = prepro.Data(tgt) y = df[tgt] seed = 101 LR.linreg(df, tgt, seed) LS.lasso(df, tgt, seed) DT.dectree(df, tgt, seed) DNN.nn(df, tgt, seed) #find_best_model(df, tgt) #PCA.analysis(df)
import DT
import treePloter
import numpy as np

# Build and print a decision tree for the tab-separated lenses data set.
# The file is read inside a with-block so the handle is closed right away.
with open('lenses.txt') as fr:
    dataSet = [line.strip().split('\t') for line in fr]
dataSet = np.array(dataSet)

# Two separate lists with the same names are handed to createTree.
# NOTE(review): presumably createTree consumes the first list while the
# second stays intact for later lookups -- confirm in DT.createTree.
attribute_list = ['age', 'prescript', 'astimatic', 'tearRate']
fixed_attribute_list = ['age', 'prescript', 'astimatic', 'tearRate']

tree = DT.createTree(np.array(dataSet), attribute_list, fixed_attribute_list)
print(tree)
#print(DT.classify(tree,['pre','myope','yes','normal','hard'],fixed_attribute_list))
#treePloter.createPlot(tree)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=cfg.test_size, random_state=1) fu.cleanSIDirs('./out/') if cfg.algo.lower() == 'ann': import ANN as ann ann.process(X_train, X_test, y_train, y_test) elif cfg.algo.lower() == 'k-nn': import KNN as knn knn.process(X_train, X_test, y_train, y_test) elif cfg.algo.lower() == 'svm': import SVM as svm svm.process(X_train, X_test, y_train, y_test) elif cfg.algo.lower() == 'dt': import DT as dt dt.process(X_train, X_test, y_train, y_test) if cfg.ds_test: if cfg.nTests == 'full': fu.saveTestingSet(X_test, y_test) elif cfg.nTests != None: fu.saveTestingSet(X_test[0:cfg.nTests], y_test[0:cfg.nTests], full=False) if cfg.algo.lower() == 'k-nn': fu.saveTrainingSet(X_train, y_train) if cfg.export_dir != None: from distutils.dir_util import copy_tree fu.cleanSIDirs(f'{cfg.export_dir}/') fromDirectory = f"./out/include" toDirectory = f"{cfg.export_dir}/ds/include" copy_tree(fromDirectory, toDirectory)
def _dtCompileFunc( name, data ):
    """Compile template text *data* under *name* with ``tagRegistry``.

    Note that ``DT.compileTemplate`` takes the data first and the name
    second, the reverse of this function's parameter order.
    """
    compiled = DT.compileTemplate( data, name, tagRegistry )
    return compiled
# Load the data set, train a full tree and a pre-pruned tree, and report the
# accuracy of each on the held-out rows.
DataSet = pd.read_csv(data_file)
import DT as decision_tree

# Split into a training set and a test set.
# Candidate positions are 0 .. rows-2, so the last row is never drawn for
# training.  The training size is random, between roughly half of the rows
# and `index` (both bounds inclusive).
index = DataSet.shape[0] - 1
index_train = np.arange(index)
rand_train = np.random.choice(index_train,
                              size=random.randint(int((index + 1) / 2), index),
                              replace=False)
DataSet_train = DataSet.iloc[rand_train]
# NOTE(review): drop() removes by index label while iloc selects by position;
# the two agree only if DataSet has the default 0..n-1 index -- confirm.
DataSet_test = DataSet.drop(rand_train)

# generate a full tree
root = decision_tree.TreeGenerate(DataSet_train)
decision_tree.DrawPNG(
    root,
    "Decision Tree/Decision Tree Based on Gini Index/Decision Tree Based on Gini Index.png",
)
print("accuracy of full tree: %.3f" %
      decision_tree.PredictAccuracy(root, DataSet_test))

# pre-pruning
root = decision_tree.PrePurn(DataSet_train, DataSet_test)
decision_tree.DrawPNG(
    root,
    "Decision Tree/Decision Tree Based on Gini Index/decision_tree_pre.png")
print("accuracy of pre-purning tree: %.3f" %
      decision_tree.PredictAccuracy(root, DataSet_test))
# -*- coding: utf-8 -*- """ Created on Thu Feb 16 15:19:55 2017 @author: Thomas """ import DT import pool import row import server h = DT.DT('dc.in') h.disp()
'maint': ['vhigh', 'high', 'med', 'low'], 'doors': ['2', '3', '4', '5more'], 'persons': ['2', '4', 'more'], 'lug_boot': ['small', 'med', 'big'], 'safety': ['low', 'med', 'high'] } label = {'label': ['unacc', 'acc', 'good', 'vgood']} train_acc = [[0 for x in range(6)] for y in range(3)] test_acc = [[0 for x in range(6)] for y in range(3)] for feature_selection in range(3): for max_depth in range(6): # ID3 dt_generator = dt.ID3(feature_selection=feature_selection, max_depth=max_depth + 1) # get decision tree decision_tree = dt_generator.generate_decision_tree( train_data, features, label) # train acc # predict train_data['plabel'] = dt_generator.classify(decision_tree, train_data) train_acc[feature_selection][max_depth] = train_data.apply( lambda row: 1 if row['label'] == row['plabel'] else 0, axis=1).sum() / train_size # test acc # predict test_data['plabel'] = dt_generator.classify(decision_tree, test_data) test_acc[feature_selection][max_depth] = test_data.apply( lambda row: 1 if row['label'] == row['plabel'] else 0, axis=1).sum() / test_size
import DT
import numpy as np

# Load the comma-separated integer data from X.csv and y.csv.
X = np.loadtxt('X.csv', dtype=np.int32, delimiter=',')
y = np.loadtxt('y.csv', dtype=np.int32, delimiter=',')

# Fit a decision tree on the loaded data and predict on the same X; the
# prediction result is not stored.
tree = DT.decision_tree()
tree.fit(X, y)
tree.predict(X)

# NOTE(review): presumably a no-op line kept as a debugger breakpoint anchor
# -- confirm before removing.
pause = 0
def __init__(self):
    """Call ``DT.initBoard()``; no instance attributes are set here."""
    DT.initBoard()
def trigger_out(self, value, event):
    """Send *value* through the DT bit calls selected by *event*.

    Args:
        value: passed to the DT call(s) for the chosen event.
        event: 'strobed', 'start' or 'stop'; any other value does nothing.
    """
    if event == 'strobed':
        # Four calls in fixed order: toggle-on-post with STROBE_EVT, set the
        # value, clear with MAXPOSTABLEINT, then toggle-on-post with 0.
        DT.toggleBitsOnPost(STROBE_EVT)
        DT.setBitsNoDelay(value)
        DT.clearBitsNoDelay(MAXPOSTABLEINT)
        DT.toggleBitsOnPost(0)
    elif event == 'start':
        DT.toggleBitsNoDelay(value)
    elif event == 'stop':
        DT.clearBitsNoDelay(value)
a,b = dt.test(tree, trial) score = dt.accr(a,b) c,d = dt.test(tree, train) scoret = dt.accr(c,d) e = time.clock() sizes.append(size*i) traina.append(scoret) testa.append(score) times.append(e-s) print("Trial Times") print(times) return sizes,testa,traina if __name__=="__main__": print ("Boosted Decision Tree") df1,df2 = dt.readin() train1, trial1, size = dt.cross(df1) score = crossval(train1,trial1,size) print("Cross Validation for Collection 1") print(score) m,n = df1.shape train1, trial1 = dt.split(df1,int(0.7*m),int(0.3*m)) s = time.clock() tree1 = learn(train1) a,b = dt.test(tree1, trial1) score = dt.accr(a,b) c,d = dt.test(tree1, train1) scoret = dt.accr(c,d) e = time.clock() print("Testing Set Score for Collection 1") print(score)
def TrainingOnDT(): global filenameForTrainingResult,filenameForPredictedResult,\ DTClass_weight,DTMax_features,\ DTCriterion,unknowRow,hamRow,spamRow,PercisionRow,recallRow print DTClass_weight.get() DTconfusion_matrix, percision, recall, combinedResultOnActualAndPred = DT.DT( DTCriterion.get(), DTMax_features.get(), DTClass_weight.get()) print DTconfusion_matrix, percision, recall strOnUnknowRow = "unknown: " + " ".join( str(x) for x in \ DTconfusion_matrix[0]) strOnHamRow= "Ham: " + " ".join(str(x) for x in \ DTconfusion_matrix[1]) strOnSpamRow = "Spam: " + " ".join(str(x) for x in \ DTconfusion_matrix[2]) strOnPercisionRow = "precision On spam is : " + str(percision) strOnRecallRow = "Recall on spam is : " + str(recall) unknowRow.set(strOnUnknowRow) hamRow.set(strOnHamRow) spamRow.set(strOnSpamRow) PercisionRow.set(strOnPercisionRow) recallRow.set(strOnRecallRow) # ####write Condussion result into file print filenameForTrainingResult if (filenameForTrainingResult == ''): pass else: fileOnTrainingResult = open(filenameForTrainingResult, 'w') try: fileOnTrainingResult.write("The result of confusion matrix") fileOnTrainingResult.write('\r\n') fileOnTrainingResult.write(" predicted ") fileOnTrainingResult.write('\r\n') fileOnTrainingResult.write( " unknown ham spam") fileOnTrainingResult.write('\r\n') fileOnTrainingResult.write(strOnUnknowRow) fileOnTrainingResult.write('\r\n') fileOnTrainingResult.write(strOnHamRow) fileOnTrainingResult.write('\r\n') fileOnTrainingResult.write(strOnSpamRow) fileOnTrainingResult.write('\r\n') fileOnTrainingResult.write(strOnPercisionRow) fileOnTrainingResult.write('\r\n') fileOnTrainingResult.write(strOnRecallRow) finally: fileOnTrainingResult.close() # ###Write ID --Actual result --Predicted resutl into file print filenameForPredictedResult if (filenameForPredictedResult == ''): pass else: fileOnIndividualResult = open(filenameForPredictedResult, 'w') fileOnIndividualResult.write(" ID " "\tActual\tPredicted\r\n") try: for i in 
combinedResultOnActualAndPred: fileOnIndividualResult.write(i) # fileOnIndividualResult.write('\r\n') finally: fileOnIndividualResult.close() print strOnUnknowRow
import DT
import numpy as np

# Build and print a decision tree for the sample data from DT.createDataSet().
# `labels` is returned by createDataSet() but not used below.
dataSet, labels = DT.createDataSet()

# Two separate lists with the same attribute names are handed to createTree.
attribute_list = ['surface', 'flipper']
fixed_attribute = ['surface', 'flipper']
#mapToStr = ['Does it live on surface?','Does it have the flipper?']

tree = DT.createTree(np.array(dataSet), attribute_list, fixed_attribute)
#treePloter.createPlot(tree)
#print (DT.classify(tree,['1','1'],fixed_attribute))
print(tree)
def test_impurity(): print DT.entropy([1, 1, 1, 0]) print DT.entropy([]) print DT.entropy([1, 1, 1, 1]) a = DT.DecisionTree(1) print a.impurity_func([1, 1, 0, 1], [0, 0])
label = {'y': ['yes', 'no']} num_run = 100 T = 1000 test_py = np.array([[0 for x in range(test_size)] for y in range(num_run)]) test_py_first = np.array([0 for x in range(test_size)]) for iter in range(num_run): train_subset = train_data.sample(n=1000, replace=False, random_state=iter) for t in range(T): print('iter: ', iter, 't: ', t) # sample with replace sampled = train_subset.sample(frac=0.01, replace=True, random_state=t) # ID3 dt_generator = dt.ID3(feature_selection=0, max_depth=17, subset=6) # get decision tree decision_tree = dt_generator.generate_decision_tree( sampled, features, label) ## predict # test py = dt_generator.classify(decision_tree, test_data) py = np.array(py.tolist()) py[py == 'yes'] = 1 py[py == 'no'] = -1 py = py.astype(int) test_py[iter] = test_py[iter] + py if t == 0: test_py_first = test_py_first + py true_value = np.array(test_data['y'].tolist())
def pertest(name, item): print name, item print len(pfunc(item)) class nsc: this = 'that' num = 1 ns = nsc() import DT t = timer.Timer() print t, 'start' x = DT.DT(open('tests/test1.dtml').read()) print t, 'cooked' output = x(ns) print t, 'rendered' #node=x.node #for i in node.children: # pertest('i',i) # if hasattr(i, 'children'): # for j in i.children: # pertest('j',j) # if hasattr(j,'children'): # for k in j.children: # pertest('k', k) print 'serializing'