示例#1
0
def main(P, src, model):
	"""Score a stored DT model against labelled records read from ``src``.

	Each data line is whitespace-separated: field 1 is the protocol name,
	field 2 the port, the last field the ground-truth label, and the fields
	in between are split into two equal halves ("up" and "down"), of which
	only the first ``P.i`` values are kept as features.

	Args:
		P: options object; only ``P.i`` is read here.
		src: iterable of text lines.
		model: handed unchanged to ``DT.load``.
	"""
	samples = []

	# read data
	for line in src:
		line = line.strip()
		# Blank lines used to raise IndexError on line[0]; skip them like
		# every other line that does not start with a digit.
		if not line or not line[0].isdigit(): continue

		d = line.split()
		proto = 1 if d[1] == "TCP" else 2
		port = int(d[2])
		gt = d[-1]

		b = int((len(d)-4) / 2)
		pl_up = [int(x) for x in d[3:3+b]][0:P.i]
		pl_down = [int(x) for x in d[3+b:-1]][0:P.i]

		# A record with no values in one direction cannot be checked below
		# and carries no features, so drop it instead of crashing.
		if not pl_up or not pl_down: continue
		# -1 in the first slot marks a record to ignore.
		if pl_up[0] == -1 or pl_down[0] == -1: continue

		v = [proto,port] + pl_up + pl_down
		samples.append((v, gt))

	# load model
	cls = DT(verb=True)
	cls.load(model)

	# test
	(acc, ratio, err) = cls.score([x[0] for x in samples], [x[1] for x in samples])
	print("ok %.3f%%\tin %.3f%%\tof %d K total (%d errors)" %
		(acc * 100.0, ratio * 100.0, len(samples)/1000.0, err))
示例#2
0
def main(P, src, model):
	"""Score a stored DT model against labelled records read from ``src``.

	Each data line is whitespace-separated: field 1 is the protocol name,
	field 2 the port, the last field the ground-truth label, and the fields
	in between are split into two equal halves ("up" and "down"), of which
	only the first ``P.i`` values are kept as features.

	Args:
		P: options object; only ``P.i`` is read here.
		src: iterable of text lines.
		model: handed unchanged to ``DT.load``.
	"""
	samples = []

	# read data
	for line in src:
		line = line.strip()
		# Blank lines used to raise IndexError on line[0]; skip them like
		# every other line that does not start with a digit.
		if not line or not line[0].isdigit(): continue

		d = line.split()
		proto = 1 if d[1] == "TCP" else 2
		port = int(d[2])
		gt = d[-1]

		b = int((len(d)-4) / 2)
		szup = [int(x) for x in d[3:3+b]][0:P.i]
		szdown = [int(x) for x in d[3+b:-1]][0:P.i]

		# A record with no values in one direction cannot be checked below
		# and carries no features, so drop it instead of crashing.
		if not szup or not szdown: continue
		# A zero in the first slot marks a record to ignore.
		if szup[0] == 0 or szdown[0] == 0: continue

		v = [proto,port] + szup + szdown
		samples.append((v, gt))

	# load model
	#cls = kNN(k=P.k, verb=True)
	#cls = kNN(k=P.k)
	cls = DT()
	cls.load(model)
	cls.algo.set_params(n_jobs=-1)

	# test
	(acc, ratio, err) = cls.score([x[0] for x in samples], [x[1] for x in samples])
	print("ok %.3f%%\tin %.3f%%\tof %d K total (%d errors)" %
		(acc * 100.0, ratio * 100.0, len(samples)/1000.0, err))
示例#3
0
def main(P, src, model):
	"""Score a stored DT model against labelled statistics rows from ``src``.

	Each data line is whitespace-separated: field 1 is the protocol name,
	field 2 the port, the last field the ground-truth label, and every field
	in between is an integer statistic used as a feature.

	Args:
		P: options object (not read by this variant).
		src: iterable of text lines.
		model: handed unchanged to ``DT.load``.
	"""
	samples = []

	# read data
	for line in src:
		line = line.strip()
		# Blank lines used to raise IndexError on line[0]; skip them like
		# every other line that does not start with a digit.
		if not line or not line[0].isdigit(): continue

		d = line.split()
		proto = 1 if d[1] == "TCP" else 2
		port = int(d[2])
		gt = d[-1]

		stats = [int(x) for x in d[3:-1]]
		# The filter below inspects columns 4 and 12; a shorter row cannot
		# be evaluated, so drop it instead of raising IndexError.
		if len(stats) <= 12: continue
		if stats[4] == 0 or stats[12] == 0: continue

		v = [proto,port] + stats
		samples.append((v, gt))

	# load model
	cls = DT()
	cls.load(model)
	cls.algo.set_params(n_jobs=-1)

	# test
	(acc, ratio, err) = cls.score([x[0] for x in samples], [x[1] for x in samples])
	print("ok %.3f%%\tin %.3f%%\tof %d K total (%d errors)" %
		(acc * 100.0, ratio * 100.0, len(samples)/1000.0, err))
示例#4
0
def _dtCompileFunc(name, data):
    """Compile template source ``data`` under ``name`` via ``DT.compileTemplate``.

    When ``Configuration.noTagDebug`` is set, ``DTCompilerUtil.tagDebug`` is
    swapped for ``dt_no_tag_debug`` while the template is being compiled.

    Returns:
        Whatever ``DT.compileTemplate`` returns.
    """
    if not Configuration.noTagDebug:
        return DT.compileTemplate(data, name, tagRegistry)

    otagdbg = DTCompilerUtil.tagDebug
    DTCompilerUtil.tagDebug = dt_no_tag_debug
    try:
        return DT.compileTemplate(data, name, tagRegistry)
    finally:
        # Restore the original hook even when compilation raises; otherwise
        # one failing template would leave tag debugging disabled globally.
        DTCompilerUtil.tagDebug = otagdbg
示例#5
0
def _dtCompileFunc(name, data):
    """Compile template source ``data`` under ``name`` via ``DT.compileTemplate``.

    When ``Configuration.noTagDebug`` is set, ``DTCompilerUtil.tagDebug`` is
    swapped for ``dt_no_tag_debug`` while the template is being compiled.

    Returns:
        Whatever ``DT.compileTemplate`` returns.
    """
    if not Configuration.noTagDebug:
        return DT.compileTemplate(data, name, tagRegistry)

    otagdbg = DTCompilerUtil.tagDebug
    DTCompilerUtil.tagDebug = dt_no_tag_debug
    try:
        return DT.compileTemplate(data, name, tagRegistry)
    finally:
        # Restore the original hook even when compilation raises; otherwise
        # one failing template would leave tag debugging disabled globally.
        DTCompilerUtil.tagDebug = otagdbg
def buildtree(x,y, samples, min_node=1, result_cur = None):
    """Recursively grow a binary decision tree over the rows listed in ``samples``.

    Args:
        x: feature table; coerced to ``np.ndarray`` and read as
            ``x.shape[1]`` features.
        y: targets; coerced to ``np.ndarray``. A 2-D ``y`` is first converted
            row by row with ``rankO2New``.
        samples: indices into ``y`` selecting the rows this node covers.
        min_node: a node holding at most this many samples becomes a leaf.
        result_cur: cached ``MM(y[samples])`` for this node; computed here
            when ``None``. Element 0 is compared against the split variance,
            element 1 is stored as the node result.

    Returns:
        A ``DTme.decisionnode``: an empty node when ``samples`` is empty, a
        leaf when no split yields positive gain, otherwise an internal node
        with ``tb``/``fb`` children built from the best split.
    """
    # Coerce all three inputs to plain ndarrays so fancy indexing works below.
    if type(x) != np.ndarray:
        x = np.array(x)
    if type(y) != np.ndarray:
        y = np.array(y)
    if type(samples) != np.ndarray:
        samples = np.array(samples)
    if len(samples) == 0:
        return DTme.decisionnode()
    ## transform old rank to new rank form
    if y.ndim == 2:
        # rank_old form #
        # NOTE(review): relies on Python 2, where map() returns a list;
        # a lazy Python 3 map object would not convert via np.array.
        y = y.tolist()
        temp = map(rankO2New, y)
        y = np.array(temp)


    if result_cur is None:
        result_cur = MM(y[samples])

    # Too few samples to split further: emit a leaf carrying this node's result.
    if len(samples)<= min_node:
        return DTme.decisionnode(result=result_cur[1])
    # find best split
    best_gain = 0.0
    best_split = []
    best_sets = []
    best_sets_result = []

    N_feature = x.shape[1]
    start = datetime.now() ### test
    for feature in range(N_feature):
        # nlogn selection
        min_var, split, sets, sets_result = bestSplit(x,y,samples,feature)
        # bestSplit signals "no usable split on this feature" with None.
        if min_var is None:
            continue
        gain = result_cur[0] - min_var
        # print "feature: ", feature, "gain: ", gain, "result_cur: ", result_cur, "min_var: ", min_var ### test
        # Keep the split only if it improves on the best so far and leaves
        # both sides non-empty.
        if gain > best_gain and len(sets[0]) * len(sets[1]) > 0:
            best_gain = gain
            best_split = split
            best_sets = sets
            best_sets_result = sets_result
    duration = datetime.now() - start ### test
    # Timing diagnostics for the split search (debug output).
    print "Nsamps: ", len(samples)
    print "duration: ", duration.total_seconds()

    if best_gain > 0:
        # Recurse into both partitions, passing along their precomputed results.
        tb = buildtree(x,y, best_sets[0], min_node = min_node, result_cur = best_sets_result[0])
        fb = buildtree(x,y, best_sets[1], min_node = min_node, result_cur = best_sets_result[1])
        # The node's gain and size aggregate those of its two subtrees.
        return DTme.decisionnode(feature = best_split[0], value = best_split[1], result = result_cur[1],
                            tb = tb, fb = fb,
                            gain = (tb.gain+fb.gain+best_gain), size_subtree = (tb.size+fb.size))
    else:
        return DTme.decisionnode(result = result_cur[1])
示例#7
0
def crossval(X,Y,size,k=10):
    """Train on X with one block of rows removed per round and score on Y.

    Runs ``k - 1`` rounds. Round ``i`` removes rows ``size*i`` up to
    ``size*i + size`` from ``X``, learns a tree on the remaining rows, and
    evaluates that tree on ``Y`` with ``dt.test`` / ``dt.accr``.

    Returns:
        List with one ``dt.accr`` score per round.
    """
    scores = []
    for fold in range(k - 1):
        first_removed = size * fold
        removed = set(range(first_removed, first_removed + size))
        # Row indices that stay in the training portion for this round.
        kept = list(set(range(X.shape[0])) - removed)
        tree = learn(np.take(X, kept, axis=0))
        first, second = dt.test(tree, Y)
        scores.append(dt.accr(first, second))
    return scores
示例#8
0
def DecisionTree(words):
    """Load a ``DT.Tree`` with fixed parameters and return its prediction.

    The tree is loaded with the constants 150 and 15 (named alpha and beta
    in the original code) and then asked to predict from its root.

    Returns:
        The value returned by ``Tree.predict`` for ``words``.
    """
    tree = DT.Tree()
    tree.load(tree.root, 150, 15)
    return tree.predict(tree.root, words)
示例#9
0
def DecisionTreeTest(pca_option):
    """Run the decision-tree simulation, then score it on the final validation set.

    The validation features are passed through ``processing.linear_pca`` first
    when ``pca_option`` is ``'yes'`` or ``'both'``; otherwise they are used
    as-is. The predictions are also stored in the module-level
    ``DT_final_predictions``.

    Returns:
        Tuple ``(accuracy, precision, recall)``.
    """
    global DT_final_predictions

    import DT

    DT.DecisionTreeSimulation(
        DT.dt, processing.linear_pca, processing.overall_training_data, pca_option)

    processing.final_validation = np.array(processing.final_validation)

    FV_features, FV_labels = processing.createFeatures_Labels(
        processing.final_validation)
    FV_features_data, _ = processing.convertToDataFrame(
        FV_features, FV_labels, processing.column_titles)

    # Only the predictor input differs between the PCA and non-PCA paths.
    if pca_option == 'yes' or pca_option == 'both':
        model_input = processing.linear_pca.transform(FV_features_data)
    else:
        model_input = FV_features_data

    final_predictions = DT.dt.predict(model_input)
    DT_final_predictions = final_predictions

    accuracy = metrics.accuracy_score(final_predictions, FV_labels)
    precision = metrics.precision_score(
        FV_labels, final_predictions, average='micro')
    recall = metrics.recall_score(
        FV_labels, final_predictions, average='micro')

    print('DECISION TREE MODEL FINAL TEST DATA ACCURACY: ', 100 * accuracy)
    print('DECISION TREE MODEL FINAL TEST DATA PRECISION: ', 100 * precision)
    print('DECISION TREE MODEL FINAL TEST DATA RECALL: ', 100 * recall)
    print()

    return accuracy, precision, recall
示例#10
0
def running_dt(data, drop_g1g2=0):
    """Pre-process ``data``, run ``DT.dt`` on it, and print and return its scores.

    Returns:
        The three values from ``evaluate_result`` as a tuple ``(P, R, F1)``.
    """
    splits = pre_process(data, drop_g1g2)
    train_x, test_x, train_y, test_y = splits
    predicted = DT.dt(train_x, train_y, test_x)
    P, R, F1 = evaluate_result(test_y, predicted)
    for caption, value in (("Precise:", P), ("Recall:", R), ("F1 Score:", F1)):
        print(caption + str(value))
    print()
    return P, R, F1
示例#11
0
def run_test(trX, trY,res_file):
    """Run baseline DT and KNN train/test passes and log their timings.

    For each configuration (DT cutoff 20 and 50, KNN k=1 and k=3) a
    ``tt.TrainTest`` run is executed, its accuracy is checked against a
    hard-coded target with ``verifyAcc``, and the train/test times are
    printed and appended to ``res_file``.

    Args:
        trX: 2-D feature array (indexed as ``trX[rows, :]``).
        trY: labels aligned with the rows of ``trX``.
        res_file: writable file-like object receiving the timing lines.

    Returns:
        None. NOTE(review): ``res`` accumulates the ``verifyAcc`` results but
        is never returned -- confirm whether ``return res`` was intended.
    """
    # Accuracy targets handed to verifyAcc for each configuration.
    desired_dt20 = 0.78
    desired_dt50 = 0.78
    desired_knn1 = 0.70
    desired_knn3 = 0.73
    
    print '\n\nFirst, we run DT and KNN on the training/development data to '
    print 'ensure that we are getting roughly the right accuracies.'
    print 'We use the first 80% of the data as training, and the last'
    print '20% as test.'
    
    
    decTree = DT.DT()
    res = 1

    print '\nDT (cutoff=20)...'
    sizeX = trX.shape
    # Row index separating the first 80% (training) from the rest (test).
    end = int(np.round(sizeX[0]*0.80,decimals=0))
    testRun = tt.TrainTest(decTree, trX[:end, :], trY[:end], trX[end:, :], trY[end:], 20)
    acc = testRun.run_tt()
    res += testRun.verifyAcc(acc['acc'], desired_dt20)
    print'\nTrainTime, TestTime', acc['trainTime'], acc['testTime']
    res_file.write('\nDT (cutoff=20)')
    res_file.write('\nTrainTime, TestTime ' + str(acc['trainTime']) + ', ' + str(acc['testTime']))
 
    print '\nDT (cutoff=50)...'
    testRun = tt.TrainTest(decTree, trX[:end, :], trY[:end], trX[end:sizeX[0], :], trY[end:sizeX[0]], 50)
    acc = testRun.run_tt()
    res += testRun.verifyAcc(acc['acc'], desired_dt50)
    print'\nTrainTime, TestTime', acc['trainTime'], acc['testTime']
    res_file.write('\nDT (cutoff=50)')
    res_file.write('\nTrainTime, TestTime ' + str(acc['trainTime']) + ', ' + str(acc['testTime']))
    
    knnModel = KNN.KNN()
    print '\nKNN (K=1)'
    # The KNN split point is computed from at most 10000 rows.
    # NOTE(review): the test slice below still runs to sizeX[0], not to
    # max_size -- confirm that testing on all remaining rows is intended.
    max_size = sizeX[0] if sizeX[0] < 10001 else 10000
    end = int(np.round(max_size*0.80,decimals=0)) 
    testRun = tt.TrainTest(knnModel, trX[:end, :], trY[:end], trX[end:sizeX[0], :], trY[end:sizeX[0]], 1)
    acc = testRun.run_tt()
    res += testRun.verifyAcc(acc['acc'], desired_knn1)
    print'\nTrainTime, TestTime', acc['trainTime'], acc['testTime']
    res_file.write('\nKNN (K=1)')
    res_file.write('\nTrainTime, TestTime ' + str(acc['trainTime']) + ', ' + str(acc['testTime']))
 
    print '\nKNN (K=3)'
    testRun = tt.TrainTest(knnModel, trX[:end, :], trY[:end], trX[end:sizeX[0], :], trY[end:sizeX[0]], 3)
    acc = testRun.run_tt()
    res += testRun.verifyAcc(acc['acc'], desired_knn3)
    print'\nTrainTime, TestTime', acc['trainTime'], acc['testTime']
    res_file.write('\nKNN (K=3)')
    res_file.write('\nTrainTime, TestTime ' + str(acc['trainTime']) + ', ' + str(acc['testTime']))

    # Block until the user acknowledges the results (Python 2 raw_input).
    raw_input('\nPress enter to continue...')
    
    return
示例#12
0
def find_best_model(df, tgt):
    """Evaluate four models on 100 random seeds and print summary statistics.

    Each round draws a seed with ``random.randrange(100)`` and records the
    value returned by ``LR.linreg``, ``LS.lasso``, ``DT.dectree`` and
    ``DNN.nn`` for that seed. Afterwards ``DataFrame.describe()`` is printed
    for each model's collected values.
    """
    scores = {'lr': [], 'ls': [], 'dt': [], 'dnn': []}

    for _ in range(100):
        seed = random.randrange(100)
        scores['lr'].append(LR.linreg(df, tgt, seed))
        scores['ls'].append(LS.lasso(df, tgt, seed))
        scores['dt'].append(DT.dectree(df, tgt, seed))
        scores['dnn'].append(DNN.nn(df, tgt, seed))

    for model_name in ('lr', 'ls', 'dt', 'dnn'):
        print(pd.DataFrame({model_name: scores[model_name]}).describe())
示例#13
0
def learningcurve(df,p=10,n=100):
    m,z = df.shape
    size = int(0.7*m/n)
    sizes =[]
    traina = []
    testa = []
    times = []
    for i in range(1,n):
        train,trial = dt.split(df,size*i,int(0.3*m))
        s = time.clock()
        tree = learn(train)
        a,b = dt.test(tree, trial)
        score = dt.accr(a,b)
        c,d = dt.test(tree, train)
        scoret = dt.accr(c,d)
        e = time.clock()
        sizes.append(size*i)
        traina.append(scoret)
        testa.append(score)
        times.append(e-s)
    print("Trial Times")
    print(times)
    return sizes,testa,traina
def crossValidate(x,y, method = "dT",cv=5, alpha = None, min_node = 1):
    """Cross-validate the decision tree and summarise per-fold results.

    The data is split with a shuffled ``KFold`` (fixed ``random_state``). In
    each fold ``alpha`` is either taken from the argument or selected with
    ``DTme.hyperParometer`` on the training part, then ``decisionTree`` is
    fitted and scored with ``perfMeasure``.

    Args:
        x: 2-D feature array (indexed as ``x[rows, :]``).
        y: 2-D target array (indexed as ``y[rows, :]``).
        method: selects the result layout; ``"dT"`` is the layout that the
            loop body fills in.
        cv: number of folds.
        alpha: fixed alpha; ``None`` means select it per fold.
        min_node: forwarded to ``decisionTree``.

    Returns:
        Dict mapping each result key to ``[mean, std]`` across folds
        (computed with ``np.nanmean`` / ``np.nanstd`` along axis 0).
    """
    #  error measure
    # NOTE(review): only the "dT" layout has the "alpha" key that the loop
    # appends to; other methods fail there -- confirm intended usage.
    results = []
    if method == "logReg":
        results = {"perf":[], "coef":[], "interc":[]}
    elif method == "dT":
        results = {"alpha": [], "perf":[]}

    # cross validation #
    np.random.seed(1100)
    kf = KFold(n_splits = cv, shuffle = True, random_state = 0) ## for testing fixing random_state
    for train,test in kf.split(x):
        x_train = x[train,:]
        y_train = y[train,:]
        x_test = x[test,:]
        y_test = y[test,:]

        # training and predict

        # Identity test: "alpha == None" compares element-wise (and then
        # fails in the if) when alpha is an array-like value.
        if alpha is None:
            ## nested select validate and test ##
            alpha_sel, _ = DTme.hyperParometer(x_train,y_train)
        else:
            alpha_sel = alpha
        result = decisionTree(x_train, y_train, x_test, alpha = alpha_sel, min_node = min_node)

        # performance measure

        alpha_sel, y_pred = result
        results["perf"].append(perfMeasure(y_pred,y_test,rankopt=True))
        results["alpha"].append(alpha_sel)
        # Same output as the former print statement, valid on Python 2 and 3.
        print("%s alpha" % (alpha_sel,))

    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis = 0)
        std = np.nanstd(item, axis = 0)
        results[key] = [mean, std]

    return results
示例#15
0
def main(P, src, dst):
	"""Train a DT model on labelled records from ``src`` and optionally store it.

	Each data line is whitespace-separated: field 1 is the protocol name,
	field 2 the port, the last field the ground-truth label, and the fields
	in between are split into two equal halves ("up" and "down"), of which
	only the first ``P.i`` values are kept as features.

	Args:
		P: options object; ``P.i`` (values kept per direction), ``P.t``
			(training sample count, 0 = use everything) and ``P.T``
			(test sample count) are read.
		src: iterable of text lines.
		dst: destination handed to ``DT.store``; falsy skips storing.
	"""
	samples = []
	total = 0

	# read data
	for line in src:
		line = line.strip()
		# Blank lines used to raise IndexError on line[0]; skip them like
		# every other line that does not start with a digit.
		if not line or not line[0].isdigit(): continue
		total += 1

		d = line.split()
		proto = 1 if d[1] == "TCP" else 2
		port = int(d[2])
		gt = d[-1]

		b = int((len(d)-4) / 2)
		pl_up = [int(x) for x in d[3:3+b]][0:P.i]
		pl_down = [int(x) for x in d[3+b:-1]][0:P.i]

		# A record with no values in one direction cannot be checked below
		# and carries no features, so drop it instead of crashing.
		if not pl_up or not pl_down: continue
		# -1 in the first slot marks a record to ignore.
		if pl_up[0] == -1 or pl_down[0] == -1: continue

		v = [proto,port] + pl_up + pl_down
		samples.append((v, gt))

	# An input without any data line used to divide by zero here.
	kept_pct = 100.0*len(samples)/total if total else 0.0
	print("read %d samples out of %d total (%.2f%%)" % (len(samples), total, kept_pct))

	# take random samples
	if P.t > 0:
		samples = random.sample(samples, P.t+P.T)
		train = samples[:P.t]
		test = samples[P.t:]
	else:
		train = samples
		test = []

	# train
	knc = DT()
	knc.fit([x[0] for x in train], [x[1] for x in train])

	# test
	if len(test) > 0:
		(acc, ratio, err) = knc.score([x[0] for x in test], [x[1] for x in test])
		print("ok %.3f%%\tin %.3f%%\tof %d K total (%d errors)" %
			(acc * 100.0, ratio * 100.0, len(test)/1000.0, err))

	# store model
	if dst: knc.store(dst)
示例#16
0
    def repeatTheLearningProcess(self, bestGridSearched, set):
        """Rebuild the learner from its best grid-search parameters and re-plot.

        The best parameters are obtained from ``getBestGridSearchedModel``. If
        ``bestGridSearched.learnerType`` names one of the known learner
        families, a fresh learner of that family is constructed with those
        parameters and ``set.datasetNo``; otherwise the passed-in learner is
        kept. The learning and complexity curves are then produced for it.

        Returns:
            The learner that the curves were produced for.
        """
        _, bestParams = self.getBestGridSearchedModel(bestGridSearched, set)

        learnerType = bestGridSearched.learnerType
        if learnerType == 'KNN':
            learnerClass = KNN.KNNLearner
        elif learnerType == 'DT':
            learnerClass = DT.DTLearner
        elif learnerType == 'SVM':
            learnerClass = SVM.SVMLearner
        elif learnerType == 'Boosting':
            learnerClass = Boosting.BoostingLearner
        elif learnerType == 'ANN':
            learnerClass = ANN.ANNLearner
        else:
            # Unknown family: keep working with the learner that was passed in.
            learnerClass = None

        if learnerClass is not None:
            bestGridSearched = learnerClass(**bestParams,
                                            datasetNo=set.datasetNo)

        self.getLearningCurve(bestGridSearched, set)
        self.getComplexityCurve(bestGridSearched, set)

        return bestGridSearched
#CopaLeche.updateCopa()

# Main menu

# Ask once for the top-level menu choice; option 8 means "exit".
opcion = int(
    input(
        "MENU\n 1-ORGANIZACIONES \n 2-COPAS \n 3-PAISES \n 4-LIGAS \n 5-EQUIPOS \n 6-JUGADOR \n 7- DTs \n 8- Salir \n OPCION: "
    ))
# One object per menu entity, created up front for the menu loop below.
ORG = Organizacion()
COPA = Copa()
PAIS = Pais()
Ligue = Liga()
Team = Equipo()
Player = Jugador()
DeTe = DT()

while (opcion != 8):

    #opcion = int(input("\n MENU\n 1-ORGANIZACIONES \n 2-COPAS \n 3-PAISES \n 4-LIGAS \n 5-EQUIPOS \n 6-JUGADOR \n 7- DTs \n 8- Salir \n OPCION: "))

    if (opcion == 1):

        opcionOrg = int(
            input(
                "\n 1-CREAR ORGANIZACION \n 2-INSERTAR ORG EN BASE \n 3-VER ORGANIZACIONES DE LA BASE \n 4-MODIFICAR UNA ORGANIZACION \n 5-ELIMINAR UNA ORGANIZACION \n 6-VOLVER AL 1ER MENU \n OPCION: "
            ))

        if (opcionOrg == 1):

            nombre_org = input("Escriba el nombre de la organizacion: ")
示例#18
0
'''
import numpy as np
import DT as dt
import KNN as knn


# Smoke test: train and query the DT and KNN models on a tiny hand-made
# binary data set (Python 2 print statements).
if __name__ == '__main__':

    print 'running tests on DT and KNN'
    #This is the class example [mathy, test >= 80, project >= 80, early]
    #with a slight change so that non-mathy first splits on early.
    # 14 training rows of 4 binary features, with one label per row.
    trX=np.array([[1,1,1,1],[1,1,1,0],[0,1,0,1],[0,0,1,1],[0,0,1,1],[0,0,0,0],[0,0,0,0],[1,0,1,1],[1,0,0,1],[0,0,1,1],[1,0,0,0],[0,0,1,1],[0,1,0,1],[0,0,1,0]])
    trY=np.array([[1],[1],[0],[0],[0],[1],[0],[1],[0],[0],[0],[0],[0],[1]])
    # 3 development rows and their labels (deY is not used below).
    deX = np.array([[0,1,0,0],[0,0,1,0],[0,1,1,1]])
    deY = np.array([[0],[1],[0]])

    decTree = dt.DT()
    print 'DT, cutoff=0'
    # res() is the single entry point: 'train' builds a model, 'predict' applies it.
    trainModel = decTree.res('train',X=trX,Y=trY,h_param=0)
    decTree.DTdraw(trainModel)
    output = decTree.res('predict',model=trainModel,test_case=deX)
    print output
    
    knnMode = knn.KNN()
    print 'KNN, k=1'
    trainModel = knnMode.res('train',X=trX,Y=trY,h_param=1)
    output = knnMode.res('predict',model=trainModel,test_case=deX)
    print output
    
    print 'Done'
示例#19
0
import sys
import time
import marshal
import stat
    

def phfunc(name, obj):
    """Serialize ``obj`` with ``marshal`` into the file at path ``name``.

    Args:
        name: path of the file to (over)write.
        obj: any value supported by ``marshal.dump``.
    """
    # marshal produces bytes, so the file must be opened in binary mode. The
    # context manager also closes (and flushes) the handle, which the bare
    # open() call never did.
    with open(name, 'wb') as out:
        marshal.dump(obj, out)
    
# Command-line driver: render the template named by argv[1], reusing a cached
# compiled form stored next to it as "<file>.dtcc" when one can be loaded.
# NOTE(review): os and DT are used here but are not among the imports visible
# above -- confirm they are imported elsewhere in the file.
if __name__=='__main__':
    bt = time.time()
    fname=sys.argv[1]
    mtime=os.stat(fname)[stat.ST_MTIME]
    cform=sys.argv[1]+'.dtcc'
    # Any failure to stat or load the cache is treated as "no cache".
    # NOTE(review): the bare except also hides unrelated errors.
    try:
        cmtime=os.stat(cform)[stat.ST_MTIME]
        comp_form=marshal.load(open(cform))
    except:
        comp_form=None
        cmtime=-1
    # The lambda lets DT hand back a compiled form, which phfunc writes to cform.
    d=DT.DT(open(fname).read(), fname, comp_form, mtime, cmtime,
            lambda x, y=cform: phfunc(y, x))
    # Empty object used as the namespace the template is rendered against.
    class dumb: pass
    ns=dumb()
    text = d(ns)
    et = time.time()
    print text
    print 'elapsed time:', et - bt
    
    
示例#20
0
            res_file.write('\n' + disp + '\ndone')


        base = 'baseline'+data_types[i]
        if base not in results.keys():
            print "Lets run some baseline measures..."
            res = run_test(trX,trY,res_file)
            res_file.write('\n' + base + '\n')
            res_file.write(str(res)) 
            raw_input('Press enter to continue...')
 
        dec = 'dt'+data_types[i]
        if dec not in results.keys():
            print '\nNow we vary the cutoff for the decision tree and see how it affects accuracy...'
            thresh = [5,10,20,40,80,160]
            decTree = DT.DT()
            res = run_comps(decTree, thresh, trX[0:4800, :], trY[0:4800], trX[4801:6000, :], 
                        trY[4801:6000],"Figure 2: DT cutoff versus accuracy (MNIST)","DT cutoff","../figure2.png")
            results[dec] = res
            res_file.write('\n' + dec + '\n') 
            res_file.write(str(res))
            raw_input('Press enter to continue...')
     
        neigh = 'knn'+data_types[i]
        if neigh not in results.keys():
            print '\nNow we vary the k for the KNN classifier and see how it affects accuracy...'
            allK = [1,8,16,32,64,128]
            knnModel = KNN.KNN()
            res = run_comps(knnModel, allK, trX[0:2000, :], trY[0:2000], trX[2001:2501, :], 
                         trY[2001:2501],"Figure 3: KNN count versus accuracy (MNIST)","KNN count","../figure3.png")
            results[neigh] = res
示例#21
0
        lr.append(LR.linreg(df, tgt, seed))
        ls.append(LS.lasso(df, tgt, seed))
        dt.append(DT.dectree(df, tgt, seed))
        dnn.append(DNN.nn(df, tgt, seed))

    print(pd.DataFrame({'lr': lr}).describe())
    print(pd.DataFrame({'ls': ls}).describe())
    print(pd.DataFrame({'dt': dt}).describe())
    print(pd.DataFrame({'dnn': dnn}).describe())


# Name of the target column to predict.
tgt = 'medv'

# Load the prepared data set for that target.
df = prepro.Data(tgt)

y = df[tgt]

# Fixed seed so the single runs below are repeatable.
seed = 101

# Run each model once on the same data, target and seed.
LR.linreg(df, tgt, seed)

LS.lasso(df, tgt, seed)

DT.dectree(df, tgt, seed)

DNN.nn(df, tgt, seed)

#find_best_model(df, tgt)

#PCA.analysis(df)
示例#22
0
import DT
import treePloter
import numpy as np

# Read the tab-separated records. The context manager closes the file handle,
# which the bare open() call used to leak.
with open('lenses.txt') as fr:
    dataSet = [line.strip().split('\t') for line in fr]
dataSet = np.array(dataSet)
# Two separate lists with the same attribute names are handed to createTree.
# NOTE(review): presumably one of them is mutated while the tree is built.
attribute_list = ['age', 'prescript', 'astimatic', 'tearRate']
fixed_attribute_list = ['age', 'prescript', 'astimatic', 'tearRate']
tree = DT.createTree(np.array(dataSet), attribute_list, fixed_attribute_list)
print(tree)
#print(DT.classify(tree,['pre','myope','yes','normal','hard'],fixed_attribute_list))
#treePloter.createPlot(tree)
示例#23
0
# Split the data, run the configured algorithm, then optionally save the
# testing/training sets and export the generated files.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=cfg.test_size, random_state=1)

fu.cleanSIDirs('./out/')

# Normalise the configured algorithm name once instead of on every comparison.
_algo = cfg.algo.lower()

# Each algorithm module is imported lazily so only the selected one is loaded.
if _algo == 'ann':
    import ANN as ann
    ann.process(X_train, X_test, y_train, y_test)
elif _algo == 'k-nn':
    import KNN as knn
    knn.process(X_train, X_test, y_train, y_test)
elif _algo == 'svm':
    import SVM as svm
    svm.process(X_train, X_test, y_train, y_test)
elif _algo == 'dt':
    import DT as dt
    dt.process(X_train, X_test, y_train, y_test)

if cfg.ds_test:
    if cfg.nTests == 'full':
        fu.saveTestingSet(X_test, y_test)
    elif cfg.nTests is not None:
        fu.saveTestingSet(X_test[0:cfg.nTests], y_test[0:cfg.nTests], full=False)
if _algo == 'k-nn':
    fu.saveTrainingSet(X_train, y_train)
if cfg.export_dir is not None:
    fu.cleanSIDirs(f'{cfg.export_dir}/')
    fromDirectory = "./out/include"
    toDirectory = f"{cfg.export_dir}/ds/include"
    try:
        from distutils.dir_util import copy_tree
    except ImportError:
        # distutils was removed from the standard library in Python 3.12;
        # shutil.copytree with dirs_exist_ok=True also copies into an
        # existing destination.
        import shutil
        shutil.copytree(fromDirectory, toDirectory, dirs_exist_ok=True)
    else:
        copy_tree(fromDirectory, toDirectory)
示例#24
0
def _dtCompileFunc( name, data ):
    """Compile the template text ``data`` registered under ``name``.

    Thin adapter that reorders its arguments for ``DT.compileTemplate`` and
    supplies the module's ``tagRegistry``.
    """
    compiled = DT.compileTemplate( data, name, tagRegistry )
    return compiled
示例#25
0
    DataSet = pd.read_csv(data_file)

import DT as decision_tree

# Split into training and test sets
# A random subset (without replacement) of between roughly half and all of
# the candidate rows is used for training; the remaining rows form the test set.
# NOTE(review): index is shape[0] - 1, so the last row is never a training
# candidate -- confirm this is intended.
index = DataSet.shape[0] - 1
index_train = np.arange(index)
rand_train = np.random.choice(index_train,
                              size=random.randint(int((index + 1) / 2), index),
                              replace=False)

# NOTE(review): iloc selects by position while drop removes by label; the two
# agree only if DataSet has the default 0..n-1 index -- confirm.
DataSet_train = DataSet.iloc[rand_train]
DataSet_test = DataSet.drop(rand_train)

# generate a full tree
root = decision_tree.TreeGenerate(DataSet_train)
decision_tree.DrawPNG(
    root,
    "Decision Tree/Decision Tree Based on Gini Index/Decision Tree Based on Gini Index.png",
)
print("accuracy of full tree: %.3f" %
      decision_tree.PredictAccuracy(root, DataSet_test))

# pre-pruning
root = decision_tree.PrePurn(DataSet_train, DataSet_test)
decision_tree.DrawPNG(
    root,
    "Decision Tree/Decision Tree Based on Gini Index/decision_tree_pre.png")
print("accuracy of pre-purning tree: %.3f" %
      decision_tree.PredictAccuracy(root, DataSet_test))
示例#26
0
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 16 15:19:55 2017

@author: Thomas
"""

import DT
import pool
import row
import server

# Build a DT object from 'dc.in' (presumably an input file path) and display it.
h = DT.DT('dc.in')
h.disp()
示例#27
0
文件: car.py 项目: GregDobby/CS6350
    'maint': ['vhigh', 'high', 'med', 'low'],
    'doors': ['2', '3', '4', '5more'],
    'persons': ['2', '4', 'more'],
    'lug_boot': ['small', 'med', 'big'],
    'safety': ['low', 'med', 'high']
}

# Possible values of the class label column.
label = {'label': ['unacc', 'acc', 'good', 'vgood']}

# Accuracy tables indexed as [feature_selection][max_depth - 1]:
# 3 feature-selection settings by 6 depths.
train_acc = [[0 for x in range(6)] for y in range(3)]
test_acc = [[0 for x in range(6)] for y in range(3)]

for feature_selection in range(3):
    for max_depth in range(6):
        # ID3
        # The loop index is 0-based, so the actual depth limit is max_depth + 1.
        dt_generator = dt.ID3(feature_selection=feature_selection,
                              max_depth=max_depth + 1)
        # get decision tree
        decision_tree = dt_generator.generate_decision_tree(
            train_data, features, label)
        # train acc
        # predict
        # Predictions go into a 'plabel' column; accuracy is the fraction of
        # rows whose 'plabel' equals 'label'.
        train_data['plabel'] = dt_generator.classify(decision_tree, train_data)
        train_acc[feature_selection][max_depth] = train_data.apply(
            lambda row: 1
            if row['label'] == row['plabel'] else 0, axis=1).sum() / train_size
        # test acc
        # predict
        test_data['plabel'] = dt_generator.classify(decision_tree, test_data)
        test_acc[feature_selection][max_depth] = test_data.apply(
            lambda row: 1
            if row['label'] == row['plabel'] else 0, axis=1).sum() / test_size
示例#28
0
import DT
import numpy as np

# Load integer features and labels from comma-separated files.
X = np.loadtxt('X.csv', dtype=np.int32, delimiter=',')
y = np.loadtxt('y.csv', dtype=np.int32, delimiter=',')

# Fit the tree and predict on the same data it was trained on.
tree = DT.decision_tree()
tree.fit(X, y)
tree.predict(X)

# No-op assignment. NOTE(review): presumably a line to hang a debugger
# breakpoint on -- confirm before removing.
pause = 0
示例#29
0
 def __init__(self):
     """Set up the instance by calling ``DT.initBoard()``."""
     DT.initBoard()
示例#30
0
 def trigger_out(self, value, event):
     """Drive the DT output bits for ``value`` according to ``event``.

     ``'strobed'`` arms ``STROBE_EVT`` for toggling on post, sets ``value``,
     clears ``MAXPOSTABLEINT`` and then disarms the toggle; ``'start'``
     toggles ``value``; ``'stop'`` clears ``value``. Any other event does
     nothing.
     """
     if event == 'strobed':
         # The order of these four calls matters: arm, set, clear, disarm.
         DT.toggleBitsOnPost(STROBE_EVT)
         DT.setBitsNoDelay(value)
         DT.clearBitsNoDelay(MAXPOSTABLEINT)
         DT.toggleBitsOnPost(0)
     if event == 'start':
         DT.toggleBitsNoDelay(value)
     if event == 'stop':
         DT.clearBitsNoDelay(value)
示例#31
0
        a,b = dt.test(tree, trial)
        score = dt.accr(a,b)
        c,d = dt.test(tree, train)
        scoret = dt.accr(c,d)
        e = time.clock()
        sizes.append(size*i)
        traina.append(scoret)
        testa.append(score)
        times.append(e-s)
    print("Trial Times")
    print(times)
    return sizes,testa,traina

# Script entry point: cross-validate on collection 1, then train on a 70/30
# split and report the test score.
if __name__=="__main__":
    print ("Boosted Decision Tree")
    # time.clock() was removed in Python 3.8; prefer perf_counter() and fall
    # back to clock() only on interpreters that predate it.
    _timer = getattr(time, "perf_counter", None) or time.clock
    df1,df2 = dt.readin()
    train1, trial1, size = dt.cross(df1)
    score = crossval(train1,trial1,size)
    print("Cross Validation for Collection 1")
    print(score)
    m,n = df1.shape
    train1, trial1 = dt.split(df1,int(0.7*m),int(0.3*m))
    s = _timer()
    tree1 = learn(train1)
    a,b = dt.test(tree1, trial1)
    score = dt.accr(a,b)
    c,d = dt.test(tree1, train1)
    scoret = dt.accr(c,d)
    e = _timer()
    print("Testing Set Score for Collection 1")
    print(score)
示例#32
0
def TrainingOnDT():
    global filenameForTrainingResult,filenameForPredictedResult,\
        DTClass_weight,DTMax_features,\
        DTCriterion,unknowRow,hamRow,spamRow,PercisionRow,recallRow

    print DTClass_weight.get()
    DTconfusion_matrix, percision, recall, combinedResultOnActualAndPred = DT.DT(
        DTCriterion.get(), DTMax_features.get(), DTClass_weight.get())
    print DTconfusion_matrix, percision, recall
    strOnUnknowRow = "unknown:            " +  "            ".join( str(x)
                                                                    for x in \
            DTconfusion_matrix[0])
    strOnHamRow= "Ham:           " + "         ".join(str(x) for x in \
                                                DTconfusion_matrix[1])
    strOnSpamRow = "Spam:           " + "          ".join(str(x) for x in \
                                                DTconfusion_matrix[2])
    strOnPercisionRow = "precision On spam is : " + str(percision)
    strOnRecallRow = "Recall on spam is : " + str(recall)
    unknowRow.set(strOnUnknowRow)
    hamRow.set(strOnHamRow)
    spamRow.set(strOnSpamRow)
    PercisionRow.set(strOnPercisionRow)
    recallRow.set(strOnRecallRow)

    # ####write Condussion result into file
    print filenameForTrainingResult

    if (filenameForTrainingResult == ''):
        pass
    else:
        fileOnTrainingResult = open(filenameForTrainingResult, 'w')
        try:
            fileOnTrainingResult.write("The result of confusion matrix")
            fileOnTrainingResult.write('\r\n')
            fileOnTrainingResult.write("             predicted           ")
            fileOnTrainingResult.write('\r\n')
            fileOnTrainingResult.write(
                "                     unknown    ham    spam")
            fileOnTrainingResult.write('\r\n')
            fileOnTrainingResult.write(strOnUnknowRow)
            fileOnTrainingResult.write('\r\n')
            fileOnTrainingResult.write(strOnHamRow)
            fileOnTrainingResult.write('\r\n')
            fileOnTrainingResult.write(strOnSpamRow)
            fileOnTrainingResult.write('\r\n')
            fileOnTrainingResult.write(strOnPercisionRow)
            fileOnTrainingResult.write('\r\n')
            fileOnTrainingResult.write(strOnRecallRow)
        finally:
            fileOnTrainingResult.close()

    # ###Write ID --Actual result --Predicted resutl into file
    print filenameForPredictedResult
    if (filenameForPredictedResult == ''):
        pass
    else:
        fileOnIndividualResult = open(filenameForPredictedResult, 'w')
        fileOnIndividualResult.write("     ID           "
                                     "\tActual\tPredicted\r\n")
        try:
            for i in combinedResultOnActualAndPred:
                fileOnIndividualResult.write(i)
            # fileOnIndividualResult.write('\r\n')

        finally:
            fileOnIndividualResult.close()

    print strOnUnknowRow
示例#33
0
import DT
import numpy as np

# Build a tree from the sample data set provided by DT.createDataSet().
dataSet, labels = DT.createDataSet()
# Two separate lists with the same attribute names are handed to createTree.
# NOTE(review): presumably one of them is mutated while the tree is built.
attribute_list = ['surface', 'flipper']
fixed_attribute = ['surface', 'flipper']
#mapToStr = ['Does it live on surface?','Does it have the flipper?']
tree = DT.createTree(np.array(dataSet), attribute_list, fixed_attribute)
#treePloter.createPlot(tree)
#print (DT.classify(tree,['1','1'],fixed_attribute))
print(tree)
示例#34
0
def test_impurity():
    print DT.entropy([1, 1, 1, 0])
    print DT.entropy([])
    print DT.entropy([1, 1, 1, 1])
    a = DT.DecisionTree(1)
    print a.impurity_func([1, 1, 0, 1], [0, 0])
示例#35
0
# Possible values of the target column 'y'.
label = {'y': ['yes', 'no']}

# num_run independent runs, each building T trees.
num_run = 100
T = 1000

# test_py[r] accumulates the +1/-1 votes of every tree in run r;
# test_py_first accumulates only the vote of the first tree (t == 0) of each run.
test_py = np.array([[0 for x in range(test_size)] for y in range(num_run)])
test_py_first = np.array([0 for x in range(test_size)])

for iter in range(num_run):
    # Each run works on its own 1000-row subset drawn without replacement.
    train_subset = train_data.sample(n=1000, replace=False, random_state=iter)
    for t in range(T):
        print('iter: ', iter, 't: ', t)
        # sample with replace
        sampled = train_subset.sample(frac=0.01, replace=True, random_state=t)
        # ID3
        dt_generator = dt.ID3(feature_selection=0, max_depth=17, subset=6)
        # get decision tree
        decision_tree = dt_generator.generate_decision_tree(
            sampled, features, label)
        ## predict
        # test
        py = dt_generator.classify(decision_tree, test_data)
        # Map the 'yes'/'no' predictions to +1/-1 so they can be summed as votes.
        py = np.array(py.tolist())
        py[py == 'yes'] = 1
        py[py == 'no'] = -1
        py = py.astype(int)
        test_py[iter] = test_py[iter] + py
        if t == 0:
            test_py_first = test_py_first + py

# Target values of the test set, as an array.
true_value = np.array(test_data['y'].tolist())
def pertest(name, item):
    print name, item
    print len(pfunc(item))


# Plain attribute holder with two sample class attributes; an instance of it
# is passed to the compiled template further down.
class nsc:
    this = 'that'
    num = 1


# Namespace object handed to the template when it is rendered.
ns = nsc()

import DT
# Print the timer after each stage: start, template compiled, template rendered.
t = timer.Timer()
print t, 'start'
x = DT.DT(open('tests/test1.dtml').read())
print t, 'cooked'
output = x(ns)
print t, 'rendered'

#node=x.node
#for i in node.children:
#    pertest('i',i)
#    if hasattr(i, 'children'):
#        for j in i.children:
#            pertest('j',j)
#            if hasattr(j,'children'):
#                for k in j.children:
#                    pertest('k', k)

print 'serializing'