示例#1
0
def train_randomized_forest(training_data):
    """Build a randomized forest from ``training_data`` with fixed settings.

    Announces the training run on standard output, then delegates to
    ``treerandom.build_randomized_forest`` using ``m=100``,
    ``kcandidates=5`` and ``nmin=5``, reporting progress through a
    ``ProgBar`` callback.

    Args:
        training_data: Examples handed unchanged to
            ``treerandom.build_randomized_forest``.

    Returns:
        Whatever ``treerandom.build_randomized_forest`` returns.
    """
    print("Training Random Forest")
    progress = ProgBar()
    # Original author's note on an earlier configuration:
    # m=100, kcandidates=10, nmin=15 -> 53%
    trained = treerandom.build_randomized_forest(
        training_data,
        m=100,
        kcandidates=5,
        nmin=5,
        callback=progress.callback)
    # Release the progress bar before returning (presumably this lets
    # ProgBar finish its output -- TODO confirm against ProgBar).
    del progress
    return trained
示例#2
0
def build_random_big_forest(rows,m,kcandidates,nmin,pickcandidate=treerandom.pick_candidate_random,number_of_forests=10,callback=None):
    """Build ``number_of_forests`` randomized forests from the same rows.

    Each forest is produced by ``treerandom.build_randomized_forest`` with
    its own ``ProgBar`` reporting progress.  After every forest its index is
    printed and, if ``callback`` is truthy, it is invoked as
    ``callback(index, number_of_forests - index)``.

    Args:
        rows: Training rows forwarded to every forest build.
        m, kcandidates, nmin: Forwarded positionally to
            ``treerandom.build_randomized_forest``.
        pickcandidate: Accepted for interface compatibility.
            NOTE(review): this argument is never forwarded to the forest
            builder, so it currently has no effect -- confirm whether
            ``build_randomized_forest`` should receive it.
        number_of_forests: How many forests to build.
        callback: Optional progress callable, see above.

    Returns:
        A list holding the built forests in build order.
    """
    forests = []
    for index in range(number_of_forests):
        progress = ProgBar()
        built = treerandom.build_randomized_forest(
            rows, m, kcandidates, nmin, callback=progress.callback)
        forests.append(built)
        # Release this forest's progress bar before reporting.
        del progress
        print(index)
        if not callback:
            continue
        callback(index, number_of_forests - index)
    return forests
def train_randomized_forest(training_data):
    """Train one randomized forest on ``training_data`` and return it.

    Prints a banner, then calls ``treerandom.build_randomized_forest`` with
    the hard-coded hyper-parameters below while a ``ProgBar`` receives the
    progress callbacks.

    Args:
        training_data: Examples passed straight through to the forest
            builder.

    Returns:
        The object returned by ``treerandom.build_randomized_forest``.
    """
    print("Training Random Forest")
    # Original author's note on an earlier configuration:
    # m=100, kcandidates=10, nmin=15 -> 53%
    settings = {"m": 100, "kcandidates": 5, "nmin": 5}
    bar = ProgBar()
    result = treerandom.build_randomized_forest(
        training_data, callback=bar.callback, **settings)
    # Drop the progress bar before handing the forest back (presumably so
    # ProgBar can finish its output -- TODO confirm against ProgBar).
    del bar
    return result
def do_kcross_validation(fin,finy,kfolds):
    """Run k-fold cross-validation of the randomized forest and print results.

    The rows read from ``fin`` are passed through ``dt.transform_features``,
    combined with the labels read from ``finy`` and cut into blocks of
    ``len(lines) // kfolds`` rows.  For every ``(m, kcandidates, nmin)``
    combination each block is held out once while a forest is trained on
    the remaining blocks.  Per-fold accuracy, the per-combination average
    and the best combination are printed.

    Args:
        fin: Feature source, read with ``dt.get_lines(fin, float, " ")``.
        finy: Label source, read with ``dt.get_lines(finy, int)``.
        kfolds: Number of folds; must be at least 2 and no larger than the
            number of rows read from ``fin``.

    Returns:
        None.  Everything is reported on standard output.

    Raises:
        ValueError: If ``kfolds`` is smaller than 2, or so large that a
            block would contain no rows.
    """
    if kfolds < 2:
        # With fewer than two folds there is nothing left to train on.
        raise ValueError("kfolds must be at least 2, got " + str(kfolds))

    print("Starting k=" + str(kfolds) + " validation for random forest")
    # Original author's note: there are 2500 tracks in the data set.
    labels = dt.get_lines(finy, int)
    pb = ProgBar()
    lines = dt.get_lines(fin, float, " ", callback=pb.callback)
    # Release the progress bar once loading is done (presumably this lets
    # ProgBar finish its output -- TODO confirm against ProgBar).
    del pb

    # Normalize features (per the original author's note), then attach the
    # label to each row so a block carries everything needed for one fold.
    lines = dt.transform_features(lines)
    data = dt.add_labels_to_lines(lines, labels)

    # Explicit floor division: the block size must be a whole row count.
    block_size = len(lines) // kfolds
    if block_size < 1:
        raise ValueError(
            "kfolds (" + str(kfolds) + ") exceeds the number of examples ("
            + str(len(lines)) + ")")
    print("chunk size = " + str(block_size))
    # NOTE(review): if len(lines) is not a multiple of kfolds, dt.chunks
    # presumably yields an extra, shorter block -- confirm against dt.chunks.
    example_chunks = list(dt.chunks(data, block_size))

    print("number of chunks = " + str(len(example_chunks)))

    # Hyper-parameter grids; extend these lists to search more combinations.
    m = [100]
    k = [5]
    n = [5]

    bestm = 0
    bestk = 0
    bestn = 0
    bestaccuracy = 0

    for m_value in m:
        for k_value in k:
            for n_value in n:
                # Start a fresh list for every combination so the average
                # below only covers the folds of this combination.
                accuracy_results = []
                for leaveout, held_out in enumerate(example_chunks):

                    # Block `leaveout` is kept out of training.
                    print("prepare validation set")
                    print("leaving out block " + str(leaveout) + " for validation")
                    # The last column of each row is the label, the rest
                    # are the features.
                    validationdata = [exampleentry(row[:-1], row[-1])
                                      for row in held_out]

                    trainingdata = []

                    # Python 2 print with a trailing comma: stay on one line.
                    print("merging blocks "),
                    for j, chunk in enumerate(example_chunks):
                        if j != leaveout:
                            print(str(j) + ","),
                            # extend() avoids re-copying the merged list
                            # for every block.
                            trainingdata.extend(chunk)

                    print("\nprepare training set")

                    print("training on " + str(len(trainingdata)))
                    print("each track has " + str(len(trainingdata[0])) + " features")
                    pb = ProgBar()
                    forest = treerandom.build_randomized_forest(
                        trainingdata,
                        m=m_value,
                        kcandidates=k_value,
                        nmin=n_value,
                        callback=pb.callback)
                    del pb
                    print("testing on " + str(len(validationdata)))
                    corrects = 0
                    # Classify every held-out example and count the hits.
                    for example in validationdata:
                        result = treerandom.classify(example.features, forest)
                        if result == example.label:
                            corrects += 1
                    # Accuracy as a whole percentage (rounded down).
                    accuracy_percentage = (corrects * 100) // len(validationdata)
                    print("accuracy = " + str(accuracy_percentage) + "%")
                    accuracy_results.append(accuracy_percentage)
                avgcc = dt.average(accuracy_results)
                print("average accuracy using m=" + str(m_value) + ", k=" + str(k_value) + ", n=" + str(n_value) + "---> " + str(avgcc) + "%")
                if avgcc > bestaccuracy:
                    bestm = m_value
                    bestk = k_value
                    bestn = n_value
                    bestaccuracy = avgcc
    print("BEST COMBINATION m=" + str(bestm) + ", k=" + str(bestk) + ", n=" + str(bestn) + "---> " + str(bestaccuracy) + "%")
def _evaluate_fold(example_chunks, leaveout, m, kcandidates, nmin):
    """Train on every block except ``leaveout`` and score the held-out block.

    Args:
        example_chunks: List of blocks; each block is a list of rows whose
            last element is the label and whose other elements are the
            features.
        leaveout: Index of the block used for validation.
        m, kcandidates, nmin: Hyper-parameters forwarded by keyword to
            ``treerandom.build_randomized_forest``.

    Returns:
        The accuracy on the held-out block as a whole percentage (rounded
        down).
    """
    print("prepare validation set")
    print("leaving out block " + str(leaveout) + " for validation")
    # Split each held-out row into its features and its trailing label.
    validationdata = [
        exampleentry(row[:-1], row[-1]) for row in example_chunks[leaveout]
    ]

    trainingdata = []
    # Python 2 print with a trailing comma: stay on one line.
    print("merging blocks "),
    for j, chunk in enumerate(example_chunks):
        if j != leaveout:
            print(str(j) + ","),
            # extend() avoids re-copying the merged list for every block.
            trainingdata.extend(chunk)

    print("\nprepare training set")

    print("training on " + str(len(trainingdata)))
    print("each track has " + str(len(trainingdata[0])) + " features")
    pb = ProgBar()
    forest = treerandom.build_randomized_forest(
        trainingdata,
        m=m,
        kcandidates=kcandidates,
        nmin=nmin,
        callback=pb.callback)
    # Release the progress bar once training is done (presumably this lets
    # ProgBar finish its output -- TODO confirm against ProgBar).
    del pb

    print("testing on " + str(len(validationdata)))
    corrects = 0
    for example in validationdata:
        result = treerandom.classify(example.features, forest)
        if result == example.label:
            corrects += 1
    accuracy_percentage = (corrects * 100) // len(validationdata)
    print("accuracy = " + str(accuracy_percentage) + "%")
    return accuracy_percentage


def do_kcross_validation(fin, finy, kfolds):
    """Run k-fold cross-validation of the randomized forest and print results.

    Rows read from ``fin`` are passed through ``dt.transform_features``,
    combined with the labels read from ``finy`` and cut into blocks of
    ``len(lines) // kfolds`` rows.  For every ``(m, kcandidates, nmin)``
    combination each block is held out once while a forest is trained on
    the others.  Per-fold accuracy, the per-combination average and the
    best combination are printed.

    Args:
        fin: Feature source, read with ``dt.get_lines(fin, float, " ")``.
        finy: Label source, read with ``dt.get_lines(finy, int)``.
        kfolds: Number of folds; must be at least 2 and no larger than the
            number of rows read from ``fin``.

    Returns:
        None.  Everything is reported on standard output.

    Raises:
        ValueError: If ``kfolds`` is smaller than 2, or so large that a
            block would contain no rows.
    """
    if kfolds < 2:
        # With fewer than two folds there is nothing left to train on.
        raise ValueError("kfolds must be at least 2, got " + str(kfolds))

    print("Starting k=" + str(kfolds) + " validation for random forest")
    # Original author's note: there are 2500 tracks in the data set.
    labels = dt.get_lines(finy, int)
    pb = ProgBar()
    lines = dt.get_lines(fin, float, " ", callback=pb.callback)
    del pb

    # Normalize features (per the original author's note), then attach the
    # label to each row.
    lines = dt.transform_features(lines)
    data = dt.add_labels_to_lines(lines, labels)

    # Explicit floor division: the block size must be a whole row count.
    block_size = len(lines) // kfolds
    if block_size < 1:
        raise ValueError(
            "kfolds (" + str(kfolds) + ") exceeds the number of examples ("
            + str(len(lines)) + ")")
    print("chunk size = " + str(block_size))
    # NOTE(review): if len(lines) is not a multiple of kfolds, dt.chunks
    # presumably yields an extra, shorter block -- confirm against dt.chunks.
    example_chunks = list(dt.chunks(data, block_size))

    print("number of chunks = " + str(len(example_chunks)))

    # Hyper-parameter grids; extend these lists to search more combinations.
    m = [100]
    k = [5]
    n = [5]

    bestm = 0
    bestk = 0
    bestn = 0
    bestaccuracy = 0

    for m_value in m:
        for k_value in k:
            for n_value in n:
                # Collect accuracies per combination only, so the average
                # is not polluted by folds of earlier combinations.
                accuracy_results = [
                    _evaluate_fold(example_chunks, leaveout, m_value,
                                   k_value, n_value)
                    for leaveout in range(len(example_chunks))
                ]
                avgcc = dt.average(accuracy_results)
                print("average accuracy using m=" + str(m_value) + ", k=" +
                      str(k_value) + ", n=" + str(n_value) + "---> " +
                      str(avgcc) + "%")
                if avgcc > bestaccuracy:
                    bestm = m_value
                    bestk = k_value
                    bestn = n_value
                    bestaccuracy = avgcc
    print("BEST COMBINATION m=" + str(bestm) + ", k=" + str(bestk) +
          ", n=" + str(bestn) + "---> " + str(bestaccuracy) + "%")