def Dealing_with_Missing_Data():
    """Demonstrate classification of observations with missing fields.

    Reloads ``treepredict``, builds a tree from ``treepredict.my_data`` and
    prints the ``treepredict.mdclassify`` result for two observations whose
    unknown fields are given as ``None``.
    """
    print('>>Dealing with Missing Data')
    reload(treepredict)
    decision_tree = treepredict.buildtree(treepredict.my_data)
    print('------------------')
    incomplete_rows = (
        ['google', None, 'yes', None],
        ['google', 'France', None, None],
    )
    for observation in incomplete_rows:
        print(treepredict.mdclassify(observation, decision_tree))
def train_random_trees(train_data, origin_attribute_list, label_list,
                       sample_copy_count, attribute_count_per_tree):
    """Train a collection of decision trees on random samples and attributes.

    For each of ``sample_copy_count`` rounds a sample is drawn with
    ``generate_random_sample`` and a set of attribute-index lists is chosen
    with ``choose_attributes_lists``; one tree is built per index list.

    Args:
        train_data: Training rows passed to ``generate_random_sample``.
        origin_attribute_list: Per-attribute entries, indexed by attribute
            number.
        label_list: Passed unchanged to ``treepredict.buildtree``.
        sample_copy_count: Number of random samples to draw.
        attribute_count_per_tree: Passed to ``choose_attributes_lists``.

    Returns:
        list of dicts with keys ``'tree'`` (the built tree) and
        ``'attributes_index'`` (the attribute indices that tree was trained
        on).
    """
    trees = []
    # One round per randomly drawn training set.
    for _ in range(sample_copy_count):
        sample_copy = generate_random_sample(train_data)
        # Random attribute-index lists, one per tree to be trained.
        random_attributes_lists = choose_attributes_lists(
            len(origin_attribute_list), attribute_count_per_tree)
        for attribute_indexes in random_attributes_lists:
            # Rebuild the sample so only the attributes used by this tree
            # remain.
            reduced_data = organize_sample_with_selected_attributes(
                sample_copy, attribute_indexes)
            # Translate attribute numbers into the matching attribute entries.
            real_attribute_list = [
                origin_attribute_list[position]
                for position in attribute_indexes
            ]
            trees.append({
                'tree': treepredict.buildtree(
                    reduced_data, real_attribute_list, label_list),
                'attributes_index': attribute_indexes,
            })
    return trees
def testing_gain_increments(increments=None):
    """Build one tree per gain increment and collect its test accuracy.

    For every value in ``increments`` a tree is built from the module-level
    ``train_data`` (with ``gain_threshold=0`` and ``instance_minimum=1``),
    evaluated with ``treepredict.testTree`` on ``train_data`` and
    ``test_data``, and both confusion matrices are printed tab-separated.

    Args:
        increments: Iterable of ``gain_increment`` values. ``None`` (the
            default) is treated as "no increments"; this replaces the former
            mutable ``[]`` default.

    Returns:
        dict mapping each increment to a one-element list holding the
        classification rate measured on ``test_data``.
    """
    def _report(heading, matrix, rate):
        # Heading, one tab-separated line per matrix row, then a blank line.
        print('%s %s %s' % (heading, rate, '):'))
        for row in matrix:
            print('\t'.join(map(str, row)))
        print('')

    if increments is None:
        increments = ()

    classresults = {}
    for increment in increments:
        tree = treepredict.buildtree(train_data, gain_increment=increment,
                                     gain_threshold=0, instance_minimum=1)
        trainConfMat, crTrain = treepredict.testTree(train_data, tree)
        _report('Training set confusion matrix (Classification rate:',
                trainConfMat, crTrain)
        testConfMat, crTest = treepredict.testTree(test_data, tree)
        _report('Test set confusion matrix (Classification rate:',
                testConfMat, crTest)
        classresults[increment] = [crTest]
    return classresults
def Modeling_Home_Prices():
    """Build and draw a variance-scored tree of house data.

    House rows are read from the tab-separated cache ``housedata.txt`` when it
    exists; otherwise they are fetched with ``zillow.getpricelist()`` and the
    cache is written. The tree is rendered to ``housetree.jpg``.

    Fixes over the previous version:
      * the cache file is now really closed after writing (the old code
        referenced ``f.close`` without calling it);
      * both file handles are managed by ``with`` so they are released on
        error;
      * ``None`` rows returned by ``zillow.getpricelist()`` are dropped before
        the tree is built, not only when writing the cache, so both code paths
        hand the same kind of rows to ``buildtree``.
    """
    print('>>Modeling Home Prices')
    import zillow
    if os.path.exists('housedata.txt'):
        housedata = []
        with open('housedata.txt', 'r') as f:
            for line in f:
                fields = line.split('\t')
                # Explicit indexing keeps the IndexError on short lines.
                housedata.append([fields[i] for i in range(7)])
        # NOTE(review): values read back from the cache are all strings,
        # whereas freshly fetched rows keep whatever types zillow returns --
        # confirm that treepredict.variance copes with both.
    else:
        housedata = [row for row in zillow.getpricelist() if row is not None]
        with open('housedata.txt', 'w') as f:
            for row in housedata:
                print(row)
                for value in row:
                    f.write('%s\t' % (value,))
                f.write('\n')
    reload(treepredict)
    housetree = treepredict.buildtree(housedata, scoref=treepredict.variance)
    treepredict.drawtree(housetree, 'housetree.jpg')
def main():
    """Build an entropy-scored tree from ``house_data`` and draw it.

    ``house_data`` is not assigned inside this function (the
    ``getpricelist()`` call that would populate it is disabled), so it has to
    be available from the enclosing scope. The tree is written to
    ``house_price_tree.jpeg``.
    """
    from treepredict import buildtree, entropy, drawtree

    # house_data = getpricelist()
    # print house_data
    print('build tree')
    price_tree = buildtree(house_data, scoref=entropy)

    print('draw tree')
    drawtree(price_tree, 'house_price_tree.jpeg')
def Pruning_the_Tree():
    """Prune the sample tree twice and print it after each pass.

    Reloads ``treepredict``, builds a tree from ``treepredict.my_data`` and
    calls ``treepredict.prune`` on the same tree first with ``0.1`` and then
    with ``1.0``, printing the tree after each call.
    """
    print('>>Pruning the Tree')
    reload(treepredict)
    decision_tree = treepredict.buildtree(treepredict.my_data)
    print('------------------')
    for threshold in (0.1, 1.0):
        treepredict.prune(decision_tree, threshold)
        treepredict.printtree(decision_tree)
def main(rows):
    """Build a decision tree from ``rows``, print it and save a drawing.

    Args:
        rows: Training rows handed unchanged to ``treepredict.buildtree``
            (the original comment describes them as fruits with their colors
            and size).

    The drawing is written to ``treeview.jpg``.
    """
    decision_tree = treepredict.buildtree(rows)

    # Sample classifications, currently disabled:
    # print(treepredict.classify([2, 'red'], decision_tree))
    # print(treepredict.classify([5, 'red'], decision_tree))
    # print(treepredict.classify([1, 'green'], decision_tree))

    # Show the decision tree as text, then as an image.
    treepredict.printtree(decision_tree)
    treepredict.drawtree(decision_tree, jpeg='treeview.jpg')
def testing_gain_increments(increments=None):
    """Build one tree per gain increment and collect its test accuracy.

    For every value in ``increments`` a tree is built from the module-level
    ``train_data`` (with ``gain_threshold=0`` and ``instance_minimum=1``),
    evaluated with ``treepredict.testTree`` on ``train_data`` and
    ``test_data``, and both confusion matrices are printed tab-separated.

    Args:
        increments: Iterable of ``gain_increment`` values. ``None`` (the
            default) is treated as "no increments"; this replaces the former
            mutable ``[]`` default.

    Returns:
        dict mapping each increment to a one-element list holding the
        classification rate measured on ``test_data``.
    """
    def _report(heading, matrix, rate):
        # Heading, one tab-separated line per matrix row, then a blank line.
        print('%s %s %s' % (heading, rate, '):'))
        for row in matrix:
            print('\t'.join(map(str, row)))
        print('')

    if increments is None:
        increments = ()

    classresults = {}
    for increment in increments:
        tree = treepredict.buildtree(train_data, gain_increment=increment,
                                     gain_threshold=0, instance_minimum=1)
        trainConfMat, crTrain = treepredict.testTree(train_data, tree)
        _report('Training set confusion matrix (Classification rate:',
                trainConfMat, crTrain)
        testConfMat, crTest = treepredict.testTree(test_data, tree)
        _report('Test set confusion matrix (Classification rate:',
                testConfMat, crTest)
        classresults[increment] = [crTest]
    return classresults
#print doc.toxml() gender = doc.getElementsByTagName('gender')[0].firstChild.data age = doc.getElementsByTagName('age')[0].firstChild.data loc = doc.getElementsByTagName('location')[0].firstChild.data region = None for r, s in stateregions.iteritems(): if loc[0:2] in s: region = r if region: result.append((gender, int(age), region, rating)) except: pass return result if __name__ == '__main__': d = getrandomratings(50) # hu, all results are always of the same gender? pdata = getpeopledata(d) print pdata import drawtree import treepredict tree = treepredict.buildtree(pdata, treepredict.variance) treepredict.prune(tree, 0.5) drawtree.drawtree(tree, 'hottree.png') print 'Wrote hottree.png'
def train_simple_tree(training_data):
    """Announce the training step and return a tree built from the data.

    Args:
        training_data: Rows passed unchanged to ``treepredict.buildtree``.

    Returns:
        Whatever ``treepredict.buildtree`` returns for ``training_data``.
    """
    print("Training Simple Tree")
    return treepredict.buildtree(training_data)
def decision_tree1(self, evt):
    """Build a decision tree from PEK->PVG flight price data and draw it.

    One training row ``(weekday, time, avg_price, result_price)`` is produced
    per flight that has more than one usable price point; the rows are fed to
    ``treepredict.buildtree`` with ``treepredict.giniimpurity`` and the tree
    is drawn to ``flighttree_entropy.jpg``.

    ``evt`` is not used in the body.
    NOTE(review): the output file name says "entropy" although the score
    function is ``giniimpurity`` -- confirm which is intended.
    """
    import treepredict
    reload(treepredict)
    full_price = 1130  # reference price both price columns are scaled against
    flights = self.fl_lines[('PEK','PVG')].set_of_flights
    data = []
    for deptid in flights.keys():
        flight = flights[deptid]
        ftype = flight['ftype']  # read but not used below
        deptdate = flight['date']
        deptdate = deptdate.split('/')
        # Day of the week of the departure date
        weekday = datetime.datetime(int(deptdate[0]),int(deptdate[1]),int(deptdate[2])).weekday()
        weekday = int(weekday)
        #print weekday
        time = flight['time']
        # Departure time flag: 1 when the hour is very early (< 9) or very
        # late (> 20), otherwise 0
        if int(time[0:2]) < 9 or int(time[0:2]) > 20:
            time = 1
        else:
            time = 0
        # Price handling
        dd = flight['date']
        # str-->date
        dd = dd.split('/')
        deptdate = datetime.date(int(dd[0]), int(dd[1]), int(dd[2]))
        price = flight['price']
        points = []
        for ftdate in price.keys():
            ff = ftdate.split('/')
            fetchdate = datetime.date(int(ff[0]), int(ff[1]), int(ff[2]))
            # Days between the price fetch date and the departure date
            days = (deptdate-fetchdate).days
            points.append((days, price[ftdate]))
        # self.pre is defined elsewhere; its result is used as a mapping
        # below -- presumably keyed by days before departure, TODO confirm.
        points = self.pre(points)
        p = []
        for i in points.keys():
            if i >= 6:
                p.append(points[i])
        #print p
        # Need one target price plus at least one price to average
        if len(p) <= 1:
            continue
        # NOTE(review): p[0] depends on the iteration order of points.keys()
        result_price = p[0]
        p.pop(0)
        avg_price = sum(p)/len(p)
        # Scale both prices to tenths of full_price, truncated to int
        result_price = int(float(result_price)/float(full_price)*10)
        avg_price = int(float(avg_price)/float(full_price)*10)
        data.append((weekday, time, avg_price, result_price))
    ## fout = open('task.txt', 'w')
    ## lines = ['%s %s %s %s\n' %v for v in data]
    ## fout.writelines(lines)
    ## fout.close()
    flighttree = treepredict.buildtree(data, scoref = treepredict.giniimpurity)
    treepredict.drawtree(flighttree,'flighttree_entropy.jpg')
def testBasics(self):
    """Check that a tree built from the test data classifies a known row.

    A tree is built from ``treepredict.testdata()`` and the observation
    ``['(direct)', 'USA', 'yes', 5]`` must classify as ``{'Basic': 4}``.
    """
    tree = treepredict.buildtree(treepredict.testdata())
    observed = treepredict.classify(['(direct)', 'USA', 'yes', 5], tree)
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # unittest versions no longer provide.
    self.assertEqual(observed, {'Basic': 4})
try: zipcode = doc.getElementsByTagName('zipcode')[0].firstChild.data use = doc.getElementsByTagName('useCode')[0].firstChild.data year = doc.getElementsByTagName('yearBuilt')[0].firstChild.data bath = doc.getElementsByTagName('bathrooms')[0].firstChild.data bed = doc.getElementsByTagName('bedrooms')[0].firstChild.data #rooms = doc.getElementsByTagName('totalRooms')[0].firstChild.data price = doc.getElementsByTagName('amount')[0].firstChild.data except Exception, e: #print e return None #return zipcode, use, int(year), float(bath), int(bed), int(rooms), price return zipcode, use, int(year), float(bath), int(bed), price def getpricelist(): return filter(None, [getaddressdata(line.strip(), 'Cambridge,MA') for line in open('addresslist.txt')]) if __name__ == '__main__': import drawtree import treepredict housedata = getpricelist() print housedata tree = treepredict.buildtree(housedata, scorefun=treepredict.variance) drawtree.drawtree(tree, 'zillow.png') print "Wrote zillow.png"
import treepredict as tr

# Build a tree from the module's sample data and print it.
tree = tr.buildtree(tr.my_data)
tr.printtree(tree)

# Classify two observations whose unknown fields are given as None.
for observation in (['google', None, 'yes', None],
                    ['google', 'France', None, None]):
    print(tr.mdclassify(observation, tree))
def train_simple_tree(training_data):
    """Print a progress message and build a single decision tree.

    Args:
        training_data: Rows forwarded as-is to ``treepredict.buildtree``.

    Returns:
        The tree object produced by ``treepredict.buildtree``.
    """
    print("Training Simple Tree")
    simple_tree = treepredict.buildtree(training_data)
    return simple_tree
import treepredict

# fruits with their colors and size
fruits = [
    [4, 'red', 'apple'],
    [4, 'green', 'apple'],
    [1, 'red', 'cherry'],
    [1, 'green', 'grape'],
    [5, 'red', 'apple'],
]

tree = treepredict.buildtree(fruits)

# Classify three sample observations; the results are discarded, exactly as
# in the original script.
for sample in ([2, 'red'], [5, 'red'], [1, 'green']):
    treepredict.classify(sample, tree)

treepredict.printtree(tree)
#treepredict.drawtree(tree, jpeg='treeview.jpg')
# Gather the flights departing on each of the 60 days from 2014-02-01 on.
orig_date = datetime.date(2014, 2, 1)
for i in range(60):
    cur_date = orig_date + datetime.timedelta(i)
    # '%Y' gives the four-digit year directly; the old code glued a literal
    # "20" in front of a two-digit '%y', which hard-coded the century.
    str_date = cur_date.strftime('%Y/%m/%d')
    train_flights.update(fl_lines.get_flights_by_dpdate(str_date))
#train_flights = fl_lines.get_flights_by_dpdate('2014/05/25')
#print train_flights
# Message: "training data loaded, elapsed time"
# NOTE(review): time.clock() exists on Python 2 only (removed in 3.8).
print('%s %s' % ('训练数据读入完成,用时', time.clock()))

train_data = get_data(train_flights)
#writeintxt('train_data.txt',train_data)
flighttree = treepredict.buildtree(train_data,
                                   scoref=treepredict.giniimpurity)
#treepredict.drawtree(flighttree, 'test.jpg')
# Message: "tree trained, elapsed time %s, %s data rows"
print('树训练完成,用时%s,数据%s条' % (time.clock(), len(train_data)))

# Same collection for the 10 test days starting 2014-05-02.
test_flights = {}
orig_date = datetime.date(2014, 5, 2)
for i in range(10):
    cur_date = orig_date + datetime.timedelta(i)
    str_date = cur_date.strftime('%Y/%m/%d')
    test_flights.update(fl_lines.get_flights_by_dpdate(str_date))
test_data = get_data(test_flights)
#writeintxt('test_data.txt', test_data)
def make_tree(self, data):
    """Build and return a decision tree for ``data``.

    The data is passed unchanged to ``treepredict.buildtree``; no instance
    state is read.
    """
    decision_tree = treepredict.buildtree(data)
    return decision_tree
def Classifying_New_Observations():
    """Classify one sample observation with a freshly built tree.

    Reloads ``treepredict``, builds a tree from ``treepredict.my_data`` and
    prints the ``treepredict.classify`` result for
    ``['(direct)', 'USA', 'yes', 5]``.
    """
    reload(treepredict)
    decision_tree = treepredict.buildtree(treepredict.my_data)
    print('>>Classifying New Observations')
    observation = ['(direct)', 'USA', 'yes', 5]
    print(treepredict.classify(observation, decision_tree))
def Recursive_Tree_Building():
    """Build a tree from the sample data and print it.

    Reloads ``treepredict`` first, then passes the tree built from
    ``treepredict.my_data`` to ``treepredict.printtree``.
    """
    print('>>Recursive Tree Building')
    reload(treepredict)
    sample_tree = treepredict.buildtree(treepredict.my_data)
    treepredict.printtree(sample_tree)
# Candidate gain increments: 0 followed by 10**-1 ... 10**-9.
increments = [0] + [10 ** (-i) for i in xrange(1, 10)]
print('%s %s' % ('Increments to be tested and passed to gain_increments',
                 increments))
accuracyTest = testing_gain_increments(increments)
#print accuracyTest

# Order the increments by their test classification rate, best last. A key
# sort is stable like the former cmp-based sort, so ties keep the same order.
values = sorted(accuracyTest.keys(), key=lambda inc: accuracyTest[inc])
print('%s %s' % ('Increment value with best classification rate was ',
                 values[-1]))

# Let's see what it looks like...
#print "\nFinal tree...\n"
# Keep the best tree in `tree` so that the drawing below shows the tree that
# was just printed; previously it was built, printed and thrown away, and
# drawtree() used a `tree` that is not assigned in this part of the script.
tree = treepredict.buildtree(train_data, gain_increment=values[-1],
                             gain_threshold=0, instance_minimum=1)
treepredict.printtree(tree)

# Produce a png of the tree
treepredict.drawtree(tree, jpeg="sample_tree.jpg")
#print "\npng of tree generated using PIL (Python Imaging Library) modules.\n"

# Let's classify an incoming record of '(direct), USA, yes, 5' ...
#incoming = ['(direct)','USA','yes',5]
#print "Prediction of new record: ",treepredict.classify(incoming,tree)

# Finally, what does pruning do with say a mingain = 0.9 ?
#print "\nPruned tree...\n"
#treepredict.prune(tree,0.9)
#treepredict.printtree(tree)

# For group homework, modify "buildtree" function so that it stops
code = doc.getElementsByTagName('code')[0].firstChild.data if code != '0': print 'Code Error!' return None try: zipcode = doc.getElementsByTagName('zipcode')[0].firstChild.data use = doc.getElementsByTagName('useCode')[0].firstChild.data year = doc.getElementsByTagName('yearBuilt')[0].firstChild.data bath = doc.getElementsByTagName('bathrooms')[0].firstChild.data bed = doc.getElementsByTagName('bedrooms')[0].firstChild.data rooms = doc.getElementsByTagName('totalRooms')[0].firstChild.data price = doc.getElementsByTagName('amount')[0].firstChild.data except: print 'Error!' return None return (str(zipcode), str(use), int(year), float(bath), int(bed), int(rooms), int(price)) def getPriceList(filename): pricelist = [] for line in open(filename): data = getAddressData(line.strip(), 'Cambridge,MA') if data != None: pricelist.append(data) return pricelist if __name__ == '__main__': housedata = getPriceList('addresslist.txt') housetree = treepredict.buildtree(housedata, scoreFunction=treepredict.variance) treepredict.drawTree(housetree, 'housetree.jpg')
if best_gain > 0: trueBranch = buildtree(best_sets[0]) falseBranch = buildtree(best_sets[1]) return decisionnode(col_num=best_criteria[0], test=best_criteria[1], tb=trueBranch, fb=falseBranch) else: return decisionnode(results=uniquecounts(data)) set0, set1 = dividedata(data, 2, lambda answer: answer == 2017) print(entropy(set0), entropy(set1), sep='\n') tree = treepredict.buildtree(treepredict.data) print(tree.col_num) print(tree.test) print(tree.results) print("") print(tree.tb.col_num) print(tree.tb.test) print(tree.tb.results) print("") print(tree.tb.tb.col_num) print(tree.tb.tb.test) print(tree.tb.tb.results) print("") print(tree.tb.fb.col_num) print(tree.tb.fb.test)
# Script to demonstrate the CART-like DT classifier from
# Chapter 7 of "Programming Collective Intelligence" by
# T. Segaran, O'Reilly, (c) 2007
#
import treepredict
import fileinput
import Image
import ImageDraw

# If the last parameter is set to 0, then all attributes other than 'age' and 'war' would be used.
train_data, test_data = fileinput.loadDataset(
    5,
    ['age', 'gender', 'occupation', 'fantasy', 'film-noir', 'drama',
     'western'],
    1)

tree = treepredict.buildtree(train_data, gain_increment=0, gain_threshold=0,
                             instance_minimum=100)

# Let's see what it looks like...
print("\nFinal tree...\n")
treepredict.printtree(tree)

# Confusion matrix on the training rows, one tab-separated line per row.
trainConfMat, crTrain = treepredict.testTree(train_data, tree)
print('%s %s %s' % ('Training set confusion matrix (Classification rate:',
                    crTrain, '):'))
for row in trainConfMat:
    print('\t'.join(str(cell) for cell in row))
print('')

# Same evaluation on the held-out test rows.
testConfMat, crTest = treepredict.testTree(test_data, tree)
print('%s %s %s' % ('Test set confusion matrix (Classification rate:',
                    crTest, '):'))
def testBasics(self):
    """Verify classification of a known row against the bundled test data.

    Builds a tree from ``treepredict.testdata()`` and expects
    ``['(direct)', 'USA', 'yes', 5]`` to classify as ``{'Basic': 4}``.
    """
    tree = treepredict.buildtree(treepredict.testdata())
    result = treepredict.classify(['(direct)', 'USA', 'yes', 5], tree)
    # Use assertEqual: assertEquals is a deprecated alias that newer
    # unittest versions no longer provide.
    self.assertEqual(result, {'Basic': 4})
if code != '0': return None # extract the info about this property try: zipcode = doc.getElementsByTagName('zipcode')[0].firstChild.data use = doc.getElementsByTagName('useCode')[0].firstChild.data year = doc.getElementsByTagName('yearBuilt')[0].firstChild.data bath = doc.getElementsByTagName('bathrooms')[0].firstChild.data bed = doc.getElementsByTagName('bedrooms')[0].firstChild.data rooms = doc.getElementsByTagName('totalRooms')[0].firstChild.data price = doc.getElementsByTagName('amount')[0].firstChild.data except: return None return zipcode, use, int(year), float(bath), int(bed), int(rooms), price def getpricelist(): l1 = [] for line in file('addresslist.txt'): data = getaddressdata(line.strip(), 'Cambridge+MA') l1.append(data) return l1 if __name__ == "__main__": housedata = getpricelist() housedata = [data for data in housedata if data != None] housetree = treepredict.buildtree(housedata, treepredict.variance) treepredict.drawtree(housetree, 'housetree.jpg')
sort=False) dataset = ads_df_final[['Adjective', 'Adverb', 'Noun', 'Verb', 'Sentiment']] ############# Splitting the Dataset into Testing and Training Sets ############## final_acc = 0.0 for i in range(no_of_trials): splitRatio = 0.7 trainingSet, testSet = splitDataset(dataset, splitRatio) #print(trainingSet) # print(type(trainingSet)) print('Split {0} rows into train = {1} and test = {2} rows'.format( len(dataset), len(trainingSet), len(testSet))) ############# Model Building ############## b = dt.buildtree(trainingSet) dt.drawtree(b, jpeg='treeview.jpg') #print("original_testset=",testSet) ############# Preparing Testing DataSet ############## testlabels = [] for i in range(len(testSet)): label = testSet[i].pop(-1) testlabels.append(label) #print("testSet=",testSet) #print("testlabels=",testlabels) ############# Classification of Test Records ############## number = 0 for i in range(len(testSet)): #print("\ntest_data",testSet[i])
gender = doc2.getElementsByTagName('gender')[0].firstChild.data age = doc2.getElementsByTagName('age')[0].firstChild.data loc = doc2.getElementsByTagName('location')[0].firstChild.data[0:2] # 将州转换成地区 for r, s in stateregions.items(): if loc in s: region = r if region != None: result.append((gender, int(age), region, rating)) except: pass return result l1 = getrandomratings(500) print len(l1) pdata = getpeopledata(l1) print pdata[0] import treepredict hottree = treepredict.buildtree(pdata, scoref=treepredict.variance) treepredict.prune(hottree, 0.5) treepredict.drawtree(hottree, 'hottree.jpg') south = treepredict.mdclassify((None, None, 'south'), hottree) midat = treepredict.mdclassify((None, None, 'Mid Atlantic'), hottree) print south[10] / sum(south.values()) print midat[10] / sum(midat.values())
import treepredict

# fruits with their colors and size
fruits = [
    [4, 'red', 'apple'],
    [4, 'green', 'apple'],
    [1, 'red', 'cherry'],
    [1, 'green', 'grape'],
    [5, 'red', 'apple'],
]

tree = treepredict.buildtree(fruits)

# Run three sample classifications; as before, their results are not kept.
queries = ([2, 'red'], [5, 'red'], [1, 'green'])
for query in queries:
    treepredict.classify(query, tree)

treepredict.printtree(tree)
#treepredict.drawtree(tree, jpeg='treeview.jpg')
# Gather the flights departing on each of the 60 days from 2014-02-01 on.
orig_date = datetime.date(2014, 2, 1)
for i in range(60):
    cur_date = orig_date + datetime.timedelta(i)
    # '%Y' gives the four-digit year directly; the old code glued a literal
    # "20" in front of a two-digit '%y', which hard-coded the century.
    str_date = cur_date.strftime('%Y/%m/%d')
    train_flights.update(fl_lines.get_flights_by_dpdate(str_date))
#train_flights = fl_lines.get_flights_by_dpdate('2014/05/25')
#print train_flights
# Message: "training data loaded, elapsed time"
# NOTE(review): time.clock() exists on Python 2 only (removed in 3.8).
print('%s %s' % ('训练数据读入完成,用时', time.clock()))

train_data = get_data(train_flights)
#writeintxt('train_data.txt',train_data)
flighttree = treepredict.buildtree(train_data,
                                   scoref=treepredict.giniimpurity)
#treepredict.drawtree(flighttree, 'test.jpg')
# Message: "tree trained, elapsed time %s, %s data rows"
print('树训练完成,用时%s,数据%s条' % (time.clock(), len(train_data)))

# Same collection for the 10 test days starting 2014-05-02.
test_flights = {}
orig_date = datetime.date(2014, 5, 2)
for i in range(10):
    cur_date = orig_date + datetime.timedelta(i)
    str_date = cur_date.strftime('%Y/%m/%d')
    test_flights.update(fl_lines.get_flights_by_dpdate(str_date))
test_data = get_data(test_flights)
#writeintxt('test_data.txt', test_data)
# Extend the existing `increments` list with 10**-1 ... 10**-9.
increments.extend(10 ** (-i) for i in xrange(1, 10))
print('%s %s' % ('Increments to be tested and passed to gain_increments',
                 increments))
accuracyTest = testing_gain_increments(increments)
#print accuracyTest

# Order the increments by their test classification rate, best last. A key
# sort is stable like the former cmp-based sort, so ties keep the same order.
values = sorted(accuracyTest.keys(), key=lambda inc: accuracyTest[inc])
print('%s %s' % ('Increment value with best classification rate was ',
                 values[-1]))

# Let's see what it looks like...
#print "\nFinal tree...\n"
# Keep the best tree in `tree` so that the drawing below shows the tree that
# was just printed; previously it was built, printed and thrown away, and
# drawtree() used a `tree` that is not assigned in this part of the script.
tree = treepredict.buildtree(train_data, gain_increment=values[-1],
                             gain_threshold=0, instance_minimum=1)
treepredict.printtree(tree)

# Produce a png of the tree
treepredict.drawtree(tree, jpeg="sample_tree.jpg")
#print "\npng of tree generated using PIL (Python Imaging Library) modules.\n"

# Let's classify an incoming record of '(direct), USA, yes, 5' ...
#incoming = ['(direct)','USA','yes',5]
#print "Prediction of new record: ",treepredict.classify(incoming,tree)

# Finally, what does pruning do with say a mingain = 0.9 ?
#print "\nPruned tree...\n"
#treepredict.prune(tree,0.9)
#treepredict.printtree(tree)
def do_simpletree_kcross_validation(fin, finy, kfolds):
    """Cross-validate a single decision tree block by block.

    Feature lines are read from ``fin`` and labels from ``finy`` via
    ``dt.get_lines``, the features are passed through
    ``dt.transform_features`` and the labelled rows are split with
    ``dt.chunks`` into blocks of ``len(lines) / kfolds`` rows. Each block is
    held out once while a tree is trained on the remaining blocks; the
    accuracy per block and the average over all blocks are printed.

    Args:
        fin: Source of the feature lines (parsed as floats, split on " ").
        finy: Source of the label lines (parsed as ints).
        kfolds: Divisor used to compute the block size.

    Returns:
        None; all results are printed.
    """
    print("Starting k=" + str(kfolds) + " validation for Simple tree")
    # there is 2500 tracks
    labels = dt.get_lines(finy, int)
    pb = ProgBar()
    lines = dt.get_lines(fin, float, " ", callback=pb.callback)
    del pb
    # normalize features
    lines = dt.transform_features(lines)
    data = dt.add_labels_to_lines(lines, labels)

    block_size = len(lines) / kfolds
    print("chunk size = " + str(block_size))
    example_chunks = list(dt.chunks(data, block_size))
    #labels_chunks = list(dt.chunks(labels, block_size))
    print("number of chunks = " + str(len(example_chunks)))

    # holds the accuracy measured for each held-out block
    accuracy_results = []
    for leaveout, held_out_block in enumerate(example_chunks):
        # the block at index `leaveout` is kept out of training
        print("prepare validation set")
        print("leaving out block " + str(leaveout) + " for validation")
        # every row is split into its features (all but the last value) and
        # its label (the last value)
        validationdata = [
            exampleentry(row[0:len(row) - 1], row[-1])
            for row in held_out_block
        ]

        trainingdata = []
        print("merging blocks "),
        for j in range(0, len(example_chunks)):
            if j == leaveout:
                continue
            print(str(j) + ","),
            trainingdata = trainingdata + example_chunks[j]

        print("\nprepare training set")
        print("training on " + str(len(trainingdata)))
        print("each track has " + str(len(trainingdata[0])) + " features")
        tree = treepredict.buildtree(trainingdata)

        print("testing on " + str(len(validationdata)))
        # count the validation entries whose classification equals the label
        corrects = 0
        for example in validationdata:
            if treepredict.classify(example.features, tree) == example.label:
                corrects = corrects + 1

        # percentage of correctly classified validation entries
        accuracy_percentage = (corrects * 100) / len(validationdata)
        print("accuracy = " + str(accuracy_percentage) + "%")
        accuracy_results.append(accuracy_percentage)

    avgcc = dt.average(accuracy_results)
    print("average accuracy =" + str(avgcc) + "%")
def do_simpletree_kcross_validation(fin, finy, kfolds):
    """Cross-validate a single decision tree block by block.

    Feature lines are read from ``fin`` and labels from ``finy`` via
    ``dt.get_lines``, the features are passed through
    ``dt.transform_features`` and the labelled rows are split with
    ``dt.chunks`` into blocks of ``len(lines) / kfolds`` rows. Each block is
    held out once while a tree is trained on the remaining blocks; the
    accuracy per block and the average over all blocks are printed.

    Args:
        fin: Source of the feature lines (parsed as floats, split on " ").
        finy: Source of the label lines (parsed as ints).
        kfolds: Divisor used to compute the block size.

    Returns:
        None; all results are printed.
    """
    print("Starting k=" + str(kfolds) + " validation for Simple tree")
    # there is 2500 tracks
    labels = dt.get_lines(finy, int)
    pb = ProgBar()
    lines = dt.get_lines(fin, float, " ", callback=pb.callback)
    del pb
    # normalize features
    lines = dt.transform_features(lines)
    data = dt.add_labels_to_lines(lines, labels)

    block_size = len(lines) / kfolds
    print("chunk size = " + str(block_size))
    example_chunks = list(dt.chunks(data, block_size))
    #labels_chunks = list(dt.chunks(labels, block_size))
    chunk_count = len(example_chunks)
    print("number of chunks = " + str(chunk_count))

    # holds the accuracy measured for each held-out block
    accuracy_results = []
    for leaveout in range(0, chunk_count):
        # the block at index `leaveout` is kept out of training
        print("prepare validation set")
        held_out_block = example_chunks[leaveout]
        print("leaving out block " + str(leaveout) + " for validation")
        # every row is split into its features (all but the last value) and
        # its label (the last value)
        validationdata = [
            exampleentry(row[0:len(row) - 1], row[-1])
            for row in held_out_block
        ]

        trainingdata = []
        print("merging blocks "),
        for j in range(0, chunk_count):
            if j == leaveout:
                continue
            print(str(j) + ","),
            trainingdata = trainingdata + example_chunks[j]

        print("\nprepare training set")
        print("training on " + str(len(trainingdata)))
        print("each track has " + str(len(trainingdata[0])) + " features")
        tree = treepredict.buildtree(trainingdata)

        print("testing on " + str(len(validationdata)))
        # count the validation entries whose classification equals the label
        corrects = 0
        for example in validationdata:
            predicted = treepredict.classify(example.features, tree)
            if predicted == example.label:
                corrects = corrects + 1

        # percentage of correctly classified validation entries
        accuracy_percentage = (corrects * 100) / len(validationdata)
        print("accuracy = " + str(accuracy_percentage) + "%")
        accuracy_results.append(accuracy_percentage)

    avgcc = dt.average(accuracy_results)
    print("average accuracy =" + str(avgcc) + "%")
import treepredict

# Main script: only the "build and display" step is active; the other demo
# sections below are kept disabled.

# print('<----DivideSet---->')
# for item in treepredict.divideset(treepredict.my_data, 2, 'yes'):
#     print(item)

# print('\n<----Build and Display the Tree---->')
tree = treepredict.buildtree(treepredict.my_data)
treepredict.printtree(tree)

# print('\n<----Graphical Display---->')
# path = 'output/treeview.jpg'
# treepredict.drawtree(tree, jpeg=path)
# print("picture has been saved in " + path)

# print('\n<----Classify and prune---->')
# test = ['(direct)', 'USA', 'yes', 5]
# print(test)
# print(treepredict.classify(test, tree), '\n')

# print('Before prune:')
# treepredict.printtree(tree)
# treepredict.prune(tree, 1.0)
# print('\nAfter prune:')
# treepredict.printtree(tree)

# print('<----Zillow API---->')
# import zillow
# housedata = zillow.getpricelist()
# print('house data saved!')
# Impurity of the full sample data set (the "parent node").
print("\nParent node...\n")
gini, entr = (treepredict.giniimpurity(treepredict.my_data),
              treepredict.entropy(treepredict.my_data))
print("Gini: %8f Entropy: %8f" % (gini, entr))

# Let's now split on the Read FAQ field and assess impurity
node1, node2 = treepredict.divideset(treepredict.my_data, 2, "yes")
print("\nRead FAQ = Yes leaf node...\n")
gini, entr = treepredict.giniimpurity(node1), treepredict.entropy(node1)
print("Gini: %8f Entropy: %8f" % (gini, entr))

# Build the DT recursively using the buildtree function; assumes
# last column/field is the classification attribute.
tree = treepredict.buildtree(treepredict.my_data)

# Let's see what it looks like...
print("\nFinal tree...\n")
treepredict.printtree(tree)

# Produce a png of the tree
treepredict.drawtree(tree, jpeg="sample_tree.jpg")
print("\npng of tree generated using PIL (Python Imaging Library) modules.\n")

# Let's classify an incoming record of '(direct), USA, yes, 5' ...
incoming = ["(direct)", "USA", "yes", 5]
prediction = treepredict.classify(incoming, tree)
print('%s %s' % ("Prediction of new record: ", prediction))

# Let's see how the missing data classification via
# the "mdclassify" function performs on our sample data.
["t", "China", "no", 17, "None"], ] my_data2 = [ ["a", "USA", "yes", "18", "None"], ["b", "France", "yes", "23", "Premium"], ["c", "USA", "yes", "24", "Basic"], ["d", "France", "yes", "23", "Basic"], ] train_flowers = data.read_filedata("..//data//train_data.txt", "ALL", ",", [0, 1, 2, 3]) test_flowers = data.read_filedata("..//data//test_data.txt", "ALL", ",", [0, 1, 2, 3]) tree = DecisionTree(train_flowers) treepredict.buildtree(tree) tree.printTree() right = 0 wrong = 0 for flower in test_flowers: result = treepredict.predic(tree, flower) if flower[-1] in result: if right == 49: pass right += 1 else: wrong += 1 print "正确预测:" + str(right) + "个" print "错误预测:" + str(wrong) + "个"
import treepredict as tp

if __name__ == '__main__':
    # Exercise on the iris data set: split it, show both parts, build and
    # draw a tree from the training part, then run the performance report.
    print("ESERCIZIO SU IRIS DATASET\n")
    mydata = tp.aprifile("iris.txt")
    train_data, test_data = tp.createdataset2(mydata, 60, [])

    print("TRAIN DATA : \n")
    print(train_data, "\n")
    print("TEST DATA: \n")
    # Fixed: the training rows used to be printed a second time under the
    # "TEST DATA" heading.
    print(test_data)

    iris_tree = tp.buildtree(train_data)
    tp.drawtree(iris_tree, "iris_tree.jpeg")
    tp.fperformance(mydata)
import advancedclassify as ad
import treepredict as tr

agesonly = ad.loadmatch('agesonly.csv', allnum=True)
matchmaker = ad.loadmatch('matchmaker.csv')
# ad.plotagematches(agesonly)

# Parse every CSV line into a row of ints. The file is opened with a context
# manager so the handle is closed (the former file(...) call leaked it).
with open('agesonly.csv') as age_file:
    age = [[int(w) for w in line.split(',')] for line in age_file]

tree = tr.buildtree(age)
tr.printtree(tree)
tr.drawtree(tree)
# NOTE(review): the tree is passed as the FIRST argument to tr.classify here;
# confirm this matches the signature of the local treepredict module.
print(tr.classify(tree, [65, 63]))

avgs = ad.lineartrain(agesonly)
print(avgs)

# Compare the dot-product classifier and the tree on the same three pairs.
sample_pairs = ([30, 25], [25, 40], [48, 20])
for pair in sample_pairs:
    print(ad.dpclassify(pair, avgs.values()))
for pair in sample_pairs:
    print(tr.classify(tree, pair))
def tree_view():
    """Build a tree from ``data/agesonly.csv`` and draw it to ``treeview.png``.

    Every line of the CSV file is split on ``,`` and converted to a list of
    floats before being handed to ``buildtree``.

    Changes over the previous version: the file is closed deterministically
    via ``with``, and rows are built with a list comprehension so they are
    real lists even where ``map`` returns an iterator.
    """
    from treepredict import buildtree, drawtree

    with open('data/agesonly.csv') as csv_file:
        my_data = [[float(field) for field in line.split(',')]
                   for line in csv_file]
    tree = buildtree(my_data)
    drawtree(tree, 'treeview.png')
import treepredict import preprocessor import postprocessor import arff import copy label_count = 6 train_data_file = '.\\scene\\scene-train-tiny.arff' test_data_file = '.\\scene\\scene-test-tiny.arff' method = input('1 单标签;2 多个二类分类') if method == '1': #读取训练集,建树(多标签转换成单标签) (attributes_list, label_value_list,train_data) = preprocessor.read_data(train_data_file, label_count, arff.DENSE) train_data = preprocessor.translate_label_multiclass(train_data, label_count) tree = treepredict.buildtree(train_data, attributes_list, label_value_list) treepredict.printtree(tree) #读取测试集,验证效果 (test_attributes_list, test_label_value_list, test_data) = preprocessor.read_data(test_data_file, label_count, arff.DENSE) test_data_copy = copy.deepcopy(test_data) predicted_labels_list = [] for row in test_data: result = treepredict.classify(row, tree, test_attributes_list) post_result = treepredict.post_classify(result) decoded_result = preprocessor.label_decoding(post_result) predicted_labels_list.append(decoded_result) hamming_loss = postprocessor.hamming_loss(test_data_copy, predicted_labels_list) print('hamming loss of merging labels:', hamming_loss) else : #当做多个二类分类问题处理
# Script to demonstrate the CART-like DT classifier from
# Chapter 7 of "Programming Collective Intelligence" by
# T. Segaran, O'Reilly, (c) 2007
#
import treepredict
import fileinput
import Image
import ImageDraw

# If the last parameter is set to 0, then all attributes other than 'age' and 'war' would be used.
train_data, test_data = fileinput.loadDataset(
    2,
    ['age', 'gender', 'occupation', 'unknown genre', 'film-noir', 'horror',
     'western'],
    1)

tree = treepredict.buildtree(train_data, gain_increment=0, gain_threshold=0,
                             instance_minimum=0)

# Let's see what it looks like...
print("\nFinal tree...\n")
treepredict.printtree(tree)

# Confusion matrix on the training rows, one tab-separated line per row.
trainConfMat, crTrain = treepredict.testTree(train_data, tree)
print('%s %s %s' % ('Training set confusion matrix (Classification rate:',
                    crTrain, '):'))
for row in trainConfMat:
    print('\t'.join(str(cell) for cell in row))
print('')

# Same evaluation on the held-out test rows.
testConfMat, crTest = treepredict.testTree(test_data, tree)
print('%s %s %s' % ('Test set confusion matrix (Classification rate:',
                    crTest, '):'))
# 状态码为0代表操作成功, 否则代表有错误发生 if code != '0': return None # 提取有关该房产的信息 try: zipcode = doc.getElementsByTagName('zipcode')[0].firstChild.data use = doc.getElementsByTagName('useCode')[0].firstChild.data year = doc.getElementsByTagName('yearBuilt')[0].firstChild.data bath = doc.getElementsByTagName('bathrooms')[0].firstChild.data bed = doc.getElementsByTagName('bedrooms')[0].firstChild.data rooms = doc.getElementsByTagName('totalRooms')[0].firstChild.data price = doc.getElementsByTagName('amount')[0].firstChild.data except: return None return (zipcode, use, int(year), float(bath), int(bed), int(rooms), price) '''读取addresslist.txt文件并构造一个数据列表''' def getpricelist(): l1 = [] for line in file('../data/addresslist.txt'): data = getaddressdata(line.strip(), 'Cambridge, MA') l1.append(data) return l1 import treepredict housedata = getpricelist() housetree = treepredict.buildtree(housedata, scoref=treepredict.variance) treepredict.drawtree(housetree, 'housetree.jpg')
import treepredict as tp

if __name__ == '__main__':
    # Exercise on the mushrooms data set: split it, show both parts, build
    # and draw a tree from the training part, then run the performance report.
    print("ESERCIZIO SU MUSHROOMS DATASET\n")
    print("ALL DATASET:\n")
    mydata = tp.aprifile("mushrooms_final.txt")
    train_data, test_data = tp.createdataset2(mydata, 3250, [])

    print("TRAIN DATA : \n")
    print(train_data, "\n")
    print("TEST DATA: \n")
    # Fixed: the training rows used to be printed a second time under the
    # "TEST DATA" heading.
    print(test_data)

    mushrooms_tree = tp.buildtree(train_data)
    tp.drawtree(mushrooms_tree, "mushrooms_tree.jpeg")
    tp.fperformance(mydata)