def main():
    """Prompt the user for site-visit details and classify them with treepredict.

    NOTE(review): the original source was garbled — the referrer prompt was
    fused into the first print with `******` and `ref` was never assigned.
    Reconstructed here as a separate input() call; confirm against upstream.
    """
    print("Give the details of user:")
    ref = input("Who was your referrer(Slashdot,Google or someone else:)")
    faq = input("Did you read faqs?:")
    pages = int(input("How many pages did you visit:"))
    location = input("Enter your location:")
    # Renamed from `list` to avoid shadowing the builtin.
    observation = [ref, location, faq, pages]
    treepredict.classify(observation)
def classify(example, forest): #print "using forest to classify" counts = {} #count the results for i in range(0, len(forest)): #printtree(forest[0]) r = treepredict.classify(example, forest[i]) if r not in counts: counts[r] = 1 counts[r] += 1 winner_key = 0 winner_value = 0 pickrandom = False for key in counts.keys(): if (counts[key] > winner_value): winner_key = key winner_value = counts[key] elif (counts[key] == winner_value): pickrandom = True if (pickrandom): winner_key = random.choice(counts.keys()) #print counts return winner_key
def classify(example,forest): #print "using forest to classify" counts = {} #count the results for i in range(0,len(forest)): #printtree(forest[0]) r = treepredict.classify(example,forest[i]) if r not in counts: counts[r]=1 counts[r]+=1 winner_key = 0 winner_value = 0 pickrandom = False for key in counts.keys(): if(counts[key] > winner_value): winner_key = key winner_value = counts[key] elif (counts[key] == winner_value): pickrandom = True if(pickrandom): winner_key = random.choice(counts.keys()) #print counts return winner_key
def run_tree(self, tree): ''' Run the testing data against the tree ''' # first get a sample to use for training and make a tree from it print "transforming test data" test_data = [] for item in self.testing_data: test_data.append(self.normalize_data(item)) print "running..." for item in test_data: guess = treepredict.classify(item[0:-1], tree) print "loan id=%s, results=%s" % (item[-1], guess)
def test_tree(self, k=.2): ''' Conduct a test of decision trees ''' # first get a sample to use for training and make a tree from it print "making sample and training tree..." ids = random.sample(range(len(self.training_data)), int(k * len(self.training_data))) sample = [] transform_all = [] for i, item in enumerate(self.training_data): row = self.normalize_data(item) if i in ids: sample.append(row) else: transform_all.append(row) tree = self.make_tree(sample) # now go through the rest of all, seeing how it does num_false_positive = num_false_negative = num_right = 0 print "testing..." for item in transform_all: status = item[-1] guess = treepredict.classify(item[1:-1], tree) # if we're right, record. if not, determine if false negative (ok) or false positive (bad) if status in guess: num_right += 1 else: if status == 'GOOD': num_false_negative += 1 else: num_false_positive += 1 # display results num_processed = len(transform_all) print "sample size=%d, testing size=%d" % (len(sample), num_processed) print "%.2f correct" % ((float(num_right) / float(num_processed)) * 100.0) print "%.2f false negatives (kinda ok)" % ((float(num_false_negative) / float(num_processed)) * 100.0) print "%.2f false positives (bad)" % ((float(num_false_positive) / float(num_processed)) * 100.0)
def classify_with_several_trees(data, trees, original_attribute_list):
    """Classify `data` with an ensemble of trees and return a single label.

    Each entry of `trees` is a dict holding the decision tree itself
    ('tree') and the indices of the attributes it was trained on
    ('attributes_index'). Per-tree prediction dicts are summed and the
    combined tally is reduced to one label via treepredict.post_classify.
    """
    predicted_results_all = {}
    for tree in trees:
        decision_tree = tree['tree']
        attributes_index = tree['attributes_index']
        re_organized_data = []
        # Re-organize the data for the current tree: keep only the
        # attributes this tree uses (plus the label).
        re_organized_attribute_value_list = []
        for index in attributes_index:
            re_organized_data.append(data[index])
            re_organized_attribute_value_list.append( original_attribute_list[index])
        # Append the label (last element of `data`) to the re-organized row.
        re_organized_data.append(data[len(data) - 1])
        predicted_results = treepredict.classify( re_organized_data, decision_tree, re_organized_attribute_value_list)
        # Merge this tree's predictions into the overall tally.
        for result in predicted_results.keys():
            predicted_results_all[result] = predicted_results_all.get( result, 0) + predicted_results[result]
    print('predicted_results_all:', predicted_results_all)
    single_label_result = treepredict.post_classify(predicted_results_all)
    return single_label_result
# Build the date string for the current test day ('20' + yy/mm/dd -> 20yy/mm/dd)
# and collect that day's flights into the test set.
cur_date = orig_date + timedelta
str_date = cur_date.strftime('%y/%m/%d')
str_date = "20"+str_date
new_flights = fl_lines.get_flights_by_dpdate(str_date)
test_flights.update(new_flights)
test_data = get_data(test_flights)
#writeintxt('test_data.txt', test_data)
print '测试数据读入完成,用时', time.clock()
# corr: correct predictions; err_shouldbuy: predicted 'wait' but should have
# bought; err_shouldwait: predicted buy but should have waited.
corr = 0.0
err_shouldwait = 0.0
err_shouldbuy = 0.0
for row in test_data:
    # Last element of each row is the true label; the rest are features.
    obs = row[:-1]
    pred = treepredict.classify(obs, flighttree)
    # classify returns a dict; take its first key as the prediction
    # (Python 2: dict.keys() is a list).
    pred = pred.keys()[0]
    #print pred
    if pred == row[-1]:
        corr = corr + 1
    elif pred == 'wait':
        #print 'actual: price %s, %s' % (row[-2], row[-1])
        err_shouldbuy = err_shouldbuy + 1
    else:
        #print 'actual: price %s, %s' % (row[-2], row[-1])
        err_shouldwait = err_shouldwait + 1
# Report accuracy and the two error rates.
size = float(len(test_data))
print "正确率:", float(corr)/size
print '错误情况:应当买却没买%s,应当等却买了%s。' % (err_shouldbuy/size, err_shouldwait/size)
print '预测%s个数据用时%s' % (len(test_data), time.clock())
# #测试divideSet # print treepredict.divideSet(data.my_data, 2, "yes") # #测试经过训练后,基尼不纯度和熵的变化 # print treepredict.giniImpurity(data.my_data) # print treepredict.entropy(data.my_data) # set1,set2=treepredict.divideSet(data.my_data, 2, "yes") # print treepredict.giniImpurity(set1) # print treepredict.entropy(set1) #测试buildTree tree=treepredict.buildTree(data.my_data) draw=DrawTree.DrawTree(tree,'treeview.jpg') draw.drawTree() # #使用classify函数进行预测 # tree=treepredict.buildTree(data.my_data) # print treepredict.classify(['(direct)','USA','yes',5], tree) #尝试剪枝函数,并绘图 tree=treepredict.buildTree(data.my_data) treepredict.prune(tree, 1.0) draw=DrawTree.DrawTree(tree,'treeview2.jpg') draw.drawTree() # #使用mdclassify函数进行预测 tree=treepredict.buildTree(data.my_data) print treepredict.classify(['(direct)','USA','yes',5], tree) print treepredict.mdclassify(['google',None,'yes',None], tree) print treepredict.mdclassify(['google','France',None,None], tree)
# Load the two matchmaker datasets (ages-only numeric, full matchmaker).
agesonly = ad.loadmatch('agesonly.csv', allnum=True)
matchmaker = ad.loadmatch('matchmaker.csv')
# ad.plotagematches(agesonly)
# Re-parse the ages CSV into a list of int rows for tree building.
age = []
for line in file('agesonly.csv'):
    l = []
    for w in line.split(','):
        l.append(int(w))
    age.append(l)
tree = tr.buildtree(age)
tr.printtree(tree)
tr.drawtree(tree)
# Compare decision-tree classification with the linear classifier on the
# same age pairs. NOTE(review): this module's classify takes (tree, row) —
# argument order differs from treepredict.classify elsewhere in this file.
print tr.classify(tree, [65, 63])
avgs = ad.lineartrain(agesonly)
print avgs
print ad.dpclassify([30, 25], avgs.values())
print ad.dpclassify([25, 40], avgs.values())
print ad.dpclassify([48, 20], avgs.values())
print tr.classify(tree, [30, 25])
print tr.classify(tree, [25, 40])
print tr.classify(tree, [48, 20])
numericalset = ad.loadnumerical()
# NOTE(review): bare expression with no effect — presumably an interactive
# session leftover; confirm whether it should be printed or removed.
numericalset[0].data
def testBasics(self):
    """Smoke-test: build a tree from treepredict's bundled test data and
    check that a known observation classifies to {'Basic': 4}."""
    t = treepredict.buildtree(treepredict.testdata())
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(
        treepredict.classify(['(direct)', 'USA', 'yes', 5], t),
        {'Basic': 4})
import treepredict # fruits with their colors and size fruits = [[4, 'red', 'apple'], [4, 'green', 'apple'], [1, 'red', 'cherry'], [1, 'green', 'grape'], [5, 'red', 'apple']] tree = treepredict.buildtree(fruits) treepredict.classify([2, 'red'], tree) treepredict.classify([5, 'red'], tree) treepredict.classify([1, 'green'], tree) treepredict.printtree(tree) #treepredict.drawtree(tree, jpeg='treeview.jpg')
def Classifying_New_Observations():
    """Rebuild the sample tree from treepredict's bundled data and print the
    classification of one known observation (Python 2: reload/print)."""
    # reload picks up any edits made to treepredict since import.
    reload(treepredict)
    tree=treepredict.buildtree(treepredict.my_data)
    print '>>Classifying New Observations'
    print treepredict.classify(['(direct)','USA','yes',5],tree)
def testBasics(self):
    """Smoke-test buildtree + classify against the bundled test data; the
    expected result for this observation is {'Basic': 4}."""
    t = treepredict.buildtree(treepredict.testdata())
    # Replace the deprecated assertEquals alias (removed in Python 3.12)
    # with assertEqual.
    self.assertEqual(treepredict.classify(['(direct)', 'USA', 'yes', 5], t), {'Basic': 4})
# Build the date string for the current test day ('20' + yy/mm/dd -> 20yy/mm/dd)
# and add that day's flights to the accumulated test set.
cur_date = orig_date + timedelta
str_date = cur_date.strftime('%y/%m/%d')
str_date = "20" + str_date
new_flights = fl_lines.get_flights_by_dpdate(str_date)
test_flights.update(new_flights)
test_data = get_data(test_flights)
#writeintxt('test_data.txt', test_data)
print '测试数据读入完成,用时', time.clock()
# Tallies: corr = correct; err_shouldbuy = predicted 'wait' when the true
# label was buy; err_shouldwait = predicted buy when it should have waited.
corr = 0.0
err_shouldwait = 0.0
err_shouldbuy = 0.0
for row in test_data:
    # Last element of each row is the true label; the rest are features.
    obs = row[:-1]
    pred = treepredict.classify(obs, flighttree)
    # classify returns a dict; take its first key (Python 2 list semantics).
    pred = pred.keys()[0]
    #print pred
    if pred == row[-1]:
        corr = corr + 1
    elif pred == 'wait':
        #print 'actual: price %s, %s' % (row[-2], row[-1])
        err_shouldbuy = err_shouldbuy + 1
    else:
        #print 'actual: price %s, %s' % (row[-2], row[-1])
        err_shouldwait = err_shouldwait + 1
# Report accuracy, the two error rates, and elapsed time.
size = float(len(test_data))
print "正确率:", float(corr) / size
print '错误情况:应当买却没买%s,应当等却买了%s。' % (err_shouldbuy / size, err_shouldwait / size)
print '预测%s个数据用时%s' % (len(test_data), time.clock())
import treepredict # fruits with their colors and size fruits = [ [4, 'red', 'apple'], [4, 'green', 'apple'], [1, 'red', 'cherry'], [1, 'green', 'grape'], [5, 'red', 'apple'] ] tree = treepredict.buildtree(fruits) treepredict.classify([2, 'red'], tree) treepredict.classify([5, 'red'], tree) treepredict.classify([1, 'green'], tree) treepredict.printtree(tree) #treepredict.drawtree(tree, jpeg='treeview.jpg')
def do_simpletree_kcross_validation(fin,finy,kfolds):
    """k-fold cross-validation of a single decision tree.

    Reads features from `fin` and integer labels from `finy`, normalizes
    the features, splits the labeled data into `kfolds` chunks, and for
    each fold trains on the other chunks and reports accuracy on the
    held-out chunk, printing the average accuracy at the end.
    """
    print "Starting k=" + str(kfolds)+" validation for Simple tree"
    #there is 2500 tracks
    labels = dt.get_lines(finy,int)
    pb = ProgBar()
    lines = dt.get_lines(fin,float," ", callback = pb.callback)
    del pb
    #normalize features
    lines = dt.transform_features(lines)
    data = dt.add_labels_to_lines(lines, labels)
    # Python 2 integer division: any remainder rows end up in a final,
    # smaller chunk (or are dropped) depending on dt.chunks — TODO confirm.
    block_size = len(lines)/kfolds
    print "chunk size = " + str(block_size)
    example_chunks = list(dt.chunks(data, block_size))
    #labels_chunks = list(dt.chunks(labels, block_size))
    print "number of chunks = " +str(len(example_chunks))
    #holds avg accuracy for one forest
    accuracy_results = []
    for i in range(0,len(example_chunks)):
        #we leave set in index i out of train
        print "prepare validation set"
        validationdata = example_chunks[i] #extract validation chunk
        print "leaving out block " + str(i) + " for validation"
        leaveout = i
        # Wrap each held-out row as an exampleentry(features, label).
        # NOTE(review): the comprehension variable `i` leaks and clobbers
        # the outer loop index in Python 2 — benign only because the outer
        # `for` rebinds i and `leaveout` was saved first.
        validationdata = [ exampleentry(validationdata[i][0:len(validationdata[i])-1],validationdata[i][-1]) for i in range(0,len(validationdata)) ]
        trainingdata = []
        print("merging blocks "),
        for j in range(0,len(example_chunks)):
            if(j != leaveout):
                #print "j="+str(j) + " i="+ str(leaveout)
                print(str(j) + ","),
                trainingdata = trainingdata + example_chunks[j]
        print "\nprepare training set"
        print "training on " + str(len(trainingdata))
        print "each track has " + str(len(trainingdata[0])) + " features"
        tree = treepredict.buildtree(trainingdata)
        print "testing on " + str(len(validationdata))
        corrects = 0
        #classify a set of entries
        for example in validationdata:
            #print example.features
            result = treepredict.classify(example.features,tree)
            #print 'expected : ' + str(example.label) + ' result : '+ str(result)
            if(result == example.label):
                corrects = corrects + 1
        #calculate the % of accuracy
        # Python 2 integer division: accuracy is truncated to a whole percent.
        accuracy_percentage = (corrects*100)/len(validationdata)
        print "accuracy = " + str(accuracy_percentage) + "%"
        accuracy_results.append(accuracy_percentage)
    avgcc = dt.average(accuracy_results)
    print "average accuracy ="+ str(avgcc) + "%"
def do_simpletree_kcross_validation(fin, finy, kfolds):
    """k-fold cross-validation of a single decision tree.

    Loads features from `fin` and integer labels from `finy`, normalizes
    the features, chunks the labeled data into `kfolds` blocks, and for
    each block trains on the remainder, measures held-out accuracy, and
    finally prints the average accuracy across folds.
    """
    print "Starting k=" + str(kfolds) + " validation for Simple tree"
    #there is 2500 tracks
    labels = dt.get_lines(finy, int)
    pb = ProgBar()
    lines = dt.get_lines(fin, float, " ", callback=pb.callback)
    del pb
    #normalize features
    lines = dt.transform_features(lines)
    data = dt.add_labels_to_lines(lines, labels)
    # Python 2 integer division; remainder handling depends on dt.chunks.
    block_size = len(lines) / kfolds
    print "chunk size = " + str(block_size)
    example_chunks = list(dt.chunks(data, block_size))
    #labels_chunks = list(dt.chunks(labels, block_size))
    print "number of chunks = " + str(len(example_chunks))
    #holds avg accuracy for one forest
    accuracy_results = []
    for i in range(0, len(example_chunks)):
        #we leave set in index i out of train
        print "prepare validation set"
        validationdata = example_chunks[i] #extract validation chunk
        print "leaving out block " + str(i) + " for validation"
        leaveout = i
        # Wrap each held-out row as exampleentry(features, label).
        # NOTE(review): the comprehension reuses `i`, which leaks in
        # Python 2 — harmless only because `leaveout` was saved and the
        # outer loop rebinds i each iteration.
        validationdata = [ exampleentry(validationdata[i][0:len(validationdata[i]) - 1], validationdata[i][-1]) for i in range(0, len(validationdata)) ]
        trainingdata = []
        print("merging blocks "),
        for j in range(0, len(example_chunks)):
            if (j != leaveout):
                #print "j="+str(j) + " i="+ str(leaveout)
                print(str(j) + ","),
                trainingdata = trainingdata + example_chunks[j]
        print "\nprepare training set"
        print "training on " + str(len(trainingdata))
        print "each track has " + str(len(trainingdata[0])) + " features"
        tree = treepredict.buildtree(trainingdata)
        print "testing on " + str(len(validationdata))
        corrects = 0
        #classify a set of entries
        for example in validationdata:
            #print example.features
            result = treepredict.classify(example.features, tree)
            #print 'expected : ' + str(example.label) + ' result : '+ str(result)
            if (result == example.label):
                corrects = corrects + 1
        #calculate the % of accuracy
        # Integer division on Python 2: whole-percent accuracy.
        accuracy_percentage = (corrects * 100) / len(validationdata)
        print "accuracy = " + str(accuracy_percentage) + "%"
        accuracy_results.append(accuracy_percentage)
    avgcc = dt.average(accuracy_results)
    print "average accuracy =" + str(avgcc) + "%"
dt.drawtree(b, jpeg='treeview.jpg') #print("original_testset=",testSet) ############# Preparing Testing DataSet ############## testlabels = [] for i in range(len(testSet)): label = testSet[i].pop(-1) testlabels.append(label) #print("testSet=",testSet) #print("testlabels=",testlabels) ############# Classification of Test Records ############## number = 0 for i in range(len(testSet)): #print("\ntest_data",testSet[i]) a = dt.classify(testSet[i], b) #print("a=",a) max = 0 best = "" for key in a.keys(): if a[key] > max: max = a[key] best = key #print("best=",best) #print("label=",testlabels[i]) if (best == testlabels[i]): number = number + 1 ############# Accuracy Calculations ############## accuracy = (number / len(testSet)) * 100 final_acc += accuracy
# Build the DT recursively using the buildtree function; assumes # last column/field is the classification attribute. tree = treepredict.buildtree(treepredict.my_data) # Let's see what it looks like... print "\nFinal tree...\n" treepredict.printtree(tree) # Produce a png of the tree treepredict.drawtree(tree, jpeg="sample_tree.jpg") print "\npng of tree generated using PIL (Python Imaging Library) modules.\n" # Let's classify an incoming record of '(direct), USA, yes, 5' ... incoming = ["(direct)", "USA", "yes", 5] print "Prediction of new record: ", treepredict.classify(incoming, tree) # Let's see how the missing data classification via # the "mdclassify" function performs on our sample data. # Suppose the page field is mssing... reload(treepredict) missing1 = ["google", "France", None, None] treepredict.mdclassify(missing1, tree) print "Prediction when missing pages: ", treepredict.mdclassify(missing1, tree) # Finally, what does pruning do with say a mingain = 0.9 ? print "\nPruned tree...\n" treepredict.prune(tree, 0.9) treepredict.printtree(tree)
# Train/evaluate decision trees on the 'scene' ARFF dataset in one of two
# modes chosen interactively: single multi-class tree, or one binary tree
# per label. NOTE(review): `label_count` is read but not assigned in this
# fragment — presumably defined earlier in the script; confirm.
train_data_file = '.\\scene\\scene-train-tiny.arff'
test_data_file = '.\\scene\\scene-test-tiny.arff'
method = input('1 单标签;2 多个二类分类')
if method == '1':
    # Read the training set and build one tree (multi-label folded into a
    # single combined label).
    (attributes_list, label_value_list,train_data) = preprocessor.read_data(train_data_file, label_count, arff.DENSE)
    train_data = preprocessor.translate_label_multiclass(train_data, label_count)
    tree = treepredict.buildtree(train_data, attributes_list, label_value_list)
    treepredict.printtree(tree)
    # Read the test set and measure Hamming loss of the predictions.
    (test_attributes_list, test_label_value_list, test_data) = preprocessor.read_data(test_data_file, label_count, arff.DENSE)
    test_data_copy = copy.deepcopy(test_data)
    predicted_labels_list = []
    for row in test_data:
        result = treepredict.classify(row, tree, test_attributes_list)
        post_result = treepredict.post_classify(result)
        decoded_result = preprocessor.label_decoding(post_result)
        predicted_labels_list.append(decoded_result)
    hamming_loss = postprocessor.hamming_loss(test_data_copy, predicted_labels_list)
    print('hamming loss of merging labels:', hamming_loss)
else :
    # Treat the task as label_count independent binary classification
    # problems: one tree per label.
    (attributes_list, label_value_list, train_data) = preprocessor.read_data(train_data_file, label_count, arff.DENSE)
    trees = []
    for label_index in range(0, label_count):
        # Build one decision tree per label (Binary Classification).
        binary_data = preprocessor.translate_label_binary(train_data, label_count, label_index)
        print('numbers of attributes of binary_data', len(binary_data[1]))
        trees.append(treepredict.buildtree(binary_data, attributes_list, label_value_list))
        print('label index:', label_index)
# Load a persisted decision tree and evaluate it on the 'scene' test set,
# reporting Hamming loss. (Training / tree-persistence code kept below,
# commented out, for reference.)
label_count = 6
# Read the training set and build a tree (multi-label folded into one label):
# (attributes_list, label_list,train_data) = preprocessor.read_data('.\\scene\\scene-train-tiny.arff',
#                                                                   label_count, arff.DENSE)
# train_data = preprocessor.translate_label_multiclass(train_data, label_count)
# tree = treepredict.buildtree(train_data, attributes_list, label_list)
# treepredict.printtree(tree)
#
# # Test round-tripping the decision tree through a file:
# tree_list = preprocessor.tree2array(tree)
# preprocessor.store_tree('.\\my_tree', tree_list)
# Load the decision tree from file.
loaded_tree_list = preprocessor.load_tree('.\\my_tree')
loaded_tree = preprocessor.list2tree(loaded_tree_list)
# Read the test set and evaluate.
(test_attributes_list, test_label_value_list, test_data) = preprocessor.read_data('.\\scene\\scene-test-tiny.arff', label_count, arff.DENSE)
results = []
for row in test_data:
    # BUG FIX: classify takes the *attribute* list as its third argument —
    # every other call site passes attributes, but this one passed
    # test_label_value_list.
    result = treepredict.classify(row, loaded_tree, test_attributes_list)
    print('predict result:', result, 'test case', row)
    post_result = treepredict.post_classify(result)
    results.append(preprocessor.label_decoding(post_result))
hammingloss = postprocessor.hamming_loss(test_data, results)
print('hamming loss:', hammingloss)