def Dealing_with_Missing_Data():
    """Print mdclassify results for two observations that contain None fields.

    Reloads ``treepredict``, builds a tree from ``treepredict.my_data`` and
    prints what ``treepredict.mdclassify`` returns for each observation.
    Nothing is returned.
    """
    print('>>Dealing with Missing Data')
    reload(treepredict)
    decision_tree = treepredict.buildtree(treepredict.my_data)
    print('------------------')

    # Observations with some fields left as None.
    incomplete_rows = (
        ['google', None, 'yes', None],
        ['google', 'France', None, None],
    )
    for row in incomplete_rows:
        print(treepredict.mdclassify(row, decision_tree))
# NOTE(review): this line is a whitespace-collapsed fragment. It opens inside a
# function whose header, loop and `try:` are not visible here, so the code is
# left untouched and only described.
#
# Function tail: reads the text of the first <gender>, <age> and <location>
# elements of `doc2`, keeps the first two characters of the location, and sets
# `region` to each key of `stateregions` whose value contains that prefix (the
# last match wins). When `region` is not None it appends
# (gender, int(age), region, rating) to `result`. A bare `except: pass`
# discards any error raised in the guarded code; `result` is then returned.
# NOTE(review): `region` is not initialised in the visible code - presumably it
# is set before this fragment; confirm.
#
# Script tail: calls getrandomratings(500), prints the length of the result,
# passes it to getpeopledata and prints the first row. It then builds a tree
# with scoref=treepredict.variance, prunes it with 0.5, draws it to
# 'hottree.jpg', runs mdclassify on two rows that only fill the third field,
# and prints entry 10 of each result divided by the sum of that result's values.
# NOTE(review): on Python 2 `/` floors when both operands are ints - confirm
# that mdclassify returns float counts here.
gender = doc2.getElementsByTagName('gender')[0].firstChild.data age = doc2.getElementsByTagName('age')[0].firstChild.data loc = doc2.getElementsByTagName('location')[0].firstChild.data[0:2] # Convert the state to a region for r, s in stateregions.items(): if loc in s: region = r if region != None: result.append((gender, int(age), region, rating)) except: pass return result l1 = getrandomratings(500) print len(l1) pdata = getpeopledata(l1) print pdata[0] import treepredict hottree = treepredict.buildtree(pdata, scoref=treepredict.variance) treepredict.prune(hottree, 0.5) treepredict.drawtree(hottree, 'hottree.jpg') south = treepredict.mdclassify((None, None, 'south'), hottree) midat = treepredict.mdclassify((None, None, 'Mid Atlantic'), hottree) print south[10] / sum(south.values()) print midat[10] / sum(midat.values())
import treepredict as tr

# Build a tree from the module's sample data and print it.
tree = tr.buildtree(tr.my_data)
tr.printtree(tree)

# Print the mdclassify result for two observations that contain None fields.
for partial_row in (['google', None, 'yes', None],
                    ['google', 'France', None, None]):
    print(tr.mdclassify(partial_row, tree))
# Exercises treepredict on data.my_data: build, draw, prune and classify.
# Sections whose lines start with "# " are disabled experiments.

# # Test divideSet
# print(treepredict.divideSet(data.my_data, 2, "yes"))

# # Test how Gini impurity and entropy change after a split
# print(treepredict.giniImpurity(data.my_data))
# print(treepredict.entropy(data.my_data))
# set1, set2 = treepredict.divideSet(data.my_data, 2, "yes")
# print(treepredict.giniImpurity(set1))
# print(treepredict.entropy(set1))

# Test buildTree and draw the result.
tree = treepredict.buildTree(data.my_data)
draw = DrawTree.DrawTree(tree, 'treeview.jpg')
draw.drawTree()

# # Predict with the classify function
# tree = treepredict.buildTree(data.my_data)
# print(treepredict.classify(['(direct)', 'USA', 'yes', 5], tree))

# Try the prune function on a freshly built tree, then draw it.
tree = treepredict.buildTree(data.my_data)
treepredict.prune(tree, 1.0)
draw = DrawTree.DrawTree(tree, 'treeview2.jpg')
draw.drawTree()

# Predict with classify and mdclassify; the tree is rebuilt first because the
# previous one was passed to prune.
tree = treepredict.buildTree(data.my_data)
print(treepredict.classify(['(direct)', 'USA', 'yes', 5], tree))
for partial_row in (['google', None, 'yes', None],
                    ['google', 'France', None, None]):
    print(treepredict.mdclassify(partial_row, tree))
# Report on a previously built `tree`: print it, draw it, classify a complete
# and an incomplete record, then prune and print again.
print("\nFinal tree...\n")
treepredict.printtree(tree)

# Write an image of the tree to sample_tree.jpg.
treepredict.drawtree(tree, jpeg="sample_tree.jpg")
print("\njpg of tree generated using PIL (Python Imaging Library) modules.\n")

# Let's classify an incoming record of '(direct), USA, yes, 5' ...
incoming = ["(direct)", "USA", "yes", 5]
# "%s %s" reproduces the single space that the Python 2 statement
# `print a, b` writes between its items, so the output text is unchanged.
print("%s %s" % ("Prediction of new record: ",
                 treepredict.classify(incoming, tree)))

# Let's see how the missing data classification via
# the "mdclassify" function performs on our sample data.
# Suppose the last two fields of the record are missing...
reload(treepredict)
missing1 = ["google", "France", None, None]
missing_prediction = treepredict.mdclassify(missing1, tree)
print("%s %s" % ("Prediction when missing pages: ", missing_prediction))

# Finally, what does pruning do with, say, a mingain of 0.9?
print("\nPruned tree...\n")
treepredict.prune(tree, 0.9)
treepredict.printtree(tree)

# For group homework, modify the "buildtree" function so that it stops
# when a threshold value on entropy is no longer satisfied. It should
# accept a minimum gain parameter and stop dividing the branch if
# this condition is not met. Pruning the tree will not be used in
# this case.