示例#1
0
文件: run.py 项目: wz125/courses
def Dealing_with_Missing_Data():
    """Print ``mdclassify`` results for two observations with ``None`` fields.

    Reloads ``treepredict``, builds a tree from ``treepredict.my_data`` and
    prints the classification of each observation in turn.  The ``None``
    entries presumably stand for missing values (going by this function's
    name) -- confirm against ``treepredict.mdclassify``.
    """
    print('>>Dealing with Missing Data')
    reload(treepredict)
    decision_tree = treepredict.buildtree(treepredict.my_data)
    print('------------------')
    incomplete_rows = (
        ['google', None, 'yes', None],
        ['google', 'France', None, None],
    )
    for row in incomplete_rows:
        print(treepredict.mdclassify(row, decision_tree))
示例#2
0
            gender = doc2.getElementsByTagName('gender')[0].firstChild.data
            age = doc2.getElementsByTagName('age')[0].firstChild.data
            loc = doc2.getElementsByTagName('location')[0].firstChild.data[0:2]

            # Convert the state to its region
            for r, s in stateregions.items():
                if loc in s: region = r

            if region != None:
                result.append((gender, int(age), region, rating))
        except:
            pass
    return result


# Fetch a sample of ratings and turn it into the rows used to build the tree.
l1 = getrandomratings(500)
print(len(l1))
pdata = getpeopledata(l1)
print(pdata[0])

import treepredict

# Build the tree with variance as the scoring function, prune it with a
# threshold of 0.5, and render it to an image file.
hottree = treepredict.buildtree(pdata, scoref=treepredict.variance)
treepredict.prune(hottree, 0.5)
treepredict.drawtree(hottree, 'hottree.jpg')

# Classify observations in which only the third field is supplied.
south = treepredict.mdclassify((None, None, 'south'), hottree)
midat = treepredict.mdclassify((None, None, 'Mid Atlantic'), hottree)

# Share of the total held by key 10 in each result.  float() forces true
# division: under Python 2, `/` on two ints truncates, which would print 0
# if mdclassify hands back plain integer counts (TODO confirm return type).
print(float(south[10]) / sum(south.values()))
print(float(midat[10]) / sum(midat.values()))
import treepredict as tr

# Build a tree from the module's bundled data and print it as text.
tree = tr.buildtree(tr.my_data)
tr.printtree(tree)

# Classify two observations that contain None entries.
print(tr.mdclassify(['google', None, 'yes', None], tree))
print(tr.mdclassify(['google', 'France', None, None], tree))
示例#4
0
# # Test divideSet
# print treepredict.divideSet(data.my_data, 2, "yes")

# # Compare Gini impurity and entropy before and after a split
# print treepredict.giniImpurity(data.my_data)
# print treepredict.entropy(data.my_data)
# set1,set2=treepredict.divideSet(data.my_data, 2, "yes")
# print treepredict.giniImpurity(set1)
# print treepredict.entropy(set1)

# Test buildTree: build the tree and render it to an image file.
tree = treepredict.buildTree(data.my_data)
draw = DrawTree.DrawTree(tree, 'treeview.jpg')
draw.drawTree()

# # Predict with the classify function
# tree=treepredict.buildTree(data.my_data)
# print treepredict.classify(['(direct)','USA','yes',5], tree)

# Try the pruning function on a freshly built tree, then draw the result.
tree = treepredict.buildTree(data.my_data)
treepredict.prune(tree, 1.0)
draw = DrawTree.DrawTree(tree, 'treeview2.jpg')
draw.drawTree()

# Predict with classify and mdclassify on a freshly built tree.
tree = treepredict.buildTree(data.my_data)
print(treepredict.classify(['(direct)', 'USA', 'yes', 5], tree))
print(treepredict.mdclassify(['google', None, 'yes', None], tree))
print(treepredict.mdclassify(['google', 'France', None, None], tree))
print("\nFinal tree...\n")
treepredict.printtree(tree)

# Render the tree to an image file.  The file written is a .jpg, so the
# status message below says "jpeg" rather than "png".
treepredict.drawtree(tree, jpeg="sample_tree.jpg")
print("\njpeg of tree generated using PIL (Python Imaging Library) modules.\n")

# Classify an incoming record of '(direct), USA, yes, 5'.
# The "%s %s" form reproduces the output of the Python 2 statement
# `print label, value` (label, one space, value) in a single expression.
incoming = ["(direct)", "USA", "yes", 5]
print("%s %s" % ("Prediction of new record: ", treepredict.classify(incoming, tree)))

# See how classification via the "mdclassify" function performs on the
# sample data when some fields are None.

# Suppose the last two fields are missing...
reload(treepredict)
missing1 = ["google", "France", None, None]
# Classify once and print that result; the earlier version made the same
# call twice and discarded the first return value.
print("%s %s" % ("Prediction when missing pages: ", treepredict.mdclassify(missing1, tree)))

# Finally, what does pruning do with, say, a mingain of 0.9?
print("\nPruned tree...\n")
treepredict.prune(tree, 0.9)
treepredict.printtree(tree)

# For group homework, modify "buildtree" function so that it stops
# when a threshold value on entropy is no longer satisfied. It should
# accept a minimum gain parameter and stop dividing the branch if
# this condition is not met.  Pruning the tree will not be used in
# this case.