import treepredict


def Choosing_the_Best_Split():
    print '>>Choosing the Best Split'
    reload(treepredict)
    print treepredict.giniimpurity(treepredict.my_data)
    print treepredict.entropy(treepredict.my_data)
    set1, set2 = treepredict.divideset(treepredict.my_data, 2, 'yes')
    print treepredict.entropy(set1)
    print treepredict.giniimpurity(set1)
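# For reference: a minimal, self-contained sketch of the two impurity
# measures the script above prints. This is NOT treepredict's own code; it
# only assumes the standard formulas and that the class label is the last
# field of each row.
from math import log


def class_counts(rows):
    counts = {}
    for row in rows:
        label = row[-1]  # class label is the last field
        counts[label] = counts.get(label, 0) + 1
    return counts


def gini_impurity(rows):
    # Probability that two rows drawn at random carry different labels:
    # 1 - sum(p_k ** 2) over the class distribution.
    total = float(len(rows))
    return 1 - sum((n / total) ** 2 for n in class_counts(rows).values())


def shannon_entropy(rows):
    # -sum(p_k * log2(p_k)) over the class distribution.
    total = float(len(rows))
    return -sum((n / total) * log(n / total, 2)
                for n in class_counts(rows).values())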
import treepredict

results = treepredict.divideset(treepredict.my_data, 2, "yes")
# results is now a pair of lists (the two branches)
# See if records are divided according to FAQ field (column) ...
print "\nDivision on Read FAQ field...\n"
for subset in results:
    for item in subset:
        print "%15s %15s %5s %10d %15s" % tuple(item)

# Let's see the difference between gini- and entropy-based impurities
# of the current data (no splitting)
print "\nParent node...\n"
gini = treepredict.giniimpurity(treepredict.my_data)
entr = treepredict.entropy(treepredict.my_data)
print "Gini: %8f  Entropy: %8f" % (gini, entr)

# Let's now split on the Read FAQ field and assess impurity
node1, node2 = treepredict.divideset(treepredict.my_data, 2, "yes")
print "\nRead FAQ = Yes leaf node...\n"
gini = treepredict.giniimpurity(node1)
entr = treepredict.entropy(node1)
print "Gini: %8f  Entropy: %8f" % (gini, entr)

# Build the DT recursively using the buildtree function; assumes the
# last column/field is the classification attribute.
tree = treepredict.buildtree(treepredict.my_data)
# Let's see what it looks like...
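# Sketch of the splitting behaviour the script above depends on (assumption:
# numeric columns are split on >= value, all other columns on == value).
# The real treepredict.divideset may differ in its details.
def divide_rows(rows, column, value):
    if isinstance(value, (int, float)):
        matches = lambda row: row[column] >= value
    else:
        matches = lambda row: row[column] == value
    set1 = [row for row in rows if matches(row)]      # rows passing the test
    set2 = [row for row in rows if not matches(row)]  # everything else
    return set1, set2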
#!/usr/bin/env python2

import treepredict

print "Gini impurity\n"
print treepredict.giniimpurity(treepredict.my_data)
print "\n"
print "treepredict.entropy\n"
print treepredict.entropy(treepredict.my_data)
print "\n"

set1, set2 = treepredict.divideset(treepredict.my_data, 2, 'yes')
print "Gini impurity\n"
print treepredict.giniimpurity(set1)
print "treepredict.entropy\n"
print treepredict.entropy(set1)
print '\n'

tree = treepredict.buildtree(treepredict.my_data)
print 'tree: ', tree
print '\n'
print 'classify: ', treepredict.classify(['(direct)', 'USA', 'yes', 5], tree)
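# Rough sketch of what the classify call above does: walk the tree from the
# root, picking the true/false branch at each split until a leaf is reached.
# Assumption: nodes expose col, value, results (non-None only at leaves) and
# tb/fb child branches; the local treepredict module may differ.
def classify_observation(observation, node):
    if node.results is not None:
        return node.results  # leaf: dictionary of class counts
    v = observation[node.col]
    if isinstance(v, (int, float)):
        branch = node.tb if v >= node.value else node.fb
    else:
        branch = node.tb if v == node.value else node.fb
    return classify_observation(observation, branch)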
from pprint import pprint

import treepredict

pprint(treepredict.divideset(treepredict.my_data, 2, 'yes'))
# ([['slashdot', 'USA', 'yes', 18, 'None'],
#   ['google', 'France', 'yes', 23, 'Premium'],
#   ['digg', 'USA', 'yes', 24, 'Basic'],
#   ['kiwitobes', 'France', 'yes', 23, 'Basic'],
#   ['slashdot', 'France', 'yes', 19, 'None'],
#   ['digg', 'New Zealand', 'yes', 12, 'Basic'],
#   ['google', 'UK', 'yes', 18, 'Basic'],
#   ['kiwitobes', 'France', 'yes', 19, 'Basic']],
#  [['google', 'UK', 'no', 21, 'Premium'],
#   ['(direct)', 'New Zealand', 'no', 12, 'None'],
#   ['(direct)', 'UK', 'no', 21, 'Basic'],
#   ['google', 'USA', 'no', 24, 'Premium'],
#   ['digg', 'USA', 'no', 18, 'None'],
#   ['google', 'UK', 'no', 18, 'None'],
#   ['kiwitobes', 'UK', 'no', 19, 'None'],
#   ['slashdot', 'UK', 'no', 21, 'None']])

print(treepredict.giniimpurity(treepredict.my_data))
# 0.6328125
print(treepredict.entropy(treepredict.my_data))
# 1.50524081494

set1, set2 = treepredict.divideset(treepredict.my_data, 2, 'yes')
print(treepredict.entropy(set1))
# 1.2987949407
print(treepredict.entropy(set2))
# 1.2987949407
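# Sanity check on the printed figures: the 'yes' branch above holds 5 Basic,
# 2 None and 1 Premium rows out of 8, and the 'no' branch has the mirrored
# counts 5/2/1, which is why both halves report the same entropy.
from math import log

counts = [5, 2, 1]
total = float(sum(counts))
print(-sum((n / total) * log(n / total, 2) for n in counts))
# ~1.2987949407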
def testBasics(self):
    d = treepredict.testdata()
    self.assertAlmostEquals(1.5052408, treepredict.entropy(d))
    s1, s2 = treepredict.divideset(d, 2, 'yes')
    self.assertAlmostEquals(1.2987949, treepredict.entropy(s1))
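# Sketch of a companion check for the Gini figure printed in the scripts
# above (0.6328125 on the full sample); assumes testdata() returns the same
# 16-row data set used there.
def testGiniImpurity(self):
    d = treepredict.testdata()
    self.assertAlmostEquals(0.6328125, treepredict.giniimpurity(d))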