Exemplo n.º 1
0
def A2():
  # Assignment 2: print the information gain of every attribute for the
  # three MONK training sets (Python 2 print statements; dT and m are
  # module-level imports defined elsewhere in the file).
  for att in m.attributes:
    #print att[ 0 ]
    print "Gain in Monk1 is : ", dT.averageGain( m.monk1, att )
    print "Gain in Monk2 is : ", dT.averageGain( m.monk2, att )
    print "Gain in Monk3 is : ", dT.averageGain( m.monk3, att )
    print '\n'
Exemplo n.º 2
0
def A2():
    # Assignment 2 (duplicate of the variant above, 4-space indent):
    # print the information gain of each attribute for MONK-1..3
    # (Python 2 print statements; dT and m come from module imports).
    for att in m.attributes:
        #print att[ 0 ]
        print "Gain in Monk1 is : ", dT.averageGain(m.monk1, att)
        print "Gain in Monk2 is : ", dT.averageGain(m.monk2, att)
        print "Gain in Monk3 is : ", dT.averageGain(m.monk3, att)
        print '\n'
Exemplo n.º 3
0
def assignment2():
    """Report, for each MONK training set, the best attribute to split on.

    The best attribute is the one with the highest averageGain; ties keep
    the earliest attribute (same as max over enumerate pairs).
    """
    print("Assignment 2")
    gain_rows = [
        [d.averageGain(dataset, attr) for attr in m.attributes]
        for dataset in (m.monk1, m.monk2, m.monk3)
    ]
    for row in gain_rows:
        # print("MONK-%d: %f %f %f %f %f %f" % tuple([i + 1] + l[i]), end=" | ")
        best_index = max(range(len(row)), key=row.__getitem__)
        print("Split on: a%d" % (best_index + 1))
Exemplo n.º 4
0
def assignment2():
    """Print which attribute gives the highest gain on each MONK set."""
    print("Assignment 2")
    for dataset in [m.monk1, m.monk2, m.monk3]:
        gains = [d.averageGain(dataset, attr) for attr in m.attributes]
        # list.index(max(...)) picks the first maximum, matching the
        # original max-over-enumerate behaviour on ties.
        best = gains.index(max(gains))
        print("Split on: a%d" % (best + 1))
Exemplo n.º 5
0
def assignment3():
    """Tabulate averageGain (rounded to 5 dp) of a1-a6 for monk1-3."""
    rows = []
    for label, dataset in (("monk1", m.monk1),
                           ("monk2", m.monk2),
                           ("monk3", m.monk3)):
        rows.append([label] + [round(tree.averageGain(dataset, m.attributes[j]), 5)
                               for j in range(6)])
    print(tabulate(rows,
                   headers=['Dataset', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6'],
                   tablefmt='orgtbl'))
Exemplo n.º 6
0
def getBestAttribute(dataset, attributes, available):
    """Return the index of the best attribute to split on, or None.

    Only attributes whose flag in ``available`` is truthy are considered.
    Fix: the original called d.averageGain twice per candidate (once in
    the test, once for the assignment); it is now computed once.  Stray
    semicolons removed.
    """
    mostGain = -1
    bestAttribute = None
    for i in range(len(attributes)):
        if not available[i]:
            continue
        gain = d.averageGain(dataset, attributes[i])  # compute once
        if gain > mostGain:
            mostGain = gain
            bestAttribute = i
    return bestAttribute
Exemplo n.º 7
0
def getBestAttribute(dataset, attributes, available):
    """Return the index of the highest-gain available attribute, or None.

    ``available[i]`` gates whether attributes[i] may be chosen.
    Fix: d.averageGain was evaluated twice per attribute (comparison and
    assignment); the gain is now computed a single time per candidate.
    """
    mostGain = -1
    bestAttribute = None
    for i, attribute in enumerate(attributes):
        if not available[i]:
            continue
        gain = d.averageGain(dataset, attribute)  # evaluate once
        if gain > mostGain:
            mostGain = gain
            bestAttribute = i
    return bestAttribute
Exemplo n.º 8
0
def gain():
    """Return [(attribute, averageGain), ...] lists for monk1-3.

    One inner list per training set, one tuple per attribute a1-a6.
    """
    results = []
    for dataset in (mdata.monk1, mdata.monk2, mdata.monk3):
        results.append([(mdata.attributes[i],
                         dtree.averageGain(dataset, mdata.attributes[i]))
                        for i in range(6)])
    return results
Exemplo n.º 9
0
def assignment3():
    """Print the entropy gains of the six attributes for each MONK set."""
    labels = ("first", "second", "third")
    datasets = (m.monk1, m.monk2, m.monk3)
    for label, dataset in zip(labels, datasets):
        gains = [d.averageGain(dataset, m.attributes[i]) for i in range(0, 6)]
        print("The gains of entropy on the %s dataset per attribute is %s"
              % (label, gains))
Exemplo n.º 10
0
def calc_gain():
  # Assignment 2: print a tab-separated table of averageGain per
  # attribute for the Monk1-3 training sets (Python 2 print statements;
  # `data` and `dt` are module-level imports defined elsewhere).
  print "\n------------------------------\nAssignment 2 - Average gain\n------------------------------"
  # NOTE(review): `i` is assigned but never used — leftover counter.
  i = 1
  print "Dataset\t  a1\t\t  a2\t\t  a3\t\t  a4\t\t  a5\t\t  a6"
  s = "Monk1\t"
  for attr in data.attributes: 
    s = s + "%.6f\t" % (dt.averageGain(data.monk1, attr))
  print s
  s = "Monk2\t"
  for attr in data.attributes: 
    s = s + "%.6f\t" % (dt.averageGain(data.monk2, attr))
  print s
  s = "Monk3\t"
  for attr in data.attributes: 
    s = s + "%.6f\t" % (dt.averageGain(data.monk3, attr))
  print s
Exemplo n.º 11
0
def printGains(datasets, attributes):
    """Print averageGain for every (dataset, attribute) pair."""
    for i, dataset in enumerate(datasets, start=1):
        for j, attribute in enumerate(attributes, start=1):
            gain = d.averageGain(dataset, attribute)
            print("Gain monk%s, a%s: %s" % (i, j, gain))
        print("---------------")
Exemplo n.º 12
0
def infoGainOnSubsets(subsets):
    """Sum the information gain of each attribute over all subsets.

    Each subset is one branch produced by the previous split; summing
    the gains column-wise shows which attribute is best on level two
    overall.  Prints the summed gains and returns them (the original
    returned None, so returning the list is backward compatible).

    Summing over the information gains given by the attributes in different
    nodes on level two shows that attribute 1 gives the best information gain.
    """
    # One row per subset, one column per attribute — replaces the
    # original's manual while-loops and index bookkeeping.
    per_subset = [
        [dtree.averageGain(sset, attr) for attr in m.attributes]
        for sset in subsets
    ]
    if per_subset:
        # Column-wise sum across subsets.
        result = [sum(column) for column in zip(*per_subset)]
    else:
        # Empty input yields all zeros, as in the original.
        result = [0] * len(m.attributes)
    print(result)
    return result
Exemplo n.º 13
0
def get_avg_gain_dict(dataset):
    """Map 'A1'..'An' to the averageGain of the matching attribute."""
    return {
        "A%d" % (index + 1): d.averageGain(dataset, attribute)
        for index, attribute in enumerate(m.attributes)
    }
Exemplo n.º 14
0
def information_gain(datasets):
    """Return a (len(datasets) x 6) matrix of rounded average gains."""
    attributes_names = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6']
    matrix = np.zeros((len(datasets), len(m.attributes)))
    for row, dataset in enumerate(datasets):
        for col in range(len(attributes_names)):
            # Round to 4 dp, as in the original report tables.
            matrix[row, col] = round(d.averageGain(dataset, m.attributes[col]), 4)
    return matrix
Exemplo n.º 15
0
def print_average_gain(datasets, attributes):
    """Print the information gain of every attribute for every dataset.

    Both arguments behave as mappings: name -> dataset / name -> attribute
    (they are iterated for their keys and indexed by those keys).
    """
    print("Information gain")
    for ds_name in datasets:
        print("For " + ds_name + " the:")
        for attr_name in attributes:
            gain = d.averageGain(datasets[ds_name], attributes[attr_name])
            print("information gain for " + attr_name + " was: " + str(gain))
Exemplo n.º 16
0
def informationGainCalculation():
    """Print averageGain per attribute and the best attribute per MONK set.

    Fix: the original repeated the same loop three times verbatim; the
    three datasets are now iterated, producing byte-identical output.
    """
    print("Information gain results ", "\n")
    for name, dataset in (("Monk1", m.monk1),
                          ("Monk2", m.monk2),
                          ("Monk3", m.monk3)):
        for attributeIndex in range(0, 6):
            result = d.averageGain(dataset, m.attributes[attributeIndex])
            print(name + "|   ", attributeIndex + 1, ": ", result, "    ")
        print("Best attribute: ", d.bestAttribute(dataset, m.attributes), "\n")
Exemplo n.º 17
0
def bestAttribute(dataset, attributes):
    """Return the attribute with the greatest averageGain on ``dataset``.

    Falls back to attributes[0] when no gain is strictly positive,
    matching the original behaviour.
    """
    best, best_gain = attributes[0], 0
    for candidate in attributes:
        gain = dt.averageGain(dataset, candidate)
        if gain > best_gain:
            best, best_gain = candidate, gain
    return best
Exemplo n.º 18
0
def chosenNode(DataSet, attributes):
    """Return the name of the attribute with the highest information gain.

    Fix: the original stored gains as *dict keys* mapping to attribute
    names, which silently overwrote attributes with identical gains.
    Tracking the running best avoids the collision (on an exact tie this
    keeps the earliest attribute, where the dict kept the last).
    """
    bestName = None
    bestGain = float("-inf")
    for attribute in attributes:
        gain = dtree.averageGain(DataSet, attribute)
        if gain > bestGain:
            bestGain = gain
            bestName = attribute.name
    return bestName
Exemplo n.º 19
0
def get_avg_gain_dict_exclude(dataset, exclude=()):
    """Return {'A<i+1>': averageGain} for attributes not in ``exclude``.

    ``exclude`` holds attribute *indices* to skip.  Fix: the default was
    a mutable list (the classic Python pitfall); it is only read here,
    but an immutable tuple default is safe and backward compatible.
    """
    avg_gain_dict = dict()
    for i, attribute in enumerate(m.attributes):
        if i in exclude:
            continue
        avg_gain_dict["A" + str(i + 1)] = d.averageGain(dataset, attribute)
    return avg_gain_dict
Exemplo n.º 20
0
def assignment_3():
    """Print averageGain of a1-a6 on monk1 (via rnd) plus their total."""
    total = 0
    for index in range(6):
        gain = dtree.averageGain(m.monk1, m.attributes[index])
        total += gain
        print(index + 1, rnd(gain, 8))

    print(total)
Exemplo n.º 21
0
def assignment2():
	# Print a Texttable of averageGain per attribute for Monk-1..3
	# (Python 2: print statements, and `map` returns a list here).
	# `monkdata` is a module-level sequence of the three training sets.
	print "--- Assignment 2 ---"
	print "Selecting the root of the decision tree"
	table = Texttable(max_width=100)
	table.add_row(["Dataset", "a1", "a2", "a3", "a4", "a5", "a6"])
	for i in range(3):
		gains = map(lambda att: d.averageGain(monkdata[i],att), m.attributes)
		table.add_row(["Monk-" + str(i+1)] + gains)
	print table.draw()
	print
Exemplo n.º 22
0
def optimal_attr_split(data, attributes):
    """Print each attribute's gain and return the index of the best one."""
    best_index, best_gain = 0, -1
    for index, attribute in enumerate(attributes):
        gain = dtree.averageGain(data, attribute)
        print("Attribute " + str(index + 1) + " information gain: " + str(gain))
        if best_gain < gain:
            best_gain, best_index = gain, index

    print("Best split attribute is attribute " + str(best_index + 1))
    return best_index
Exemplo n.º 23
0
def informationGain(data):
    """Return the averageGain of attributes a1-a6 on the monk3 set.

    Fix: the original computed the six gains into locals a1..a6 and
    discarded them (the function always returned None).  Returning the
    list makes the function usable and is backward compatible with
    callers that ignored the old None result.

    NOTE(review): only data.monk3 is read despite the generic parameter
    name — presumably intentional for assignment 3; confirm.
    """
    return [tree.averageGain(data.monk3, data.attributes[i]) for i in range(6)]
Exemplo n.º 24
0
def getMaxGain(dataset, attributes):
    """Print all gains and return the (attribute, gain) pair with max gain."""
    gains = []
    for attribute in attributes:
        gains.append((attribute, dtree.averageGain(dataset, attribute)))

    # LaTeX-ish row of rounded gains.
    print(" & ".join(str(round(g, 5)) for _, g in gains))

    attribute_max = max(gains, key=lambda pair: pair[1])
    print("Max gain on attribute", attribute_max)
    return attribute_max
Exemplo n.º 25
0
def compute_gain():
    """Print a PrettyTable of averageGain (10 dp) for a1-a6 of each MONK set."""
    print ("Compute information gain of attributes in training datasets:")

    ig_table = PrettyTable(['Dataset', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6'])

    for i in range(3):
        row = ["MONK-{0}".format(i + 1)]
        row.extend(round(dt.averageGain(monks[i], m.attributes[k]), 10)
                   for k in range(6))
        ig_table.add_row(row)

    print(ig_table)
    print ()
Exemplo n.º 26
0
def find_information_gain(data_set, attributes):
    """Return the averageGain of every attribute on ``data_set``.

    Generalized: the original hard-coded indices 0-5; this iterates
    whatever attribute sequence is passed (identical result for the
    usual six-attribute MONK list).
    """
    return [d.averageGain(data_set, attribute) for attribute in attributes]
Exemplo n.º 27
0
def calc_next_level():
  # For each value of a5, print the average gain of every attribute
  # within the selected subset, then assemble a hand-built two-level
  # tree string and compare it with dt.buildTree's depth-2 tree
  # (Python 2 print statements; `data` and `dt` are module imports).
  #print "\nAverage gain when a5 is choosen"
  print "\nA5\t  a1\t\t  a2\t\t  a3\t\t  a4\t\t  a5\t\t  a6"
  s = "A5(" 
  for val in data.attributes[4].values:
    subset = dt.select(data.monk1, data.attributes[4], val)
    t = "\t"
    for attr in data.attributes: 
      t = t + "%.6f\t" % (dt.averageGain(subset, attr))
    print val , t
    best = dt.bestAttribute(subset, data.attributes)
    s = s + best.name + "("
    #print "best attribute: ", best.name
    # Leaf sign = majority class of the sub-subset for each value.
    for value in best.values:
      #print "choose: ", value, "mostCommon: ", dt.mostCommon(dt.select(subset, best, value))
      if(dt.mostCommon(dt.select(subset, best, value))): 
        s = s + "+"
      else:
        s = s + "-"
    s = s + ")"
  s = s + ")"
  print "\nOur tree:\t", s
  print "Build tree:\t", dt.buildTree(data.monk1, data.attributes, 2)
Exemplo n.º 28
0
# Split MONK-1 on attribute a5, inspect each value subset, then build
# full trees and report error rates (d and m are module imports).
print("3. Building decision tree: \n")

print("Subset division of MONK-1 at attribute 5: \n")
# One subset per value of a5 (values are 1-based, hence x+1).
subsets = [];
for x in range(0, len(m.attributes[4].values)):
    subsets.append(d.select(m.monk1, m.attributes[4], x+1))

# For each subset: majority class and the attribute with the highest
# information gain (the split candidate for the next level).
# NOTE(review): the loop variable `set` shadows the builtin.
for set in subsets:
    gain = 0;
    maxgain = 0;
    bestatr = 0;
    print("Value: %d" % (subsets.index(set) + 1))
    print("Most common: " + str(d.mostCommon(set)))
    for x in range(0, len(m.attributes)):
        gain = d.averageGain(set, m.attributes[x]);
        print("Attribute A%d: %f" % (x+1, gain))
        if(gain > maxgain):
            maxgain = gain;
            bestatr = x;
    print("Attribute with best information gain: A%d \n" % (bestatr + 1));
    maxgain = 0
    bestatr = 0;


# Error rate is 1 - d.check (check is used as an accuracy score here).
print("MONK-1:")
t = d.buildTree(m.monk1, m.attributes)
print("Testing set error %f: " % (1 - d.check(t, m.monk1test)))
print("Training set error %f: \n" % (1 - d.check(t, m.monk1)))

print("MONK-2:")
Exemplo n.º 29
0
import dtree
import drawtree_qt5 as draw
import numpy as np
import matplotlib.pyplot as plt
import random

# Assignments 1-2: entropy of each MONK training set, then the
# information gain of all six attributes (dtree and m imported above).
entropyMonk1 = dtree.entropy(m.monk1)
entropyMonk2 = dtree.entropy(m.monk2)
entropyMonk3 = dtree.entropy(m.monk3)

print(f'Entropy for monk1: {entropyMonk1}')
print(f'Entropy for monk2: {entropyMonk2}')
print(f'Entropy for monk3: {entropyMonk3}')

# One gain per attribute, in attribute order.
informationGainMonk1 = list(
    map(lambda x: dtree.averageGain(m.monk1, x), m.attributes))
informationGainMonk2 = list(
    map(lambda x: dtree.averageGain(m.monk2, x), m.attributes))
informationGainMonk3 = list(
    map(lambda x: dtree.averageGain(m.monk3, x), m.attributes))

print(
    f'Information gain for all 6 attuributes for monk1: {informationGainMonk1}'
)
print(
    f'Information gain for all 6 attuributes for monk2: {informationGainMonk2}'
)
print(
    f'Information gain for all 6 attuributes for monk3: {informationGainMonk3}'
)
Exemplo n.º 30
0
def ASSIGNMENT3(dataset):
    """Print the average gain of each attribute on ``dataset``."""
    position = 1
    for attribute in m.attributes:
        gain = dtree.averageGain(dataset, attribute)
        print("Average gain of a{:d}: {:f}".format(position, gain))
        position += 1
Exemplo n.º 31
0
# Needed import for drawing the decision tree.
#import drawtree as drawtree

# Datasets
train = [monk.monk1, monk.monk2, monk.monk3]
test = [monk.monk1test, monk.monk2test, monk.monk3test]

# Assignment 1: dataset entropies.
print("Entropy for monk1 dataset is {}".format(dt.entropy(monk.monk1)))
print("Entropy for monk2 dataset is {}".format(dt.entropy(monk.monk2)))
print("Entropy for monk3 dataset is {}".format(dt.entropy(monk.monk3)))

# Assignment 2: average gain of each attribute per training set.
for i, dataset in enumerate(train):
    print("")
    print("Average gain for monk{} for each attribute".format(i + 1))
    for j, attribute in enumerate(monk.attributes):
        print("a{} = {}".format(j + 1, dt.averageGain(dataset, attribute)))

# Assignment 3: the four MONK-1 subsets for each value of a5 (index 4).
monk1a5 = [dt.select(monk.monk1, monk.attributes[4], 1), dt.select(monk.monk1, monk.attributes[4], 2), dt.select(monk.monk1, monk.attributes[4], 3), dt.select(monk.monk1, monk.attributes[4], 4)]

# a5 itself (j == 4) is excluded from the second-level gain report.
for i, monk1 in enumerate(monk1a5):
    print("")
    print("Average gain for monk1 where a5 = {} for each attribute".format(i + 1))
    for j, attribute in enumerate(monk.attributes):
        if j != 4:
            print("a{} = {}".format(j + 1, dt.averageGain(monk1, attribute)))
    print("Majority class = {}".format(dt.mostCommon(monk1)))


# Building the decision tree.
tree1 = dt.buildTree(monk.monk1, monk.attributes)
tree2 = dt.buildTree(monk.monk2, monk.attributes)
def avgGain(datasets):
    """Return one row of per-attribute averageGain values per dataset."""
    table = []
    for dataset in datasets:
        table.append([dtree.averageGain(dataset, attribute)
                      for attribute in m.attributes])
    return table
Exemplo n.º 33
0
def infGain(s):
	# Print the averageGain of attributes a1-a6 on dataset s
	# (Python 2 print statement; d and m are module-level imports).
	for x in range(0, 6):
		print "a", x + 1, " ", d.averageGain(s, m.attributes[x])
Exemplo n.º 34
0
import monkdata as m
import dtree as dtree

# Gains of the remaining attributes inside the MONK-1 subset where
# a5 == 3, then the majority class where a5 == 1 (Python 2 prints).
foo = dtree.select(m.monk1, m.attributes[4], 3)
print '-- information gain of monk-1 dataset: --'
print 'a_1: ' + str(dtree.averageGain(foo, m.attributes[0]))
print 'a_2: ' + str(dtree.averageGain(foo, m.attributes[1]))
print 'a_3: ' + str(dtree.averageGain(foo, m.attributes[2]))
print 'a_4: ' + str(dtree.averageGain(foo, m.attributes[3]))
# a_5 is skipped: it is the attribute already split on.
print 'a_6: ' + str(dtree.averageGain(foo, m.attributes[5]))

foo = dtree.select(m.monk1, m.attributes[4], 1)
print '-- is a_5 with value = 1 a majority class? --'
print dtree.mostCommon(foo)
Exemplo n.º 35
0
def printGain(dataset, attributes):
    """Print 'Gain aN: <gain>' for every attribute on ``dataset``."""
    for index, attribute in enumerate(attributes, start=1):
        gain = d.averageGain(dataset, attribute)
        print("Gain a" + str(index) + ": " + str(gain))
Exemplo n.º 36
0
# Assignment 1: entropies of the three MONK training sets, 5 dp.
monkEntropy = [round(t.entropy(m.monk1), 5), round(t.entropy(m.monk2), 5), round(t.entropy(m.monk3), 5)]
# The bare strings below are no-op expression statements used as
# section markers in the original; kept unchanged.
"--Answer to Assignment 1"
print(monkEntropy, "\n")

"4 INFORMATION GAIN"
"--Assignment 2"
monkTrainingSets = [m.monk1, m.monk2, m.monk3]
informationGain = []

print("Assignment 2: Expected information gains")
att = []
# save values for each attribute
for monk in monkTrainingSets:  # for each data set
    for attribute in m.attributes:  # for every attribute
        # calculate the gain of splitting by the attribute
        att.append(round(t.averageGain(monk, attribute), 5))

    informationGain.append(att)  # save a "row vector"
    att = []

"--Answer to Assignment 2"
print(informationGain[2], "\n")

# print(t.bestAttribute(m.monk1, m.attributes))

""" 
Attribute a5 has the largest information gain meaning that it reduces the 
uncertainty the most. Thus, it should be used for splitting at the root node.
"""

Exemplo n.º 37
0
def assignment2():
    """Print the six averageGain values for each MONK training set.

    Fix: the original repeated eighteen near-identical averageGain calls
    verbatim; the datasets and attribute indices are now iterated.  The
    printed output is byte-identical (one header line per set, then the
    six gains space-separated on one line).
    """
    for name, dataset in (("Monk-1", m.monk1),
                          ("Monk-2", m.monk2),
                          ("Monk-3", m.monk3)):
        print("Average Gain for ", name)
        # Unpack so the six gains appear as one print call, as before.
        print(*[d.averageGain(dataset, m.attributes[i]) for i in range(6)])
Exemplo n.º 38
0
def myBuildTree(dataset, levels):
    """Greedy level-by-level split search over m.attributes.

    At each level, splits on the attribute with the largest averageGain
    (only for subsets with more than 5 samples), printing diagnostics,
    and returns the list of attributes split on.  Relies on the
    module-level names m, d and the split() helper defined elsewhere.
    """
    treeLevels = []
    splits = []
    treeLevels.append(dataset)
    datasubsets = dataset
    datasubsetsAvgGains = []
    for level in range(0, levels):
        print("\n===Level #: ", level)
        if level == 0:
            attribAvgGains = []
            largestGain = 0
            largestAttribIndex = 0
            # Only split sets with more than 5 samples.
            if len(datasubsets) > 5:
                for attribute in range(0, len(m.attributes)):
                    avgGain = d.averageGain(datasubsets, m.attributes[attribute])
                    if avgGain > largestGain:
                        largestGain = avgGain
                        largestAttribIndex = attribute
                    attribAvgGains.append(avgGain)
                    print("Attribute: ", attribute, "\t\tAverage gain: ", avgGain)
                    # NOTE(review): this append runs once per attribute,
                    # storing the same growing list repeatedly — looks
                    # unintended; confirm before relying on it.
                    datasubsetsAvgGains.append(attribAvgGains)
                print("---Splitting at attribute: ", m.attributes[largestAttribIndex])
                datasubsets = split(datasubsets, m.attributes[largestAttribIndex])
                splits.append(m.attributes[largestAttribIndex])
                treeLevels.append(datasubsets)

        elif level > 0:
            print("---No. of datasets: ", len(datasubsets))
            newdatasubsets = []
            for i in range(0, len(datasubsets)):
                print("\n---Datasubset: ", i, "\t\tEntropy: ", d.entropy(datasubsets[i]))
                attribAvgGains = []
                # NOTE(review): resetting newdatasubsets inside the loop
                # discards subsets produced for earlier i — confirm.
                newdatasubsets = []
                largestGain = 0
                largestAttribIndex = 0
                if len(datasubsets[i]) > 5:
                    for attribute in range(0, len(m.attributes)):
                        avgGain = d.averageGain(datasubsets[i], m.attributes[attribute])
                        if avgGain > largestGain:
                            largestGain = avgGain
                            largestAttribIndex = attribute
                        attribAvgGains.append(avgGain)
                        print("Attribute: ", attribute, "\t\tAverage gain: ", avgGain)
                    # NOTE(review): avgGain here is the gain of the *last*
                    # attribute, not the best one — possibly meant to be
                    # largestGain; confirm.
                    if avgGain > 0:
                        print("---Splitting at attribute: ", m.attributes[largestAttribIndex].name)
                        newdatasubsets.append(split(datasubsets[i], m.attributes[largestAttribIndex]))
                        splits.append(m.attributes[largestAttribIndex])
                    else:
                        print(
                            "---Skipping splitting at attribute: ",
                            m.attributes[largestAttribIndex].name,
                            "Dataset #",
                            i,
                        )
                    datasubsetsAvgGains.append(attribAvgGains)

            if len(newdatasubsets[0]) > 1:
                datasubsets = newdatasubsets[0]
                print("---No. of New datasets: ", len(datasubsets))
            treeLevels.append(datasubsets)

    return splits
Exemplo n.º 39
0
__author__ = 'swebo_000'

import monkdata as m
import dtree as d
#import drawtree

# Assignments 1-2: entropy and per-attribute information gain of the
# three MONK training sets (d = dtree, m = monkdata, imported above).
monkset = [m.monk1, m.monk2, m.monk3]

print("1. Entropy of the MONK datasets:")
for x in range(0, len(monkset)):
    print("MONK-%d: %f" % (x+1, d.entropy(monkset[x])))
print();

print("2. Information gain from attributes:")
# NOTE(review): the loop variable `set` shadows the builtin, and
# monkset.index(set) is an O(n) way to recover the loop index.
for set in monkset:
    print("MONK-%d" % (monkset.index(set) + 1))
    for x in range(0, len(m.attributes)):
        print("Attribute %d: %f" %(x+1, d.averageGain(set, m.attributes[x])))
    print()
Exemplo n.º 40
0
    pruned_trees_performance = [0 for x in range(len(pruned_trees))]
    for candidate in pruned_trees:
        index = pruned_trees.index(candidate)
        pruned_trees_performance[index] = d.check(candidate, validation)
    if d.check(tree, validation) <= max(pruned_trees_performance):
        tree = pruned_trees[pruned_trees_performance.index(max(pruned_trees_performance))]
        tree = prune_tree(tree, validation)
    return tree

# Entropies of the three training sets, then the six averageGain
# values per dataset formatted on one line each.
print(d.entropy(m.monk1))
print(d.entropy(m.monk2))
print(d.entropy(m.monk3))
print("\n")

print("monk-1: %f %f %f %f %f %f" % (
    d.averageGain(m.monk1, m.attributes[0]), d.averageGain(m.monk1, m.attributes[1]),
    d.averageGain(m.monk1, m.attributes[2]), d.averageGain(m.monk1, m.attributes[3]),
    d.averageGain(m.monk1, m.attributes[4]), d.averageGain(m.monk1, m.attributes[5])
))

print("monk-2: %f %f %f %f %f %f" % (
    d.averageGain(m.monk2, m.attributes[0]), d.averageGain(m.monk2, m.attributes[1]),
    d.averageGain(m.monk2, m.attributes[2]), d.averageGain(m.monk2, m.attributes[3]),
    d.averageGain(m.monk2, m.attributes[4]), d.averageGain(m.monk2, m.attributes[5])
))

print("monk-3: %f %f %f %f %f %f" % (
    d.averageGain(m.monk3, m.attributes[0]), d.averageGain(m.monk3, m.attributes[1]),
    d.averageGain(m.monk3, m.attributes[2]), d.averageGain(m.monk3, m.attributes[3]),
    d.averageGain(m.monk3, m.attributes[4]), d.averageGain(m.monk3, m.attributes[5])
))
Exemplo n.º 41
0
def getClasification(dataset, fraction):
    """Prune-search: return (best score, best pruned tree).

    Fixes: ``bestTree`` could be unbound (NameError) when no pruned tree
    scored above 0 — it now falls back to the unpruned tree; and
    tree.check was evaluated twice per candidate, now once.

    NOTE(review): the tree is built on the *validation* partition and
    scored on the *training* partition — the names suggest these may be
    swapped; confirm against the lab instructions.
    """
    monk1train, monk1val = partition(dataset, fraction)
    testTree = tree.buildTree(monk1val, m.attributes)
    pValue = 0
    bestTree = testTree  # safe fallback instead of an unbound name
    for pruned in tree.allPruned(testTree):
        score = tree.check(pruned, monk1train)
        if score > pValue:
            bestTree = pruned
            pValue = score
    return pValue, bestTree

# Assignments 1-2 output (Python 2 print statements): entropies of the
# three training sets, then the gain of each attribute per set.
print "Entropy Monk1: " + str(tree.entropy(m.monk1))
print "Entropy Monk2: " + str(tree.entropy(m.monk2))
print "Entropy Monk3: " + str(tree.entropy(m.monk3))

print "Gain Monk1 a1: " + str(tree.averageGain(m.monk1,m.attributes[0]))
print "Gain Monk1 a2: " + str(tree.averageGain(m.monk1,m.attributes[1]))
print "Gain Monk1 a3: " + str(tree.averageGain(m.monk1,m.attributes[2]))
print "Gain Monk1 a4: " + str(tree.averageGain(m.monk1,m.attributes[3]))
print "Gain Monk1 a5: " + str(tree.averageGain(m.monk1,m.attributes[4]))
print "Gain Monk1 a6: " + str(tree.averageGain(m.monk1,m.attributes[5]))

print "Gain Monk2 a1: " + str(tree.averageGain(m.monk2,m.attributes[0]))
print "Gain Monk2 a2: " + str(tree.averageGain(m.monk2,m.attributes[1]))
print "Gain Monk2 a3: " + str(tree.averageGain(m.monk2,m.attributes[2]))
print "Gain Monk2 a4: " + str(tree.averageGain(m.monk2,m.attributes[3]))
print "Gain Monk2 a5: " + str(tree.averageGain(m.monk2,m.attributes[4]))
print "Gain Monk2 a6: " + str(tree.averageGain(m.monk2,m.attributes[5]))

print "Gain Monk3 a1: " + str(tree.averageGain(m.monk3,m.attributes[0]))
print "Gain Monk3 a2: " + str(tree.averageGain(m.monk3,m.attributes[1]))
Exemplo n.º 42
0
#!/usr/bin/env python

import dtree as d
import monkdata as m

# Assignments 1-2 (Python 2 prints; d = dtree, m = monkdata above).
monkset = [m.monk1, m.monk2, m.monk3]
mtrain = [m.monk1test, m.monk2test, m.monk3test]

#Assignement 1
print 'Entropy for monk1-3'
# NOTE(review): `j` is never used/incremented — leftover from the
# commented-out formatting line below.
j = 1
for monk in monkset:
   #s = '\ta' + str(j++) + ': ' + str(d.entropy(monk))
   print d.entropy(monk)

#Assignement 2
# `attributes` collects the largest gain found per dataset.
attributes = [0, 0, 0]
print '\nInformation gain for attributes a1 to a6'
for i in range(0, len(monkset)):
   print 'Monk', i+1
   s = ""
   greatest = 0
   for x in range(0, 6):
       averageGain = d.averageGain(monkset[i], m.attributes[x])
       if averageGain > greatest: greatest = averageGain
       s = s + str(averageGain)+ ' '
   print s
   attributes[i] = greatest
Exemplo n.º 43
0
def printGains(datasets, attributes):
    """Print the averageGain of every attribute on every dataset."""
    separator = "---------------"
    for i in range(len(datasets)):
        for j in range(len(attributes)):
            message = "Gain monk%d, a%d: %s" % (
                i + 1, j + 1, d.averageGain(datasets[i], attributes[j]))
            print(message)
        print(separator)
Exemplo n.º 44
0
#Entropy

#calling the predefined function that calculates the entropy for all the three datasets
#assignment 1
# Assignment 1 (Python 2 prints): entropy of the three training sets.
print dt.entropy(m.monk1)
print dt.entropy(m.monk2)
print dt.entropy(m.monk3)
print '\n'

##############################################################################
#Information Gain

#cycles for calling average gains for all the three datasets and for every attribute
#assignment 2
for atr in m.attributes:
    gain = dt.averageGain(m.monk1, atr)
    print gain

print '\n'    
for atr in m.attributes:
    print dt.averageGain(m.monk2, atr)

print '\n'    
for atr in m.attributes:
    print dt.averageGain(m.monk3, atr)
print '\n' 

 
#############################################################################
#Building decision trees
Exemplo n.º 45
0
def calcgainforset(dataset, name):
    """Print the information gain of each attribute on ``dataset``."""
    for attribute in m.attributes:
        gain = d.averageGain(dataset, attribute)
        print("Information Gain %s %s %f" % (name, attribute.name, gain))
    print()
Exemplo n.º 46
0
        
#Assignment 1: calculate entropy

# Assignment 1: entropies of the three MONK training sets.
print("")
m1e = dt.entropy(m.monk1)
m2e = dt.entropy(m.monk2)
m3e = dt.entropy(m.monk3)

print("Monk1 entropy: ", m1e) 
print("Monk2 entropy: ", m2e)
print("Monk3 entropy: ", m3e) 
print("")

# Assignment 2: averageGain of a1-a6 per dataset (append vs.
# pre-sized index assignment — equivalent collection styles).
M1 = []
M2 = [None]*6
M3 = [None]*6

for i in range(6):
	M1.append(dt.averageGain(m.monk1, m.attributes[i]))
	M2[i] = dt.averageGain(m.monk2, m.attributes[i])
	M3[i] = dt.averageGain(m.monk3, m.attributes[i])

print("\ta1: \ta2: \ta3: \ta4: \ta5: \ta6:")
# print_average_gains is defined elsewhere in the file.
print_average_gains(M1, "M1")
print_average_gains(M2, "M2")
print_average_gains(M3, "M3")

#t = dt.buildTree(m.monk1, m.attributes)
#dw.drawtree(dt.check(t, m.monk1test)) 
#print(dt.check(t, m.monk1test))
Exemplo n.º 47
0
def calculateGainTuplesForAllAttributes(dataset, attributes):
    """Return (gain, index) pairs for every attribute in ``attributes``.

    Fixes: the original indexed the module-global ``m.attributes`` while
    only using the ``attributes`` parameter for its length — the
    parameter's own elements are now used — and it called the dunder
    ``__len__()`` directly instead of ``len()``/``enumerate``.  The
    returned list is iterable exactly like the old lazy zip object.
    """
    return [(d.averageGain(dataset, attribute), index)
            for index, attribute in enumerate(attributes)]
Exemplo n.º 48
0
import matplotlib.pyplot as plot

# The three MONK training sets and their initial entropies.
sets = [monkdata.monk1, monkdata.monk2, monkdata.monk3]

entropies = [dtree.entropy(s) for s in sets]

def printlines(values):
    """Print each row of ``values`` as a comma-separated line."""
    for row in values:
        print(", ".join(str(item) for item in row))

print("Initial entropies:")
print(entropies)
print("")


# Average gain of every attribute: one row per dataset.
gain = [[dtree.averageGain(s, attr) for attr in monkdata.attributes] for s in sets]

print("Expected gain:")
printlines(gain)
print("")

def tests(pair):
    """Build a tree on pair[0]; return [label, train score, test score].

    ``pair`` is (training set, test set, label).
    """
    built = dtree.buildTree(pair[0], monkdata.attributes)
    return [
            pair[2],
            dtree.check(built, pair[0]),
            dtree.check(built, pair[1]),
    ]


setpairs = [
Exemplo n.º 49
0
# Full trees for MONK-1..3: print d.check on train then test data,
# followed by root-level and a5-subset information gains.
t = d.buildTree(m.monk1, m.attributes)
print(d.check(t, m.monk1))
print(d.check(t, m.monk1test))

t = d.buildTree(m.monk2, m.attributes)
print(d.check(t, m.monk2))
print(d.check(t, m.monk2test))

t = d.buildTree(m.monk3, m.attributes)
print(d.check(t, m.monk3))
print(d.check(t, m.monk3test))

# Information gain at the root of MONK-1.
print("First Node IG")
for i in range(0, 6):
    print(d.averageGain(m.monk1, m.attributes[i]))

# One subset per value of a5 (attribute index 4, values 1-4).
a5_1 = d.select(m.monk1, m.attributes[4], 1)
a5_2 = d.select(m.monk1, m.attributes[4], 2)
a5_3 = d.select(m.monk1, m.attributes[4], 3)
a5_4 = d.select(m.monk1, m.attributes[4], 4)

print("subset a5_1 IG")
for i in range(6):
    print(d.averageGain(a5_1, m.attributes[i]))

print("subset a5_2 IG")
for i in range(6):
    print(d.averageGain(a5_2, m.attributes[i]))

print("subset a5_3 IG")
Exemplo n.º 50
0
    accuracy = d.check(final_tree, test_data)
    #print("Accuracy for Monk1.test", accuracy)
    return accuracy


# Entropy of each training set, then the averageGain of all six
# attributes per dataset (a/b/c collect one row each).
print(d.entropy(m.monk1))
print(d.entropy(m.monk2))
print(d.entropy(m.monk3))
#Printout the entropy of all datasets.

a = list()
b = list()
c = list()

for i in range(0, 6, 1):
    a.append(d.averageGain(m.monk1, m.attributes[i]))
for i in range(0, 6, 1):
    b.append(d.averageGain(m.monk2, m.attributes[i]))
for i in range(0, 6, 1):
    c.append(d.averageGain(m.monk3, m.attributes[i]))

print(a)
print(b)
print(c)
#
#Calculate and printout the information get for all properties and datasets.
#

#r = d.select(m.monk1, m.attributes[1], 2)
#for x in r:
#    print(x.attribute, "Positive:", x.positive)
Exemplo n.º 51
0
def getClasification(dataset, fraction):
    """Prune-search: return (best score, best pruned tree).

    Fixes (same as the earlier duplicate of this function): ``bestTree``
    could be unbound when no pruned tree scored above 0 — it now falls
    back to the unpruned tree — and tree.check was called twice per
    candidate, now once.

    NOTE(review): buildTree runs on the *validation* partition and is
    scored on *training* data; the names suggest a swap — confirm.
    """
    monk1train, monk1val = partition(dataset, fraction)
    testTree = tree.buildTree(monk1val, m.attributes)
    pValue = 0
    bestTree = testTree  # fallback instead of a possible NameError
    for pruned in tree.allPruned(testTree):
        score = tree.check(pruned, monk1train)
        if score > pValue:
            bestTree = pruned
            pValue = score
    return pValue, bestTree

# Assignments 1-2 (Python 2 prints): entropies, then per-attribute gains.
print "Entropy Monk1: " + str(tree.entropy(m.monk1))
print "Entropy Monk2: " + str(tree.entropy(m.monk2))
print "Entropy Monk3: " + str(tree.entropy(m.monk3))

print "Gain Monk1 a1: " + str(tree.averageGain(m.monk1,m.attributes[0]))
print "Gain Monk1 a2: " + str(tree.averageGain(m.monk1,m.attributes[1]))
print "Gain Monk1 a3: " + str(tree.averageGain(m.monk1,m.attributes[2]))
print "Gain Monk1 a4: " + str(tree.averageGain(m.monk1,m.attributes[3]))
print "Gain Monk1 a5: " + str(tree.averageGain(m.monk1,m.attributes[4]))
print "Gain Monk1 a6: " + str(tree.averageGain(m.monk1,m.attributes[5]))

print "Gain Monk2 a1: " + str(tree.averageGain(m.monk2,m.attributes[0]))
print "Gain Monk2 a2: " + str(tree.averageGain(m.monk2,m.attributes[1]))
print "Gain Monk2 a3: " + str(tree.averageGain(m.monk2,m.attributes[2]))
print "Gain Monk2 a4: " + str(tree.averageGain(m.monk2,m.attributes[3]))
print "Gain Monk2 a5: " + str(tree.averageGain(m.monk2,m.attributes[4]))
print "Gain Monk2 a6: " + str(tree.averageGain(m.monk2,m.attributes[5]))

print "Gain Monk3 a1: " + str(tree.averageGain(m.monk3,m.attributes[0]))
print "Gain Monk3 a2: " + str(tree.averageGain(m.monk3,m.attributes[1]))
Exemplo n.º 52
0
################

#Setting up lists
info_gain_m1 = []
info_gain_m2 = []
info_gain_m3 = []
attribute = []

#starting counter
i = 0;
#iterating over all the test sets
# (data_sets is a module-level list of the three training sets;
# the stray semicolons are harmless no-ops.)
for sets in [info_gain_m1, info_gain_m2, info_gain_m3]:

    #for all attributes in the sets, the average information gain is added to the list
    for k in range(6):
        attribute.append(dtree.averageGain(data_sets[i], m.attributes[k]));
    sets.append(attribute)

    attribute = []
    i += 1;
    
#print(info_gain_m1)
#print(info_gain_m2)
#print(info_gain_m3)

# Assignment 3 #
################

# Subset of MONK-1 where a5 == 1, and a full tree for comparison.
selected = dtree.select(m.monk1, m.attributes[4], 1)

t=dtree.buildTree(m.monk1, m.attributes);
Exemplo n.º 53
0
print "Monk1 entropy: ", init_entropy_monk1 
print "Monk2 entropy: ", init_entropy_monk2
print "Monk3 entropy: ", init_entropy_monk3

print
print "------------------------------"

print "-------- Assignment 2 --------"
print 

gain_monk1  = []
gain_monk2  = []
gain_monk3  = []
for x in range(0, 6):
  gain_monk1.append(dt.averageGain(m.monk1,m.attributes[x]))
  gain_monk2.append(dt.averageGain(m.monk2,m.attributes[x]))
  gain_monk3.append(dt.averageGain(m.monk3,m.attributes[x]))

print "Dataset\tA1\t\tA2\t\tA3\t\tA4\t\tA5\t\tA6"
print "Monk1: ","\t".join(["%.7f"%y for y in gain_monk1])
print "Monk2: ","\t".join(["%.7f"%y for y in gain_monk2])
print "Monk3: ","\t".join(["%.7f"%y for y in gain_monk3])

print
print "------------------------------"

print "-------- Assignment 3 --------"
print 

partition1 = dt.select(m.monk1,m.attributes[4],1)
Exemplo n.º 54
0
# Best attribute to split monk1 on at the root, and the attributes still
# available one level down.
a = dtree.bestAttribute(mdata.monk1, mdata.attributes)
attributesLeft = [x for x in mdata.attributes if x != a]
#print(a,attributesLeft) #a5

# One subset of monk1 per possible value of the chosen attribute.
subsets = [dtree.select(mdata.monk1, a, v) for v in a.values]
#print(len(a.values))

# For every subset, the average information gain of each remaining
# attribute (second level of the tree).
subsets_ag = [
    [dtree.averageGain(subset, attr) for attr in attributesLeft]
    for subset in subsets
]
ag_in2level = []
#print(subsets_ag)


def Tree(dataset, attributes, maxdepth=3):
    # NOTE(review): this definition looks truncated by the scrape -- as
    # written, Tree only defines Branch and never calls it (so Tree
    # returns None), while Branch recurses back into Tree unconditionally
    # on mixed data with no base case on maxdepth.  Confirm against the
    # full original before using.
    def Branch(dataset, default, attributes):
        # ID3-style base cases: no examples left -> leaf holding the
        # caller-supplied default class; a class-pure subset -> leaf
        # holding that class.
        if not dataset:
            return dtree.TreeLeaf(default)
        if dtree.allPositive(dataset):
            return dtree.TreeLeaf(True)
        if dtree.allNegative(dataset):
            return dtree.TreeLeaf(False)
        # Mixed classes: recurse with a reduced depth budget.
        return Tree(dataset, attributes, maxdepth - 1)
Exemplo n.º 55
0
def avgForDataset(dataset):
	"""Return the average information gain of every attribute on
	*dataset* as one string: five decimals per gain, each followed by a
	single space (including a trailing one, as in the original)."""
	return "".join("{:.5f} ".format(d.averageGain(dataset, attr))
	               for attr in m.attributes)
Exemplo n.º 56
0
def printAverageGain(s, dataset):
    """Print *s* followed by the average information gain of each of the
    six attributes on *dataset* (space-separated, trailing space kept),
    then the "Average gain" trailer line."""
    gains = [str(dt.averageGain(dataset, m.attributes[i])) for i in range(6)]
    print(s + " ".join(gains) + " ")
    print("Average gain\n")
Exemplo n.º 57
0
import monkdata as m
from dtree import entropy
from dtree import averageGain
# Entropy of each MONK training set.
for label, dataset in (("monk1", m.monk1), ("monk2", m.monk2), ("monk3", m.monk3)):
    print(entropy(dataset), label)

# Average information gain of every attribute on every training set.
for i in range(6):
    print("\nattribute ", i)
    for dataset in (m.monk1, m.monk2, m.monk3):
        print(averageGain(dataset, m.attributes[i]))
    
    
    
    
    
    
    
Exemplo n.º 58
0
import monkdata as m
import dtree as dtree

datasets = [{
    'name': 'monk1',
    'ref': m.monk1
}, {
    'name': 'monk2',
    'ref': m.monk2
}, {
    'name': 'monk3',
    'ref': m.monk3
}]

print "Sample class: " + m.Sample.__doc__

# TODO:
# print dtree.averageGain(m.monk1, m.attributes)

for dataset in datasets:
    print "    Dataset: " + dataset['name']
    i = 0
    for attribute in m.attributes:
        print "a%s - %s" % (i, dtree.averageGain(dataset['ref'], attribute))
        i = i + 1
Exemplo n.º 59
0
print "monk3  "+ str(d.entropy(m.monk3))
print ""
print ""


#finding information average gain

print " Assignment2"
att =[x for x in m.attributes]
monkar = [m.monk1, m.monk2, m.monk3]


for j in monkar:
    entropyGain=[]
    for i in att :
      entropyGain.append(d.averageGain(j,i))
    for i in entropyGain:
      print i 
    print "The attribute used for splitting in data set MONK%d is A%d which has an entropy of %f"%(monkar.index(j)+1,entropyGain.index(max(entropyGain))+1, max(entropyGain))
print ""
print ""


#Building decision Trees

print "Assignment 3"
k=0
print "        "+ "e-train      " +"e-test"
onkar=[m.monk1test, m.monk2test, m.monk3test]
for j, i in zip(monkar, onkar):
       k=k+1