Example #1
def calc_entropy():
  ent1 = dt.entropy(data.monk1)
  ent2 = dt.entropy(data.monk2)
  ent3 = dt.entropy(data.monk3)
  print "\n------------------------------\nAssignment 1 - Entropy\n------------------------------"
  print "Dataset\tEntropy"
  print "Monk1\t%.6f\nMonk2\t%.6f\nMonk3\t%.6f" % (ent1, ent2, ent3)
Example #2
def calc_entropy():
    entropy1 = entropy(monk1)
    entropy2 = entropy(monk2)
    entropy3 = entropy(monk3)
    print('entropy of monk1', entropy1)
    print('entropy of monk2', entropy2)
    print('entropy of monk3', entropy3)
Example #3
def assignment1():
    monk1 = ['MONK1']
    monk1.append(round(tree.entropy(m.monk1),5))
    monk2 = ['MONK2']
    monk2.append(round(tree.entropy(m.monk2),5))   
    monk3 = ['MONK3']
    monk3.append(round(tree.entropy(m.monk3),5))
    print(tabulate([monk1, monk2,monk3], headers=['Dataset', 'Entropy'], tablefmt='orgtbl'),'\n')
Example #4
def ASSIGNMENT1():
    e1 = dtree.entropy(m.monk1)
    e2 = dtree.entropy(m.monk2)
    e3 = dtree.entropy(m.monk3)

    print("Entropy of MONK-1 Training Set:", e1)
    print("Entropy of MONK-2 Training Set:", e2)
    print("Entropy of MONK-3 Training Set:", e3)
Example #5
def assignment1():
    entropy1 = d.entropy(m.monk1)
    entropy2 = d.entropy(m.monk2)
    entropy3 = d.entropy(m.monk3)

    print("Entropy of first dataset is %.6f" % entropy1)
    print("Entropy of second dataset is %.6f" % entropy2)
    print("Entropy of third dataset is %.6f" % entropy3)
Example #6
def calculateEntropy():
    entropy = [
        d.entropy(m.monk1test),
        d.entropy(m.monk2test),
        d.entropy(m.monk3test)
    ]
    print(entropy)
    return entropy
Example #7
def entropyCalculation():
    print()
    print("Entropy Results", "\n")

    monk1Entropy = d.entropy(m.monk1)
    print("MONK1: ", monk1Entropy, "\n")

    monk2Entropy = d.entropy(m.monk2)
    print("MONK2: ", monk2Entropy, "\n")

    monk3Entropy = d.entropy(m.monk3)
    print("MONK3: ", monk3Entropy, "\n")
    print()
Example #8
def monkEntropyAndInfoGain(monks):
    i = 0
    while (i < len(monks)):
        print("Entropy of dataset for Monk-" + str(i+1) + " = "
            + str(dtree.entropy(monks[i])))
        j = 0
        while (j < len(m.attributes)):
            print("For Monk-" + str(i+1) +". Information gain on attribute "
                + str(j+1) + " = " + str(dtree.entropy(monks[i])))
            j+=1
        i+=1
        print("\n")
    return
Example #9
def assignment1():
    print("Assignment 1")
    print("MONK-1: %f\nMONK-2: %f\nMONK-3: %f" %
          tuple([d.entropy(x) for x in (m.monk1, m.monk2, m.monk3)]))
    print(
        "Note: The impurity of MONK-1 is 1. This can only happen when there is an equal amount of true and false samples in the data."
    )
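The note above can be checked directly: binary entropy -p*log2(p) - (1-p)*log2(1-p) reaches its maximum of 1 exactly at p = 0.5. A minimal standalone check (plain Python 3, independent of the lab's dtree module; binary_entropy is a hypothetical helper, not part of the lab code):

import math

def binary_entropy(p):
    # Shannon entropy, in bits, of a two-class split with positive fraction p.
    if p in (0.0, 1.0):
        return 0.0
    return -p * math.log2(p) - (1 - p) * math.log2(1 - p)

print(binary_entropy(0.5))  # 1.0  -> the 50/50 case, as in MONK-1
print(binary_entropy(0.9))  # ~0.47 -> a skewed split is less impure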
Example #10
def entropy_matrix(datasets, attribute_index, max_att_list):
    entropy_matrix = np.zeros(
        (len(datasets), len(m.attributes[attribute_index].values)))
    for idx, dataset in enumerate(datasets):
        att = m.attributes[max_att_list[idx]]
        for j, v in enumerate(att.values):
            entropy_matrix[idx, j] = d.entropy(d.select(dataset, att, v))
    print(entropy_matrix)
Example #11
def assignment1():
	print "--- Assignment 1 ---"
	print "Initial entropy of the datasets"
	table = Texttable(max_width=100)
	table.add_row(["Dataset","Entropy"])
	for i in range(3):
		table.add_row(["Monk-" + str(i+1), d.entropy(monkdata[i])])
	print table.draw()
	print
Example #12
def assignment4():
    datasets = [
        (m.monk1, 'monk1', m.attributes[0]),
        (m.monk1, 'monk1', m.attributes[1]),
        (m.monk1, 'monk1', m.attributes[2]),
        (m.monk1, 'monk1', m.attributes[3]),
        (m.monk1, 'monk1 max', m.attributes[4]),
    ]

    for data, name, attribute in datasets:
        summ = 0
        for value in attribute.values:
            subset = dtree.select(data, attribute, value)

            print(f'Entropy of S{value} for {name}:\t{dtree.entropy(subset)}')

            summ += len(subset) / len(data) * dtree.entropy(subset)

        print(dtree.entropy(data) - summ)
        print()
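Example #12 computes the information gain by hand: entropy(S) minus the weighted entropies of the subsets S_v. That is the same quantity dtree.averageGain returns, so the two can be cross-checked; a sketch, assuming the lab's monkdata (m) and dtree modules are importable:

import monkdata as m
import dtree

attribute = m.attributes[4]  # a5, the attribute labeled 'monk1 max' above
manual = dtree.entropy(m.monk1) - sum(
    len(dtree.select(m.monk1, attribute, v)) / len(m.monk1)
    * dtree.entropy(dtree.select(m.monk1, attribute, v))
    for v in attribute.values)
print(manual, dtree.averageGain(m.monk1, attribute))  # the two values should agree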
Example #13
def main(argv): 
    
    print "Entropy Monk1: " + str(tree.entropy(m.monk1))
    print "Entropy Monk2: " + str(tree.entropy(m.monk2))
    print "Entropy Monk3: " + str(tree.entropy(m.monk3))
    
    print "Average Gain Monk1(a1): " + str(tree.averageGain(m.monk1, m.attributes[0])) 
    print "Average Gain Monk1(a2): " + str(tree.averageGain(m.monk1, m.attributes[1]))
    print "Average Gain Monk1(a3): " + str(tree.averageGain(m.monk1, m.attributes[2]))
    print "Average Gain Monk1(a4): " + str(tree.averageGain(m.monk1, m.attributes[3]))
    print "Average Gain Monk1(a5): " + str(tree.averageGain(m.monk1, m.attributes[4]))
    print "Average Gain Monk1(a6): " + str(tree.averageGain(m.monk1, m.attributes[5]))
    
    print "Average Gain Monk2(a1): " + str(tree.averageGain(m.monk2, m.attributes[0])) 
    print "Average Gain Monk2(a2): " + str(tree.averageGain(m.monk2, m.attributes[1]))
    print "Average Gain Monk2(a3): " + str(tree.averageGain(m.monk2, m.attributes[2]))
    print "Average Gain Monk2(a4): " + str(tree.averageGain(m.monk2, m.attributes[3]))
    print "Average Gain Monk2(a5): " + str(tree.averageGain(m.monk2, m.attributes[4]))
    print "Average Gain Monk2(a6): " + str(tree.averageGain(m.monk2, m.attributes[5]))
    
    print "Average Gain Monk3(a1): " + str(tree.averageGain(m.monk3, m.attributes[0])) 
    print "Average Gain Monk3(a2): " + str(tree.averageGain(m.monk3, m.attributes[1]))
    print "Average Gain Monk3(a3): " + str(tree.averageGain(m.monk3, m.attributes[2]))
    print "Average Gain Monk3(a4): " + str(tree.averageGain(m.monk3, m.attributes[3]))
    print "Average Gain Monk3(a5): " + str(tree.averageGain(m.monk3, m.attributes[4]))
    print "Average Gain Monk3(a6): " + str(tree.averageGain(m.monk3, m.attributes[5]))
    
    #print "Average Gain Level 2 Monk1(a1): " + str(tree.averageGain(tree.select(m.monk1, m.attributes[0], value), m.attributes[0])) 
    #draw.drawTree(tree.buildTree(m.monk1, m.attributes, 2))

    t = tree.buildTree(m.monk1, m.attributes)
    print(tree.check(t, m.monk1test))
    print(tree.check(t, m.monk1))

    t2 = tree.buildTree(m.monk2, m.attributes)
    print(tree.check(t2, m.monk2test))
    print(tree.check(t2, m.monk2))

    t3 = tree.buildTree(m.monk3, m.attributes)
    print(tree.check(t3, m.monk3test))
    print(tree.check(t3, m.monk3))
Example #14
def compute_entropy():
  print ("Compute entropy of training datasets:")

  ent_table = PrettyTable(['Dataset', 'Entropy'])

  for i in range(3):
    l = ["MONK-{0}".format(i+1)]
    l.append(round(dt.entropy(monks[i]), 10))
    ent_table.add_row(l)

  print(ent_table)
  print ()
Example #15
def assignment1():
    print("monk1 entropy: ", d.entropy(m.monk1))
    print("monk1Test entropy: ", d.entropy(m.monk1test))
    print("monk2 entropy: ", d.entropy(m.monk2))
    print("monk2test entropy: ", d.entropy(m.monk2test))
    print("monk3 entropy: ", d.entropy(m.monk3))
    print("monk3test entropy: ", d.entropy(m.monk3test))
Example #16
File: lab1.py, Project: mkufel/ML
def buildTreeCustom(dataset, depth):
    if (depth > 0):
        bestAttr = dt.bestAttribute(dataset, m.attributes)
        print(str(bestAttr), end='')

        # Select datasets splits for each value of the bestAttr
        splits = []
        for value in bestAttr.values:
            splits.append(dt.select(dataset, bestAttr, value))

        for split in splits:
            # If entropy of the split > 0, the split is impure and we can further split it. Recursive call with reduced depth
            if (dt.entropy(split) > 0):
                buildTreeCustom(split, depth - 1)
            else:
                print('+' if dt.mostCommon(split) else '-', end='')
    else:
        print('+' if dt.mostCommon(dataset) else '-', end='')
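buildTreeCustom prints the tree in preorder (attribute names for internal nodes, '+'/'-' for leaves) rather than returning a data structure. A hypothetical invocation, assuming monkdata (m) and dtree (dt) are imported as the snippet expects:

buildTreeCustom(m.monk1, 2)  # print a depth-2 tree for the MONK-1 training set
print()  # end the single output line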
Example #17
def ass1():
    for dset in [mdata.monk1, mdata.monk2, mdata.monk3]:
        print("-------- New dataset --------")
        print("Entropy: " + str(dtree.entropy(dset)))
Example #18
import monkdata as m
import dtree as dt
import drawtree as dw

def print_average_gains(av_gain, av_gain_name = "", print_range = 6):
        print(av_gain_name, end = "\t")
        for i in range(print_range):
                print(round(av_gain[i], 4), end = "\t")
        print("")
        
#Assignment 1: calculate entropy

print("")
m1e = dt.entropy(m.monk1)
m2e = dt.entropy(m.monk2)
m3e = dt.entropy(m.monk3)

print("Monk1 entropy: ", m1e) 
print("Monk2 entropy: ", m2e)
print("Monk3 entropy: ", m3e) 
print("")

M1 = []
M2 = [None]*6
M3 = [None]*6

for i in range(6):
	M1.append(dt.averageGain(m.monk1, m.attributes[i]))
	M2[i] = dt.averageGain(m.monk2, m.attributes[i])
	M3[i] = dt.averageGain(m.monk3, m.attributes[i])
Example #19
# Compute the entropy of monk datasets.
import monkdata as m
import dtree as d
monk1 = d.entropy(m.monk1)
print "MONK-1 entropy:", monk1
monk2 = d.entropy(m.monk2)
print "MONK-2 entropy:", monk2
monk3 = d.entropy(m.monk3)
print "MONK-3 entropy:", monk3
Example #20
import monkdata as m
from dtree import entropy
from dtree import averageGain
print(entropy(m.monk1), 'monk1')
print(entropy(m.monk2), 'monk2')
print(entropy(m.monk3), 'monk3')

for i in range(6):
    print("\nattribute ", i)
    print(averageGain(m.monk1, m.attributes[i]))
    print(averageGain(m.monk2, m.attributes[i]))
    print(averageGain(m.monk3, m.attributes[i]))
Example #21
import monkdata as m
import dtree as dt
import math as math
import random as r
#import drawtree as draw


#Assignment 1
init_entropy_monk1 = dt.entropy(m.monk1)
init_entropy_monk2 = dt.entropy(m.monk2)
init_entropy_monk3 = dt.entropy(m.monk3)

#Printing results
print "-------- Assignment 1 --------"
print 

print "Monk1 entropy: ", init_entropy_monk1 
print "Monk2 entropy: ", init_entropy_monk2
print "Monk3 entropy: ", init_entropy_monk3

print
print "------------------------------"

print "-------- Assignment 2 --------"
print 

gain_monk1  = []
gain_monk2  = []
gain_monk3  = []
for x in range(0, 6):
  gain_monk1.append(dt.averageGain(m.monk1,m.attributes[x]))
Example #22
import sys
import random
import matplotlib.pyplot as plt

# Importing lab specific packages.
sys.path.append('dectrees-py/')
import monkdata as monk
import dtree as dt

# Needed import for drawing the decision tree.
#import drawtree as drawtree

# Datasets
train = [monk.monk1, monk.monk2, monk.monk3]
test = [monk.monk1test, monk.monk2test, monk.monk3test]

print("Entropy for monk1 dataset is {}".format(dt.entropy(monk.monk1)))
print("Entropy for monk2 dataset is {}".format(dt.entropy(monk.monk2)))
print("Entropy for monk3 dataset is {}".format(dt.entropy(monk.monk3)))

for i, dataset in enumerate(train):
    print("")
    print("Average gain for monk{} for each attribute".format(i + 1))
    for j, attribute in enumerate(monk.attributes):
        print("a{} = {}".format(j + 1, dt.averageGain(dataset, attribute)))

monk1a5 = [dt.select(monk.monk1, monk.attributes[4], v) for v in monk.attributes[4].values]

for i, monk1 in enumerate(monk1a5):
    print("")
    print("Average gain for monk1 where a5 = {} for each attribute".format(i + 1))
    for j, attribute in enumerate(monk.attributes):
Example #23
__author__ = 'jonas'

import monkdata as m
import dtree

if __name__ == "__main__":

    data = {
        'monk1': {
            'name': 'MONK-1',
            'data': m.monk1,
            'entropy': 'NA'
        },
        'monk2': {
            'name': "MONK-2",
            'data': m.monk2,
            'entropy': 'NA'
        },
        'monk3': {
            'name': 'MONK-3',
            'data': m.monk3,
            'entropy': 'NA'
        }
    }
    for set in data:
        data[set]['entropy'] = dtree.entropy(data[set]['data'])
        print(data[set]['name'] + " entropy: ", data[set]['entropy'])
Example #24
A4 = m.attributes[3]
A5 = m.attributes[4]
A6 = m.attributes[5]

## DATASET
monk1 = m.monk1
monk2 = m.monk2
monk3 = m.monk3
monktest1 = m.monk1test
monktest2 = m.monk2test
monktest3 = m.monk3test

print("#---------------- Assignment 1 and 2 ----------------#")
print(" ")
# Entropy Calculation from Monk dataset on training variables
print("Entropy Monk1 dataset: ", d.entropy(monk1))
print("Entropy Monk2 dataser: ", d.entropy(monk2))
print("Entropy Monk3 dataset: ", d.entropy(monk3))
print(" ")

print("#---------------- Assignment 3 and 4 ----------------#")
print(" ")
print("Information gain for the MONK1 dataset")
for i in range(0, 6):
    print(" Info Gain ", m.attributes[i], ":",
          d.averageGain(monk1, m.attributes[i]))
print(" ")
print("Information gain for the MONK2 dataset")
for i in range(0, 6):
    print(" Info Gain ", m.attributes[i], ":",
          d.averageGain(monk2, m.attributes[i]))
Example #25
import monkdata as m
import dtree as d 
import drawtree as l
import random
from matplotlib import pyplot
from numpy import arange

#finding entropy

print "Assignment 1" 
print "dataset  "+"entropy"   
 
print "monk1  "+ str(d.entropy(m.monk1))
print "monk2  "+ str(d.entropy(m.monk2))
print "monk3  "+ str(d.entropy(m.monk3))
print ""
print ""


#finding information average gain

print " Assignment2"
att =[x for x in m.attributes]
monkar = [m.monk1, m.monk2, m.monk3]


for j in monkar:
    entropyGain=[]
    for i in att :
      entropyGain.append(d.averageGain(j,i))
    for i in entropyGain:
Example #26
#!/usr/bin/env python

import dtree as d
import monkdata as m

monkset = [m.monk1, m.monk2, m.monk3]
mtrain = [m.monk1test, m.monk2test, m.monk3test]

#Assignment 1
print 'Entropy for monk1-3'
j = 1
for monk in monkset:
   #s = '\ta' + str(j++) + ': ' + str(d.entropy(monk))
   print d.entropy(monk)

#Assignment 2
attributes = [0, 0, 0]
print '\nInformation gain for attributes a1 to a6'
for i in range(0, len(monkset)):
   print 'Monk', i+1
   s = ""
   greatest = 0
   for x in range(0, 6):
       averageGain = d.averageGain(monkset[i], m.attributes[x])
       if averageGain > greatest: greatest = averageGain
       s = s + str(averageGain)+ ' '
   print s
   attributes[i] = greatest
Example #27
__author__ = 'swebo_000'

import monkdata as m
import dtree as d
#import drawtree

monkset = [m.monk1, m.monk2, m.monk3]

print("1. Entropy of the MONK datasets:")
for x in range(0, len(monkset)):
    print("MONK-%d: %f" % (x+1, d.entropy(monkset[x])))
print()

print("2. Information gain from attributes:")
for set in monkset:
    print("MONK-%d" % (monkset.index(set) + 1))
    for x in range(0, len(m.attributes)):
        print("Attribute %d: %f" %(x+1, d.averageGain(set, m.attributes[x])))
    print()
Example #28
import monkdata as m
import dtree as dt

print("Entropy\n")
print("Monk1: " + str(dt.entropy(m.monk1)))
print("Monk2: " + str(dt.entropy(m.monk2)))
print("Monk3: " + str(dt.entropy(m.monk3)))
Example #29
def printEntropy(dataset, nr):    
    # Print the entropy for the dataset
    print("Entropy(monk"+str(nr+1)+"): "+str(d.entropy(dataset)))
Example #30
    for temp in prun_set:
        s_dict[temp] = (d.check(temp, data_set.Test))
    return key_with_maxval(s_dict)


def test_pruning_algo(train_data, test_data, ratio):
    monk_set = SplitDataSet()
    # Randomness enters here: partition() shuffles the data before splitting.
    monk_set.Train, monk_set.Test = partition(train_data, ratio)
    final_tree = check_pruning(monk_set)
    accuracy = d.check(final_tree, test_data)
    #print("Accuracy for Monk1.test", accuracy)
    return accuracy


# Print the entropy of all three datasets.
print(d.entropy(m.monk1))
print(d.entropy(m.monk2))
print(d.entropy(m.monk3))

a = list()
b = list()
c = list()

for i in range(0, 6, 1):
    a.append(d.averageGain(m.monk1, m.attributes[i]))
for i in range(0, 6, 1):
    b.append(d.averageGain(m.monk2, m.attributes[i]))
for i in range(0, 6, 1):
    c.append(d.averageGain(m.monk3, m.attributes[i]))
Example #31
import matplotlib.pyplot as plt
import monkdata as m
import dtree as d
from prettytable import PrettyTable

#Assignment 0

#Monk 1: a1 and a2 are related, which makes it hard to split well on either attribute alone
#Monk 2: the true concept ties the value of one attribute to the values of the others, so a single attribute gives little to split on
#Monk 3: contains noise and has the smallest training set. All datasets have small training sets compared to their test sets.

#Assignment 1:

monk = [m.monk1, m.monk2, m.monk3]

entropy_table = PrettyTable(["Dataset", "Entropy"])

for i in range(len(monk)):
    row = ["MONK-{0}".format(i + 1), round(d.entropy(monk[i]), 10)]
    entropy_table.add_row(row)

print(entropy_table)

#Assignment 2:

#Assignment 3:

#info_gain_table = PrettyTable(["Dataset", "A1", "A2", "A3", "A4", "A5", "A6"])
header = ["Dataset"]
for attr in m.attributes:
    header.append(attr)
info_gain_table = PrettyTable(header)

#for i in range(3):
Example #32
import monkdata as m 
import dtree as dtree

print "Sample class: " + m.Sample.__doc__

# for sample in m.monk1:
#     print sample.positive, sample.identity, " |||| ", sample.attribute
#print "\n\n\n"

print "MONK 1       ", dtree.entropy(m.monk1)
print "MONK 2       ", dtree.entropy(m.monk2)
print "MONK 3       ", dtree.entropy(m.monk3)
print "MONK 1 TEST  ", dtree.entropy(m.monk1test)
print "MONK 2 TEST  ", dtree.entropy(m.monk2test)
print "MONK 3 TEST  ", dtree.entropy(m.monk3test)
Example #33
def main():
    # Assignment 1
    print("Assignment 1")
    monks = [monkdata.monk1, monkdata.monk2, monkdata.monk3]
    monk_tests = [monkdata.monk1test, monkdata.monk2test, monkdata.monk3test]
    entropies = [dtree.entropy(monk) for monk in monks]
    print("*** Monk1 entropy: ", entropies[0])
    print("*** Monk2 entropy: ", entropies[1])
    print("*** Monk3 entropy: ", entropies[2])

    # Assignment 3
    print(" ")
    print("Assignment 3")
    attributes = monkdata.attributes
    info_gain1 = info_gain(monks[0], attributes)
    info_gain2 = info_gain(monks[1], attributes)
    info_gain3 = info_gain(monks[2], attributes)
    print("*** Monk1 information gain for attribute:",
          ['%.5f' % x for x in info_gain1])
    print("*** Monk2 information gain for attribute:",
          ['%.5f' % x for x in info_gain2])
    print("*** Monk3 information gain for attribute:",
          ['%.5f' % x for x in info_gain3])

    # Assignment 5
    print("")
    print("Assignment 5")
    print("*** Attribute:",
          np.argmax(info_gain1) + 1, "maximizes info gain for MONK1 dataset")
    print("*** Attribute:",
          np.argmax(info_gain2) + 1, "maximizes info gain for MONK2 dataset")
    print("*** Attribute:",
          np.argmax(info_gain3) + 1, "maximizes info gain for MONK3 dataset")
    print("***")
    max0 = np.argmax(info_gain1)  # attribute of first split
    attributes_left = [
        attrib for attrib in attributes if attrib != attributes[max0]
    ]
    print("*** 1) Attributes the next nodes should be tested on: ",
          attributes_left)

    # Attributes to split on in second step
    splits = [
        np.argmax(
            info_gain(dtree.select(monks[0], attributes[max0], value),
                      attributes)) + 1 for value in attributes[max0].values
    ]
    print("*** 2) Second split is on the attriburtes: ", splits)

    # Decision after second split
    subsets = [
        dtree.select(monks[0], attributes[max0], split) for split in splits
    ]
    print("*** 3) Assignement after second split: ",
          [dtree.mostCommon(subset) for subset in subsets])
    print("***")

    print("*** Train and test set errors")
    t1 = dtree.buildTree(monkdata.monk1, monkdata.attributes)
    print("*** Monk1:", "Etrain=", 1 - dtree.check(t1, monkdata.monk1),
          " Etest=", 1 - dtree.check(t1, monkdata.monk1test))
    t2 = dtree.buildTree(monkdata.monk2, monkdata.attributes)
    print("*** Monk2:", "Etrain=", 1 - dtree.check(t2, monkdata.monk2),
          " Etest=", 1 - dtree.check(t2, monkdata.monk2test))
    t3 = dtree.buildTree(monkdata.monk3, monkdata.attributes)
    print("*** Monk3:", "Etrain=", 1 - dtree.check(t3, monkdata.monk3),
          " Etest=", 1 - dtree.check(t3, monkdata.monk3test))

    import drawtree_qt5
    #print(t1) # tree in text form (weird)
    #drawtree_qt5.drawTree(t1) # uncomment to visualize the decision tree

    # Assignment 7
    print("")
    print("Assignment 7")

    # The pruning, shown for the monk1 example
    monk1train, monk1val = partition(monkdata.monk1, 0.9)
    t1 = dtree.buildTree(monk1train,
                         monkdata.attributes)  # tree trained from monk1train
    t11 = prune(t1, monk1val)  # prunned tree
    print("*** Monk1:", "Etrain=", 1 - dtree.check(t1, monk1val), " Etest=",
          1 - dtree.check(t1, monkdata.monk1test))
    print("*** Monk1:", "Etrain=", 1 - dtree.check(t11, monk1val), " Etest=",
          1 - dtree.check(t11, monkdata.monk1test))

    # Statistic information for different fraction for monk1 and monk3
    fraction = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

    # Evaluation of Monk1
    eval1 = [
        evaluate_fraction(monkdata.monk1, frac, monkdata.monk1test)
        for frac in fraction
    ]
    means1 = [np.mean(x) for x in eval1]
    vars1 = [np.var(x) for x in eval1]

    plt.figure(1)
    plt.subplot(121)
    plt.plot(fraction, means1, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Mean of error for different " + r'$\lambda$s')
    plt.subplot(122)
    plt.plot(fraction, vars1, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Variance of error for different " + r'$\lambda$s')
    plt.suptitle('Monk1')

    # Evaluation of Monk3
    eval3 = [
        evaluate_fraction(monkdata.monk3, frac, monkdata.monk3test)
        for frac in fraction
    ]
    means3 = [np.mean(x) for x in eval3]
    vars3 = [np.var(x) for x in eval3]

    plt.figure(2)
    plt.subplot(121)
    plt.plot(fraction, means3, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Mean of error for different " + r'$\lambda$s')
    plt.subplot(122)
    plt.plot(fraction, vars3, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.title("Variance of error for different " + r'$\lambda$s')
    plt.suptitle('Monk3')
    plt.show()
Example #34
import monkdata as m
import math
import dtree


# Assignment 1 #
################

#Setting up lists
monk_entropy = []
data_sets = [m.monk1, m.monk2, m.monk3]

#calculating entropy of the monk sets
for set in data_sets:
    monk_entropy.append(dtree.entropy(set))

#print(monk_entropy)

# Assignment 2 #
################

#Setting up lists
info_gain_m1 = []
info_gain_m2 = []
info_gain_m3 = []
attribute = []

#starting counter
i = 0
#iterating over all the test sets
for sets in [info_gain_m1, info_gain_m2, info_gain_m3]:
Example #35
    r.shuffle(ldata)
    breakPoint = int(len(ldata) * fraction)
    return ldata[:breakPoint], ldata[breakPoint:]

def getClasification(dataset, fraction):
    monk1train, monk1val = partition(dataset, fraction)
    # Build on the training part; score pruning candidates on the validation part.
    testTree = tree.buildTree(monk1train, m.attributes)
    prunedTrees = tree.allPruned(testTree)
    pValue = 0
    bestTree = testTree
    for pruned in prunedTrees:
        if tree.check(pruned, monk1val) > pValue:
            bestTree = pruned
            pValue = tree.check(pruned, monk1val)
    return pValue, bestTree

print "Entropy Monk1: " + str(tree.entropy(m.monk1))
print "Entropy Monk2: " + str(tree.entropy(m.monk2))
print "Entropy Monk3: " + str(tree.entropy(m.monk3))

print "Gain Monk1 a1: " + str(tree.averageGain(m.monk1,m.attributes[0]))
print "Gain Monk1 a2: " + str(tree.averageGain(m.monk1,m.attributes[1]))
print "Gain Monk1 a3: " + str(tree.averageGain(m.monk1,m.attributes[2]))
print "Gain Monk1 a4: " + str(tree.averageGain(m.monk1,m.attributes[3]))
print "Gain Monk1 a5: " + str(tree.averageGain(m.monk1,m.attributes[4]))
print "Gain Monk1 a6: " + str(tree.averageGain(m.monk1,m.attributes[5]))

print "Gain Monk2 a1: " + str(tree.averageGain(m.monk2,m.attributes[0]))
print "Gain Monk2 a2: " + str(tree.averageGain(m.monk2,m.attributes[1]))
print "Gain Monk2 a3: " + str(tree.averageGain(m.monk2,m.attributes[2]))
print "Gain Monk2 a4: " + str(tree.averageGain(m.monk2,m.attributes[3]))
print "Gain Monk2 a5: " + str(tree.averageGain(m.monk2,m.attributes[4]))
Example #36
def A1():
  print "Entropy for Monk1 is : ", dT.entropy( m.monk1 )
  print "Entropy for Monk2 is : ", dT.entropy( m.monk2 )
  print "Entropy for Monk3 is : ", dT.entropy( m.monk3 )
Example #37
def main():
	print ("Entropy monk1")
	entropy1 = tree.entropy(data.monk1)
	print (entropy1)
	print ("\n")

	print ("Entropy monk2")
	entropy2 = tree.entropy(data.monk2)
	print (entropy2)
	print ("\n")

	print ("Entropy monk3")
	entropy3 = tree.entropy(data.monk3)
	print (entropy3)
	print ("\n")

	informationGain(data)

	#COMPUTING ENTROPY FOR SUBSET, WHY 0?!
	monk1Tree = tree.buildTree(data.monk1, data.attributes)
	#draw.drawTree(monk1Tree)
	#print(tree.bestAttribute(data.monk3, data.attributes))
	subSet = tree.select(data.monk1, data.attributes[4], 1)

	# newEntropy = tree.entropy(subSet)
	# print ("SubSet")
	# print (newEntropy)
	#END

	n = 0
	sumList = np.array([0.0] * 6)
	l1 = []
	l2 = []
	l3 = []
	l4 = []
	l5 = []
	l6 = []

	for x in range(100):
		errorList = np.array(pruneTree(data.monk1, data.monk1test))
		sumList += errorList
		l1.append(errorList[0])
		l2.append(errorList[1])
		l3.append(errorList[2])
		l4.append(errorList[3])
		l5.append(errorList[4])
		l6.append(errorList[5])

	finalList = sumList/100
	stdDevList = [np.std(l1),np.std(l2),np.std(l3),np.std(l4), np.std(l5),np.std(l6)]  

	print(finalList)
	print(stdDevList)

	line1, = plt.plot(finalList, label="Monk1 means", marker='o')
	# Create a legend for the first line.
	first_legend = plt.legend(handles=[line1], loc=1)

	x = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
	# create an index for each tick position
	xi = [i for i in range(0, len(x))]

	plt.xticks(xi, x)
	plt.ylabel('Mean Errors')
	plt.xlabel('Fractions')
	plt.show()
Example #38
def assignment1():
	print "   ", "Entropy"
	print "M1 ", d.entropy(m.monk1)
	print "M2 ", d.entropy(m.monk2)
	print "M3 ", d.entropy(m.monk3)
Example #39
import monkdata as m
import dtree as dt
import drawtree as draw
import matplotlib.pyplot as plt
import random, operator

#Entropy

#calling the predefined function that calculates the entropy for all three datasets
#assignment 1
print dt.entropy(m.monk1)
print dt.entropy(m.monk2)
print dt.entropy(m.monk3)
print '\n'

##############################################################################
#Information Gain

#loops computing the average gain for all three datasets and every attribute
#assignment 2
for atr in m.attributes:
    gain = dt.averageGain(m.monk1, atr)
    print gain
    
print '\n'    
for atr in m.attributes:
    print dt.averageGain(m.monk2, atr)

print '\n'    
for atr in m.attributes:
    print dt.averageGain(m.monk3, atr)
Example #40
def A1():
    print "Entropy for Monk1 is : ", dT.entropy(m.monk1)
    print "Entropy for Monk2 is : ", dT.entropy(m.monk2)
    print "Entropy for Monk3 is : ", dT.entropy(m.monk3)
Example #42
"""
DD2431 HT15
Lab 1
"""
import monkdata as m
import dtree as t
import drawtree as draw
import random

"3 ENTROPY"
"--Assignment 1"
print("--------------------------------------")
print("Assignment 1: Entropy of training sets")
monkEntropy = [round(t.entropy(m.monk1), 5), round(t.entropy(m.monk2), 5), round(t.entropy(m.monk3), 5)]
"--Answer to Assignment 1"
print(monkEntropy, "\n")

"4 INFORMATION GAIN"
"--Assignment 2"
monkTrainingSets = [m.monk1, m.monk2, m.monk3]
informationGain = []

print("Assignment 2: Expected information gains")
# save the gain values for each attribute, one row per dataset
for monk in monkTrainingSets:  # for each data set
    att = []  # start a fresh row; reusing one shared list would repeat values across rows
    for attribute in m.attributes:  # for every attribute
        # calculate the gain of splitting by the attribute
        att.append(round(t.averageGain(monk, attribute), 5))

    informationGain.append(att)  # save a "row vector"
Example #43
#!/usr/bin/env python
import dtree
import monkdata
import random
import matplotlib.pyplot as plot

sets = [monkdata.monk1, monkdata.monk2, monkdata.monk3]

entropies = [dtree.entropy(s) for s in sets]

def printlines(values):
    for line in values:
       print(', '.join(map(str, line)))

print("Initial entropies:")
print(entropies)
print("")


gain = [[dtree.averageGain(s, attr) for attr in monkdata.attributes] for s in sets]

print("Expected gain:")
printlines(gain)
print("")

def tests(pair):
    tree=dtree.buildTree(pair[0], monkdata.attributes)
    return [
            pair[2],
            dtree.check(tree,pair[0]),
            dtree.check(tree,pair[1])
Example #44
    "Calculate the entropy of a dataset"
    #nr of monk1 records
    n = len(dataset)
    # nr of monk1 records with postive = True
    nPos = len([x for x in dataset if x.positive])
    #nr of monk1 records with positive = False
    nNeg = n - nPos
    #if all records are negative or all are positive than entropy is 0 since one can immediately classify or predict unlabeled records.
    if nPos == 0 or nNeg == 0:
        return 0.0
    #Entropy calc
    return -float(nPos)/n * math.log(float(nPos)/n,2) + \
        -float(nNeg)/n * math.log(float(nNeg)/n,2)


print('Monk1 entropy:', dtree.entropy(m.monk1))
print('Monk2 entropy:', dtree.entropy(m.monk2))
print('Monk3 entropy:', dtree.entropy(m.monk3))

"Entropy Calculation"

"Information Gain calculation"


def averageGain(dataset, attribute):
    "Calculate the expected information gain when an attribute becomes known"
    weighted = 0.0
    # ex monk1: attribute A1 takes the values {1, 2, 3}, so v = 1, 2 or 3
    for v in attribute.values:
        # select all samples whose attribute equals v
        subset = dtree.select(dataset, attribute, v)
        weighted += float(len(subset)) / len(dataset) * entropy(subset)
    return entropy(dataset) - weighted
Example #45
def compute_entropy(datasets_names, datasets):
    for dataset_name, dataset in zip(datasets_names, datasets):
        print(dataset_name, ':', round(d.entropy(dataset), 3))
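A hypothetical call, assuming the monkdata (m) and dtree (d) imports used throughout these examples:

compute_entropy(['MONK-1', 'MONK-2', 'MONK-3'], [m.monk1, m.monk2, m.monk3])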
Example #46
def calcentropy():
    print("Entropy Monk 1: %f" % d.entropy(m.monk1))
    print("Entropy Monk 2: %f" % d.entropy(m.monk2))
    print("Entropy Monk 3: %f" % d.entropy(m.monk3))
Example #47
import monkdata
import dtree

e1 = dtree.entropy(monkdata.monk1)
e2 = dtree.entropy(monkdata.monk2)
e3 = dtree.entropy(monkdata.monk3)

print("Entropy Monk-1: {}".format(e1))
print("Entropy Monk-2: {}".format(e2))
print("Entropy Monk-3: {}".format(e3))
Example #48
    return ldata[:breakPoint], ldata[breakPoint:]

monk1train, monk1val = partition(m.monk1, 0.6)

def prune_tree(tree, validation):
    pruned_trees = d.allPruned(tree)
    pruned_trees_performance = [0 for x in range(len(pruned_trees))]
    for candidate in pruned_trees:
        index = pruned_trees.index(candidate)
        pruned_trees_performance[index] = d.check(candidate, validation)
    if d.check(tree, validation) <= max(pruned_trees_performance):
        tree = pruned_trees[pruned_trees_performance.index(max(pruned_trees_performance))]
        tree = prune_tree(tree, validation)
    return tree

print(d.entropy(m.monk1))
print(d.entropy(m.monk2))
print(d.entropy(m.monk3))
print("\n")

print("monk-1: %f %f %f %f %f %f" % (
    d.averageGain(m.monk1, m.attributes[0]), d.averageGain(m.monk1, m.attributes[1]),
    d.averageGain(m.monk1, m.attributes[2]), d.averageGain(m.monk1, m.attributes[3]),
    d.averageGain(m.monk1, m.attributes[4]), d.averageGain(m.monk1, m.attributes[5])
))

print("monk-2: %f %f %f %f %f %f" % (
    d.averageGain(m.monk2, m.attributes[0]), d.averageGain(m.monk2, m.attributes[1]),
    d.averageGain(m.monk2, m.attributes[2]), d.averageGain(m.monk2, m.attributes[3]),
    d.averageGain(m.monk2, m.attributes[4]), d.averageGain(m.monk2, m.attributes[5])
))
Example #49
def entropy(datasets):
    return [dtree.entropy(d) for d in datasets]
Example #50
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 26 12:58:18 2017

"""

import numpy as np
import monkdata as m
import dtree as d
import drawtree_qt5 as dqt
import matplotlib.pyplot as plt


## Assignment 1: calculate entropy of a dataset
en_m1 = d.entropy(m.monk1)
en_m2 = d.entropy(m.monk2)
en_m3 = d.entropy(m.monk3)

# output print
print '-------- Assignment 1 --------'
print 'entropy:'
print 'monk 1: ' + str(en_m1)
print 'monk 2: ' + str(en_m2)
print 'monk 3: ' + str(en_m3)
print ''


## Assignment 3: calculate information gain
Ga_m1 = np.empty([6,1], dtype = float)
Ga_m2 = np.empty([6,1], dtype = float)
Example #51
import monkdata as m
import dtree
import drawtree_qt5 as draw
import numpy as np
import matplotlib.pyplot as plt
import random

entropyMonk1 = dtree.entropy(m.monk1)
entropyMonk2 = dtree.entropy(m.monk2)
entropyMonk3 = dtree.entropy(m.monk3)

print(f'Entropy for monk1: {entropyMonk1}')
print(f'Entropy for monk2: {entropyMonk2}')
print(f'Entropy for monk3: {entropyMonk3}')

informationGainMonk1 = list(
    map(lambda x: dtree.averageGain(m.monk1, x), m.attributes))
informationGainMonk2 = list(
    map(lambda x: dtree.averageGain(m.monk2, x), m.attributes))
informationGainMonk3 = list(
    map(lambda x: dtree.averageGain(m.monk3, x), m.attributes))

print(f'Information gain for all 6 attributes for monk1: {informationGainMonk1}')
print(f'Information gain for all 6 attributes for monk2: {informationGainMonk2}')
print(f'Information gain for all 6 attributes for monk3: {informationGainMonk3}')
Example #52
def myBuildTree(dataset, levels):
    treeLevels = []
    splits = []
    treeLevels.append(dataset)
    datasubsets = dataset
    datasubsetsAvgGains = []
    for level in range(0, levels):
        print("\n===Level #: ", level)
        if level == 0:
            attribAvgGains = []
            largestGain = 0
            largestAttribIndex = 0
            if len(datasubsets) > 5:
                for attribute in range(0, len(m.attributes)):
                    avgGain = d.averageGain(datasubsets, m.attributes[attribute])
                    if avgGain > largestGain:
                        largestGain = avgGain
                        largestAttribIndex = attribute
                    attribAvgGains.append(avgGain)
                    print("Attribute: ", attribute, "\t\tAverage gain: ", avgGain)
                    datasubsetsAvgGains.append(attribAvgGains)
                print("---Splitting at attribute: ", m.attributes[largestAttribIndex])
                datasubsets = split(datasubsets, m.attributes[largestAttribIndex])
                splits.append(m.attributes[largestAttribIndex])
                treeLevels.append(datasubsets)

        elif level > 0:
            print("---No. of datasets: ", len(datasubsets))
            newdatasubsets = []
            for i in range(0, len(datasubsets)):
                print("\n---Datasubset: ", i, "\t\tEntropy: ", d.entropy(datasubsets[i]))
                attribAvgGains = []
                # note: do not reset newdatasubsets here, or splits from earlier subsets are lost
                largestGain = 0
                largestAttribIndex = 0
                if len(datasubsets[i]) > 5:
                    for attribute in range(0, len(m.attributes)):
                        avgGain = d.averageGain(datasubsets[i], m.attributes[attribute])
                        if avgGain > largestGain:
                            largestGain = avgGain
                            largestAttribIndex = attribute
                        attribAvgGains.append(avgGain)
                        print("Attribute: ", attribute, "\t\tAverage gain: ", avgGain)
                    # split only if the best attribute actually gains information
                    if largestGain > 0:
                        print("---Splitting at attribute: ", m.attributes[largestAttribIndex].name)
                        newdatasubsets.append(split(datasubsets[i], m.attributes[largestAttribIndex]))
                        splits.append(m.attributes[largestAttribIndex])
                    else:
                        print(
                            "---Skipping splitting at attribute: ",
                            m.attributes[largestAttribIndex].name,
                            "Dataset #",
                            i,
                        )
                    datasubsetsAvgGains.append(attribAvgGains)

            if newdatasubsets and len(newdatasubsets[0]) > 1:
                datasubsets = newdatasubsets[0]
                print("---No. of New datasets: ", len(datasubsets))
            treeLevels.append(datasubsets)

    return splits
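A hypothetical call, assuming the split() helper the snippet relies on and its m/d imports are in scope:

splits = myBuildTree(m.monk1, 2)  # attributes chosen at each split, two levels deep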
Example #53
# learn, since "ai = 1 for exactly two i of {1, 2, ..., 6} is difficult to
# express concisely with binary questions. Low information gain from each
# sub question. On the other hand, it has more training data.
#
# MONK-3 has the least amount of training data. Apart from that, it also has
# random noise added.
#
# MONK-2 has the lowest entropy.
################################################################################

################################# Assignment 1 #################################
# Calculate the entropy of the training datasets.

print("\n\nAssignment 1 - Calculate the entropy of each training dataset")

entropy_monk1 = d.entropy(m.monk1)  # Yields 1.0 because there's a 50/50 split.
entropy_monk2 = d.entropy(m.monk2)  # Yields 0.9571....
entropy_monk3 = d.entropy(m.monk3)  # yields 0.9998....
print()
print("monk1: ", entropy_monk1)
print("monk2: ", entropy_monk2)
print("monk3: ", entropy_monk3)
################################################################################

################################# Assignment 2 #################################
# Explain entropy for a uniform distribution and a non-uniform distribution,
# present some example distributions with high and low entropy.
#
# "(Shannon) entropy is a measure of uncertainty"
#   In the case of a uniform distribution, different outcomes have an equal
# probability of being picked. An example is a (non-weighted) die, which would
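The truncated comment above contrasts uniform and non-uniform distributions. A small sketch makes the numbers concrete (plain Python 3, not part of the original snippet; dist_entropy is a hypothetical helper): a fair die maximizes entropy at log2(6) ~ 2.585 bits, while any loaded die scores lower.

import math

def dist_entropy(probabilities):
    # Shannon entropy, in bits, of a discrete distribution.
    return -sum(p * math.log2(p) for p in probabilities if p > 0)

print(dist_entropy([1/6] * 6))                          # fair die: ~2.585 bits
print(dist_entropy([0.5, 0.3, 0.1, 0.05, 0.03, 0.02]))  # loaded die: lower entropy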
Example #54
import monkdata as m
import dtree as dt
import drawtree as draw

entropy = dt.entropy(m.monk1)
best_gain = 0
best_attribute = m.attributes[0]  # ensure best_attribute is bound even if no gain beats 0
for attribute in m.attributes:
    gain = dt.averageGain(m.monk1, attribute)
    if gain > best_gain:
        best_gain = gain
        best_attribute = attribute


# Label each value of the best attribute with its subset's majority class.
values = {v: dt.mostCommon(dt.select(m.monk1, best_attribute, v)) for v in best_attribute.values}
print(best_attribute, values)
draw.drawTree(dt.buildTree(m.monk1, m.attributes, 2))