def makeTree(dataset, attributes, defaultValue) :
    """Recursively build an ID3 decision tree.

    dataset       -- list of instance rows; the class label is the last column
    attributes    -- indexable collection of {name: [values]} dicts
    defaultValue  -- class label to use for an empty dataset

    Returns a TreeNode; leaves carry a classification, interior nodes an
    attribute name with one child per attribute value.
    """
    # Empty data: fall back to the caller-supplied default classification.
    if len(dataset) == 0:
        return TreeNode(None, defaultValue)

    # Pure node (entropy 0): every instance shares one class label.
    entropyD = entropy([item[-1] for item in dataset])
    if entropyD == 0:
        return TreeNode(None, dataset[0][-1])

    # No attributes left to split on: majority class of this subset.
    if len(attributes) == 0:
        return TreeNode(None, readARFF.computeZeroR(attributes, dataset))

    copyAttr = copy.copy(attributes)
    # Majority class of the current subset -- used as the default for children.
    dV = readARFF.computeZeroR(attributes, dataset)

    attrSpread = selectAttribute(dataset, attributes)  # index of best attribute
    vlist = attributes[attrSpread].values()[0]

    del copyAttr[attrSpread]
    node = TreeNode(attributes[attrSpread].keys()[0], None)

    for v in vlist:
        # Instances taking value v for the selected attribute.
        subDataset = [item for item in dataset if item[attrSpread] == v]

        if len(subDataset) == 0:
            # BUG FIX: this leaf used to be immediately overwritten by the
            # recursive call below (no else), and computeZeroR was called with
            # the label dV where a dataset was expected. dV already IS the
            # majority class of the parent subset, so use it directly.
            node.children[v] = TreeNode(None, dV)
        else:
            node.children[v] = makeTree(subDataset, copyAttr, dV)

    return node
Пример #2
0
def makeTree(dataset, alist, attributes, defaultValue) :
    """Recursively build an ID3 decision tree.

    dataset      -- list of instance rows; class label in the last column
    alist        -- names of attributes still available for splitting
    attributes   -- full indexable collection of {name: [values]} dicts
    defaultValue -- classification for an empty dataset

    Returns a TreeNode with a children dict mapping attribute values to subtrees.
    """
    # Empty data: leaf with the inherited default classification.
    if len(dataset) == 0:
        leafNode = TreeNode(None, defaultValue)
        leafNode.children = {}
        return leafNode

    # No attributes left: leaf with the majority class of this subset.
    elif len(alist) == 0 :
        leafNode = TreeNode(None, readARFF.computeZeroR(attributes, dataset))
        leafNode.children = {}
        return leafNode

    # Pure subset: all instances share one class label.
    elif entropy([data[-1] for data in dataset]) == 0.0:
        leafNode = TreeNode(None, dataset[0][-1])
        leafNode.children = {}
        return leafNode

    else :
        selectAttr = selectAttribute(dataset, alist)
        rootAttr = TreeNode(alist[selectAttr], None)
        rootAttr.children = {}
        # Locate the column of the chosen attribute in the full attribute list.
        # FIX: previously used the pointless `index = [tempIndex][0]` and kept
        # scanning after a match; assign directly and break.
        index = None
        for tempIndex in range(len(attributes)) :
            if attributes[tempIndex].keys() == [alist[selectAttr]] :
                index = tempIndex
                break
        possibleValues = attributes[index][alist[selectAttr]]

        # Majority class of the current subset becomes the children's default.
        defaultValue = readARFF.computeZeroR(attributes, dataset)
        for val in possibleValues :
            subSet = createSubSet(val, dataset, selectAttr)
            # Direct assignment instead of dict.update({...}) -- same effect.
            rootAttr.children[val] = makeTree(
                subSet, alist[:selectAttr] + alist[selectAttr+1:],
                attributes, defaultValue)

        return rootAttr
def evalZeroR(trainDataset, testDataset, classification, attrs):
    """Evaluate the ZeroR (majority-class) baseline on the test set, per class.

    classification -- dict whose single value is the list of class labels
    Returns {class: (precision, recall, accuracy)} and draws a chart.
    """
    classification = classification.values()[0]
    evalResult = {}
    zeroR = readARFF.computeZeroR(attrs, trainDataset)

    for c in classification:
        TPCount = 0
        TNCount = 0
        FPCount = 0
        FNCount = 0
        if zeroR == c:
            # ZeroR predicts c for every instance: actual c -> TP, else FP.
            for i in testDataset:
                if i[-1] == c:
                    TPCount += 1
                else:
                    FPCount += 1
        else:
            # ZeroR never predicts c.
            # BUG FIX: the split used to test i[-1] == zeroR, which counted
            # instances of any third class as FN. A false negative for class c
            # is an instance whose actual label IS c; everything else is a TN.
            for i in testDataset:
                if i[-1] == c:
                    FNCount += 1
                else:
                    TNCount += 1

        p = computePrecision(TPCount, FPCount, TNCount, FNCount)
        r = computeRecall(TPCount, FPCount, TNCount, FNCount)
        a = computeAccuracy(TPCount, FPCount, TNCount, FNCount)
        evalResult[c] = (p, r, a)

    drawChart(evalResult)
    return evalResult
Пример #4
0
def makeTree(dataset, alist, attributes, defaultValue):
    """Recursively build an ID3 decision tree.

    dataset      -- list of instance rows; class label in the last column
    alist        -- names of attributes still available for splitting
    attributes   -- full indexable collection of {name: [values]} dicts
    defaultValue -- classification returned for an empty dataset
    """
    # Base case 1: nothing to classify -- use the inherited default.
    if len(dataset) == 0:
        return TreeNode(None, defaultValue)

    labels = [row[-1] for row in dataset]
    # Base case 2: pure subset -- every instance already agrees on a class.
    if entropy(labels) == 0:
        return TreeNode(None, labels[0])

    # Base case 3: no attributes left -- fall back to the majority class.
    if len(alist) == 0:
        return TreeNode(None, readARFF.computeZeroR(dataset))

    # Recursive case: split on the highest-information-gain attribute.
    best = selectAttribute(dataset, alist)
    node = TreeNode(alist[best], None)
    node.defaultValue = readARFF.computeZeroR(dataset)

    # Map the position in alist back to the column in the full attribute list.
    matches = [col for col in range(len(attributes))
               if attributes[col].keys() == [alist[best]]]
    branch_values = attributes[matches[0]][alist[best]]

    remaining = alist[:best] + alist[best + 1:]
    for branch in branch_values:
        # Keep only rows taking this value, dropping the consumed column.
        subset = [row[:best] + row[best + 1:]
                  for row in dataset if row[best] == branch]
        node.children[branch] = makeTree(subset, remaining, attributes,
                                         node.defaultValue)
    return node
Пример #5
0
def makeTree(dataset, attributes, defaultValue):
    """Recursively build an ID3 decision tree.

    Algorithm:
      - empty data            -> leaf with defaultValue
      - no attributes left    -> leaf with majority classification
      - entropy 0 (pure node) -> leaf with that classification
      - otherwise split on the attribute with the largest information gain
        and recurse on each value's subset.
    """
    node = TreeNode(None, None)

    if len(dataset) == 0:
        node.value = defaultValue
        return node
    if len(attributes) == 0:
        # Majority class of the remaining data.
        node.value = readARFF.computeZeroR(attributes, dataset)
        return node
    if entropy([d[-1] for d in dataset]) == 0:
        # All instances share one class label.
        node.value = dataset[0][-1]
        return node

    selectIndex = selectAttribute(dataset, attributes)
    key = attributes[selectIndex].keys()[0]
    values = attributes[selectIndex][key]
    node.attribute = key
    next_attributes = attributes.copy()
    del next_attributes[selectIndex]

    # FIX: the majority class of the current split is loop-invariant; it was
    # previously recomputed on every iteration of the loop below.
    childDefault = readARFF.computeZeroR(attributes, dataset)
    for v in values:
        childrenList = [d for d in dataset if d[selectIndex] == v]
        node.children[v] = makeTree(childrenList, next_attributes, childDefault)

    return node
Пример #6
0
def makeTree(dataSet, aList, attributes, defaultValue):
    """Recursively build an ID3 decision tree.

    dataSet      -- list of instance rows; class label in the last column
    aList        -- names of attributes still available for splitting
    attributes   -- full indexable collection of {name: [values]} dicts
    defaultValue -- classification used when no attributes remain
    """
    # Pure subset: all instances share one class label.
    if entropy([d[-1] for d in dataSet]) == 0:
        return TreeNode(None, dataSet[0][-1])
    elif len(aList) == 0:
        return TreeNode(None, defaultValue)
    else:
        listAttributes = readARFF.getAttrList(attributes)
        # Mask out attributes already consumed so selectAttribute skips them.
        for index, item in enumerate(listAttributes):
            if item not in aList:
                listAttributes[index] = None
        attribute = selectAttribute(dataSet, listAttributes)
        index = listAttributes.index(attribute)
        possibleValue = attributes[index][attribute]
        # BUG FIX: aList.remove(attribute) mutated the caller's list in place,
        # so sibling branches (and the caller) permanently lost attributes
        # consumed by earlier recursive calls. Build a fresh list instead.
        remaining = [a for a in aList if a != attribute]
        node = TreeNode(attribute, None)
        for value in possibleValue:
            subSet = [d for d in dataSet if d[index] == value]
            if len(subSet) == 0:
                # No instances for this value: majority class of the parent.
                node.children[value] = TreeNode(None, readARFF.computeZeroR(attributes, dataSet))
            else:
                node.children[value] = makeTree(subSet, remaining, attributes, readARFF.computeZeroR(attributes, subSet))
        return node
Пример #7
0
def evaluation(nfold, attrs, data):
    """Train and score a decision tree over nfold random 80/20 splits.

    NOTE(review): despite the name, this is NOT k-fold cross-validation --
    each iteration reshuffles the whole dataset and takes a fresh random
    80/20 train/test slice, so test sets may overlap across iterations.
    Prints averaged precision/recall/accuracy for both test and training data.
    """
    train_precision = []
    train_recall = []
    train_accuracy = []

    test_precision = []
    test_recall = []
    test_accuracy = []

    for k in range(nfold):
        # Reshuffle and re-split the data on every iteration.
        random.seed()
        random.shuffle(data)
        # Python 2 integer division: first 4/5 of the data trains the tree.
        traindata = data[:len(data) / 5 * 4]
        # print len(traindata)
        testdata = data[len(data) / 5 * 4:]
        # print len(testdata)
        # print data[:len(data)/10]
        attrslist = readARFF.getAttrList(attrs)
        # Default classification is the majority class of the FULL dataset.
        root = makeTree(traindata, attrslist, attrs,
                        readARFF.computeZeroR(data))
        # print '####fold####',k
        # root.printTree()

        # Score on held-out test data.
        precision, recall, correct_num = calc_precision_recall_accuracy(
            root, attrs, testdata)
        test_precision.append(precision)
        test_recall.append(recall)
        test_accuracy.append(float(correct_num) / len(testdata))

        # Score on the training data itself (to gauge overfitting).
        precision_train, recall_train, correct_num_train = calc_precision_recall_accuracy(
            root, attrs, traindata)
        train_precision.append(precision_train)
        train_recall.append(recall_train)
        train_accuracy.append(float(correct_num_train) / len(traindata))

    # Average the per-iteration metrics and report.
    test_precision_average, test_recall_average, test_accuracy_average = calc_average(
        nfold, test_precision, test_recall, test_accuracy)
    train_precision_average, train_recall_average, train_accuracy_average = calc_average(
        nfold, train_precision, train_recall, train_accuracy)
    print '#####   The performance of decision tree   #####'
    print '     test_precision:'
    print_pr_re(test_precision_average)
    print '     test_recall:'
    print_pr_re(test_recall_average)
    print '     test_accuracy: %f%%' % (test_accuracy_average * 100)
    print
    print '     training_precision:'
    print_pr_re(train_precision_average)
    print '     training_recall:'
    print_pr_re(train_recall_average)
    print '     training_accuracy: %f%%' % (train_accuracy_average * 100)
def makeTree(dataset, alist, attributes, defaultValue) :
    """Recursively build an ID3 decision tree.

    dataset      -- list of instance rows; class label in the last column
    alist        -- names of attributes still available for splitting
    attributes   -- full indexable collection of {name: [values]} dicts
    defaultValue -- classification returned for an empty dataset
    """
    # if the dataset is empty
    if len(dataset) == 0:
        # print defaultValue
        return TreeNode(None, defaultValue)
    # if the dataset contains zero entropy, that is, all classes are the same.
    # that is, the entropy is zero
    elif entropy([item[-1] for item in dataset]) == 0:
        return TreeNode(None, dataset[0][-1])
    # no attributes left: fall back to the majority classification
    elif len(alist) == 0:
        return TreeNode(None, readARFF.computeZeroR(dataset))
    else:
        # select the attribute with the largest information gain
        i = selectAttribute(dataset, alist)
        # print alist,alist[i], i
        # if alist[i] == 'age':
        #     print dataset
        #     print [item[-1] for item in dataset].count(dataset[0][-1]) , len(dataset)
        #     print [item[-1] for item in dataset].count(dataset[0][-1]) == len(dataset)
        current_Treenode = TreeNode(alist[i], None)
        # majority class of this subset; inherited by empty children
        current_Treenode.defaultValue = readARFF.computeZeroR(dataset)
        # print i
        # print dataset
        # map position i in alist back to the column in the full attribute list
        real_index = [j for j in range(len(attributes)) if attributes[j].keys() == [alist[i]]][0]
        # print dataset,attributes[real_index].keys()[0]
        del_attribute = attributes[real_index][alist[i]]
        
        for item in del_attribute:
            # rows taking this value, with the consumed column removed
            sub_data = [data[:i]+data[i+1:] for data in dataset if data[i] == item]
            # print [data[i] for data in dataset],item
            # print [data[i] for data in dataset].count(item)
            # print data[i][0] == item
            # print 'sub_data', sub_data, item
            # print sub_data,item
            # for eachdata in sub_data:
            #     del eachdata[i]
            current_Treenode.children[item] = makeTree(sub_data, alist[:i]+alist[i+1:], attributes, current_Treenode.defaultValue)
        return current_Treenode
def evaluation(nfold , attrs, data):
    """Train and score a decision tree over nfold random 80/20 splits.

    NOTE(review): despite the name, this is NOT k-fold cross-validation --
    each iteration reshuffles the whole dataset and takes a fresh random
    80/20 train/test slice, so test sets may overlap across iterations.
    Prints averaged precision/recall/accuracy for both test and training data.
    """
    train_precision = []
    train_recall = []
    train_accuracy = []

    test_precision = []
    test_recall = []
    test_accuracy = []

    for k in range(nfold):
        # Reshuffle and re-split the data on every iteration.
        random.seed()
        random.shuffle(data)
        # Python 2 integer division: first 4/5 of the data trains the tree.
        traindata = data[:len(data)/5*4]
        # print len(traindata)
        testdata = data[len(data)/5*4:]
        # print len(testdata)
        # print data[:len(data)/10]
        attrslist = readARFF.getAttrList(attrs)
        # Default classification is the majority class of the FULL dataset.
        root = makeTree(traindata, attrslist, attrs, readARFF.computeZeroR(data))
        # print '####fold####',k
        # root.printTree()

        # Score on held-out test data.
        precision, recall, correct_num = calc_precision_recall_accuracy(root, attrs, testdata)
        test_precision.append(precision)
        test_recall.append(recall)
        test_accuracy.append(float(correct_num) / len(testdata))

        # Score on the training data itself (to gauge overfitting).
        precision_train, recall_train, correct_num_train = calc_precision_recall_accuracy(root, attrs, traindata)
        train_precision.append(precision_train)
        train_recall.append(recall_train)
        train_accuracy.append(float(correct_num_train) / len(traindata))

    # Average the per-iteration metrics and report.
    test_precision_average, test_recall_average, test_accuracy_average = calc_average(nfold, test_precision, test_recall, test_accuracy)
    train_precision_average, train_recall_average, train_accuracy_average = calc_average(nfold, train_precision, train_recall, train_accuracy)
    print '#####   The performance of decision tree   #####'
    print '     test_precision:' 
    print_pr_re(test_precision_average)
    print '     test_recall:' 
    print_pr_re(test_recall_average)
    print '     test_accuracy: %f%%' % (test_accuracy_average*100)
    print 
    print '     training_precision:'
    print_pr_re(train_precision_average)
    print '     training_recall:'
    print_pr_re(train_recall_average)
    print '     training_accuracy: %f%%' % (train_accuracy_average*100)
Пример #10
0
def calc_precision_recall_accuracy_zeroR(testdata):
    """Score the ZeroR (majority-class) baseline on testdata.

    Every instance is predicted as the majority class of testdata itself.
    Returns (precision, recall, correct_num) where precision and recall are
    per-class dicts (0 for classes never predicted / never correct) and
    correct_num is the total number of correct predictions.
    """
    actual_class = {}
    predicted_class = {}
    correct_class = {}
    majority = readARFF.computeZeroR(testdata)
    for i in testdata:
        # ZeroR always predicts the majority class.
        a = majority

        # Idiom fix: dict.get with a default replaces the if/else insert dance.
        actual_class[i[-1]] = actual_class.get(i[-1], 0) + 1
        predicted_class[a] = predicted_class.get(a, 0) + 1
        if a == i[-1]:
            correct_class[i[-1]] = correct_class.get(i[-1], 0) + 1

    # print 'actual_class %s' %actual_class
    # print 'predicted_class %s' %predicted_class
    # print 'correct_class %s' %correct_class

    correct_num = 0
    precision = {}
    recall = {}

    # Classes with at least one correct prediction get real ratios.
    for j in correct_class:
        recall[j] = float(correct_class[j]) / actual_class[j]
        precision[j] = float(correct_class[j]) / predicted_class[j]
        correct_num += correct_class[j]

    # Every observed class must appear in the result, even with score 0.
    for k in actual_class:
        if k not in precision:
            precision[k] = 0
        if k not in recall:
            recall[k] = 0

    return precision, recall, correct_num
Пример #11
0
def calc_precision_recall_accuracy_zeroR(testdata):
    """Score the ZeroR (majority-class) baseline on testdata.

    Every instance is predicted as the majority class of testdata itself.
    Returns (precision, recall, correct_num): per-class dicts plus the total
    number of correct predictions.
    """
    actual_counts = {}
    predicted_counts = {}
    correct_counts = {}
    majority = readARFF.computeZeroR(testdata)

    for row in testdata:
        label = row[-1]
        # ZeroR's prediction is always the majority class.
        actual_counts[label] = actual_counts.get(label, 0) + 1
        predicted_counts[majority] = predicted_counts.get(majority, 0) + 1
        if label == majority:
            correct_counts[label] = correct_counts.get(label, 0) + 1

    precision = {}
    recall = {}
    correct_num = 0

    # Real ratios for classes that were predicted correctly at least once.
    for label in correct_counts:
        hits = correct_counts[label]
        precision[label] = float(hits) / predicted_counts[label]
        recall[label] = float(hits) / actual_counts[label]
        correct_num += hits

    # Classes never credited still appear in the output with score 0.
    for label in actual_counts:
        precision.setdefault(label, 0)
        recall.setdefault(label, 0)

    return precision, recall, correct_num
Пример #12
0
def evaluationWithZeroR(trainingData, testingData, attributes, defaultValue) :
    """Evaluate the ZeroR baseline on testingData.

    Predicts the majority class of testingData for every instance, tallies a
    binary confusion matrix via isPositive, and returns
    (precision, recall, accuracy).

    NOTE(review): instances are skipped entirely when the ZeroR prediction
    equals defaultValue -- preserved from the original, but the intent is
    unclear; confirm against the assignment spec.
    """
    FP = 0
    FN = 0
    TP = 0
    TN = 0
    #root = makeTree(trainingData, readARFF.getAttrList(attributes), attributes, defaultValue)
    result = readARFF.computeZeroR(attributes, testingData)
    for item in testingData :
        classValue = item[-1]
        #result = root.classify(item[0:-1], attributes)

        if result == defaultValue:
            continue
        elif result == classValue:
            if isPositive(result):
                TP += 1
            else:
                TN += 1
        else:
            if isPositive(result):
                FP += 1
            else:
                FN += 1

    # BUG FIX: the original `elif (TP + TN) == 0` branch was unreachable
    # (it required TP > 0 and TP + TN == 0 simultaneously) and, had it run,
    # would have left precision/recall unbound. It also reported accuracy 0.0
    # whenever TP == 0 even if TN > 0. Guard each ratio's denominator instead.
    total = TP + FP + FN + TN
    if TP == 0:
        precision = 0
        recall = 0
    else:
        precision = float(TP) / float(TP + FP)
        recall = float(TP) / float(TP + FN)
    if total == 0:
        accuracy = 0.0
    else:
        accuracy = float(TP + TN) / float(total)

    #print precision, recall, accuracy
    return precision, recall, accuracy
        trainSample = random.sample(index,int(len(data)*0.8))
        testSample = [i for i in index if i not in trainSample]

        trainDataset = [data[i] for i in trainSample]
        testDataset = [data[i] for i in testSample]

        print "\nUsing ZeroR:"
        rz = evalZeroR(trainDataset,testDataset,classification,attrs)
        for k in rz:
            if k in resultZeroR:
                resultZeroR[k] += rz[k]
            else:
                resultZeroR[k] = rz[k]

        alist = [i.keys()[0] for i in attrs.values()]
        defaultValue = readARFF.computeZeroR(attrs,trainDataset)
        root = makeTree(trainDataset,attrs,defaultValue)
        print "\nTest Set: "
        r1 = evaluate(root,testDataset,alist, classification)
        for k in r1:
            if k in resultTest:
                resultTest[k] += r1[k]
            else:
                resultTest[k] = r1[k]
        print "\nTraining Set:"
        r2 = evaluate(root,trainDataset,alist, classification)
        for k in r1:
            if k in resultTrain:
                resultTrain[k] += r2[k]
            else:
                resultTrain[k] = r2[k]
Пример #14
0
    if len(root.children) != 0:
        for k in root.children:
            child = root.children[k]
            printNode(child)


if __name__ == '__main__':
    fileName = sys.argv[-1]
    attributes, data = readARFF.readArff(open(fileName))
    listAttributes = readARFF.getAttrList(attributes)
    times = 5
    total = zero = 0
    precision = recall = precisionZero = recallZero = 0
    for i in range(times):
        trainData = random.sample(data, int(len(data) * 0.8))
        defaultValue = readARFF.computeZeroR(attributes, data)
        zeroRValue = readARFF.computeZeroR(attributes, trainData)
        root = makeTree(trainData, listAttributes, attributes, defaultValue)
        #printNode(root)
        TP = tp = 0
        testData = []
        for d in data:
            if d not in trainData:
                testData.append(d)
        for d in testData:
            value = root.classify(d, attributes)
            if value == d[-1]:
                TP += 1
            if zeroRValue == d[-1]:
                tp += 1
        accuracy = float(TP) / len(testData)