Example #1
def main():
    # Build the feature dicts and raw rows for the 'Cars' train/test splits.
    trainset, trainRaw = makeData(training, labels)
    testset, testRaw = makeData(testing, labels)
    trainLabels = [item[-1] for item in trainRaw]
    testLabels = [item[-1] for item in testRaw]

    print("Running the decision tree algorithm on the 'Cars' dataset.")

    # Train trees of depth 1..6 with each split criterion and report accuracy.
    algotype = ['gini', 'entropy', 'ME']
    for item in algotype:
        for i in range(1, 7):
            currentTree = id3.id3(trainset, labels, label_attr, labels[-1], i,
                                  item, None)

            trainPred = [id3.predict(currentTree, x, labels) for x in trainset]
            testPred = [id3.predict(currentTree, x, labels) for x in testset]

            trainAcc = id3.accuracy(trainPred, trainLabels)
            testAcc = id3.accuracy(testPred, testLabels)

            print("Decision tree of depth", i, "using", item,
                  "has a test accuracy of", testAcc,
                  "and a training accuracy of", trainAcc)
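For reference, a plausible definition of the accuracy helper these examples rely on (hypothetical; the real id3.accuracy may differ) is the fraction of predictions that match the true labels:

def accuracy(predicted, actual):
    # Fraction of positions where the prediction equals the true label.
    return sum(p == a for p, a in zip(predicted, actual)) / len(actual)

print(accuracy(['yes', 'no', 'yes'], ['yes', 'no', 'no']))  # 0.666...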
Example #2
def boost(data, labels, attr_list, target, iterations, answers):
    """AdaBoost: each round, fit a depth-2 tree on the weighted data, compute
    its vote weight alpha from its weighted error, and re-weight the examples.
    `answers` is accepted for interface compatibility but is not used."""
    treelist = []
    for i in range(iterations):
        # Total weight, used to turn the summed misclassified weight into a
        # weighted error rate in [0, 1].
        normalize = sum(item['weight'] for item in data)

        currentTree = id3(data, labels, attr_list, target, 2, 'entropy', None)

        # Weighted training error of this round's tree.
        trainError = 0
        for item in data:
            if predict(currentTree, item, labels) != item[labels[-1]]:
                trainError += item['weight'] / normalize

        # Standard AdaBoost vote weight.
        alpha = 0.5 * m.log((1 - trainError) / trainError)

        # Re-weight the examples: misclassified ones up, correct ones down.
        for item in data:
            if predict(currentTree, item, labels) != item[labels[-1]]:
                item['weight'] *= m.exp(alpha)
            else:
                item['weight'] *= m.exp(-alpha)

        treelist.append({'tree': currentTree, 'alpha': alpha})
    return treelist
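A minimal, self-contained sketch of the per-round weight update that boost performs, on a toy set of four weights (the names here are illustrative, not taken from the snippet):

import math

weights = [0.25, 0.25, 0.25, 0.25]            # uniform starting weights
misclassified = [False, True, False, False]   # this round's tree's mistakes

error = sum(w for w, miss in zip(weights, misclassified) if miss) / sum(weights)
alpha = 0.5 * math.log((1 - error) / error)   # canonical AdaBoost vote weight

# Misclassified examples are scaled up, correct ones down, then renormalized.
weights = [w * math.exp(alpha if miss else -alpha)
           for w, miss in zip(weights, misclassified)]
total = sum(weights)
weights = [w / total for w in weights]
print(round(alpha, 3), [round(w, 3) for w in weights])  # 0.549 [0.167, 0.5, 0.167, 0.167]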
Example #3
def main():
    # Prepare the train/test splits; numeric attributes are discretized
    # against the training-set medians, and every training example gets a
    # weight for the boosting runs.
    trainset, trainraw = makeData(training, labels)
    testset, testraw = makeData(testing, labels)
    medians = medianAssign(trainset, labels)
    trainset = removeNums(trainset, medians)
    testset = removeNums(testset, medians)
    for element in trainset:
        element['weight'] = 1 / len(trainset)

    trainLabels = [item[-1] for item in trainraw]
    testLabels = [item[-1] for item in testraw]

    train = []
    test = []
    for i in [1, 2, 4, 8, 16, 20, 21]:
        # Reset to uniform weights before each boosting run.
        for element in trainset:
            element['weight'] = 1 / len(trainset)
        treeList = boost(trainset, labels, label_attr, labels[-1], i,
                         trainLabels)

        trainPred = [boostGuess(treeList, entry, labels,
                                label_attr['outcome']) for entry in trainset]
        testPred = [boostGuess(treeList, entry, labels,
                               label_attr['outcome']) for entry in testset]

        trainAcc = accuracy(trainPred, trainLabels)
        testAcc = accuracy(testPred, testLabels)
        # If an accuracy comes out below 0.3, report its complement instead
        # (presumably guarding against a flipped label polarity).
        if trainAcc < 0.3:
            trainAcc = 1 - trainAcc
        if testAcc < 0.3:
            testAcc = 1 - testAcc
        train.append(trainAcc)
        test.append(testAcc)
        print("Boosted decision tree with", i,
              "iterations has a training accuracy of", trainAcc)
        print("Boosted decision tree with", i,
              "iterations has a testing accuracy of", testAcc)

    # Also report the accuracy of each individual stump from a 100-round run.
    showtree = boost(trainset, labels, label_attr, labels[-1], 100,
                     trainLabels)
    stumpTrainAcc = []
    stumpTestAcc = []
    for tree in showtree:
        stumpTrainPred = [predict(tree['tree'], entry, labels)
                          for entry in trainset]
        stumpTestPred = [predict(tree['tree'], entry, labels)
                         for entry in testset]
        stumpTrainAcc.append(accuracy(stumpTrainPred, trainLabels))
        stumpTestAcc.append(accuracy(stumpTestPred, testLabels))
    print('Training stumps', stumpTrainAcc)
    print('Testing stumps', stumpTestAcc)
Example #4
def main():
    # Prepare the bank train/test splits: numeric attributes are discretized
    # against the training-set medians, and a second copy of the data has
    # 'unknown' values filled in via fixUnknown/replaceUnknown.
    trainset, trainraw = makeData(training, labels)
    testset, testraw = makeData(testing, labels)
    medians = medianAssign(trainset, labels)
    trainset = removeNums(trainset, medians)
    testset = removeNums(testset, medians)
    fix = fixUnknown(trainset, labels)
    trainsetU = replaceUnknown(trainset, labels, fix)
    testsetU = replaceUnknown(testset, labels, fix)

    trainLabels = [item[-1] for item in trainraw]
    testLabels = [item[-1] for item in testraw]

    print("Running the decision tree algorithm on the bank dataset "
          "with unknown values")
    algotype = ['gini', 'entropy', 'ME']
    for item in algotype:
        for i in range(1, 17):
            currentTree = id3(trainset, labels, label_attr, labels[-1], i,
                              item, None)

            trainPred = [predict(currentTree, x, labels) for x in trainset]
            testPred = [predict(currentTree, x, labels) for x in testset]

            trainAcc = accuracy(trainPred, trainLabels)
            testAcc = accuracy(testPred, testLabels)

            print("Decision tree of depth", i, "using", item,
                  "has a test accuracy of", testAcc,
                  "and a training accuracy of", trainAcc)

    print("\nRunning the decision tree algorithm on the bank dataset "
          "with unknowns replaced")

    for item in algotype:
        for i in range(1, 17):
            currentTree = id3(trainsetU, labels, label_attr, labels[-1], i,
                              item, None)

            trainPred = [predict(currentTree, x, labels) for x in trainsetU]
            testPred = [predict(currentTree, x, labels) for x in testsetU]

            trainAcc = accuracy(trainPred, trainLabels)
            testAcc = accuracy(testPred, testLabels)

            print("Decision tree of depth", i, "using", item,
                  "has a test accuracy of", testAcc,
                  "and a training accuracy of", trainAcc)
Example #5
def boostGuess(trees, data, labels, outcomes):
    """Weighted vote of a boosted ensemble: each tree votes +alpha for
    outcomes[0] or -alpha against it; a non-positive total falls back to
    outcomes[1]."""
    guess = 0
    for tree in trees:
        prediction = predict(tree['tree'], data, labels)
        if prediction == outcomes[0]:
            guess += tree['alpha']
        else:
            guess -= tree['alpha']
    return outcomes[0] if guess > 0 else outcomes[1]
Example #6
def bag_guess(trees, data, labels, outcomes):
    """Unweighted majority vote of a bagged ensemble; ties go to outcomes[0]."""
    guessVotes = 0
    for tree in trees:
        guess = predict(tree, data, labels)
        if guess == outcomes[0]:
            guessVotes += 1
        else:
            guessVotes -= 1
    return outcomes[0] if guessVotes >= 0 else outcomes[1]
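A toy, self-contained illustration of bagging feeding this kind of majority vote; train_stump here is a stand-in for the real tree learner, and the data is just a list of labels:

import random
from collections import Counter

outcomes = ('yes', 'no')
data = ['yes'] * 6 + ['no'] * 4           # labels only, for brevity

def train_stump(sample):
    # Fake learner: always predicts the majority label of its bootstrap sample.
    majority = Counter(sample).most_common(1)[0][0]
    return lambda _x: majority

# Each "tree" sees its own bootstrap resample (drawn with replacement).
trees = [train_stump(random.choices(data, k=len(data))) for _ in range(25)]

votes = sum(1 if t(None) == outcomes[0] else -1 for t in trees)
print(outcomes[0] if votes >= 0 else outcomes[1])   # usually 'yes'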
Example #7
def result():
    # Read the submitted form and coerce every field to an int feature value.
    form_data = request.form.to_dict()
    for key in form_data:
        form_data[key] = int(form_data[key])
    print(form_data)

    # Map the numeric prediction from the ID3 tree onto a marks bracket.
    buckets = {
        "1.0": "Below 50%",
        "2.0": "50-60%",
        "3.0": "60-70%",
        "4.0": "70-80%",
        "5.0": "80-90%",
        "6.0": "Above 90%",
    }
    final_result = str(id3.predict(form_data, id3.tree))
    final_result = buckets.get(final_result, final_result)
    print("The predicted marks are " + final_result)
    return render_template('result.html', final_result=final_result)
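For context, a handler like this is normally registered on a Flask app; a minimal sketch of that wiring follows. The URL rule and app layout are assumptions, while id3.predict, id3.tree, and result.html come from the snippet above.

from flask import Flask, render_template, request
import id3  # the project's own module, assumed importable

app = Flask(__name__)

@app.route('/result', methods=['POST'])   # hypothetical URL rule
def result():
    # Coerce every submitted form field to an int, as the handler above does.
    features = {k: int(v) for k, v in request.form.items()}
    final_result = str(id3.predict(features, id3.tree))
    return render_template('result.html', final_result=final_result)

if __name__ == '__main__':
    app.run(debug=True)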
Example #8
weatherGraph.write('./images/weather.png', prog=None, format='png')
print('Done.')

# Read car evaluation dataset
print('===========================================')
print('Reading car evaluation training dataset...')
carAttributes, carEvaluationTrainDataSet = utils.readDataSet(
    './datasets/car-evaluation-train.csv')
targetAttribute = carAttributes[-1]
carAttributes.remove(targetAttribute)

# Train
print('>>> Training car evaluation dataset...')
carTree = id3.id3(carAttributes, targetAttribute, carEvaluationTrainDataSet)
print(carTree)

# Test
print('Reading car evaluation test dataset...')
_, testDataset = utils.readDataSet('./datasets/car-evaluation-test.csv')
counter = 0
testSize = len(testDataset)

print('>>> Testing...')
for d in testDataset:
    targetValue = id3.predict(d, carTree)
    if targetValue == d[targetAttribute]:
        counter += 1

ratio = counter / testSize * 100
print('Prediction accuracy: {:.2f}%'.format(ratio))
Example #9
def main_slow():
    # Prepare the train/test splits; numeric attributes are discretized
    # against the training-set medians. Every example carries a 'weight'
    # field, set uniformly to 1 here.
    trainset, trainraw = makeData(training, labels)
    testset, testraw = makeData(testing, labels)
    medians = medianAssign(trainset, labels)
    trainset = removeNums(trainset, medians)
    testset = removeNums(testset, medians)
    for element in trainset:
        element['weight'] = 1

    trainLabels = [item[-1] for item in trainraw]
    testLabels = [item[-1] for item in testraw]
    # Random-forest style experiment: for each attribute-subset size k, train
    # ensembles of several sizes on bootstrap resamples and report accuracy.
    treenums = [1, 100, 200, 300, 500, 800, 1000]
    for k_val in [2, 4, 6]:
        print("When you set your attribute subset size to", k_val,
              "you get the following.")
        trainAcc = []
        testAcc = []
        for num in treenums:
            treelist = []
            for i in range(num):
                # Bootstrap sample (with replacement) of the training set.
                newTraining = rand.choices(trainset, k=len(trainset))
                newTree = rand_id3(newTraining, labels, label_attr, labels[-1],
                                   18, 'entropy', None, k_val)
                treelist.append(newTree)
            trainPred = [bag_guess(treelist, entry, labels,
                                   label_attr['outcome'])
                         for entry in trainset]
            testPred = [bag_guess(treelist, entry, labels,
                                  label_attr['outcome'])
                        for entry in testset]
            trainAcc.append(accuracy(trainPred, trainLabels))
            testAcc.append(accuracy(testPred, testLabels))
            print(trainAcc, 'train accuracies so far')
            print(testAcc, 'test accuracies so far')

    # Bias/variance experiment: build 100 bagged ensembles of 1000 trees each
    # (grown from 1000-example bootstrap samples) and keep each ensemble's
    # first tree as the corresponding single-tree predictor.
    tree_preds = []
    basics = []
    for i in range(100):
        train_i = rand.choices(trainset, k=1000)
        treelist_i = []
        for j in range(1000):
            train_j = rand.choices(train_i, k=1000)
            newTree = rand_id3(train_j, labels, label_attr, labels[-1], 18,
                               'entropy', None, 6)
            treelist_i.append(newTree)
        tree_preds.append(treelist_i)
        basics.append(treelist_i[0])
        print("Tree set", i, "has been trained")

    # Per-example bias^2 and sample variance of the 100 single-tree predictors
    # (predictions are encoded as 1 for outcomes[0] and 0 otherwise).
    singleVar = []
    singleBias = []
    singleMean = []
    for entry in testset:
        predictions = []
        for tree in basics:
            guess = predict(tree, entry, labels)
            predictions.append(1 if guess == label_attr['outcome'][0] else 0)
        ave = sum(predictions) / len(basics)
        singleMean.append(ave)
        value = 1 if entry['outcome'] == label_attr['outcome'][0] else 0
        singleBias.append((value - ave) ** 2)
        var = sum((h - ave) ** 2 for h in predictions) / (len(basics) - 1)
        singleVar.append(var)

    # The same computation for the 100 bagged ensembles, each voting through
    # bag_guess.
    bagVar = []
    bagBias = []
    bagMean = []
    for entry in testset:
        predictions = []
        for trees in tree_preds:
            guess = bag_guess(trees, entry, labels, label_attr['outcome'])
            predictions.append(1 if guess == label_attr['outcome'][0] else 0)
        ave = sum(predictions) / len(tree_preds)
        bagMean.append(ave)
        value = 1 if entry['outcome'] == label_attr['outcome'][0] else 0
        bagBias.append((value - ave) ** 2)
        var = sum((h - ave) ** 2 for h in predictions) / (len(tree_preds) - 1)
        bagVar.append(var)

    # Average the per-example terms and report bias, variance, and their sum
    # (the general squared error).
    sVariance = mean(singleVar)
    sBias = mean(singleBias)
    sMSE = sBias + sVariance
    print("For the single trees, the variance is", sVariance, "the bias is",
          sBias, "and the general squared error is", sMSE)
    bVariance = mean(bagVar)
    bBias = mean(bagBias)
    bMSE = bBias + bVariance
    print("For the bagged trees, the variance is", bVariance, "the bias is",
          bBias, "and the general squared error is", bMSE)
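A compact, self-contained restatement of the per-example bias/variance computation used above (same 1/(n-1) sample-variance convention; the names are illustrative):

def bias_variance(predictions, true_value):
    # predictions: 0/1 votes from the individual predictors for one example.
    n = len(predictions)
    ave = sum(predictions) / n
    bias = (true_value - ave) ** 2
    var = sum((p - ave) ** 2 for p in predictions) / (n - 1)
    return bias, var

print(bias_variance([1, 1, 0, 1], 1))   # (0.0625, 0.25)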