示例#1
0
def decisionTreeMain():
    """Build a forest from the red-wine quality CSV and classify one held-out row.

    Reads ``training_data/wine-quality-red.csv`` from the ``resources``
    package, drops the header row, passes the rows through
    ``datasetModule.normalizeDataset``, holds out a test set, splits the
    remaining rows into 3 random subsets and builds trees from them with
    ``random_forest.createDecisionTreesMultiprocessing``.  The subsets, the
    test set, every tree, the per-tree classification result, the final
    result and the classified row are printed.  Nothing is returned.
    """
    import sys  # local import: only needed to pick the csv file mode

    # ----- Import csv file -----
    resource_package = 'resources'
    filename = 'wine-quality-red.csv'
    resource_path = os.path.join('training_data', filename)
    filePath = pkg_resources.resource_filename(
        resource_package,
        resource_path)  # Gets path of file from another package

    # The csv module expects a binary file on Python 2 but a text file
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvFile = open(filePath, "rb")
    else:
        csvFile = open(filePath, newline="")
    # Read all rows while the file is open so the handle is always closed.
    with csvFile:
        dataset = list(csv.reader(csvFile))

    #  ---- Prepare dataset for analysing  ------
    del dataset[0]  # removes headers from dataset
    dataset = datasetModule.normalizeDataset(dataset)
    #datasetModule.postponeColumn(dataset, 2) # Shifts 'Survive' column to the last
    #removeColumn(dataset, 3)

    # ----- Separates testSet from dataset ------
    # The test rows are held out *before* the random split so that the row
    # classified below cannot also be part of a training subset (same order
    # as maxElementsInNodeTest).
    testSet = datasetModule.getTestSet(dataset, 11)
    # Delete balanced testSet from original dataset
    dataset = [row for row in dataset if row not in testSet]
    testRow = testSet[0]

    print(
        '***** Splits random subsets in order to create a random forest *****')
    subsets = datasetModule.randomSplit(dataset, 3)
    datasetModule.printDataSet(subsets)
    datasetModule.printDataSet(testSet)

    # variousTrees = random_forest.createDecisionTrees(subsets, decision_tree.buildTreeWithMaxElementsInNode, minNodes = 100)
    # for i in variousTrees:
    #    decision_tree.printtree(i)
    #
    # variousTreesPool = random_forest.createDecisionTreesPool(subsets, decision_tree.buildTreeWithMaxElementsInNode, processes = 5 ,minNodes = 100)
    # for i in variousTreesPool:
    #    decision_tree.printtree(i)

    forest = random_forest.createDecisionTreesMultiprocessing(
        subsets, decision_tree.buildTreeWithMaxElementsInNode, minNodes=100)
    for tree in forest:
        decision_tree.printtree(tree)

    classificationResult = random_forest.classifyForestMultiprocessing(
        forest, testRow)
    print(classificationResult)
    print("The final classification is: {}".format(
        random_forest.getFinalResult(classificationResult)))
    #tree = decision_tree.buildTreeWithHeigth(dataset, maxHeigth=5)
    #decision_tree.printtree(tree)
    print("Decided the next row:")
    print(testRow)
def decisionTreeMain():
    """Build a forest from the red-wine quality CSV and classify one held-out row.

    Reads ``training_data/wine-quality-red.csv`` from the ``resources``
    package, drops the header row, passes the rows through
    ``datasetModule.normalizeDataset``, holds out a test set, splits the
    remaining rows into 3 random subsets and builds trees from them with
    ``random_forest.createDecisionTreesMultiprocessing``.  The subsets, the
    test set, every tree, the per-tree classification result, the final
    result and the classified row are printed.  Nothing is returned.
    """
    import sys  # local import: only needed to pick the csv file mode

    # ----- Import csv file -----
    resource_package = 'resources'
    filename = 'wine-quality-red.csv'
    resource_path = os.path.join('training_data', filename)
    filePath = pkg_resources.resource_filename(resource_package, resource_path) # Gets path of file from another package

    # The csv module expects a binary file on Python 2 but a text file
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvFile = open(filePath, "rb")
    else:
        csvFile = open(filePath, newline="")
    # Read all rows while the file is open so the handle is always closed.
    with csvFile:
        dataset = list(csv.reader(csvFile))

    #  ---- Prepare dataset for analysing  ------
    del dataset[0] # removes headers from dataset
    dataset = datasetModule.normalizeDataset(dataset)
    #datasetModule.postponeColumn(dataset, 2) # Shifts 'Survive' column to the last
    #removeColumn(dataset, 3)

    # ----- Separates testSet from dataset ------
    # The test rows are held out *before* the random split so that the row
    # classified below cannot also be part of a training subset (same order
    # as maxElementsInNodeTest).
    testSet = datasetModule.getTestSet(dataset,11)
    # Delete balanced testSet from original dataset
    dataset = [row for row in dataset if row not in testSet]
    testRow = testSet[0]

    print('***** Splits random subsets in order to create a random forest *****')
    subsets = datasetModule.randomSplit(dataset, 3)
    datasetModule.printDataSet(subsets)
    datasetModule.printDataSet(testSet)

    # variousTrees = random_forest.createDecisionTrees(subsets, decision_tree.buildTreeWithMaxElementsInNode, minNodes = 100)
    # for i in variousTrees:
    #    decision_tree.printtree(i)
    #
    # variousTreesPool = random_forest.createDecisionTreesPool(subsets, decision_tree.buildTreeWithMaxElementsInNode, processes = 5 ,minNodes = 100)
    # for i in variousTreesPool:
    #    decision_tree.printtree(i)

    forest = random_forest.createDecisionTreesMultiprocessing(subsets, decision_tree.buildTreeWithMaxElementsInNode, minNodes = 100)
    for tree in forest:
        decision_tree.printtree(tree)

    classificationResult = random_forest.classifyForestMultiprocessing(forest, testRow)
    print(classificationResult)
    print("The final classification is: {}".format(random_forest.getFinalResult(classificationResult)))
    #tree = decision_tree.buildTreeWithHeigth(dataset, maxHeigth=5)
    #decision_tree.printtree(tree)
    print("Decided the next row:")
    print(testRow)
示例#3
0
def maxElementsInNodeTest(columnToTestIndex=11, numberOfTrees=3, minNodes=15):
    """Build a forest from the wine-color CSV and count correct test-set answers.

    Reads ``training_data/wine-color.csv`` from the ``resources`` package,
    holds out a test set, builds trees from random subsets of the remaining
    rows and prints how many test rows the forest classified correctly.
    Nothing is returned.

    Args:
        columnToTestIndex: Column index passed to ``datasetModule.getTestSet``
            and used to read the expected answer from each test row.
        numberOfTrees: Number of subsets requested from
            ``datasetModule.randomSplit``.
        minNodes: Forwarded as ``minNodes`` to
            ``random_forest.createDecisionTreesMultiprocessing``.
    """
    import sys  # local import: only needed to pick the csv file mode

    # ----- Import csv file -----
    resource_package = 'resources'
    filename = 'wine-color.csv'
    resource_path = os.path.join('training_data', filename)
    filePath = pkg_resources.resource_filename(
        resource_package,
        resource_path)  # Gets path of file from another package

    # The csv module expects a binary file on Python 2 but a text file
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvFile = open(filePath, "rb")
    else:
        csvFile = open(filePath, newline="")
    # Read all rows while the file is open so the handle is always closed.
    with csvFile:
        dataset = list(csv.reader(csvFile))

    #  ---- Prepare dataset for analysing  ------
    del dataset[0]  # removes headers from dataset
    # NOTE(review): the return value is discarded here, whereas
    # decisionTreeMain assigns it back to ``dataset``. This is only correct
    # if normalizeDataset modifies the list in place - confirm.
    datasetModule.normalizeDataset(dataset)

    # ----- Separates testSet from Dataset ------
    testSet = datasetModule.getTestSet(dataset, columnToTestIndex, 10)
    # Delete balanced testSet from original dataset
    dataset = [row for row in dataset if row not in testSet]

    # Split dataSet into different subsets in order to create decision trees
    subsets = datasetModule.randomSplit(dataset, numberOfTrees)

    # Creates random Forest
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("***** Creating random forest with {} min nodes({}) trees *****".format(
        numberOfTrees, minNodes))
    forest = random_forest.createDecisionTreesMultiprocessing(
        subsets,
        decision_tree.buildTreeWithMaxElementsInNode,
        minNodes=minNodes)

    # Classify testSet against the forest
    rightAnswersCount = 0
    for testRow in testSet:
        classificationResult = random_forest.classifyForestMultiprocessing(
            forest, testRow)
        finalResult = random_forest.getFinalResult(classificationResult)
        if finalResult == testRow[columnToTestIndex]:
            rightAnswersCount += 1
    print("evaluated correctly {} out of {} tests".format(
        rightAnswersCount, len(testSet)))
    print("")  # blank separator line on both Python 2 and 3
def maxElementsInNodeTest(columnToTestIndex=11, numberOfTrees=3, minNodes=15):
    """Build a forest from the wine-color CSV and count correct test-set answers.

    Reads ``training_data/wine-color.csv`` from the ``resources`` package,
    holds out a test set, builds trees from random subsets of the remaining
    rows and prints how many test rows the forest classified correctly.
    Nothing is returned.

    Args:
        columnToTestIndex: Column index passed to ``datasetModule.getTestSet``
            and used to read the expected answer from each test row.
        numberOfTrees: Number of subsets requested from
            ``datasetModule.randomSplit``.
        minNodes: Forwarded as ``minNodes`` to
            ``random_forest.createDecisionTreesMultiprocessing``.
    """
    import sys  # local import: only needed to pick the csv file mode

    # ----- Import csv file -----
    resource_package = 'resources'
    filename = 'wine-color.csv'
    resource_path = os.path.join('training_data', filename)
    filePath = pkg_resources.resource_filename(resource_package,
                                               resource_path)  # Gets path of file from another package

    # The csv module expects a binary file on Python 2 but a text file
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvFile = open(filePath, "rb")
    else:
        csvFile = open(filePath, newline="")
    # Read all rows while the file is open so the handle is always closed.
    with csvFile:
        dataset = list(csv.reader(csvFile))

    #  ---- Prepare dataset for analysing  ------
    del dataset[0]  # removes headers from dataset
    # NOTE(review): the return value is discarded here, whereas
    # decisionTreeMain assigns it back to ``dataset``. This is only correct
    # if normalizeDataset modifies the list in place - confirm.
    datasetModule.normalizeDataset(dataset)

    # ----- Separates testSet from Dataset ------
    testSet = datasetModule.getTestSet(dataset, columnToTestIndex, 10)
    # Delete balanced testSet from original dataset
    dataset = [row for row in dataset if row not in testSet]

    # Split dataSet into different subsets in order to create decision trees
    subsets = datasetModule.randomSplit(dataset, numberOfTrees)

    # Creates random Forest
    print("***** Creating random forest with {} min nodes({}) trees *****".format(numberOfTrees, minNodes))
    forest = random_forest.createDecisionTreesMultiprocessing(subsets,
                                                              decision_tree.buildTreeWithMaxElementsInNode,
                                                              minNodes=minNodes)

    # Classify testSet against the forest
    rightAnswersCount = 0
    for testRow in testSet:
        classificationResult = random_forest.classifyForestMultiprocessing(forest, testRow)
        finalResult = random_forest.getFinalResult(classificationResult)
        if finalResult == testRow[columnToTestIndex]:
            rightAnswersCount += 1
    print("evaluated correctly {} out of {} tests".format(rightAnswersCount, len(testSet)))
    # print("") emits a blank line on both Python 2 and 3; a bare print()
    # would print "()" under Python 2.
    print("")