def decisionTreeMain():
    """Build a random forest from the red-wine CSV and classify one held-out row.

    Steps, in order:
      1. Locate ``training_data/wine-quality-red.csv`` inside the ``resources``
         package and read every CSV row into a list.
      2. Drop the header row and pass the rows through
         ``datasetModule.normalizeDataset``.
      3. Take a test set out of the data and remove those rows from the
         training data.
      4. Split the remaining rows into 3 random subsets, build the forest from
         them and print every tree.
      5. Classify the first test row against the forest and print the raw
         result, the final result and the row itself.

    Returns:
        None. Everything is reported on stdout.

    Raises:
        IndexError: if the CSV has no rows at all, or the test set comes back
            empty (``del dataset[0]`` / ``testSet[0]``).
    """
    import sys  # local import: only needed to choose a csv-compatible open mode

    # ----- Import csv file -----
    resource_package = 'resources'
    filename = 'wine-quality-red.csv'
    resource_path = os.path.join('training_data', filename)
    # Gets path of file from another package
    filePath = pkg_resources.resource_filename(resource_package, resource_path)

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3, so pick the mode that matches the runtime.
    if sys.version_info[0] >= 3:
        csvFile = open(filePath, newline='')
    else:
        csvFile = open(filePath, 'rb')
    # Read everything inside a context manager so the handle is always closed.
    with csvFile:
        dataset = list(csv.reader(csvFile))

    # ---- Prepare dataset for analysing ------
    del dataset[0]  # removes headers from dataset
    dataset = datasetModule.normalizeDataset(dataset)

    # ----- Separates testSet from dataset ------
    # Hold the test rows out BEFORE splitting; otherwise the row classified
    # below may also be part of the data the trees were built from.
    # NOTE(review): 11 is presumably the index of the class column - confirm
    # against datasetModule.getTestSet.
    testSet = datasetModule.getTestSet(dataset, 11)
    # Delete balanced testSet from original dataset
    dataset = [row for row in dataset if row not in testSet]
    testRow = testSet[0]

    print('***** Splits random subsets in order to create a random forest *****')
    subsets = datasetModule.randomSplit(dataset, 3)
    datasetModule.printDataSet(subsets)
    datasetModule.printDataSet(testSet)

    # Build the forest from the subsets and show each tree.
    forest = random_forest.createDecisionTreesMultiprocessing(
        subsets, decision_tree.buildTreeWithMaxElementsInNode, minNodes=100)
    for tree in forest:
        decision_tree.printtree(tree)

    # Classify the held-out row and report the outcome.
    classificationResult = random_forest.classifyForestMultiprocessing(
        forest, testRow)
    print(classificationResult)
    print("The final classification is: {}".format(
        random_forest.getFinalResult(classificationResult)))

    print("Decided the next row:")
    print(testRow)
def decisionTreeMain():
    """Build a random forest from the red-wine CSV and classify one held-out row.

    Steps, in order:
      1. Locate ``training_data/wine-quality-red.csv`` inside the ``resources``
         package and read every CSV row into a list.
      2. Drop the header row and pass the rows through
         ``datasetModule.normalizeDataset``.
      3. Take a test set out of the data and remove those rows from the
         training data.
      4. Split the remaining rows into 3 random subsets, build the forest from
         them and print every tree.
      5. Classify the first test row against the forest and print the raw
         result, the final result and the row itself.

    Returns:
        None. Everything is reported on stdout.

    Raises:
        IndexError: if the CSV has no rows at all, or the test set comes back
            empty (``del dataset[0]`` / ``testSet[0]``).
    """
    import sys  # local import: only needed to choose a csv-compatible open mode

    # ----- Import csv file -----
    resource_package = 'resources'
    filename = 'wine-quality-red.csv'
    resource_path = os.path.join('training_data', filename)
    # Gets path of file from another package
    filePath = pkg_resources.resource_filename(resource_package, resource_path)

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3, so pick the mode that matches the runtime.
    if sys.version_info[0] >= 3:
        csvFile = open(filePath, newline='')
    else:
        csvFile = open(filePath, 'rb')
    # Read everything inside a context manager so the handle is always closed.
    with csvFile:
        dataset = list(csv.reader(csvFile))

    # ---- Prepare dataset for analysing ------
    del dataset[0]  # removes headers from dataset
    dataset = datasetModule.normalizeDataset(dataset)

    # ----- Separates testSet from dataset ------
    # Hold the test rows out BEFORE splitting; otherwise the row classified
    # below may also be part of the data the trees were built from.
    # NOTE(review): 11 is presumably the index of the class column - confirm
    # against datasetModule.getTestSet.
    testSet = datasetModule.getTestSet(dataset, 11)
    # Delete balanced testSet from original dataset
    dataset = [row for row in dataset if row not in testSet]
    testRow = testSet[0]

    print('***** Splits random subsets in order to create a random forest *****')
    subsets = datasetModule.randomSplit(dataset, 3)
    datasetModule.printDataSet(subsets)
    datasetModule.printDataSet(testSet)

    # Build the forest from the subsets and show each tree.
    forest = random_forest.createDecisionTreesMultiprocessing(
        subsets, decision_tree.buildTreeWithMaxElementsInNode, minNodes=100)
    for tree in forest:
        decision_tree.printtree(tree)

    # Classify the held-out row and report the outcome.
    classificationResult = random_forest.classifyForestMultiprocessing(
        forest, testRow)
    print(classificationResult)
    print("The final classification is: {}".format(
        random_forest.getFinalResult(classificationResult)))

    print("Decided the next row:")
    print(testRow)
def maxElementsInNodeTest(columnToTestIndex=11, numberOfTrees=3, minNodes=15):
    """Build a forest from the wine-color CSV and report how many test rows it gets right.

    Reads ``training_data/wine-color.csv`` from the ``resources`` package,
    drops the header row, holds out a test set, builds the forest from random
    subsets of the remaining rows and classifies every test row. A row counts
    as correct when the forest's final result equals the row's value at
    ``columnToTestIndex``.

    Args:
        columnToTestIndex: Index of the column compared against the forest's
            answer; also forwarded to ``datasetModule.getTestSet``.
        numberOfTrees: Number of random subsets requested from
            ``datasetModule.randomSplit``.
        minNodes: Forwarded as ``minNodes`` to the tree builder.

    Returns:
        None. The score is printed to stdout.

    Raises:
        IndexError: if the CSV has no rows at all (``del dataset[0]``).
    """
    import sys  # local import: only needed to choose a csv-compatible open mode

    # ----- Import csv file -----
    resource_package = 'resources'
    filename = 'wine-color.csv'
    resource_path = os.path.join('training_data', filename)
    # Gets path of file from another package
    filePath = pkg_resources.resource_filename(resource_package, resource_path)

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3, so pick the mode that matches the runtime.
    if sys.version_info[0] >= 3:
        csvFile = open(filePath, newline='')
    else:
        csvFile = open(filePath, 'rb')
    # Read everything inside a context manager so the handle is always closed.
    with csvFile:
        dataset = list(csv.reader(csvFile))

    # ---- Prepare dataset for analysing ------
    del dataset[0]  # removes headers from dataset
    # NOTE(review): the return value is discarded here, so this only has an
    # effect if normalizeDataset works in place - confirm against datasetModule.
    datasetModule.normalizeDataset(dataset)

    # ----- Separates testSet from Dataset ------
    testSet = datasetModule.getTestSet(dataset, columnToTestIndex, 10)
    # Delete balanced testSet from original dataset
    dataset = [row for row in dataset if row not in testSet]

    # Split dataSet into different subsets in order to create decision trees
    subsets = datasetModule.randomSplit(dataset, numberOfTrees)

    # Creates random Forest
    # NOTE(review): the placeholders receive (numberOfTrees, minNodes), so the
    # message reads "<trees> min nodes(<minNodes>) trees"; text left untouched.
    print("***** Creating random forest with {} min nodes({}) trees *****".format(
        numberOfTrees, minNodes))
    forest = random_forest.createDecisionTreesMultiprocessing(
        subsets, decision_tree.buildTreeWithMaxElementsInNode, minNodes=minNodes)

    # Classify testSet against the forest
    rightAnswersCount = 0
    for testRow in testSet:
        classificationResult = random_forest.classifyForestMultiprocessing(
            forest, testRow)
        finalResult = random_forest.getFinalResult(classificationResult)
        if finalResult == testRow[columnToTestIndex]:
            rightAnswersCount += 1

    # Single-argument print(...) calls behave the same as a Python 2 print
    # statement and as the Python 3 print function.
    print("evaluated correctly {} out of {} tests".format(
        rightAnswersCount, len(testSet)))
    print("")  # blank separator line
def maxElementsInNodeTest(columnToTestIndex=11, numberOfTrees=3, minNodes=15):
    """Build a forest from the wine-color CSV and report how many test rows it gets right.

    Reads ``training_data/wine-color.csv`` from the ``resources`` package,
    drops the header row, holds out a test set, builds the forest from random
    subsets of the remaining rows and classifies every test row. A row counts
    as correct when the forest's final result equals the row's value at
    ``columnToTestIndex``.

    Args:
        columnToTestIndex: Index of the column compared against the forest's
            answer; also forwarded to ``datasetModule.getTestSet``.
        numberOfTrees: Number of random subsets requested from
            ``datasetModule.randomSplit``.
        minNodes: Forwarded as ``minNodes`` to the tree builder.

    Returns:
        None. The score is printed to stdout.

    Raises:
        IndexError: if the CSV has no rows at all (``del dataset[0]``).
    """
    import sys  # local import: only needed to choose a csv-compatible open mode

    # ----- Import csv file -----
    resource_package = 'resources'
    filename = 'wine-color.csv'
    resource_path = os.path.join('training_data', filename)
    # Gets path of file from another package
    filePath = pkg_resources.resource_filename(resource_package, resource_path)

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3, so pick the mode that matches the runtime.
    if sys.version_info[0] >= 3:
        csvFile = open(filePath, newline='')
    else:
        csvFile = open(filePath, 'rb')
    # Read everything inside a context manager so the handle is always closed.
    with csvFile:
        dataset = list(csv.reader(csvFile))

    # ---- Prepare dataset for analysing ------
    del dataset[0]  # removes headers from dataset
    # NOTE(review): the return value is discarded here, so this only has an
    # effect if normalizeDataset works in place - confirm against datasetModule.
    datasetModule.normalizeDataset(dataset)

    # ----- Separates testSet from Dataset ------
    testSet = datasetModule.getTestSet(dataset, columnToTestIndex, 10)
    # Delete balanced testSet from original dataset
    dataset = [row for row in dataset if row not in testSet]

    # Split dataSet into different subsets in order to create decision trees
    subsets = datasetModule.randomSplit(dataset, numberOfTrees)

    # Creates random Forest
    # NOTE(review): the placeholders receive (numberOfTrees, minNodes), so the
    # message reads "<trees> min nodes(<minNodes>) trees"; text left untouched.
    print("***** Creating random forest with {} min nodes({}) trees *****".format(
        numberOfTrees, minNodes))
    forest = random_forest.createDecisionTreesMultiprocessing(
        subsets, decision_tree.buildTreeWithMaxElementsInNode, minNodes=minNodes)

    # Classify testSet against the forest
    rightAnswersCount = 0
    for testRow in testSet:
        classificationResult = random_forest.classifyForestMultiprocessing(
            forest, testRow)
        finalResult = random_forest.getFinalResult(classificationResult)
        if finalResult == testRow[columnToTestIndex]:
            rightAnswersCount += 1

    print("evaluated correctly {} out of {} tests".format(
        rightAnswersCount, len(testSet)))
    # print("") emits a blank line on both Python 2 and 3, whereas a bare
    # print() would output "()" when run as a Python 2 print statement.
    print("")