def main():
    trainSetFilename, testSetFilename, m = getArgs()
    trainData = arffParser.parse(trainSetFilename)
    testData = arffParser.parse(testSetFilename)
    data = trainData['data']
    attr = trainData['attributes']
    targetIndex = len(trainData['attributes']) - 1
    tree = DecisionTree(data, attr, targetIndex, m)
    if tree.root is None:
        return
    tree.printTree()
    print('<Predictions for the Test Set Instances>')
    predictedData = []
    for row in testData['data']:
        predictedClass = tree.classify(row, testData['attributes'])
        # Keep the actual class (last column) alongside the prediction
        predictedRow = row[-1:] + [predictedClass]
        predictedData.append(predictedRow)
    numCorrect = 0
    for i, row in enumerate(predictedData, start=1):
        print('{0}: Actual: {1} Predicted: {2}'.format(i, row[0], row[1]))
        if row[0] == row[1]:
            numCorrect += 1
    print('Number of correctly classified: ' + str(numCorrect),
          'Total number of test instances: ' + str(len(predictedData)))
def main():
    # Check if the number of command line arguments is correct
    if len(sys.argv) < 6:
        print("-- python --|-- main.py --|-- L --|-- K --|-- training_set.csv --|-- validation_set.csv --|-- test_set.csv --|")
        sys.exit(1)

    # The program takes two integers, L and K, as input to prune the decision tree
    L = int(sys.argv[1])
    K = int(sys.argv[2])

    # Get the file paths of the training, validation and test data
    dataDir = './data1/'
    training_set = dataDir + sys.argv[3]
    validation_set = dataDir + sys.argv[4]
    test_set = dataDir + sys.argv[5]

    # Build a decision tree on the training data
    decisionTree = DecisionTree(training_set)
    #############################################
    # decisionTree.exportTree('tree.txt')
    # print(decisionTree)
    #############################################

    # Create a validator using test data to calculate the prediction
    # accuracy of a given decision tree
    validator = Validator(test_set)
    # Calculate the prediction accuracy of the original decision tree on the test data
    validator.calculateAccuracy(decisionTree.root)
    # Display the prediction accuracy before pruning
    print("\nA decision tree is fully grown to fit the training data.")
    validator.displayAccuracy()

    # Post-prune the decision tree
    print("\nPost pruning", '.' * 30)
    print("L =", L, ", K =", K, ", the pruned decision tree is:\n")
    # Prune the original decision tree using L, K and the validation data as inputs
    decisionTree.pruneTree(L, K, validation_set)
    ##############################################
    # decisionTree.exportTree('pruned_tree.txt')
    ##############################################
    # Print the decision tree to standard output
    # (__str__ is overridden in the DecisionTree class)
    print(decisionTree)
    # Calculate the prediction accuracy of the pruned decision tree on the test data
    validator.calculateAccuracy(decisionTree.root)
    # Display the prediction accuracy after pruning
    validator.displayAccuracy()
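# Example invocation, given the argument parsing above (data files are
# resolved relative to ./data1/); L and K are the integer pruning
# parameters passed to pruneTree:
#
#   python main.py 5 10 training_set.csv validation_set.csv test_set.csv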
def train(self, data, labels):
    self.data = data
    self.labels = labels
    for i in range(self.num_trees):
        # Draw a bootstrap sample (with replacement) for each tree
        sample_index = np.random.choice(self.data.shape[0], self.num_sample, replace=True)
        train_data = self.data[sample_index, :]
        train_labels = self.labels[sample_index]
        tree = DecisionTree(self.max_depth, self.num_feature)
        tree.train(train_data, train_labels)
        self.trees.append(tree)
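# train() above only grows the ensemble; prediction would normally aggregate
# the trees by majority vote. A minimal sketch, assuming DecisionTree.predict
# returns a 1-D array of non-negative integer labels (this method is not part
# of the original source):
def predict(self, data):
    # votes has shape (num_trees, num_rows): one row of labels per tree
    votes = np.array([tree.predict(data) for tree in self.trees])
    # Majority vote over the trees, per sample
    return np.array([np.bincount(votes[:, j]).argmax()
                     for j in range(votes.shape[1])])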
def test_all_file(self):
    options = {
        'df': pd.read_csv("benchmark.csv", sep=';'),
        'label_column': "Joga"
    }
    tr = DecisionTree()
    model = tr.train(options)
    for _, row in options['df'].iterrows():
        target_label = row["Joga"]
        predicted = model.predict(row.drop("Joga"))
        self.assertEqual(target_label, predicted)
def test_benchmark(self):
    options = {
        'df': pd.read_csv("benchmark.csv", sep=';'),
        'label_column': "Joga"
    }
    tr = DecisionTree()
    model = tr.train(options)
    inf_data = pd.Series(
        ["Ensolarado", "Quente", "Normal", "Verdadeiro"],
        index=["Tempo", "Temperatura", "Umidade", "Ventoso"],
        name="InferenceData")
    self.assertEqual(model.predict(inf_data), 'Sim')
def testSimpleCase(self):
    tree = DecisionTree()
    tree.fit(simpleData)
    for datum in simpleData:
        self.assertEqual(datum[-1], tree.predict(datum))
    tree.print()
def fit(self, dataset):
    self.forest = []
    for i in range(self.num_trees):
        # Generate a random subset of the dataset to train the tree on
        subset = [
            dataset[random.randrange(0, len(dataset))]
            for a in range(self.num_samples)
        ]
        self.forest.append(DecisionTree(self.max_depth).fit(subset))
    return self
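# fit() above only builds the ensemble; a matching predict would typically
# take a majority vote across the trees. A minimal sketch, assuming each
# fitted tree's predict(sample) returns a single label (no such method is
# shown in the original source):
from collections import Counter

def predict(self, sample):
    # Collect one vote per tree and return the most common label
    votes = [tree.predict(sample) for tree in self.forest]
    return Counter(votes).most_common(1)[0][0]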
def train(self, options):
    """
    Train a random forest using n_trees decision trees.

    options['df']: pandas DataFrame
    options['n_trees']: number of trees
    options['label_column']: label column to be predicted
    options['bootstrap_size']: size of the bootstrap; entries not drawn
        into the bootstrap are ignored
    """
    num_trees = options['n_trees']
    df = options['df']
    bootstrap_size = options['bootstrap_size']
    tree_options = {
        'label_column': options['label_column']
    }
    for i in range(num_trees):
        # Each tree is trained on its own bootstrap sample of the data
        tree_options['df'] = get_bootstrap(df, bootstrap_size)
        new_tree = DecisionTree()
        self.ensemble.append(new_tree.train(tree_options))
    return self
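# get_bootstrap is called above but not defined in this snippet. A minimal
# sketch, assuming it draws bootstrap_size rows with replacement (standard
# bagging); the actual helper may differ:
def get_bootstrap(df, size):
    # pandas' sample() with replace=True yields a bootstrap sample; rows
    # that are never drawn are simply left out of this tree's training set
    return df.sample(n=size, replace=True).reset_index(drop=True)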
cat_data_test = np.array(cat_data_test).T
cat_data_test = np.array(cat_data_test, dtype='float')

# Zip categorical and non-categorical data together
train_data = np.concatenate((cat_data, non_cat_data), axis=1)
train_label = data[:, -1].astype(int)
validation_data = train_data[:200, :]
validation_label = train_label[:200]
# Note: the full set (validation rows included) is kept for training here
train_data = train_data[:, :]
train_label = train_label[:]
test_data = np.concatenate((cat_data_test, non_cat_data_test), axis=1)

# Decision tree
tree = DecisionTree(5, train_data.shape[0])
tree.train(train_data, train_label)
res = tree.predict(validation_data)
score = 0
for i in range(len(res)):
    if res[i] == validation_label[i]:
        score += 1
score /= len(res)
print(score)

# Random forest
forest = RandomForest(100, 5, train_data.shape[0], 6)
forest.train(train_data, train_label)
res = forest.predict(validation_data)
def read(filename):
    # Parse the Connect-4 data: board cells b/x/o -> ints, outcome -> class id
    data = []
    fin = open(filename)
    label2id = {'loss': 0, 'draw': 1, 'win': 2}
    p2f = {'b': 0, 'x': 1, 'o': 2}
    for line in fin:
        line = line.strip().split(',')
        label = label2id[line[-1]]
        feature = np.array([p2f[p] for p in line[:-1]])
        data.append((feature, label))
    fin.close()
    return data


data = read('connect-4.data')
x = np.array([d[0] for d in data])
y = np.array([d[1] for d in data])
# y = label_binarize(y, classes=list(range(3)))

kf = KFold(n_splits=5, shuffle=True)
all_f1 = []
for train_index, test_index in kf.split(x):
    x_train, y_train = x[train_index], y[train_index]
    x_test, y_test = x[test_index], y[test_index]
    # x_train, x_test, y_train, y_test = train_test_split(x, y)
    # print('training')
    # model = OneVsRestClassifier(SVC(kernel='rbf'))
    model = DecisionTree()
    model.fit(x_train, y_train)
    # print('testing')
    y_pred = model.predict(x_test)
    all_f1.append(f1_score(y_test, y_pred, average='macro'))
# Average macro-F1 over the 5 folds
print(sum(all_f1) / 5)
__author__ = 'zephyrYin'

from decisionTree import DecisionTree

dT = DecisionTree('data/featnames.csv', 'data/trainfeat.csv', 'data/trainlabs.csv',
                  'data/testfeat.csv', 'data/testlabs.csv', 0.01)
# dT = DecisionTree('data/weatherFeatureName.csv', 'data/weatherTrainFeature.csv', 'data/weatherTrainLabel.csv', 'data/weatherTrainFeature.csv', 'data/weatherTrainLabel.csv', 1)
dT.buildTree()
dT.predictTestSet()
result = dT.evaluate(dT.contrastResult[0], dT.contrastResult[1])
print(result)
print(str(dT.nodeCnt) + ' nodes')

# dT.readTrainFeatures()
# wholeCnt = len(dT.testFeatures)
# posCnt = dT.countPositive(dT.testFeatures)
# print(wholeCnt)
# print(posCnt)
# print(posCnt / float(wholeCnt))

# dT.readFeatureNames()
# dT.readTrainFeatures()
# dT.readTestFeatures()
#
# for testFea in dT.testFeatures:
#     for i in range(len(testFea) - 1):
#         if testFea[i] not in dT.featureValue[i]:
#             print(i)
#             print(str(testFea[i]) + ' not in ' + str(dT.featureValue[i]))
cat_data_test = np.array(cat_data_test, dtype='int')

# Zip categorical and non-categorical data together
train_data = np.concatenate((cat_data, non_cat_data), axis=1)
train_label = data[:, -1].astype(int)
validation_data = train_data[:6000, :]
validation_label = train_label[:6000]
train_data = train_data[6000:, :]
train_label = train_label[6000:]
test_data = np.concatenate((cat_data_test, non_cat_data_test), axis=1)

# Plot validation accuracy as a function of tree depth
accuracy = []
for i in range(40):
    tree = DecisionTree(i, 105)
    tree.train(train_data, train_label)
    res = tree.predict(validation_data)
    score = 0
    # Use j here so the depth loop variable i is not shadowed
    for j in range(len(res)):
        if res[j] == validation_label[j]:
            score += 1
    score /= len(res)
    accuracy.append(score)

plt.plot(accuracy)
plt.xlabel('depth')
plt.ylabel('accuracy')
plt.title('Accuracy vs. Decision Tree Depth')
plt.savefig('p6.png')
plt.show()
def testComplexCase(self):
    tree = DecisionTree()
    tree.fit(complexData)
    for datum in complexData:
        self.assertEqual(datum[-1], tree.predict(datum))

    # Overcast = YES
    self.assertEqual(
        Label.YES,
        tree.predict([Outlook.Overcast, Temperature.Hot, Humidity.High, Wind.Weak]))
    self.assertEqual(
        Label.YES,
        tree.predict([Outlook.Overcast, Temperature.Cool, Humidity.Normal, Wind.Strong]))

    # Sunny + Normal = YES
    self.assertEqual(
        Label.YES,
        tree.predict([Outlook.Sunny, Temperature.Cool, Humidity.Normal, Wind.Strong]))
    self.assertEqual(
        Label.YES,
        tree.predict([Outlook.Sunny, Temperature.Hot, Humidity.Normal, Wind.Weak]))

    # Sunny + High = NO
    self.assertEqual(
        Label.NO,
        tree.predict([Outlook.Sunny, Temperature.Cool, Humidity.High, Wind.Strong]))
    self.assertEqual(
        Label.NO,
        tree.predict([Outlook.Sunny, Temperature.Hot, Humidity.High, Wind.Weak]))

    # Rain + Weak = YES
    self.assertEqual(
        Label.YES,
        tree.predict([Outlook.Rain, Temperature.Cool, Humidity.Normal, Wind.Weak]))
    self.assertEqual(
        Label.YES,
        tree.predict([Outlook.Rain, Temperature.Hot, Humidity.High, Wind.Weak]))

    # Rain + Strong = NO
    self.assertEqual(
        Label.NO,
        tree.predict([Outlook.Rain, Temperature.Cool, Humidity.Normal, Wind.Strong]))
    self.assertEqual(
        Label.NO,
        tree.predict([Outlook.Rain, Temperature.Hot, Humidity.High, Wind.Strong]))

    tree.print()
def run(self):
    dt = DecisionTree(self.pathname, self.G)
    dt.dtree()
    f_score = f1_score(y_test, pred, pos_label='grapefruit')
    # print('Confusion matrix:\n', cm)
    # print('Accuracy: {}\nPrecision: {}\nRecall: {}\nF1_score: {}'.format(
    #     acc, precision, recall, f_score))
    return acc, precision, recall, f_score


# Load data
df = pd.read_csv("citrus.csv")

# Split columns into features and target
X = df.drop("name", axis=1)
y = df["name"].values

# Initialize classifiers
clf1 = DecisionTree()
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(kernel='rbf', probability=True)
clf4 = GaussianNB()

test_cases = [[clf1], [clf2], [clf3], [clf4],
              [clf1, clf2, clf3, clf4],
              [clf2, clf3, clf4]]

test_number = 1
for classifiers in test_cases:
    accuracy = []
    precision = []
    recall = []
    f_score = []
def test():
    X = [[1, 2, 0, 1, 0],
         [0, 1, 1, 0, 1],
         [1, 0, 0, 0, 1],
         [2, 1, 1, 0, 1],
         [1, 1, 0, 1, 1]]
    y = ['yes', 'yes', 'no', 'no', 'no']
    decision_tree = DecisionTree(mode='C4.5')
    decision_tree.fit(X, y)
    res = decision_tree.predict(X)
    print(res)
    # Round-trip the model through disk and check it still predicts
    model_name = 'test.dt'
    decision_tree.saveModel(model_name)
    new_tree = DecisionTree()
    new_tree.loadModel(model_name)
    print(new_tree.predict(X))
import matplotlib.pyplot as plt
import numpy as np

from loadData import LoadData
from decisionTree import Node, DecisionTree, Evaluate
from inspection import Inspection

if __name__ == '__main__':
    train_input = '../handout/education_train.tsv'
    test_input = '../handout/education_test.tsv'
    train_output = '../result/education_train.labels'
    test_output = '../result/education_test.labels'

    ld = LoadData()
    dataset = ld.load_data(train_input)
    dt = DecisionTree(ld)
    tr_err = []
    te_err = []
    x_arr = []
    print(ld.head)
    for i in range(len(ld.head)):
        root = dt.construct(dataset, i)
        # dt.traverse(root)
        dt.classify(ld.load_data(train_input), root, train_output)
        dt.classify(ld.load_data(test_input), root, test_output)
        with open(train_output, 'r') as f:
            predcol = f.read().splitlines()
        realcol = np.loadtxt(train_input, dtype=str, delimiter='\t', skiprows=1)[:, -1]
from decisionTree import DecisionTree
from decisionTreePlot import DecisionTreePlot

if __name__ == "__main__":
    trainingSetFile = open('SampleSets/training_set.csv')
    trainingSetData = trainingSetFile.readlines()
    trainingSetFile.close()

    decisionTree = DecisionTree(trainingSetData)

    # Algorithm type - example
    decisionTree.C45()
    # decisionTree.ID3()
    # decisionTree.ID3K(2)
    # decisionTree.SID3()
    # decisionTree.LSID3(3)
    # decisionTree.LSID3PathSample(2)
    # decisionTree.LSID3MC(1, 0.1)
    # decisionTree.BLSID3(1)
    # decisionTree.BLSID3PathSample(1)
    # decisionTree.LSID3Sequenced(2)
    # decisionTree.IIDT(10, 0.5)

    print("****Tree Data BEFORE Pruning****")
    print("Tree Size - Number of Nodes:", decisionTree.size())
    print("Number of Leafs:", decisionTree.getNumLeafs())
    print("Tree Depth:", decisionTree.getTreeDepth())

    testSetFile = open('SampleSets/test_set.csv')
    testSetData = testSetFile.readlines()
    testSetFile.close()

    print("Prediction:", str(decisionTree.predict(testSetData) * 100) + "%")

    # decisionTreePlot = DecisionTreePlot()
    # decisionTreePlot.createDecisionTreePlot(decisionTree)
from fileIO import FileIO
from preprocess import Preprocessing
from decisionTree import DecisionTree

if __name__ == '__main__':
    filename = 'house-votes-84.data.txt'
    fileio = FileIO()
    data = fileio.read_csv(filename)
    preprocessing = Preprocessing()
    preprocessing.assume_missing_values(data)

    for percent in range(3, 8):
        training_data, testing_data = preprocessing.split_into_training_and_testing(
            data, percent / float(10))
        attributes_number = len(training_data[0]) - 1
        decision_tree = DecisionTree()
        root_node = decision_tree.build(training_data)
        # decision_tree.print()
        # print("Classification: ")
        accuracy = 0
        for row in testing_data:
            classified = decision_tree.classify(row, decision_tree.root)
            classified.calc_percentages(len(testing_data))
            if (classified.republicans_percent > 50.0 and row[0] == 'republican') or (
                    classified.democrats_percent > 50.0 and row[0] == 'democrat'):
                accuracy += 1
        accuracy = accuracy / float(len(testing_data))
        print("Accuracy using training data", percent / float(10) * 100, "% is: ", accuracy)
    # Population standard deviation of the per-run accuracies
    std_dev = 0
    for a in accuracies:
        std_dev += (a - average_accuracy) ** 2
    std_dev = math.sqrt(1 / len(accuracies) * std_dev)
    print("Accuracy: {}, Std dev: {}".format(average_accuracy, std_dev))
    return (average_accuracy, std_dev)


"""
Main program. Uses decisionTree.py as a support program to classify data.
"""
values = [list(a) for a in skset.data]
targets = [int(a) for a in skset.target]
dataset = [Data(values[i], targets[i]) for i in range(len(values))]
# print([str(a) for a in dataset])
# dataset = d.processFile('data/digitsModified.txt')

if algorithm == Algorithm.DECISION_TREE:
    d = DecisionTree(max_depth=max_depth)
    # tree = d.fit(dataset)
    check_accuracy(dt=d, dataset=dataset, num_repeats=10)
elif algorithm == Algorithm.RANDOM_FOREST:
    forest = Forest(max_depth=max_depth, num_trees=num_trees, num_samples=num_samples)
    # f = forest.fit(dataset)
    check_accuracy(dt=forest, dataset=dataset, num_repeats=10)