def test_DT(self):
    """Smoke-test training a 10-tree RandomForest on the mushrooms data.

    Loads ``data/mushrooms_train.data`` through ``load_data`` and trains the
    forest on the returned records and attributes. Nothing is asserted; the
    check is only that training completes without raising.
    """
    records, attributes = load_data("data/mushrooms_train.data")
    # The original loaded the same file a second time into an unused
    # ``test_records`` local; that redundant read has been removed.
    forest = RandomForest(tree_num=10)
    forest.train(records, attributes)
def predict_test_data():
    """Train a random forest, report accuracy, and write test predictions.

    Trains a 250-tree, depth-7 forest on the module-level training set,
    prints the training and validation accuracy, then predicts the first
    ``TEST_SIZE`` rows of ``testing_data`` and writes them to
    ``titanic_1.csv`` as ``Id,Category`` rows with 1-based ids.
    """
    forest = RandomForest(num_trees=250, max_depth=7,
                          categorical_vars=cat_set)
    forest.train(training_data, training_labels)

    def accuracy(points, labels, count):
        # Fraction of the first ``count`` points the forest labels correctly.
        hits = sum(1 for i in range(count)
                   if forest.predict(points[i]) == labels[i])
        return hits / count

    print("Training Accuracy: "
          + str(accuracy(training_data, training_labels,
                         num_training_points)))
    print("Validation Accuracy: "
          + str(accuracy(validation_data, validation_labels,
                         num_validation_points)))

    # Fix: the original called ``tree.predict`` here, but this function never
    # creates a ``tree``; the model trained and evaluated above is ``forest``.
    guesses = [int(forest.predict(testing_data[i])) for i in range(TEST_SIZE)]

    with open('titanic_1.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Id', 'Category'])
        # Ids in the output file are 1-based.
        writer.writerows(enumerate(guesses, start=1))
def get_frequent_splits():
    """Print the splits reported by ``most_frequent_first_splits``.

    Trains a 100-tree, depth-2 forest on the module-level training set and
    prints one line per reported item in the form
    ``<feature name> < <value> (<frequency> trees)``.
    """
    forest = RandomForest(num_trees=100, max_depth=2)
    forest.train(training_data, training_labels)

    # Each item is ((feature_index, split_value), frequency).
    for (feature, value), frequency in forest.most_frequent_first_splits():
        pieces = [feature_names[feature], ' < ', str(value),
                  ' (', str(frequency), ' trees)']
        print("".join(pieces))
def random_forests_classification(X, y, test_dat):
    """Train a random forest on ``(X, y)`` and write predictions to CSV.

    The forest is built with 20 as its first argument, the rounded square
    root of the number of columns of ``X`` as its second, and the number of
    rows of ``X`` as its third. Predictions for ``test_dat`` are written to
    ``census_predictions_random_forest.csv`` as ``Id,Category`` rows with
    1-based ids.

    Args:
        X: 2-D training data; its sizes along axes 0 and 1 are read.
        y: Training labels, passed straight to ``RandomForest.train``.
        test_dat: Data to predict; its size along axis 0 sets the row count.
    """
    n_rows = np.size(X, 0)
    n_cols = np.size(X, 1)
    classifier = RandomForest(20, round(math.sqrt(n_cols)), n_rows)
    # Alternative configuration kept from the original for reference:
    # classifier = RandomForest(1, round(math.sqrt(np.size(X, 1))), 100, 45)
    classifier.train(X, y)
    y_hat = classifier.predict(test_dat)

    # A context manager closes the file even if a write raises; the original
    # used a bare open()/close() pair.
    with open("census_predictions_random_forest.csv", 'w') as f:
        f.write("Id,Category\n")
        for i in range(np.size(test_dat, 0)):
            # y_hat is indexed as a 2-D array with the label in column 0.
            f.write(str(i + 1) + "," + str(int(y_hat[i, 0])) + "\n")
    print("DONE")
def get_frequent_splits():
    """Print the splits reported by ``most_frequent_first_splits``.

    Trains a 100-tree, depth-2 forest (with ``cat_set`` as its categorical
    variables) and prints one line per reported item. Features in ``cat_set``
    are shown as ``<name> is <decoded value>``; all others as
    ``<name> < <value>``. Each line ends with ``(<frequency> trees)``.
    """
    forest = RandomForest(num_trees=100, max_depth=2,
                          categorical_vars=cat_set)
    forest.train(training_data, training_labels)

    # Each item is ((feature_index, split_value), frequency).
    for (feature, value), frequency in forest.most_frequent_first_splits():
        if feature in cat_set:
            # inverse_list is indexed by the feature's offset past the first
            # CONTINUOUS_FEATURES columns, then by the split value.
            shown = inverse_list[feature - CONTINUOUS_FEATURES][value]
            word = ' is '
        else:
            shown = value
            word = ' < '
        pieces = [feature_names[feature], word, str(shown),
                  ' (', str(frequency), ' trees)']
        print("".join(pieces))
def train(rf):
    '''
    Trains a random forest on the data returned by generateTrainData() and
    pickles the trained forest to the file path given by rf.
    '''
    testForest = RandomForest(generateTrainData())

    print("Training")
    testForest.train()
    print("Done!")

    # Binary mode is required for pickle output.
    with open(rf, 'wb') as model_file:
        cPickle.dump(testForest, model_file)
    print('randomForest model saved to: ' + rf)
def classify_with_random_forest():
    """Train a 250-tree, depth-7 forest and print its accuracy.

    Prints the training accuracy first, then the validation accuracy, each
    as the fraction of points whose predicted label equals the true label.
    """
    forest = RandomForest(num_trees=250, max_depth=7,
                          categorical_vars=cat_set)
    forest.train(training_data, training_labels)

    def count_hits(points, labels, count):
        # Number of the first ``count`` points predicted correctly.
        return sum(1 for i in range(count)
                   if forest.predict(points[i]) == labels[i])

    train_hits = count_hits(training_data, training_labels,
                            num_training_points)
    print("Training Accuracy: " + str(train_hits / num_training_points))

    validation_hits = count_hits(validation_data, validation_labels,
                                 num_validation_points)
    print("Validation Accuracy: "
          + str(validation_hits / num_validation_points))
def graph_accuracy():
    """Plot validation accuracy against the number of trees in the forest.

    Trains depth-10 forests with 5, 10, ..., 40 trees, measures each one's
    accuracy on the validation set, prints the tree count as progress, and
    finally shows a line plot of accuracy versus tree count.
    """
    tree_counts = []
    accuracies = []

    for n_trees in range(5, 41, 5):
        forest = RandomForest(num_trees=n_trees, max_depth=10,
                              categorical_vars=cat_set)
        forest.train(training_data, training_labels)

        hits = sum(1 for i in range(num_validation_points)
                   if forest.predict(validation_data[i])
                   == validation_labels[i])
        accuracies.append(hits / num_validation_points)
        tree_counts.append(n_trees)

        # Progress output, flushed so it appears while training continues.
        print(n_trees)
        sys.stdout.flush()

    plt.figure()
    plt.plot(tree_counts, accuracies)
    plt.title("Census Accuracy For Random Forest")
    plt.ylabel("Accuracy Rate")
    plt.xlabel("Number of Trees")
    plt.show()
def crossValidationPositions():
    '''
    Performs 10 fold cross validation on the total joint position dataset.

    Every fold's predictions are accumulated into a single confusion matrix,
    which is printed once after all folds have been evaluated.
    '''
    theData = generateAllPositionTrainingData()
    # The returned means / standard deviations are not needed in this
    # function, so they are no longer bound to unused locals.
    # NOTE(review): presumably normalizeData() normalizes theData in place -
    # confirm against TrainingData.
    theData.normalizeData()

    k = 10
    # Partition the data into k subsets.
    dataSets = theData.getKSegments(k)

    # For each of the k subsets: leave it out, train on the other k - 1,
    # test on the one left out and record the results.
    results = confusionMatrix(labels)
    # range() and print() replace the Python-2-only xrange / print statement;
    # both forms behave the same on Python 2 for this use.
    for i in range(k):
        print(i)

        # Testing set
        testSet = dataSets[i]

        # Build the training set from a deep copy of all folds except fold i.
        trainingSet = TrainingData("CrossVal")
        trainingList = copy.deepcopy(dataSets)
        trainingList.pop(i)
        for elem in trainingList:
            trainingSet.combineWithNewData(elem)

        # Train the classifier on the training set.
        testForest = RandomForest(trainingSet)
        testForest.train()

        # Evaluate the classifier on the test set.
        for samp in testSet.getData():
            resultLabel = testForest.classify(samp)
            trueLabel = samp.getLabel()
            results.update(trueLabel, resultLabel)

    results.printMatrix()
def crossValidationPositions():
    '''
    Performs 10 fold cross validation on the total joint position dataset.

    Every fold's predictions are accumulated into a single confusion matrix,
    which is printed once after all folds have been evaluated.
    '''
    theData = generateAllPositionTrainingData()
    # The returned means / standard deviations are not needed in this
    # function, so they are no longer bound to unused locals.
    # NOTE(review): presumably normalizeData() normalizes theData in place -
    # confirm against TrainingData.
    theData.normalizeData()

    k = 10
    # Partition the data into k subsets.
    dataSets = theData.getKSegments(k)

    # For each of the k subsets: leave it out, train on the other k - 1,
    # test on the one left out and record the results.
    results = confusionMatrix(labels)
    # range() and print() replace the Python-2-only xrange / print statement;
    # both forms behave the same on Python 2 for this use.
    for i in range(k):
        print(i)

        # Testing set
        testSet = dataSets[i]

        # Build the training set from a deep copy of all folds except fold i.
        trainingSet = TrainingData("CrossVal")
        trainingList = copy.deepcopy(dataSets)
        trainingList.pop(i)
        for elem in trainingList:
            trainingSet.combineWithNewData(elem)

        # Train the classifier on the training set.
        testForest = RandomForest(trainingSet)
        testForest.train()

        # Evaluate the classifier on the test set.
        for samp in testSet.getData():
            resultLabel = testForest.classify(samp)
            trueLabel = samp.getLabel()
            results.update(trueLabel, resultLabel)

    results.printMatrix()
def twoVsOneAngles():
    '''
    Trains a random forest on the data from participants 1 and 2 and tests
    it on participant 3. The data used here uses the angle features.

    Results are accumulated in a confusion matrix that is printed at the end.
    '''
    theData = generateTwoAngleTrainingData()
    testForest = RandomForest(theData)

    # print() with a single argument behaves the same on Python 2 and 3; the
    # original print statements were a syntax error on Python 3.
    print("Training")
    testForest.train()
    print("Done!")

    testList = generateOneTestAngleData()
    results = confusionMatrix(labels)
    for samp in testList:
        resultLabel = testForest.classify(samp)
        trueLabel = samp.getLabel()
        results.update(trueLabel, resultLabel)

    results.printMatrix()
def twoVsOneAngles():
    '''
    Trains a random forest on the data from participants 1 and 2 and tests
    it on participant 3. The data used here uses the angle features.

    Results are accumulated in a confusion matrix that is printed at the end.
    '''
    theData = generateTwoAngleTrainingData()
    testForest = RandomForest(theData)

    # print() with a single argument behaves the same on Python 2 and 3; the
    # original print statements were a syntax error on Python 3.
    print("Training")
    testForest.train()
    print("Done!")

    testList = generateOneTestAngleData()
    results = confusionMatrix(labels)
    for samp in testList:
        resultLabel = testForest.classify(samp)
        trueLabel = samp.getLabel()
        results.update(trueLabel, resultLabel)

    results.printMatrix()
def oneVsTwoPositions():
    '''
    Trains a random forest on the data from participant 1 and tests it on
    participant 2 and 3. The data used here uses the position features.

    The means and standard deviations returned when normalizing the training
    data are passed on to the test-data generator. Results are accumulated
    in a confusion matrix that is printed at the end.
    '''
    theData = generateOneTrainPositionData()
    means, stdDevs = theData.normalizeData()
    testForest = RandomForest(theData)

    # print() with a single argument behaves the same on Python 2 and 3; the
    # original print statements were a syntax error on Python 3.
    print("Training")
    testForest.train()
    print("Done!")

    testList = generateTwoTestPositionData(means, stdDevs)
    results = confusionMatrix(labels)
    for samp in testList:
        resultLabel = testForest.classify(samp)
        trueLabel = samp.getLabel()
        results.update(trueLabel, resultLabel)

    results.printMatrix()
def oneVsTwoPositions():
    '''
    Trains a random forest on the data from participant 1 and tests it on
    participant 2 and 3. The data used here uses the position features.

    The means and standard deviations returned when normalizing the training
    data are passed on to the test-data generator. Results are accumulated
    in a confusion matrix that is printed at the end.
    '''
    theData = generateOneTrainPositionData()
    means, stdDevs = theData.normalizeData()
    testForest = RandomForest(theData)

    # print() with a single argument behaves the same on Python 2 and 3; the
    # original print statements were a syntax error on Python 3.
    print("Training")
    testForest.train()
    print("Done!")

    testList = generateTwoTestPositionData(means, stdDevs)
    results = confusionMatrix(labels)
    for samp in testList:
        resultLabel = testForest.classify(samp)
        trueLabel = samp.getLabel()
        results.update(trueLabel, resultLabel)

    results.printMatrix()
def _build_argument_parser():
    """Return the ArgumentParser describing the RandomForest runner's CLI."""
    argument_parser = ArgumentParser(
        description="Script to run the RandomForest program.",
        add_help=False)

    # Split criterion: at most one may be given; main() defaults to gini.
    split_group = argument_parser.add_mutually_exclusive_group()
    split_group.add_argument(
        '--use_gini',
        action='store_true',
        help="Use the Gini index for attribute splitting in the decision "
        "trees.")
    split_group.add_argument(
        '--use_entropy',
        action='store_true',
        help="Use entropy for attribute splitting in the decision trees.")
    split_group.add_argument(
        '--use_variance',
        action='store_true',
        # Fix: this help text was a copy of the --use_entropy text.
        help="Use variance for attribute splitting in the decision trees.")

    # Preprocessor: at most one may be given; main() defaults to hockey.
    preprocessor_group = argument_parser.add_mutually_exclusive_group()
    preprocessor_group.add_argument(
        '--use_hockey_preprocessor',
        action='store_true',
        help="Use hockey dataset preprocessing logic on the given dataset. "
        "(default)")
    preprocessor_group.add_argument(
        '--use_custom_preprocessor',
        # Fix: the two literals were joined without a space
        # ("is thefilename").
        help="Use custom dataset preprocessing logic on the given dataset. "
        "Where USE_CUSTOM_PREPROCESSOR is the "
        "filename of the preprocessor file in the preprocessors directory "
        "to use, e.g. TemplateDataSetPreprocessor.")

    argument_parser.add_argument('-d',
                                 '--data_file',
                                 required=True,
                                 help="File containing the dataset.")
    argument_parser.add_argument(
        '-t',
        '--number_of_trees',
        type=int,
        default=4,
        help="The number of trees to create for the random forest.")
    argument_parser.add_argument(
        '-m',
        '--max_depth',
        type=int,
        help="The maximum depth of all trees in the random forest. "
        "(default: None).")
    argument_parser.add_argument(
        '-s',
        '--min_split_size',
        type=int,
        default=1,
        help="The threshold number of samples required at a node to stop "
        "further splitting. (default: 1).")
    argument_parser.add_argument(
        '-f',
        '--n_features',
        type=int,
        help="The number of features to use when building each tree in the "
        "random forest. Specifying None will use all"
        " the features (default: None).")
    argument_parser.add_argument(
        '-c',
        '--target_label',
        type=str,
        required=True,
        help="Target label that we want to predict.")
    argument_parser.add_argument(
        '-k',
        '--sklearn_rf',
        action='store_true',
        help='Train and test dataset on SKlearn Random Forest')
    argument_parser.add_argument(
        '-w',
        '--number_of_workers',
        type=int,
        # Fix: the two literals were joined without a space
        # ("disable thisfeature").
        help="The number of workers to spawn during training of the random "
        "forest. Specifying None will disable this "
        "feature. (default: None).")
    argument_parser.add_argument(
        '-o',
        '--output_file',
        help="Output file of the results. If the file exists already, new "
        "entries will be appended to the end. (default: None).")
    # add_help=False above, so -h/--help is registered explicitly.
    argument_parser.add_argument('-h',
                                 '--help',
                                 action='help',
                                 help="Show this message and exit.")
    return argument_parser


def _append_results_row(output_file, row):
    """Append one result row to the CSV, writing the header for a new file."""
    headers = [
        "Features", "MaxDepth", "MinSplitThreshold", "Trees", "SplitCriteria",
        "Target", "TrainAccuracy", "TestAccuracy"
    ]
    # Must be checked before open(): append mode creates a missing file.
    creating_new_file = not os.path.isfile(output_file)
    with open(output_file, "a") as csv_file:
        writer = csv.DictWriter(csv_file,
                                fieldnames=headers,
                                lineterminator="\n")
        if creating_new_file:
            print("Creating a new file {}!\n".format(output_file))
            writer.writeheader()
        else:
            print("Appending to the file {}!\n".format(output_file))
        writer.writerow(row)


def main():
    """Train and evaluate a RandomForest from command-line arguments.

    Parses the CLI, preprocesses the dataset into train and test sets, trains
    the forest and reports either mean squared error (``--use_variance``) or
    percent correct on both sets, together with training and prediction
    times. With ``--sklearn_rf`` an ``Sklearn_RF`` model is trained and
    evaluated on the same data. With ``--output_file`` one summary row is
    appended to that CSV file.
    """
    arguments = _build_argument_parser().parse_args()
    dataset_file = arguments.data_file
    output_file = arguments.output_file

    if arguments.use_custom_preprocessor:
        preprocessor = import_module("preprocessors." +
                                     arguments.use_custom_preprocessor)
    else:
        preprocessor = HockeyPP

    # Select class name.
    class_name = arguments.target_label

    # Select splitting cost function (gini unless told otherwise).
    split_function = 'gini'
    if arguments.use_entropy:
        split_function = 'entropy'
    elif arguments.use_variance:
        split_function = 'variance'

    # Test regression with 'sum_7yr_GP'
    train_data, test_data = preprocessor.process(dataset_file, class_name)

    random_forest = RandomForest(arguments.number_of_trees,
                                 arguments.max_depth,
                                 arguments.min_split_size,
                                 arguments.n_features,
                                 arguments.number_of_workers, split_function)

    t0 = datetime.now()
    random_forest.train(train_data, class_name)
    diff = datetime.now() - t0
    # (minutes, seconds) spent training, at whole-second resolution.
    t = divmod(diff.days * 86400 + diff.seconds, 60)

    train_results = random_forest.bagging_predict(train_data)

    # Only the test-set prediction is timed.
    t0 = datetime.now()
    test_results = random_forest.bagging_predict(test_data)
    diff = datetime.now() - t0
    tp = divmod(diff.days * 86400 + diff.seconds, 60)

    # The last column of each data set is used as the true target values.
    if arguments.use_variance:
        train_accuracy = random_forest.mse(train_results, train_data[:, -1])
        print("\nTrain Mean squared error: {}".format(train_accuracy))
        test_accuracy = random_forest.mse(test_results, test_data[:, -1])
        print("Test Mean squared error: {}\n".format(test_accuracy))
    else:
        train_accuracy = random_forest.evaluate(train_results,
                                                train_data[:, -1])
        print("\nTrain Percent Correct: {}".format(train_accuracy))
        test_accuracy = random_forest.evaluate(test_results,
                                               test_data[:, -1])
        print("Test Percent Correct: {}\n".format(test_accuracy))

    print("\nTime for train: {}min {}sec".format(t[0], t[1]))
    print("Time for prediction: {}min {}sec\n".format(tp[0], tp[1]))

    if arguments.sklearn_rf:
        sk_rf = Sklearn_RF(arguments.number_of_trees, arguments.max_depth,
                           arguments.min_split_size, arguments.n_features)
        # TODO: Need a sklearn_regression tree as well
        sk_rf.train(train_data, class_name)
        accuracy_sk = sk_rf.evaluate(
            test_data,
            tree_type='regressor' if arguments.use_variance else 'classifier')
        if arguments.use_variance:
            print('{}{}'.format('sklearn rf MSE: ', accuracy_sk))
        else:
            print('{}{}'.format('sklearn rf Percent correct: ',
                                accuracy_sk * 100))

    # Write out the results to a file, if one is specified, for downstream
    # processing.
    if output_file:
        _append_results_row(
            output_file, {
                "Features":
                arguments.n_features if arguments.n_features else "ALL",
                "MaxDepth":
                arguments.max_depth if arguments.max_depth else "NOLIMIT",
                "MinSplitThreshold": arguments.min_split_size,
                "Trees": arguments.number_of_trees,
                "SplitCriteria": split_function,
                "Target": class_name,
                "TrainAccuracy": train_accuracy,
                "TestAccuracy": test_accuracy
            })
# K-fold evaluation driver: for every forest size in nTrees, split the
# dataset into k folds, train on k - 1 of them, evaluate on the held-out fold
# and append the evaluation text to a per-configuration results file.
parser = argparse.ArgumentParser()
# All three options are required and, because of nargs=1, each is parsed
# into a one-element list (hence the [0] indexing below).
for flag in ('-dataset', '-target', '-alias'):
    parser.add_argument(flag, nargs=1, type=str, required=True)
arguments = parser.parse_args()

# Initial load, used to size m from the column count.
newDF = readCSV(arguments.dataset[0], arguments.target[0])
nTrees = [50]
k = 8
m = floor(sqrt(newDF.shape[1]))

for n in nTrees:
    print("-----------------")
    print("Number of trees: " + str(n))

    # The dataset is loaded again for every forest size before folding.
    newDF = readCSV(arguments.dataset[0], arguments.target[0])
    folds = generate_kfolds(newDF, arguments.target[0], k)

    for i in range(k):
        # Train on every fold except fold i; test on fold i.
        trainDF = pd.concat(
            [fold for idx, fold in enumerate(folds) if idx != i])
        testDF = folds[i]

        randForest = RandomForest()
        randForest.train(trainDF, arguments.target[0], n, m, False)
        newStr = randForest.eval(testDF)

        # Append mode: all folds for one configuration go to the same file.
        with open("outputResults/" + str(n) + arguments.alias[0] + ".txt",
                  "a") as outputFile:
            outputFile.write(newStr)
# Train/test driver: random 85/15 split of the iris samples, then train a
# RandomForest on the training part and run predict on the held-out part.
X, y = iris.data, iris.target

ratio_train_test = 0.85
num_samples, num_features = X.shape

# Shuffle the sample indices and cut them at the train/test boundary.
idx = np.random.permutation(range(num_samples))
num_samples_train = int(num_samples * ratio_train_test)
idx_train, idx_test = idx[:num_samples_train], idx[num_samples_train:]

X_train, y_train = X[idx_train], y[idx_train]
X_test, y_test = X[idx_test], y[idx_test]

# HYPER PARAMETERS
max_depth = 7
min_split_size = 5
ratio_samples = 0.2
num_trees = 30
num_features_node = int(np.sqrt(num_features))
coefficient = 'gini'
percentile = 90
values = None
min_std_deviation = 0

# The hyper-parameters are passed positionally, so their order matters.
rf = RandomForest(
    max_depth,
    min_split_size,
    ratio_samples,
    num_trees,
    num_features_node,
    coefficient,
    percentile,
    values,
    min_std_deviation,
)
rf.train(X_train, y_train)
rf.predict(X_test, y_test)
def _evaluate_on_folds(title, make_classifier, splits):
    """Train and test a fresh classifier on each fold, printing accuracies."""
    print(title)
    total = 0
    for count, split in enumerate(splits):
        nb = make_classifier()

        trainFeatures = []
        trainClasses = []
        testFeatures = []
        testClasses = []
        for example in split.train:
            trainFeatures.append(example.features)
            trainClasses.append(example.klass)
        for example in split.test:
            testFeatures.append(example.features)
            testClasses.append(example.klass)

        nb.train(trainFeatures, trainClasses)
        nb.test(testFeatures, testClasses)

        accuracy = nb.getCorrectCount() / len(testClasses)
        total = total + accuracy
        print("[INFO]\tFold ", str(count), " Accuracy:", str(accuracy))
    # The mean is taken over a fixed 10 folds, as in the original code.
    # NOTE(review): assumes tenFoldCrossValidation() yields exactly 10 splits.
    print("[INFO]\tAccuracy:", str(total / 10))


def test10Fold():
    """Run cross validation for every classifier and print the accuracies.

    The same splits from ``tenFoldCrossValidation()`` are used, in order, for
    naive Bayes, a random forest built with argument 100, neural networks
    built with ``(5, )`` and ``(3, )`` as their first argument, and an SVM.
    Each section prints a title, one accuracy line per fold and a mean
    accuracy line.

    The original body repeated the same evaluation loop five times; it now
    lives in ``_evaluate_on_folds``. The unused ``global allWords``
    declaration, which had no effect, has been removed.
    """
    splits = tenFoldCrossValidation()

    _evaluate_on_folds("Naive Bayes", naiveBayes, splits)
    _evaluate_on_folds("Random Forest", lambda: RandomForest(100), splits)
    _evaluate_on_folds("Neural 5", lambda: neuralNetwork((5, ), 1000),
                       splits)
    _evaluate_on_folds("Neural 3", lambda: neuralNetwork((3, ), 1000),
                       splits)
    _evaluate_on_folds("SVM", svm, splits)
from Dataset import Dataset
from RandomForest import RandomForest
import ClassifierStats as stats

# Train a random forest on one dataset's .train file and print its accuracy
# on the matching .test file.
dataset_path = 'synthetic.social'

train = Dataset.from_file('../data/{}.train'.format(dataset_path))
test = Dataset.from_file('../data/{}.test'.format(dataset_path))

model = RandomForest(
    num_trees=100,
    max_depth=100,
    bagging_data_fraction=0.4,
)
model.train(train)

# Compare the predicted labels with the test set's true labels.
predictions = model.classify(test)
accuracy = stats.accuracy(test.labels, predictions)
print(accuracy)
from DecisionTree import DecisionTree
from RandomForest import RandomForest
import pandas as pd


def getData():
    """Load the hw6 training and test sets as DataFrames.

    Both files are read as space-separated text without a header row, and
    column 10 is renamed to ``"y"``.

    Returns:
        A ``(data_train, data_test)`` tuple of DataFrames.
    """

    def _read(path):
        frame = pd.read_csv(path, sep=" ", header=None)
        return frame.rename(columns={10: "y"})

    data_train = _read("./hw6_train.dat")
    data_test = _read("./hw6_test.dat")
    return data_train, data_test


if __name__ == "__main__":
    train_set, test_set = getData()

    # Train a 2000-tree forest, passing get_oob=True to train().
    RF = RandomForest(train=train_set, test=test_set)
    RF.train(n_tree=2000, get_oob=True)
    # RF.predict(mode="train")
    # RF.predict(mode="test")

    # DT = DecisionTree(train=train_set, test=test_set)
    # DT.train()
    # DT.predict(mode="test")