def test_all_file(self):
    options = {
        'df': pd.read_csv("benchmark.csv", sep=';'),
        'label_column': "Joga",
        'n_trees': 5,
        'bootstrap_size': 10
    }
    tr = RandomForest()
    model = tr.train(options)
    # The trained forest should reproduce every training row exactly.
    for _, row in options['df'].iterrows():
        target_label = row["Joga"]
        predicted = model.predict(row.drop("Joga"))
        self.assertEqual(target_label, predicted)
def test_benchmark(self):
    options = {
        'df': pd.read_csv("benchmark.csv", sep=';'),
        'label_column': "Joga",
        'n_trees': 5,
        'bootstrap_size': 10
    }
    tr = RandomForest()
    model = tr.train(options)
    inf_data = pd.Series(
        ["Ensolarado", "Quente", "Normal", "Verdadeiro"],
        index=["Tempo", "Temperatura", "Umidade", "Ventoso"],
        name="InferenceData")
    self.assertEqual(model.predict(inf_data), 'Sim')
def compare(binary=0, dataset_index=0, samples=10):
    datasets = [load_iris(), load_breast_cancer(), load_wine(), load_digits()]
    dataset = datasets[dataset_index]
    if binary == 1:
        print("Binary comparison only compares data in classes 0 and 1")
        x, y = make_two_class(dataset.data, dataset.target)
    else:
        x = dataset.data
        y = dataset.target

    # classifier = Tree()
    # name = "Tree"
    # run_cross_validation(name, classifier, x, y, samples)

    classifier = RandomForest()
    name = "RandomForest"
    run_cross_validation(name, classifier, x, y, samples)

    # AdaBoost only handles two classes, so it runs in binary mode only.
    if binary == 1:
        classifier = AdaBoost()
        name = "AdaBoost"
        run_cross_validation(name, classifier, x, y, samples)
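# One way to drive compare(), for example: dataset_index=1 selects
# load_breast_cancer(), and binary=1 restricts the data to classes 0 and 1 so
# that AdaBoost is included in the comparison.
#
#   compare(binary=1, dataset_index=1, samples=10)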
def manual_validation(type, dataset, samples, stopping_criterion=0, n_trees=10,
                      max_features=None, n_stumps=100):
    """
    A manual version of the cross-validation test. It reports both training
    and test error on a per-iteration basis, which the exploration functions
    rely on.
    """
    average_train = 0
    average_test = 0
    average_time = 0
    print("\nTrain\tTest\tTrain Time")
    for i in range(samples):
        data = dataset.data
        target = dataset.target
        if type == "adaboost":
            # AdaBoost only handles two classes, so reduce the dataset first.
            data, target = make_two_class(data, target)
        x_train, x_test, y_train, y_test = train_test_split(data, target,
                                                            test_size=0.2)
        if type == "tree":
            classifier = Tree()
        elif type == "randomforest":
            classifier = RandomForest(n_trees=n_trees, max_features=max_features)
        else:
            classifier = AdaBoost(n_stumps=n_stumps)
        tic = time.perf_counter()
        if type == "tree":
            classifier.fit(x_train, y_train, stopping_criterion)
        else:
            classifier.fit(x_train, y_train)
        toc = time.perf_counter()
        train_score = classifier.score(x_train, y_train)
        test_score = classifier.score(x_test, y_test)
        average_train += train_score
        average_test += test_score
        time_diff = toc - tic
        average_time += time_diff
        print("%0.2f\t%0.2f\t%0.4f" % (train_score * 100, test_score * 100,
                                       time_diff))
    print("\n=== Averages ===")
    print("%0.2f\t%0.2f\t%0.4f" % (average_train * 100 / samples,
                                   average_test * 100 / samples,
                                   average_time / samples))
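# Example invocation (a sketch; it assumes load_iris() from sklearn.datasets,
# as used in compare() above):
#
#   manual_validation("randomforest", load_iris(), samples=10, n_trees=20,
#                     max_features=2)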
def train_random_forest(train, test, model_parameters=(5, 10, 1, 1.0)):
    n_trees = int(model_parameters[0])
    max_depth = int(model_parameters[1])
    min_size = int(model_parameters[2])
    # The bag size is a fraction of the training set, so keep it as a float.
    sample_size = float(model_parameters[3])

    # One-vs-one scheme: train a forest for every pair of classes and let each
    # pairwise forest vote for one of its two classes on every test row.
    num_classifier = len(train)
    predicted_label = numpy.zeros((num_classifier, test.shape[0]))
    predicted_max_label = numpy.zeros(test.shape[0])
    for i in range(num_classifier):
        for j in range(i):
            randomforest = RandomForest(train[i], train[j])
            label_1, label_2 = randomforest.evaluate_algorithm(
                test[:, 0:-1], max_depth, min_size, sample_size, n_trees,
                (train[0].shape[1] - 1))
            predicted_label[i, :] = predicted_label[i, :] + label_1
            predicted_label[j, :] = predicted_label[j, :] + label_2

    # Pick the class with the most pairwise votes for each test row.
    compare_matrix = (predicted_label == numpy.max(predicted_label, axis=0))
    for i in range(compare_matrix.shape[1]):
        for j in range(compare_matrix.shape[0]):
            if compare_matrix[j][i] == 1:
                predicted_max_label[i] = j
    print_report('Random forest', test[:, -1], predicted_max_label)
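# Hypothetical usage sketch for train_random_forest. It assumes the layout the
# indexing above implies: features in every column but the last, an integer
# class label in the last column, and `train` holding one such array per class.
# `per_class`, `data`, and `num_classes` below are placeholder names.
#
#   per_class = [data[data[:, -1] == c] for c in range(num_classes)]
#   train_random_forest(per_class, test, model_parameters=(10, 10, 1, 1.0))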
def main():
    # Instance of RandomForest class with 100 decision trees.
    rf = RandomForest(100, "SSH")

    # Split dataset into training and testing data randomly.
    train_features, test_features, train_labels, test_labels = rf.splitDataset(
        "Dataset/Bruteforce/SSH.csv")
    print('Training Features Shape:', train_features.shape)
    print('Training Labels Shape:', len(train_labels))
    print('Testing Features Shape:', test_features.shape)

    # Train the model and compute metrics.
    rf.trainModel(train_features, train_labels)
    rf.computeMetrics(test_features, test_labels)
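# Standard entry-point guard so main() runs when the script is executed
# directly (an assumption; the original file may call main() elsewhere).
if __name__ == "__main__":
    main()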
def successful_example():
    forest = RandomForest("mushrooms",
                          n_boostrap=50,
                          n_features=10,
                          test_size=0.2,
                          n_trees=10,
                          tree_max_depth=10)
    forest.test_model()
    forest.print_forest()
    print(f"Successful forest accuracy {forest.accuracy * 100}%")
def fail_example():
    forest = RandomForest("diabetes",
                          n_boostrap=50,
                          n_features=8,
                          test_size=0.2,
                          n_trees=20,
                          tree_max_depth=10)
    forest.test_model()
    forest.print_forest()
    print(f"Unsuccessful forest accuracy {forest.accuracy * 100}%")
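# A minimal driver for the two examples above, assuming this module is meant
# to be run directly as a script.
if __name__ == "__main__":
    successful_example()
    fail_example()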
# decision tree
tree = DecisionTree(5, train_data.shape[0])
tree.train(train_data, train_label)
res = tree.predict(validation_data)
score = 0
for i in range(len(res)):
    if res[i] == validation_label[i]:
        score += 1
score /= len(res)
print(score)

# random forest
forest = RandomForest(100, 5, train_data.shape[0], 6)
forest.train(train_data, train_label)
res = forest.predict(validation_data)
score = 0
for i in range(len(res)):
    if res[i] == validation_label[i]:
        score += 1
score /= len(res)
print(score)

# write to csv
# with open('titanic_prediction.csv', 'wt') as f:
#     writer = csv.writer(f, delimiter=',')
#     writer.writerow(['Id', 'Category'])
from KFoldValidation import KFoldValidation
from randomForest import RandomForest
import pandas as pd
import random
import numpy as np

# Fix the seeds so every k-fold run is reproducible.
seed = 5
np.random.seed(seed)
random.seed(seed)

df = pd.read_csv('datasets/iris.data')
options = {
    'train_algorithm': RandomForest(),
    'df': df,
    'label_column': 'Y',
    'num_folds': 5,
    'n_trees': 15,
    'bootstrap_size': 2,
}
runner = KFoldValidation()
runner.train_with_kfold(options)
import pandas as pd
import numpy as np
from randomForest import RandomForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

train_x = pd.read_csv('./data/x_train.csv')
train_y = pd.read_csv('./data/y_train.csv')
train_data = pd.merge(train_x, train_y)

forest = RandomForest(depth=5, min_sample_leaf=13, min_gini=0.001, n_tree=20)
train_set, eval_set = train_test_split(train_data, test_size=0.2)
forest.fit(train_set)
result = forest.predict(eval_set)
forest.save()

print('ac ', accuracy_score(eval_set['label'], result))
print('precision ', precision_score(eval_set['label'], result))
print('recall ', recall_score(eval_set['label'], result))
print('f1_score ', f1_score(eval_set['label'], result))
# decision tree
tree = DecisionTree(10, train_data.shape[0])
tree.train(train_data, train_label)
res = tree.predict(validation_data)
score = 0
for i in range(len(res)):
    if res[i] == validation_label[i]:
        score += 1
score /= len(res)
print(score)

# random forest
rf = RandomForest(10, 10, train_data.shape[0], train_data.shape[1])
rf.train(train_data, train_label)
res = rf.predict(validation_data)
score = 0
for i in range(len(res)):
    if res[i] == validation_label[i]:
        score += 1
score /= len(res)
print(score)

# with open('titanic_prediction.csv', 'wt') as f:
#     writer = csv.writer(f, delimiter=',')
#     writer.writerow(['Id', 'Category'])
#     for i, cat in enumerate(res):
#         writer.writerow([str(i + 1), str(cat)])
import numpy as np

# Hold out 50 of the 150 samples for testing.
indices = np.arange(150)
np.random.shuffle(indices)
train_idx, test_idx = indices[:100], indices[100:]
Xtrain, Ytrain = X[train_idx], Y[train_idx]
Xtest, Ytest = X[test_idx], Y[test_idx]

# test of decision tree
# model = dtc(6, 10)
# model.build_tree(Xtrain, Ytrain)
# print('-------------------------------------------------------------------------------------------------------')
# predicted_classes = model.predict(Xtest)
# score = 0
# for i in range(len(predicted_classes)):
#     score += (predicted_classes[i] == Ytest[i])
# print("the algorithm has an accuracy of ", score / len(predicted_classes))

# test of random forest
m = RandomForest(3, 2, 5)
m.build_forest(Xtrain, Ytrain)
predicted_classes = m.predict(Xtest)
score = 0
for i in range(len(predicted_classes)):
    score += (predicted_classes[i] == Ytest[i])
print("the algorithm has an accuracy of ", score / len(predicted_classes))
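# The element-wise scoring loop above can also be collapsed into one vectorized
# check; this is an equivalent sketch, assuming Ytest is a numpy array:
accuracy = np.mean(np.asarray(predicted_classes) == Ytest)
print("vectorized accuracy check:", accuracy)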
import pandas as pd
import numpy as np
from randomForest import RandomForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

train_x = pd.read_csv('./data/x_train.csv')
train_y = pd.read_csv('./data/y_train.csv')
train_data = pd.merge(train_x, train_y)

forest = RandomForest(depth=5, min_sample_leaf=13, min_gini=0.001, n_tree=20)
train_set, eval_set = train_test_split(train_data, test_size=0.2)
forest.load()  # restore the forest saved by the training script instead of refitting
result = forest.predict(eval_set)

print('ac ', accuracy_score(eval_set['label'], result))
print('precision ', precision_score(eval_set['label'], result))
print('recall ', recall_score(eval_set['label'], result))
print('f1_score ', f1_score(eval_set['label'], result))