def test_model(n_folds=5, n_trees=None):
    """Run K-fold cross-validation and print a MAPE-based accuracy per fold.

    Relies on names defined outside this function: ``features``, ``labels``,
    ``n_features`` and an already constructed estimator ``rf``.

    Args:
        n_folds: Number of K-fold splits.
        n_trees: Currently unused. It defaults to ``None`` because a required
            parameter may not follow a defaulted one (the previous signature
            was a ``SyntaxError``).
    """
    kf = KFold(n_splits=n_folds)
    kf.get_n_splits(features)

    model = RandomForest()
    model.n_features = n_features

    for train_index, test_index in kf.split(features):
        train_features, test_features = features[train_index], features[test_index]
        train_labels, test_labels = labels[train_index], labels[test_index]

        rf.fit(train_features, train_labels)

        model.train_set = train_features
        # NOTE(review): the test *labels* are stored as the test set here;
        # this may have been meant to be test_features -- confirm.
        model.test_set = test_labels

        rf_predictions = rf.predict(test_features)
        # NOTE(review): the scratch model's prediction is computed but not
        # evaluated yet.
        model_prediction = model.run()

        # Mean absolute percentage error of the reference forest. The
        # original read an undefined name ``predictions`` here.
        errors = abs(rf_predictions - test_labels)
        mape = 100 * (errors / test_labels)
        accuracy = 100 - np.mean(mape)
        print('Accuracy:', round(accuracy, 2), '%.')

    # NOTE(review): constructed but never used within this function.
    rf_scratch = RandomForest()
def run_kfold(method, kf, X, y, text, transformer=None):
    """Train and score one classifier per fold and return the mean score.

    Args:
        method: ``"rf"`` (project ``RandomForest``), ``"lr"``
            (``RidgeClassifier``) or ``"ex"`` (``ExtraTreesClassifier``).
        kf: Iterable of ``(train_index, test_index)`` pairs that also
            supports ``len()``.
        X: Feature matrix, indexed as ``X[index, :]``.
        y: Label vector, indexed as ``y[index]``.
        text: Label printed in the progress message.
        transformer: Optional object whose ``fit(X_train)`` returns something
            with ``transform``; fitted on the training fold only.

    Returns:
        Sum of the per-fold ``score(y_hat, y_test)`` values divided by
        ``len(kf)``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Fail fast: previously an unknown method left ``clf`` unbound and the
    # error only surfaced at ``clf.predict``.
    if method not in ("rf", "lr", "ex"):
        raise ValueError("Unknown method: " + repr(method))

    accuracy = 0
    print("Running " + str(text))
    for fold, (train_index, test_index) in enumerate(kf):
        print("Starting fold " + str(fold))
        X_train = X[train_index, :]
        y_train = y[train_index]
        X_test = X[test_index, :]
        y_test = y[test_index]

        if transformer is not None:
            t = transformer.fit(X_train)
            X_train = t.transform(X_train)
            X_test = t.transform(X_test)

        if method == "rf":
            # The project RandomForest takes its data at construction time.
            clf = RandomForest(X_train, y_train, n_estimators=1000)
            clf.fit()
        elif method == "lr":
            clf = linear_model.RidgeClassifier(alpha=2)
            clf.fit(X_train, y_train)
        else:  # "ex"
            clf = ExtraTreesClassifier(n_estimators=2000)
            clf.fit(X_train, y_train)

        y_hat = clf.predict(X_test)
        accuracy += score(y_hat, y_test)
    return (accuracy * 1.0 / len(kf))
def predict_test_data():
    """Train a forest, report accuracies, and write test predictions to CSV.

    Uses module-level data (``training_data``, ``validation_data``,
    ``testing_data`` and their labels/sizes). Predictions for the first
    ``TEST_SIZE`` test points are written to ``titanic_1.csv`` with 1-based
    ids.
    """
    forest = RandomForest(num_trees=250, max_depth=7, categorical_vars=cat_set)
    forest.train(training_data, training_labels)

    num_right = 0
    for i in range(num_training_points):
        if forest.predict(training_data[i]) == training_labels[i]:
            num_right += 1
    print("Training Accuracy: " + str(num_right / num_training_points))

    num_right = 0
    for i in range(num_validation_points):
        if forest.predict(validation_data[i]) == validation_labels[i]:
            num_right += 1
    print("Validation Accuracy: " + str(num_right / num_validation_points))

    # Predict with the forest trained above. The original called
    # ``tree.predict`` here, a name this function never defines.
    guesses = [int(forest.predict(testing_data[i])) for i in range(TEST_SIZE)]

    with open('titanic_1.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Id', 'Category'])
        for row_id, guess in enumerate(guesses, start=1):
            writer.writerow([row_id, guess])
def main():
    """Fit the project RandomForest on two iris features and report results.

    Prints the test score and per-feature importances, then hands the model
    and data to ``PlotResult`` for plotting.
    """
    selected_columns = [2, 3]
    iris = datasets.load_iris()
    feature_names = np.array(iris.feature_names)[selected_columns]

    X_train, X_test, y_train, y_test = train_test_split(
        iris.data[:, selected_columns], iris.target,
        test_size=0.3, random_state=0)

    rf = RandomForest(n_estimators=50, random_state=300)
    rf.fit(X_train, y_train)

    separator = "-" * 50

    # Output the score.
    test_score = rf.score(X_test, y_test)
    print(separator)
    print("score:" + str(test_score))

    # Output the feature importances.
    print(separator)
    importances = rf.feature_importances
    print("feature importances:")
    for idx in range(min(len(feature_names), len(importances))):
        print(" ", feature_names[idx], ":", importances[idx])

    # Output the decision regions.
    plotter = PlotResult(rf, X_train, y_train, X_test, y_test,
                         feature_names, "my_random_forest")
    plotter.plot_result()
def main():
    """Parse the command line, build the selected model and test it.

    ``--model 0`` selects ``DecisionTree``; any other value selects
    ``RandomForest`` built with ``--tree_nums`` trees. The model is then
    passed to ``test_model`` together with the training and testing paths.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model', default="0")
    parser.add_argument(
        '-t', '--training',
        default="C://Users//Mike Mraz//random-forest//random-forest//data//"
                "mushrooms_train.data")
    parser.add_argument(
        '-e', '--testing',
        default="C://Users//Mike Mraz//random-forest//random-forest//data//"
                "mushrooms_test.data")
    # type=int so values given on the command line are ints, like the
    # defaults; without it they reach the model as strings.
    parser.add_argument('-d', '--max_depth', type=int, default=10)
    parser.add_argument('-n', '--tree_nums', type=int, default=20)
    args = parser.parse_args()

    if args.model == "0":
        print("Testing Decision Tree model")
        model = DecisionTree()
    else:
        print("Testing Random Forest model")
        model = RandomForest(args.tree_nums)
    test_model(model, args.training, args.testing)
def test_DT(self):
    """Train a 10-tree ``RandomForest`` on the mushroom training data."""
    # The file was previously loaded a second time into an unused local
    # (``test_records``); that redundant read has been dropped.
    records, attributes = load_data("data/mushrooms_train.data")
    RF = RandomForest(tree_num=10)
    RF.train(records, attributes)
def main():
    '''
    Main classification driver. Read in the data file, classify the sensor
    data it contains, and evaluate the performance of the classifier.

    The first 25% of the shuffled samples are held out for testing; the rest
    train the forest. Exits with status 1 if no data file path is given.
    '''
    if len(sys.argv) < 2:
        print('USAGE: ActivityClassifier.py (path to data file)')
        sys.exit(1)

    (sample_set, id_to_cat) = load_sample_data(sys.argv[1])

    # TESTING: Reduce the number of samples
    # sample_set = filter_sample_set(sample_set)

    # Divide the samples into two, training and test. Keep 25% for test.
    np.random.shuffle(sample_set)
    # Floor division: a float is not a valid slice index under Python 3.
    test_sample_count = sample_set.shape[0] // 4
    test_samples = sample_set[:test_sample_count]
    training_samples = sample_set[test_sample_count:]

    random_forest = RandomForest(training_samples, 9)
    print(random_forest)

    confusion_matrix = ConfusionMatrix(id_to_cat.values())
    for sample in test_samples:
        classified_category = random_forest.classify_activity(sample)
        # The last element of each sample is used as its true category id.
        confusion_matrix.add_result(int(sample[-1]), classified_category)

    print(confusion_matrix)
    confusion_matrix.report_stats()
def randomForestTest(feature_len, all_lines, all_features, all_labels):
    """Search tree counts 25..35 over ten rounds and print the best one.

    Each round trains on the first 80% of the data and tests on the rest,
    accumulating per-tree-count accuracy in a running total. After each round
    the features and labels are re-drawn from ``now_provider``.
    """
    best_trees_num = 0
    best_total = 0
    totals = {}

    for round_idx in range(10):
        print("Test %d:" % (round_idx + 1))
        cut = int(0.8 * len(all_features))
        train_features, test_features = all_features[0:cut], all_features[cut:]
        train_labels, test_labels = all_labels[0:cut], all_labels[cut:]

        for trees_num in range(25, 36):
            totals.setdefault(trees_num, 0)
            print("trees_num:%d " % (trees_num), end=" ")

            new_forest = RandomForest(trees_num)
            new_forest.buildTrees(train_features, train_labels,
                                  len(train_features[0]), 3, 6)

            length = len(test_labels)
            hits = sum(
                1 for j in range(0, length)
                if new_forest.predictForest(test_features[j]) == test_labels[j])
            print(hits / length)

            totals[trees_num] += hits / length
            # Track the tree count with the largest accumulated accuracy.
            if best_total < totals[trees_num]:
                best_total = totals[trees_num]
                best_trees_num = trees_num

        all_features, all_labels = now_provider.getFeatureAndLabel(
            all_lines, feature_len)

    print("Best trees_num:%d %f" % (best_trees_num, totals[best_trees_num] / 10))
    for trees_num, total in totals.items():
        print(trees_num, total)
def processData(self, modelName, gender, pClass, siblings, embarked):
    """Load and encode ``train.csv`` and build the requested model on it.

    Args:
        modelName: ``'Decision Tree'``, ``'Naive Bayes'``,
            ``'Neural Network'`` or ``'Random Forest'``; any other value
            selects ``SupportVector``.
        gender, pClass, siblings, embarked: Not used by this method.

    Returns:
        The model object constructed from the prepared DataFrame.
    """
    # Load the dataset.
    df = pd.read_csv('train.csv', sep=',')

    # Drop the passenger id.
    df = df.drop('PassengerId', axis=1)

    # Map strings to numeric codes. Assign the result back: calling
    # ``replace``/``fillna`` with ``inplace=True`` on ``df[col]`` acts on an
    # intermediate object and does not reliably update ``df`` under pandas
    # copy-on-write.
    df["Sex"] = df["Sex"].replace({"male": 0, "female": 1})
    df["Embarked"] = df["Embarked"].replace({"S": 0, "C": 1, "Q": 2})

    # Fill empty values with the column mean.
    df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mean())

    if modelName == 'Decision Tree':
        return DecisionTree(df)
    if modelName == 'Naive Bayes':
        return NaiveBayes(df)
    if modelName == 'Neural Network':
        return NeuralNetwork(df)
    if modelName == 'Random Forest':
        return RandomForest(df)
    return SupportVector(df)
def get_classifier_object(self):
    """Build, train and run the classifier named by ``self.classifier_name``.

    For a recognised name the wrapper is stored in ``self.clf`` and its
    predictions in ``self.y_pred``. For any other name nothing is trained and
    whatever ``self.clf`` already holds is used.

    Returns:
        The result of ``self.clf.get_classifier()``.
    """
    # The 'LogReg', 'DeciTree' and 'svm' branches are currently disabled.
    name = self.classifier_name
    if name == 'RForest':
        wrapper_cls = RandomForest
    elif name == 'XGB':
        wrapper_cls = XGBoost
    elif name == 'NaiveBayes':
        wrapper_cls = NaiveBayes
    elif name == 'AdaBoost':
        wrapper_cls = AdaBoost
    else:
        wrapper_cls = None

    if wrapper_cls is not None:
        self.clf = wrapper_cls(self.x_train, self.y_train,
                               self.x_test, self.y_test)
        self.clf.train()
        self.y_pred = self.clf.predict()

    return self.clf.get_classifier()
def run_model():
    """Train a height-limited random forest and print train/test accuracy.

    Reads ``data/hw7_train.dat.txt`` and ``data/hw7_test.dat.txt``, builds a
    forest of 30000 trees with ``max_height=1`` and prints the elapsed
    training time and the accuracy on both sets.
    """
    # load data
    train_file = 'data/hw7_train.dat.txt'
    test_file = 'data/hw7_test.dat.txt'
    data_train = pd.read_csv(train_file, sep=' ', header=None, names=[0, 1, 'y'])
    data_test = pd.read_csv(test_file, sep=' ', header=None, names=[0, 1, 'y'])
    X_train, Y_train = generate_data(train_file)
    X_test, Y_test = generate_data(test_file)

    # train model
    col_y = 'y'
    T = 30000
    max_height = 1
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    time_start = time.perf_counter()
    RF_Prune = RandomForest()
    RF_Prune.construct_forest(data_train, col_y, size=T, max_height=max_height)
    print("Using %.3f seconds" % (time.perf_counter() - time_start))

    # model accuracy
    print('\n--- Pruned Random forest model accuarcy ---')
    Y_train_pred = [RF_Prune.predict(x) for x in np.array(X_train)]
    train_acc = np.sum(Y_train_pred == Y_train) / len(Y_train) * 100
    print('Model accuracy on the training set: %.2f %%' % train_acc)
    Y_test_pred = [RF_Prune.predict(x) for x in np.array(X_test)]
    test_acc = np.sum(Y_test_pred == Y_test) / len(Y_test) * 100
    print('Accuracy on the testing set: %.2f %%\n' % test_acc)
def _fit_small_pc(self, images, y):
    """Fit a 2000-estimator random forest on PCA features of the images.

    The images are histogram-equalized, reduced via ``self.ip.getPcaFeatures``
    (called with 150 and ``Constants.IMAGES_SIZES``), and the resulting
    features are used to fit ``self.pca_randomForest``. The first two values
    returned by ``getPcaFeatures`` are kept on ``self``. Elapsed time is
    printed.
    """
    started_at = time.time()
    print("PCA RANDOM FOREST")

    equalized = self.ip.getImagesWithGrayHistogramEqualized(images=images)
    pca_output = self.ip.getPcaFeatures(equalized, 150, Constants.IMAGES_SIZES)
    self.pca_randomForest_pca, self.pca_randomForest_norm, reduced = pca_output

    self.pca_randomForest = RandomForest(reduced, y, n_estimators=2000)
    self.pca_randomForest.fit()

    elapsed = time.time() - started_at
    print("COMPELTE PCA RANDOM FOREST --- %s seconds ---" % elapsed)
def __init__(self, S, p, k, weights, name='BRAF'):
    """Store the settings and create the two component forests.

    :param S: total size of the Biased Random Forest; it is split between
        the two forests according to ``p``
    :param p: share of ``S`` given to R1; R2 receives ``1 - p``
    :param k: number of nearest neighbours for the minority class (stored
        only; not used in this constructor)
    :param weights: passed unchanged to both forests
    :param name: name of this model
    """
    self.name = name
    self.weights = weights
    self.S = S
    self.p = p
    self.k = k

    # Initialize the forests.
    r1_size = int(self.p * self.S)
    r2_size = int((1 - self.p) * self.S)
    self.R1 = RandomForest('R1_Forest', self.weights, r1_size, True)
    self.R2 = RandomForest('R2_Forest', self.weights, r2_size, True)
def run_randomforest(train_examples, train_labels, attributes,
                     test_examples, test_labels, n_trees):
    """Train a forest on the training set and return its test error.

    The forest is built with the ``entropy`` criterion, the constant ``2``,
    ``n_trees`` and the number of attributes. Only the error from
    ``test_dataset`` is returned; the predictions are discarded.
    """
    attribute_count = len(attributes)
    forest = RandomForest(entropy, 2, n_trees, attribute_count)
    forest.train_dataset(train_examples, attributes, train_labels)
    _, test_error = forest.test_dataset(test_examples, test_labels)
    return test_error
def get_frequent_splits():
    """Print the first splits reported by ``most_frequent_first_splits``.

    Trains a 100-tree, depth-2 forest on the module-level training data and
    prints one ``<name> < <value> (<n> trees)`` line per reported split.
    """
    forest = RandomForest(num_trees=100, max_depth=2)
    forest.train(training_data, training_labels)

    for (feature, value), frequency in forest.most_frequent_first_splits():
        relation = ' < '
        label = feature_names[feature]
        print(label + relation + str(value) + ' (' + str(frequency) + ' trees)')
def random_forests_classification(X, y, test_dat):
    """Train a forest on ``(X, y)`` and write test predictions to CSV.

    Predictions for ``test_dat`` are written to
    ``census_predictions_random_forest.csv`` as ``Id,Category`` rows with
    1-based ids. ``predict`` is indexed as a 2-D result (``y_hat[i, 0]``).
    """
    n_rows = np.size(X, 0)
    n_cols = np.size(X, 1)
    classifier = RandomForest(20, round(math.sqrt(n_cols)), n_rows)
    # classifier = RandomForest(1, round(math.sqrt(np.size(X, 1))), 100, 45)
    classifier.train(X, y)
    y_hat = classifier.predict(test_dat)

    # Context manager so the file is closed even if a write fails.
    with open("census_predictions_random_forest.csv", 'w') as f:
        f.write("Id,Category\n")
        for i in range(np.size(test_dat, 0)):
            f.write(str(i + 1) + "," + str(int(y_hat[i, 0])) + "\n")
    print("DONE")
def main():
    """Train word2vec features on the tweet corpus and evaluate a forest.

    Loads the preprocessed text from ``processed_text_list.pickle``, trains
    the word2vec model, extracts features for the first 400000 shuffled
    tweets, splits them 80/20 and prints the random forest accuracy.
    Uses the module-level ``polarity`` labels.
    """
    plot_zipf()
    word2vec_explore()

    # Close the pickle file once it has been read.
    with open("processed_text_list.pickle", "rb") as pickle_in:
        processed_text_list = pickle.load(pickle_in)
    # preprocess_text(train_text)
    train_w2v_model(processed_text_list)

    # shuffle and partition dataset
    from sklearn.utils import shuffle
    data = pd.DataFrame({'text': processed_text_list, 'labels': polarity})
    data = shuffle(data)

    num_tweets = 400000  # number of tweets to consider
    get_w2v_array(data[:num_tweets])
    with open('w2v_features.pickle', 'rb') as features_file:
        w2v_array = pickle.load(features_file)
    w2v_array = w2v_array[:num_tweets]

    split_ratio = int(num_tweets * .8)
    w2v_train = w2v_array[:split_ratio]  # w2v averages for each tweet
    w2v_test = w2v_array[split_ratio:]

    # The data was previously shuffled a second time at this point, after
    # the w2v features had been extracted, so the labels below no longer
    # matched the feature rows. The data is already in random order, so the
    # second shuffle is dropped.
    # NOTE(review): assumes get_w2v_array writes features in row order of
    # the frame it is given -- confirm.
    simple_train = data['text'][:split_ratio]  # preprocessed text
    simple_test = data['text'][split_ratio:]
    labels_list = data['labels'].tolist()[:num_tweets]
    train_labels = labels_list[:split_ratio]  # list of labels
    test_labels = labels_list[split_ratio:]

    # get_w2v_array(data=data)
    # pickle_in = open("w2v_features.pickle", "rb")
    # w2v_features = pickle.load(pickle_in)
    # naive_bayes = NaiveBayes(simple_train.tolist(), simple_test.tolist(), labels_list)
    # accuracy = naive_bayes.evaluate()
    # print("Naive Bayes accuracy: " + str(accuracy)) #.499
    # svm = SVM(simple_train, train_labels, simple_test, test_labels, 3000, .0000001)
    # accuracy = svm.predict()
    # print("SVM accuracy: " + str(accuracy)) #.744 with a=.0000001 and 3000 epochs

    random_forest = RandomForest(w2v_train, w2v_test, train_labels, test_labels,
                                 'sqrt', max_depth=25, min_leaf=2,
                                 n_trees=500, model_type='scikit')
    accuracy = random_forest.evaluate()
    print("Random Forest accuracy: " + str(accuracy))
def makeForests(
        self, size: int, class_label: str, split_ratio: float,
        attribute_choice_fn) -> Tuple[List[RandomForest], List[float]]:
    """Build and test one forest per ``(train, test)`` pack in ``self.packs``.

    Each forest is constructed from its training pack and tested on the
    matching test pack before the next one is built.

    Returns:
        A pair ``(forests, tests)`` of parallel lists: the forests and the
        value each one's ``test`` call returned.
    """
    built: List[RandomForest] = []
    outcomes: List[float] = []
    for pack in self.packs:
        train_pack, test_pack = pack
        current = RandomForest(size, train_pack, class_label, split_ratio,
                               attribute_choice_fn)
        built.append(current)
        outcomes.append(current.test(test_pack))
    return (built, outcomes)
def best_params():
    """Try tree counts 2..10 and return the best ``(n_trees, accuracy)``.

    Uses the module-level ``X_train``/``Y_train``/``X_test``/``Y_test`` and
    the ``accuracy`` helper. Ties keep the smallest tree count; if no
    accuracy exceeds 0 the result is ``(0, 0)``.
    """
    top_score = 0
    top_n_trees = 0
    for candidate in range(2, 11):
        clf = RandomForest(n_trees=candidate)
        clf.fit(X_train, Y_train)
        current_score = accuracy(Y_test, clf.predict(X_test))
        if current_score > top_score:
            top_n_trees, top_score = candidate, current_score
    return (top_n_trees, top_score)
def train(rf):
    '''
    Train a random forest on the generated training data and pickle it.

    rf: path of the file the trained forest is written to.
    '''
    forest = RandomForest(generateTrainData())

    print("Training")
    forest.train()
    print("Done!")

    with open(rf, 'wb') as out_file:
        cPickle.dump(forest, out_file)
    print('randomForest model saved to: ' + rf)
def get_frequent_splits():
    """Print the first splits reported by ``most_frequent_first_splits``.

    Trains a 100-tree, depth-2 forest on the module-level training data.
    Features in ``cat_set`` are printed as ``<name> is <label>``, with the
    label looked up in ``inverse_list``; the rest as ``<name> < <value>``.
    """
    forest = RandomForest(num_trees=100, max_depth=2, categorical_vars=cat_set)
    forest.train(training_data, training_labels)

    for (feature, value), frequency in forest.most_frequent_first_splits():
        if feature in cat_set:
            # inverse_list is indexed by position among the categorical
            # features, which start at CONTINUOUS_FEATURES.
            value = inverse_list[feature - CONTINUOUS_FEATURES][value]
            relation = ' is '
        else:
            relation = ' < '
        label = feature_names[feature]
        print(label + relation + str(value) + ' (' + str(frequency) + ' trees)')
class CrossValidation():
    """Run every configured algorithm and record its cross-validated accuracy."""

    # Instantiated once at class-definition time and shared by all instances.
    classificationAlgorithms = [
        logisticRegression(),
        RandomForest(),
        SVM(),
        AdaBoost(),
        XGBoost()
    ]

    def __init__(self, dataset, X_train, X_test, y_train, y_test):
        """Store the dataset and splits; start with empty result tables."""
        self.ds = dataset
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.accuracyDict = {}  # algorithm name -> accuracy in percent
        self.models = {}        # algorithm name -> model

    def run(self):
        """Run each algorithm, cross-validate its model and store both."""
        for alg in self.classificationAlgorithms:
            # results[0] is the algorithm name, results[1] the model.
            results = alg.run(self.ds, self.X_train, self.X_test,
                              self.y_train, self.y_test)
            self.appendToAccuracyDict(
                results[0], self.kFoldCrossValidation(results[0], results[1]))
            self.appendModel(results[0], results[1])

    def kFoldCrossValidation(self, algName, classifier):
        """Return the mean ``cross_val_score`` (cv=300) on the training data."""
        accuracies = cross_val_score(estimator=classifier, X=self.X_train,
                                     y=self.y_train, cv=300)
        accuracy = accuracies.mean()
        # print() call: the Python 2 print statement is a syntax error in
        # Python 3; the emitted text is unchanged.
        print(algName + ' accuracy:', accuracy * 100, '%')
        return accuracy

    def appendToAccuracyDict(self, algName, accuracy):
        """Record ``accuracy`` (a fraction) for ``algName`` as a percentage."""
        self.accuracyDict[algName] = accuracy * 100

    def appendModel(self, algName, model):
        """Record the model produced for ``algName``."""
        self.models[algName] = model

    def getAccuracyDict(self):
        """Return the name -> accuracy-percentage mapping."""
        return self.accuracyDict

    def getModel(self, name):
        """Return the stored model for ``name`` (``KeyError`` if absent)."""
        return self.models[name]
def analyze_RF(X, y):
    """Score forests of 50..140 trees (step 10) and plot score vs. size.

    Returns:
        ``(scores, arr)``: the test scores and the matching tree counts.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    tree_counts = list(range(50, 150, 10))
    test_scores = []
    for tree_count in tree_counts:
        forest = RandomForest(num_trees=tree_count, num_features=3)
        forest.fit(X_train, y_train)
        test_scores.append(forest.score(X_test, y_test))

    _, axes = plt.subplots()
    axes.plot(tree_counts, test_scores)
    return test_scores, tree_counts
def rfdef(self):
    """Compute RandomForest accuracy for the selected file and show it.

    Updates the widgets with the model name, elapsed time and accuracy, and
    records the accuracy in ``AccuracyStore`` under ``'rf'``.
    """
    print("RF Start")
    self.trainname.setText("RandomForest")

    train_path = self.trainfile.text()
    print(train_path)

    # Model construction is deliberately inside the timed section.
    began = time.time()
    score = RandomForest().accuracy(train_path)
    elapsed = time.time() - began

    self.traintime.setText(str(round(elapsed, 2)) + " (sec)")
    self.label_4.setText("Accuracy")
    self.trainstatus.setText(str(round(score, 3)))
    AccuracyStore.store('rf', score)
def classify_with_random_forest():
    """Train a 250-tree forest and print training and validation accuracy.

    Uses the module-level training/validation data, labels and point counts.
    """
    forest = RandomForest(num_trees=250, max_depth=7, categorical_vars=cat_set)
    forest.train(training_data, training_labels)

    def fraction_correct(points, labels, count):
        # Share of the first ``count`` points predicted correctly.
        hits = sum(1 for idx in range(count)
                   if forest.predict(points[idx]) == labels[idx])
        return hits / count

    train_rate = fraction_correct(training_data, training_labels,
                                  num_training_points)
    print("Training Accuracy: " + str(train_rate))

    validation_rate = fraction_correct(validation_data, validation_labels,
                                       num_validation_points)
    print("Validation Accuracy: " + str(validation_rate))
def main():
    """Read a CSV dataset and build a 10-tree forest for the target attribute.

    Expects ``<dataset-csv> <target-attr>`` on the command line; prints a
    usage line and exits with -1 otherwise. Both random generators are
    seeded with 0 before the dataset is read.
    """
    if len(sys.argv) < 3:
        print("Usage: python3 %s <dataset-csv> <target-attr>" % sys.argv[0])
        exit(-1)

    dataset_path, target_attr = sys.argv[1], sys.argv[2]

    random.seed(0)
    np.random.seed(0)

    # Read dataset
    frame = pd.read_csv(dataset_path, sep=',')

    started = time.time()
    # tree = DecisionTree(D, targetAttr, D.nunique(), sqrt)
    # tree.render()
    forest = RandomForest(frame, target_attr, frame.nunique(), 10, sqrt, False)
def main():
    """Parse the command line, build the selected model and test it.

    ``--model 0`` selects ``DecisionTree``; any other value selects
    ``RandomForest`` built with ``--tree_nums`` trees. The model is then
    passed to ``test_model`` together with the training and testing paths.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model', default="0")
    parser.add_argument(
        '-t', '--training',
        default="/usr/cs/grad/master/sbeathan/DataMining/random-forest/random-forest/data/mushrooms_train.data")
    parser.add_argument(
        '-e', '--testing',
        default="/usr/cs/grad/master/sbeathan/DataMining/random-forest/random-forest/data/mushrooms_test.data")
    # type=int so values given on the command line are ints, like the
    # defaults; without it they reach the model as strings.
    parser.add_argument('-d', '--max_depth', type=int, default=10)
    parser.add_argument('-n', '--tree_nums', type=int, default=20)
    args = parser.parse_args()

    if args.model == "0":
        print("Testing Decision Tree model")
        model = DecisionTree()
    else:
        print("Testing Random Forest model")
        model = RandomForest(args.tree_nums)
    test_model(model, args.training, args.testing)
def main():
    """Parse the command line, build the selected model and test it.

    ``--model 0`` selects ``DecisionTree``; any other value selects
    ``RandomForest`` built with ``--tree_nums`` trees (converted to int).
    The model is then passed to ``test_model`` together with the training
    and testing paths.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model', default="0")
    parser.add_argument('-t', '--training', default="data/mushrooms_train.data")
    parser.add_argument('-e', '--testing', default="data/mushrooms_test.data")
    parser.add_argument('-d', '--max_depth', default=10)
    parser.add_argument('-n', '--tree_nums', default=20)
    args = parser.parse_args()

    treeNum = int(args.tree_nums)
    # print() calls: the Python 2 print statement is a syntax error in
    # Python 3; the emitted text is unchanged.
    if args.model == "0":
        print("Testing Decision Tree model")
        model = DecisionTree()
    else:
        print("Testing Random Forest model")
        model = RandomForest(treeNum)
    test_model(model, args.training, args.testing)
def graph_accuracy():
    """Plot validation accuracy for forests of 5, 10, ..., 40 trees.

    Each forest (max depth 10) is trained on the module-level training data
    and scored on the validation data; the tree count is printed as progress
    after each run.
    """
    tree_counts = []
    rates = []
    for tree_count in range(5, 41, 5):
        forest = RandomForest(num_trees=tree_count, max_depth=10,
                              categorical_vars=cat_set)
        forest.train(training_data, training_labels)

        hits = sum(
            1 for idx in range(num_validation_points)
            if forest.predict(validation_data[idx]) == validation_labels[idx])
        rates.append(hits / num_validation_points)
        tree_counts.append(tree_count)

        print(tree_count)
        sys.stdout.flush()

    plt.figure()
    plt.plot(tree_counts, rates)
    plt.title("Census Accuracy For Random Forest")
    plt.ylabel("Accuracy Rate")
    plt.xlabel("Number of Trees")
    plt.show()
def crossValidationPositions():
    '''
    Performs 10 fold cross validation on the total joint position dataset.

    Each segment is held out in turn, a forest is trained on the other nine,
    and the results are accumulated in one confusion matrix that is printed
    at the end. Uses the module-level ``labels``.
    '''
    theData = generateAllPositionTrainingData()
    # The returned (means, stdDevs) pair is not needed here.
    theData.normalizeData()

    k = 10

    # Partition the data into k subsets.
    dataSets = theData.getKSegments(k)

    results = confusionMatrix(labels)
    # range/print(): xrange and the print statement do not exist in Python 3.
    for i in range(k):
        print(i)

        # Testing set: the segment left out this round.
        testSet = dataSets[i]

        # Build the training set from deep copies of the other segments.
        trainingSet = TrainingData("CrossVal")
        trainingList = copy.deepcopy(dataSets)
        trainingList.pop(i)
        for elem in trainingList:
            trainingSet.combineWithNewData(elem)

        # Train the classifier on the training set.
        testForest = RandomForest(trainingSet)
        testForest.train()

        # Evaluate the classifier on the test set.
        for samp in testSet.getData():
            resultLabel = testForest.classify(samp)
            trueLabel = samp.getLabel()
            results.update(trueLabel, resultLabel)

    results.printMatrix()