def __init__(self, num_trees, num_features, impurity_criterion, prune=False):
    """
    Initialize the forest through ``RandomForest.__init__`` and record the
    pruning flag.

    num_trees: number of trees to create in the forest.
    num_features: the number of features to consider when choosing the
        best split for each node of the decision trees.
    impurity_criterion: forwarded unchanged to ``RandomForest.__init__``.
    prune: stored as ``self.prune`` (default False); how it is consumed is
        not visible in this method.
    """
    RandomForest.__init__(self, num_trees, num_features, impurity_criterion)
    self.prune = prune
def run_randomforest(train_examples, train_labels, attributes, test_examples, test_labels, n_trees):
    """Train a ``RandomForest`` on the training split and return its test error.

    The forest is constructed as ``RandomForest(entropy, 2, n_trees,
    len(attributes))``, trained with ``train_dataset`` and evaluated with
    ``test_dataset``.

    Returns:
        The second element of the ``(predictions, error)`` pair returned by
        ``test_dataset``; the predictions themselves are discarded.
    """
    attribute_count = len(attributes)
    forest = RandomForest(entropy, 2, n_trees, attribute_count)
    forest.train_dataset(train_examples, attributes, train_labels)
    # Only the error is reported to the caller.
    _, test_error = forest.test_dataset(test_examples, test_labels)
    return test_error
def get_frequent_splits():
    """Train a 100-tree, depth-2 forest and print its most frequent first splits.

    Each entry returned by ``forest.most_frequent_first_splits()`` is unpacked
    as ``((feature, value), frequency)`` and printed as
    ``<feature name> < <value> (<frequency> trees)``.
    Reads the module-level ``training_data``, ``training_labels`` and
    ``feature_names``.
    """
    forest = RandomForest(num_trees=100, max_depth=2)
    forest.train(training_data, training_labels)
    relation = ' < '
    for (feature, value), frequency in forest.most_frequent_first_splits():
        label = feature_names[feature]
        print(label + relation + str(value) + ' (' + str(frequency) + ' trees)')
def random_forests_classification(X, y, test_dat):
    """Train a random forest on ``(X, y)`` and write predictions for ``test_dat``.

    The classifier is built as ``RandomForest(20, round(sqrt(n_features)),
    n_samples)`` where ``n_features`` and ``n_samples`` are the sizes of
    ``X`` along axis 1 and axis 0.  Predictions are written to
    ``census_predictions_random_forest.csv`` with the header ``Id,Category``
    and 1-based ids.

    Args:
        X: training feature matrix (2-D, indexable by ``np.size``).
        y: training labels, passed straight to ``classifier.train``.
        test_dat: feature matrix to predict; one output row per row.
    """
    n_samples = np.size(X, 0)
    n_features = np.size(X, 1)
    classifier = RandomForest(20, round(math.sqrt(n_features)), n_samples)
    # classifier = RandomForest(1, round(math.sqrt(np.size(X, 1))), 100, 45)
    classifier.train(X, y)
    y_hat = classifier.predict(test_dat)
    # The context manager guarantees the file is closed even if indexing or
    # converting a prediction raises part-way through the loop.
    with open("census_predictions_random_forest.csv", 'w') as f:
        f.write("Id,Category\n")
        for i in range(np.size(test_dat, 0)):
            # y_hat is indexed as a 2-D array with predictions in column 0.
            f.write(str(i + 1) + "," + str(int(y_hat[i, 0])) + "\n")
    print("DONE")
def main():
    """Run the tweet pipeline end to end and print the Random Forest accuracy.

    Steps, in order: plot/explore helpers, load the preprocessed text from
    ``processed_text_list.pickle``, train the word2vec model, shuffle the
    data, compute and reload the word2vec features from
    ``w2v_features.pickle``, split 80/20, then train and evaluate a
    ``RandomForest``.  Reads the module-level ``polarity`` labels.
    """
    plot_zipf()
    word2vec_explore()

    # NOTE(review): pickle.load executes arbitrary code from the file; both
    # pickles below must come from a trusted source.
    with open("processed_text_list.pickle", "rb") as pickle_in:
        processed_text_list = pickle.load(pickle_in)
    # preprocess_text(train_text)
    train_w2v_model(processed_text_list)

    # shuffle and partition dataset
    from sklearn.utils import shuffle
    num_tweets = 400000  # number of tweets to consider
    data = pd.DataFrame({'text': processed_text_list, 'labels': polarity})
    data = shuffle(data)
    get_w2v_array(data[:num_tweets])
    with open('w2v_features.pickle', 'rb') as features_file:
        w2v_array = pickle.load(features_file)
    w2v_array = w2v_array[:num_tweets]

    split_ratio = int(num_tweets * .8)  # index of the 80/20 boundary
    w2v_train = w2v_array[:split_ratio]  # w2v averages for each tweet
    w2v_test = w2v_array[split_ratio:]

    # NOTE(review): this second shuffle reorders `data` after the w2v
    # features were computed from the first ordering, so the labels taken
    # below presumably no longer line up with the rows of w2v_array.
    # Kept as in the original; confirm whether it should be removed.
    data = shuffle(data)
    simple_train = data['text'][:split_ratio]  # preprocessed text
    simple_test = data['text'][split_ratio:]
    labels_list = data['labels'].tolist()[:num_tweets]
    train_labels = labels_list[:split_ratio]  # list of labels
    test_labels = labels_list[split_ratio:]

    # get_w2v_array(data=data)
    # pickle_in = open("w2v_features.pickle", "rb")
    # w2v_features = pickle.load(pickle_in)
    # naive_bayes = NaiveBayes(simple_train.tolist(), simple_test.tolist(), labels_list)
    # accuracy = naive_bayes.evaluate()
    # print("Naive Bayes accuracy: " + str(accuracy)) #.499
    # svm = SVM(simple_train, train_labels, simple_test, test_labels, 3000, .0000001)
    # accuracy = svm.predict()
    # print("SVM accuracy: " + str(accuracy)) #.744 with a=.0000001 and 3000 epochs

    random_forest = RandomForest(w2v_train, w2v_test, train_labels, test_labels,
                                 'sqrt', max_depth=25, min_leaf=2, n_trees=500,
                                 model_type='scikit')
    accuracy = random_forest.evaluate()
    print("Random Forest accuracy: " + str(accuracy))
def makeForests(
        self,
        size: int,
        class_label: str,
        split_ratio: float,
        attribute_choice_fn) -> Tuple[List[RandomForest], List[float]]:
    """Build and evaluate one ``RandomForest`` per ``(train, test)`` pack.

    For every pair in ``self.packs`` a forest is constructed from the
    training pack and immediately tested on the matching test pack.

    Returns:
        ``(forests, tests)``: the forests and the value of
        ``forest.test(test_pack)`` for each, in pack order.
    """
    evaluated = []
    for train_pack, test_pack in self.packs:
        forest = RandomForest(size, train_pack, class_label, split_ratio,
                              attribute_choice_fn)
        # Test right after construction, before the next forest is built.
        evaluated.append((forest, forest.test(test_pack)))
    forests = [forest for forest, _ in evaluated]
    tests = [score for _, score in evaluated]
    return (forests, tests)
def best_params():
    """Search ``n_trees`` in 2..10 and return the best ``(n_trees, accuracy)``.

    Each candidate forest is fitted on the module-level ``X_train`` /
    ``Y_train`` and scored with ``accuracy(Y_test, predictions)``.  A
    candidate replaces the current best only when strictly more accurate, so
    ties keep the smaller tree count.  Returns ``(0, 0)`` if no candidate
    scores above zero.
    """
    best_n_trees = 0
    best_acc = 0
    for n_tree in range(2, 11):
        clf = RandomForest(n_trees=n_tree)
        clf.fit(X_train, Y_train)
        acc = accuracy(Y_test, clf.predict(X_test))
        if acc > best_acc:
            best_n_trees, best_acc = n_tree, acc
    return (best_n_trees, best_acc)
def get_frequent_splits():
    """Train a 100-tree, depth-2 forest and print its most frequent first splits.

    Each entry of ``forest.most_frequent_first_splits()`` is unpacked as
    ``((feature, value), frequency)``.  Features in ``cat_set`` are printed
    as ``<name> is <category>`` with the category looked up in
    ``inverse_list[feature - CONTINUOUS_FEATURES]``; all others are printed
    as ``<name> < <value>``.
    """
    forest = RandomForest(num_trees=100, max_depth=2, categorical_vars=cat_set)
    forest.train(training_data, training_labels)
    for (feature, value), frequency in forest.most_frequent_first_splits():
        if feature in cat_set:
            # Map the encoded category back to its original value.
            shown = inverse_list[feature - CONTINUOUS_FEATURES][value]
            relation = ' is '
        else:
            shown = value
            relation = ' < '
        label = feature_names[feature]
        print(label + relation + str(shown) + ' (' + str(frequency) + ' trees)')
def train(rf):
    '''
    Trains a random forest on the output of generateTrainData() and pickles
    the trained forest to the path given by rf.

    rf: path of the output file; it is opened in binary write mode.
    '''
    forest = RandomForest(generateTrainData())
    print("Training")
    forest.train()
    print("Done!")
    with open(rf, 'wb') as model_file:
        cPickle.dump(forest, model_file)
    print('randomForest model saved to: ' + rf)
def __init__(self, num_trees, num_features, impurity_criterion, prune=False):
    '''
    Initialize the forest through RandomForest.__init__ and record the
    pruning flag.

    num_trees: number of trees to create in the forest.
    num_features: the number of features to consider when choosing the
        best split for each node of the decision trees.
    impurity_criterion: forwarded unchanged to RandomForest.__init__.
    prune: stored as self.prune (default False); how it is consumed is
        not visible in this method.
    '''
    RandomForest.__init__(self, num_trees, num_features, impurity_criterion)
    self.prune = prune
def analyze_RF(X, y):
    """Score forests of 50..140 trees (step 10) and plot score vs. tree count.

    The data is split once with ``train_test_split``; each forest uses
    ``num_features=3``.

    Returns:
        ``(scores, tree_counts)``: two parallel lists.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    tree_counts = list(range(50, 150, 10))

    def _score(n_trees):
        # Fit one forest of the given size and score it on the held-out split.
        model = RandomForest(num_trees=n_trees, num_features=3)
        model.fit(X_train, y_train)
        return model.score(X_test, y_test)

    scores = [_score(n_trees) for n_trees in tree_counts]
    fig, ax = plt.subplots()
    ax.plot(tree_counts, scores)
    return scores, tree_counts
def processData(self, modelName, gender, pClass, siblings, embarked):
    """Load and clean ``train.csv`` and return the model selected by name.

    Cleaning: drop ``PassengerId``, encode ``Sex`` (male=0, female=1) and
    ``Embarked`` (S=0, C=1, Q=2), then fill missing ``Embarked`` values with
    the column mean.

    Args:
        modelName: one of ``'Decision Tree'``, ``'Naive Bayes'``,
            ``'Neural Network'``, ``'Random Forest'``; any other value
            selects ``SupportVector``.
        gender, pClass, siblings, embarked: accepted for interface
            compatibility; not used by this method.

    Returns:
        The model object constructed from the cleaned DataFrame.
    """
    # loading the dataset
    df = pd.read_csv('train.csv', sep=',')
    # dropping passenger id
    df = df.drop('PassengerId', axis=1)

    # Changing strings to numeric values.  Assign the result back instead of
    # calling replace/fillna with inplace=True on `df[col]`: the chained
    # in-place form is deprecated and does not update `df` under pandas
    # copy-on-write.
    df["Sex"] = df["Sex"].replace({"male": 0, "female": 1})
    df["Embarked"] = df["Embarked"].replace({"S": 0, "C": 1, "Q": 2})
    # filling empty values
    df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mean())

    # The original also split the frame into unused x/y locals; every model
    # receives the full DataFrame, so that dead code has been removed.
    if modelName == 'Decision Tree':
        return DecisionTree(df)
    if modelName == 'Naive Bayes':
        return NaiveBayes(df)
    if modelName == 'Neural Network':
        return NeuralNetwork(df)
    if modelName == 'Random Forest':
        return RandomForest(df)
    return SupportVector(df)
def main():
    """
    Process the input arguments and test the selected model.

    Options:
        -m/--model      "0" selects the decision tree; anything else the
                        random forest.
        -t/--training   path to the training data file.
        -e/--testing    path to the testing data file.
        -d/--max_depth  integer, default 10 (parsed but not used here).
        -n/--tree_nums  integer number of trees for the random forest,
                        default 20.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model', default="0")
    parser.add_argument(
        '-t', '--training',
        default="C://Users//Mike Mraz//random-forest//random-forest//data//"
                "mushrooms_train.data")
    parser.add_argument(
        '-e', '--testing',
        default="C://Users//Mike Mraz//random-forest//random-forest//data//"
                "mushrooms_test.data")
    # type=int: without it a value given on the command line arrives as a
    # string while the default is an int.
    parser.add_argument('-d', '--max_depth', type=int, default=10)
    parser.add_argument('-n', '--tree_nums', type=int, default=20)
    args = parser.parse_args()

    if args.model == "0":
        print("Testing Decision Tree model")
        model = DecisionTree()
    else:
        print("Testing Random Forest model")
        model = RandomForest(args.tree_nums)
    test_model(model, args.training, args.testing)
def rfdef(self):
    """Run the random-forest accuracy check and show the result in the UI.

    Reads the training file path from ``self.trainfile``, times the
    construction of ``RandomForest`` plus its ``accuracy(path)`` call, then
    writes the elapsed seconds (2 decimals) and the accuracy (3 decimals) to
    the widgets and records the accuracy via ``AccuracyStore.store('rf', ...)``.
    """
    print("RF Start")
    self.trainname.setText("RandomForest")
    train_path = self.trainfile.text()
    print(train_path)

    started = time.time()
    forest = RandomForest()
    score = forest.accuracy(train_path)
    elapsed = time.time() - started

    self.traintime.setText(str(round(elapsed, 2)) + " (sec)")
    self.label_4.setText("Accuracy")
    self.trainstatus.setText(str(round(score, 3)))
    AccuracyStore.store('rf', score)
def get_classifier_object(self):
    """Build, train and run the classifier named by ``self.classifier_name``.

    Supported names: ``'RForest'``, ``'XGB'``, ``'NaiveBayes'``,
    ``'AdaBoost'``.  The wrapper is constructed from the stored train/test
    splits, trained, and its predictions stored in ``self.y_pred``.  For any
    other name nothing is built and the existing ``self.clf`` is used.

    Returns:
        ``self.clf.get_classifier()``.
    """
    # The 'LogReg', 'DeciTree' and 'svm' branches (LogReg, DecisionTree, SVM
    # wrappers) are currently disabled.
    name = self.classifier_name
    if name == 'RForest':
        wrapper_cls = RandomForest
    elif name == 'XGB':
        wrapper_cls = XGBoost
    elif name == 'NaiveBayes':
        wrapper_cls = NaiveBayes
    elif name == 'AdaBoost':
        wrapper_cls = AdaBoost
    else:
        wrapper_cls = None

    if wrapper_cls is not None:
        self.clf = wrapper_cls(self.x_train, self.y_train, self.x_test, self.y_test)
        self.clf.train()
        self.y_pred = self.clf.predict()
    return self.clf.get_classifier()
def _fit_small_pc(self, images, y):
    """Fit the PCA + random-forest member on ``images`` / ``y``.

    Images are histogram-equalized, reduced with
    ``self.ip.getPcaFeatures(ds, 150, Constants.IMAGES_SIZES)`` and used to
    fit a 2000-estimator ``RandomForest``.  The fitted PCA, normalizer and
    forest are stored on the instance; elapsed time is printed.
    """
    started = time.time()
    print("PCA RANDOM FOREST")
    equalized = self.ip.getImagesWithGrayHistogramEqualized(images=images)
    pca, norm, features = self.ip.getPcaFeatures(equalized, 150, Constants.IMAGES_SIZES)
    self.pca_randomForest_pca = pca
    self.pca_randomForest_norm = norm
    forest = RandomForest(features, y, n_estimators=2000)
    self.pca_randomForest = forest
    forest.fit()
    elapsed = time.time() - started
    print("COMPELTE PCA RANDOM FOREST --- %s seconds ---" % elapsed)
def __init__(self, S, p, k, weights, name='BRAF'):
    """
    Store the configuration and create the two sub-forests R1 and R2.

    :param S: Specify the size of the Biased Random Forest method
    :param p: Specify the ratio between R1 and R2; R1 is built with
        int(p * S) and R2 with int((1 - p) * S)
    :param k: Specify the KN Nearest Neighbours for minority class
    :param weights: stored and passed to both forests
    :param name: name of this model, default 'BRAF'
    """
    self.S, self.p, self.k = S, p, k
    self.name = name
    self.weights = weights

    # Initialize the forests
    r1_size = int(self.p * self.S)
    self.R1 = RandomForest('R1_Forest', self.weights, r1_size, True)
    r2_size = int((1 - self.p) * self.S)
    self.R2 = RandomForest('R2_Forest', self.weights, r2_size, True)
def test_model(n_folds=5, n_trees=None):
    """Run k-fold evaluation of the reference and from-scratch forests.

    The original signature ``(n_folds=5, n_trees)`` was a SyntaxError
    (non-default parameter after a default one), so the function could never
    be imported; ``n_trees`` now defaults to ``None``.  Positional calls
    ``test_model(k, n)`` keep their meaning.

    Args:
        n_folds: number of KFold splits over the module-level ``features``.
        n_trees: accepted but not used by the body.
    """
    kf = KFold(n_splits=n_folds)
    kf.get_n_splits(features)
    model = RandomForest()
    model.n_features = n_features
    accuracies = []
    durations = []
    for train_index, test_index in kf.split(features):
        train_features, test_features = features[train_index], features[test_index]
        train_labels, test_labels = labels[train_index], labels[test_index]
        # NOTE(review): `rf` is not defined in this function; presumably a
        # module-level reference estimator -- confirm.
        rf.fit(train_features, train_labels)
        model.train_set = train_features
        # NOTE(review): test_set receives the test *labels*; test_features
        # looks like the intended value -- confirm before changing.
        model.test_set = test_labels
        rf_predictions = rf.predict(test_features)
        model_prediction = model.run()
        # NOTE(review): `predictions` is not defined here; rf_predictions or
        # model_prediction was presumably meant -- confirm.
        errors = abs(predictions - test_labels)
        mape = 100 * (errors / test_labels)
        accuracy = 100 - np.mean(mape)
        print('Accuracy:', round(accuracy, 2), '%.')
    rf_scratch = RandomForest()
def predict_test_data():
    """Train a 25-tree, depth-25 forest, report accuracy, and write test guesses.

    Prints training and validation accuracy computed point by point against
    the module-level data sets, then predicts every row of ``testdata`` and
    writes ``spam_1.csv`` with header ``Id,Category`` and 0-based ids.
    """
    forest = RandomForest(num_trees=25, max_depth=25)
    forest.train(training_data, training_labels)

    def _count_correct(points, truths, n_points):
        # Number of the first n_points whose prediction equals the label.
        hits = 0
        for idx in range(n_points):
            if forest.predict(points[idx]) == truths[idx]:
                hits += 1
        return hits

    train_hits = _count_correct(training_data, training_labels, num_training_points)
    print("Training Accuracy: " + str(train_hits / num_training_points))
    valid_hits = _count_correct(validation_data, validation_labels, num_validation_points)
    print("Validation Accuracy: " + str(valid_hits / num_validation_points))

    guesses = [int(forest.predict(testdata[row])) for row in range(testdata.shape[0])]
    with open('spam_1.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Id', 'Category'])
        for row_id, guess in enumerate(guesses):
            writer.writerow([row_id, guess])
def graph_accuracy():
    """Plot validation accuracy against forest size for 5, 10, ..., 40 trees.

    Each forest uses ``max_depth=10`` and ``categorical_vars=cat_set``, is
    trained on the module-level training set and scored point by point on
    the validation set.  The tree count is printed (and stdout flushed)
    after each forest as a progress indicator.
    """
    tree_counts = []
    accuracy = []
    for n_trees in range(5, 41, 5):
        forest = RandomForest(num_trees=n_trees, max_depth=10, categorical_vars=cat_set)
        forest.train(training_data, training_labels)
        hits = sum(
            1
            for idx in range(num_validation_points)
            if forest.predict(validation_data[idx]) == validation_labels[idx]
        )
        accuracy.append(hits / num_validation_points)
        tree_counts.append(n_trees)
        print(n_trees)
        sys.stdout.flush()

    plt.figure()
    plt.plot(tree_counts, accuracy)
    plt.title("Census Accuracy For Random Forest")
    plt.ylabel("Accuracy Rate")
    plt.xlabel("Number of Trees")
    plt.show()
def run_model():
    """Train a 30000-tree, height-1 random forest and print its accuracy.

    Loads the hw7 train/test files, builds the forest with
    ``construct_forest``, prints the training time, then prints accuracy on
    the training and testing sets as percentages.
    """
    # load data
    train_file = 'data/hw7_train.dat.txt'
    test_file = 'data/hw7_test.dat.txt'
    data_train = pd.read_csv(train_file, sep=' ', header=None, names=[0, 1, 'y'])
    data_test = pd.read_csv(test_file, sep=' ', header=None, names=[0, 1, 'y'])
    X_train, Y_train = generate_data(train_file)
    X_test, Y_test = generate_data(test_file)

    # train model
    col_y = 'y'
    T = 30000
    max_height = 1
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    time_start = time.perf_counter()
    RF_Prune = RandomForest()
    RF_Prune.construct_forest(data_train, col_y, size=T, max_height=max_height)
    print("Using %.3f seconds" % (time.perf_counter() - time_start))

    # model accuracy
    print('\n--- Pruned Random forest model accuarcy ---')
    Y_train_pred = [RF_Prune.predict(x) for x in np.array(X_train)]
    train_acc = np.sum(Y_train_pred == Y_train) / len(Y_train) * 100
    print('Model accuracy on the training set: %.2f %%' % train_acc)
    Y_test_pred = [RF_Prune.predict(x) for x in np.array(X_test)]
    test_acc = np.sum(Y_test_pred == Y_test) / len(Y_test) * 100
    print('Accuracy on the testing set: %.2f %%\n' % test_acc)
def __init__(self, n_trees=10, max_depth=2, min_size=2, cost='gini'):
    """
    Constructor for random forest classifier.

    Validates the cost function name and then initializes the attributes of
    the class by calling the base class constructor.  Only 'gini' is
    accepted as cost function.

    Args:
        n_trees (int): The number of trees to use.
        max_depth (int): The maximum depth of tree.
        min_size (int): The minimum number of datapoints in terminal nodes.
        cost (str): The name of the cost function to use for evaluating
            the split.

    Raises:
        NameError: if ``cost`` is not 'gini'.
    """
    if cost != 'gini':
        raise NameError('Not valid cost function')
    # Forward the caller's values; the original passed the literal defaults
    # (10, 2, 2) and silently ignored the arguments.
    RandomForest.__init__(self, cost, n_trees=n_trees, max_depth=max_depth,
                          min_size=min_size)
def crossValidationPositions():
    '''
    Performs 10 fold cross validation on the total joint position dataset.

    The normalized data is partitioned into 10 segments.  Each segment in
    turn is held out for testing while a forest is trained on the other 9;
    all classifications are accumulated in one confusion matrix, which is
    printed at the end.
    '''
    theData = generateAllPositionTrainingData()
    means, stdDevs = theData.normalizeData()
    k = 10
    # Partition the data into k subsets
    dataSets = theData.getKSegments(k)
    results = confusionMatrix(labels)

    for fold in xrange(k):
        print(fold)
        heldOut = dataSets[fold]

        # Build the training set from a deep copy of every other segment
        remaining = copy.deepcopy(dataSets)
        remaining.pop(fold)
        trainingSet = TrainingData("CrossVal")
        for segment in remaining:
            trainingSet.combineWithNewData(segment)

        # Train on the training set, evaluate on the held-out segment
        forest = RandomForest(trainingSet)
        forest.train()
        for samp in heldOut.getData():
            predicted = forest.classify(samp)
            results.update(samp.getLabel(), predicted)

    results.printMatrix()
def twoVsOneAngles():
    '''
    Trains a random forest on the data from participants 1 and 2 and
    tests it on participant 3. The data used here uses the angle features.
    The resulting confusion matrix is printed.
    '''
    forest = RandomForest(generateTwoAngleTrainingData())
    print("Training")
    forest.train()
    print("Done!")

    testSamples = generateOneTestAngleData()
    results = confusionMatrix(labels)
    for samp in testSamples:
        predicted = forest.classify(samp)
        results.update(samp.getLabel(), predicted)
    results.printMatrix()
class CrossValidation():
    """Runs every registered algorithm and records its cross-validated accuracy.

    ``classificationAlgorithms`` is a class-level list, so the algorithm
    wrapper instances are created once and shared by all instances.
    """

    classificationAlgorithms = [
        logisticRegression(),
        RandomForest(),
        SVM(),
        AdaBoost(),
        XGBoost(),
    ]

    def __init__(self, dataset, X_train, X_test, y_train, y_test):
        """Store the dataset and its splits; start with empty result maps."""
        self.ds = dataset
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test
        self.accuracyDict = {}
        self.models = {}

    def run(self):
        """Run each algorithm, cross-validate its model, and record both."""
        for alg in self.classificationAlgorithms:
            # results[0] is the algorithm name, results[1] the fitted model
            results = alg.run(self.ds, self.X_train, self.X_test,
                              self.y_train, self.y_test)
            algName, model = results[0], results[1]
            self.appendToAccuracyDict(algName,
                                      self.kFoldCrossValidation(algName, model))
            self.appendModel(algName, model)

    def kFoldCrossValidation(self, algName, classifier):
        """Return the mean ``cross_val_score`` (cv=300) on the training split.

        Also prints the accuracy as a percentage.
        """
        scores = cross_val_score(estimator=classifier, X=self.X_train,
                                 y=self.y_train, cv=300)
        accuracy = scores.mean()
        print('%s %s %s' % (algName + ' accuracy:', accuracy * 100, '%'))
        return accuracy

    def appendToAccuracyDict(self, algName, accuracy):
        """Record ``accuracy`` for ``algName`` as a percentage."""
        self.accuracyDict[algName] = accuracy * 100

    def appendModel(self, algName, model):
        """Record the fitted ``model`` under ``algName``."""
        self.models[algName] = model

    def getAccuracyDict(self):
        """Return the mapping of algorithm name to accuracy percentage."""
        return self.accuracyDict

    def getModel(self, name):
        """Return the model stored under ``name`` (KeyError if absent)."""
        return self.models[name]
def oneVsTwoPositions():
    '''
    Trains a random forest on the data from participant 1 and tests it on
    participant 2 and 3. The data used here uses the position features.
    The means and standard deviations from normalizing the training data
    are passed on to the test-data generator; the resulting confusion
    matrix is printed.
    '''
    trainData = generateOneTrainPositionData()
    means, stdDevs = trainData.normalizeData()
    forest = RandomForest(trainData)
    print("Training")
    forest.train()
    print("Done!")

    testSamples = generateTwoTestPositionData(means, stdDevs)
    results = confusionMatrix(labels)
    for samp in testSamples:
        predicted = forest.classify(samp)
        results.update(samp.getLabel(), predicted)
    results.printMatrix()
def main():
    """
    Process the input arguments and test the selected model.

    -m/--model "0" selects the decision tree, anything else the random
    forest built with int(--tree_nums) trees.  --max_depth is parsed but
    not used here.
    """
    parser = argparse.ArgumentParser()
    for flags, default in (
            (('-m', '--model'), "0"),
            (('-t', '--training'), "data/mushrooms_train.data"),
            (('-e', '--testing'), "data/mushrooms_test.data"),
            (('-d', '--max_depth'), 10),
            (('-n', '--tree_nums'), 20)):
        parser.add_argument(*flags, default=default)
    args = parser.parse_args()

    # Converted before the branch, so a non-numeric value fails even when
    # the decision tree is selected.
    treeNum = int(args.tree_nums)
    use_decision_tree = args.model == "0"
    if use_decision_tree:
        print("Testing Decision Tree model")
        model = DecisionTree()
    else:
        print("Testing Random Forest model")
        model = RandomForest(treeNum)
    test_model(model, args.training, args.testing)
def main():
    """
    Process the input arguments and test the selected model.

    Options:
        -m/--model      "0" selects the decision tree; anything else the
                        random forest.
        -t/--training   path to the training data file.
        -e/--testing    path to the testing data file.
        -d/--max_depth  integer, default 10 (parsed but not used here).
        -n/--tree_nums  integer number of trees for the random forest,
                        default 20.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model', default="0")
    parser.add_argument(
        '-t', '--training',
        default="/usr/cs/grad/master/sbeathan/DataMining/random-forest/random-forest/data/mushrooms_train.data")
    parser.add_argument(
        '-e', '--testing',
        default="/usr/cs/grad/master/sbeathan/DataMining/random-forest/random-forest/data/mushrooms_test.data")
    # type=int: without it a value given on the command line arrives as a
    # string while the default is an int.
    parser.add_argument('-d', '--max_depth', type=int, default=10)
    parser.add_argument('-n', '--tree_nums', type=int, default=20)
    args = parser.parse_args()

    if args.model == "0":
        print("Testing Decision Tree model")
        model = DecisionTree()
    else:
        print("Testing Random Forest model")
        model = RandomForest(args.tree_nums)
    test_model(model, args.training, args.testing)
def main():
    """Build a random forest from a CSV dataset named on the command line.

    Expects ``<dataset-csv> <target-attr>`` as arguments; prints a usage
    message and exits with status -1 otherwise.  Both ``random`` and
    ``numpy.random`` are seeded with 0 before the forest is built.
    """
    argv = sys.argv
    if len(argv) < 3:
        print("Usage: python3 %s <dataset-csv> <target-attr>" % argv[0])
        exit(-1)

    datasetFile, targetAttr = argv[1], argv[2]
    separator = ','

    # Fixed seeds make the run reproducible
    random.seed(0)
    np.random.seed(0)

    # Read dataset
    D = pd.read_csv(datasetFile, sep=separator)

    t0 = time.time()
    # tree = DecisionTree(D, targetAttr, D.nunique(), sqrt)
    # tree.render()
    forest = RandomForest(D, targetAttr, D.nunique(), 10, sqrt, False)
def classify_with_random_forest():
    """Train a 250-tree, depth-7 forest and print train/validation accuracy.

    The forest is built with ``categorical_vars=cat_set`` and trained on the
    module-level training set; accuracy is computed point by point.
    """
    forest = RandomForest(num_trees=250, max_depth=7, categorical_vars=cat_set)
    forest.train(training_data, training_labels)

    def _count_correct(points, truths, n_points):
        # Number of the first n_points whose prediction equals the label.
        hits = 0
        for idx in range(n_points):
            if forest.predict(points[idx]) == truths[idx]:
                hits += 1
        return hits

    train_hits = _count_correct(training_data, training_labels, num_training_points)
    print("Training Accuracy: " + str(train_hits / num_training_points))
    valid_hits = _count_correct(validation_data, validation_labels, num_validation_points)
    print("Validation Accuracy: " + str(valid_hits / num_validation_points))
def main():
    """
    Process the input arguments and test the selected model.

    Options:
        -m/--model      "0" selects the decision tree; anything else the
                        random forest.
        -t/--training   path to the training data file.
        -e/--testing    path to the testing data file.
        -n/--tree_nums  integer number of trees for the random forest,
                        default 20.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model', default="0")
    parser.add_argument('-t', '--training', default="data/tic_tac_toe_train.data")
    parser.add_argument('-e', '--testing', default="data/tic_tac_toe_test.data")
    # type=int: without it a value given on the command line arrives as a
    # string while the default is an int.
    parser.add_argument('-n', '--tree_nums', type=int, default=20)
    args = parser.parse_args()

    if args.model == "0":
        print("Testing Decision Tree model")
        model = DecisionTree()
    else:
        print("Testing Random Forest model")
        model = RandomForest(args.tree_nums)
    test_model(model, args.training, args.testing)
def get_classifier_object(self):
    """Build, train and run the classifier named by ``self.classifier_name``.

    Supported names: ``'svm'``, ``'logreg'``, ``'RForest'``, ``'knn'``,
    ``'lda'``.  The wrapper is constructed from the stored train/test
    splits, trained, and its predictions stored in ``self.y_pred``.  For any
    other name nothing is built and the existing ``self.clf`` is used.

    Returns:
        ``self.clf.get_classifier()``.
    """
    name = self.classifier_name
    if name == 'svm':
        wrapper_cls = SVM
    elif name == 'logreg':
        wrapper_cls = LogReg
    elif name == 'RForest':
        wrapper_cls = RandomForest
    elif name == 'knn':
        wrapper_cls = KNN
    elif name == 'lda':
        wrapper_cls = LDA
    else:
        wrapper_cls = None

    if wrapper_cls is not None:
        self.clf = wrapper_cls(self.x_train, self.y_train, self.x_test, self.y_test)
        self.clf.train()
        self.y_pred = self.clf.predict()
    return self.clf.get_classifier()
class Ensemble(object):
    """Two-stage ensemble of image classifiers.

    The "small" stage fits several base models in parallel threads: six
    texture-based ridge classifiers (one per radius/points pair), a PCA +
    random forest, and an RBM + ridge pipeline.  The "big" stage is a
    logistic regression fitted on whatever dataset the caller passes to
    ``fit_big``.  Base-model predictions are stored on the instance
    (``*_y_hat`` attributes) because each is produced inside a worker thread.
    """

    # (radius, points) pairs of the texture classifiers, in the column order
    # returned by predict_small.
    _TEXTURE_PARAMS = ((10, 8), (5, 10), (7, 10), (9, 8), (4, 10), (20, 8))

    def __init__(self):
        self.pca_randomForest = None
        self.pca_randomForest_norm = None
        self.pca_randomForest_pca = None
        self.rbm_lr_rbm = None
        self.rbm_lr = None
        self.texture_10_8 = None
        self.texture_5_10 = None
        self.texture_7_10 = None
        self.texture_9_8 = None
        self.texture_4_10 = None
        self.texture_20_8 = None
        self.ensemble_logistic_regression = None
        self.edge_pca_lr = None
        self.pca_edge_norm = None
        self.pca_edge_pca = None
        self.ip = ImagesProcessor()

        # The predictions are kept here because they could not be passed
        # back from the worker threads by reference.
        self.pca_randomForest_y_hat = None
        self.rbm_lr_y_hat = None
        self.texture_10_8_y_hat = None
        self.texture_5_10_y_hat = None
        self.texture_7_10_y_hat = None
        self.texture_9_8_y_hat = None
        self.texture_4_10_y_hat = None
        self.texture_20_8_y_hat = None

    def load(self):
        """Load the pickled texture classifiers and the stage-two model."""
        self.texture_10_8 = self._load_classifier('./ridgeClassifier_10_8')
        self.texture_5_10 = self._load_classifier('./ridgeClassifier_5_10')
        self.texture_7_10 = self._load_classifier('./ridgeClassifier_7_10')
        self.texture_9_8 = self._load_classifier('./ridgeClassifier_9_8')
        self.texture_4_10 = self._load_classifier('./ridgeClassifier_4_10')
        self.texture_20_8 = self._load_classifier('./ridgeClassifier_20_8')
        self.ensemble_logistic_regression = self._load_classifier('ensemble_logistic_regression')
        #pca_randomForest_pca = _load_classifier('./pca')
        #rbm_lr = _load_classifier('./rbm')

    def _load_classifier(self, path):
        """Unpickle and return the classifier stored at ``path``.

        The file is closed even when unpickling fails.
        """
        # NOTE(review): unpickling can execute arbitrary code; only load
        # files from a trusted source.
        # The original text mode 'r' is kept so existing files load as before.
        with open(path, 'r') as f:
            return cPickle.load(f)

    @staticmethod
    def _start_daemon(target, args):
        """Create, start and return a daemon thread running ``target(*args)``."""
        thread = threading.Thread(target=target, args=args)
        thread.daemon = True
        thread.start()
        return thread

    def fit_small(self, images, y):
        """Fit all base models in parallel and wait for them to finish.

        The texture models are fitted on copies of the raw ``images``; the
        PCA and RBM models on the rotated/cropped images returned by
        ``self.ip.transformImages``.
        """
        images_transformed, y_transformed = self.ip.transformImages(images, y, rotate=True, crop=True)
        texture_jobs = [
            (self._fit_small_texture10_8, self.texture_10_8, 10, 8),
            (self._fit_small_texture5_10, self.texture_5_10, 5, 10),
            (self._fit_small_texture7_10, self.texture_7_10, 7, 10),
            (self._fit_small_texture9_8, self.texture_9_8, 9, 8),
            (self._fit_small_texture4_10, self.texture_4_10, 4, 10),
            (self._fit_small_texture20_8, self.texture_20_8, 20, 8),
        ]
        threads = []
        for target, estimator, radius, points in texture_jobs:
            # Each worker gets its own shallow copy of the image list.
            threads.append(self._start_daemon(
                target, (images[:], y, estimator, radius, points, 2)))
        for target in (self._fit_small_pc, self._fit_small_rbm):
            threads.append(self._start_daemon(
                target, (images_transformed[:], y_transformed)))
        for thread in threads:
            thread.join()

    def _fit_texture(self, attr_name, images, y, radius, points, alpha):
        """Fit a ridge classifier on texture features and store it as ``attr_name``."""
        start_time = time.time()
        print("TEXTURE %d %d" % (radius, points))
        ds = self.ip.getTextureFeature(images, radius, points)
        classifier = RidgeClassifier(ds, y, alpha=alpha)
        setattr(self, attr_name, classifier)
        classifier.fit()
        print("COMPLETE TEXTURE %d %d --- %s seconds ---" % (radius, points, time.time() - start_time))

    # The six wrappers below keep the original per-model entry points.
    # ``estimator`` is accepted for signature compatibility but unused: the
    # fitted model is always stored on the attribute named after the method.

    def _fit_small_texture10_8(self, images, y, estimator, radius, points, alpha):
        self._fit_texture('texture_10_8', images, y, radius, points, alpha)

    def _fit_small_texture5_10(self, images, y, estimator, radius, points, alpha):
        self._fit_texture('texture_5_10', images, y, radius, points, alpha)

    def _fit_small_texture7_10(self, images, y, estimator, radius, points, alpha):
        self._fit_texture('texture_7_10', images, y, radius, points, alpha)

    def _fit_small_texture9_8(self, images, y, estimator, radius, points, alpha):
        self._fit_texture('texture_9_8', images, y, radius, points, alpha)

    def _fit_small_texture4_10(self, images, y, estimator, radius, points, alpha):
        self._fit_texture('texture_4_10', images, y, radius, points, alpha)

    def _fit_small_texture20_8(self, images, y, estimator, radius, points, alpha):
        self._fit_texture('texture_20_8', images, y, radius, points, alpha)

    def _fit_small_pc(self, images, y):
        """Fit the PCA + random-forest base model (150 PCA features, 2000 estimators)."""
        start_time = time.time()
        print("PCA RANDOM FOREST")
        ds = self.ip.getImagesWithGrayHistogramEqualized(images=images)
        self.pca_randomForest_pca, self.pca_randomForest_norm, ds = self.ip.getPcaFeatures(ds, 150, Constants.IMAGES_SIZES)
        self.pca_randomForest = RandomForest(ds, y, n_estimators=2000)
        self.pca_randomForest.fit()
        print("COMPELTE PCA RANDOM FOREST --- %s seconds ---" % (time.time() - start_time))

    def _fit_small_rbm(self, ds, y):
        """Fit the RBM -> ridge classifier pipeline on scaled images."""
        start_time = time.time()
        print("RBM LR")
        ds = self.ip.getImagesAsDataset(ds, Constants.IMAGES_SIZES)
        # Per-column scaling; the small constant avoids division by zero.
        ds = (ds - np.min(ds, 0)) / (np.max(ds, 0) + 0.0001)
        self.rbm_lr_rbm = BernoulliRBM(random_state=0, verbose=True)
        self.rbm_lr_rbm.learning_rate = 0.01
        self.rbm_lr_rbm.n_iter = 5
        self.rbm_lr_rbm.n_components = 150
        logistic = linear_model.RidgeClassifier(alpha=2)
        self.rbm_lr = Pipeline(steps=[('rbm', self.rbm_lr_rbm), ('lr', logistic)])
        self.rbm_lr.fit(ds, y)
        print("COMPLETE RBM LR --- %s seconds ---" % (time.time() - start_time))

    def fit_big(self, ds, y):
        """Fit the stage-two logistic regression on ``ds`` / ``y``."""
        self.ensemble_logistic_regression = linear_model.LogisticRegression()
        self.ensemble_logistic_regression.fit(ds, y)

    def predict_small(self, images):
        """Run the six texture classifiers in parallel and stack their outputs.

        Returns:
            Array with one row per image and one column per texture model,
            in the order 10_8, 5_10, 7_10, 9_8, 4_10, 20_8.
        """
        # The PCA random-forest and RBM predictors
        # (_predict_small_pac_ranfomForest, _predict_small_rbm_lr) are
        # currently disabled and excluded from the stacked output.
        targets = [
            self._predict_small_texture_10_8,
            self._predict_small_texture_5_10,
            self._predict_small_texture_7_10,
            self._predict_small_texture_9_8,
            self._predict_small_texture_4_10,
            self._predict_small_texture_20_8,
        ]
        threads = [self._start_daemon(target, (images, )) for target in targets]
        for thread in threads:
            thread.join()
        return(np.vstack((self.texture_10_8_y_hat, self.texture_5_10_y_hat,
                          self.texture_7_10_y_hat, self.texture_9_8_y_hat,
                          self.texture_4_10_y_hat, self.texture_20_8_y_hat)).T)

    def _predict_small_rbm_lr(self, images):
        """Predict with the RBM pipeline; result stored in ``rbm_lr_y_hat``."""
        start_time = time.time()
        ds = images[:]
        ds = self.ip.getImagesAsDataset(ds, Constants.IMAGES_SIZES)
        ds = (ds - np.min(ds, 0)) / (np.max(ds, 0) + 0.0001)
        self.rbm_lr_y_hat = self.rbm_lr.predict(ds)
        print("Complete prediction RBM --- %s ---" % (time.time() - start_time))

    def _predict_small_pac_ranfomForest(self, images):
        """Predict with the PCA random forest; result stored in ``pca_randomForest_y_hat``."""
        start_time = time.time()
        ds = self.ip.getImagesWithGrayHistogramEqualized(images=images)
        ds = self.ip.getImagesAsDataset(ds, Constants.IMAGES_SIZES)
        ds = self.pca_randomForest_norm.transform(ds)
        ds = self.pca_randomForest_pca.transform(ds)
        self.pca_randomForest_y_hat = self.pca_randomForest.predict(ds)
        print("Complete prediction PCA --- %s ---" % (time.time() - start_time))

    def _predict_texture(self, attr_name, images, radius, points):
        """Predict with the texture model ``attr_name``; store in ``<attr_name>_y_hat``."""
        start_time = time.time()
        ds = self.ip.getTextureFeature(images, radius, points)
        setattr(self, attr_name + '_y_hat', getattr(self, attr_name).predict(ds))
        print("Complete prediction Texture %d %d --- %s ---" % (radius, points, time.time() - start_time))

    def _predict_small_texture_10_8(self, images):
        self._predict_texture('texture_10_8', images, 10, 8)

    def _predict_small_texture_5_10(self, images):
        self._predict_texture('texture_5_10', images, 5, 10)

    def _predict_small_texture_7_10(self, images):
        self._predict_texture('texture_7_10', images, 7, 10)

    def _predict_small_texture_9_8(self, images):
        self._predict_texture('texture_9_8', images, 9, 8)

    def _predict_small_texture_4_10(self, images):
        self._predict_texture('texture_4_10', images, 4, 10)

    def _predict_small_texture_20_8(self, images):
        self._predict_texture('texture_20_8', images, 20, 8)

    def predict_big(self, ds):
        """Return the stage-two logistic regression's predictions for ``ds``."""
        return(self.ensemble_logistic_regression.predict(ds))
def main(cv=False,kaggle=True, num_Trees=10, verbose=False):
    """Train and evaluate a random forest on ``hw4-data.csv``.

    The CSV header row is skipped; the last column is the integer label and
    the remaining columns are float features.

    Modes:
        cv == True: 10-fold cross validation (fold = row index modulo 10);
            prints per-fold and average accuracy.
        kaggle == True (and not cv): fit on the full data and call
            ``generateSubmissionFile(myname, forest)``.
        otherwise: fit on every row whose index modulo 10 is not 3 and
            report accuracy on the rest.

    Args:
        cv: enable cross validation (compared with ``== True``).
        kaggle: produce a submission file (compared with ``== True``).
        num_Trees: number of trees passed to ``RandomForest``.
        verbose: forwarded to ``RandomForest``.
    """
    # Load data set
    with open("hw4-data.csv") as f:
        next(f, None)  # skip the header row
        rows = list(csv.reader(f, delimiter=","))
    X = np.array([row[:-1] for row in rows], dtype=float)
    y = np.array([row[-1] for row in rows], dtype=int)

    def _split(fold, n_folds):
        # Rows whose index modulo n_folds equals `fold` form the test set.
        X_tr = np.array([x for i, x in enumerate(X) if i % n_folds != fold], dtype=float)
        y_tr = np.array([z for i, z in enumerate(y) if i % n_folds != fold], dtype=int)
        X_te = np.array([x for i, x in enumerate(X) if i % n_folds == fold], dtype=float)
        y_te = np.array([z for i, z in enumerate(y) if i % n_folds == fold], dtype=int)
        return X_tr, y_tr, X_te, y_te

    def _fit_timed(X_fit, y_fit):
        # Build a forest, fit it, and print how long fitting took.
        forest = RandomForest(num_trees=num_Trees, verbose=verbose)
        t0 = time()
        forest.fit(X_fit, y_fit)
        t1 = time()
        print("time elapses = %.3f s" % (t1 - t0))
        return forest

    def _report_accuracy(forest, X_eval, y_eval):
        # Print and return the fraction of correct predictions.
        y_predicted = forest.predict(X_eval)
        results = [prediction == truth
                   for prediction, truth in zip(y_predicted, y_eval)]
        accuracy = float(results.count(True)) / float(len(results))
        print("test accuracy: %.4f" % accuracy)
        return accuracy

    # Split training/test sets
    if cv == True:
        K = 10
        cv_accuracy = []
        for ii in xrange(K):
            X_train, y_train, X_test, y_test = _split(ii, K)
            forest = _fit_timed(X_train, y_train)
            cv_accuracy.append(_report_accuracy(forest, X_test, y_test))
        print("average cv accuracy: %.4f" % np.mean(cv_accuracy))
    else:
        X_train, y_train, X_test, y_test = _split(3, 10)
        if kaggle==True:
            # use the full data
            forest = _fit_timed(X, y)
            generateSubmissionFile(myname, forest)
        else:
            forest = _fit_timed(X_train, y_train)
            _report_accuracy(forest, X_test, y_test)