def get_single_models(self):
    """
    Fit each base classifier with a hyperparameter search and collect
    the best estimator for each model.
    :return: None; populates self.single_models
    """
    self.single_models = {}

    self.rf = randomForest(self.params["rf"], self.X_train, self.y_train)
    self.ada = adaBoost(self.params["ada"], self.X_train, self.y_train)
    self.svc = svcClf(self.params["svc"], self.X_train, self.y_train)
    self.lr = logisticRegression(self.params["lr"], self.X_train, self.y_train)

    self.single_models[self.rf.get_model_name()] = self.rf.get_best_model()
    logging.info("Best parameters for random forest: {}\n".format(
        self.rf.best_params))

    self.single_models[self.ada.get_model_name()] = self.ada.get_best_model()
    logging.info("Best parameters for ada boost: {}\n".format(
        self.ada.best_params))

    self.single_models[self.svc.get_model_name()] = self.svc.get_best_model()
    logging.info("Best parameters for svc: {}\n".format(
        self.svc.best_params))

    self.single_models[self.lr.get_model_name()] = self.lr.get_best_model()
    logging.info("Best parameters for logistic regression: {}\n".format(
        self.lr.best_params))
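# The wrapper classes above (randomForest, adaBoost, svcClf, logisticRegression)
# are assumed to expose get_model_name(), get_best_model(), and best_params.
# A minimal sketch of what randomForest might look like, built on scikit-learn's
# GridSearchCV -- the constructor signature is taken from the calls above, the
# internals are an assumption:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


class randomForest:
    def __init__(self, params, X_train, y_train):
        # Exhaustive grid search over the supplied parameter grid, 5-fold CV.
        self.search = GridSearchCV(RandomForestClassifier(), params, cv=5)
        self.search.fit(X_train, y_train)
        self.best_params = self.search.best_params_

    def get_model_name(self):
        return "random_forest"

    def get_best_model(self):
        return self.search.best_estimator_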
def crossValidateRandomForest():
    # Load the five pre-split folds (read_libsvm_default and randomForest are
    # defined elsewhere in this project; numFeatures comes from the setup block).
    f1Inputs, f1Labels, _ = read_libsvm_default('data/data_semeion/folds/fold1')
    f2Inputs, f2Labels, _ = read_libsvm_default('data/data_semeion/folds/fold2')
    f3Inputs, f3Labels, _ = read_libsvm_default('data/data_semeion/folds/fold3')
    f4Inputs, f4Labels, _ = read_libsvm_default('data/data_semeion/folds/fold4')
    f5Inputs, f5Labels, _ = read_libsvm_default('data/data_semeion/folds/fold5')

    allFoldInputArrays = [f1Inputs.toarray(), f2Inputs.toarray(),
                          f3Inputs.toarray(), f4Inputs.toarray(),
                          f5Inputs.toarray()]
    allFoldLabelArrays = [f1Labels, f2Labels, f3Labels, f4Labels, f5Labels]

    forestSizes = [10, 50, 100]
    bestForestSize = None
    bestAccuracy = 0
    everyAccuracy = []

    for forestSize in forestSizes:
        allAccuracies = []
        # Hold out each fold in turn and train on the remaining four.
        for i in range(len(allFoldInputArrays)):
            allTrainData = []
            allTrainLabels = []
            for j in range(len(allFoldInputArrays)):
                if j != i:
                    allTrainData.extend(allFoldInputArrays[j])
                    allTrainLabels.extend(allFoldLabelArrays[j])
            print("Hyperparameters: forest size: " + str(forestSize))
            tempForest = randomForest(numFeatures, forestSize)
            tempForest.train(allTrainData, allTrainLabels)
            accuracy = tempForest.evaluate(allFoldInputArrays[i],
                                           allFoldLabelArrays[i])
            allAccuracies.append(accuracy)
            everyAccuracy.append(accuracy)
        if statistics.mean(allAccuracies) > bestAccuracy:
            bestAccuracy = statistics.mean(allAccuracies)
            bestForestSize = forestSize

    avgAccuracy = statistics.mean(everyAccuracy)
    print("Best forest size: " + str(bestForestSize))
    print("Best accuracy: " + str(bestAccuracy))
    print("Average accuracy: " + str(avgAccuracy))
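# The nested fold loop above generalizes directly; a hedged sketch of the same
# k-fold scheme written against scikit-learn's KFold, assuming the model exposes
# the train/evaluate interface used above:
import numpy as np
from sklearn.model_selection import KFold


def cross_validate(model_factory, inputs, labels, k=5):
    """Return per-fold accuracies for a model built by model_factory()."""
    inputs, labels = np.asarray(inputs), np.asarray(labels)
    accuracies = []
    for train_idx, test_idx in KFold(n_splits=k).split(inputs):
        model = model_factory()
        model.train(inputs[train_idx], labels[train_idx])
        accuracies.append(model.evaluate(inputs[test_idx], labels[test_idx]))
    return accuracies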
## Setup Data:
import numpy as np

trainingInputs, trainingLabels, numFeatures = read_libsvm_default(
    'data/data-splits/data.train')
testInputs, testLabels, _ = read_libsvm_default('data/data-splits/data.test',
                                                numFeatures)
trainingInputsArr = trainingInputs.toarray()
testInputsArr = testInputs.toarray()

## Discretize data:
def discretizeData(nonDiscreteArr):
    # Binarize each feature in place against its column mean.
    means = np.mean(nonDiscreteArr, axis=0)
    for i in range(len(nonDiscreteArr)):
        for j in range(len(nonDiscreteArr[i])):
            if nonDiscreteArr[i][j] <= means[j]:
                nonDiscreteArr[i][j] = 0
            else:
                nonDiscreteArr[i][j] = 1
    return nonDiscreteArr

discretizeData(trainingInputsArr)
# The test set must be discretized as well; otherwise a forest trained on
# binary features is evaluated against raw continuous ones.
discretizeData(testInputsArr)

hachi = randomForest(numFeatures, 50)
hachi.train(trainingInputsArr, trainingLabels)
print("training set: ")
print(hachi.evaluate(trainingInputsArr, trainingLabels))
print("test set: ")
print(hachi.evaluate(testInputsArr, testLabels))
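# The element-wise loop above is O(rows * cols) in pure Python; an equivalent
# vectorized sketch (returns a new array instead of mutating in place):
import numpy as np


def discretize_data_vectorized(arr):
    # 1 where the value exceeds its column mean, 0 otherwise.
    return (arr > np.mean(arr, axis=0)).astype(arr.dtype)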
# plot ROC curve for test set
perform_lr.roc_auc_curve(title="Logistic Regression Classifier (ROC)")

# plot confusion matrix
cm_lr = confusion_matrix(y_test, y_test_pred_lr)
perform_lr.confusion_matrix(
    cm_lr, title="Logistic Regression Classifier (Confusion Matrix)")

#########################################
# Random Forest
print("Running Random Forest Classifier...")
params_rf = {"n_estimators": [10, 30, 50, 70, 90, 100, 120, 160, 200, 240],
             "min_samples_split": [2, 4, 6, 8],
             "min_samples_leaf": [1, 2, 3, 4]}
rf = randomForest(params_rf, X_train, y_train)
best_rf = rf.get_best_model()
logging.info("Best parameters for random forest: {}\n".format(rf.best_params))

best_rf.fit(X_train, y_train)
y_train_pred_rf = best_rf.predict(X_train)
y_test_pred_rf = best_rf.predict(X_test)

AUC_train_rf = multiclass_roc_auc_score(y_train, y_train_pred_rf)
AUC_test_rf = multiclass_roc_auc_score(y_test, y_test_pred_rf)
print("AUC for training set is: " + str(AUC_train_rf))
print("AUC for test set is: " + str(AUC_test_rf))
logging.info("AUC of {} on training data: {}".format("random forest", AUC_train_rf))
logging.info("AUC of {} on test data: {}".format("random forest", AUC_test_rf))
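# multiclass_roc_auc_score is a project helper; a plausible, hedged sketch of it
# via one-vs-rest label binarization and scikit-learn's roc_auc_score (the
# actual helper in this repo may differ):
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize


def multiclass_roc_auc_score(y_true, y_pred, average="macro"):
    # Binarize both label vectors over the union of observed classes,
    # then compute a macro-averaged one-vs-rest AUC.
    classes = sorted(set(y_true) | set(y_pred))
    y_true_bin = label_binarize(y_true, classes=classes)
    y_pred_bin = label_binarize(y_pred, classes=classes)
    return roc_auc_score(y_true_bin, y_pred_bin, average=average)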
def rf(data, labels, test_features=None):
    import pandas as pd
    from random_forest import randomForest
    from helpers import helpers as hp
    from decision_tree import decisionTree

    h = hp()
    rf = randomForest()
    dt = decisionTree()
    data = pd.concat([data, labels], axis=1)

    accuracy = []
    precision = []
    recall = []
    f_score = []
    models = []
    foldSize = int(data.shape[0] / 5)

    for i in range(5):
        print("Running iteration " + str(i + 1) + " of k cross validation")
        testData = data.loc[foldSize * i:foldSize * (i + 1) - 1]
        trainData = pd.concat([data.loc[:foldSize * i - 1],
                               data.loc[foldSize * (i + 1):]])
        forest = rf.forest(trainData)
        target = testData.iloc[:, -1].values.tolist()
        predicted = rf.predictForest(testData.iloc[:, :-1], forest)
        models.append(forest)
        calMetrics(target, predicted)
        # Accumulate per-fold metrics so calculateMetrics below has data
        # to aggregate.
        truePositives, trueNegatives, falsePositives, falseNegatives = \
            h.findParameters(predicted, target)
        accuracy.append(h.findAccuracy(truePositives, trueNegatives,
                                       falsePositives, falseNegatives))
        tmpPrecision = h.findPrecision(truePositives, trueNegatives,
                                       falsePositives, falseNegatives)
        tmpRecall = h.findRecall(truePositives, trueNegatives,
                                 falsePositives, falseNegatives)
        precision.append(tmpPrecision)
        recall.append(tmpRecall)
        f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))

    h.calculateMetrics(accuracy, precision, recall, f_score)

    # Majority vote across the per-fold forests on the held-out features.
    predicted = pd.DataFrame()
    for root in models:
        pred = dt.predictData(test_features, root)
        predicted = pd.concat([predicted, pd.DataFrame(pred)], axis=1)
    print(predicted)

    p = []
    for idx, row in predicted.iterrows():
        p.append(row.value_counts().index.tolist()[0])
    print(p)
    return p
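# The row-wise vote at the end can be written more compactly; a hedged
# equivalent using DataFrame.mode (tie-breaking may differ from value_counts,
# which does not guarantee an order among equally frequent labels):
import pandas as pd


def majority_vote(predictions: pd.DataFrame) -> list:
    # One column per model; take the most frequent prediction in each row.
    return predictions.mode(axis=1)[0].tolist()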
def random_forest(self, kCrossValidation):
    print("\nRunning Random Forest Classifier ....................\n")
    from random_forest import randomForest

    h = hp()
    fileName = h.get_fileName()
    filePath = "../Data/" + fileName + ".txt"
    # filePath = "CSE-601/project3/Data/"+fileName+".txt"
    data, labels = h.readData(filePath)
    data = h.oneHotEncoding(data, labels)
    rf = randomForest()

    try:
        numTrees = int(input("\nEnter number of trees: "))
        numFeatures = int(input("Enter number of features to consider: "))
    except ValueError:
        print("\nExecution Failed - Wrong Input")
        exit()

    accuracy = []
    precision = []
    recall = []
    f_score = []
    models = []
    foldSize = int(data.shape[0] / kCrossValidation)

    for i in range(kCrossValidation):
        print("Running iteration " + str(i + 1) + " of k cross validation .....")
        testData = data.loc[foldSize * i:foldSize * (i + 1) - 1]
        trainData = pd.concat([data.loc[:foldSize * i - 1],
                               data.loc[foldSize * (i + 1):]])
        forest = rf.forest(trainData, numTrees=numTrees, numFeatures=numFeatures)
        target = testData.iloc[:, -1].values.tolist()
        predicted = rf.predictForest(testData.iloc[:, :-1], forest)
        models.append(forest)
        truePositives, trueNegatives, falsePositives, falseNegatives = \
            h.findParameters(predicted, target)
        accuracy.append(h.findAccuracy(truePositives, trueNegatives,
                                       falsePositives, falseNegatives))
        tmpPrecision = h.findPrecision(truePositives, trueNegatives,
                                       falsePositives, falseNegatives)
        tmpRecall = h.findRecall(truePositives, trueNegatives,
                                 falsePositives, falseNegatives)
        precision.append(tmpPrecision)
        recall.append(tmpRecall)
        f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))

    print("\nMetrics on train data with k-cross validation")
    h.calculateMetrics(accuracy, precision, recall, f_score)

    fileName = input(
        "\nEnter test data file name without extension (if no test file, just press enter): ")
    if fileName != '':
        filePath = "../Data/" + fileName + ".txt"
        # filePath = "CSE-601/project3/Data/"+fileName+".txt"
        testData, testLabels = h.readData(filePath)
        testData = h.oneHotEncoding(testData, testLabels)
        predLabels = []
        for forest in models:
            # Predict on the feature columns only; the last column holds labels.
            predLabels.append(rf.predictForest(testData.iloc[:, :-1], forest))
        predLabels = pd.DataFrame(predLabels)
        pred = []
        for _, colData in predLabels.items():
            pred.append(colData.value_counts().index[0])
        truePositives, trueNegatives, falsePositives, falseNegatives = \
            h.findParameters(pred, testData.iloc[:, -1].values.tolist())
        accuracy = [h.findAccuracy(truePositives, trueNegatives,
                                   falsePositives, falseNegatives)]
        precision = h.findPrecision(truePositives, trueNegatives,
                                    falsePositives, falseNegatives)
        recall = h.findRecall(truePositives, trueNegatives,
                              falsePositives, falseNegatives)
        f_score = [h.findFMeasure(precision, recall)]
        print("\nMetrics on test data with bagging")
        h.calculateMetrics(accuracy, [precision], [recall], f_score)
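# The helpers class (hp) is defined elsewhere in this repo; a hedged sketch of
# the metric methods it is assumed to provide, for a positive label of 1 --
# the real implementation may differ:
class helpersSketch:
    def findParameters(self, predicted, target):
        # Confusion-matrix counts for the positive class.
        tp = sum(1 for p, t in zip(predicted, target) if p == 1 and t == 1)
        tn = sum(1 for p, t in zip(predicted, target) if p == 0 and t == 0)
        fp = sum(1 for p, t in zip(predicted, target) if p == 1 and t == 0)
        fn = sum(1 for p, t in zip(predicted, target) if p == 0 and t == 1)
        return tp, tn, fp, fn

    def findAccuracy(self, tp, tn, fp, fn):
        return (tp + tn) / (tp + tn + fp + fn)

    def findPrecision(self, tp, tn, fp, fn):
        return tp / (tp + fp) if (tp + fp) else 0.0

    def findRecall(self, tp, tn, fp, fn):
        return tp / (tp + fn) if (tp + fn) else 0.0

    def findFMeasure(self, precision, recall):
        denom = precision + recall
        return 2 * precision * recall / denom if denom else 0.0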