def estimate():
    """Pick the best random-forest size by RMSLE and report its final score.

    The labelled set returned by ``loadSets`` is split with ``splitDataset``
    (ratio 0.6, then the remainder with ratio 0.5) into a training part, a
    "testing" part used to choose ``n_estimators`` and a "validation" part
    used once for the final score.
    NOTE(review): presumably this yields a 60/20/20 split -- confirm against
    ``helper.splitDataset``.

    Every ``n_estimators`` value in ``range(20, 50)`` is fitted on the
    training part and scored on the testing part; the value with the lowest
    RMSLE is refitted and scored on the validation part. Progress and results
    are printed; nothing is returned.

    Raises:
        ValueError: if no candidate yields a finite RMSLE, so no model size
            can be selected.
    """
    from loadData import loadSets
    from helper import splitDataset, separateTargetFromTrain
    from sklearn.ensemble import RandomForestClassifier
    import numpy as np
    import math

    def rmsle_of(predicted, actual):
        """Return the root mean squared logarithmic error as a float."""
        # log1p(x) == log(x + 1) but stays accurate for values close to zero.
        squared_log_error = np.square(np.log1p(predicted) - np.log1p(actual))
        return math.sqrt(np.mean(squared_log_error))

    # Only the labelled set is needed; the second returned set is unused.
    trainingSet, _ = loadSets()
    trainingData, holdout = splitDataset(trainingSet, 0.6)
    testingData, validationData = splitDataset(holdout, 0.5)
    del trainingSet, holdout  # drop references to the unsplit data early

    trainingTarget, trainingFeatures = separateTargetFromTrain(trainingData)
    testingTarget, testingFeatures = separateTargetFromTrain(testingData)
    validationTarget, validationFeatures = separateTargetFromTrain(validationData)
    testingTarget = testingTarget.values
    validationTarget = validationTarget.values
    del trainingData, testingData, validationData

    # Start from +inf / None instead of a magic threshold: any finite score
    # is accepted, and "nothing selected" is detectable rather than silently
    # becoming n_estimators=0.
    best_rmsle = float("inf")
    best_i = None

    for i in range(20, 50):
        rf = RandomForestClassifier(n_estimators=i, n_jobs=-1)
        rf.fit(trainingFeatures, trainingTarget)
        predictions = rf.predict(testingFeatures)
        del rf  # release the fitted forest before building the next one

        rmsle = rmsle_of(predictions, testingTarget)
        # Single-argument print keeps the exact output text on Python 2 and
        # is also valid on Python 3.
        print("%s components:  %s" % (i, rmsle))
        if rmsle < best_rmsle:
            best_rmsle = rmsle
            best_i = i

    if best_i is None:
        raise ValueError(
            "no candidate produced a finite RMSLE; cannot select n_estimators"
        )

    print("Best:  %s  components with rmsle:  %s" % (best_i, best_rmsle))

    rf = RandomForestClassifier(n_estimators=best_i, n_jobs=-1)
    rf.fit(trainingFeatures, trainingTarget)
    predictions = rf.predict(validationFeatures)
    del rf

    rmsle = rmsle_of(predictions, validationTarget)
    print("Model cost:  %s" % (rmsle,))
def estimate():
    """Pick the best extra-trees ensemble size by RMSLE and report its score.

    The labelled set returned by ``loadSets`` is split with ``splitDataset``
    (ratio 0.6, then the remainder with ratio 0.5) into a training part, a
    "testing" part used to choose ``n_estimators`` and a "validation" part
    used once for the final score.
    NOTE(review): presumably this yields a 60/20/20 split -- confirm against
    ``helper.splitDataset``.

    Each ``n_estimators`` value in ``range(2000, 3001, 1000)`` (i.e. 2000 and
    3000) is fitted on the training part and scored on the testing part; the
    value with the lowest RMSLE is refitted and scored on the validation
    part. Progress and results are printed; nothing is returned.

    Raises:
        ValueError: if no candidate yields a finite RMSLE, so no model size
            can be selected.
    """
    from loadData import loadSets
    from helper import splitDataset, separateTargetFromTrain
    from sklearn.ensemble import ExtraTreesRegressor
    import numpy as np
    import math

    def rmsle_of(predicted, actual):
        """Return the root mean squared logarithmic error as a float."""
        # log1p(x) == log(x + 1) but stays accurate for values close to zero.
        squared_log_error = np.square(np.log1p(predicted) - np.log1p(actual))
        return math.sqrt(np.mean(squared_log_error))

    # Only the labelled set is needed; the second returned set is unused.
    trainingSet, _ = loadSets()
    trainingData, holdout = splitDataset(trainingSet, 0.6)
    testingData, validationData = splitDataset(holdout, 0.5)
    del trainingSet, holdout  # drop references to the unsplit data early

    trainingTarget, trainingFeatures = separateTargetFromTrain(trainingData)
    testingTarget, testingFeatures = separateTargetFromTrain(testingData)
    validationTarget, validationFeatures = separateTargetFromTrain(validationData)
    testingTarget = testingTarget.values
    validationTarget = validationTarget.values
    del trainingData, testingData, validationData

    # Start from +inf / None instead of a magic threshold: any finite score
    # is accepted, and "nothing selected" is detectable rather than silently
    # becoming n_estimators=0.
    best_rmsle = float("inf")
    best_i = None

    for i in range(2000, 3001, 1000):
        model = ExtraTreesRegressor(n_estimators=i, n_jobs=-1)
        model.fit(trainingFeatures, trainingTarget)
        predictions = model.predict(testingFeatures)

        rmsle = rmsle_of(predictions, testingTarget)
        # Single-argument print keeps the exact output text on Python 2 and
        # is also valid on Python 3.
        print("%s  estimators:  %s" % (i, rmsle))
        if rmsle < best_rmsle:
            best_rmsle = rmsle
            best_i = i

    if best_i is None:
        raise ValueError(
            "no candidate produced a finite RMSLE; cannot select n_estimators"
        )

    print("Best:  %s  estimators with rmsle:  %s" % (best_i, best_rmsle))

    model = ExtraTreesRegressor(n_estimators=best_i, n_jobs=-1)
    model.fit(trainingFeatures, trainingTarget)
    predictions = model.predict(validationFeatures)

    rmsle = rmsle_of(predictions, validationTarget)
    print("Final model cost:  %s" % (rmsle,))