def estimate():
    """Sweep the forest size of a RandomForestClassifier and print its RMSLE.

    The labelled set returned by ``loadSets()`` is split with
    ``splitDataset(..., 0.6)`` and the remainder is split again with
    ``splitDataset(..., 0.5)`` (presumably a 60/20/20 partition -- confirm
    against ``helper.splitDataset``).  Forests with 20..49 trees are fitted on
    the first part and scored on the second; the best size is then refitted
    and scored on the third part.

    Nothing is returned; every score is printed to stdout.

    Raises:
        ValueError: if no candidate produced a comparable (non-NaN) RMSLE, so
            there is no forest size to build the final model with.
    """
    from loadData import loadSets
    from helper import splitDataset, separateTargetFromTrain
    from sklearn.ensemble import RandomForestClassifier
    import numpy as np
    import math

    def rmsle_of(predicted, actual):
        """Return the root mean squared logarithmic error of two arrays."""
        squared_log_error = np.square(np.log1p(predicted) - np.log1p(actual))
        return math.sqrt(np.mean(squared_log_error))

    # Only the first set returned by loadSets() is used here.
    trainingSet, _ = loadSets()
    trainingData, testingData = splitDataset(trainingSet, 0.6)
    testingData, validationData = splitDataset(testingData, 0.5)
    trainingSet = None  # release the full frame once it has been split

    trainingTarget, trainingFeatures = separateTargetFromTrain(trainingData)
    testingTarget, testingFeatures = separateTargetFromTrain(testingData)
    validationTarget, validationFeatures = separateTargetFromTrain(validationData)
    testingTarget = testingTarget.values
    validationTarget = validationTarget.values
    trainingData = testingData = validationData = None

    # Start from +inf rather than an arbitrary threshold: with a fixed
    # sentinel a sweep whose scores all exceed it would leave no winner and
    # the final model would be requested with zero trees.
    best_rmsle = float("inf")
    best_i = None

    for i in range(20, 50):
        rf = RandomForestClassifier(n_estimators=i, n_jobs=-1)
        rf.fit(trainingFeatures, trainingTarget)
        predictions = rf.predict(testingFeatures)
        rf = None  # drop the fitted forest before the next one is built

        rmsle = rmsle_of(predictions, testingTarget)
        # Single-argument print emits the same text on Python 2 and 3; the
        # double spaces reproduce the original comma-separated output.
        print("%s components:  %s" % (i, rmsle))
        if rmsle < best_rmsle:
            best_rmsle = rmsle
            best_i = i

    if best_i is None:
        raise ValueError("no forest size produced a valid RMSLE")

    print("Best:  %s  components with rmsle:  %s" % (best_i, best_rmsle))

    # Refit with the winning size and score it on the part that was not used
    # for selecting that size.
    rf = RandomForestClassifier(n_estimators=best_i, n_jobs=-1)
    rf.fit(trainingFeatures, trainingTarget)
    predictions = rf.predict(validationFeatures)
    rf = None

    print("Model cost:  %s" % rmsle_of(predictions, validationTarget))
def estimate():
    """Sweep the ensemble size of an ExtraTreesRegressor and print its RMSLE.

    The labelled set returned by ``loadSets()`` is split with
    ``splitDataset(..., 0.6)`` and the remainder is split again with
    ``splitDataset(..., 0.5)`` (presumably a 60/20/20 partition -- confirm
    against ``helper.splitDataset``).  Ensembles with 2000 and 3000 trees are
    fitted on the first part and scored on the second; the best size is then
    refitted and scored on the third part.

    Nothing is returned; every score is printed to stdout.

    Raises:
        ValueError: if no candidate produced a comparable (non-NaN) RMSLE, so
            there is no ensemble size to build the final model with.
    """
    from loadData import loadSets
    from helper import splitDataset, separateTargetFromTrain
    from sklearn.ensemble import ExtraTreesRegressor
    import numpy as np
    import math

    def rmsle_of(predicted, actual):
        """Return the root mean squared logarithmic error of two arrays."""
        squared_log_error = np.square(np.log1p(predicted) - np.log1p(actual))
        return math.sqrt(np.mean(squared_log_error))

    # Only the first set returned by loadSets() is used here.
    trainingSet, _ = loadSets()
    trainingData, testingData = splitDataset(trainingSet, 0.6)
    testingData, validationData = splitDataset(testingData, 0.5)
    trainingSet = None  # release the full frame once it has been split

    trainingTarget, trainingFeatures = separateTargetFromTrain(trainingData)
    testingTarget, testingFeatures = separateTargetFromTrain(testingData)
    validationTarget, validationFeatures = separateTargetFromTrain(validationData)
    testingTarget = testingTarget.values
    validationTarget = validationTarget.values
    trainingData = testingData = validationData = None

    # Start from +inf rather than an arbitrary threshold: with a fixed
    # sentinel a sweep whose scores all exceed it would leave no winner and
    # the final model would be requested with zero trees.
    best_rmsle = float("inf")
    best_i = None

    for i in range(2000, 3001, 1000):
        model = ExtraTreesRegressor(n_estimators=i, n_jobs=-1)
        model.fit(trainingFeatures, trainingTarget)
        predictions = model.predict(testingFeatures)

        rmsle = rmsle_of(predictions, testingTarget)
        # Single-argument print emits the same text on Python 2 and 3; the
        # double spaces reproduce the original comma-separated output.
        print("%s  estimators:  %s" % (i, rmsle))
        if rmsle < best_rmsle:
            best_rmsle = rmsle
            best_i = i

    if best_i is None:
        raise ValueError("no ensemble size produced a valid RMSLE")

    print("Best:  %s  estimators with rmsle:  %s" % (best_i, best_rmsle))

    # Refit with the winning size and score it on the part that was not used
    # for selecting that size.
    model = ExtraTreesRegressor(n_estimators=best_i, n_jobs=-1)
    model.fit(trainingFeatures, trainingTarget)
    predictions = model.predict(validationFeatures)

    print("Final model cost:  %s" % rmsle_of(predictions, validationTarget))
# NOTE(review): the next three statements are the tail of a function whose
# header lies above this chunk; re-indent them to match that function.
# Each row of the selected columns is added to testDs: every value but the
# last is passed as a tuple, the last value is passed separately.
for i in testingFeaturesPca[columns].values:
    testDs.addSample(tuple(i[:-1]), i[-1])
return trainDs, testDs

#%%
# Script cell: imports, start timestamp and data loading.
from loadData import loadSets
from datetime import datetime
from pandas import DataFrame
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

# Timestamp taken before loading; presumably used further down to report the
# elapsed time -- that code is not visible here.
before = datetime.now()
trainingSet, testingSet = loadSets()
# Index the test rows by their 'id' column.
testingSet.index = testingSet.id
#%%
# Feature matrix is the test set without its 'id' column; the ids are kept
# separately.
# NOTE(review): the axis is passed positionally (1); recent pandas releases
# only accept it as a keyword (axis=1) -- confirm the pandas version in use.
testingFeatures = testingSet.drop('id', 1)
ids = testingSet.id
#%%
# The string below is disabled code kept as a bare string literal: an
# alternative setup that holds out part of the training set instead of using
# the real test set.
'''
from helper import splitDataset
trainingSet, testingSet = splitDataset(trainingSet, 0.7)
testingFeatures = testingSet.drop('cost', 1)
testingTarget = testingSet.cost
ids = None
'''
#%%