def estimate():
    """Pick a random-forest size by hold-out RMSLE and report the final cost.

    Loads the data through ``loadSets`` (the second returned set is
    discarded), splits the first set twice with ``splitDataset`` into a
    training part, a testing part and a validation part, then:

    1. fits a ``RandomForestClassifier`` for every ``n_estimators`` in
       ``range(20, 50)`` and prints its RMSLE on the testing part;
    2. prints the best size found;
    3. refits a forest of that size and prints its RMSLE on the validation
       part.

    Returns:
        None. All results are printed to standard output.

    Raises:
        ValueError: if no candidate produced a finite RMSLE, so there is no
            valid forest size to refit with.

    NOTE(review): this file defines another function that is also called
    ``estimate``; only the definition that appears last is reachable by name.
    """
    from loadData import loadSets
    from helper import splitDataset, separateTargetFromTrain
    from sklearn.ensemble import RandomForestClassifier
    import numpy as np
    import math

    def rmsle_of(predicted, actual):
        # Root mean squared logarithmic error.
        # log1p(x) equals log(x + 1) but keeps precision for small x.
        squared_log_error = np.square(np.log1p(predicted) - np.log1p(actual))
        return math.sqrt(np.mean(squared_log_error))

    def fit_and_score(n_estimators, features, target):
        # Fit a fresh forest on the training part and score it on the given
        # hold-out part; the forest is dropped as soon as this returns.
        forest = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1)
        forest.fit(trainingFeatures, trainingTarget)
        return rmsle_of(forest.predict(features), target)

    def report(*parts):
        # One parenthesised argument prints the same space-separated text
        # under the Python 2 print statement and the Python 3 print function.
        print(" ".join(str(part) for part in parts))

    trainingSet, testingSet = loadSets()
    testingSet = None  # not used here; released right away

    # presumably a 60/20/20 train/test/validation split -- confirm against
    # helper.splitDataset
    trainingData, testingData = splitDataset(trainingSet, 0.6)
    testingData, validationData = splitDataset(testingData, 0.5)
    trainingSet = None

    trainingTarget, trainingFeatures = separateTargetFromTrain(trainingData)
    testingTarget, testingFeatures = separateTargetFromTrain(testingData)
    validationTarget, validationFeatures = separateTargetFromTrain(validationData)

    testingTarget = testingTarget.values
    validationTarget = validationTarget.values

    # Release the intermediate frames before the memory-hungry fitting loop.
    trainingData = None
    testingData = None
    validationData = None

    # Start from +inf rather than a magic threshold: any finite score must be
    # able to win, otherwise best_i would stay unset and the final refit
    # would be attempted with an invalid number of trees.
    best_rmsle = float("inf")
    best_i = None

    for i in range(20, 50):
        rmsle = fit_and_score(i, testingFeatures, testingTarget)
        report(i, "components: ", rmsle)

        if rmsle < best_rmsle:
            best_rmsle = rmsle
            best_i = i

    if best_i is None:
        raise ValueError("no candidate forest size produced a finite RMSLE")

    report("Best: ", best_i, " components with rmsle: ", best_rmsle)

    # The winning size is judged on data that played no part in selecting it.
    rmsle = fit_and_score(best_i, validationFeatures, validationTarget)

    report("Model cost: ", rmsle)
def estimate():
    """Pick an extra-trees ensemble size by hold-out RMSLE and report the cost.

    Loads the data through ``loadSets`` (the second returned set is
    discarded), splits the first set twice with ``splitDataset`` into a
    training part, a testing part and a validation part, then:

    1. fits an ``ExtraTreesRegressor`` for every ``n_estimators`` in
       ``range(2000, 3001, 1000)`` (i.e. 2000 and 3000) and prints its RMSLE
       on the testing part;
    2. prints the best size found;
    3. refits an ensemble of that size and prints its RMSLE on the
       validation part.

    Returns:
        None. All results are printed to standard output.

    Raises:
        ValueError: if no candidate produced a finite RMSLE, so there is no
            valid ensemble size to refit with.

    NOTE(review): this file defines another function that is also called
    ``estimate``; only the definition that appears last is reachable by name.
    """
    from loadData import loadSets
    from helper import splitDataset, separateTargetFromTrain
    from sklearn.ensemble import ExtraTreesRegressor
    import numpy as np
    import math

    def rmsle_of(predicted, actual):
        # Root mean squared logarithmic error.
        # log1p(x) equals log(x + 1) but keeps precision for small x.
        squared_log_error = np.square(np.log1p(predicted) - np.log1p(actual))
        return math.sqrt(np.mean(squared_log_error))

    def fit_and_score(n_estimators, features, target):
        # Fit a fresh ensemble on the training part and score it on the
        # given hold-out part.
        model = ExtraTreesRegressor(n_estimators=n_estimators, n_jobs=-1)
        model.fit(trainingFeatures, trainingTarget)
        return rmsle_of(model.predict(features), target)

    def report(*parts):
        # One parenthesised argument prints the same space-separated text
        # under the Python 2 print statement and the Python 3 print function.
        print(" ".join(str(part) for part in parts))

    trainingSet, testingSet = loadSets()
    testingSet = None  # not used here; released right away

    # presumably a 60/20/20 train/test/validation split -- confirm against
    # helper.splitDataset
    trainingData, testingData = splitDataset(trainingSet, 0.6)
    testingData, validationData = splitDataset(testingData, 0.5)
    trainingSet = None

    trainingTarget, trainingFeatures = separateTargetFromTrain(trainingData)
    testingTarget, testingFeatures = separateTargetFromTrain(testingData)
    validationTarget, validationFeatures = separateTargetFromTrain(validationData)

    testingTarget = testingTarget.values
    validationTarget = validationTarget.values

    # Release the intermediate frames before the memory-hungry fitting loop.
    trainingData = None
    testingData = None
    validationData = None

    # Start from +inf rather than a magic threshold: any finite score must be
    # able to win, otherwise best_i would stay unset and the final refit
    # would be attempted with an invalid number of trees.
    best_rmsle = float("inf")
    best_i = None

    for i in range(2000, 3001, 1000):
        rmsle = fit_and_score(i, testingFeatures, testingTarget)
        report(i, " estimators: ", rmsle)

        if rmsle < best_rmsle:
            best_rmsle = rmsle
            best_i = i

    if best_i is None:
        raise ValueError("no candidate ensemble size produced a finite RMSLE")

    report("Best: ", best_i, " estimators with rmsle: ", best_rmsle)

    # The winning size is judged on data that played no part in selecting it.
    rmsle = fit_and_score(best_i, validationFeatures, validationTarget)

    report("Final model cost: ", rmsle)