def main():
    args = parseArgs()
    X, _, _, _ = filehandlers.loadTrainingSets(args.input)

    # A fractional n_components (0 < variance <= 1) tells sklearn's PCA to keep
    # just enough components to explain that fraction of the variance.
    pca = sklearn.decomposition.PCA(args.variance)
    pca.fit(X)

    numComponents = pca.components_.shape[0]
    numFeatures = pca.components_.shape[1]
    print "{:.1f}% ({} / {}) features to retain {}% of the variance".format(
        100. * numComponents / numFeatures, numComponents, numFeatures, 100 * args.variance)

    filehandlers.savePca(args.output, pca)
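# The real parseArgs is defined elsewhere in this file; the sketch below only
# illustrates the interface main() assumes: an --input training-set path, an
# --output path for the fitted PCA, and a --variance fraction in (0, 1].
# The flag names and default shown here are assumptions, not the actual parser.
def parseArgsSketch():
    import argparse
    parser = argparse.ArgumentParser(
        description="Fit a PCA that retains a target fraction of the variance.")
    parser.add_argument("--input", required=True,
        help="training-set file read by filehandlers.loadTrainingSets")
    parser.add_argument("--output", required=True,
        help="path where filehandlers.savePca writes the fitted PCA")
    parser.add_argument("--variance", type=float, default=0.95,
        help="fraction of variance to retain (passed straight to sklearn PCA)")
    return parser.parse_args()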
def runTests(filename, pcaFile=None):
    print "TRAINING/TESTING {}".format(filename)
    trainingInputs, trainingOutputs, validationInputs, validationOutputs = filehandlers.loadTrainingSets(filename)

    # Optionally project the inputs through a previously fitted PCA.
    if pcaFile:
        pca = filehandlers.loadPca(pcaFile)
        trainingInputs = pca.transform(trainingInputs)
        validationInputs = pca.transform(validationInputs)

    numExamples = trainingInputs.shape[0]
    numFeatures = trainingInputs.shape[1]
    print "Loaded training data with shape {} and {}".format(trainingInputs.shape, trainingOutputs.shape)
    print "Loaded validation data with shape {} and {}".format(validationInputs.shape, validationOutputs.shape)

    # Simple baselines to beat: constant predictions and the per-output training average.
    avgPredictions = trainingOutputs.mean(axis=0)
    print "Baselines"
    print "\tPredict zero on all outputs (val): {}".format(rms(0, validationOutputs.ravel()))
    print "\tPredict one on all outputs (val): {}".format(rms(1, validationOutputs.ravel()))
    # tile repeats the per-output averages once per example, so the raveled
    # prediction and output vectors stay aligned element by element.
    print "\tPredict average on all outputs (val): {}".format(rms(numpy.tile(avgPredictions, validationOutputs.shape[0]), validationOutputs.ravel()))
    print "\tPredict one on all outputs then norm (val): {}".format(rms(normalizePredictions(numpy.ones(validationOutputs.shape)).ravel(), validationOutputs.ravel()))
    tiledAverage = numpy.tile(avgPredictions, (validationOutputs.shape[0], 1))
    print "\tPredict average on all outputs then norm (val): {}".format(rms(normalizePredictions(tiledAverage).ravel(), validationOutputs.ravel()))

    experimentLinearRegression(trainingInputs, trainingOutputs, validationInputs, validationOutputs)

    # Earlier random-forest sweeps, kept for reference:
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 20, 5)
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 20, 5, maxFeatures="auto")
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 20, None)
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 20, None, maxFeatures=int(numFeatures * 0.75))
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 40, None, maxFeatures=int(numFeatures * 0.5))
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 80, None, maxFeatures=int(numFeatures * 0.5))
    #experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 160, None, maxFeatures=int(numFeatures * 0.5))
    #for numTrees in [ 40, 80, 160 ]:
    #    experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, numTrees, None, maxFeatures=int(numFeatures * 0.5), minSplit=5)

    experimentRandomForest(trainingInputs, trainingOutputs, validationInputs, validationOutputs, 40, None,
        maxFeatures=int(numFeatures * 0.5), minSplit=20, saveAsPath="randomForest-40t-0.5f-20mss.pickle")
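# The helpers rms and normalizePredictions are not shown in this excerpt; the
# sketches below are assumptions inferred from how runTests calls them, not the
# actual implementations. Names carry a "Sketch" suffix to make that explicit.
import numpy  # already imported by the surrounding module; repeated so the sketches stand alone

def rmsSketch(predictions, actuals):
    # Root-mean-square error; numpy broadcasting lets predictions be a scalar
    # (as in rms(0, ...) and rms(1, ...) above) or an array shaped like actuals.
    return numpy.sqrt(numpy.mean((numpy.asarray(predictions, dtype=float) - actuals) ** 2))

def normalizePredictionsSketch(predictions):
    # One plausible reading of normalizePredictions: rescale each example's row
    # of outputs so it sums to 1 before comparing against the targets.
    return predictions / predictions.sum(axis=1, keepdims=True)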