示例#1
0
文件: test.py 项目: ktrnka/galaxyzoo
def main():
    """Predict on the testing data with a pickled model and save a CSV.

    Reads command-line options via ``parseArgs()``, loads the model from
    ``args.model``, optionally applies the PCA transform loaded from
    ``args.pca`` to the testing features, and writes one row per example
    (label followed by the predictions) to ``args.outFile``. The header
    line is produced by ``loadHeader(args.headersFrom)``.
    """
    args = parseArgs()

    # Pickles are binary data: open in binary mode so the stream is not
    # mangled by newline translation and so non-ASCII protocols load.
    # NOTE: unpickling can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open(args.model, "rb") as modelIn:
        model = pickle.load(modelIn)

    testingLabels, testingFeatures = filehandlers.loadTestingData(args.testingInput, pca=args.pca)
    if args.pca:
        pca = filehandlers.loadPca(args.pca)
        testingFeatures = pca.transform(testingFeatures)

    predictions = model.predict(testingFeatures)

    # Reshape the labels into a single column so they can be stacked to
    # the left of the prediction matrix.
    testingLabels.shape = (predictions.shape[0], 1)

    # First column is the integer label, the remaining ones are floats.
    columnFormats = ["%d"] + ["%f"] * predictions.shape[1]
    numpy.savetxt(
        args.outFile,
        numpy.hstack([testingLabels, predictions]),
        delimiter=",",
        fmt=columnFormats,
        header=loadHeader(args.headersFrom),
    )

    # numpy.savetxt prefixes the header line with a comment mark, hence
    # the manual clean-up reminder.
    print("Don't forget to delete the comment mark on the first line of output")
示例#2
0
文件: train.py 项目: ktrnka/galaxyzoo
def runTests(filename, pcaFile=None):
    """Print baseline scores and run the training experiments on one file.

    Args:
        filename: Path handed to ``filehandlers.loadTrainingSets`` to get
            the training and validation inputs/outputs.
        pcaFile: Optional path handed to ``filehandlers.loadPca``; when
            given, both input matrices are passed through its
            ``transform`` before anything else is computed.

    The function prints several constant-prediction baselines measured on
    the validation outputs, then runs the linear-regression experiment and
    one random-forest experiment (whose model is saved to a pickle file).
    """
    print("TRAINING/TESTING {}".format(filename))

    trainingInputs, trainingOutputs, validationInputs, validationOutputs = filehandlers.loadTrainingSets(filename)
    if pcaFile:
        pca = filehandlers.loadPca(pcaFile)
        trainingInputs = pca.transform(trainingInputs)
        validationInputs = pca.transform(validationInputs)

    numFeatures = trainingInputs.shape[1]
    print("Loaded training data with shape {} and {}".format(trainingInputs.shape, trainingOutputs.shape))
    print("Loaded validation data with shape {} and {}".format(validationInputs.shape, validationOutputs.shape))

    # Per-output mean over the training examples.
    avgPredictions = trainingOutputs.mean(axis=0)
    numValidation = validationOutputs.shape[0]
    flatValidation = validationOutputs.ravel()

    print("Baselines")
    print("\tPredict zero on all outputs (val): {}".format(rms(0, flatValidation)))
    print("\tPredict one on all outputs (val): {}".format(rms(1, flatValidation)))

    # ravel() flattens row by row, so the averages must be laid out as
    # [m0, m1, ..., m0, m1, ...]. numpy.tile does that; ndarray.repeat
    # would give [m0, m0, ..., m1, m1, ...] and compare each value with
    # the wrong output column.
    flatAverage = numpy.tile(avgPredictions, numValidation)
    print("\tPredict average on all outputs (val): {}".format(rms(flatAverage, flatValidation)))

    normalizedOnes = normalizePredictions(numpy.ones(validationOutputs.shape))
    print("\tPredict one on all outputs then norm (val): {}".format(rms(normalizedOnes.ravel(), flatValidation)))

    tiledAverage = numpy.tile(avgPredictions, (numValidation, 1))
    print("\tPredict average on all outputs then norm (val): {}".format(rms(normalizePredictions(tiledAverage).ravel(), flatValidation)))

    experimentLinearRegression(trainingInputs, trainingOutputs, validationInputs, validationOutputs)

    # Earlier random-forest runs (20-160 trees, different maxFeatures and
    # minSplit settings) were tried here; only the retained configuration
    # is executed and saved.
    experimentRandomForest(
        trainingInputs,
        trainingOutputs,
        validationInputs,
        validationOutputs,
        40,
        None,
        maxFeatures=int(numFeatures * 0.5),
        minSplit=20,
        saveAsPath="randomForest-40t-0.5f-20mss.pickle",
    )