Пример #1
0
def createEnsembleBasedODifferentTrainingSets(maxMargin=878000, numberOfSplits=4):
    """Train one classifier per margin pair and merge their predictions.

    The range ``[0, maxMargin]`` is divided into ``numberOfSplits``
    consecutive ``(start, end)`` margin pairs. Each pair is passed as the
    ``margins`` keyword to ``mainScript.trainClassifierOnTrainingData``
    (the calls are dispatched through joblib's ``Parallel``). Every
    resulting classifier is then applied to every serialized test batch,
    each per-classifier CSV is post-processed, and finally
    ``createEnsembleBasedOnExitingPredictions`` is called on the CSV glob
    of the ensemble output directory.

    Args:
        maxMargin: Upper bound of the margin range. Presumably the number
            of available training rows -- TODO confirm against the
            training-data loader.
        numberOfSplits: Number of margin pairs, and therefore of
            classifiers, to produce.

    Raises:
        ValueError: If ``numberOfSplits`` is smaller than 1.
    """
    if numberOfSplits < 1:
        raise ValueError("numberOfSplits must be at least 1")

    # constructing the limits: numberOfSplits + 1 evenly spaced integer
    # boundaries, paired up as (boundary[i], boundary[i + 1])
    margins = np.linspace(0, maxMargin, numberOfSplits + 1, dtype=int)
    marginTuples = list(zip(margins[:-1], margins[1:]))

    # training classifiers
    allClassifiers = Parallel(n_jobs=-1)(
        delayed(mainScript.trainClassifierOnTrainingData)(margins=marginTuple)
        for marginTuple in marginTuples
    )

    # Predicting on batch test data
    for batchIndex in range(utils.numberOfPartitions):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Predicting batch {}".format(batchIndex))
        miniTestData = dataReader.getSerializedMiniTestData(batchIndex)

        # Only the first element of the returned pair is needed here.
        xTest, _ = mainScript.constructTestData(miniTestData)

        for classifierIndex, currentClassifier in enumerate(allClassifiers):
            # NOTE(review): presumably this writes to the out<index>.csv
            # files post-processed below -- confirm in the helper.
            constructPredictionWithOutput(currentClassifier, classifierIndex, xTest, batchIndex)

    # post process: one output file per trained classifier
    print("Post processing everything...")
    for index in range(len(allClassifiers)):
        outputFileName = "data\\ensembleTraining\\out" + str(index) + ".csv"
        dataReader.postProcessCsv(outputFileName=outputFileName)

    # Merging everything together
    print("Merging all solutions....")
    fileRegex = "data\\ensembleTraining\\*.csv"
    createEnsembleBasedOnExitingPredictions(fileRegex=fileRegex)
Пример #2
0
def predictForSubmission():
    """Train a classifier, predict every test partition, and write the output.

    Calls ``trainClassifierOnTrainingData`` once, then for each of the
    ``utils.numberOfPartitions`` serialized test partitions builds the test
    matrix, computes ``predict_proba`` and hands the result to
    ``dataReader.writePredToCsv``. Afterwards ``dataReader.postProcessCsv``
    is called and the total elapsed wall-clock time is printed.
    """
    allAlgorithmStartTime = time.time()

    # NOTE(review): -1 is forwarded unchanged; presumably it means
    # "use all training examples" -- confirm in trainClassifierOnTrainingData.
    numberOfTrainingExamples = -1
    classifier = trainClassifierOnTrainingData(numberOfTrainingExamples)

    # Single-argument print(...) behaves the same on Python 2 and 3 and
    # matches the call form already used for the run-time message below.
    print("Beginning to load test data...")

    for index in range(utils.numberOfPartitions):
        miniTestData = dataReader.getSerializedMiniTestData(index)

        # Only the first element of the returned pair is needed here.
        xTest, _ = constructTestData(miniTestData)

        print("Predicting...")
        yPred = classifier.predict_proba(xTest)

        dataReader.writePredToCsv(yPred, index)

    print("Post processing...")
    dataReader.postProcessCsv()
    print("Total run time:{}".format(time.time() - allAlgorithmStartTime))