Example #1
import sys
import numpy as np

def main():
    mode = sys.argv[1]
    print "mode = ", mode
    
    train_data = np.array([[0, 0], [0, 1], [1,0], [1,1]])
    train_label = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    wHid, wOut = train(train_data, train_label, mode)
    test_data = np.array([[0,0], [0,1], [1,0], [1,1]])
    test(test_data, wHid, wOut)
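
train and test are not shown in this snippet; purely as a hedged sketch (the sigmoid activation and the omission of bias terms are assumptions, not taken from the source), a two-layer forward pass consistent with the wHid/wOut names could look like this:

import numpy as np

def forwardPass(data, wHid, wOut):
    # Hidden layer followed by output layer; the activation choice is assumed.
    sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
    hidden = sigmoid(data.dot(wHid))
    return sigmoid(hidden.dot(wOut))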
Example #2
def runVisPrior(trainData, testData, questionType, visModel, questionDict, questionIdict, numAns, delta):
    objDict, objIdict = buildObjDict(trainData, questionType, questionDict, questionIdict)

    count_wa, count_a = trainCount(trainData, questionType, questionDict, questionIdict, objDict, objIdict, numAns)
    print count_wa

    # Reindex test set
    testInput = testData[0]
    testTarget = testData[1]
    testTargetReshape = testTarget.reshape(testTarget.size)
    testObjId = reindexObjId(testInput, objDict, questionDict, questionIdict, questionType)

    # Run vis model on test set
    testOutput = nn.test(visModel, testInput)

    print "Before Prior Test Accuracy:",
    rate, _, __ = calcRate(testOutput, testTarget)
    print rate

    # Run on test set
    visPriorOutput = runVisPriorOnce(testObjId, count_wa, count_a, testOutput, delta)
    print "delta=%f Test Accuracy:" % delta,
    rate, _, __ = calcRate(visPriorOutput, testTarget)
    print rate
    return visPriorOutput
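
runVisPriorOnce, trainCount, buildObjDict, and reindexObjId are defined elsewhere; the following is only a rough sketch of what the prior-combination step might look like, assuming count_wa holds per-object answer counts, count_a the per-object totals, and delta an additive smoothing constant (all of this is inferred, not taken from the source):

import numpy as np

def runVisPriorOnceSketch(objId, count_wa, count_a, visOutput, delta):
    # Smoothed answer prior conditioned on the question's object,
    # combined multiplicatively with the visual model's output.
    numAns = visOutput.shape[1]
    prior = (count_wa[objId] + delta) / (count_a[objId][:, np.newaxis] + delta * numAns)
    combined = prior * visOutput
    return combined / combined.sum(axis=1, keepdims=True)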
Example #3
def testAll(taskId, model, dataFolder, resultsFolder):
    testAnswerFile = os.path.join(resultsFolder, taskId,
                                  '%s.test.o.txt' % taskId)
    testTruthFile = os.path.join(resultsFolder, taskId,
                                 '%s.test.t.txt' % taskId)
    testDataFile = os.path.join(dataFolder, 'test.npy')
    vocabDictFile = os.path.join(dataFolder, 'vocab-dict.npy')
    vocabDict = np.load(vocabDictFile)
    testData = np.load(testDataFile)
    inputTest = testData[0]
    outputTest = nn.test(model, inputTest)
    targetTest = testData[1]
    questionArray = vocabDict[1]
    answerArray = vocabDict[3]
    print len(answerArray)
    print outputTest.shape
    outputTxt(outputTest, targetTest, answerArray, testAnswerFile,
              testTruthFile)
    resultsRank = calcPrecision(outputTest, targetTest)
    correct, total = calcRate(inputTest, outputTest, targetTest, questionArray)
    resultsCategory = correct / total.astype(float)
    resultsFile = os.path.join(resultsFolder, taskId, 'result.txt')
    resultsWups = runWups(testAnswerFile, testTruthFile)
    with open(resultsFile, 'w') as f:
        f.write('rate @ 1: %.4f\n' % resultsRank[0])
        f.write('rate @ 5: %.4f\n' % resultsRank[1])
        f.write('rate @ 10: %.4f\n' % resultsRank[2])
        f.write('object: %.4f\n' % resultsCategory[0])
        f.write('number: %.4f\n' % resultsCategory[1])
        f.write('color: %.4f\n' % resultsCategory[2])
        f.write('scene: %.4f\n' % resultsCategory[3])
        f.write('WUPS 1.0: %.4f\n' % resultsWups[0])
        f.write('WUPS 0.9: %.4f\n' % resultsWups[1])
        f.write('WUPS 0.0: %.4f\n' % resultsWups[2])
Example #4
def testAll(taskId, model, dataFolder, resultsFolder):
    testAnswerFile = os.path.join(resultsFolder, taskId, '%s.test.o.txt' % taskId)
    testTruthFile = os.path.join(resultsFolder, taskId, '%s.test.t.txt' % taskId)
    testDataFile = os.path.join(dataFolder, 'test.npy')
    vocabDictFile = os.path.join(dataFolder, 'vocab-dict.npy')
    vocabDict = np.load(vocabDictFile)
    testData = np.load(testDataFile)
    inputTest = testData[0]
    outputTest = nn.test(model, inputTest)
    targetTest = testData[1]
    questionArray = vocabDict[1]
    answerArray = vocabDict[3]
    print len(answerArray)
    print outputTest.shape
    outputTxt(outputTest, targetTest, answerArray, testAnswerFile, testTruthFile)
    resultsRank = calcPrecision(outputTest, targetTest)
    correct, total = calcRate(inputTest, outputTest, targetTest, questionArray)
    resultsCategory = correct / total.astype(float)
    resultsFile = os.path.join(resultsFolder, taskId, 'result.txt')
    resultsWups = runWups(testAnswerFile, testTruthFile)
    with open(resultsFile, 'w') as f:
        f.write('rate @ 1: %.4f\n' % resultsRank[0])
        f.write('rate @ 5: %.4f\n' % resultsRank[1])
        f.write('rate @ 10: %.4f\n' % resultsRank[2])
        f.write('object: %.4f\n' % resultsCategory[0])
        f.write('number: %.4f\n' % resultsCategory[1])
        f.write('color: %.4f\n' % resultsCategory[2])
        f.write('scene: %.4f\n' % resultsCategory[3])
        f.write('WUPS 1.0: %.4f\n' % resultsWups[0])
        f.write('WUPS 0.9: %.4f\n' % resultsWups[1])
        f.write('WUPS 0.0: %.4f\n' % resultsWups[2])
Example #5
def testAll(
            modelId, 
            model, 
            dataFolder, 
            resultsFolder):
    testAnswerFile = getAnswerFilename(modelId, resultsFolder)
    testTruthFile = getTruthFilename(modelId, resultsFolder)
    data = loadDataset(dataFolder)
    outputTest = nn.test(model, data['testData'][0])
    rate, correct, total = nn.calcRate(model, outputTest, data['testData'][1])
    print 'rate: %.4f' % rate
    resultsRank, \
    resultsCategory, \
    resultsWups = runAllMetrics(
                        data['testData'][0],
                        outputTest,
                        data['testData'][1],
                        data['ansIdict'],
                        data['questionTypeArray'],
                        testAnswerFile,
                        testTruthFile)
    writeMetricsToFile(
                modelId,
                rate,
                resultsRank,
                resultsCategory,
                resultsWups,
                resultsFolder)
    return outputTest
Example #6
def validDelta(trainData, validData, preVisModel, questionDict, questionIdict, numAns, deltas, questionType):
    objDict, objIdict = buildObjDict(trainData, questionType, questionDict, questionIdict)
    count_wa, count_a = trainCount(trainData, questionType, questionDict, questionIdict, objDict, objIdict, numAns)
    print count_wa

    # Reindex valid set
    validInput = validData[0]
    validTarget = validData[1]
    validTargetReshape = validTarget.reshape(validTarget.size)
    validObjId = reindexObjId(validInput, objDict, questionDict, questionIdict, questionType)

    # Run vis model on valid set
    validOutput = nn.test(preVisModel, validInput)
    print "Before Prior Valid Accuracy:",
    rate, _, __ = calcRate(validOutput, validTarget)
    print rate

    # Determine best delta
    bestRate = 0.0
    bestDelta = 0.0
    for delta in deltas:
        visPriorOutput = runVisPriorOnce(validObjId, count_wa, count_a, validOutput, delta)
        print "delta=%f Valid Accuracy:" % delta,
        rate, _, __ = calcRate(visPriorOutput, validTarget)
        print rate
        if rate > bestRate:
            bestRate = rate
            bestDelta = delta
    print "Best Delta:", bestDelta
    return bestDelta
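
A hedged usage sketch of how these helpers would typically be chained, with a validation-tuned delta applied to the test split (the variable names and the delta grid here are illustrative, not from the source):

deltas = [1e-6, 1e-4, 1e-2, 1e-1, 1.0]
bestDelta = validDelta(trainData, validData, visModel, questionDict,
                       questionIdict, numAns, deltas, questionType)
priorTestOutput = runVisPrior(trainData, testData, questionType, visModel,
                              questionDict, questionIdict, numAns, bestDelta)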
Example #7
def validDelta(
                trainData,
                validData,
                preVisModel,
                questionDict,
                questionIdict,
                numAns,
                deltas,
                questionType):
    objDict, objIdict = buildObjDict(
                                trainData,
                                questionType,
                                questionDict,
                                questionIdict)
    count_wa, count_a = trainCount(
                                trainData, 
                                questionType,
                                questionDict,
                                questionIdict,
                                objDict,
                                objIdict,
                                numAns)
    print count_wa

    # Reindex valid set
    validInput = validData[0]
    validTarget = validData[1]
    validTargetReshape = validTarget.reshape(validTarget.size)
    validObjId = reindexObjId(
                                validInput, 
                                objDict, 
                                questionDict, 
                                questionIdict, 
                                questionType)

    # Run vis model on valid set
    validOutput = nn.test(preVisModel, validInput)
    print 'Before Prior Valid Accuracy:',
    rate, _, __ = calcRate(validOutput, validTarget)
    print rate

    # Determine best delta
    bestRate = 0.0
    bestDelta = 0.0
    for delta in deltas:
        visPriorOutput = runVisPriorOnce(
                                validObjId, 
                                count_wa, 
                                count_a, 
                                validOutput, 
                                delta)        
        print 'delta=%f Valid Accuracy:' % delta,
        rate, _, __ = calcRate(visPriorOutput, validTarget)
        print rate
        if rate > bestRate:
            bestRate = rate
            bestDelta = delta
    print 'Best Delta:', bestDelta
    return bestDelta
Example #8
def runAllModels(
                inputTest, 
                questionTypeArray, 
                modelSpecs,
                resultsFolder,
                dataset,
                dataFolder):
    allOutputs = []
    for modelSpec in modelSpecs:
        if modelSpec['isClassEnsemble']:
            print 'Running test data on ensemble model %s...' \
                    % modelSpec['name']
            models = loadEnsemble(modelSpec['id'].split(','), resultsFolder)
            classDataFolders = getClassDataFolders(dataset, dataFolder)
            if modelSpec['runPrior']:
                outputTest = runEnsemblePrior(
                                    inputTest, 
                                    models,
                                    dataFolder,
                                    classDataFolders,
                                    questionTypeArray)
            else:
                outputTest = runEnsemble(
                                    inputTest, 
                                    models,
                                    dataFolder,
                                    classDataFolders,
                                    questionTypeArray)
        elif modelSpec['isAverageEnsemble']:
            modelOutputs = []
            for modelId in modelSpec['id'].split(','):
                model = it.loadModel(modelId, resultsFolder)
                modelOutputs.append(nn.test(model, inputTest))
            outputTest = np.zeros(modelOutputs[0].shape)
            for output in modelOutputs:
                shape0 = min(outputTest.shape[0], output.shape[0])
                shape1 = min(outputTest.shape[1], output.shape[1])
                outputTest[:shape0, :shape1] += output[:shape0, :shape1] / \
                    float(len(modelOutputs))
        else:
            print 'Running test data on model %s...' \
                    % modelSpec['name']
            model = it.loadModel(modelSpec['id'], resultsFolder)
            outputTest = nn.test(model, inputTest)
        allOutputs.append(outputTest)
    return allOutputs
Example #9
def runAvgAll(models, data):
    # Average the test-set outputs of several models; `models` is assumed
    # to be a sequence of (modelId, model) pairs.
    modelOutputs = []
    for modelId, model in models:
        print 'Running model %s' % modelId
        modelOutput = nn.test(model, data['testData'][0])
        modelOutputs.append(modelOutput)
    finalOutput = np.zeros(modelOutputs[0].shape)
    for output in modelOutputs:
        shape0 = min(finalOutput.shape[0], output.shape[0])
        shape1 = min(finalOutput.shape[1], output.shape[1])
        finalOutput[:shape0, :shape1] += output[:shape0, :shape1] / float(len(modelOutputs))
    return finalOutput
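
Under the assumption made above that models is a sequence of (modelId, model) pairs, a usage sketch (with modelIds and models built as in Example #17 below) might be:

modelPairs = zip(modelIds, models)
finalOutput = runAvgAll(modelPairs, data)
rate, _, _ = nn.calcRate(models[0], finalOutput, data['testData'][1])
print('Averaged ensemble rate: %.4f' % rate)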
Example #10
def runVisPrior(
                trainData,
                testData,
                questionType,
                visModel,
                questionDict,
                questionIdict,
                numAns,
                delta):
    objDict, objIdict = buildObjDict(
                                trainData, 
                                questionType,
                                questionDict,
                                questionIdict)

    count_wa, count_a = trainCount(
                                trainData, 
                                questionType,
                                questionDict,
                                questionIdict,
                                objDict,
                                objIdict,
                                numAns)
    print count_wa

    # Reindex test set
    testInput = testData[0]
    testTarget = testData[1]
    testTargetReshape = testTarget.reshape(testTarget.size)
    testObjId = reindexObjId(
                                testInput, 
                                objDict, 
                                questionDict, 
                                questionIdict, 
                                questionType)

    # Run vis model on test set
    testOutput = nn.test(visModel, testInput)

    print 'Before Prior Test Accuracy:',
    rate, _, __ = calcRate(testOutput, testTarget)
    print rate
    
    # Run on test set
    visPriorOutput = runVisPriorOnce(
                                testObjId, 
                                count_wa, 
                                count_a, 
                                testOutput, 
                                delta)
    print 'delta=%f Test Accuracy:' % delta,
    rate, _, __ = calcRate(visPriorOutput, testTarget)
    print rate
    return visPriorOutput
Example #11
def runTests(params, model, trainer):
    if params["testDataFilename"] is not None:
        if params["imageqa"]:
            imageqa_test.testAll(trainer.name, model, params["dataFolder"], params["outputFolder"])
        else:
            testData = np.load(params["testDataFilename"])
            testInput = testData[0]
            testTarget = testData[1]
            model.loadWeights(np.load(trainer.modelFilename))
            testOutput = nn.test(model, testInput)
            testRate, c, t = nn.calcRate(model, testOutput, testTarget)
            print "Test rate: ", testRate
            with open(os.path.join(trainer.outputFolder, "result.txt"), "w+") as f:
                f.write("Test rate: %f\n" % testRate)
Example #12
def run(c, filename='', genre=''):
    if c == 'normal':
        setup = Setup()
        if not setup.check():
            exit()
        setup.initData()
        setup.safeKeep(setup.X, 'trainvec.pkl')
        setup.safeKeep(setup.testX, 'testvec.pkl')
        setup.safeKeep(setup.testY, 'testlabel.pkl')
        setup.safeKeep(setup.Y, 'labels.pkl')
    elif c == 'nn':
        return test(calc_mfcc(filename))
    elif c == 'reco':
        return runreco(calc_mfcc(filename), genre)
Example #13
def runTests(params, model, trainer):
    if params['testDataFilename'] is not None:
        if params['imageqa']:
            imageqa_test.testAll(trainer.name, model, params['dataFolder'],
                                 params['outputFolder'])
        else:
            testData = np.load(params['testDataFilename'])
            testInput = testData[0]
            testTarget = testData[1]
            model.loadWeights(np.load(trainer.modelFilename))
            testOutput = nn.test(model, testInput)
            testRate, c, t = nn.calcRate(model, testOutput, testTarget)
            print 'Test rate: ', testRate
            with open(os.path.join(trainer.outputFolder, 'result.txt'),
                      'w+') as f:
                f.write('Test rate: %f\n' % testRate)
Example #14
def __runEnsemble(
                inputTest,
                models,
                ansDict,
                classAnsIdict,
                questionTypeArray):
    allOutput = []
    for i, model in enumerate(models):
        print 'Running test data on model #%d...' % i
        outputTest = nn.test(model, inputTest)
        allOutput.append(outputTest)
    ensembleOutputTest = np.zeros((inputTest.shape[0], len(ansDict)))
    for n in range(allOutput[0].shape[0]):
        qtype = questionTypeArray[n]
        output = allOutput[qtype]
        for i in range(output.shape[1]):
            ansId = ansDict[classAnsIdict[qtype][i]]
            ensembleOutputTest[n, ansId] = output[n, i]
    return ensembleOutputTest
Example #15
def runEnsemblePrior(
                        inputTest,
                        models, 
                        dataFolder, 
                        classDataFolders,
                        questionTypeArray):
    """
    Similar to "testEnsemble" in imageqa_test.
    Run visprior on number, color, and location questions.
    """
    data = it.loadDataset(dataFolder)
    numAns = len(data['ansIdict'])
    outputTest = np.zeros((inputTest.shape[0], numAns))
    count = 0

    allOutput = []
    ensembleOutputTest = np.zeros((inputTest.shape[0], numAns))
    classAnsIdict = []

    for i, model in enumerate(models):
        data_m = it.loadDataset(classDataFolders[i])
        classAnsIdict.append(data_m['ansIdict'])
        tvData_m = ip.combineTrainValid(data_m['trainData'], data_m['validData'])
        print 'Running test data on model #%d...' % i
        if i == 0:
            # Object questions
            print 'No prior'
            outputTest = nn.test(model, data_m['testData'][0])
            print 'Accuracy:',
            print ip.calcRate(outputTest, data_m['testData'][1])
        elif i == 1 or i == 2 or i == 3:
            # Number and color and location questions
            print 'Prior'
            # Delta is pre-determined
            if i == 1:
                delta = 1e-6
                questionType = "number"
            elif i == 2:
                delta = 5e-4
                questionType = "color"
            elif i == 3:
                delta = 1.0
                questionType = "location"
            outputTest = ip.runVisPrior(
                                tvData_m,
                                data_m['testData'],
                                questionType,
                                model,
                                data_m['questionDict'],
                                data_m['questionIdict'],
                                len(data_m['ansIdict']),
                                delta)
        allOutput.append(outputTest)
    counter = [0, 0, 0, 0]
    for n in range(inputTest.shape[0]):
        qtype = questionTypeArray[n]
        output = allOutput[qtype]
        for i in range(output.shape[1]):
            ansId = data['ansDict'][classAnsIdict[qtype][i]]
            ensembleOutputTest[n, ansId] = output[counter[qtype], i]
        counter[qtype] += 1
    return ensembleOutputTest
Example #16
        ansDict_m = data_m['ansDict']
        ansIdict = data['ansIdict']
        questionDict_m = data_m['questionDict']
        questionIdict = data['questionIdict']

        newTestInput = np.zeros(testInput.shape, dtype='int')
        for n in range(testInput.shape[0]):
            newTestInput[n, 0, 0] = testInput[n, 0, 0]
            for t in range(1, testInput.shape[1]):
                if testInput[n, t, 0] != 0:
                    word = questionIdict[testInput[n, t, 0] - 1]
                    newTestInput[n, t, 0] = questionDict_m[word]
                else:
                    break
        mainModel = it.loadModel(mainModelId, resultsFolder)
        mainTestOutput = nn.test(mainModel, newTestInput)

        # Need to extract the class output from mainTestOutput
        classNewId = []
        for ans in ansIdict:
            classNewId.append(ansDict_m[ans])
        classNewId = np.array(classNewId, dtype='int')
        mainTestOutput = mainTestOutput[:, classNewId]

        for i in range(len(ansIdict)):
            mixRatio = i / 10.0
            ensTestOutput = mixRatio * visTestOutput + \
                (1 - mixRatio) * mainTestOutput
            print '%.2f VIS+PRIOR & %.2f VIS+BLSTM Accuracy:' % \
                (mixRatio, 1 - mixRatio),
            rate, _, __ = calcRate(ensTestOutput, testTarget)
Example #17
    validModels = []
    for modelId in modelIds:
        print 'Loading model %s' % modelId
        models.append(it.loadModel(modelId, resultsFolder))
    for modelId in validModelIds:
        print 'Loading model %s' % modelId
        validModels.append(it.loadModel(modelId, resultsFolder))

    modelOutputs = []
    validModelOutputs = []
    # for modelId, model in zip(validModelIds, validModels):
    #     print 'Running model %s' % modelId
    #     modelOutput = nn.test(model, data['validData'][0])
    #     validModelOutputs.append(modelOutput)
    # 
    # mixRatios = np.arange(0, 11) * 0.1
    # bestMixRatio = validAvg(validModelOutputs, mixRatios, data['validData'][1])
    # print 'Best ratio found: %.4f' % bestMixRatio
    bestMixRatio = 0.5
    shape = None
    for modelId, model in zip(modelIds, models):
        print 'Running model %s' % modelId
        modelOutput = nn.test(model, data['testData'][0])
        if shape is None:
            shape = modelOutput.shape
        else:
            modelOutput = modelOutput[:shape[0],:shape[1]]
        modelOutputs.append(modelOutput)

    testAvgAll(modelOutputs, bestMixRatio, data, outputFolder)
Example #18
    for i in range(0, 10):
        trainInput_, trainTarget_, testInput_, testTarget_ = \
        vt.splitData(trainInput, trainTarget, 0.1, i)
        trainOpt['heldOutRatio'] = 0.1
        trainOpt['xvalidNo'] = 0
        trainOpt['needValid'] = True

        model = nn.load(modelFilename)
        trainer = nn.Trainer(
            name=name + ('-%d-v' % i),
            model=model,
            trainOpt=trainOpt,
            outputFolder=outputFolder
        )
        trainer.train(trainInput_, trainTarget_)

        # Train again with all data, without validation
        trainOpt['needValid'] = False
        trainOpt['numEpoch'] = trainer.stoppedEpoch + 1
        trainer = nn.Trainer(
            name=name + ('-%d' % i),
            model=model,
            trainOpt=trainOpt,
            outputFolder=outputFolder
        )
        trainer.train(trainInput_, trainTarget_)
        testOutput = nn.test(model, testInput_)
        testRate, correct, total = nn.calcRate(model, testOutput, testTarget_)
        with open(os.path.join(trainer.outputFolder, 'result.txt'), 'w+') as f:
            f.write('Test rate: %f' % testRate)
Example #19
import sys
import matplotlib.pyplot as plt
import nn

learnRate = 0.1

if len(sys.argv) > 1:  #use given settings
    dropout = float(sys.argv[1])
    momentum = float(sys.argv[2])
    decay = float(sys.argv[3])

    (model, optimizer, criterion) = nn.init(learnRate,
                                            momentum=momentum,
                                            dropout=dropout,
                                            wd=decay)
    (accv, errv, losst) = nn.fullTrain(model, optimizer, criterion)
    testAccuracy = nn.test(model)

    print('Test Accuracy:\t' + str(testAccuracy.item() * 100.) + "%\n")

    fig, ax1 = plt.subplots()

    ax1.set_xlabel('Epochs')
    ax1.set_ylabel('Validation Error', color='tab:red')
    ax1.plot(errv, color='tab:red')
    ax1.tick_params(axis='y', labelcolor='tab:red')

    ax2 = ax1.twinx()

    ax2.set_ylabel('Training Loss', color='tab:blue')
    ax2.plot(losst, color='tab:blue')
    ax2.tick_params(axis='y', labelcolor='tab:blue')
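
The snippet is cut off before the figure is rendered; a typical way to finish it (whether to display or save the figure is not specified in the source) would be:

    fig.tight_layout()
    plt.show()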
Example #20
def test(d, nn=classifier_nn, knn=classifier_knn):
    return nn.test(d), knn.test(d)
Example #21
def test(d,nn=classifier_nn,knn=classifier_knn):
	return nn.test(d), knn.test(d)
Example #22
        ansDict_m = data_m["ansDict"]
        ansIdict = data["ansIdict"]
        questionDict_m = data_m["questionDict"]
        questionIdict = data["questionIdict"]

        newTestInput = np.zeros(testInput.shape, dtype="int")
        for n in range(testInput.shape[0]):
            newTestInput[n, 0, 0] = testInput[n, 0, 0]
            for t in range(1, testInput.shape[1]):
                if testInput[n, t, 0] != 0:
                    word = questionIdict[testInput[n, t, 0] - 1]
                    newTestInput[n, t, 0] = questionDict_m[word]
                else:
                    break
        mainModel = it.loadModel(mainModelId, resultsFolder)
        mainTestOutput = nn.test(mainModel, newTestInput)

        # Need to extract the class output from mainTestOutput
        classNewId = []
        for ans in ansIdict:
            classNewId.append(ansDict_m[ans])
        classNewId = np.array(classNewId, dtype="int")
        mainTestOutput = mainTestOutput[:, classNewId]

        for i in range(len(ansIdict)):
            mixRatio = i / 10.0
            ensTestOutput = mixRatio * visTestOutput + (1 - mixRatio) * mainTestOutput
            print "%.2f VIS+PRIOR & %.2f VIS+BLSTM Accuracy:" % (mixRatio, 1 - mixRatio),
            rate, _, __ = calcRate(ensTestOutput, testTarget)
            print rate
Example #23
print "Training is starting"
for i in xrange(K):
    print " Case #" + str(i + 1) + ":"
    layer = train(data[i][3], data[i][4], data[i][5], feature_size, num_nodes,
                  0.005, 300)
    accur = accuracy(data[i][0], data[i][1], data[i][2], layer, num_nodes)
    layers.append(layer)
    vl_acc.append(accur)
    print "  Validation accuracy: " + str(accur)
tr_acc = [
    accuracy(training_data, training_label, training_size, x, num_nodes)
    for x in layers
]
print "Training is done"

print "Testing is starting"
max_ac = max(vl_acc)
ind_ac = 0
for i in xrange(K):
    if (vl_acc[i] == max_ac):
        ind_ac = i
        break
training_accuracy = accuracy(training_data, training_label, training_size,
                             layers[ind_ac], num_nodes)
test_output = test(test_data, layers[ind_ac], num_nodes)
print " Training accuracy: " + str(training_accuracy)
submission = np.array([[i + 1, test_output[i] - (1 - test_output[i])]
                       for i in xrange(test_size)])
np.savetxt('../data/submission_6.csv', submission, delimiter=',')
print "Testing is done"
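
K_fold is not shown; judging from how data[i] is indexed above, each fold appears to carry the held-out split in slots 0-2 and the training split in slots 3-5. A minimal sketch under that assumption:

import numpy as np

def K_fold_sketch(training_data, training_size, training_label, K):
    # For fold i, slots 0-2 hold (valid data, valid labels, valid size) and
    # slots 3-5 hold (train data, train labels, train size).
    splits = np.array_split(np.arange(training_size), K)
    folds = []
    for i in range(K):
        valid_idx = splits[i]
        train_idx = np.concatenate([splits[j] for j in range(K) if j != i])
        folds.append((training_data[valid_idx], training_label[valid_idx], len(valid_idx),
                      training_data[train_idx], training_label[train_idx], len(train_idx)))
    return folds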
Example #24
    return result

K = 10
data = K_fold(training_data, training_size, training_label, K)
layers = []
vl_acc = []
num_nodes = 20

print "Training is starting"
for i in xrange(K):
    print " Case #" + str(i + 1) + ":"
    layer = train(data[i][3], data[i][4], data[i][5], feature_size, num_nodes, 0.005, 300)
    accur = accuracy(data[i][0], data[i][1], data[i][2], layer, num_nodes)
    layers.append(layer)
    vl_acc.append(accur)
    print "  Validation accuracy: " + str(accur)
tr_acc = [accuracy(training_data, training_label, training_size, x, num_nodes) for x in layers]
print "Training is done"

print "Testing is starting"
max_ac = max(vl_acc)
ind_ac = 0
for i in xrange(K):
    if vl_acc[i] == max_ac:
        ind_ac = i
        break
training_accuracy = accuracy(training_data, training_label, training_size, layers[ind_ac], num_nodes)
test_output = test(test_data, layers[ind_ac], num_nodes)
print " Training accuracy: " + str(training_accuracy)
submission = np.array([[i + 1, test_output[i] - (1 - test_output[i])] for i in xrange(test_size)])
np.savetxt('submission_6.csv', submission, delimiter=',')
print "Testing is done"
Example #25
                                             np.random.RandomState(2))
    with open(configFilename) as f:
        trainOpt = yaml.load(f)

    for i in range(0, 10):
        trainInput_, trainTarget_, testInput_, testTarget_ = \
        vt.splitData(trainInput, trainTarget, 0.1, i)
        trainOpt['heldOutRatio'] = 0.1
        trainOpt['xvalidNo'] = 0
        trainOpt['needValid'] = True

        model = nn.load(modelFilename)
        trainer = nn.Trainer(name=name + ('-%d-v' % i),
                             model=model,
                             trainOpt=trainOpt,
                             outputFolder=outputFolder)
        trainer.train(trainInput_, trainTarget_)

        # Train again with all data, without validation
        trainOpt['needValid'] = False
        trainOpt['numEpoch'] = trainer.stoppedEpoch + 1
        trainer = nn.Trainer(name=name + ('-%d' % i),
                             model=model,
                             trainOpt=trainOpt,
                             outputFolder=outputFolder)
        trainer.train(trainInput_, trainTarget_)
        testOutput = nn.test(model, testInput_)
        testRate, correct, total = nn.calcRate(model, testOutput, testTarget_)
        with open(os.path.join(trainer.outputFolder, 'result.txt'), 'w+') as f:
            f.write('Test rate: %f' % testRate)
Example #26
    for i, flag in enumerate(sys.argv):
        if flag == '-m' or flag == '-model':
            modelId = sys.argv[i + 1]
        elif flag == '-d' or flag == '-data':
            dataFolder = sys.argv[i + 1]
        elif flag == '-td' or flag == '-tdata':
            testDataFolder = sys.argv[i + 1]
        elif flag == '-reindex':
            needReindex = True
        elif flag == '-r' or flag == '-results':
            resultsFolder = sys.argv[i + 1]
        elif flag == '-dataset':
            dataset = sys.argv[i + 1]

    model = it.loadModel(modelId, resultsFolder)
    data = it.loadDataset(dataFolder)
    testdata = it.loadDataset(testDataFolder)
    if needReindex:
        testQuestions, testAnswers = reindexDataset(testdata['testData'][0],
                                                    testdata['testData'][1],
                                                    testdata['questionIdict'],
                                                    data['questionDict'],
                                                    testdata['ansIdict'],
                                                    data['ansDict'])
    else:
        testQuestions = testdata['testData'][0]
        testAnswers = testdata['testData'][1]
    outputTest = nn.test(model, testQuestions)
    rate, correct, total = nn.calcRate(model, outputTest, testAnswers)
    print 'rate: %.4f' % rate
Example #27
import preprocessing
import nn
import pickle
from data_utils import pearsonsR

if __name__ == '__main__':
	trainPosts, testPosts, devPosts, devTestPosts = preprocessing.prepare()
	countNot1 = 0
	for i in trainPosts[0]:
		if i[-1] != -1:
			countNot1 +=1
	print('Count not -1 is ', countNot1)

	pr = pearsonsR(trainPosts[0])
	for item in pr:
		print(item)
	with open('pearsonsR.p', 'wb') as f:
		pickle.dump(pr, file=f)

	print('Beginning nn')
#	nn = nn.simple_feed_forward()
	nn.train(trainPosts[0])
	nn.test(devTestPosts[0])
Example #28
            modelId = sys.argv[i + 1]
        elif flag == '-d' or flag == '-data':
            dataFolder = sys.argv[i + 1]
        elif flag == '-td' or flag == '-tdata':
            testDataFolder = sys.argv[i + 1]
        elif flag == '-reindex':
            needReindex = True
        elif flag == '-r' or flag == '-results':
            resultsFolder = sys.argv[i + 1]
        elif flag == '-dataset':
            dataset = sys.argv[i + 1]
    
    model = it.loadModel(modelId, resultsFolder)
    data = it.loadDataset(dataFolder)
    testdata = it.loadDataset(testDataFolder)
    if needReindex:
        testQuestions, testAnswers = reindexDataset(
            testdata['testData'][0],
            testdata['testData'][1],
            testdata['questionIdict'],
            data['questionDict'],
            testdata['ansIdict'],
            data['ansDict'])
    else:
        testQuestions = testdata['testData'][0]
        testAnswers = testdata['testData'][1]
    outputTest = nn.test(model, testQuestions)
    rate, correct, total = nn.calcRate(model, outputTest, testAnswers)
    print 'rate: %.4f' % rate

Example #29
    cnn_net = CNN(model['conv1.weight'].shape[1])

    cnn_net.load_state_dict(model)

    count_data = glob.glob(args.data_path + '/count_data/*.tsv.gz')
    label_data = glob.glob(args.data_path + '/label_data/*.txt')

    is_test = lambda x: osp.basename(x).split('.')[0] in patients

    count_data = list(filter(is_test, count_data))
    label_data = list(filter(is_test, label_data))

    eval_pths = dict(count_data=count_data, label_data=label_data)

    eval_dataset = SpotDataset(
        eval_pths,
        size=len(count_data),
        genes=genelist,
    )

    resmat = test(cnn_net, eval_dataset, num_workers=args.num_workers)

    respth = osp.join(output_dir, '.'.join([TAG, "test.pred.res", "tsv"]))

    resmat.to_csv(
        respth,
        sep='\t',
        header=True,
        index=True,
    )
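
model, patients, and genelist are defined outside this fragment; one hedged way they could be prepared (the argument names and file formats below are assumptions, not from the source):

import torch

model = torch.load(args.model_path, map_location='cpu')  # saved CNN state dict
patients = set(args.patients)                            # patient IDs used by is_test above
with open(args.gene_list) as f:
    genelist = [line.strip() for line in f if line.strip()]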
Example #30
def experiment_model(
    selection_problem,
    selection_test_fold,
    selection_source,
    selection_test_source,
    selection_count,
    selection_random_seed,
    selection_tag,
    selection_reject_minimum,
    selection_overwrite,
    al_threshold,
    embedding_type,
    embedding_shape,
    embedding_overwrite,
    model_type,
    model_arch_num,
    model_layer_sizes,
    model_maxlen,
    model_batch_size,
    model_learning_rate,
    model_epochs,
    model_num,
    experiment_tag,
    verbose=True,
    params=None
):
    # embed_df, sel_df, name = experiment_dataset(
    #     selection_problem,
    #     selection_source,
    #     selection_count,
    #     selection_random_seed,
    #     selection_reject_minimum,
    #     selection_overwrite,
    #     embedding_type,
    #     embedding_shape,
    #     embedding_overwrite,
    #     verbose=verbose
    # )


    embed_df, sel_df, name, test_selection_df, test_embedding_df, al_selection_df, al_embedding_df = experiment_dataset(
        selection_problem,
        selection_test_fold,
        selection_source,
        selection_test_source,
        selection_count,
        selection_random_seed,
        selection_tag,
        selection_reject_minimum,
        selection_overwrite,
        al_threshold,
        embedding_type,
        embedding_shape,
        embedding_overwrite,
    )

    X = embed_df
    X_test = test_embedding_df
    X_al_test = al_embedding_df
    target_col = ""
    if selection_problem == "reliability":
        target_col = "reliable"
        y = sel_df.reliable
        y_test = test_selection_df.reliable
        y_al_test = al_selection_df.reliable
    elif selection_problem == "biased" or selection_problem == "extreme_biased" or selection_problem == "bias_direction": # NOTE: unsure if this is where bias_direction should go?
        target_col = "biased"
        y = sel_df.biased
        y_test = test_selection_df.biased
        y_al_test = al_selection_df.biased

    # pad as needed
    data_width=0
    if embedding_shape == "sequence":
        X = lstm.pad_data(X, maxlen=model_maxlen)
        X_test = lstm.pad_data(X_test, maxlen=model_maxlen)
        X_al_test = lstm.pad_data(X_al_test, maxlen=model_maxlen)

        # TODO: 300 actually needs to be width (num cols) of dataset
        data_width = X.shape[-1]
        if model_type == "cnn":
            X = np.reshape(X, (X.shape[0], model_maxlen*data_width, 1))
            X_test = np.reshape(X_test, (X_test.shape[0], model_maxlen*data_width, 1))
            X_al_test = np.reshape(X_al_test, (X_al_test.shape[0], model_maxlen*data_width, 1))
    else:
        X = np.array(X)
        y = np.array(y)
        X_test = np.array(X_test)
        y_test = np.array(y_test)
        X_al_test = np.array(X_al_test)
        y_al_test = np.array(y_al_test)
        print(X)
        data_width = X.shape[-1]

    
    if selection_problem == "bias_direction" and model_type != "svm":
        y = keras.utils.to_categorical(y, num_classes=3)
        y_test = keras.utils.to_categorical(y_test, num_classes=3)
        y_al_test = keras.utils.to_categorical(y_al_test, num_classes=3)


    if "AL_TRAINING" in experiment_tag:
        model = svm.LinearSVC()
        print(X_al_test.shape, y_al_test.shape)
        cv_results = cross_validate(model, X_al_test, y_al_test, cv=10)
        print("_"*80)
        print(cv_results["test_score"])
        results_scores = []
        total = 0
        for num in cv_results["test_score"]:
            results_scores.append(num)
            total += num
        total /= len(cv_results["test_score"])
        print(total)

        save_data = {"average": float(total), "scores": results_scores}
        output_path = f"../data/output/{experiment_tag}"
        util.create_dir(output_path)
        with open(output_path + "/" + experiment_tag + ".json", 'w') as outfile:
            json.dump(save_data, outfile)
        exit()

    
    name = f'{experiment_tag}_{name}_{model_type}_{model_arch_num}_{model_num}_{model_maxlen}_{model_batch_size}_{model_learning_rate}'
        
    if model_type == "lstm":
        model, history, loss, acc, predictions = lstm.train_test(X, y, model_arch_num, model_layer_sizes, model_maxlen, model_batch_size, model_learning_rate, model_epochs, X_test, y_test, name, data_width, selection_problem)

        loss_al, acc_al, predictions_al = lstm.test(X_al_test, y_al_test, model_batch_size, model)
    elif model_type == "cnn":
        model, history, loss, acc, predictions = cnn.train_test(X, y, model_arch_num, model_layer_sizes, model_maxlen, model_batch_size, model_learning_rate, model_epochs, X_test, y_test, name)
    elif model_type == "nn":
        model, history, loss, acc, predictions = nn.train_test(X, y, model_arch_num, model_layer_sizes, model_maxlen, model_batch_size, model_learning_rate, model_epochs, X_test, y_test, name, data_width, selection_problem)

        loss_al, acc_al, predictions_al = nn.test(X_al_test, y_al_test, model_batch_size, model)
    elif model_type == "svm":
        model = svm.LinearSVC(random_state=42)
        model.fit(X, y)
        history = None
        loss = 0
        acc = model.score(X_test, y_test)
        predictions = model.predict(X_test)
        loss_al = 0
        acc_al = model.score(X_al_test, y_al_test)
        predictions_al = model.predict(X_al_test)
    print("Training done")

    logging.info("%s", str(test_selection_df[target_col].value_counts()))
    print(test_selection_df[target_col].value_counts())

    # turn predictions into dataframe
    #pred = pd.DataFrame({"predicted": predictions})
    #pred.index = test_selection_df.index
    
    if selection_problem == "bias_direction" and model_type != "svm":
        test_selection_df["predicted"] = np.argmax(predictions, axis=1)
        test_selection_df["pred_class"] = np.argmax(predictions, axis=1)
        
        al_selection_df["predicted"] = np.argmax(predictions_al, axis=1)
        al_selection_df["pred_class"] = np.argmax(predictions_al, axis=1)
    else:
        test_selection_df["predicted"] = predictions
        test_selection_df["pred_class"] = round(test_selection_df.predicted).astype(int)
        
        al_selection_df["predicted"] = predictions_al
        al_selection_df["pred_class"] = round(al_selection_df.predicted).astype(int)

    #al_unique_selection_df = []
    
    # get list of sources for MBC that aren't in training set
    training_sources = list(set(sel_df.source))
    mbc_sources = list(set(al_selection_df.Source))
    unseen_mbc_sources = [x for x in mbc_sources if x not in training_sources and not (x in util.MBC_to_NELA and util.MBC_to_NELA[x] in training_sources)]
    al_unseen_selection_df = al_selection_df[al_selection_df.Source.isin(unseen_mbc_sources)]

    print("="*20, "TRAINING", "="*20)
    print(training_sources)
    print("="*20, "MBC", "="*20)
    print(mbc_sources)
    print("="*20, "UNSEEN", "="*20)
    print(unseen_mbc_sources)


    overall_counts = [] 
    overall_counts_al = [] 
    overall_counts_al_unseen = [] # only unique sources
    if selection_problem != "bias_direction":
        overall_counts = calculate_cm_counts(test_selection_df, target_col, binary=True)
        overall_counts_al = calculate_cm_counts(al_selection_df, target_col, binary=True)
        overall_counts_al_unseen = calculate_cm_counts(al_unseen_selection_df, target_col, binary=True)
    else:
        overall_counts = calculate_cm_counts(test_selection_df, target_col, binary=False)
        overall_counts_al = calculate_cm_counts(al_selection_df, target_col, binary=False)
        overall_counts_al_unseen = calculate_cm_counts(al_unseen_selection_df, target_col, binary=False)


        

    # make output directory (based on experiment tag)
    output_path = f"../data/output/{experiment_tag}"
    breakdown_output_path = output_path + "/persource"
    albreakdown_output_path = output_path + "/alpersource"
    util.create_dir(output_path)
    util.create_dir(breakdown_output_path)
    util.create_dir(albreakdown_output_path)

    logging.info("Overall confusion analysis")
    confusion_analysis(overall_counts, output_path, experiment_tag, name, history, loss, acc, params, False)
    logging.info("Overall analysis complete")

    groups = test_selection_df.groupby(test_selection_df.source)
    logging.info("There are %i groups", len(groups))


    for group_name, group in groups:
        logging.info("Next group %s", group_name)

        group_counts = []
        if selection_problem != "bias_direction":
            group_counts = calculate_cm_counts(group, target_col, binary=True)
        else:
            group_counts = calculate_cm_counts(group, target_col, binary=False)
            

        confusion_analysis(group_counts, breakdown_output_path, experiment_tag, name + "_persource", history, loss, acc, params, source=group_name)

    #with open("../data/output/" + name + "_predictions.pkl", 'wb') as outfile:
    with open(output_path + "/" + name + "_predictions.pkl", 'wb') as outfile:
        pickle.dump(test_selection_df, outfile)

    logging.info("*****-----------------------------------------*****")
    logging.info("Article-level analysis")
    confusion_analysis(overall_counts_al, output_path, experiment_tag, name + "_al", None, loss_al, acc_al, params, False)
    logging.info("--- (With only unseen sources)")
    confusion_analysis(overall_counts_al_unseen, output_path, experiment_tag, name + "_al_unseen", None, loss_al, acc_al, params, False)
    with open(output_path + "/" + name + "_al_unseensourcelist.json", 'w') as outfile:
        json.dump(unseen_mbc_sources, outfile)
    # TODO: move unseen source calc to bottom and redo groups?
    
    groups = al_selection_df.groupby(al_selection_df.Source)
    logging.info("There are %i al groups", len(groups))
    
    for group_name, group in groups:
        logging.info("Next group %s", group_name)

        group_counts = []
        if selection_problem != "bias_direction":
            group_counts = calculate_cm_counts(group, target_col, binary=True)
        else:
            group_counts = calculate_cm_counts(group, target_col, binary=False)
        confusion_analysis(group_counts, albreakdown_output_path, experiment_tag, name + "_peralsource", None, loss_al, acc_al, params, source=group_name)
    with open(output_path + "/" + name + "_predictionsal.pkl", 'wb') as outfile:
        pickle.dump(al_selection_df, outfile)
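
calculate_cm_counts and confusion_analysis are defined elsewhere; the following is only a sketch of the binary branch, assuming the dataframe carries the target column plus the pred_class column set above, and that a plain dict of counts is an acceptable return shape (the real structure may differ):

def calculate_cm_counts_sketch(df, target_col, binary=True):
    # Confusion-matrix style tallies of predictions against the target column.
    if binary:
        return {
            "tp": int(((df[target_col] == 1) & (df.pred_class == 1)).sum()),
            "tn": int(((df[target_col] == 0) & (df.pred_class == 0)).sum()),
            "fp": int(((df[target_col] == 0) & (df.pred_class == 1)).sum()),
            "fn": int(((df[target_col] == 1) & (df.pred_class == 0)).sum()),
        }
    # Multi-class case: pair counts keyed by (true class, predicted class).
    return df.groupby([target_col, "pred_class"]).size().to_dict()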