def main():
    mode = sys.argv[1]
    print "mode = ", mode
    train_data = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    train_label = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    wHid, wOut = train(train_data, train_label, mode)
    test_data = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    test(test_data, wHid, wOut)
def runVisPrior(trainData, testData, questionType, visModel, questionDict,
                questionIdict, numAns, delta):
    objDict, objIdict = buildObjDict(trainData, questionType,
                                     questionDict, questionIdict)
    count_wa, count_a = trainCount(trainData, questionType, questionDict,
                                   questionIdict, objDict, objIdict, numAns)
    print count_wa

    # Reindex test set
    testInput = testData[0]
    testTarget = testData[1]
    testTargetReshape = testTarget.reshape(testTarget.size)
    testObjId = reindexObjId(testInput, objDict, questionDict,
                             questionIdict, questionType)

    # Run vis model on test set
    testOutput = nn.test(visModel, testInput)
    print "Before Prior Test Accuracy:",
    rate, _, __ = calcRate(testOutput, testTarget)
    print rate

    # Run on test set
    visPriorOutput = runVisPriorOnce(testObjId, count_wa, count_a,
                                     testOutput, delta)
    print "delta=%f Test Accuracy:" % delta,
    rate, _, __ = calcRate(visPriorOutput, testTarget)
    print rate
    return visPriorOutput
def testAll(taskId, model, dataFolder, resultsFolder):
    testAnswerFile = os.path.join(resultsFolder, taskId, '%s.test.o.txt' % taskId)
    testTruthFile = os.path.join(resultsFolder, taskId, '%s.test.t.txt' % taskId)
    testDataFile = os.path.join(dataFolder, 'test.npy')
    vocabDictFile = os.path.join(dataFolder, 'vocab-dict.npy')
    vocabDict = np.load(vocabDictFile)
    testData = np.load(testDataFile)
    inputTest = testData[0]
    outputTest = nn.test(model, inputTest)
    targetTest = testData[1]
    questionArray = vocabDict[1]
    answerArray = vocabDict[3]
    print len(answerArray)
    print outputTest.shape
    outputTxt(outputTest, targetTest, answerArray, testAnswerFile, testTruthFile)
    resultsRank = calcPrecision(outputTest, targetTest)
    correct, total = calcRate(inputTest, outputTest, targetTest, questionArray)
    resultsCategory = correct / total.astype(float)
    resultsFile = os.path.join(resultsFolder, taskId, 'result.txt')
    resultsWups = runWups(testAnswerFile, testTruthFile)
    with open(resultsFile, 'w') as f:
        f.write('rate @ 1: %.4f\n' % resultsRank[0])
        f.write('rate @ 5: %.4f\n' % resultsRank[1])
        f.write('rate @ 10: %.4f\n' % resultsRank[2])
        f.write('object: %.4f\n' % resultsCategory[0])
        f.write('number: %.4f\n' % resultsCategory[1])
        f.write('color: %.4f\n' % resultsCategory[2])
        f.write('scene: %.4f\n' % resultsCategory[3])
        f.write('WUPS 1.0: %.4f\n' % resultsWups[0])
        f.write('WUPS 0.9: %.4f\n' % resultsWups[1])
        f.write('WUPS 0.0: %.4f\n' % resultsWups[2])
def testAll(modelId, model, dataFolder, resultsFolder):
    testAnswerFile = getAnswerFilename(modelId, resultsFolder)
    testTruthFile = getTruthFilename(modelId, resultsFolder)
    data = loadDataset(dataFolder)
    outputTest = nn.test(model, data['testData'][0])
    rate, correct, total = nn.calcRate(model, outputTest, data['testData'][1])
    print 'rate: %.4f' % rate
    resultsRank, \
        resultsCategory, \
        resultsWups = runAllMetrics(data['testData'][0],
                                    outputTest,
                                    data['testData'][1],
                                    data['ansIdict'],
                                    data['questionTypeArray'],
                                    testAnswerFile,
                                    testTruthFile)
    writeMetricsToFile(modelId,
                       rate,
                       resultsRank,
                       resultsCategory,
                       resultsWups,
                       resultsFolder)
    return outputTest
def validDelta(trainData, validData, preVisModel, questionDict, questionIdict,
               numAns, deltas, questionType):
    objDict, objIdict = buildObjDict(trainData, questionType,
                                     questionDict, questionIdict)
    count_wa, count_a = trainCount(trainData, questionType, questionDict,
                                   questionIdict, objDict, objIdict, numAns)
    print count_wa

    # Reindex valid set
    validInput = validData[0]
    validTarget = validData[1]
    validTargetReshape = validTarget.reshape(validTarget.size)
    validObjId = reindexObjId(validInput, objDict, questionDict,
                              questionIdict, questionType)

    # Run vis model on valid set
    validOutput = nn.test(preVisModel, validInput)
    print "Before Prior Valid Accuracy:",
    rate, _, __ = calcRate(validOutput, validTarget)
    print rate

    # Determine best delta
    bestRate = 0.0
    bestDelta = 0.0
    for delta in deltas:
        visPriorOutput = runVisPriorOnce(validObjId, count_wa, count_a,
                                         validOutput, delta)
        print "delta=%f Valid Accuracy:" % delta,
        rate, _, __ = calcRate(visPriorOutput, validTarget)
        print rate
        if rate > bestRate:
            bestRate = rate
            bestDelta = delta
    print "Best Delta:", bestDelta
    return bestDelta
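# ---------------------------------------------------------------------------
# Illustrative glue only (variable names and delta candidates are assumptions,
# not from the original repo): a minimal sketch of chaining validDelta and
# runVisPrior, picking delta on the validation set and then scoring the test
# set with it.
# ---------------------------------------------------------------------------
deltas = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1.0]
bestDelta = validDelta(trainData, validData, visModel, questionDict,
                       questionIdict, numAns, deltas, questionType)
visPriorOutput = runVisPrior(trainData, testData, questionType, visModel,
                             questionDict, questionIdict, numAns, bestDelta)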
def runAllModels(inputTest, questionTypeArray, modelSpecs, resultsFolder,
                 dataset, dataFolder):
    allOutputs = []
    for modelSpec in modelSpecs:
        if modelSpec['isClassEnsemble']:
            print 'Running test data on ensemble model %s...' \
                % modelSpec['name']
            models = loadEnsemble(modelSpec['id'].split(','), resultsFolder)
            classDataFolders = getClassDataFolders(dataset, dataFolder)
            if modelSpec['runPrior']:
                outputTest = runEnsemblePrior(inputTest,
                                              models,
                                              dataFolder,
                                              classDataFolders,
                                              questionTypeArray)
            else:
                outputTest = runEnsemble(inputTest,
                                         models,
                                         dataFolder,
                                         classDataFolders,
                                         questionTypeArray)
        elif modelSpec['isAverageEnsemble']:
            modelOutputs = []
            for modelId in modelSpec['id'].split(','):
                model = it.loadModel(modelId, resultsFolder)
                modelOutputs.append(nn.test(model, inputTest))
            outputTest = np.zeros(modelOutputs[0].shape)
            for output in modelOutputs:
                shape0 = min(outputTest.shape[0], output.shape[0])
                shape1 = min(outputTest.shape[1], output.shape[1])
                outputTest[:shape0, :shape1] += output[:shape0, :shape1] / \
                    float(len(modelOutputs))
        else:
            print 'Running test data on model %s...' \
                % modelSpec['name']
            model = it.loadModel(modelSpec['id'], resultsFolder)
            outputTest = nn.test(model, inputTest)
        allOutputs.append(outputTest)
    return allOutputs
def runAvgAll(models, data):
    modelOutputs = []
    # Loop header reconstructed; assumes models is an iterable of
    # (modelId, model) pairs.
    for modelId, model in models:
        print 'Running model %s' % modelId
        modelOutput = nn.test(model, data['testData'][0])
        modelOutputs.append(modelOutput)
    finalOutput = np.zeros(modelOutputs[0].shape)
    for output in modelOutputs:
        shape0 = min(finalOutput.shape[0], output.shape[0])
        shape1 = min(finalOutput.shape[1], output.shape[1])
        finalOutput[:shape0, :shape1] += output[:shape0, :shape1] / \
            float(len(modelOutputs))
    return finalOutput
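# ---------------------------------------------------------------------------
# Minimal standalone sketch (the helper name is an assumption, not from the
# original code) of the truncate-to-common-shape averaging used in runAvgAll
# and in the isAverageEnsemble branch of runAllModels above.
# ---------------------------------------------------------------------------
import numpy as np

def average_outputs(outputs):
    # Accumulate each model's output over the overlapping region and divide
    # by the number of models; rows/columns a model lacks stay at zero.
    avg = np.zeros(outputs[0].shape)
    for out in outputs:
        rows = min(avg.shape[0], out.shape[0])
        cols = min(avg.shape[1], out.shape[1])
        avg[:rows, :cols] += out[:rows, :cols] / float(len(outputs))
    return avg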
def runTests(params, model, trainer):
    if params["testDataFilename"] is not None:
        if params["imageqa"]:
            imageqa_test.testAll(trainer.name, model, params["dataFolder"],
                                 params["outputFolder"])
        else:
            testData = np.load(params["testDataFilename"])
            testInput = testData[0]
            testTarget = testData[1]
            model.loadWeights(np.load(trainer.modelFilename))
            testOutput = nn.test(model, testInput)
            testRate, c, t = nn.calcRate(model, testOutput, testTarget)
            print "Test rate: ", testRate
            with open(os.path.join(trainer.outputFolder, "result.txt"), "w+") as f:
                f.write("Test rate: %f\n" % testRate)
def run(c, filename='', genre=''):
    if c == 'normal':
        setup = Setup()
        if not setup.check():
            exit()
        setup.initData()
        setup.safeKeep(setup.X, 'trainvec.pkl')
        setup.safeKeep(setup.testX, 'testvec.pkl')
        setup.safeKeep(setup.testY, 'testlabel.pkl')
        setup.safeKeep(setup.Y, 'labels.pkl')
    elif c == 'nn':
        return test(calc_mfcc(filename))
    elif c == 'reco':
        return runreco(calc_mfcc(filename), genre)
def __runEnsemble(inputTest, models, ansDict, classAnsIdict, questionTypeArray):
    allOutput = []
    for i, model in enumerate(models):
        print 'Running test data on model #%d...' % i
        outputTest = nn.test(model, inputTest)
        allOutput.append(outputTest)
    ensembleOutputTest = np.zeros((inputTest.shape[0], len(ansDict)))
    for n in range(allOutput[0].shape[0]):
        qtype = questionTypeArray[n]
        output = allOutput[qtype]
        for i in range(output.shape[1]):
            ansId = ansDict[classAnsIdict[qtype][i]]
            ensembleOutputTest[n, ansId] = output[n, i]
    return ensembleOutputTest
def runEnsemblePrior(inputTest, models, dataFolder, classDataFolders,
                     questionTypeArray):
    """
    Similar to "testEnsemble" in imageqa_test.
    Run visprior on number and color questions.
    """
    data = it.loadDataset(dataFolder)
    numAns = len(data['ansIdict'])
    outputTest = np.zeros((inputTest.shape[0], numAns))
    count = 0
    allOutput = []
    ensembleOutputTest = np.zeros((inputTest.shape[0], numAns))
    classAnsIdict = []
    for i, model in enumerate(models):
        data_m = it.loadDataset(classDataFolders[i])
        classAnsIdict.append(data_m['ansIdict'])
        tvData_m = ip.combineTrainValid(data_m['trainData'], data_m['validData'])
        print 'Running test data on model #%d...' % i
        if i == 0:
            # Object questions
            print 'No prior'
            outputTest = nn.test(model, data_m['testData'][0])
            print 'Accuracy:',
            print ip.calcRate(outputTest, data_m['testData'][1])
        elif i == 1 or i == 2 or i == 3:
            # Number, color and location questions
            print 'Prior'
            # Delta is pre-determined
            if i == 1:
                delta = 1e-6
                questionType = "number"
            elif i == 2:
                delta = 5e-4
                questionType = "color"
            elif i == 3:
                delta = 1.0
                questionType = "location"
            outputTest = ip.runVisPrior(tvData_m,
                                        data_m['testData'],
                                        questionType,
                                        model,
                                        data_m['questionDict'],
                                        data_m['questionIdict'],
                                        len(data_m['ansIdict']),
                                        delta)
        allOutput.append(outputTest)
    counter = [0, 0, 0, 0]
    for n in range(inputTest.shape[0]):
        qtype = questionTypeArray[n]
        output = allOutput[qtype]
        for i in range(output.shape[1]):
            ansId = data['ansDict'][classAnsIdict[qtype][i]]
            ensembleOutputTest[n, ansId] = output[counter[qtype], i]
        counter[qtype] += 1
    return ensembleOutputTest
ansDict_m = data_m['ansDict']
ansIdict = data['ansIdict']
questionDict_m = data_m['questionDict']
questionIdict = data['questionIdict']
newTestInput = np.zeros(testInput.shape, dtype='int')
for n in range(testInput.shape[0]):
    newTestInput[n, 0, 0] = testInput[n, 0, 0]
    for t in range(1, testInput.shape[1]):
        if testInput[n, t, 0] != 0:
            word = questionIdict[testInput[n, t, 0] - 1]
            newTestInput[n, t, 0] = questionDict_m[word]
        else:
            break
mainModel = it.loadModel(mainModelId, resultsFolder)
mainTestOutput = nn.test(mainModel, newTestInput)

# Need to extract the class output from mainTestOutput
classNewId = []
for ans in ansIdict:
    classNewId.append(ansDict_m[ans])
classNewId = np.array(classNewId, dtype='int')
mainTestOutput = mainTestOutput[:, classNewId]

for i in range(len(ansIdict)):
    mixRatio = i / 10.0
    ensTestOutput = mixRatio * visTestOutput + \
        (1 - mixRatio) * mainTestOutput
    print '%.2f VIS+PRIOR & %.2f VIS+BLSTM Accuracy:' % \
        (mixRatio, 1 - mixRatio),
    rate, _, __ = calcRate(ensTestOutput, testTarget)
    print rate
validModels = []
for modelId in modelIds:
    print 'Loading model %s' % modelId
    models.append(it.loadModel(modelId, resultsFolder))
for modelId in validModelIds:
    print 'Loading model %s' % modelId
    validModels.append(it.loadModel(modelId, resultsFolder))
modelOutputs = []
validModelOutputs = []

# for modelId, model in zip(validModelIds, validModels):
#     print 'Running model %s' % modelId
#     modelOutput = nn.test(model, data['validData'][0])
#     validModelOutputs.append(modelOutput)
#
# mixRatios = np.arange(0, 11) * 0.1
# bestMixRatio = validAvg(validModelOutputs, mixRatios, data['validData'][1])
# print 'Best ratio found: %.4f' % bestMixRatio
bestMixRatio = 0.5

shape = None
for modelId, model in zip(modelIds, models):
    print 'Running model %s' % modelId
    modelOutput = nn.test(model, data['testData'][0])
    if shape is None:
        shape = modelOutput.shape
    else:
        modelOutput = modelOutput[:shape[0], :shape[1]]
    modelOutputs.append(modelOutput)
testAvgAll(modelOutputs, bestMixRatio, data, outputFolder)
import sys

import matplotlib.pyplot as plt
import nn

learnRate = 0.1
if len(sys.argv) > 1:
    # use given settings
    dropout = float(sys.argv[1])
    momentum = float(sys.argv[2])
    decay = float(sys.argv[3])

(model, optimizer, criterion) = nn.init(learnRate, momentum=momentum,
                                        dropout=dropout, wd=decay)
(accv, errv, losst) = nn.fullTrain(model, optimizer, criterion)
testAccuracy = nn.test(model)
print('Test Accuracy:\t' + str(testAccuracy.item() * 100.) + "%\n")

fig, ax1 = plt.subplots()
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Validation Error', color='tab:red')
ax1.plot(errv, color='tab:red')
ax1.tick_params(axis='y', labelcolor='tab:red')
ax2 = ax1.twinx()
ax2.set_ylabel('Training Loss', color='tab:blue')
ax2.plot(losst, color='tab:blue')
ax2.tick_params(axis='y', labelcolor='tab:blue')
def test(d, nn=classifier_nn, knn=classifier_knn):
    return nn.test(d), knn.test(d)
ansDict_m = data_m["ansDict"] ansIdict = data["ansIdict"] questionDict_m = data_m["questionDict"] questionIdict = data["questionIdict"] newTestInput = np.zeros(testInput.shape, dtype="int") for n in range(testInput.shape[0]): newTestInput[n, 0, 0] = testInput[n, 0, 0] for t in range(1, testInput.shape[1]): if testInput[n, t, 0] != 0: word = questionIdict[testInput[n, t, 0] - 1] newTestInput[n, t, 0] = questionDict_m[word] else: break mainModel = it.loadModel(mainModelId, resultsFolder) mainTestOutput = nn.test(mainModel, newTestInput) # Need to extract the class output from mainTestOutput classNewId = [] for ans in ansIdict: classNewId.append(ansDict_m[ans]) classNewId = np.array(classNewId, dtype="int") mainTestOutput = mainTestOutput[:, classNewId] for i in range(len(ansIdict)): mixRatio = i / 10.0 ensTestOutput = mixRatio * visTestOutput + (1 - mixRatio) * mainTestOutput print "%.2f VIS+PRIOR & %.2f VIS+BLSTM Accuracy:" % (mixRatio, 1 - mixRatio), rate, _, __ = calcRate(ensTestOutput, testTarget) print rate
print "Training is starting" for i in xrange(K): print " Case #" + str(i + 1) + ":" layer = train(data[i][3], data[i][4], data[i][5], feature_size, num_nodes, 0.005, 300) accur = accuracy(data[i][0], data[i][1], data[i][2], layer, num_nodes) layers.append(layer) vl_acc.append(accur) print " Validation accuracy: " + str(accur) tr_acc = [ accuracy(training_data, training_label, training_size, x, num_nodes) for x in layers ] print "Training is done" print "Testing is starting" max_ac = max(vl_acc) ind_ac = 0 for i in xrange(K): if (vl_acc[i] == max_ac): ind_ac = i break training_accuracy = accuracy(training_data, training_label, training_size, layers[ind_ac], num_nodes) test_output = test(test_data, layers[ind_ac], num_nodes) print " Training accuracy: " + str(training_accuracy) submission = np.array([[i + 1, test_output[i] - (1 - test_output[i])] for i in xrange(test_size)]) np.savetxt('../data/submission_6.csv', submission, delimiter=',') print "Testing is done"
    return result

K = 10
data = K_fold(training_data, training_size, training_label, K)
layers = []
vl_acc = []
num_nodes = 20
print "Training is starting"
for i in xrange(K):
    print " Case #" + str(i + 1) + ":"
    layer = train(data[i][3], data[i][4], data[i][5], feature_size,
                  num_nodes, 0.005, 300)
    accur = accuracy(data[i][0], data[i][1], data[i][2], layer, num_nodes)
    layers.append(layer)
    vl_acc.append(accur)
    print " Validation accuracy: " + str(accur)
tr_acc = [accuracy(training_data, training_label, training_size, x, num_nodes)
          for x in layers]
print "Training is done"
print "Testing is starting"
max_ac = max(vl_acc)
ind_ac = 0
for i in xrange(K):
    if vl_acc[i] == max_ac:
        ind_ac = i
        break
training_accuracy = accuracy(training_data, training_label, training_size,
                             layers[ind_ac], num_nodes)
test_output = test(test_data, layers[ind_ac], num_nodes)
print " Training accuracy: " + str(training_accuracy)
submission = np.array([[i + 1, test_output[i] - (1 - test_output[i])]
                       for i in xrange(test_size)])
np.savetxt('submission_6.csv', submission, delimiter=',')
print "Testing is done"
np.random.RandomState(2))
with open(configFilename) as f:
    trainOpt = yaml.load(f)

for i in range(0, 10):
    trainInput_, trainTarget_, testInput_, testTarget_ = \
        vt.splitData(trainInput, trainTarget, 0.1, i)
    trainOpt['heldOutRatio'] = 0.1
    trainOpt['xvalidNo'] = 0
    trainOpt['needValid'] = True
    model = nn.load(modelFilename)
    trainer = nn.Trainer(name=name + ('-%d-v' % i),
                         model=model,
                         trainOpt=trainOpt,
                         outputFolder=outputFolder)
    trainer.train(trainInput_, trainTarget_)

    # Train again with all data, without validation
    trainOpt['needValid'] = False
    trainOpt['numEpoch'] = trainer.stoppedEpoch + 1
    trainer = nn.Trainer(name=name + ('-%d' % i),
                         model=model,
                         trainOpt=trainOpt,
                         outputFolder=outputFolder)
    trainer.train(trainInput_, trainTarget_)

    testOutput = nn.test(model, testInput_)
    testRate, correct, total = nn.calcRate(model, testOutput, testTarget_)
    with open(os.path.join(trainer.outputFolder, 'result.txt'), 'w+') as f:
        f.write('Test rate: %f' % testRate)
for i, flag in enumerate(sys.argv):
    if flag == '-m' or flag == '-model':
        modelId = sys.argv[i + 1]
    elif flag == '-d' or flag == '-data':
        dataFolder = sys.argv[i + 1]
    elif flag == '-td' or flag == '-tdata':
        testDataFolder = sys.argv[i + 1]
    elif flag == '-reindex':
        needReindex = True
    elif flag == '-r' or flag == '-results':
        resultsFolder = sys.argv[i + 1]
    elif flag == '-dataset':
        dataset = sys.argv[i + 1]

model = it.loadModel(modelId, resultsFolder)
data = it.loadDataset(dataFolder)
testdata = it.loadDataset(testDataFolder)
if needReindex:
    testQuestions, testAnswers = reindexDataset(testdata['testData'][0],
                                                testdata['testData'][1],
                                                testdata['questionIdict'],
                                                data['questionDict'],
                                                testdata['ansIdict'],
                                                data['ansDict'])
else:
    testQuestions = testdata['testData'][0]
    testAnswers = testdata['testData'][1]
outputTest = nn.test(model, testQuestions)
rate, correct, total = nn.calcRate(model, outputTest, testAnswers)
print 'rate: %.4f' % rate
import preprocessing
import nn
import pickle
from data_utils import pearsonsR

if __name__ == '__main__':
    trainPosts, testPosts, devPosts, devTestPosts = preprocessing.prepare()
    countNot1 = 0
    for i in trainPosts[0]:
        if i[-1] != -1:
            countNot1 += 1
    print('Count not -1 is ', countNot1)
    pr = pearsonsR(trainPosts[0])
    for item in pr:
        print(item)
    with open('pearsonsR.p', 'wb') as f:
        pickle.dump(pr, file=f)
    print('Beginning nn')
    # nn = nn.simple_feed_forward()
    nn.train(trainPosts[0])
    nn.test(devTestPosts[0])
cnn_net = CNN(model['conv1.weight'].shape[1])
cnn_net.load_state_dict(model)

count_data = glob.glob(args.data_path + '/count_data/*.tsv.gz')
label_data = glob.glob(args.data_path + '/label_data/*.txt')
is_test = lambda x: osp.basename(x).split('.')[0] in patients
count_data = list(filter(is_test, count_data))
label_data = list(filter(is_test, label_data))
eval_pths = dict(count_data=count_data, label_data=label_data)
eval_dataset = SpotDataset(
    eval_pths,
    size=len(count_data),
    genes=genelist,
)
resmat = test(cnn_net, eval_dataset, num_workers=args.num_workers)
respth = osp.join(output_dir, '.'.join([TAG, "test.pred.res", "tsv"]))
resmat.to_csv(
    respth,
    sep='\t',
    header=True,
    index=True,
)
def experiment_model(
        selection_problem, selection_test_fold, selection_source,
        selection_test_source, selection_count, selection_random_seed,
        selection_tag, selection_reject_minimum, selection_overwrite,
        al_threshold, embedding_type, embedding_shape, embedding_overwrite,
        model_type, model_arch_num, model_layer_sizes, model_maxlen,
        model_batch_size, model_learning_rate, model_epochs, model_num,
        experiment_tag, verbose=True, params=None):
    # embed_df, sel_df, name = experiment_dataset(
    #     selection_problem,
    #     selection_source,
    #     selection_count,
    #     selection_random_seed,
    #     selection_reject_minimum,
    #     selection_overwrite,
    #     embedding_type,
    #     embedding_shape,
    #     embedding_overwrite,
    #     verbose=verbose
    # )
    embed_df, sel_df, name, test_selection_df, test_embedding_df, \
        al_selection_df, al_embedding_df = experiment_dataset(
            selection_problem,
            selection_test_fold,
            selection_source,
            selection_test_source,
            selection_count,
            selection_random_seed,
            selection_tag,
            selection_reject_minimum,
            selection_overwrite,
            al_threshold,
            embedding_type,
            embedding_shape,
            embedding_overwrite,
        )

    X = embed_df
    X_test = test_embedding_df
    X_al_test = al_embedding_df

    target_col = ""
    if selection_problem == "reliability":
        target_col = "reliable"
        y = sel_df.reliable
        y_test = test_selection_df.reliable
        y_al_test = al_selection_df.reliable
    elif selection_problem == "biased" or selection_problem == "extreme_biased" \
            or selection_problem == "bias_direction":
        # NOTE: unsure if this is where bias_direction should go?
        target_col = "biased"
        y = sel_df.biased
        y_test = test_selection_df.biased
        y_al_test = al_selection_df.biased

    # pad as needed
    data_width = 0
    if embedding_shape == "sequence":
        X = lstm.pad_data(X, maxlen=model_maxlen)
        X_test = lstm.pad_data(X_test, maxlen=model_maxlen)
        X_al_test = lstm.pad_data(X_al_test, maxlen=model_maxlen)

        # TODO: 300 actually needs to be width (num cols) of dataset
        data_width = X.shape[-1]
        if model_type == "cnn":
            X = np.reshape(X, (X.shape[0], model_maxlen * data_width, 1))
            X_test = np.reshape(X_test, (X_test.shape[0], model_maxlen * data_width, 1))
            X_al_test = np.reshape(X_al_test, (X_al_test.shape[0], model_maxlen * data_width, 1))
    else:
        X = np.array(X)
        y = np.array(y)
        X_test = np.array(X_test)
        y_test = np.array(y_test)
        X_al_test = np.array(X_al_test)
        y_al_test = np.array(y_al_test)
        print(X)
        data_width = X.shape[-1]

    if selection_problem == "bias_direction" and model_type != "svm":
        y = keras.utils.to_categorical(y, num_classes=3)
        y_test = keras.utils.to_categorical(y_test, num_classes=3)
        y_al_test = keras.utils.to_categorical(y_al_test, num_classes=3)

    if "AL_TRAINING" in experiment_tag:
        model = svm.LinearSVC()
        print(X_al_test.shape, y_al_test.shape)
        cv_results = cross_validate(model, X_al_test, y_al_test, cv=10)
        print("_" * 80)
        print(cv_results["test_score"])
        results_scores = []
        total = 0
        for num in cv_results["test_score"]:
            results_scores.append(num)
            total += num
        total /= len(cv_results["test_score"])
        print(total)
        save_data = {"average": float(total), "scores": results_scores}
        output_path = f"../data/output/{experiment_tag}"
        util.create_dir(output_path)
        with open(output_path + "/" + experiment_tag + ".json", 'w') as outfile:
            json.dump(save_data, outfile)
        exit()

    name = f'{experiment_tag}_{name}_{model_type}_{model_arch_num}_{model_num}_{model_maxlen}_{model_batch_size}_{model_learning_rate}'

    if model_type == "lstm":
        model, history, loss, acc, predictions = lstm.train_test(
            X, y, model_arch_num, model_layer_sizes, model_maxlen,
            model_batch_size, model_learning_rate, model_epochs,
            X_test, y_test, name, data_width, selection_problem)
        loss_al, acc_al, predictions_al = lstm.test(
            X_al_test, y_al_test, model_batch_size, model)
    elif model_type == "cnn":
        model, history, loss, acc, predictions = cnn.train_test(
            X, y, model_arch_num, model_layer_sizes, model_maxlen,
            model_batch_size, model_learning_rate, model_epochs,
            X_test, y_test, name)
    elif model_type == "nn":
        model, history, loss, acc, predictions = nn.train_test(
            X, y, model_arch_num, model_layer_sizes, model_maxlen,
            model_batch_size, model_learning_rate, model_epochs,
            X_test, y_test, name, data_width, selection_problem)
        loss_al, acc_al, predictions_al = nn.test(
            X_al_test, y_al_test, model_batch_size, model)
    elif model_type == "svm":
        model = svm.LinearSVC(random_state=42)
        model.fit(X, y)
        history = None
        loss = 0
        acc = model.score(X_test, y_test)
        predictions = model.predict(X_test)
        loss_al = 0
        acc_al = model.score(X_al_test, y_al_test)
        predictions_al = model.predict(X_al_test)

    print("Training done")

    logging.info("%s", str(test_selection_df[target_col].value_counts()))
    print(test_selection_df[target_col].value_counts())

    # turn predictions into dataframe
    # pred = pd.DataFrame({"predicted": predictions})
    # pred.index = test_selection_df.index
    if selection_problem == "bias_direction" and model_type != "svm":
        test_selection_df["predicted"] = np.argmax(predictions, axis=1)
        test_selection_df["pred_class"] = np.argmax(predictions, axis=1)
        al_selection_df["predicted"] = np.argmax(predictions_al, axis=1)
        al_selection_df["pred_class"] = np.argmax(predictions_al, axis=1)
    else:
        test_selection_df["predicted"] = predictions
        test_selection_df["pred_class"] = round(test_selection_df.predicted).astype(int)
        al_selection_df["predicted"] = predictions_al
        al_selection_df["pred_class"] = round(al_selection_df.predicted).astype(int)

    # al_unique_selection_df = []

    # get list of sources for MBC that aren't in training set
    training_sources = list(set(sel_df.source))
    mbc_sources = list(set(al_selection_df.Source))
    unseen_mbc_sources = [x for x in mbc_sources
                          if x not in training_sources
                          and not (x in util.MBC_to_NELA
                                   and util.MBC_to_NELA[x] in training_sources)]
    al_unseen_selection_df = al_selection_df[al_selection_df.Source.isin(unseen_mbc_sources)]
    print("=" * 20, "TRAINING", "=" * 20)
    print(training_sources)
    print("=" * 20, "MBC", "=" * 20)
    print(mbc_sources)
    print("=" * 20, "UNSEEN", "=" * 20)
    print(unseen_mbc_sources)

    overall_counts = []
    overall_counts_al = []
    overall_counts_al_unseen = []

    # only unique sources
    if selection_problem != "bias_direction":
        overall_counts = calculate_cm_counts(test_selection_df, target_col, binary=True)
        overall_counts_al = calculate_cm_counts(al_selection_df, target_col, binary=True)
        overall_counts_al_unseen = calculate_cm_counts(al_unseen_selection_df, target_col, binary=True)
    else:
        overall_counts = calculate_cm_counts(test_selection_df, target_col, binary=False)
        overall_counts_al = calculate_cm_counts(al_selection_df, target_col, binary=False)
        overall_counts_al_unseen = calculate_cm_counts(al_unseen_selection_df, target_col, binary=False)

    # make output directory (based on experiment tag)
    output_path = f"../data/output/{experiment_tag}"
    breakdown_output_path = output_path + "/persource"
    albreakdown_output_path = output_path + "/alpersource"
    util.create_dir(output_path)
    util.create_dir(breakdown_output_path)
    util.create_dir(albreakdown_output_path)

    logging.info("Overall confusion analysis")
    confusion_analysis(overall_counts, output_path, experiment_tag, name,
                       history, loss, acc, params, False)
    logging.info("Overall analysis complete")

    groups = test_selection_df.groupby(test_selection_df.source)
    logging.info("There are %i groups", len(groups))
    for group_name, group in groups:
        logging.info("Next group %s", name)
        group_counts = []
        if selection_problem != "bias_direction":
            group_counts = calculate_cm_counts(group, target_col, binary=True)
        else:
            group_counts = calculate_cm_counts(group, target_col, binary=False)
        confusion_analysis(group_counts, breakdown_output_path, experiment_tag,
                           name + "_persource", history, loss, acc, params,
                           source=group_name)

    # with open("../data/output/" + name + "_predictions.pkl", 'wb') as outfile:
    with open(output_path + "/" + name + "_predictions.pkl", 'wb') as outfile:
        pickle.dump(test_selection_df, outfile)

    logging.info("*****-----------------------------------------*****")
    logging.info("Article-level analysis")
    confusion_analysis(overall_counts_al, output_path, experiment_tag,
                       name + "_al", None, loss_al, acc_al, params, False)
    logging.info("--- (With only unseen sources)")
    confusion_analysis(overall_counts_al_unseen, output_path, experiment_tag,
                       name + "_al_unseen", None, loss_al, acc_al, params, False)
    with open(output_path + "/" + name + "_al_unseensourcelist.json", 'w') as outfile:
        json.dump(unseen_mbc_sources, outfile)

    # TODO: move unseen source calc to bottom and redo groups?
    groups = al_selection_df.groupby(al_selection_df.Source)
    logging.info("There are %i al groups", len(groups))
    for group_name, group in groups:
        logging.info("Next group %s", name)
        group_counts = []
        if selection_problem != "bias_direction":
            group_counts = calculate_cm_counts(group, target_col, binary=True)
        else:
            group_counts = calculate_cm_counts(group, target_col, binary=False)
        confusion_analysis(group_counts, albreakdown_output_path, experiment_tag,
                           name + "_peralsource", None, loss_al, acc_al, params,
                           source=group_name)

    with open(output_path + "/" + name + "_predictionsal.pkl", 'wb') as outfile:
        pickle.dump(al_selection_df, outfile)