Example #1
                             'val - aBot',
                             metric,
                             xlabel='Epochs')

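            # scale the mean NLL by the CE-loss coefficient before plotting it
            # as the validation CE loss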
            if 'logProbsMean' in rankMetrics:
                logProbsMean = params['CELossCoeff'] * rankMetrics[
                    'logProbsMean']
                viz.linePlot(iterId, logProbsMean, 'aBotLoss', 'val CE')

                if params['trainMode'] == 'sl-abot':
                    valLoss = logProbsMean
                    viz.linePlot(iterId, valLoss, 'loss', 'val loss')

        if qBot:
            print("qBot Validation:")
            rankMetrics, roundMetrics = rankQBot(qBot, dataset, 'val')

            for metric, value in rankMetrics.items():
                viz.linePlot(epochId,
                             value,
                             'val - qBot',
                             metric,
                             xlabel='Epochs')

            viz.linePlot(iterId, epochId, 'iter x epoch', 'epochs')

            if 'logProbsMean' in rankMetrics:
                logProbsMean = params['CELossCoeff'] * rankMetrics[
                    'logProbsMean']
                viz.linePlot(iterId, logProbsMean, 'qBotLoss', 'val CE')
Example #2
                                   split,
                                   scoringFunction=utils.maskedNll)

            for metric, value in rankMetrics.items():
                plotName = splitName + ' - ABot Rank'
                viz.linePlot(iterId,
                             value,
                             plotName,
                             metric,
                             xlabel='Iterations')
                logging.info("Metric \"{}\": {}".format(metric, value))

    # if params['evalModeList'] == 'QBotRank':
    if 'QBotRank' in params['evalModeList']:
        print("Performing QBotRank evaluation")
        rankMetrics, roundRanks = rankQBot(qBot, dataset, split, verbose=1)
        for metric, value in rankMetrics.items():
            plotName = splitName + ' - QBot Rank'
            viz.linePlot(iterId, value, plotName, metric, xlabel='Iterations')

        for r in range(numRounds + 1):
            for metric, value in roundRanks[r].items():
                plotName = '[Iter %d] %s - QABots Rank Roundwise' % \
                            (iterId, splitName)
                viz.linePlot(r, value, plotName, metric, xlabel='Round')

    # if params['evalModeList'] == 'QABotsRank':
    if 'QABotsRank' in params['evalModeList']:
        print("Performing QABotsRank evaluation")
        outputPredFile = "data/visdial/visdial/output_predictions_rollout.h5"
        rankMetrics, roundRanks = rankQABots(qBot,
Example #3

def run_dialog(params,
               dataset,
               split,
               aBot,
               qBot=None,
               beamSize=1):

    # an A-Bot is required in all modes; a Q-Bot is optional
    assert aBot is not None, \
        "Must provide either an A-Bot alone or both Q-Bot and A-Bot when generating dialog"
    rankMetrics, _ = rankQBot(qBot, dataset, 'val')

    old_split = dataset.split
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    train_questions = set()

    dataset.split = 'train'
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=False,
        num_workers=0,
        collate_fn=dataset.collate_fn)

    ind2word = dataset.ind2word
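    # helpers to turn padded index tensors back into strings: to_str_gt joins
    # all non-pad tokens, to_str_pred truncates at the predicted length l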
    to_str_gt = lambda w: str(" ".join([ind2word[x] for x in filter(
        lambda x: x > 0, w.data.cpu().numpy())]))
    to_str_pred = lambda w, l: str(" ".join([ind2word[x] for x in list(filter(
        lambda x: x > 0, w.data.cpu().numpy()))][:l.data.cpu()[0]]))

    for idx, batch in enumerate(dataloader):
        # collect every training-split question in a set, used later for the novel-question metric
        gtQuestions = Variable(batch['ques'], requires_grad=False)
        gtQuesLens = Variable(batch['ques_len'], requires_grad=False)
        if gtQuesLens.shape[0] < batchSize:
            break

        # iterate through the batch and add each question string to the set
        for j in range(batchSize):
            for rnd in range(numRounds):
                question_str = to_str_pred(gtQuestions[j,rnd,:], gtQuesLens[j,rnd])
                train_questions.add(question_str[8:])

    print("train questions len:", len(train_questions))

    dataset.split = split

    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=False,
        num_workers=0,
        collate_fn=dataset.collate_fn)

    text = {'data': []}
    if '%s_img_fnames' % split not in dataset.data.keys():
        print("[Error] Need coco directory and info as input " \
               "to -cocoDir and -cocoInfo arguments for locating "\
               "coco image files.")
        print("Exiting dialogDump without saving files.")
        return None

    getImgFileName = lambda x: dataset.data['%s_img_fnames' % split][x]
    getImgId = lambda x: int(getImgFileName(x)[:-4][-12:])

    similarity_scores_mean = Variable(torch.zeros(numRounds))
    norm_difference_scores_mean = Variable(torch.zeros(numRounds))
    norm_scores_mean = Variable(torch.zeros(numRounds))
    huber_scores_mean = Variable(torch.zeros(numRounds))

    if params["useGPU"]:

        similarity_scores_mean = similarity_scores_mean.cuda()
        norm_difference_scores_mean = norm_difference_scores_mean.cuda()
        norm_scores_mean = norm_scores_mean.cuda()
        huber_scores_mean = huber_scores_mean.cuda()

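    # running totals and per-example lists for the dialog diversity metrics
    # (unique questions, mutual-overlap BLEU, novelty, distinct-n, entropy)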
    tot_idx = 0
    output_dialog = True
    tot_examples = 0
    unique_questions = 0
    unique_questions_list = []
    mutual_overlap_list = []
    ent_1_list = []
    ent_2_list = []
    dist_1_list = []
    dist_2_list = []
    avg_precision_list = []

    bleu_metric = 0
    novel_questions = 0
    oscillating_questions_cnt = 0
    per_round_bleu = np.zeros(numRounds)
    ent_1 = 0
    ent_2 = 0

    for idx, batch in enumerate(dataloader):
        print("current batch:",idx)
        if idx > 3:
            output_dialog = False
        tot_idx = tot_idx + 1
        imgIds = [getImgId(x) for x in batch['index']]
        dialog = [{'dialog': [], 'image_id': imgId} for imgId in imgIds]

        if dataset.useGPU:
            batch = {key: v.cuda() if hasattr(v, 'cuda')\
                else v for key, v in batch.items()}

        image = Variable(batch['img_feat'], volatile=True)
        caption = Variable(batch['cap'], volatile=True)
        # ignoring the last batch
        if caption.size()[0] < batchSize:
            break
        captionLens = Variable(batch['cap_len'], volatile=True)
        if qBot is None:  # A-Bot alone needs ground truth dialog
            gtQuestions = Variable(batch['ques'], volatile=True)
            gtQuesLens = Variable(batch['ques_len'], volatile=True)
            gtAnswers = Variable(batch['ans'], volatile=True)
            gtAnsLens = Variable(batch['ans_len'], volatile=True)

        if aBot:
            aBot.eval(), aBot.reset()
            aBot.observe(
                -1, image=image, caption=caption, captionLens=captionLens)
        if qBot:
            qBot.eval(), qBot.reset()
            qBot.observe(-1, caption=caption, captionLens=captionLens)
        questions = []

        for j in range(batchSize):
            caption_str = to_str_gt(caption[j])[8:-6]
            dialog[j]['caption'] = caption_str
        past_dialog_hidden = None
        cur_dialog_hidden = None
        question_str_list = [[] for _ in range(batchSize)]
        gt_questions_str = [[] for _ in range(batchSize)]

        gtQuestions = Variable(batch['ques'], volatile=True)
        gtQuesLens = Variable(batch['ques_len'], volatile=True)
        gtAnswers = Variable(batch['ans'], volatile=True)
        gtAnsLens = Variable(batch['ans_len'], volatile=True)

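        # roll out one dialog round by round: with only an A-Bot, it observes the
        # ground-truth questions and answers and re-generates each answer; with
        # both bots, the Q-Bot generates questions and the A-Bot generates answers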
        for round in range(numRounds):

            if aBot is not None and qBot is None:
                aBot.observe(
                    round,
                    ques=gtQuestions[:, round],
                    quesLens=gtQuesLens[:, round])
                aBot.observe(
                    round,
                    ans=gtAnswers[:, round],
                    ansLens=gtAnsLens[:, round])
                _ = aBot.forward()
                answers, ansLens = aBot.forwardDecode(
                    inference='greedy', beamSize=beamSize)

            elif aBot is not None and qBot is not None:
                questions, quesLens = qBot.forwardDecode(
                    beamSize=beamSize, inference='greedy')
                qBot.observe(round, ques=questions, quesLens=quesLens)
                aBot.observe(round, ques=questions, quesLens=quesLens)
                answers, ansLens = aBot.forwardDecode(
                    beamSize=beamSize, inference='greedy')
                aBot.observe(round, ans=answers, ansLens=ansLens)
                qBot.observe(round, ans=answers, ansLens=ansLens)
                qBot.encoder()  # run the encoder so dialogHiddens reflects this round

            cur_dialog_hidden = qBot.encoder.dialogHiddens[-1][0]
            if round == 0:
                past_dialog_hidden = qBot.encoder.dialogHiddens[-1][0]
            cos = nn.CosineSimilarity(dim=1, eps=1e-6)
            similarity_scores = cos(cur_dialog_hidden, past_dialog_hidden)
            norm_difference_scores = torch.abs(torch.norm(cur_dialog_hidden, p=2, dim=1) -
                                               torch.norm(past_dialog_hidden, p=2, dim=1))
            # calculate norm
            norm_scores = torch.norm(cur_dialog_hidden, p=2, dim=1)
            # Huber loss between consecutive dialog states (threshold = 0.1):
            # quadratic inside the threshold, linear outside
            threshold = 0.1
            norm_differences = torch.abs(cur_dialog_hidden - past_dialog_hidden)
            l2_mask = (norm_differences <= threshold).float()
            l1_mask = (norm_differences > threshold).float()
            norm_differences_new = 0.5 * norm_differences * norm_differences * l2_mask \
                + l1_mask * threshold * (norm_differences - 0.5 * threshold)

            huber_scores = torch.sum(norm_differences_new, dim=1)

            past_dialog_hidden = cur_dialog_hidden
            similarity_scores_mean[round] = similarity_scores_mean[round] + torch.mean(similarity_scores)

            norm_difference_scores_mean[round] = norm_difference_scores_mean[round] + torch.mean(norm_difference_scores)
            norm_scores_mean[round] = norm_scores_mean[round] + torch.mean(norm_scores)
            huber_scores_mean[round] = huber_scores_mean[round] + torch.mean(huber_scores)

            for j in range(batchSize):
                question_str = to_str_pred(questions[j], quesLens[j]) \
                    if qBot is not None else to_str_gt(gtQuestions[j, round])

                gt_question_str = to_str_pred(gtQuestions[j,round,:], gtQuesLens[j,round])

                gt_questions_str[j].append(gt_question_str[8:])

                question_str_list[j].append(question_str[8:])
                answer_str = to_str_pred(answers[j], ansLens[j])
                if output_dialog:
                    if round == 0:
                        norm_score = float(norm_scores[j])
                        dialog[j]['dialog'].append({
                            "answer": answer_str[8:],
                            "question": question_str[8:] + ":" + "N:%.2f" % norm_score + " "
                        })  # "8:" for indexing out initial <START>
                    else:
                        similarity_score = float(similarity_scores[j])
                        norm_difference_score = float(norm_difference_scores[j])
                        norm_score = float(norm_scores[j])
                        huber_score = float(huber_scores[j])
                        dialog[j]['dialog'].append({
                            "answer": answer_str[8:],
                            "question": question_str[8:] + ":" + "C:%.2f" % similarity_score + ";" +
                                        "NP:%.2f" % norm_difference_score + "H:%.2f" % huber_score + ";" +
                                        "N:%.2f" % norm_score + " "
                        })  # "8:" for indexing out initial <START>
        per_round_bleu_batch = np.zeros((numRounds, batchSize))
        for j in range(batchSize):
            # for each generated question, compute BLEU against the other questions
            # in the same dialog (mutual overlap) and against the ground-truth
            # question for that round (per-round BLEU)
            unigrams = []
            bigrams = []
            avg_bleu_score = 0
            for rnd in range(numRounds):
                # Novel sentences metric
                cur_ques = question_str_list[j][rnd]
                gt_ques = gt_questions_str[j][rnd]
                if cur_ques not in train_questions:
                    novel_questions += 1

                # question oscillation metrics
                if rnd >= 2:
                    if cur_ques == question_str_list[j][rnd-2]:
                        oscillating_questions_cnt += 1

                # bleu/mutual overlap metric
                references = []
                for k in range(numRounds):
                    if rnd != k:
                        references.append(nltk.word_tokenize(question_str_list[j][k]))

                avg_bleu_score += sentence_bleu(references,nltk.word_tokenize(cur_ques))
                per_round_bleu_batch[rnd][j] = sentence_bleu([nltk.word_tokenize(gt_ques)],
                                                             nltk.word_tokenize(cur_ques))
                unigrams.extend(list(ngrams(nltk.word_tokenize(cur_ques),1)))
                bigrams.extend(list(ngrams(nltk.word_tokenize(cur_ques),2)))

            avg_bleu_score /= float(numRounds)
            mutual_overlap_list.append(avg_bleu_score)
            bleu_metric += avg_bleu_score
            tot_tokens = len(unigrams)

            unigram_ctr = Counter(unigrams)
            bigram_ctr = Counter(bigrams)
            cur_ent_1 = get_entropy_ctr(unigram_ctr)
            ent_1 += cur_ent_1
            ent_1_list.append(cur_ent_1)
            cur_ent_2 = get_entropy_ctr(bigram_ctr)
            ent_2 += cur_ent_2
            ent_2_list.append(cur_ent_2)

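            # distinct-1 / distinct-2: unique n-grams normalised by total token count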
            dist_1 = len(unigram_ctr.keys())/float(tot_tokens)
            dist_2 = len(bigram_ctr.keys())/float(tot_tokens)

            dist_1_list.append(dist_1)
            dist_2_list.append(dist_2)

            cur_unique_ques = len(set(question_str_list[j]))
            unique_questions += cur_unique_ques
            unique_questions_list.append(cur_unique_ques)
            # dialog[j]['caption'] += ':' + str(cur_unique_ques)

        tot_examples += batchSize

        if output_dialog:
            text['data'].extend(dialog)

        per_round_bleu += np.sum(per_round_bleu_batch,axis=1)
        avg_precision_list.extend(np.mean(per_round_bleu_batch,axis=0).tolist())

    similarity_scores_mean = similarity_scores_mean * (1.0 / tot_idx)
    norm_difference_scores_mean = norm_difference_scores_mean * (1.0 / tot_idx)
    norm_scores_mean = norm_scores_mean * (1.0 / tot_idx)
    huber_scores_mean = huber_scores_mean * (1.0 / tot_idx)

    print("Mean Cos Similarity Scores:", similarity_scores_mean)
    print("Mean Difference of Norms Scores:", norm_difference_scores_mean)
    print("Mean Norm of Dialog State:", norm_scores_mean)
    print("Mean Huber Loss(Norm of differences):", huber_scores_mean)

    text['opts'] = {
        'qbot': params['qstartFrom'],
        'abot': params['startFrom'],
        'backend': 'cudnn',
        'beamLen': 20,
        'beamSize': beamSize,
        'decoder': params['decoder'],
        'encoder': params['encoder'],
        'gpuid': 0,
        'imgNorm': params['imgNorm'],
        'inputImg': params['inputImg'],
        'inputJson': params['inputJson'],
        'inputQues': params['inputQues'],
        'loadPath': 'checkpoints/',
        'maxThreads': 1,
        'resultPath': 'dialog_output/results',
        'sampleWords': 0,
        'temperature': 1,
        'useHistory': True,
        'useIm': True,
    }
    unique_questions_arr = np.array(unique_questions_list)

    # convert the accumulated tensors to plain Python lists
    similarity_scores_mean = similarity_scores_mean.cpu().data.numpy().tolist()
    norm_difference_scores_mean = norm_difference_scores_mean.cpu().data.numpy().tolist()
    norm_scores_mean = norm_scores_mean.cpu().data.numpy().tolist()
    huber_scores_mean = huber_scores_mean.cpu().data.numpy().tolist()

    bleu_metric /= float(tot_examples)
    ent_1 /= float(tot_examples)
    ent_2 /= float(tot_examples)
    per_round_bleu = per_round_bleu / float(tot_examples)

    print("tot unique questions: ", unique_questions)
    print("tot examples: ", tot_examples)
    print("avg unique questions per example: ", float(unique_questions) / tot_examples)
    print("std unique questions per example: ", float(np.std(unique_questions_arr)))
    print("Mutual Overlap (Bleu Metric): ", bleu_metric)
    print("tot novel questions: ", novel_questions)
    tot_questions = tot_examples * numRounds
    print("tot questions: ", tot_questions)
    print("avg novel questions: ", float(novel_questions)/float(tot_questions))

    print("avg oscillating questions count", float(oscillating_questions_cnt)/tot_questions)
    print("osciallation questions count", oscillating_questions_cnt)

    dataset.split = old_split

    ret_metrics = {}
    ret_metrics["tot_unique_questions"] = unique_questions
    ret_metrics["tot_examples"] = tot_examples
    ret_metrics["mean_unique_questions"] = int((float(unique_questions) / tot_examples) * 100)/100.0
    ret_metrics["std_unique_questions"] =  int(float(np.std(unique_questions_arr)) * 100)/100.0

    ret_metrics["similarity_scores_mean"] = similarity_scores_mean
    ret_metrics["norm_difference_scores_mean"] = norm_difference_scores_mean
    ret_metrics["norm_scores_mean"] = norm_scores_mean
    ret_metrics["huber_scores_mean"] = huber_scores_mean

    ret_metrics["mutual_overlap_score"] = bleu_metric
    ret_metrics["tot_novel_questions"] = novel_questions
    ret_metrics["avg_novel_questions"] = float(novel_questions)/float(tot_questions)
    ret_metrics["tot_questions"] = tot_questions
    ret_metrics['NLL'] = rankMetrics['logProbsMean']

    ret_metrics["average_precision"] = np.mean(per_round_bleu)
    ret_metrics["per_round_precision"] = per_round_bleu.tolist()
    ret_metrics["ent_1"] = ent_1
    ret_metrics["ent_2"] = ent_2
    ret_metrics["dist_1"] = np.mean(dist_1_list)
    ret_metrics["dist_2"] = np.mean(dist_2_list)

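    # 95% confidence intervals: 1.96 * std / sqrt(n)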
    ret_metrics["average_precision_CI"] = (1.96 * np.std(avg_precision_list))/math.sqrt(len(avg_precision_list))
    ret_metrics["ent_1_CI"] = (1.96 * np.std(ent_1_list))/math.sqrt(len(ent_1_list))
    ret_metrics["ent_2_CI"] = (1.96 * np.std(ent_2_list))/math.sqrt(len(ent_2_list))
    ret_metrics["unique_questions_CI"] = (1.96 * np.std(unique_questions_list))/math.sqrt(len(unique_questions_list))
    ret_metrics["mutual_overlap_CI"] = (1.96 * np.std(mutual_overlap_list))/math.sqrt(len(mutual_overlap_list))
    ret_metrics["dist_1_CI"] = (1.96 * np.std(dist_1_list))/math.sqrt(len(dist_1_list))
    ret_metrics["dist_2_CI"] = (1.96 * np.std(dist_2_list))/math.sqrt(len(dist_2_list))

    return text, ret_metrics
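The per-round dialog-state diagnostics computed inside run_dialog (cosine similarity, difference of norms, and an element-wise Huber penalty with threshold 0.1) can be checked in isolation. A minimal sketch, assuming a recent PyTorch (torch.nn.HuberLoss, available from 1.9) and random tensors standing in for qBot.encoder.dialogHiddens[-1][0] at consecutive rounds:

import torch
import torch.nn as nn

# stand-ins for the dialog-state tensors at rounds t-1 and t (batch x hidden)
past_state = torch.randn(4, 512)
cur_state = torch.randn(4, 512)

# cosine similarity between consecutive dialog states
cos = nn.CosineSimilarity(dim=1, eps=1e-6)
similarity = cos(cur_state, past_state)

# absolute difference of the L2 norms, and the raw norm of the current state
norm_difference = torch.abs(cur_state.norm(p=2, dim=1) - past_state.norm(p=2, dim=1))
norm_score = cur_state.norm(p=2, dim=1)

# element-wise Huber penalty with threshold 0.1, summed over the hidden
# dimension, mirroring the masked computation in run_dialog
threshold = 0.1
d = torch.abs(cur_state - past_state)
huber = torch.where(d <= threshold,
                    0.5 * d * d,
                    threshold * (d - 0.5 * threshold)).sum(dim=1)

# the built-in HuberLoss yields the same per-example scores (delta = threshold)
huber_builtin = nn.HuberLoss(reduction='none', delta=threshold)(cur_state, past_state).sum(dim=1)
assert torch.allclose(huber, huber_builtin)

Expressing the quadratic/linear branches through torch.where (or HuberLoss) keeps the computation vectorised and avoids the explicit mask tensors, while producing the same per-round scores.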
Example #4
def main(params):
    aqmSetting = None
    if ("AQMBotRank" in params["evalModeList"]
            or "AQMdialog" in params["evalModeList"]
            or "AQMdemo" in params["evalModeList"]):
        aqmSetting = getAQMSetting(params)

    # setup dataloader
    dlparams = params.copy()
    dlparams['useIm'] = True
    dlparams['useHistory'] = True
    dlparams['numRounds'] = 10
    splits = ['val', 'test']

    dataset = VisDialDataset(dlparams, splits)

    # Transferring dataset parameters
    transfer = ['vocabSize', 'numOptions', 'numRounds']
    for key in transfer:
        if hasattr(dataset, key):
            params[key] = getattr(dataset, key)

    if 'numRounds' not in params:
        params['numRounds'] = 10

    # Always load checkpoint parameters with continue flag
    params['continue'] = True

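    # command-line options that must survive checkpoint loading; they are copied
    # back from dlparams after each utils.loadModel call below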
    excludeParams = ['batchSize', 'visdomEnv', 'startFrom', 'qstartFrom', 'trainMode',
                     'evalModeList', 'inputImg', 'inputQues', 'inputJson', 'evalTitle',
                     'beamSize', 'enableVisdom', 'visdomServer', 'visdomServerPort',
                     'randomCaption', 'zeroCaption', 'numImg', 'numQ', 'numA', 'alpha',
                     'qbeamSize', 'gamma', 'delta', 'lambda', 'onlyGuesser', 'randQ',
                     'gen1Q', 'gtQ', 'randA', 'noHistory', 'slGuesser', 'resampleEveryDialog']

    aBot = None
    qBot = None
    aqmBot = None

    # load aBot
    print('load aBot')
    if params['startFrom']:
        aBot, loadedParams, _ = utils.loadModel(params, 'abot', overwrite=True)
        assert aBot.encoder.vocabSize == dataset.vocabSize, "Vocab size mismatch!"
        for key in loadedParams:
            params[key] = loadedParams[key]
        aBot.eval()

    # Retaining certain dataloader parameters
    for key in excludeParams:
        params[key] = dlparams[key]

    print('load qBot')
    # load qBot
    if params['qstartFrom'] and not params['aqmstartFrom']:
        qBot, loadedParams, _ = utils.loadModel(params, 'qbot', overwrite=True)
        assert qBot.encoder.vocabSize == params[
            'vocabSize'], "Vocab size mismatch!"
        for key in loadedParams:
            params[key] = loadedParams[key]
        qBot.eval()

    # Retaining certain dataloader parameters
    for key in excludeParams:
        params[key] = dlparams[key]

    print('load AQM-Bot')
    # load aqmBot
    if params['aqmstartFrom']:  # abot of AQM
        assert params['qstartFrom']  # qbot of AQM

        aqmBot, loadedParams, _ = utils.loadModel(params,
                                                  'AQM-qbot',
                                                  overwrite=True)
        assert aqmBot.questioner.encoder.vocabSize == params[
            'vocabSize'], "Vocab size mismatch!"
        for key in loadedParams:
            params[key] = loadedParams[key]
        aqmBot.eval()

        # load qBot
        for key in excludeParams:
            params[key] = dlparams[key]
        aqmQ, loadedParams, _ = utils.loadModel(params, 'qbot', overwrite=True)
        assert aqmQ.encoder.vocabSize == params[
            'vocabSize'], "Vocab size mismatch!"
        for key in loadedParams:
            params[key] = loadedParams[key]
        aqmQ.eval()
        for key in excludeParams:
            params[key] = dlparams[key]
        aqmBot.setQuestioner(aqmQ)

    elif params['aqmQStartFrom']:
        from visdial.models.aqm_questioner import AQMQuestioner
        aqmBot = AQMQuestioner()
        aqmBot.eval()

        params['qstartFrom'] = params['aqmQStartFrom']
        aqmQ, loadedParams, _ = utils.loadModel(params, 'qbot', overwrite=True)
        assert aqmQ.encoder.vocabSize == params[
            'vocabSize'], "Vocab size mismatch!"
        for key in loadedParams:
            params[key] = loadedParams[key]
        aqmQ.eval()
        for key in excludeParams:
            params[key] = dlparams[key]
        aqmBot.setQuestioner(aqmQ)

        params['startFrom'] = params['aqmAStartFrom']
        aqmA, loadedParams, _ = utils.loadModel(params, 'abot', overwrite=True)
        assert aqmA.encoder.vocabSize == dataset.vocabSize, "Vocab size mismatch!"
        for key in loadedParams:
            params[key] = loadedParams[key]
        aqmA.eval()
        aqmBot.setAppAnswerer(aqmA)

    for key in excludeParams:
        params[key] = dlparams[key]

    pprint.pprint(params)
    #viz.addText(pprint.pformat(params, indent=4))
    print("Running evaluation!")

    numRounds = params['numRounds']
    if 'ckpt_iterid' in params:
        iterId = params['ckpt_iterid'] + 1
    else:
        iterId = -1

    if 'test' in splits:
        split = 'test'
        splitName = 'test - {}'.format(params['evalTitle'])
    else:
        split = 'val'
        splitName = 'full Val - {}'.format(params['evalTitle'])

    print("Using split %s" % split)
    dataset.split = split

    if 'ABotRank' in params['evalModeList']:
        if params['aqmstartFrom']:
            aBot = aqmBot.appAnswerer
            print('evaluating appBot of AQM')
        print("Performing ABotRank evaluation")
        rankMetrics = rankABot(aBot,
                               dataset,
                               split,
                               scoringFunction=utils.maskedNll,
                               expLowerLimit=params['expLowerLimit'],
                               expUpperLimit=params['expUpperLimit'])
        print(rankMetrics)
        for metric, value in rankMetrics.items():
            plotName = splitName + ' - ABot Rank'
            #viz.linePlot(iterId, value, plotName, metric, xlabel='Iterations')

    if 'QBotRank' in params['evalModeList']:
        print("Performing QBotRank evaluation")
        rankMetrics, roundRanks = rankQBot(
            qBot,
            dataset,
            split,
            expLowerLimit=params['expLowerLimit'],
            expUpperLimit=params['expUpperLimit'],
            verbose=1)
        for metric, value in rankMetrics.items():
            plotName = splitName + ' - QBot Rank'
            #viz.linePlot(iterId, value, plotName, metric, xlabel='Iterations')

        for r in range(numRounds + 1):
            for metric, value in roundRanks[r].items():
                plotName = '[Iter %d] %s - QABots Rank Roundwise' % \
                            (iterId, splitName)
                #viz.linePlot(r, value, plotName, metric, xlabel='Round')

    if 'QABotsRank' in params['evalModeList']:
        print("Performing QABotsRank evaluation")
        outputPredFile = "data/visdial/visdial/output_predictions_rollout.h5"
        rankMetrics, roundRanks = rankQABots(
            qBot,
            aBot,
            dataset,
            split,
            beamSize=params['beamSize'],
            expLowerLimit=params['expLowerLimit'],
            expUpperLimit=params['expUpperLimit'],
            zeroCaption=params['zeroCaption'],
            randomCaption=params['randomCaption'],
            numRounds=params['runRounds'])
        for metric, value in rankMetrics.items():
            plotName = splitName + ' - QABots Rank'
            #viz.linePlot(iterId, value, plotName, metric, xlabel='Iterations')

        for r in range(numRounds + 1):
            for metric, value in roundRanks[r].items():
                plotName = '[Iter %d] %s - QBot All Metrics vs Round'%\
                            (iterId, splitName)
                #viz.linePlot(r, value, plotName, metric, xlabel='Round')

    if 'AQMBotRank' in params['evalModeList']:
        print("Performing AQMBotRank evaluation")
        outputPredFile = "data/visdial/visdial/output_predictions_rollout.h5"
        rankMetrics, roundRanks = AQMRunner(
            aqmBot,
            aBot,
            dataset,
            split,
            beamSize=params['beamSize'],
            realQA=params['aqmRealQA'],
            saveLogs=params['saveLogs'],
            showQA=params['showQA'],
            expLowerLimit=params['expLowerLimit'],
            expUpperLimit=params['expUpperLimit'],
            selectedBatchIdxs=params['selectedBatchIdxs'],
            numRounds=params['runRounds'],
            lda=params['lambda'],
            onlyGuesser=params['onlyGuesser'],
            numQ=params['numQ'],
            qbeamSize=params['qbeamSize'],
            numImg=params['numImg'],
            alpha=params['alpha'],
            numA=params['numA'],
            randQ=params['randQ'],
            randA=params['randA'],
            zeroCaption=params['zeroCaption'],
            randomCaption=params['randomCaption'],
            gamma=params['gamma'],
            delta=params['delta'],
            gen1Q=params['gen1Q'],
            gtQ=params['gtQ'],
            noHistory=params['noHistory'],
            slGuesser=params['slGuesser'],
            resampleEveryDialog=params['resampleEveryDialog'],
            aqmSetting=aqmSetting,
        ).rankQuestioner()
        for metric, value in rankMetrics.items():
            plotName = splitName + ' - QABots Rank'
            #viz.linePlot(iterId, value, plotName, metric, xlabel='Iterations')

        for r in range(numRounds + 1):
            for metric, value in roundRanks[r].items():
                plotName = '[Iter %d] %s - QBot All Metrics vs Round'%\
                            (iterId, splitName)
                #viz.linePlot(r, value, plotName, metric, xlabel='Round')

    if 'dialog' in params['evalModeList']:
        print("Performing dialog generation...")
        split = 'test'
        outputFolder = "dialog_output/results"
        os.makedirs(outputFolder, exist_ok=True)
        outputPath = os.path.join(outputFolder, "results.json")
        dialogDump(params,
                   dataset,
                   split,
                   aBot=aBot,
                   qBot=qBot,
                   expLowerLimit=params['expLowerLimit'],
                   expUpperLimit=params['expUpperLimit'],
                   beamSize=params['beamSize'],
                   savePath=outputPath)

    if 'AQMdialog' in params['evalModeList']:
        print("Performing AQM dialog generation...")

        split = 'test'
        AQMRunner(
            aqmBot,
            aBot,
            dataset,
            split,
            beamSize=params['beamSize'],
            realQA=params['aqmRealQA'],
            saveLogs=params['saveLogs'],
            showQA=params['showQA'],
            expLowerLimit=params['expLowerLimit'],
            expUpperLimit=params['expUpperLimit'],
            selectedBatchIdxs=params['selectedBatchIdxs'],
            numRounds=params['runRounds'],
            lda=params['lambda'],
            onlyGuesser=params['onlyGuesser'],
            numQ=params['numQ'],
            qbeamSize=params['qbeamSize'],
            numImg=params['numImg'],
            alpha=params['alpha'],
            numA=params['numA'],
            randQ=params['randQ'],
            randA=params['randA'],
            zeroCaption=params['zeroCaption'],
            randomCaption=params['randomCaption'],
            gamma=params['gamma'],
            delta=params['delta'],
            gen1Q=params['gen1Q'],
            gtQ=params['gtQ'],
            noHistory=params['noHistory'],
            slGuesser=params['slGuesser'],
            resampleEveryDialog=params['resampleEveryDialog'],
            aqmSetting=aqmSetting,
        ).dialogDump(params)
Example #5

if 'ABotRank' in params['evalModeList']:
    print("Performing ABotRank evaluation")
    rankMetrics = rankABot(aBot,
                           dataset,
                           split,
                           scoringFunction=utils.maskedNll)

    print(rankMetrics)
    for metric, value in rankMetrics.items():
        plotName = splitName + ' - ABot Rank'

if 'QBotRank' in params['evalModeList']:
    print("Performing QBotRank evaluation")
    rankMetrics, roundRanks = rankQBot(qBot,
                                       dataset,
                                       split,
                                       verbose=1,
                                       exampleLimit=1400 * params['batchSize'])
    for metric, value in rankMetrics.items():
        plotName = splitName + ' - QBot Rank'

    for r in range(numRounds + 1):
        for metric, value in roundRanks[r].items():
            plotName = '[Iter %d] %s - QABots Rank Roundwise' % \
                       (iterId, splitName)

if 'QABotsRank' in params['evalModeList']:
    print("Performing QABotsRank evaluation")
    outputPredFile = "/hhd/lvxinyu/visdial-pytorch/data/visdial/visdial/output_predictions_rollout.h5"
    rankMetrics, roundRanks = rankQABots(qBot,
                                         aBot,