def rankQuestioner(self):
        if self.selectedBatchIdxs is not None:
            raise RuntimeError(
                "Cannot use selectedBatchIdxs when evaluating PMR")

        for batch_idx, batch in enumerate(self.dataloader):
            if self.expLowerLimit is not None:
                if batch_idx < self.expLowerLimit:
                    continue
                if batch_idx >= self.expUpperLimit:
                    break
            else:
                if batch_idx >= self.numBatches:
                    break

            self.runDialog(batch_idx, batch, printSummary=True)

        rankMetricsRounds = []
        print("Percentile mean rank (round, mean, low, high)")

        if self.saveLogs:
            # NOTE: the original template "PMR__%s_%s.csv" was missing its
            # format arguments; filling in the dataset split is an assumption.
            csv_file = open(os.path.join(self.logsDir,
                                         "PMR_%s.csv" % self.dataset.split),
                            mode="w")
            writer = csv.DictWriter(
                csv_file,
                ["round", "meanPercRank", "percRankLow", "percRankHigh"])
            writer.writeheader()

        if not self.slGuesser:
            for round in range(self.numRounds + 1):
                # Ranks of the ground-truth image for each example at this
                # round, precomputed in self.ranks: (num_examples,)
                rank = self.ranks[round]
                rankMetrics = metrics.computeMetrics(
                    Variable(torch.from_numpy(rank)))
                poolSize = len(self.dataset)
                meanRank = rank.mean()
                se = rank.std() / np.sqrt(poolSize)
                meanPercRank = 100 * (1 - (meanRank / poolSize))
                percRankLow = 100 * (1 - ((meanRank + se) / poolSize))
                percRankHigh = 100 * (1 - ((meanRank - se) / poolSize))
                print('%d\t%f\t%f\t%f' %
                      (round, meanPercRank, percRankLow, percRankHigh))
                rankMetrics['percentile'] = meanPercRank
                rankMetricsRounds.append(rankMetrics)

                if self.saveLogs:
                    writer.writerow({
                        "round": round,
                        "meanPercRank": meanPercRank,
                        "percRankLow": percRankLow,
                        "percRankHigh": percRankHigh
                    })
        else:
            gtFeatures = self.gtImgFeatures.data.cpu().numpy()
            for round in range(self.numRounds + 1):
                predFeatures = torch.cat(self.roundwiseFeaturePreds[round],
                                         0).data.cpu().numpy()
                dists = pairwise_distances(predFeatures, gtFeatures)
                # num_examples x num_examples
                ranks = []
                for i in range(dists.shape[0]):
                    # Rank of the i-th prediction vs all images in the split;
                    # offset by expLowerLimit when evaluating only a slice
                    if self.expLowerLimit is None:
                        rank = int(np.where(dists[i, :].argsort() == i)[0]) + 1
                    else:
                        rank = int(
                            np.where(dists[i, :].argsort() == i +
                                     self.expLowerLimit)[0]) + 1
                    ranks.append(rank)
                ranks = np.array(ranks)
                rankMetrics = metrics.computeMetrics(
                    Variable(torch.from_numpy(ranks)))
                poolSize = len(self.dataset)
                meanRank = ranks.mean()
                se = ranks.std() / np.sqrt(poolSize)
                meanPercRank = 100 * (1 - (meanRank / poolSize))
                percRankLow = 100 * (1 - ((meanRank + se) / poolSize))
                percRankHigh = 100 * (1 - ((meanRank - se) / poolSize))
                print('%d\t%f\t%f\t%f' %
                      (round, meanPercRank, percRankLow, percRankHigh))
                rankMetrics['percentile'] = meanPercRank
                rankMetricsRounds.append(rankMetrics)

                if self.saveLogs:
                    writer.writerow({
                        "round": round,
                        "meanPercRank": meanPercRank,
                        "percRankLow": percRankLow,
                        "percRankHigh": percRankHigh
                    })

        if self.saveLogs:
            csv_file.close()

        self.dataset.split = self.original_split
        return rankMetricsRounds[-1], rankMetricsRounds
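The percentile mean rank printed above maps a mean retrieval rank onto a 0-100 scale (higher is better) with a one-standard-error band. A tiny self-contained sketch of the same arithmetic, using made-up ranks and pool size:

import numpy as np

ranks = np.array([1, 4, 2, 10])  # hypothetical 1-indexed retrieval ranks
poolSize = 10                    # hypothetical number of candidate images
meanRank = ranks.mean()                            # 4.25
se = ranks.std() / np.sqrt(poolSize)               # standard error of the mean
meanPercRank = 100 * (1 - meanRank / poolSize)     # 57.5
percRankLow = 100 * (1 - (meanRank + se) / poolSize)
percRankHigh = 100 * (1 - (meanRank - se) / poolSize)
print(meanPercRank, percRankLow, percRankHigh)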
Example #2
def rankABot(aBot, dataset, split, scoringFunction, exampleLimit=None):
    '''
        Evaluate A-Bot performance on ranking answer options when it is
        shown ground truth image features, captions and questions.
        Arguments:
            aBot    : A-Bot
            dataset : VisDialDataset instance
            split   : Dataset split, can be 'val' or 'test'
            scoringFunction : A function which computes negative log
                              likelihood of a sequence (answer) given log
                              probabilities under an RNN model. Currently
                              utils.maskedNll is the only such function used.
            exampleLimit    : Maximum number of data points to use from
                              the dataset split. If None, all data points.
    '''
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit

    numBatches = (numExamples - 1) // batchSize + 1

    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(dataset,
                            batch_size=batchSize,
                            shuffle=True,
                            num_workers=1,
                            collate_fn=dataset.collate_fn)

    totalLoss, totalTokens = 0, 0
    ranks = []
    logProbsAll = [[] for _ in range(numRounds)]
    start_t = timer()
    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break

        if dataset.useGPU:
            batch = {
                key: v.cuda() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }
        else:
            batch = {
                key: v.contiguous() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }

        image = Variable(batch['img_feat_vgg'], volatile=True)
        image_36 = Variable(batch['img_feat_36'], volatile=True)
        caption = Variable(batch['cap'], volatile=True)
        captionLens = Variable(batch['cap_len'], volatile=True)
        questions = Variable(batch['ques'], volatile=True)
        quesLens = Variable(batch['ques_len'], volatile=True)
        answers = Variable(batch['ans'], volatile=True)
        ansLens = Variable(batch['ans_len'], volatile=True)
        options = Variable(batch['opt'], volatile=True)
        optionLens = Variable(batch['opt_len'], volatile=True)
        correctOptionInds = Variable(batch['ans_id'], volatile=True)
        aBot.reset()
        aBot.observe(-1,
                     image=[image, image_36],
                     caption=caption,
                     captionLens=captionLens)
        for round in range(numRounds):
            aBot.observe(round,
                         ques=questions[:, round],
                         quesLens=quesLens[:, round],
                         ans=answers[:, round],
                         ansLens=ansLens[:, round])
            logProbs = aBot.evalOptions(
                options[:, round], optionLens[:, round],
                scoringFunction)  # option scores: (batchSize, numOptions)
            logProbsCurrent = aBot.forward(
            )  # log probs of current GT answer: (batchSize, ansLen, vocabSize)
            logProbsAll[round].append(
                scoringFunction(logProbsCurrent,
                                answers[:, round].contiguous()))  # scalar NLL
            batchRanks = rankOptions(options[:, round],
                                     correctOptionInds[:, round],
                                     logProbs)  # (batchSize,)
            ranks.append(batchRanks)

        end_t = timer()
        delta_t = " Rate: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Abot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()
    sys.stdout.write("\n")
    dataloader = None
    print("Sleeping for 3 seconds to let dataloader subprocesses exit...")
    time.sleep(3)  # assumes `import time` at module level
    ranks = torch.cat(ranks, 0)  # flatten: one rank per (batch, round, example)
    rankMetrics = metrics.computeMetrics(ranks.cpu())
    logProbsAll = [torch.cat(lprobs, 0).mean()
                   for lprobs in logProbsAll]  # mean answer NLL per round
    roundwiseLogProbs = torch.cat(logProbsAll, 0).data.cpu().numpy()
    logProbsMean = roundwiseLogProbs.mean()
    rankMetrics['logProbsMean'] = logProbsMean

    dataset.split = original_split
    return rankMetrics
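rankOptions is not defined in these examples. A minimal sketch of what it is assumed to compute — the 1-indexed rank of the ground-truth option under the per-option scores — with names and shapes as assumptions:

import torch

def rankOptionsSketch(options, gtOptionInds, scores):
    # options:      (batchSize, numOptions, maxLen); unused here, kept only
    #               to mirror the call sites above
    # gtOptionInds: (batchSize,) index of the ground-truth option
    # scores:       (batchSize, numOptions) score of each candidate option
    gtScores = scores.gather(1, gtOptionInds.view(-1, 1))  # (batchSize, 1)
    # Rank = 1 + number of options scored strictly higher than the GT option
    return (scores > gtScores).sum(dim=1) + 1  # (batchSize,)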
Example #3
def rankABot(aBot, dataset, split, scoringFunction, exampleLimit=None, useNDCG=False):
    '''
        Evaluate A-Bot performance on ranking answer options when it is
        shown ground truth image features, captions and questions.

        Arguments:
            aBot    : A-Bot
            dataset : VisDialDataset instance
            split   : Dataset split, can be 'val' or 'test'

            scoringFunction : A function which computes negative log
                              likelihood of a sequence (answer) given log
                              probabilities under an RNN model. Currently
                              utils.maskedNll is the only such function used.
            exampleLimit    : Maximum number of data points to use from
                              the dataset split. If None, all data points.
            useNDCG         : If True, also compute NDCG on the val split
                              using the dense 'gt_relevance' annotations.
    '''

    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit

    numBatches = (numExamples - 1) // batchSize + 1

    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=True,
        num_workers=1,
        collate_fn=dataset.collate_fn)

    # sparse_metrics = SparseGTMetrics()
    ndcg = None
    if useNDCG:
        ndcg = NDCG()
    ranks_json = []

    totalLoss, totalTokens = 0, 0
    ranks = []
    logProbsAll = [[] for _ in range(numRounds)]
    start_t = timer()

    getImgFileName = lambda x: dataset.data['%s_img_fnames' % split][x]
    getImgId = lambda x: int(getImgFileName(x)[:-4][-12:])

    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break

        if dataset.useGPU:
            batch = {
                key: v.cuda() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }
        else:
            batch = {
                key: v.contiguous() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }

        image = Variable(batch['img_feat'], volatile=True)
        caption = Variable(batch['cap'], volatile=True)
        captionLens = Variable(batch['cap_len'], volatile=True)
        questions = Variable(batch['ques'], volatile=True)
        quesLens = Variable(batch['ques_len'], volatile=True)
        answers = Variable(batch['ans'], volatile=True)
        ansLens = Variable(batch['ans_len'], volatile=True)
        options = Variable(batch['opt'], volatile=True)
        optionLens = Variable(batch['opt_len'], volatile=True)

        gtRelevance = None
        round_id = None
        img_ids = None
        correctOptionInds = None

        if split != 'test':
            correctOptionInds = Variable(batch['ans_id'], volatile=True)

        if split == 'val' and useNDCG:
            # read in gtRelevance and round
            gtRelevance = Variable(batch['gt_relevance'],volatile=True)
            round_id = Variable(batch['round_id'],volatile=True)
            img_ids = Variable(batch['image_id'], volatile=True)

        if split == 'test':
            img_ids = [getImgId(x) for x in batch['index']]

        aBot.reset()
        aBot.observe(-1, image=image, caption=caption, captionLens=captionLens)
        log_probs_rounds = []
        for round in range(numRounds):
            aBot.observe(
                round,
                ques=questions[:, round],
                quesLens=quesLens[:, round],
                ans=answers[:, round],
                ansLens=ansLens[:, round])
            logProbs = aBot.evalOptions(options[:, round],
                                    optionLens[:, round], scoringFunction)
            if useNDCG:
                log_probs_rounds.append(logProbs.unsqueeze(1))
            logProbsCurrent = aBot.forward()
            logProbsAll[round].append(
                scoringFunction(logProbsCurrent,
                                answers[:, round].contiguous()))
            if split != 'test':
                batchRanks = rankOptions(options[:, round],
                                         correctOptionInds[:, round], logProbs)
                ranks.append(batchRanks)
        batch['num_rounds'] = batch['num_rounds'].squeeze(1)
        output = None
        if useNDCG or split == 'test':

            output = torch.cat(log_probs_rounds,dim=1)
            ranks_cur = scores_to_ranks(output)

            for i in range(len(img_ids)):
                # cast into types explicitly to ensure no errors in schema
                # round ids are 1-10, not 0-9
                if split == "test":
                    ranks_json.append({
                        "image_id": img_ids[i],
                        "round_id": int(batch["num_rounds"][i]),
                        "ranks": ranks_cur[i][batch["num_rounds"][i] - 1].data.cpu().tolist()
                    })
                else:
                    for j in range(numRounds):
                        ranks_json.append({
                            "image_id": img_ids[i].data[0],
                            "round_id": int(j + 1),
                            "ranks": [rank.data[0] for rank in ranks_cur[i][j]]
                        })

        if split == "val":
            # sparse_metrics.observe(output, correctOptionInds)
            if "gt_relevance" in batch and useNDCG:
                indices = torch.arange(output.shape[0]).long().cpu().numpy()
                round_id_numpy = round_id.long().cpu().data.numpy()
                round_id_numpy = round_id_numpy.reshape(-1)
                output = output.cpu().data.numpy()
                output = output[indices, round_id_numpy-1, :]
                output = Variable(torch.from_numpy(output),volatile=True)
                ndcg.observe(output, gtRelevance)

        end_t = timer()
        delta_t = " Rate: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Abot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()

    sys.stdout.write("\n")
    dataloader = None
    print("Sleeping for 3 seconds to let dataloader subprocesses exit...")
    time.sleep(3)  # assumes `import time` at module level
    dataset.split = original_split

    if split == 'test':
        # dump the EvalAI submission file
        dir_out = 'predictions.txt'
        json.dump(ranks_json, open(dir_out, "w"))
        return

    ranks = torch.cat(ranks, 0)
    rankMetrics = metrics.computeMetrics(ranks.cpu())

    logProbsAll = [torch.cat(lprobs, 0).mean() for lprobs in logProbsAll]
    roundwiseLogProbs = torch.cat(logProbsAll, 0).data.cpu().numpy()
    logProbsMean = roundwiseLogProbs.mean()
    rankMetrics['logProbsMean'] = logProbsMean

    if split == "val" and useNDCG:
        rankMetrics.update(ndcg.retrieve(reset=True))
        for metric_name, metric_value in rankMetrics.items():
            print(f"{metric_name}: {metric_value}")
    return rankMetrics
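scores_to_ranks (used for the EvalAI dump above) is also not shown. A minimal sketch, assuming it converts a (batchSize, numRounds, numOptions) score tensor into 1-indexed per-option ranks:

import torch

def scoresToRanksSketch(scores):
    # ranked_idx[..., k] holds the option index with the (k+1)-th best score
    _, ranked_idx = scores.sort(dim=-1, descending=True)
    # Invert the permutation so ranks[..., j] is the 1-indexed rank of option j
    numOptions = scores.size(-1)
    ranks = torch.zeros_like(ranked_idx)
    ranks.scatter_(-1, ranked_idx,
                   torch.arange(1, numOptions + 1).expand_as(ranked_idx))
    return ranks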
Example #4
def rankABot(aBot, dataset, split, scoringFunction, exampleLimit=None):
    """
        Evaluate A-Bot performance on ranking answer options when it is
        shown ground truth image features, captions and questions.

        Arguments:
            aBot    : A-Bot
            dataset : VisDialDataset instance
            split   : Dataset split, can be 'val' or 'test'

            scoringFunction : A function which computes negative log
                              likelihood of a sequence (answer) given log
                              probabilities under an RNN model. Currently
                              utils.maskedNll is the only such function used.
            exampleLimit    : Maximum number of data points to use from
                              the dataset split. If None, all data points.
    """
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit

    numBatches = (numExamples - 1) // batchSize + 1

    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=True,
        num_workers=1,
        collate_fn=dataset.collate_fn,
    )

    totalLoss, totalTokens = 0, 0
    ranks = []
    logProbsAll = [[] for _ in range(numRounds)]
    start_t = timer()
    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break

        if dataset.useGPU:
            batch = {
                key: v.cuda() if hasattr(v, "cuda") else v
                for key, v in batch.items()
            }
        else:
            batch = {
                key: v.contiguous() if hasattr(v, "cuda") else v
                for key, v in batch.items()
            }

        image = batch["img_feat"]
        caption = batch["cap"]
        captionLens = batch["cap_len"]
        questions = batch["ques"]
        quesLens = batch["ques_len"]
        answers = batch["ans"]
        ansLens = batch["ans_len"]
        options = batch["opt"]
        optionLens = batch["opt_len"]
        correctOptionInds = batch["ans_id"]
        aBot.reset()
        aBot.observe(-1, image=image, caption=caption, captionLens=captionLens)
        with torch.no_grad():
            for round in range(numRounds):
                aBot.observe(
                    round,
                    ques=questions[:, round],
                    quesLens=quesLens[:, round],
                    ans=answers[:, round],
                    ansLens=ansLens[:, round],
                )
                logProbs = aBot.evalOptions(options[:, round],
                                            optionLens[:, round],
                                            scoringFunction)
                logProbsCurrent = aBot.forward()
                logProbsAll[round].append(
                    scoringFunction(logProbsCurrent,
                                    answers[:, round].contiguous()))
                batchRanks = rankOptions(options[:, round],
                                         correctOptionInds[:, round], logProbs)
                ranks.append(batchRanks)

            end_t = timer()
            delta_t = " Rate: %5.2fs" % (end_t - start_t)
            start_t = end_t
            progressString = "\r[Abot] Evaluating split '%s' [%d/%d]\t" + delta_t
            sys.stdout.write(progressString % (split, idx + 1, numBatches))
            sys.stdout.flush()
    sys.stdout.write("\n")
    dataloader = None
    print("Sleeping for 3 seconds to let dataloader subprocesses exit...")
    time.sleep(3)  # assumes `import time` at module level
    ranks = torch.cat(ranks, 0)
    rankMetrics = metrics.computeMetrics(ranks.cpu())

    logProbsAll = [torch.stack(lprobs).mean() for lprobs in logProbsAll]
    roundwiseLogProbs = torch.stack(logProbsAll).data.cpu().numpy()
    logProbsMean = roundwiseLogProbs.mean()
    rankMetrics["logProbsMean"] = logProbsMean

    dataset.split = original_split
    return rankMetrics
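The scoringFunction passed around here (utils.maskedNll, per the docstrings) is assumed to average sequence negative log likelihood while ignoring padding. A minimal sketch under that assumption:

import torch

def maskedNllSketch(logProbs, target, padToken=0):
    # logProbs: (batchSize, seqLen, vocabSize) per-token log probabilities
    # target:   (batchSize, seqLen) token ids, padded with padToken
    tokenLogProbs = logProbs.gather(2, target.unsqueeze(2)).squeeze(2)
    mask = (target != padToken).float()
    # Mean negative log likelihood over all non-pad tokens
    return -(tokenLogProbs * mask).sum() / mask.sum()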
Example #5
def rankQABots(qBot, aBot, dataset, split, exampleLimit=None, beamSize=1):
    """
        Evaluates Q-Bot and A-Bot performance on image retrieval where
        both agents must converse with each other without any ground truth
        dialog. The common caption shown to both agents is not the ground
        truth caption, but is instead a caption generated (pre-computed)
        by a pre-trained captioning model (neuraltalk2).

        Arguments:
            qBot    : Q-Bot
            aBot    : A-Bot
            dataset : VisDialDataset instance
            split   : Dataset split, can be 'val' or 'test'

            exampleLimit : Maximum number of data points to use from
                           the dataset split. If None, all data points.
            beamSize     : Beam search width for generating utterances
    """

    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit
    numBatches = (numExamples - 1) // batchSize + 1
    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=False,
        num_workers=0,
        collate_fn=dataset.collate_fn,
    )

    gtImgFeatures = []
    roundwiseFeaturePreds = [[] for _ in range(numRounds + 1)]

    start_t = timer()
    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break

        if dataset.useGPU:
            batch = {
                key: v.cuda()
                for key, v in batch.items() if hasattr(v, "cuda")
            }
        else:
            batch = {
                key: v.contiguous()
                for key, v in batch.items() if hasattr(v, "cuda")
            }

        caption = Variable(batch["cap"], volatile=True)
        captionLens = Variable(batch["cap_len"], volatile=True)
        gtQuestions = Variable(batch["ques"], volatile=True)
        gtQuesLens = Variable(batch["ques_len"], volatile=True)
        answers = Variable(batch["ans"], volatile=True)
        ansLens = Variable(batch["ans_len"], volatile=True)
        gtFeatures = Variable(batch["img_feat"], volatile=True)
        image = Variable(batch["img_feat"], volatile=True)

        aBot.eval(), aBot.reset()
        aBot.observe(-1, image=image, caption=caption, captionLens=captionLens)
        qBot.eval(), qBot.reset()
        qBot.observe(-1, caption=caption, captionLens=captionLens)

        predFeatures = qBot.predictImage()
        roundwiseFeaturePreds[0].append(predFeatures)

        for round in range(numRounds):
            questions, quesLens = qBot.forwardDecode(inference="greedy",
                                                     beamSize=beamSize)
            qBot.observe(round, ques=questions, quesLens=quesLens)
            aBot.observe(round, ques=questions, quesLens=quesLens)
            answers, ansLens = aBot.forwardDecode(inference="greedy",
                                                  beamSize=beamSize)
            aBot.observe(round, ans=answers, ansLens=ansLens)
            qBot.observe(round, ans=answers, ansLens=ansLens)
            predFeatures = qBot.predictImage()
            roundwiseFeaturePreds[round + 1].append(predFeatures)
        gtImgFeatures.append(gtFeatures)

        end_t = timer()
        delta_t = " Rate: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Qbot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()
    sys.stdout.write("\n")

    gtFeatures = torch.cat(gtImgFeatures, 0).data.cpu().numpy()
    rankMetricsRounds = []

    print("Percentile mean rank (round, mean, low, high)")
    for round in range(numRounds + 1):
        predFeatures = torch.cat(roundwiseFeaturePreds[round],
                                 0).data.cpu().numpy()
        dists = pairwise_distances(predFeatures, gtFeatures)
        # num_examples x num_examples
        ranks = []
        for i in range(dists.shape[0]):
            # Computing rank of i-th prediction vs all images in split
            rank = int(np.where(dists[i, :].argsort() == i)[0]) + 1
            ranks.append(rank)
        ranks = np.array(ranks)
        rankMetrics = metrics.computeMetrics(Variable(torch.from_numpy(ranks)))
        assert len(ranks) == len(dataset)
        poolSize = len(dataset)
        meanRank = ranks.mean()
        se = ranks.std() / np.sqrt(poolSize)
        meanPercRank = 100 * (1 - (meanRank / poolSize))
        percRankLow = 100 * (1 - ((meanRank + se) / poolSize))
        percRankHigh = 100 * (1 - ((meanRank - se) / poolSize))
        print((round, meanPercRank, percRankLow, percRankHigh))
        rankMetrics["percentile"] = meanPercRank
        rankMetricsRounds.append(rankMetrics)

    dataset.split = original_split
    return rankMetricsRounds[-1], rankMetricsRounds
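Every retrieval loop above ranks image i by locating index i in the argsort of its distance row. A tiny worked example of that idiom:

import numpy as np

dists = np.array([0.30, 0.12, 0.51])  # distances from prediction 0 to 3 images
order = dists.argsort()               # [1, 0, 2]: image indices, nearest first
# The ground-truth image for prediction 0 is image 0; its position in the
# sorted order (0-based) plus one is its rank.
rank = int(np.where(order == 0)[0]) + 1  # -> 2, since image 1 is closer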
Example #6
def rankQBot(qBot, dataset, split, exampleLimit=None, verbose=0):
    """
        Evaluates Q-Bot performance on image retrieval when it is shown
        ground truth captions, questions and answers. Q-Bot does not
        generate dialog in this setting - it only encodes ground truth
        captions and dialog in order to perform image retrieval by
        predicting FC-7 image features after each round of dialog.

        Arguments:
            qBot    : Q-Bot
            dataset : VisDialDataset instance
            split   : Dataset split, can be 'val' or 'test'

            exampleLimit : Maximum number of data points to use from
                           the dataset split. If None, all data points.
    """
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit
    numBatches = (numExamples - 1) // batchSize + 1
    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=True,
        num_workers=0,
        collate_fn=dataset.collate_fn,
    )

    # enumerate all gt features and all predicted features
    gtImgFeatures = []
    # caption + dialog rounds
    roundwiseFeaturePreds = [[] for _ in range(numRounds + 1)]
    logProbsAll = [[] for _ in range(numRounds)]
    featLossAll = [[] for _ in range(numRounds + 1)]
    start_t = timer()
    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break

        if dataset.useGPU:
            batch = {
                key: v.cuda()
                for key, v in batch.items() if hasattr(v, "cuda")
            }
        else:
            batch = {
                key: v.contiguous()
                for key, v in batch.items() if hasattr(v, "cuda")
            }
        caption = batch["cap"]
        captionLens = batch["cap_len"]
        gtQuestions = batch["ques"]
        gtQuesLens = batch["ques_len"]
        answers = batch["ans"]
        ansLens = batch["ans_len"]
        gtFeatures = batch["img_feat"]
        qBot.reset()
        qBot.observe(-1, caption=caption, captionLens=captionLens)
        predFeatures = qBot.predictImage()
        # Evaluating round 0 feature regression network
        featLoss = F.mse_loss(predFeatures, gtFeatures)
        featLossAll[0].append(torch.mean(featLoss))
        # Keeping round 0 predictions
        roundwiseFeaturePreds[0].append(predFeatures)
        with torch.no_grad():
            for round in range(numRounds):
                qBot.observe(round,
                             ques=gtQuestions[:, round],
                             quesLens=gtQuesLens[:, round])
                qBot.observe(round,
                             ans=answers[:, round],
                             ansLens=ansLens[:, round])
                logProbsCurrent = qBot.forward()
                # Evaluating logProbs for cross entropy
                logProbsAll[round].append(
                    utils.maskedNll(logProbsCurrent,
                                    gtQuestions[:, round].contiguous()))
                predFeatures = qBot.predictImage()
                # Evaluating feature regression network
                featLoss = F.mse_loss(predFeatures, gtFeatures)
                featLossAll[round + 1].append(torch.mean(featLoss))
                # Keeping predictions
                roundwiseFeaturePreds[round + 1].append(predFeatures)
        gtImgFeatures.append(gtFeatures)

        end_t = timer()
        delta_t = " Time: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Qbot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()
    sys.stdout.write("\n")

    gtFeatures = torch.cat(gtImgFeatures, 0).data.cpu().numpy()
    rankMetricsRounds = []
    poolSize = len(dataset)

    # Keeping tracking of feature regression loss and CE logprobs
    logProbsAll = [torch.stack(lprobs).mean() for lprobs in logProbsAll]
    featLossAll = [torch.stack(floss).mean() for floss in featLossAll]
    roundwiseLogProbs = torch.stack(logProbsAll).data.cpu().numpy()
    roundwiseFeatLoss = torch.stack(featLossAll).data.cpu().numpy()
    logProbsMean = roundwiseLogProbs.mean()
    featLossMean = roundwiseFeatLoss.mean()

    if verbose:
        print("Percentile mean rank (round, mean, low, high)")
    for round in range(numRounds + 1):
        predFeatures = torch.cat(roundwiseFeaturePreds[round],
                                 0).data.cpu().numpy()
        # num_examples x num_examples
        dists = pairwise_distances(predFeatures, gtFeatures)
        ranks = []
        for i in range(dists.shape[0]):
            rank = int(np.where(dists[i, :].argsort() == i)[0]) + 1
            ranks.append(rank)
        ranks = np.array(ranks)
        rankMetrics = metrics.computeMetrics(Variable(torch.from_numpy(ranks)))
        meanRank = ranks.mean()
        se = ranks.std() / np.sqrt(poolSize)
        meanPercRank = 100 * (1 - (meanRank / poolSize))
        percRankLow = 100 * (1 - ((meanRank + se) / poolSize))
        percRankHigh = 100 * (1 - ((meanRank - se) / poolSize))
        if verbose:
            print((round, meanPercRank, percRankLow, percRankHigh))
        rankMetrics["percentile"] = meanPercRank
        rankMetrics["featLoss"] = roundwiseFeatLoss[round]
        if round < len(roundwiseLogProbs):
            rankMetrics["logProbs"] = roundwiseLogProbs[round]
        rankMetricsRounds.append(rankMetrics)

    rankMetricsRounds[-1]["logProbsMean"] = logProbsMean
    rankMetricsRounds[-1]["featLossMean"] = featLossMean

    dataset.split = original_split
    return rankMetricsRounds[-1], rankMetricsRounds
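metrics.computeMetrics is called throughout but never shown. A plausible minimal sketch of the standard VisDial retrieval metrics (recall@k, mean rank, mean reciprocal rank) computed from 1-indexed ranks — the key names are assumptions:

import torch

def computeMetricsSketch(ranks):
    # ranks: 1-indexed ranks of the ground truth, one per example
    ranks = ranks.float()
    return {
        'r@1': 100 * (ranks <= 1).float().mean(),
        'r@5': 100 * (ranks <= 5).float().mean(),
        'r@10': 100 * (ranks <= 10).float().mean(),
        'mean': ranks.mean(),          # mean rank (lower is better)
        'mrr': (1.0 / ranks).mean(),   # mean reciprocal rank
    }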
Example #7
def rankQBot(qBot, dataset, split, exampleLimit=None, verbose=0):
    '''
        Evaluates Q-Bot performance on image retrieval when it is shown
        ground truth captions, questions and answers. Q-Bot does not
        generate dialog in this setting - it only encodes ground truth
        captions and dialog in order to perform image retrieval by
        predicting FC-7 image features after each round of dialog.

        Arguments:
            qBot    : Q-Bot
            dataset : VisDialDataset instance
            split   : Dataset split, can be 'val' or 'test'

            exampleLimit : Maximum number of data points to use from
                           the dataset split. If None, all data points.
    '''
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    original_split = dataset.split  # train
    dataset.split = split  # val
    if exampleLimit is not None:
        numExamples = exampleLimit
    elif dataset.split == 'val':
        numExamples = dataset.numDataPoints[split] - 3
    else:
        numExamples = dataset.numDataPoints[split]

    numBatches = (numExamples - 1) // batchSize + 1  # number of eval batches
    dataloader = DataLoader(dataset,
                            batch_size=batchSize,
                            shuffle=True,
                            num_workers=1,
                            collate_fn=dataset.collate_fn)

    # enumerate all gt features and all predicted features
    gtImgFeatures = []
    # caption + dialog rounds
    roundwiseFeaturePreds = [[] for _ in range(numRounds + 1)]  # index 0 = pre-dialog guess
    logProbsAll = [[] for _ in range(numRounds)]
    featLossAll = [[] for _ in range(numRounds + 1)]
    start_t = timer()
    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break
        if dataset.useGPU:
            batch = {
                key: v.cuda()
                for key, v in batch.items() if hasattr(v, 'cuda')
            }
        else:
            batch = {
                key: v.contiguous()
                for key, v in batch.items() if hasattr(v, 'cuda')
            }
        caption = Variable(batch['cap'], volatile=True)
        captionLens = Variable(batch['cap_len'], volatile=True)
        gtQuestions = Variable(batch['ques'], volatile=True)
        gtQuesLens = Variable(batch['ques_len'], volatile=True)
        answers = Variable(batch['ans'], volatile=True)
        ansLens = Variable(batch['ans_len'], volatile=True)
        gtFeatures = Variable(batch['img_feat'], volatile=True)
        qBot.reset()  # clear dialog history
        qBot.observe(-1, caption=caption, captionLens=captionLens)

        predFeatures = qBot.predictImage()  # (batchSize, featSize)
        featLoss = F.mse_loss(predFeatures, gtFeatures)  # batch loss
        featLossAll[0].append(torch.mean(featLoss))
        roundwiseFeaturePreds[0].append(predFeatures)  # round-0 predictions

        for round in range(numRounds):
            qBot.observe(round,
                         ques=gtQuestions[:, round],
                         quesLens=gtQuesLens[:, round])
            qBot.observe(round,
                         ans=answers[:, round],
                         ansLens=ansLens[:, round])
            logProbsCurrent = qBot.forward()  # (batchSize, maxQuesLen, vocabSize)
            logProbsAll[round].append(
                utils.maskedNll(logProbsCurrent,
                                gtQuestions[:, round].contiguous()))
            predFeatures = qBot.predictImage()
            featLoss = F.mse_loss(predFeatures, gtFeatures)
            featLossAll[round + 1].append(torch.mean(featLoss))
            roundwiseFeaturePreds[round + 1].append(predFeatures)
        gtImgFeatures.append(gtFeatures)

        end_t = timer()
        delta_t = " Time: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Qbot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()
    sys.stdout.write("\n")

    gtFeatures = torch.cat(gtImgFeatures, 0).data.cpu().numpy()
    rankMetricsRounds = []
    poolSize = len(dataset)

    ##############################################################################
    #  Question log-likelihood & feature regression loss
    ##############################################################################
    logProbsAll = [torch.cat(lprobs, 0).mean()
                   for lprobs in logProbsAll]  # mean question NLL per round
    featLossAll = [torch.cat(floss, 0).mean()
                   for floss in featLossAll]  # mean feature loss per round
    roundwiseLogProbs = torch.cat(logProbsAll, 0).data.cpu().numpy()
    roundwiseFeatLoss = torch.cat(featLossAll, 0).data.cpu().numpy()
    logProbsMean = roundwiseLogProbs.mean()  # mean over all rounds
    featLossMean = roundwiseFeatLoss.mean()

    ##############################################################################
    #   Percentile mean rank
    ##############################################################################
    if verbose:
        print("Percentile mean rank (round, mean, low, high)")
    for round in range(numRounds + 1):
        predFeatures = torch.cat(
            roundwiseFeaturePreds[round],
            0).data.cpu().numpy()  # (numExamples, featSize)
        dists = pairwise_distances(predFeatures,
                                   gtFeatures)  # (numExamples, numExamples)
        ranks = []
        for i in range(dists.shape[0]):
            rank = int(np.where(dists[i, :].argsort() == i)[0]) + 1
            ranks.append(rank)
        ranks = np.array(ranks)  # (numExamples,)
        rankMetrics = metrics.computeMetrics(Variable(torch.from_numpy(ranks)))
        meanRank = ranks.mean()
        se = ranks.std() / np.sqrt(poolSize)
        meanPercRank = 100 * (1 - (meanRank / poolSize))
        percRankLow = 100 * (1 - ((meanRank + se) / poolSize))
        percRankHigh = 100 * (1 - ((meanRank - se) / poolSize))
        if verbose:
            print((round, meanPercRank, percRankLow, percRankHigh))
        rankMetrics['percentile'] = meanPercRank
        rankMetrics['featLoss'] = roundwiseFeatLoss[round]
        if round < len(roundwiseLogProbs):
            rankMetrics['logProbs'] = roundwiseLogProbs[round]
        rankMetricsRounds.append(rankMetrics)

    rankMetricsRounds[-1]['logProbsMean'] = logProbsMean
    rankMetricsRounds[-1]['featLossMean'] = featLossMean

    dataset.split = original_split
    return rankMetricsRounds[-1], rankMetricsRounds
Example #8
def rankQABots(qBot,
               aBot,
               dataset,
               split,
               expLowerLimit=None,
               expUpperLimit=None,
               exampleLimit=None,
               beamSize=1,
               numRounds=None,
               zeroCaption=0,
               randomCaption=0):
    '''
        Evaluates Q-Bot and A-Bot performance on image retrieval where
        both agents must converse with each other without any ground truth
        dialog. The common caption shown to both agents is not the ground
        truth caption, but is instead a caption generated (pre-computed)
        by a pre-trained captioning model (neuraltalk2).

        Arguments:
            qBot    : Q-Bot
            aBot    : A-Bot
            dataset : VisDialDataset instance
            split   : Dataset split, can be 'val' or 'test'

            exampleLimit : Maximum number of data points to use from
                           the dataset split. If None, all data points.
            beamSize     : Beam search width for generating utterances
            expLowerLimit, expUpperLimit : Optional batch-index window for
                           evaluating only a slice of the split.
            zeroCaption  : If nonzero, zero out the caption tokens.
            randomCaption: If nonzero, replace the caption with a random
                           caption drawn from the dataset.
    '''
    def getRandomCaption(dataset):
        dataloader = DataLoader(dataset,
                                batch_size=1,
                                shuffle=True,
                                num_workers=0,
                                collate_fn=dataset.collate_fn)

        for idx, batch in enumerate(dataloader):
            if dataset.useGPU:
                batch = {key: v.cuda() for key, v in batch.items() \
                         if hasattr(v, 'cuda')}
            else:
                batch = {key: v.contiguous() for key, v in batch.items() \
                         if hasattr(v, 'cuda')}

            caption = batch['cap']
            captionLens = batch['cap_len']
            return caption, captionLens

    if expLowerLimit is not None: assert expUpperLimit is not None

    # batchSize = dataset.batchSize
    batchSize = 1
    print('dataset.batchsize', dataset.batchSize)
    numRounds = dataset.numRounds if numRounds is None else numRounds
    print('dataset.numRounds', dataset.numRounds)
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit
    numBatches = (numExamples - 1) // batchSize + 1
    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(dataset,
                            batch_size=batchSize,
                            shuffle=False,
                            num_workers=0,
                            collate_fn=dataset.collate_fn)

    gtImgFeatures = []
    roundwiseFeaturePreds = [[] for _ in range(numRounds + 1)]

    for batch in dataloader:
        if dataset.useGPU:
            batch = {key: v.cuda() for key, v in batch.items() \
                                            if hasattr(v, 'cuda')}
        else:
            batch = {key: v.contiguous() for key, v in batch.items() \
                                            if hasattr(v, 'cuda')}
        gtImgFeatures.append(Variable(batch['img_feat'], volatile=True))
    gtFeatures = torch.cat(gtImgFeatures, 0).data.cpu().numpy()  # (numExamples, featSize)

    start_t = timer()
    for idx, batch in enumerate(dataloader):
        if expLowerLimit is not None:
            if idx < expLowerLimit: continue
            if idx >= expUpperLimit: break
        else:
            if idx == numBatches:
                break

        if dataset.useGPU:
            batch = {key: v.cuda() for key, v in batch.items() \
                                            if hasattr(v, 'cuda')}
        else:
            batch = {key: v.contiguous() for key, v in batch.items() \
                                            if hasattr(v, 'cuda')}

        if zeroCaption:  # warning: modifies the batch in place (no deepcopy)
            batch['cap'].zero_()
            gc.collect()
        if randomCaption:
            batch['cap'], batch['cap_len'] = getRandomCaption(dataset)

        caption = Variable(batch['cap'], volatile=True)
        captionLens = Variable(batch['cap_len'], volatile=True)
        gtQuestions = Variable(batch['ques'], volatile=True)
        gtQuesLens = Variable(batch['ques_len'], volatile=True)
        answers = Variable(batch['ans'], volatile=True)
        ansLens = Variable(batch['ans_len'], volatile=True)
        # gtFeatures = Variable(batch['img_feat'], volatile=True)
        image = Variable(batch['img_feat'], volatile=True)

        aBot.eval(), aBot.reset()
        aBot.observe(-1, image=image, caption=caption, captionLens=captionLens)
        qBot.eval(), qBot.reset()
        qBot.observe(-1, caption=caption, captionLens=captionLens)

        predFeatures = qBot.predictImage()
        roundwiseFeaturePreds[0].append(predFeatures)

        for round in range(numRounds):
            questions, quesLens = qBot.forwardDecode(inference='greedy',
                                                     beamSize=beamSize)
            qBot.observe(round, ques=questions, quesLens=quesLens)
            aBot.observe(round, ques=questions, quesLens=quesLens)
            answers, ansLens = aBot.forwardDecode(inference='greedy',
                                                  beamSize=beamSize)
            aBot.observe(round, ans=answers, ansLens=ansLens)
            qBot.observe(round, ans=answers, ansLens=ansLens)
            predFeatures = qBot.predictImage()
            roundwiseFeaturePreds[round + 1].append(predFeatures)

        end_t = timer()
        delta_t = " Rate: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Qbot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()
    sys.stdout.write("\n")

    rankMetricsRounds = []

    print("Percentile mean rank (round, mean, low, high)")
    for round in range(numRounds + 1):
        predFeatures = torch.cat(roundwiseFeaturePreds[round],
                                 0).data.cpu().numpy()
        dists = pairwise_distances(predFeatures, gtFeatures)
        # num_examples x num_examples
        ranks = []
        for i in range(dists.shape[0]):
            # Rank of the i-th prediction vs all images in the split
            if expLowerLimit is None:
                rank = int(np.where(dists[i, :].argsort() == i)[0]) + 1
            else:
                # Offset by expLowerLimit: predictions cover only a slice
                # of the split, while gtFeatures covers all of it
                rank = int(
                    np.where(dists[i, :].argsort() == i +
                             expLowerLimit)[0]) + 1
            ranks.append(rank)
        ranks = np.array(ranks)
        rankMetrics = metrics.computeMetrics(Variable(torch.from_numpy(ranks)))
        # assert len(ranks) == len(dataset)
        poolSize = len(dataset)
        meanRank = ranks.mean()
        se = ranks.std() / np.sqrt(poolSize)
        meanPercRank = 100 * (1 - (meanRank / poolSize))
        percRankLow = 100 * (1 - ((meanRank + se) / poolSize))
        percRankHigh = 100 * (1 - ((meanRank - se) / poolSize))
        print('%d\t%f\t%f\t%f' %
              (round, meanPercRank, percRankLow, percRankHigh))
        rankMetrics['percentile'] = meanPercRank
        rankMetricsRounds.append(rankMetrics)

    dataset.split = original_split
    return rankMetricsRounds[-1], rankMetricsRounds
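This variant adds caption ablations and slice-wise evaluation on top of the plain rankQABots. A hypothetical call (argument values are purely illustrative):

# Evaluate only batches [100, 200) of the val split with the caption zeroed
# out, using beam search of width 5 for both agents.
rankMetrics, roundMetrics = rankQABots(qBot, aBot, dataset, 'val',
                                       expLowerLimit=100, expUpperLimit=200,
                                       beamSize=5, zeroCaption=1)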
Example #9
def rankABot_category_specific(aBot,
                               dataset,
                               split,
                               category,
                               categoryFiltering,
                               scoringFunction,
                               exampleLimit=None):
    '''
        Evaluate A-Bot performance on ranking answer options when it is
        shown ground truth image features, captions and questions.

        Arguments:
            aBot    : A-Bot
            dataset : VisDialDataset instance
            split   : Dataset split, can be 'val' or 'test'

            scoringFunction : A function which computes negative log
                              likelihood of a sequence (answer) given log
                              probabilities under an RNN model. Currently
                              utils.maskedNll is the only such function used.
            exampleLimit    : Maximum number of data points to use from
                              the dataset split. If None, all data points.
            category        : Name of the question category being evaluated.
            categoryFiltering : Dict mapping conversation id (as a string)
                              to the round indices that belong to `category`.
    '''
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit

    numBatches = (numExamples - 1) // batchSize + 1

    skipped_batches = []

    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(dataset,
                            batch_size=batchSize,
                            shuffle=True,
                            num_workers=1,
                            collate_fn=dataset.collate_fn)

    totalLoss, totalTokens = 0, 0
    ranks = []
    logProbsAll = [[] for _ in range(numRounds)]
    start_t = timer()
    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break

        if dataset.useGPU:
            batch = {
                key: v.cuda() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }
        else:
            batch = {
                key: v.contiguous() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }

        image = Variable(batch['img_feat'], volatile=True)
        caption = Variable(batch['cap'], volatile=True)
        captionLens = Variable(batch['cap_len'], volatile=True)
        questions = Variable(batch['ques'], volatile=True)
        quesLens = Variable(batch['ques_len'], volatile=True)
        answers = Variable(batch['ans'], volatile=True)
        ansLens = Variable(batch['ans_len'], volatile=True)
        options = Variable(batch['opt'], volatile=True)
        optionLens = Variable(batch['opt_len'], volatile=True)
        correctOptionInds = Variable(batch['ans_id'], volatile=True)
        convId = Variable(batch['conv_id'], volatile=False)

        # Get conversation category mapping for the batch
        category_mapping_conv = [
            categoryFiltering.get(str(batched_convId), [])
            for batched_convId in convId.data
        ]
        entire_batch_empty = True
        for category_rounds in category_mapping_conv:
            if len(category_rounds) > 0: entire_batch_empty = False
        if entire_batch_empty:
            skipped_batches.append(idx)
            continue

        aBot.reset()
        aBot.observe(-1, image=image, caption=caption, captionLens=captionLens)
        for round in range(numRounds):
            aBot.observe(round,
                         ques=questions[:, round],
                         quesLens=quesLens[:, round],
                         ans=answers[:, round],
                         ansLens=ansLens[:, round])

            logProbs = aBot.evalOptions(options[:, round],
                                        optionLens[:, round],
                                        scoringFunction)  # (batchSize, numOptions)
            logProbsCurrent = aBot.forward(
            )  # (batchSize, maxAnsLen, vocabSize)

            for bidx in range(len(convId)):
                if round in category_mapping_conv[bidx]:

                    logProbsAll[round].append(
                        scoringFunction(
                            logProbsCurrent[bidx].unsqueeze(0),
                            answers[bidx, round].unsqueeze(0).contiguous()))
                    batchRanks = rankOptions(
                        options[bidx, round].unsqueeze(0),
                        correctOptionInds[bidx, round],
                        logProbs[bidx].unsqueeze(0))  # (1,)
                    ranks.append(batchRanks)

        end_t = timer()
        delta_t = " Rate: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Abot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()
    sys.stdout.write("\n")
    dataloader = None
    print("Sleeping for 3 seconds to let dataloader subprocesses exit...")
    time.sleep(3)  # assumes `import time` at module level
    ranks = torch.cat(ranks, 0)  # flatten per-batch, per-round ranks
    rankMetrics = metrics.computeMetrics(ranks.cpu())

    logProbsAll = [torch.cat(lprobs, 0).mean()
                   for lprobs in logProbsAll]  # mean answer NLL per round
    roundwiseLogProbs = torch.cat(logProbsAll, 0).data.cpu().numpy()
    logProbsMean = roundwiseLogProbs.mean()
    rankMetrics['logProbsMean'] = 1. * logProbsMean

    dataset.split = original_split
    return rankMetrics
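categoryFiltering restricts scoring to particular rounds of particular conversations. A hypothetical mapping and call (ids, rounds, and the category name are made up):

# Conversation id (string) -> rounds whose questions belong to the category
categoryFiltering = {
    '185565': [0, 3, 7],
    '249044': [2],
}
rankMetrics = rankABot_category_specific(aBot, dataset, 'val', 'counting',
                                         categoryFiltering, utils.maskedNll)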
Example #10
def DialogEval(val_model, dataset, split, exampleLimit=None, verbose=0, txt_retrieval_mode='mse'):
    print("text retrieval mode is: {}".format(txt_retrieval_mode))
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit
    numBatches = (numExamples - 1) // batchSize + 1
    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=True,
        num_workers=0,
        collate_fn=dataset.collate_fn)

    # enumerate all gt features and all predicted features
    gttxtFeatures = []
    # caption + dialog rounds
    roundwiseFeaturePreds = [[] for _ in range(numRounds + 1)]
    logProbsAll = [[] for _ in range(numRounds)]
    featLossAll = [[] for _ in range(numRounds + 1)]
    # Added by Mingyang Zhou for Perplexity Computation
    perplexityAll = [[] for _ in range(numRounds)]
    start_t = timer()

    # Modified by Mingyang Zhou
    # Record the wining rates for the questioner in multiple games
    win_rate = [0] * (numRounds + 1)
    num_games = 0

    # Modified by Mingyang Zhou
    all_txt_feat = txtLoader(dataloader, dataset)
    im_ranker = Ranker()

    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break

        if dataset.useGPU:
            batch = {
                key: v.cuda()
                for key, v in batch.items() if hasattr(v, 'cuda')
            }
        else:
            batch = {
                key: v.contiguous()
                for key, v in batch.items() if hasattr(v, 'cuda')
            }
        with torch.no_grad():
            caption = Variable(batch['cap'])
            captionLens = Variable(batch['cap_len'])
            gtQuestions = Variable(batch['ques'])
            gtQuesLens = Variable(batch['ques_len'])
            answers = Variable(batch['ans'])
            ansLens = Variable(batch['ans_len'])
            if txt_retrieval_mode == "mse":
                if val_model.txtEncodingMode == "txtuess":
                    gtFeatures = val_model.forwardtext(Variable(batch['txt_feat']))
                else:
                    gtFeatures = Variable(batch['txt_feat'])
            else:
                gtFeatures = Variable(batch['txt_feat'])
                gtFeatures = val_model.multimodalpredictIm(gtFeatures)
            text = Variable(batch['txt_feat'])  # Added by Mingyang Zhou
            # Update the Ranker
            if val_model.txtEncodingMode == "txtuess":
                im_ranker.update_rep(val_model, all_txt_feat)

            val_model.reset()
            val_model.observe(-1, caption=caption, captionLens=captionLens)
            if val_model.new_questioner:
                val_model.observe_txt(text)

            if val_model.txtEncodingMode == "txtuess":
                act_index = torch.randint(
                    0, all_txt_feat.size(0) - 1, (text.size(0), 1))
                predicted_text = all_txt_feat[act_index].squeeze(1)
                val_model.observe_txt(predicted_text)

            if txt_retrieval_mode == "mse":
                predFeatures = val_model.predicttext()
                # Evaluating round 0 feature regression network
                featLoss = F.mse_loss(predFeatures, gtFeatures)
                featLossAll[0].append(torch.mean(featLoss))
                # Keeping round 0 predictions
                roundwiseFeaturePreds[0].append(predFeatures)

                # Modified by Mingyang Zhou for txtEncoding Mode == "txtuess"
                if val_model.txtEncodingMode == "txtuess":
                    act_index = im_ranker.nearest_neighbor(predFeatures.data)
                    predicted_text = all_txt_feat[act_index]

                # Compute the winning rate at round 0 (modified by Mingyang Zhou)
                round_dists = pairwise_distances(
                    predFeatures.cpu().numpy(), gtFeatures.cpu().numpy())

                for i in range(round_dists.shape[0]):
                    current_rank = int(
                        np.where(round_dists[i, :].argsort() == i)[0]) + 1
                    if current_rank <= 1:
                        win_rate[0] += 1
                    # update the num_games
                    num_games += 1

            elif txt_retrieval_mode == "cosine_similarity":
                dialogEmbedding = val_model.multimodalpredictText()
                featLoss = pairwiseRanking_criterion(
                    gtFeatures, dialogEmbedding)
                featLossAll[0].append(torch.sum(featLoss))
                roundwiseFeaturePreds[0].append(
                    dialogEmbedding)
                # Initialize round_dists, with each row holding cosine
                # similarities
                round_dists = np.matmul(
                    dialogEmbedding.cpu().numpy(), gtFeatures.cpu().numpy().transpose())
                for i in range(round_dists.shape[0]):
                    current_rank = int(
                        np.where(round_dists[i, :].argsort()[::-1] == i)[0]) + 1
                    if current_rank <= 1:
                        win_rate[0] += 1
                    # update the num_games
                    num_games += 1

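            # Teacher-forced rollout: the questioner observes ground-truth
            # questions and answers each round, so the NLL/perplexity below
            # measure language modeling quality rather than self-play skill.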
            for round in range(numRounds):
                if val_model.txtEncodingMode == "txtuess":
                    val_model.observe_txt(predicted_text)
                val_model.observe(
                    round,
                    ques=gtQuestions[:, round],
                    quesLens=gtQuesLens[:, round])
                val_model.observe(
                    round, ans=answers[:, round], ansLens=ansLens[:, round])
                logProbsCurrent = val_model.forward()

                # Evaluating logProbs for cross entropy
                logProbsAll[round].append(
                    utils.maskedNll(logProbsCurrent,
                                    gtQuestions[:, round].contiguous()))
                perplexityAll[round].append(utils.maskedPerplexity(logProbsCurrent,
                                                                   gtQuestions[:, round].contiguous()))

                if txt_retrieval_mode == "mse":
                    predFeatures = val_model.predicttext()
                    # Evaluating feature regression network

                    # Deal with different txtEncodingMode
                    featLoss = F.mse_loss(predFeatures, gtFeatures)

                    featLossAll[round + 1].append(torch.mean(featLoss))
                    # Keeping predictions
                    roundwiseFeaturePreds[round + 1].append(predFeatures)

                    # Modified by Mingyang Zhou
                    if val_model.txtEncodingMode == "txtuess":
                        act_index = im_ranker.nearest_neighbor(
                            predFeatures.data)
                        predicted_text = all_txt_feat[act_index].squeeze(1)

                    # Compute the winning rate at the current round, modified
                    # by Mingyang Zhou
                    round_dists = pairwise_distances(
                        predFeatures.cpu().numpy(), gtFeatures.cpu().numpy())
                    for i in range(round_dists.shape[0]):
                        current_rank = int(
                            np.where(round_dists[i, :].argsort() == i)[0]) + 1
                        if current_rank <= 1:
                            win_rate[round + 1] += 1

                elif txt_retrieval_mode == "cosine_similarity":
                    dialogEmbedding = val_model.multimodalpredictText()
                    featLoss = pairwiseRanking_criterion(
                        gtFeatures, dialogEmbedding)
                    featLossAll[round + 1].append(torch.sum(featLoss))
                    roundwiseFeaturePreds[round + 1].append(
                        dialogEmbedding)  # keep the dialog embedding for roundwise ranking
                    # Initialize round_dists; each row holds the cosine-similarity
                    # scores of one prediction against the ground-truth pool
                    round_dists = np.matmul(
                        dialogEmbedding.cpu().numpy(), gtFeatures.cpu().numpy().transpose())
                    for i in range(round_dists.shape[0]):
                        current_rank = int(
                            np.where(round_dists[i, :].argsort()[::-1] == i)[0]) + 1
                        if current_rank <= 1:
                            win_rate[round + 1] += 1

            gttxtFeatures.append(gtFeatures)

            end_t = timer()
            delta_t = " Time: %5.2fs" % (end_t - start_t)
            start_t = end_t
            progressString = "\r[val_model] Evaluating split '%s' [%d/%d]\t" + delta_t
            sys.stdout.write(progressString % (split, idx + 1, numBatches))
            sys.stdout.flush()

    sys.stdout.write("\n")
    # Compute the win_rate, modified by Mingyang Zhou
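    # num_games counts one game per evaluated example (incremented only in
    # the round-0 loop), so win_rate[r] is the fraction of examples whose
    # ground-truth text is retrieved at rank 1 after round r.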
    win_rate = [x / num_games for x in win_rate]
    print("The winning rates for {} are: {}".format(split, win_rate))

    gtFeatures = torch.cat(gttxtFeatures, 0).data.cpu().numpy()
    rankMetricsRounds = []
    poolSize = len(dataset)

    # Keep track of feature regression loss and CE log-probs
    logProbsAll = [torch.stack(lprobs, 0).mean() for lprobs in logProbsAll]
    # Compute the Mean Perplexity for each round
    perplexityAll = [torch.cat(perplexity, 0).mean().data.item()
                     for perplexity in perplexityAll]

    featLossAll = [torch.stack(floss, 0).mean() for floss in featLossAll]
    roundwiseLogProbs = torch.stack(logProbsAll, 0).data.cpu().numpy()
    roundwiseFeatLoss = torch.stack(featLossAll, 0).data.cpu().numpy()
    # Compute the mean perplexity over all rounds
    logProbsMean = roundwiseLogProbs.mean()
    featLossMean = roundwiseFeatLoss.mean()
    perplexityMean = sum(perplexityAll) / len(perplexityAll)
    print("The Perplxity of current Questioner is: {}".format(perplexityMean))
    # Added by Mingyang Zhou
    winrateMean = sum(win_rate) / len(win_rate)

    if verbose:
        print("Percentile mean rank (round, mean, low, high)")
    for round in range(numRounds + 1):
        if txt_retrieval_mode == "mse":
            predFeatures = torch.cat(roundwiseFeaturePreds[round],
                                     0).data.cpu().numpy()
            # num_examples x num_examples
            dists = pairwise_distances(predFeatures, gtFeatures)
            ranks = []
            for i in range(dists.shape[0]):
                rank = int(np.where(dists[i, :].argsort() == i)[0]) + 1
                ranks.append(rank)
        elif txt_retrieval_mode == "cosine_similarity":
            predFeatures = torch.cat(roundwiseFeaturePreds[round],
                                     0).data.cpu().numpy()
            dists = np.matmul(predFeatures, gtFeatures.transpose())
            ranks = []
            for i in range(dists.shape[0]):
                rank = int(np.where(dists[i, :].argsort()[::-1] == i)[0]) + 1
                ranks.append(rank)

        ranks = np.array(ranks)
        rankMetrics = metrics.computeMetrics(Variable(torch.from_numpy(ranks)))
        meanRank = ranks.mean()
        se = ranks.std() / np.sqrt(poolSize)
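        # Percentile mean rank: rank 1 out of poolSize maps to ~100%, and the
        # low/high bounds shift the mean by one standard error of the mean.
        # e.g. meanRank = 5 with poolSize = 500 gives 100 * (1 - 5/500) = 99.0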
        meanPercRank = 100 * (1 - (meanRank / poolSize))
        percRankLow = 100 * (1 - ((meanRank + se) / poolSize))
        percRankHigh = 100 * (1 - ((meanRank - se) / poolSize))
        if verbose:
            print('%d\t%f\t%f\t%f' %
                  (round, meanPercRank, percRankLow, percRankHigh))
        rankMetrics['percentile'] = meanPercRank
        rankMetrics['featLoss'] = roundwiseFeatLoss[round]
        if round < len(roundwiseLogProbs):
            rankMetrics['logProbs'] = roundwiseLogProbs[round]
        rankMetricsRounds.append(rankMetrics)

    rankMetricsRounds[-1]['logProbsMean'] = logProbsMean
    rankMetricsRounds[-1]['featLossMean'] = featLossMean
    rankMetricsRounds[-1]['winrateMean'] = winrateMean
    # Added the perplexity in eval metrics
    rankMetricsRounds[-1]['perplexityMean'] = perplexityMean

    dataset.split = original_split
    return rankMetricsRounds[-1], rankMetricsRounds
def DialogEval_2(val_model, target_model, dataset, split, exampleLimit=None, beamSize=1, txt_retrieval_mode='mse'):
    print("text Encoding Mode is: {}".format(val_model.txtEncodingMode))
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit
    numBatches = (numExamples - 1) // batchSize + 1
    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=False,
        num_workers=0,
        collate_fn=dataset.collate_fn)

    gttxtFeatures = []
    roundwiseFeaturePreds = [[] for _ in range(numRounds + 1)]
    # Perplexity is not computed in this self-play evaluation (no
    # teacher-forced log-probs are available)

    start_t = timer()

    # Defined by Mingyang Zhou
    win_rate = [0] * (numRounds + 1)
    num_games = 0

    # Modified by Mingyang Zhou
    all_txt_feat = txtLoader(dataloader, dataset)
    im_ranker = Ranker()

    # Update the Ranker
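    # update_rep presumably re-encodes the full candidate pool with the
    # current model so nearest_neighbor can search against fresh features.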
    val_model.eval()
    val_model.reset()
    if val_model.txtEncodingMode == "txtuess":
        im_ranker.update_rep(val_model, all_txt_feat)

    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break

        if dataset.useGPU:
            batch = {key: v.cuda() for key, v in batch.items()
                     if hasattr(v, 'cuda')}
        else:
            batch = {key: v.contiguous() for key, v in batch.items()
                     if hasattr(v, 'cuda')}

        # volatile=True is deprecated; inference now runs under torch.no_grad()
        with torch.no_grad():
            caption = Variable(batch['cap'])
            captionLens = Variable(batch['cap_len'])
            gtQuestions = Variable(batch['ques'])
            gtQuesLens = Variable(batch['ques_len'])
            answers = Variable(batch['ans'])
            ansLens = Variable(batch['ans_len'])
            if txt_retrieval_mode == "mse":
                if val_model.txtEncodingMode == "txtuess":
                    gtFeatures = val_model.forwardtext(Variable(batch['txt_feat']))
                else:
                    gtFeatures = Variable(batch['txt_feat'])
            else:
                gtFeatures = Variable(batch['txt_feat'])
                gtFeatures = val_model.multimodalpredictIm(gtFeatures)
            text = Variable(batch['txt_feat'])

            target_model.eval()
            target_model.reset()
            target_model.observe(-1, text=text, caption=caption,
                                 captionLens=captionLens)
            val_model.eval()
            val_model.reset()
            val_model.observe(-1, caption=caption, captionLens=captionLens)
            if val_model.new_questioner:
                val_model.observe_txt(text)

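            # Same round-0 bootstrap as above: start from a uniformly random
            # candidate before the ranker produces its first prediction.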
            if val_model.txtEncodingMode == "txtuess":
                # torch.randint's upper bound is exclusive; use size(0) so the
                # last candidate in the pool can also be sampled
                act_index = torch.randint(
                    0, all_txt_feat.size(0), (text.size(0), 1))
                predicted_text = all_txt_feat[act_index].squeeze(1)
                val_model.observe_txt(predicted_text)

            if txt_retrieval_mode == "mse":
                predFeatures = val_model.predicttext()
                roundwiseFeaturePreds[0].append(predFeatures)

                # Modified by Mingyang Zhou for txtEncoding Mode == "txtuess"
                if val_model.txtEncodingMode == "txtuess":
                    act_index = im_ranker.nearest_neighbor(
                        predFeatures.data)
                    predicted_text = all_txt_feat[act_index]
                    # Should observe the current predicted text
                    val_model.observe_txt(predicted_text)

                # Compute the winning rate at round 0, modified by Mingyang
                # Zhou
                round_dists = pairwise_distances(
                    predFeatures.cpu().numpy(), gtFeatures.cpu().numpy())
                for i in range(round_dists.shape[0]):
                    current_rank = int(
                        np.where(round_dists[i, :].argsort() == i)[0]) + 1
                    if current_rank <= 1:
                        win_rate[0] += 1
                    # update the num_games
                    num_games += 1
            elif txt_retrieval_mode == "cosine_similarity":
                dialogEmbedding = val_model.multimodalpredictText()
                roundwiseFeaturePreds[0].append(
                    dialogEmbedding)
                # Initialize round_dists; each row holds the cosine-similarity
                # scores of one prediction against the ground-truth pool
                round_dists = np.matmul(
                    dialogEmbedding.cpu().numpy(), gtFeatures.cpu().numpy().transpose())
                for i in range(round_dists.shape[0]):
                    current_rank = int(
                        np.where(round_dists[i, :].argsort()[::-1] == i)[0]) + 1
                    if current_rank <= 1:
                        win_rate[0] += 1
                    # update the num_games
                    num_games += 1

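            # Self-play rollout: the questioner (val_model) decodes a
            # question, the answerer (target_model) decodes an answer, and
            # both observe each exchange to keep their dialog states in sync.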
            for round in range(numRounds):
                questions, quesLens = val_model.forwardDecode(
                    inference='greedy', beamSize=beamSize)
                val_model.observe(round, ques=questions, quesLens=quesLens)
                target_model.observe(round, ques=questions, quesLens=quesLens)
                answers, ansLens = target_model.forwardDecode(
                    inference='greedy', beamSize=beamSize)
                target_model.observe(round, ans=answers, ansLens=ansLens)
                val_model.observe(round, ans=answers, ansLens=ansLens)
                if val_model.new_questioner:
                    val_model.observe_txt(text)
                if val_model.txtEncodingMode == "txtuess":
                    val_model.observe_txt(predicted_text)

                if txt_retrieval_mode == "mse":
                    predFeatures = val_model.predicttext()
                    roundwiseFeaturePreds[round + 1].append(predFeatures)

                    # Modified by Mingyang Zhou for txtEncoding Mode ==
                    # "txtuess"
                    if val_model.txtEncodingMode == "txtuess":
                        act_index = im_ranker.nearest_neighbor(
                            predFeatures.data)
                        predicted_text = all_txt_feat[act_index]
                    # Compute the winning rate at the current round, modified
                    # by Mingyang Zhou
                    round_dists = pairwise_distances(
                        predFeatures.cpu().numpy(), gtFeatures.cpu().numpy())
                    for i in range(round_dists.shape[0]):
                        current_rank = int(
                            np.where(round_dists[i, :].argsort() == i)[0]) + 1
                        if current_rank <= 1:
                            win_rate[round + 1] += 1
                elif txt_retrieval_mode == "cosine_similarity":
                    dialogEmbedding = val_model.multimodalpredictText()
                    roundwiseFeaturePreds[round + 1].append(
                        dialogEmbedding)  # keep the dialog embedding for roundwise ranking
                    # Initialize round_dists; each row holds the cosine-similarity
                    # scores of one prediction against the ground-truth pool
                    round_dists = np.matmul(
                        dialogEmbedding.cpu().numpy(), gtFeatures.cpu().numpy().transpose())
                    for i in range(round_dists.shape[0]):
                        current_rank = int(
                            np.where(round_dists[i, :].argsort()[::-1] == i)[0]) + 1
                        if current_rank <= 1:
                            win_rate[round + 1] += 1

            gttxtFeatures.append(gtFeatures)

            end_t = timer()
            delta_t = " Rate: %5.2fs" % (end_t - start_t)
            start_t = end_t
            progressString = "\r[val_model] Evaluating split '%s' [%d/%d]\t" + delta_t
            sys.stdout.write(progressString % (split, idx + 1, numBatches))
            sys.stdout.flush()
    sys.stdout.write("\n")
    # Compute the win_rate, modified by Mingyang Zhou
    win_rate = [x / num_games for x in win_rate]
    print("The winning rates for {} are: {}".format(split, win_rate))

    gtFeatures = torch.cat(gttxtFeatures, 0).data.cpu().numpy()
    rankMetricsRounds = []

    winrateMean = sum(win_rate) / len(win_rate)
    print("Percentile mean rank (round, mean, low, high)")
    for round in range(numRounds + 1):
        if txt_retrieval_mode == "mse":
            predFeatures = torch.cat(roundwiseFeaturePreds[round],
                                     0).data.cpu().numpy()
            dists = pairwise_distances(predFeatures, gtFeatures)
            # num_examples x num_examples
            ranks = []
            for i in range(dists.shape[0]):
                # Computing rank of i-th prediction vs all texts in split
                rank = int(np.where(dists[i, :].argsort() == i)[0]) + 1
                ranks.append(rank)
        elif txt_retrieval_mode == "cosine_similarity":
            predFeatures = torch.cat(roundwiseFeaturePreds[round],
                                     0).data.cpu().numpy()
            dists = np.matmul(predFeatures, gtFeatures.transpose())
            ranks = []
            for i in range(dists.shape[0]):
                rank = int(np.where(dists[i, :].argsort()[::-1] == i)[0]) + 1
                ranks.append(rank)

        ranks = np.array(ranks)
        rankMetrics = metrics.computeMetrics(Variable(torch.from_numpy(ranks)))
        # Pool size equals the number of evaluated examples; it matches
        # len(dataset) only when exampleLimit is None
        poolSize = len(ranks)
        meanRank = ranks.mean()
        se = ranks.std() / np.sqrt(poolSize)
        meanPercRank = 100 * (1 - (meanRank / poolSize))
        percRankLow = 100 * (1 - ((meanRank + se) / poolSize))
        percRankHigh = 100 * (1 - ((meanRank - se) / poolSize))
        print('%d\t%f\t%f\t%f' %
              (round, meanPercRank, percRankLow, percRankHigh))
        rankMetrics['percentile'] = meanPercRank
        rankMetricsRounds.append(rankMetrics)

    dataset.split = original_split
    rankMetricsRounds[-1]['winrateMean'] = winrateMean
    return rankMetricsRounds[-1], rankMetricsRounds
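
# A minimal usage sketch (not part of the original source). It assumes
# `questioner` and `answerer` are trained agents exposing the interface used
# above, and that `dataset` provides a 'val' split:
#
#     meanMetrics, roundMetrics = DialogEval_2(
#         questioner, answerer, dataset, 'val',
#         beamSize=5, txt_retrieval_mode='mse')
#     print(meanMetrics['winrateMean'], meanMetrics['percentile'])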