def rankQuestioner(self):
    """Compute the percentile mean rank (PMR) of the questioner per round.

    Runs a full dialog for every batch in self.dataloader, then computes
    round-wise image-retrieval rank metrics either from precomputed ranks
    (self.ranks, when not using the SL guesser) or from predicted image
    features vs. ground-truth features. Optionally writes a CSV log.

    Returns:
        (rankMetricsRounds[-1], rankMetricsRounds): metrics of the final
        round, and the list of per-round metric dicts.

    Raises:
        RuntimeError: if selectedBatchIdxs is set (incompatible with PMR).
    """
    if self.selectedBatchIdxs is not None:
        raise RuntimeError(
            "Cannot use selectedBatchIdxs when evaluating PMR")
    for batch_idx, batch in enumerate(self.dataloader):
        # Either evaluate a window [expLowerLimit, expUpperLimit) of
        # batches, or the first numBatches batches.
        if self.expLowerLimit is not None:
            if batch_idx < self.expLowerLimit:
                continue
            if batch_idx >= self.expUpperLimit:
                break
        else:
            if batch_idx >= self.numBatches:
                break
        self.runDialog(batch_idx, batch, printSummary=True)

    rankMetricsRounds = []
    print("Percentile mean rank (round, mean, low, high)")

    csv_file = None
    writer = None
    if self.saveLogs:
        # NOTE(review): "PMR__%s_%s.csv" is never %-formatted, so the file
        # is literally named with the placeholders. Presumably split/model
        # identifiers were intended — TODO confirm; kept as-is so existing
        # log paths do not change.
        csv_file = open(
            os.path.join(self.logsDir, "PMR__%s_%s.csv"), mode="w")
        writer = csv.DictWriter(
            csv_file,
            ["round", "meanPercRank", "percRankLow", "percRankHigh"])
        writer.writeheader()

    try:
        if not self.slGuesser:
            # Ranks were already computed during the dialog roll-out.
            for round in range(self.numRounds + 1):
                # self.ranks[round]: precomputed rank per example.
                rank = self.ranks[round]
                rankMetrics = metrics.computeMetrics(
                    Variable(torch.from_numpy(rank)))
                poolSize = len(self.dataset)
                meanRank = rank.mean()
                # Standard error of the mean rank over the pool.
                se = rank.std() / np.sqrt(poolSize)
                meanPercRank = 100 * (1 - (meanRank / poolSize))
                percRankLow = 100 * (1 - ((meanRank + se) / poolSize))
                percRankHigh = 100 * (1 - ((meanRank - se) / poolSize))
                print('%d\t%f\t%f\t%f' %
                      (round, meanPercRank, percRankLow, percRankHigh))
                rankMetrics['percentile'] = meanPercRank
                rankMetricsRounds.append(rankMetrics)
                if self.saveLogs:
                    writer.writerow({
                        "round": round,
                        "meanPercRank": meanPercRank,
                        "percRankLow": percRankLow,
                        "percRankHigh": percRankHigh
                    })
        else:
            # Rank ground-truth image among all pool images by distance
            # to the predicted feature at each round.
            gtFeatures = self.gtImgFeatures.data.cpu().numpy()
            for round in range(self.numRounds + 1):
                predFeatures = torch.cat(
                    self.roundwiseFeaturePreds[round],
                    0).data.cpu().numpy()
                # num_examples x num_pool_images distance matrix.
                dists = pairwise_distances(predFeatures, gtFeatures)
                ranks = []
                for i in range(dists.shape[0]):
                    # Rank of the i-th ground-truth image for the i-th
                    # prediction; offset by expLowerLimit when evaluating
                    # a window of the dataset.
                    if self.expLowerLimit is None:
                        rank = int(
                            np.where(dists[i, :].argsort() == i)[0]) + 1
                    else:
                        rank = int(
                            np.where(dists[i, :].argsort() ==
                                     i + self.expLowerLimit)[0]) + 1
                    ranks.append(rank)
                ranks = np.array(ranks)
                rankMetrics = metrics.computeMetrics(
                    Variable(torch.from_numpy(ranks)))
                poolSize = len(self.dataset)
                meanRank = ranks.mean()
                se = ranks.std() / np.sqrt(poolSize)
                meanPercRank = 100 * (1 - (meanRank / poolSize))
                percRankLow = 100 * (1 - ((meanRank + se) / poolSize))
                percRankHigh = 100 * (1 - ((meanRank - se) / poolSize))
                print('%d\t%f\t%f\t%f' %
                      (round, meanPercRank, percRankLow, percRankHigh))
                rankMetrics['percentile'] = meanPercRank
                rankMetricsRounds.append(rankMetrics)
                if self.saveLogs:
                    writer.writerow({
                        "round": round,
                        "meanPercRank": meanPercRank,
                        "percRankLow": percRankLow,
                        "percRankHigh": percRankHigh
                    })
    finally:
        # Fix: the CSV handle previously leaked; close it deterministically.
        if csv_file is not None:
            csv_file.close()

    self.dataset.split = self.original_split
    return rankMetricsRounds[-1], rankMetricsRounds
def rankABot(aBot, dataset, split, scoringFunction, exampleLimit=None):
    '''
    Evaluate A-Bot performance on ranking answer options when it is
    shown ground truth image features (VGG + 36 region features),
    captions and questions.

    Arguments:
        aBot    : A-Bot
        dataset : VisDialDataset instance
        split   : Dataset split, can be 'val' or 'test'

        scoringFunction : A function which computes negative log
                          likelihood of a sequence (answer) given log
                          probabilities under an RNN model. Currently
                          utils.maskedNll is the only such function used.
        exampleLimit    : Maximum number of data points to use from the
                          dataset split. If None, all data points.

    Returns:
        rankMetrics dict from metrics.computeMetrics, with an added
        'logProbsMean' entry (mean answer NLL across rounds).
    '''
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit
    # Ceiling division: number of batches covering numExamples.
    numBatches = (numExamples - 1) // batchSize + 1
    # Temporarily switch the dataset split; restored before returning.
    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(dataset,
                            batch_size=batchSize,
                            shuffle=True,
                            num_workers=1,
                            collate_fn=dataset.collate_fn)
    totalLoss, totalTokens = 0, 0
    ranks = []
    # One list of per-batch answer NLLs per dialog round.
    logProbsAll = [[] for _ in range(numRounds)]
    start_t = timer()
    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break
        # Move tensors to GPU (or make contiguous on CPU); non-tensor
        # entries are kept unchanged.
        if dataset.useGPU:
            batch = {
                key: v.cuda() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }
        else:
            batch = {
                key: v.contiguous() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }
        # volatile=True: inference-only graph (legacy pre-0.4 PyTorch API).
        image = Variable(batch['img_feat_vgg'], volatile=True)
        image_36 = Variable(batch['img_feat_36'], volatile=True)
        caption = Variable(batch['cap'], volatile=True)
        captionLens = Variable(batch['cap_len'], volatile=True)
        questions = Variable(batch['ques'], volatile=True)
        quesLens = Variable(batch['ques_len'], volatile=True)
        answers = Variable(batch['ans'], volatile=True)
        ansLens = Variable(batch['ans_len'], volatile=True)
        options = Variable(batch['opt'], volatile=True)
        optionLens = Variable(batch['opt_len'], volatile=True)
        correctOptionInds = Variable(batch['ans_id'], volatile=True)
        aBot.reset()
        # Round -1 observation: both VGG and 36-region image features.
        aBot.observe(-1,
                     image=[image, image_36],
                     caption=caption,
                     captionLens=captionLens)
        for round in range(numRounds):
            aBot.observe(round,
                         ques=questions[:, round],
                         quesLens=quesLens[:, round],
                         ans=answers[:, round],
                         ansLens=ansLens[:, round])
            # Score every candidate option; per original author's note the
            # shape is e.g. (batch, 100 options) — not verifiable here.
            logProbs = aBot.evalOptions(options[:, round],
                                        optionLens[:, round],
                                        scoringFunction)
            # Log-probabilities over the vocabulary for the current answer.
            logProbsCurrent = aBot.forward()
            # NLL of the ground-truth answer for this round (scalar).
            logProbsAll[round].append(
                scoringFunction(logProbsCurrent,
                                answers[:, round].contiguous()))
            # Rank of the ground-truth option among all candidates.
            batchRanks = rankOptions(options[:, round],
                                     correctOptionInds[:, round], logProbs)
            ranks.append(batchRanks)
        end_t = timer()
        delta_t = " Rate: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Abot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()
    sys.stdout.write("\n")
    # Drop the loader reference so worker subprocesses can exit.
    # NOTE(review): despite the message below, no actual sleep happens here.
    dataloader = None
    print("Sleeping for 3 seconds to let dataloader subprocesses exit...")
    # Flatten per-batch rank tensors into one tensor over all examples/rounds.
    ranks = torch.cat(ranks, 0)
    rankMetrics = metrics.computeMetrics(ranks.cpu())
    # Per-round mean NLL, then the overall mean across rounds.
    logProbsAll = [torch.cat(lprobs, 0).mean() for lprobs in logProbsAll]
    roundwiseLogProbs = torch.cat(logProbsAll, 0).data.cpu().numpy()
    logProbsMean = roundwiseLogProbs.mean()
    rankMetrics['logProbsMean'] = logProbsMean
    dataset.split = original_split
    return rankMetrics
def rankABot(aBot, dataset, split, scoringFunction, exampleLimit=None,
             useNDCG=False):
    '''
    Evaluate A-Bot performance on ranking answer options when it is
    shown ground truth image features, captions and questions.

    Arguments:
        aBot    : A-Bot
        dataset : VisDialDataset instance
        split   : Dataset split, can be 'val' or 'test'

        scoringFunction : A function which computes negative log
                          likelihood of a sequence (answer) given log
                          probabilities under an RNN model. Currently
                          utils.maskedNll is the only such function used.
        exampleLimit    : Maximum number of data points to use from the
                          dataset split. If None, all data points.
        useNDCG         : If True, additionally compute NDCG on 'val'
                          (requires gt_relevance / round_id in the batch)
                          and collect per-round ranks for the JSON dump.

    Returns:
        rankMetrics dict (with 'logProbsMean' and optionally NDCG
        entries), or None for split == 'test' (writes predictions.txt
        in EvalAI format instead).
    '''
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit
    # Ceiling division: number of batches covering numExamples.
    numBatches = (numExamples - 1) // batchSize + 1
    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=True,
        num_workers=1,
        collate_fn=dataset.collate_fn)
    # sparse_metrics = SparseGTMetrics()
    ndcg = None
    if useNDCG:
        ndcg = NDCG()
    ranks_json = []
    totalLoss, totalTokens = 0, 0
    ranks = []
    logProbsAll = [[] for _ in range(numRounds)]
    start_t = timer()
    # Map a dataset index to its image filename, then to the numeric
    # COCO-style image id (last 12 digits before the extension).
    getImgFileName = lambda x: dataset.data['%s_img_fnames' % split][x]
    getImgId = lambda x: int(getImgFileName(x)[:-4][-12:])
    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break
        if dataset.useGPU:
            batch = {
                key: v.cuda() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }
        else:
            batch = {
                key: v.contiguous() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }
        # volatile=True: inference-only (legacy pre-0.4 PyTorch API).
        image = Variable(batch['img_feat'], volatile=True)
        caption = Variable(batch['cap'], volatile=True)
        captionLens = Variable(batch['cap_len'], volatile=True)
        questions = Variable(batch['ques'], volatile=True)
        quesLens = Variable(batch['ques_len'], volatile=True)
        answers = Variable(batch['ans'], volatile=True)
        ansLens = Variable(batch['ans_len'], volatile=True)
        options = Variable(batch['opt'], volatile=True)
        optionLens = Variable(batch['opt_len'], volatile=True)
        gtRelevance = None
        round_id = None
        img_ids = None
        correctOptionInds = None
        # Ground-truth answer ids exist only outside the test split.
        if split != 'test':
            correctOptionInds = Variable(batch['ans_id'], volatile=True)
        if split == 'val' and useNDCG:
            # Dense relevance annotations and the annotated round.
            gtRelevance = Variable(batch['gt_relevance'], volatile=True)
            round_id = Variable(batch['round_id'], volatile=True)
            img_ids = Variable(batch['image_id'], volatile=True)
        if split == 'test':
            img_ids = [getImgId(x) for x in batch['index']]
        # NOTE(review): img_ids stays None when useNDCG is set on a split
        # other than 'val'/'test'; the len(img_ids) loop below would then
        # fail — presumably those combinations are never used. TODO confirm.
        aBot.reset()
        aBot.observe(-1, image=image, caption=caption,
                     captionLens=captionLens)
        log_probs_rounds = []
        for round in range(numRounds):
            aBot.observe(
                round,
                ques=questions[:, round],
                quesLens=quesLens[:, round],
                ans=answers[:, round],
                ansLens=ansLens[:, round])
            logProbs = aBot.evalOptions(options[:, round],
                                        optionLens[:, round],
                                        scoringFunction)
            if useNDCG:
                # Keep per-round option scores for rank/NDCG computation.
                log_probs_rounds.append(logProbs.unsqueeze(1))
            logProbsCurrent = aBot.forward()
            logProbsAll[round].append(
                scoringFunction(logProbsCurrent,
                                answers[:, round].contiguous()))
            if split != 'test':
                batchRanks = rankOptions(options[:, round],
                                         correctOptionInds[:, round],
                                         logProbs)
                ranks.append(batchRanks)
        batch['num_rounds'] = batch['num_rounds'].squeeze(1)
        output = None
        if useNDCG or split == 'test':
            # (batch, rounds, options) option scores for this batch.
            output = torch.cat(log_probs_rounds, dim=1)
            ranks_cur = scores_to_ranks(output)
            for i in range(len(img_ids)):
                # Cast into types explicitly to ensure no errors in schema;
                # round ids are 1-10, not 0-9.
                if split == "test":
                    # Only the last available round is submitted for test.
                    ranks_json.append({
                        "image_id": img_ids[i],
                        "round_id": int(batch["num_rounds"][i]),
                        "ranks": ranks_cur[i][batch["num_rounds"][i] -
                                              1].data.cpu().tolist()
                    })
                else:
                    for j in range(numRounds):
                        ranks_json.append({
                            "image_id": img_ids[i].data[0],
                            "round_id": int(j + 1),
                            "ranks":
                            [rank.data[0] for rank in ranks_cur[i][j]]
                        })
        if split == "val":
            # sparse_metrics.observe(output, correctOptionInds)
            if "gt_relevance" in batch and useNDCG:
                # Select, per example, the option scores at the annotated
                # round (round_id is 1-based, hence the -1).
                indices = torch.arange(output.shape[0]).long().cpu().numpy()
                round_id_numpy = round_id.long().cpu().data.numpy()
                round_id_numpy = round_id_numpy.reshape(-1)
                output = output.cpu().data.numpy()
                output = output[indices, round_id_numpy - 1, :]
                output = Variable(torch.from_numpy(output), volatile=True)
                ndcg.observe(output, gtRelevance)
        end_t = timer()
        delta_t = " Rate: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Abot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()
    sys.stdout.write("\n")
    # Drop the loader reference so worker subprocesses can exit.
    # NOTE(review): despite the message below, no actual sleep happens here.
    dataloader = None
    print("Sleeping for 3 seconds to let dataloader subprocesses exit...")
    dataset.split = original_split
    if split == 'test':
        # Dump EvalAI submission file and stop: no metrics for test.
        # NOTE(review): the file handle from open() is never closed here.
        dir_out = 'predictions.txt'
        json.dump(ranks_json, open(dir_out, "w"))
        return
    ranks = torch.cat(ranks, 0)
    rankMetrics = metrics.computeMetrics(ranks.cpu())
    logProbsAll = [torch.cat(lprobs, 0).mean() for lprobs in logProbsAll]
    roundwiseLogProbs = torch.cat(logProbsAll, 0).data.cpu().numpy()
    logProbsMean = roundwiseLogProbs.mean()
    rankMetrics['logProbsMean'] = logProbsMean
    if split == "val" and useNDCG:
        rankMetrics.update(ndcg.retrieve(reset=True))
    for metric_name, metric_value in rankMetrics.items():
        print(f"{metric_name}: {metric_value}")
    return rankMetrics
def rankABot(aBot, dataset, split, scoringFunction, exampleLimit=None):
    """
    Evaluate A-Bot performance on ranking answer options when it is
    shown ground truth image features, captions and questions.

    Arguments:
        aBot : A-Bot
        dataset : VisDialDataset instance
        split : Dataset split, can be 'val' or 'test'
        scoringFunction : A function which computes negative log
                          likelihood of a sequence (answer) given log
                          probabilities under an RNN model. Currently
                          utils.maskedNll is the only such function used.
        exampleLimit : Maximum number of data points to use from the
                       dataset split. If None, all data points.

    Returns:
        rankMetrics dict from metrics.computeMetrics, with an added
        'logProbsMean' entry (mean answer NLL across rounds).
    """
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit
    # Ceiling division: number of batches covering numExamples.
    numBatches = (numExamples - 1) // batchSize + 1
    # Temporarily switch the dataset split; restored before returning.
    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=True,
        num_workers=1,
        collate_fn=dataset.collate_fn,
    )
    # Fix: removed unused locals totalLoss / totalTokens.
    ranks = []
    # One list of per-batch answer NLLs per dialog round.
    logProbsAll = [[] for _ in range(numRounds)]
    start_t = timer()
    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break
        # Move tensors to GPU (or make contiguous on CPU); non-tensor
        # entries pass through unchanged.
        if dataset.useGPU:
            batch = {
                key: v.cuda() if hasattr(v, "cuda") else v
                for key, v in batch.items()
            }
        else:
            batch = {
                key: v.contiguous() if hasattr(v, "cuda") else v
                for key, v in batch.items()
            }
        image = batch["img_feat"]
        caption = batch["cap"]
        captionLens = batch["cap_len"]
        questions = batch["ques"]
        quesLens = batch["ques_len"]
        answers = batch["ans"]
        ansLens = batch["ans_len"]
        options = batch["opt"]
        optionLens = batch["opt_len"]
        correctOptionInds = batch["ans_id"]
        aBot.reset()
        aBot.observe(-1, image=image, caption=caption, captionLens=captionLens)
        # Evaluation only — no gradients needed for any round.
        with torch.no_grad():
            for round in range(numRounds):
                aBot.observe(
                    round,
                    ques=questions[:, round],
                    quesLens=quesLens[:, round],
                    ans=answers[:, round],
                    ansLens=ansLens[:, round],
                )
                # Score every candidate option for this round.
                logProbs = aBot.evalOptions(options[:, round],
                                            optionLens[:, round],
                                            scoringFunction)
                # Log-probabilities for the ground-truth answer sequence.
                logProbsCurrent = aBot.forward()
                logProbsAll[round].append(
                    scoringFunction(logProbsCurrent,
                                    answers[:, round].contiguous()))
                # Rank of the ground-truth option among the candidates.
                batchRanks = rankOptions(options[:, round],
                                         correctOptionInds[:, round],
                                         logProbs)
                ranks.append(batchRanks)
        end_t = timer()
        delta_t = " Rate: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Abot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()
    sys.stdout.write("\n")
    # Drop the loader reference so worker subprocesses can exit.
    # NOTE(review): despite the message below, no actual sleep happens here.
    dataloader = None
    print("Sleeping for 3 seconds to let dataloader subprocesses exit...")
    # Flatten per-batch rank tensors across all rounds and batches.
    ranks = torch.cat(ranks, 0)
    rankMetrics = metrics.computeMetrics(ranks.cpu())
    # Per-round mean NLL, then overall mean across rounds.
    logProbsAll = [torch.stack(lprobs).mean() for lprobs in logProbsAll]
    roundwiseLogProbs = torch.stack(logProbsAll).data.cpu().numpy()
    logProbsMean = roundwiseLogProbs.mean()
    rankMetrics["logProbsMean"] = logProbsMean
    dataset.split = original_split
    return rankMetrics
def rankQABots(qBot, aBot, dataset, split, exampleLimit=None, beamSize=1):
    """
    Evaluates Q-Bot and A-Bot performance on image retrieval where
    both agents must converse with each other without any ground
    truth dialog. The common caption shown to both agents is not the
    ground truth caption, but is instead a caption generated
    (pre-computed) by a pre-trained captioning model (neuraltalk2).

    Arguments:
        qBot : Q-Bot
        aBot : A-Bot
        dataset : VisDialDataset instance
        split : Dataset split, can be 'val' or 'test'
        exampleLimit : Maximum number of data points to use from the
                       dataset split. If None, all data points.
        beamSize : Beam search width for generating utterances

    Returns:
        (rankMetricsRounds[-1], rankMetricsRounds): final-round metrics
        and the list of per-round metric dicts.
    """
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit
    # Ceiling division: number of batches covering numExamples.
    numBatches = (numExamples - 1) // batchSize + 1
    original_split = dataset.split
    dataset.split = split
    # shuffle=False so prediction i corresponds to pool image i below.
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=False,
        num_workers=0,
        collate_fn=dataset.collate_fn,
    )
    gtImgFeatures = []
    # Index 0 holds the caption-only (round 0) prediction; index r+1
    # holds the prediction after dialog round r.
    roundwiseFeaturePreds = [[] for _ in range(numRounds + 1)]
    start_t = timer()
    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break
        # NOTE: unlike other rankers here, this comprehension FILTERS OUT
        # non-tensor batch entries entirely.
        if dataset.useGPU:
            batch = {
                key: v.cuda()
                for key, v in batch.items() if hasattr(v, "cuda")
            }
        else:
            batch = {
                key: v.contiguous()
                for key, v in batch.items() if hasattr(v, "cuda")
            }
        # volatile=True: inference-only (legacy pre-0.4 PyTorch API).
        caption = Variable(batch["cap"], volatile=True)
        captionLens = Variable(batch["cap_len"], volatile=True)
        gtQuestions = Variable(batch["ques"], volatile=True)
        gtQuesLens = Variable(batch["ques_len"], volatile=True)
        answers = Variable(batch["ans"], volatile=True)
        ansLens = Variable(batch["ans_len"], volatile=True)
        gtFeatures = Variable(batch["img_feat"], volatile=True)
        image = Variable(batch["img_feat"], volatile=True)
        # A-Bot sees the image; Q-Bot must guess it from dialog alone.
        aBot.eval(), aBot.reset()
        aBot.observe(-1, image=image, caption=caption, captionLens=captionLens)
        qBot.eval(), qBot.reset()
        qBot.observe(-1, caption=caption, captionLens=captionLens)
        # Round-0 guess from the caption alone.
        predFeatures = qBot.predictImage()
        roundwiseFeaturePreds[0].append(predFeatures)
        for round in range(numRounds):
            # Q-Bot asks; both bots observe the question, A-Bot answers;
            # both observe the answer; Q-Bot re-guesses the image.
            questions, quesLens = qBot.forwardDecode(inference="greedy",
                                                     beamSize=beamSize)
            qBot.observe(round, ques=questions, quesLens=quesLens)
            aBot.observe(round, ques=questions, quesLens=quesLens)
            answers, ansLens = aBot.forwardDecode(inference="greedy",
                                                  beamSize=beamSize)
            aBot.observe(round, ans=answers, ansLens=ansLens)
            qBot.observe(round, ans=answers, ansLens=ansLens)
            predFeatures = qBot.predictImage()
            roundwiseFeaturePreds[round + 1].append(predFeatures)
        gtImgFeatures.append(gtFeatures)
        end_t = timer()
        delta_t = " Rate: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Qbot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()
    sys.stdout.write("\n")
    gtFeatures = torch.cat(gtImgFeatures, 0).data.cpu().numpy()
    rankMetricsRounds = []
    print("Percentile mean rank (round, mean, low, high)")
    for round in range(numRounds + 1):
        predFeatures = torch.cat(roundwiseFeaturePreds[round],
                                 0).data.cpu().numpy()
        # num_examples x num_examples distance matrix.
        dists = pairwise_distances(predFeatures, gtFeatures)
        ranks = []
        for i in range(dists.shape[0]):
            # Rank of the i-th ground-truth image for the i-th prediction
            # (valid because the loader is unshuffled).
            rank = int(np.where(dists[i, :].argsort() == i)[0]) + 1
            ranks.append(rank)
        ranks = np.array(ranks)
        rankMetrics = metrics.computeMetrics(Variable(torch.from_numpy(ranks)))
        assert len(ranks) == len(dataset)
        poolSize = len(dataset)
        meanRank = ranks.mean()
        # Standard error of the mean rank over the pool.
        se = ranks.std() / np.sqrt(poolSize)
        meanPercRank = 100 * (1 - (meanRank / poolSize))
        percRankLow = 100 * (1 - ((meanRank + se) / poolSize))
        percRankHigh = 100 * (1 - ((meanRank - se) / poolSize))
        print((round, meanPercRank, percRankLow, percRankHigh))
        rankMetrics["percentile"] = meanPercRank
        rankMetricsRounds.append(rankMetrics)
    dataset.split = original_split
    return rankMetricsRounds[-1], rankMetricsRounds
def rankQBot(qBot, dataset, split, exampleLimit=None, verbose=0):
    """
    Evaluates Q-Bot performance on image retrieval when it is shown
    ground truth captions, questions and answers. Q-Bot does not
    generate dialog in this setting - it only encodes ground truth
    captions and dialog in order to perform image retrieval by
    predicting FC-7 image features after each round of dialog.

    Arguments:
        qBot : Q-Bot
        dataset : VisDialDataset instance
        split : Dataset split, can be 'val' or 'test'
        exampleLimit : Maximum number of data points to use from the
                       dataset split. If None, all data points.
        verbose : If truthy, print per-round percentile mean ranks.

    Returns:
        (rankMetricsRounds[-1], rankMetricsRounds): final-round metrics
        and the list of per-round metric dicts.
    """
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit
    # Ceiling division: number of batches covering numExamples.
    numBatches = (numExamples - 1) // batchSize + 1
    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=True,
        num_workers=0,
        collate_fn=dataset.collate_fn,
    )
    # Accumulators: ground-truth features, per-round feature predictions
    # (index 0 = caption-only guess), question NLLs and feature losses.
    gtImgFeatures = []
    roundwiseFeaturePreds = [[] for _ in range(numRounds + 1)]
    logProbsAll = [[] for _ in range(numRounds)]
    featLossAll = [[] for _ in range(numRounds + 1)]
    start_t = timer()
    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break
        # NOTE: this comprehension FILTERS OUT non-tensor batch entries.
        if dataset.useGPU:
            batch = {
                key: v.cuda() for key, v in batch.items() if hasattr(v, "cuda")
            }
        else:
            batch = {
                key: v.contiguous()
                for key, v in batch.items() if hasattr(v, "cuda")
            }
        caption = batch["cap"]
        captionLens = batch["cap_len"]
        gtQuestions = batch["ques"]
        gtQuesLens = batch["ques_len"]
        answers = batch["ans"]
        ansLens = batch["ans_len"]
        gtFeatures = batch["img_feat"]
        # Fix: the round-0 prediction and feature loss previously ran
        # OUTSIDE torch.no_grad() while the rounds loop ran inside it,
        # needlessly building an autograd graph during evaluation. The
        # whole per-batch forward is now gradient-free; values unchanged.
        with torch.no_grad():
            qBot.reset()
            qBot.observe(-1, caption=caption, captionLens=captionLens)
            predFeatures = qBot.predictImage()
            # Evaluating round 0 feature regression network
            featLoss = F.mse_loss(predFeatures, gtFeatures)
            featLossAll[0].append(torch.mean(featLoss))
            # Keeping round 0 predictions
            roundwiseFeaturePreds[0].append(predFeatures)
            for round in range(numRounds):
                qBot.observe(round,
                             ques=gtQuestions[:, round],
                             quesLens=gtQuesLens[:, round])
                qBot.observe(round,
                             ans=answers[:, round],
                             ansLens=ansLens[:, round])
                logProbsCurrent = qBot.forward()
                # Evaluating logProbs for cross entropy
                logProbsAll[round].append(
                    utils.maskedNll(logProbsCurrent,
                                    gtQuestions[:, round].contiguous()))
                predFeatures = qBot.predictImage()
                # Evaluating feature regression network
                featLoss = F.mse_loss(predFeatures, gtFeatures)
                featLossAll[round + 1].append(torch.mean(featLoss))
                # Keeping predictions
                roundwiseFeaturePreds[round + 1].append(predFeatures)
        gtImgFeatures.append(gtFeatures)
        end_t = timer()
        delta_t = " Time: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Qbot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()
    sys.stdout.write("\n")
    gtFeatures = torch.cat(gtImgFeatures, 0).data.cpu().numpy()
    rankMetricsRounds = []
    poolSize = len(dataset)
    # Keeping track of feature regression loss and CE logprobs:
    # reduce per-batch values to one mean per round, then overall means.
    logProbsAll = [torch.stack(lprobs).mean() for lprobs in logProbsAll]
    featLossAll = [torch.stack(floss).mean() for floss in featLossAll]
    roundwiseLogProbs = torch.stack(logProbsAll).data.cpu().numpy()
    roundwiseFeatLoss = torch.stack(featLossAll).data.cpu().numpy()
    logProbsMean = roundwiseLogProbs.mean()
    featLossMean = roundwiseFeatLoss.mean()
    if verbose:
        print("Percentile mean rank (round, mean, low, high)")
    for round in range(numRounds + 1):
        predFeatures = torch.cat(roundwiseFeaturePreds[round],
                                 0).data.cpu().numpy()
        # num_examples x num_examples distance matrix.
        dists = pairwise_distances(predFeatures, gtFeatures)
        ranks = []
        for i in range(dists.shape[0]):
            # Rank of the i-th ground-truth image for the i-th prediction.
            rank = int(np.where(dists[i, :].argsort() == i)[0]) + 1
            ranks.append(rank)
        ranks = np.array(ranks)
        rankMetrics = metrics.computeMetrics(Variable(torch.from_numpy(ranks)))
        meanRank = ranks.mean()
        se = ranks.std() / np.sqrt(poolSize)
        meanPercRank = 100 * (1 - (meanRank / poolSize))
        percRankLow = 100 * (1 - ((meanRank + se) / poolSize))
        percRankHigh = 100 * (1 - ((meanRank - se) / poolSize))
        if verbose:
            print((round, meanPercRank, percRankLow, percRankHigh))
        rankMetrics["percentile"] = meanPercRank
        rankMetrics["featLoss"] = roundwiseFeatLoss[round]
        # Round 0 has no question, hence one fewer logProbs entry.
        if round < len(roundwiseLogProbs):
            rankMetrics["logProbs"] = roundwiseLogProbs[round]
        rankMetricsRounds.append(rankMetrics)
    rankMetricsRounds[-1]["logProbsMean"] = logProbsMean
    rankMetricsRounds[-1]["featLossMean"] = featLossMean
    dataset.split = original_split
    return rankMetricsRounds[-1], rankMetricsRounds
def rankQBot(qBot, dataset, split, exampleLimit=None, verbose=0):
    '''
    Evaluates Q-Bot performance on image retrieval when it is shown
    ground truth captions, questions and answers. Q-Bot does not
    generate dialog in this setting - it only encodes ground truth
    captions and dialog in order to perform image retrieval by
    predicting FC-7 image features after each round of dialog.

    Arguments:
        qBot : Q-Bot
        dataset : VisDialDataset instance
        split : Dataset split, can be 'val' or 'test'
        exampleLimit : Maximum number of data points to use from the
                       dataset split. If None, all data points.
        verbose : If truthy, print per-round percentile mean ranks.

    Returns:
        (rankMetricsRounds[-1], rankMetricsRounds): final-round metrics
        and the list of per-round metric dicts.
    '''
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    # Temporarily switch split (e.g. train -> val); restored at the end.
    original_split = dataset.split
    dataset.split = split
    if exampleLimit != None:
        numExamples = exampleLimit
    elif dataset.split == 'val':
        # NOTE(review): drops the last 3 val examples — presumably to make
        # the count divide evenly or skip bad entries; TODO confirm intent.
        numExamples = dataset.numDataPoints[split] - 3
    else:
        numExamples = dataset.numDataPoints[split]
    # Ceiling division: number of batches covering numExamples.
    numBatches = (numExamples - 1) // batchSize + 1
    dataloader = DataLoader(dataset,
                            batch_size=batchSize,
                            shuffle=True,
                            num_workers=1,
                            collate_fn=dataset.collate_fn)
    # Enumerate all gt features and all predicted features.
    gtImgFeatures = []
    # Caption + dialog rounds: index 0 is the initial caption-only guess,
    # index r+1 is the guess after dialog round r.
    roundwiseFeaturePreds = [[] for _ in range(numRounds + 1)]
    logProbsAll = [[] for _ in range(numRounds)]
    featLossAll = [[] for _ in range(numRounds + 1)]
    start_t = timer()
    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break
        # NOTE: this comprehension FILTERS OUT non-tensor batch entries.
        if dataset.useGPU:
            batch = {
                key: v.cuda()
                for key, v in batch.items() if hasattr(v, 'cuda')
            }
        else:
            batch = {
                key: v.contiguous()
                for key, v in batch.items() if hasattr(v, 'cuda')
            }
        # volatile=True: inference-only (legacy pre-0.4 PyTorch API).
        caption = Variable(batch['cap'], volatile=True)
        captionLens = Variable(batch['cap_len'], volatile=True)
        gtQuestions = Variable(batch['ques'], volatile=True)
        gtQuesLens = Variable(batch['ques_len'], volatile=True)
        answers = Variable(batch['ans'], volatile=True)
        ansLens = Variable(batch['ans_len'], volatile=True)
        gtFeatures = Variable(batch['img_feat'], volatile=True)
        qBot.reset()  # evaluation only — no parameter updates
        qBot.observe(-1, caption=caption, captionLens=captionLens)
        # Round-0 image-feature guess from the caption alone.
        predFeatures = qBot.predictImage()
        featLoss = F.mse_loss(predFeatures, gtFeatures)  # batch loss
        featLossAll[0].append(torch.mean(featLoss))
        roundwiseFeaturePreds[0].append(
            predFeatures)  # batch predicted features
        for round in range(numRounds):
            # Feed the ground-truth question and answer for this round,
            # then score the question likelihood and re-guess the image.
            qBot.observe(round,
                         ques=gtQuestions[:, round],
                         quesLens=gtQuesLens[:, round])
            qBot.observe(round,
                         ans=answers[:, round],
                         ansLens=ansLens[:, round])
            logProbsCurrent = qBot.forward()
            logProbsAll[round].append(
                utils.maskedNll(logProbsCurrent,
                                gtQuestions[:, round].contiguous()))
            predFeatures = qBot.predictImage()
            featLoss = F.mse_loss(predFeatures, gtFeatures)
            featLossAll[round + 1].append(torch.mean(featLoss))
            roundwiseFeaturePreds[round + 1].append(predFeatures)
        gtImgFeatures.append(gtFeatures)
        end_t = timer()
        delta_t = " Time: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Qbot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()
    sys.stdout.write("\n")
    gtFeatures = torch.cat(gtImgFeatures, 0).data.cpu().numpy()
    rankMetricsRounds = []
    poolSize = len(dataset)
    ##############################################################################
    # Loss for sentence likelihood & feature regression
    ##############################################################################
    # Reduce per-batch losses to one mean per round, then overall means.
    logProbsAll = [
        torch.cat(lprobs, 0).mean() for lprobs in logProbsAll
    ]
    featLossAll = [
        torch.cat(floss, 0).mean() for floss in featLossAll
    ]
    roundwiseLogProbs = torch.cat(logProbsAll, 0).data.cpu().numpy()
    roundwiseFeatLoss = torch.cat(featLossAll, 0).data.cpu().numpy()
    logProbsMean = roundwiseLogProbs.mean()
    featLossMean = roundwiseFeatLoss.mean()
    ##############################################################################
    # Percentile mean rank
    ##############################################################################
    if verbose:
        print("Percentile mean rank (round, mean, low, high)")
    for round in range(numRounds + 1):
        # All per-batch predictions concatenated over the whole split.
        predFeatures = torch.cat(roundwiseFeaturePreds[round],
                                 0).data.cpu().numpy()
        # num_examples x num_examples distance matrix.
        dists = pairwise_distances(predFeatures, gtFeatures)
        ranks = []
        for i in range(dists.shape[0]):
            # Rank of the i-th ground-truth image for the i-th prediction.
            rank = int(np.where(dists[i, :].argsort() == i)[0]) + 1
            ranks.append(rank)
        ranks = np.array(ranks)
        rankMetrics = metrics.computeMetrics(Variable(torch.from_numpy(ranks)))
        meanRank = ranks.mean()
        se = ranks.std() / np.sqrt(poolSize)
        meanPercRank = 100 * (1 - (meanRank / poolSize))
        percRankLow = 100 * (1 - ((meanRank + se) / poolSize))
        percRankHigh = 100 * (1 - ((meanRank - se) / poolSize))
        if verbose:
            print((round, meanPercRank, percRankLow, percRankHigh))
        rankMetrics['percentile'] = meanPercRank
        rankMetrics['featLoss'] = roundwiseFeatLoss[round]
        # Round 0 has no question, hence one fewer logProbs entry.
        if round < len(roundwiseLogProbs):
            rankMetrics['logProbs'] = roundwiseLogProbs[round]
        rankMetricsRounds.append(rankMetrics)
    rankMetricsRounds[-1]['logProbsMean'] = logProbsMean
    rankMetricsRounds[-1]['featLossMean'] = featLossMean
    dataset.split = original_split
    return rankMetricsRounds[-1], rankMetricsRounds
def rankQABots(qBot,
               aBot,
               dataset,
               split,
               expLowerLimit=None,
               expUpperLimit=None,
               exampleLimit=None,
               beamSize=1,
               numRounds=None,
               zeroCaption=0,
               randomCaption=0):
    '''
    Evaluates Q-Bot and A-Bot performance on image retrieval where
    both agents must converse with each other without any ground
    truth dialog. The common caption shown to both agents is not the
    ground truth caption, but is instead a caption generated
    (pre-computed) by a pre-trained captioning model (neuraltalk2).

    Arguments:
        qBot : Q-Bot
        aBot : A-Bot
        dataset : VisDialDataset instance
        split : Dataset split, can be 'val' or 'test'
        expLowerLimit / expUpperLimit : optional [lower, upper) window of
            batch indices to evaluate (both must be given together)
        exampleLimit : Maximum number of data points to use from the
                       dataset split. If None, all data points.
        beamSize : Beam search width for generating utterances
        numRounds : override for dataset.numRounds if not None
        zeroCaption : if truthy, zero out the caption tokens (ablation)
        randomCaption : if truthy, replace the caption with one drawn
                        from a random other example (ablation)

    Returns:
        (rankMetricsRounds[-1], rankMetricsRounds): final-round metrics
        and the list of per-round metric dicts.
    '''

    def getRandomCaption(dataset):
        # Returns the caption of the first example from a shuffled
        # single-example loader (i.e., one random caption).
        dataloader = DataLoader(dataset,
                                batch_size=1,
                                shuffle=True,
                                num_workers=0,
                                collate_fn=dataset.collate_fn)
        for idx, batch in enumerate(dataloader):
            if dataset.useGPU:
                batch = {key: v.cuda() for key, v in batch.items() \
                    if hasattr(v, 'cuda')}
            else:
                batch = {key: v.contiguous() for key, v in batch.items() \
                    if hasattr(v, 'cuda')}
            caption = batch['cap']
            captionLens = batch['cap_len']
            return caption, captionLens

    if expLowerLimit is not None:
        assert expUpperLimit is not None
    # batchSize = dataset.batchSize
    # Forced to 1 so batch index == example index for the window logic.
    batchSize = 1
    print('dataset.batchsize', dataset.batchSize)
    numRounds = dataset.numRounds if numRounds is None else numRounds
    print('dataset.numRounds', dataset.numRounds)
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit
    # Ceiling division: number of batches covering numExamples.
    numBatches = (numExamples - 1) // batchSize + 1
    original_split = dataset.split
    dataset.split = split
    # shuffle=False so prediction i corresponds to pool image i below.
    dataloader = DataLoader(dataset,
                            batch_size=batchSize,
                            shuffle=False,
                            num_workers=0,
                            collate_fn=dataset.collate_fn)
    gtImgFeatures = []
    roundwiseFeaturePreds = [[] for _ in range(numRounds + 1)]
    # First pass over the loader: collect ALL ground-truth image features
    # (the retrieval pool), independently of any evaluation window.
    for batch in dataloader:
        if dataset.useGPU:
            batch = {key: v.cuda() for key, v in batch.items() \
                if hasattr(v, 'cuda')}
        else:
            batch = {key: v.contiguous() for key, v in batch.items() \
                if hasattr(v, 'cuda')}
        gtImgFeatures.append(Variable(batch['img_feat'], volatile=True))
    # Full pool of ground-truth features, e.g. [num_pool_images, feat_dim].
    gtFeatures = torch.cat(gtImgFeatures, 0).data.cpu().numpy()
    start_t = timer()
    # Second pass: roll out the dialog and collect per-round predictions.
    for idx, batch in enumerate(dataloader):
        if expLowerLimit is not None:
            if idx < expLowerLimit:
                continue
            if idx >= expUpperLimit:
                break
        else:
            if idx == numBatches:
                break
        if dataset.useGPU:
            batch = {key: v.cuda() for key, v in batch.items() \
                if hasattr(v, 'cuda')}
        else:
            batch = {key: v.contiguous() for key, v in batch.items() \
                if hasattr(v, 'cuda')}
        if zeroCaption:
            # warn: no deepcopy is used; might make unexpected behaviors
            batch['cap'].zero_()
            gc.collect()
        if randomCaption:
            batch['cap'], batch['cap_len'] = getRandomCaption(dataset)
        # volatile=True: inference-only (legacy pre-0.4 PyTorch API).
        caption = Variable(batch['cap'], volatile=True)
        captionLens = Variable(batch['cap_len'], volatile=True)
        gtQuestions = Variable(batch['ques'], volatile=True)
        gtQuesLens = Variable(batch['ques_len'], volatile=True)
        answers = Variable(batch['ans'], volatile=True)
        ansLens = Variable(batch['ans_len'], volatile=True)
        # gtFeatures = Variable(batch['img_feat'], volatile=True)
        image = Variable(batch['img_feat'], volatile=True)
        # A-Bot sees the image; Q-Bot must guess it from dialog alone.
        aBot.eval(), aBot.reset()
        aBot.observe(-1, image=image, caption=caption,
                     captionLens=captionLens)
        qBot.eval(), qBot.reset()
        qBot.observe(-1, caption=caption, captionLens=captionLens)
        # Round-0 guess from the caption alone.
        predFeatures = qBot.predictImage()
        roundwiseFeaturePreds[0].append(predFeatures)
        for round in range(numRounds):
            # Q-Bot asks, A-Bot answers, both observe both utterances,
            # then Q-Bot re-guesses the image features.
            questions, quesLens = qBot.forwardDecode(inference='greedy',
                                                     beamSize=beamSize)
            qBot.observe(round, ques=questions, quesLens=quesLens)
            aBot.observe(round, ques=questions, quesLens=quesLens)
            answers, ansLens = aBot.forwardDecode(inference='greedy',
                                                  beamSize=beamSize)
            aBot.observe(round, ans=answers, ansLens=ansLens)
            qBot.observe(round, ans=answers, ansLens=ansLens)
            predFeatures = qBot.predictImage()
            roundwiseFeaturePreds[round + 1].append(predFeatures)
        end_t = timer()
        delta_t = " Rate: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Qbot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()
    sys.stdout.write("\n")
    rankMetricsRounds = []
    print("Percentile mean rank (round, mean, low, high)")
    for round in range(numRounds + 1):
        predFeatures = torch.cat(roundwiseFeaturePreds[round],
                                 0).data.cpu().numpy()
        # num_evaluated x num_pool_images distance matrix.
        dists = pairwise_distances(predFeatures, gtFeatures)
        ranks = []
        for i in range(dists.shape[0]):
            # Rank of the ground-truth image for the i-th prediction; when
            # evaluating a window, prediction i maps to pool image
            # i + expLowerLimit.
            if expLowerLimit is None:
                rank = int(np.where(dists[i, :].argsort() == i)[0]) + 1
            else:
                rank = int(
                    np.where(dists[i, :].argsort() ==
                             i + expLowerLimit)[0]) + 1
            ranks.append(rank)
        ranks = np.array(ranks)
        rankMetrics = metrics.computeMetrics(Variable(torch.from_numpy(ranks)))
        # assert len(ranks) == len(dataset)
        # NOTE(review): poolSize is the full dataset size even when only a
        # window was evaluated — the percentile is relative to the whole
        # pool, which appears intentional for windowed runs.
        poolSize = len(dataset)
        meanRank = ranks.mean()
        se = ranks.std() / np.sqrt(poolSize)
        meanPercRank = 100 * (1 - (meanRank / poolSize))
        percRankLow = 100 * (1 - ((meanRank + se) / poolSize))
        percRankHigh = 100 * (1 - ((meanRank - se) / poolSize))
        print('%d\t%f\t%f\t%f' %
              (round, meanPercRank, percRankLow, percRankHigh))
        rankMetrics['percentile'] = meanPercRank
        rankMetricsRounds.append(rankMetrics)
    dataset.split = original_split
    return rankMetricsRounds[-1], rankMetricsRounds
def rankABot_category_specific(aBot, dataset, split, category, categoryFiltering, scoringFunction, exampleLimit=None):
    '''
    Evaluate A-Bot answer-option ranking restricted to the dialog rounds
    listed in `categoryFiltering`, when the bot is shown ground truth image
    features, captions and questions.

    Arguments:
        aBot             : A-Bot
        dataset          : VisDialDataset instance
        split            : Dataset split, can be 'val' or 'test'
        category         : Category label (informational only here; the
                           actual filtering is driven by `categoryFiltering`)
        categoryFiltering: Dict mapping str(conversation id) -> list of round
                           indices that belong to `category`; only those
                           rounds contribute to the metrics
        scoringFunction  : A function which computes negative log likelihood
                           of a sequence (answer) given log probabilities
                           under an RNN model. Currently utils.maskedNll is
                           the only such function used.
        exampleLimit     : Maximum number of data points to use from the
                           dataset split. If None, all data points.

    Returns:
        rankMetrics dict from metrics.computeMetrics over the filtered
        per-example ranks, plus a 'logProbsMean' entry.
    '''
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit
    # Ceiling division: number of batches needed to cover numExamples.
    numBatches = (numExamples - 1) // batchSize + 1
    skipped_batches = []  # NOTE(review): collected but never read afterwards
    # Temporarily switch the dataset split; restored before returning.
    original_split = dataset.split
    dataset.split = split
    # NOTE(review): shuffle=True during evaluation — presumably intentional
    # when exampleLimit subsamples the split; confirm.
    dataloader = DataLoader(dataset, batch_size=batchSize, shuffle=True, num_workers=1, collate_fn=dataset.collate_fn)
    totalLoss, totalTokens = 0, 0  # NOTE(review): unused in this function
    ranks = []
    # logProbsAll[round] accumulates per-example NLL terms for that round.
    logProbsAll = [[] for _ in range(numRounds)]
    start_t = timer()
    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break
        # Move (or make contiguous) only the tensor entries of the batch;
        # non-tensor entries are passed through unchanged.
        if dataset.useGPU:
            batch = {
                key: v.cuda() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }
        else:
            batch = {
                key: v.contiguous() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }
        # volatile=True: legacy (pre-0.4) PyTorch inference mode.
        image = Variable(batch['img_feat'], volatile=True)
        caption = Variable(batch['cap'], volatile=True)
        captionLens = Variable(batch['cap_len'], volatile=True)
        questions = Variable(batch['ques'], volatile=True)
        quesLens = Variable(batch['ques_len'], volatile=True)
        answers = Variable(batch['ans'], volatile=True)
        ansLens = Variable(batch['ans_len'], volatile=True)
        options = Variable(batch['opt'], volatile=True)
        optionLens = Variable(batch['opt_len'], volatile=True)
        correctOptionInds = Variable(batch['ans_id'], volatile=True)
        convId = Variable(batch['conv_id'], volatile=False)
        # Get conversation -> category-round mapping for the batch.
        category_mapping_conv = [
            categoryFiltering.get(str(batched_convId), [])
            for batched_convId in convId.data
        ]
        # Skip batches where no conversation has any round in this category.
        entire_batch_empty = True
        for category_rounds in category_mapping_conv:
            if len(category_rounds) > 0:
                entire_batch_empty = False
        if entire_batch_empty:
            skipped_batches.append(idx)
            continue
        aBot.reset()
        # Round -1 observation primes the bot with image + caption context.
        aBot.observe(-1, image=image, caption=caption, captionLens=captionLens)
        for round in range(numRounds):
            # Feed the ground-truth question/answer for this round, then
            # score all candidate options under the bot's decoder.
            aBot.observe(round,
                         ques=questions[:, round],
                         quesLens=quesLens[:, round],
                         ans=answers[:, round],
                         ansLens=ansLens[:, round])
            logProbs = aBot.evalOptions(options[:, round],
                                        optionLens[:, round],
                                        scoringFunction)  # batch x 100 options
            logProbsCurrent = aBot.forward()  # batch x max answer length x vocab size
            # Per-example: only accumulate metrics for rounds that belong to
            # this conversation's category set.
            for bidx in range(len(convId)):
                if round in category_mapping_conv[bidx]:
                    logProbsAll[round].append(
                        scoringFunction(
                            logProbsCurrent[bidx].unsqueeze(0),
                            answers[bidx, round].unsqueeze(0).contiguous()))
                    batchRanks = rankOptions(
                        options[bidx, round].unsqueeze(0),
                        correctOptionInds[bidx, round],
                        logProbs[bidx].unsqueeze(0))  # batch,
                    ranks.append(batchRanks)
        end_t = timer()
        delta_t = " Rate: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Abot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()
    sys.stdout.write("\n")
    dataloader = None
    # NOTE(review): message claims a sleep but no sleep is performed.
    print("Sleeping for 3 seconds to let dataloader subprocesses exit...")
    # list of num batches*num_rounds, each item is batchsize tensor --> flatten
    ranks = torch.cat(ranks, 0)
    rankMetrics = metrics.computeMetrics(ranks.cpu())
    # list<round>: list<batches in dataset>: float. NOTE(review): raises if a
    # round's list is empty (category never hits that round) — confirm inputs.
    logProbsAll = [torch.cat(lprobs, 0).mean() for lprobs in logProbsAll]
    roundwiseLogProbs = torch.cat(logProbsAll, 0).data.cpu().numpy()  # num_rounds,
    logProbsMean = roundwiseLogProbs.mean()  # float
    rankMetrics['logProbsMean'] = 1. * logProbsMean
    dataset.split = original_split
    return rankMetrics
def DialogEval(val_model, dataset, split, exampleLimit=None, verbose=0, txt_retrieval_mode='mse'):
    '''
    Evaluate the questioner (val_model) on a split using the GROUND-TRUTH
    dialog: for each round the model observes the gt question/answer pair and
    predicts a text-feature embedding, which is ranked against all gt
    embeddings in the pool.

    Arguments:
        val_model         : Questioner model
        dataset           : Dataset instance (provides batchSize, numRounds,
                            splits, collate_fn)
        split             : Dataset split to evaluate on
        exampleLimit      : Max number of data points; None means all
        verbose           : If truthy, print per-round percentile ranks
        txt_retrieval_mode: 'mse' (L2 distance ranking) or
                            'cosine_similarity' (dot-product ranking)

    Returns:
        (lastRoundMetrics, perRoundMetricsList). The last-round dict also
        carries 'logProbsMean', 'featLossMean', 'winrateMean',
        'perplexityMean'.
    '''
    print("text retrieval mode is: {}".format(txt_retrieval_mode))
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit
    # Ceiling division to cover numExamples.
    numBatches = (numExamples - 1) // batchSize + 1
    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=True,
        num_workers=0,
        collate_fn=dataset.collate_fn)
    # Enumerate all gt features and all predicted features.
    gttxtFeatures = []
    # Index 0 = after caption only; indices 1..numRounds = after each round.
    roundwiseFeaturePreds = [[] for _ in range(numRounds + 1)]
    logProbsAll = [[] for _ in range(numRounds)]
    featLossAll = [[] for _ in range(numRounds + 1)]
    # Per-round perplexity accumulators.
    perplexityAll = [[] for _ in range(numRounds)]
    start_t = timer()
    # Winning rates for the questioner across rounds (rank-1 retrievals).
    win_rate = [0] * (numRounds + 1)
    num_games = 0
    # Full pool of candidate text features for nearest-neighbor guessing.
    all_txt_feat = txtLoader(dataloader, dataset)
    im_ranker = Ranker()
    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break
        # Keep only tensor entries; move to GPU / make contiguous.
        if dataset.useGPU:
            batch = {
                key: v.cuda()
                for key, v in batch.items() if hasattr(v, 'cuda')
            }
        else:
            batch = {
                key: v.contiguous()
                for key, v in batch.items() if hasattr(v, 'cuda')
            }
        # NOTE(review): the original flattened formatting makes the exact
        # extent of this no_grad block ambiguous; the whole per-batch forward
        # work is kept inside it, which is safe for evaluation — confirm
        # against the upstream repo.
        with torch.no_grad():
            caption = Variable(batch['cap'])
            captionLens = Variable(batch['cap_len'])
            gtQuestions = Variable(batch['ques'])
            gtQuesLens = Variable(batch['ques_len'])
            answers = Variable(batch['ans'])
            ansLens = Variable(batch['ans_len'])
            # Ground-truth target embedding depends on retrieval mode.
            if txt_retrieval_mode == "mse":
                if val_model.txtEncodingMode == "txtuess":
                    gtFeatures = val_model.forwardtext(Variable(batch['txt_feat']))
                else:
                    gtFeatures = Variable(batch['txt_feat'])
            else:
                gtFeatures = Variable(batch['txt_feat'])
                gtFeatures = val_model.multimodalpredictIm(gtFeatures)
            text = Variable(batch['txt_feat'])
            # Update the Ranker's representations for the candidate pool.
            if val_model.txtEncodingMode == "txtuess":
                im_ranker.update_rep(val_model, all_txt_feat)
            val_model.reset()
            # Round -1 observation primes the model with the caption.
            val_model.observe(-1, caption=caption, captionLens=captionLens)
            if val_model.new_questioner:
                val_model.observe_txt(text)
            if val_model.txtEncodingMode == "txtuess":
                # Start from a random guess drawn from the candidate pool.
                act_index = torch.randint(
                    0, all_txt_feat.size(0) - 1, (text.size(0), 1))
                predicted_text = all_txt_feat[act_index].squeeze(1)
                val_model.observe_txt(predicted_text)
            if txt_retrieval_mode == "mse":
                predFeatures = val_model.predicttext()
                # Evaluating round 0 feature regression network.
                featLoss = F.mse_loss(predFeatures, gtFeatures)
                featLossAll[0].append(torch.mean(featLoss))
                # Keeping round 0 predictions.
                roundwiseFeaturePreds[0].append(predFeatures)
                if val_model.txtEncodingMode == "txtuess":
                    # Replace the guess with the nearest pool neighbor.
                    act_index = im_ranker.nearest_neighbor(predFeatures.data)
                    predicted_text = all_txt_feat[act_index]
                # Winning rate at round 0: rank-1 retrieval counts as a win.
                round_dists = pairwise_distances(
                    predFeatures.cpu().numpy(), gtFeatures.cpu().numpy())
                for i in range(round_dists.shape[0]):
                    current_rank = int(
                        np.where(round_dists[i, :].argsort() == i)[0]) + 1
                    if current_rank <= 1:
                        win_rate[0] += 1
                    # NOTE(review): num_games counts one game per example and
                    # is only incremented in the round-0 pass — confirm.
                    num_games += 1
            elif txt_retrieval_mode == "cosine_similarity":
                dialogEmbedding = val_model.multimodalpredictText()
                featLoss = pairwiseRanking_criterion(gtFeatures, dialogEmbedding)
                featLossAll[0].append(torch.sum(featLoss))
                roundwiseFeaturePreds[0].append(dialogEmbedding)
                # Each row of round_dists is a cosine-similarity vector, so
                # ranking uses a DESCENDING argsort.
                round_dists = np.matmul(
                    dialogEmbedding.cpu().numpy(),
                    gtFeatures.cpu().numpy().transpose())
                for i in range(round_dists.shape[0]):
                    current_rank = int(
                        np.where(round_dists[i, :].argsort()[::-1] == i)[0]) + 1
                    if current_rank <= 1:
                        win_rate[0] += 1
                    num_games += 1
            for round in range(numRounds):
                if val_model.txtEncodingMode == "txtuess":
                    val_model.observe_txt(predicted_text)
                # Teacher-forced dialog: feed the ground-truth Q/A pair.
                val_model.observe(
                    round,
                    ques=gtQuestions[:, round],
                    quesLens=gtQuesLens[:, round])
                val_model.observe(
                    round, ans=answers[:, round], ansLens=ansLens[:, round])
                logProbsCurrent = val_model.forward()
                # Cross-entropy of gt questions under the model.
                logProbsAll[round].append(
                    utils.maskedNll(logProbsCurrent,
                                    gtQuestions[:, round].contiguous()))
                perplexityAll[round].append(
                    utils.maskedPerplexity(logProbsCurrent,
                                           gtQuestions[:, round].contiguous()))
                if txt_retrieval_mode == "mse":
                    predFeatures = val_model.predicttext()
                    # Evaluating feature regression network.
                    featLoss = F.mse_loss(predFeatures, gtFeatures)
                    featLossAll[round + 1].append(torch.mean(featLoss))
                    # Keeping predictions.
                    roundwiseFeaturePreds[round + 1].append(predFeatures)
                    if val_model.txtEncodingMode == "txtuess":
                        act_index = im_ranker.nearest_neighbor(predFeatures.data)
                        predicted_text = all_txt_feat[act_index].squeeze(1)
                    # Winning rate at round+1.
                    round_dists = pairwise_distances(
                        predFeatures.cpu().numpy(), gtFeatures.cpu().numpy())
                    for i in range(round_dists.shape[0]):
                        current_rank = int(
                            np.where(round_dists[i, :].argsort() == i)[0]) + 1
                        if current_rank <= 1:
                            win_rate[round + 1] += 1
                elif txt_retrieval_mode == "cosine_similarity":
                    dialogEmbedding = val_model.multimodalpredictText()
                    featLoss = pairwiseRanking_criterion(
                        gtFeatures, dialogEmbedding)
                    featLossAll[round + 1].append(torch.sum(featLoss))
                    roundwiseFeaturePreds[round + 1].append(dialogEmbedding)
                    # Descending argsort again: rows are similarities.
                    round_dists = np.matmul(
                        dialogEmbedding.cpu().numpy(),
                        gtFeatures.cpu().numpy().transpose())
                    for i in range(round_dists.shape[0]):
                        current_rank = int(
                            np.where(round_dists[i, :].argsort()[::-1] == i)[0]) + 1
                        if current_rank <= 1:
                            win_rate[round + 1] += 1
            gttxtFeatures.append(gtFeatures)
        end_t = timer()
        delta_t = " Time: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[val_model] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()
    sys.stdout.write("\n")
    # Normalize win counts into rates.
    win_rate = [x / num_games for x in win_rate]
    print("The winning rates for {} are: {}".format(split, win_rate))
    gtFeatures = torch.cat(gttxtFeatures, 0).data.cpu().numpy()
    rankMetricsRounds = []
    poolSize = len(dataset)
    # Collapse per-batch accumulators into per-round means.
    logProbsAll = [torch.stack(lprobs, 0).mean() for lprobs in logProbsAll]
    # Mean perplexity for each round (python float).
    perplexityAll = [
        torch.cat(perplexity, 0).mean().data.item()
        for perplexity in perplexityAll
    ]
    featLossAll = [torch.stack(floss, 0).mean() for floss in featLossAll]
    roundwiseLogProbs = torch.stack(logProbsAll, 0).data.cpu().numpy()
    roundwiseFeatLoss = torch.stack(featLossAll, 0).data.cpu().numpy()
    logProbsMean = roundwiseLogProbs.mean()
    featLossMean = roundwiseFeatLoss.mean()
    perplexityMean = sum(perplexityAll) / len(perplexityAll)
    print("The Perplxity of current Questioner is: {}".format(perplexityMean))
    winrateMean = sum(win_rate) / len(win_rate)
    if verbose:
        print("Percentile mean rank (round, mean, low, high)")
    for round in range(numRounds + 1):
        if txt_retrieval_mode == "mse":
            predFeatures = torch.cat(roundwiseFeaturePreds[round],
                                     0).data.cpu().numpy()
            # num_examples x num_examples distance matrix.
            dists = pairwise_distances(predFeatures, gtFeatures)
            ranks = []
            for i in range(dists.shape[0]):
                # Rank of the true image (i) among all pool entries.
                rank = int(np.where(dists[i, :].argsort() == i)[0]) + 1
                ranks.append(rank)
        elif txt_retrieval_mode == "cosine_similarity":
            predFeatures = torch.cat(roundwiseFeaturePreds[round],
                                     0).data.cpu().numpy()
            dists = np.matmul(predFeatures, gtFeatures.transpose())
            ranks = []
            for i in range(dists.shape[0]):
                rank = int(np.where(dists[i, :].argsort()[::-1] == i)[0]) + 1
                ranks.append(rank)
        ranks = np.array(ranks)
        rankMetrics = metrics.computeMetrics(Variable(torch.from_numpy(ranks)))
        meanRank = ranks.mean()
        se = ranks.std() / np.sqrt(poolSize)
        # Convert mean rank (1 = best) to a percentile with +/- one SE band.
        meanPercRank = 100 * (1 - (meanRank / poolSize))
        percRankLow = 100 * (1 - ((meanRank + se) / poolSize))
        percRankHigh = 100 * (1 - ((meanRank - se) / poolSize))
        if verbose:
            print((round, meanPercRank, percRankLow, percRankHigh))
        rankMetrics['percentile'] = meanPercRank
        rankMetrics['featLoss'] = roundwiseFeatLoss[round]
        # Round 0 (caption only) has no question log-prob entry.
        if round < len(roundwiseLogProbs):
            rankMetrics['logProbs'] = roundwiseLogProbs[round]
        rankMetricsRounds.append(rankMetrics)
    rankMetricsRounds[-1]['logProbsMean'] = logProbsMean
    rankMetricsRounds[-1]['featLossMean'] = featLossMean
    rankMetricsRounds[-1]['winrateMean'] = winrateMean
    # Perplexity is reported on the last-round metrics dict.
    rankMetricsRounds[-1]['perplexityMean'] = perplexityMean
    dataset.split = original_split
    return rankMetricsRounds[-1], rankMetricsRounds
def DialogEval_2(val_model, target_model, dataset, split, exampleLimit=None, beamSize=1, txt_retrieval_mode='mse'):
    '''
    Evaluate the questioner (val_model) in SELF-PLAY against an answerer
    (target_model): the questioner generates questions, the target model
    generates answers, and after each round the questioner's predicted text
    embedding is ranked against all ground-truth embeddings in the pool.

    Arguments:
        val_model         : Questioner model being evaluated
        target_model      : Answerer / user-simulator model
        dataset           : Dataset instance
        split             : Dataset split to evaluate on
        exampleLimit      : Max number of data points; None means all
        beamSize          : Beam size for greedy/beam decoding of both bots
        txt_retrieval_mode: 'mse' (L2 distance ranking) or
                            'cosine_similarity' (dot-product ranking)

    Returns:
        (lastRoundMetrics, perRoundMetricsList); the last-round dict also
        carries 'winrateMean'.
    '''
    print("text Encoding Mode is: {}".format(val_model.txtEncodingMode))
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit
    # Ceiling division to cover numExamples.
    numBatches = (numExamples - 1) // batchSize + 1
    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=False,
        num_workers=0,
        collate_fn=dataset.collate_fn)
    gttxtFeatures = []
    # Index 0 = after caption only; indices 1..numRounds = after each round.
    roundwiseFeaturePreds = [[] for _ in range(numRounds + 1)]
    start_t = timer()
    # Winning rates (rank-1 retrievals) per round.
    win_rate = [0] * (numRounds + 1)
    num_games = 0
    # Full pool of candidate text features for nearest-neighbor guessing.
    all_txt_feat = txtLoader(dataloader, dataset)
    im_ranker = Ranker()
    # Update the Ranker once up front (unlike DialogEval, which refreshes it
    # per batch).
    val_model.eval(), val_model.reset()
    if val_model.txtEncodingMode == "txtuess":
        im_ranker.update_rep(val_model, all_txt_feat)
    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break
        # Keep only tensor entries; move to GPU / make contiguous.
        if dataset.useGPU:
            batch = {
                key: v.cuda()
                for key, v in batch.items() if hasattr(v, 'cuda')
            }
        else:
            batch = {
                key: v.contiguous()
                for key, v in batch.items() if hasattr(v, 'cuda')
            }
        # NOTE(review): the original flattened formatting makes the exact
        # extent of this no_grad block ambiguous; the whole per-batch rollout
        # is kept inside it, which is safe for evaluation — confirm against
        # the upstream repo.
        with torch.no_grad():
            caption = Variable(batch['cap'])
            captionLens = Variable(batch['cap_len'])
            gtQuestions = Variable(batch['ques'])
            gtQuesLens = Variable(batch['ques_len'])
            answers = Variable(batch['ans'])
            ansLens = Variable(batch['ans_len'])
            # Ground-truth target embedding depends on retrieval mode.
            if txt_retrieval_mode == "mse":
                if val_model.txtEncodingMode == "txtuess":
                    gtFeatures = val_model.forwardtext(Variable(batch['txt_feat']))
                else:
                    gtFeatures = Variable(batch['txt_feat'])
            else:
                gtFeatures = Variable(batch['txt_feat'])
                gtFeatures = val_model.multimodalpredictIm(gtFeatures)
            text = Variable(batch['txt_feat'])
            # The answerer sees the true text; the questioner does not.
            target_model.eval(), target_model.reset()
            target_model.observe(-1, text=text, caption=caption, captionLens=captionLens)
            val_model.eval(), val_model.reset()
            val_model.observe(-1, caption=caption, captionLens=captionLens)
            if val_model.new_questioner:
                val_model.observe_txt(text)
            if val_model.txtEncodingMode == "txtuess":
                # Start from a random guess drawn from the candidate pool.
                act_index = torch.randint(
                    0, all_txt_feat.size(0) - 1, (text.size(0), 1))
                predicted_text = all_txt_feat[act_index].squeeze(1)
                val_model.observe_txt(predicted_text)
            if txt_retrieval_mode == "mse":
                predFeatures = val_model.predicttext()
                roundwiseFeaturePreds[0].append(predFeatures)
                if val_model.txtEncodingMode == "txtuess":
                    act_index = im_ranker.nearest_neighbor(predFeatures.data)
                    predicted_text = all_txt_feat[act_index]
                    # Should observe the current predicted text.
                    val_model.observe_txt(predicted_text)
                # Winning rate at round 0: rank-1 retrieval counts as a win.
                round_dists = pairwise_distances(
                    predFeatures.cpu().numpy(), gtFeatures.cpu().numpy())
                for i in range(round_dists.shape[0]):
                    current_rank = int(
                        np.where(round_dists[i, :].argsort() == i)[0]) + 1
                    if current_rank <= 1:
                        win_rate[0] += 1
                    # NOTE(review): num_games counts one game per example and
                    # is only incremented in the round-0 pass — confirm.
                    num_games += 1
            elif txt_retrieval_mode == "cosine_similarity":
                dialogEmbedding = val_model.multimodalpredictText()
                roundwiseFeaturePreds[0].append(dialogEmbedding)
                # Rows are cosine similarities -> rank by DESCENDING argsort.
                round_dists = np.matmul(
                    dialogEmbedding.cpu().numpy(),
                    gtFeatures.cpu().numpy().transpose())
                for i in range(round_dists.shape[0]):
                    current_rank = int(
                        np.where(round_dists[i, :].argsort()[::-1] == i)[0]) + 1
                    if current_rank <= 1:
                        win_rate[0] += 1
                    num_games += 1
            for round in range(numRounds):
                # Self-play round: questioner asks, answerer replies, and
                # both observe the exchange.
                questions, quesLens = val_model.forwardDecode(
                    inference='greedy', beamSize=beamSize)
                val_model.observe(round, ques=questions, quesLens=quesLens)
                target_model.observe(round, ques=questions, quesLens=quesLens)
                answers, ansLens = target_model.forwardDecode(
                    inference='greedy', beamSize=beamSize)
                target_model.observe(round, ans=answers, ansLens=ansLens)
                val_model.observe(round, ans=answers, ansLens=ansLens)
                if val_model.new_questioner:
                    val_model.observe_txt(text)
                if val_model.txtEncodingMode == "txtuess":
                    val_model.observe_txt(predicted_text)
                if txt_retrieval_mode == "mse":
                    predFeatures = val_model.predicttext()
                    roundwiseFeaturePreds[round + 1].append(predFeatures)
                    if val_model.txtEncodingMode == "txtuess":
                        act_index = im_ranker.nearest_neighbor(predFeatures.data)
                        predicted_text = all_txt_feat[act_index]
                    # Winning rate at round+1.
                    round_dists = pairwise_distances(
                        predFeatures.cpu().numpy(), gtFeatures.cpu().numpy())
                    for i in range(round_dists.shape[0]):
                        current_rank = int(
                            np.where(round_dists[i, :].argsort() == i)[0]) + 1
                        if current_rank <= 1:
                            win_rate[round + 1] += 1
                elif txt_retrieval_mode == "cosine_similarity":
                    dialogEmbedding = val_model.multimodalpredictText()
                    # Keep the dialogEmbedding, to be modified later.
                    roundwiseFeaturePreds[round + 1].append(dialogEmbedding)
                    round_dists = np.matmul(
                        dialogEmbedding.cpu().numpy(),
                        gtFeatures.cpu().numpy().transpose())
                    for i in range(round_dists.shape[0]):
                        current_rank = int(
                            np.where(round_dists[i, :].argsort()[::-1] == i)[0]) + 1
                        if current_rank <= 1:
                            win_rate[round + 1] += 1
            gttxtFeatures.append(gtFeatures)
        end_t = timer()
        delta_t = " Rate: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[val_model] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()
    sys.stdout.write("\n")
    # Normalize win counts into rates.
    win_rate = [x / num_games for x in win_rate]
    print("The winning rates for {} are: {}".format(split, win_rate))
    gtFeatures = torch.cat(gttxtFeatures, 0).data.cpu().numpy()
    rankMetricsRounds = []
    winrateMean = sum(win_rate) / len(win_rate)
    print("Percentile mean rank (round, mean, low, high)")
    for round in range(numRounds + 1):
        if txt_retrieval_mode == "mse":
            predFeatures = torch.cat(roundwiseFeaturePreds[round],
                                     0).data.cpu().numpy()
            # num_examples x num_examples distance matrix.
            dists = pairwise_distances(predFeatures, gtFeatures)
            ranks = []
            for i in range(dists.shape[0]):
                # Computing rank of i-th prediction vs all texts in split.
                rank = int(np.where(dists[i, :].argsort() == i)[0]) + 1
                ranks.append(rank)
        elif txt_retrieval_mode == "cosine_similarity":
            predFeatures = torch.cat(roundwiseFeaturePreds[round],
                                     0).data.cpu().numpy()
            dists = np.matmul(predFeatures, gtFeatures.transpose())
            ranks = []
            for i in range(dists.shape[0]):
                rank = int(np.where(dists[i, :].argsort()[::-1] == i)[0]) + 1
                ranks.append(rank)
        ranks = np.array(ranks)
        rankMetrics = metrics.computeMetrics(Variable(torch.from_numpy(ranks)))
        # Requires the full split to have been evaluated (no exampleLimit).
        assert len(ranks) == len(dataset)
        poolSize = len(dataset)
        meanRank = ranks.mean()
        se = ranks.std() / np.sqrt(poolSize)
        # Convert mean rank (1 = best) to a percentile with +/- one SE band.
        meanPercRank = 100 * (1 - (meanRank / poolSize))
        percRankLow = 100 * (1 - ((meanRank + se) / poolSize))
        percRankHigh = 100 * (1 - ((meanRank - se) / poolSize))
        print((round, meanPercRank, percRankLow, percRankHigh))
        rankMetrics['percentile'] = meanPercRank
        rankMetricsRounds.append(rankMetrics)
    dataset.split = original_split
    rankMetricsRounds[-1]['winrateMean'] = winrateMean
    return rankMetricsRounds[-1], rankMetricsRounds