test_mode = False
if args.split == 'test':
    test_mode = True
    config['dataset']['test_feat_img_path'] = config['dataset'][
        'train_feat_img_path'].replace(
            "trainval_resnet101_faster_rcnn_genome__num_boxes",
            "test2018_resnet101_faster_rcnn_genome__num_boxes")
    config['dataset']['test_json_dialog_path'] = config['dataset'][
        'train_json_dialog_path'].replace('visdial_1.0_train.json',
                                          'visdial_1.0_test.json')

model = model.to(device)

sparse_metrics = SparseGTMetrics()
ndcg = NDCG()

dataset = VisDialDataset(config, split=args.split)
dataloader = DataLoader(dataset, batch_size=1)

model = model.eval()
ranks_json = []

for idx, batch in enumerate(tqdm(dataloader)):
    torch.cuda.empty_cache()
    for key in batch:
        batch[key] = batch[key].to(device)
    with torch.no_grad():
        output = model(batch, test_mode=test_mode)
    milestone_steps=config['solver']['milestone_steps'],
    linear_gama=config['solver']['linear_gama'])

# =============================================================================
#   SETUP BEFORE TRAINING LOOP
# =============================================================================
summary_writer = SummaryWriter(log_dir=config['callbacks']['log_dir'])
checkpoint_manager = CheckpointManager(model,
                                       optimizer,
                                       config['callbacks']['save_dir'],
                                       config=config)
sparse_metrics = SparseGTMetrics()
disc_metrics = SparseGTMetrics()
gen_metrics = SparseGTMetrics()
ndcg = NDCG()
disc_ndcg = NDCG()
gen_ndcg = NDCG()

print("Loading checkpoints...")
start_epoch, model, optimizer = load_checkpoint_from_config(
    model, optimizer, config)

if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

# =============================================================================
#   TRAINING LOOP
# =============================================================================
iterations = len(train_dataset) // (config['solver']['batch_size'] *
class MultiEvaluation(object):
    def __init__(self, hparams, model=None, split="test"):
        self.hparams = hparams
        self.model = model
        self._logger = logging.getLogger(__name__)
        self.split = split

        self.device = (torch.device("cuda", self.hparams.gpu_ids[0])
                       if self.hparams.gpu_ids[0] >= 0
                       else torch.device("cpu"))

        do_valid, do_test = False, False
        if split == "val":
            do_valid = True
        else:
            do_test = True
        self._build_dataloader(do_valid=do_valid, do_test=do_test)
        self._dataloader = self.valid_dataloader if split == 'val' else self.test_dataloader

        if model is None:
            self._build_model()

        self.sparse_metrics = SparseGTMetrics()
        self.ndcg = NDCG()

    def _build_dataloader(self, do_valid=False, do_test=False):
        if do_valid:
            self.valid_dataset = VisDialDataset(self.hparams,
                                                overfit=self.hparams.overfit,
                                                split="val")
            collate_fn = None
            if "dan" in self.hparams.img_feature_type:
                collate_fn = self.valid_dataset.collate_fn

            self.valid_dataloader = DataLoader(
                self.valid_dataset,
                batch_size=self.hparams.eval_batch_size
                if "disc" in self.hparams.decoder else 5,
                num_workers=self.hparams.cpu_workers,
                drop_last=False,
                collate_fn=collate_fn,
            )

        if do_test:
            self.test_dataset = VisDialDataset(self.hparams,
                                               overfit=self.hparams.overfit,
                                               split="test")
            collate_fn = None
            if "dan" in self.hparams.img_feature_type:
                collate_fn = self.test_dataset.collate_fn

            self.test_dataloader = DataLoader(
                self.test_dataset,
                batch_size=self.hparams.eval_batch_size
                if "disc" in self.hparams.decoder else 5,
                num_workers=self.hparams.cpu_workers,
                drop_last=False,
                collate_fn=collate_fn)

    def _build_model(self):
        vocabulary = self.valid_dataset.vocabulary if self.split == "val" else self.test_dataset.vocabulary

        encoder = Encoder(self.hparams, vocabulary)
        disc_decoder, gen_decoder = None, None

        if "disc" in self.hparams.decoder and "disc" in self.hparams.evaluation_type:
            disc_decoder = DiscriminativeDecoder(self.hparams, vocabulary)
        if "gen" in self.hparams.decoder and "gen" in self.hparams.evaluation_type:
            gen_decoder = GenerativeDecoder(self.hparams, vocabulary)

        # Wrap encoder and decoders in a single model.
        self.model = MultiEncoderDecoderModel(encoder, disc_decoder,
                                              gen_decoder).to(self.device)

        # Use multiple GPUs if available.
        if -1 not in self.hparams.gpu_ids and len(self.hparams.gpu_ids) > 1:
            self.model = nn.DataParallel(self.model, self.hparams.gpu_ids)

    def run_evaluate(self,
                     evaluation_path,
                     global_iteration_step=0,
                     tb_summary_writer: SummaryWriter = None,
                     eval_json_path=None,
                     eval_seed=None):

        model_state_dict, optimizer_state_dict = load_checkpoint(evaluation_path)
        print("evaluation model loading completes! ->", evaluation_path)

        self.eval_seed = self.hparams.random_seed[0] if eval_seed is None else eval_seed

        if isinstance(self.model, nn.DataParallel):
            self.model.module.load_state_dict(model_state_dict)
        else:
            self.model.load_state_dict(model_state_dict)

        print("Decoder Type : %s" % self.hparams.evaluation_type)
        self.model.eval()

        ranks_json = []
        self.prob_dist_json = []

        for i, batch in enumerate(tqdm(self._dataloader)):
            for key in batch:
                batch[key] = batch[key].to(self.device)
            with torch.no_grad():
                disc_output, gen_output = self.model(batch)

            batch_size, num_dial, _ = batch['ques'].size()
            ranks = None

            if self.hparams.evaluation_type == "disc_gen":
                if self.hparams.aggregation_type == "reciprocal":
                    disc_output = disc_output.view(batch_size, num_dial, -1)
                    disc_ranks = scores_to_ranks(disc_output)
                    gen_output = gen_output.view(batch_size, num_dial, -1)
                    gen_ranks = scores_to_ranks(gen_output)

                    # Aggregate reciprocal ranks of the two decoders.
                    disc_reci_ranks = torch.div(
                        torch.ones_like(disc_ranks, dtype=torch.float32),
                        disc_ranks)
                    gen_reci_ranks = torch.div(
                        torch.ones_like(gen_ranks, dtype=torch.float32),
                        gen_ranks)
                    # Stacked shape: (bs, nr, 100, 2); the mean over the last
                    # dim fuses the two decoders' reciprocal ranks.
                    agg_reci_ranks = torch.mean(torch.stack(
                        [disc_reci_ranks, gen_reci_ranks], dim=-1),
                        dim=-1)
                    ranks = scores_to_ranks(agg_reci_ranks)
                    output = agg_reci_ranks

                elif self.hparams.aggregation_type == "average":
                    # Average the two decoders' log-softmax distributions.
                    output = (F.log_softmax(disc_output, dim=-1) +
                              F.log_softmax(gen_output, dim=-1)) / 2
                    # output = torch.div((F.softmax(disc_output, dim=-1) + F.softmax(gen_output, dim=-1)), 2.0)
                    ranks = scores_to_ranks(output)

            elif self.hparams.evaluation_type == "disc":
                disc_output = disc_output.view(batch_size, num_dial, -1)
                disc_ranks = scores_to_ranks(disc_output)
                ranks = disc_ranks
                output = disc_output

            else:
                gen_output = gen_output.view(batch_size, num_dial, -1)
                gen_ranks = scores_to_ranks(gen_output)
                ranks = gen_ranks
                output = gen_output

            for i in range(len(batch["img_ids"])):
                # Cast into types explicitly to ensure no errors in schema.
                # Round ids are 1-10, not 0-9.
                if self.split == "test":
                    ranks_json.append({
                        "image_id": batch["img_ids"][i].item(),
                        "round_id": int(batch["num_rounds"][i].item()),
                        "ranks": [
                            rank.item()
                            for rank in ranks[i][batch["num_rounds"][i] - 1]
                        ],
                    })
                else:
                    for j in range(batch["num_rounds"][i]):
                        ranks_json.append({
                            "image_id": batch["img_ids"][i].item(),
                            "round_id": int(j + 1),
                            "ranks": [rank.item() for rank in ranks[i][j]],
                        })

            if self.split == "val":
                self.sparse_metrics.observe(output, batch["ans_ind"])
                if "gt_relevance" in batch:  # version 1.0
                    output = output[torch.arange(output.size(0)),
                                    batch["round_id"] - 1, :]
                    self.ndcg.observe(output, batch["gt_relevance"])

        if self.split == "val":
            all_metrics = {}
            all_metrics.update(self.sparse_metrics.retrieve(reset=True))
            if self.hparams.dataset_version == '1.0':
                all_metrics.update(self.ndcg.retrieve(reset=True))

            for metric_name, metric_value in all_metrics.items():
                self._logger.info(f"{metric_name}: {metric_value}")

            if tb_summary_writer:
                tb_summary_writer.add_scalars("metrics", all_metrics,
                                              global_iteration_step)

        # if not tb_summary_writer:
        print("Writing ranks to {}".format(self.hparams.root_dir))
        if eval_json_path is not None:
            json.dump(ranks_json, open(eval_json_path, "w"))
        else:
            json.dump(
                ranks_json,
                open(
                    os.path.join(
                        self.hparams.root_dir,
                        self.hparams.model_name + "_ranks_%s.json" % self.split),
                    "w"))

        if not tb_summary_writer and self.split == "val":
            for metric_name, metric_value in all_metrics.items():
                print(f"{metric_name}: {metric_value}")
def rankABot(aBot, dataset, split, scoringFunction, exampleLimit=None, useNDCG=False):
    '''
        Evaluate A-Bot performance on ranking answer options when it is
        shown ground truth image features, captions and questions.

        Arguments:
            aBot            : A-Bot
            dataset         : VisDialDataset instance
            split           : Dataset split, can be 'val' or 'test'
            scoringFunction : A function which computes negative log
                              likelihood of a sequence (answer) given log
                              probabilities under an RNN model. Currently
                              utils.maskedNll is the only such function used.
            exampleLimit    : Maximum number of data points to use from the
                              dataset split. If None, use all data points.
            useNDCG         : If True, also compute NDCG on the 'val' split
                              using the dense gt_relevance annotations.
    '''
    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit
    numBatches = (numExamples - 1) // batchSize + 1

    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=True,
        num_workers=1,
        collate_fn=dataset.collate_fn)

    # sparse_metrics = SparseGTMetrics()
    ndcg = None
    if useNDCG:
        ndcg = NDCG()
    ranks_json = []

    totalLoss, totalTokens = 0, 0
    ranks = []
    logProbsAll = [[] for _ in range(numRounds)]
    start_t = timer()

    getImgFileName = lambda x: dataset.data['%s_img_fnames' % split][x]
    getImgId = lambda x: int(getImgFileName(x)[:-4][-12:])

    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break

        if dataset.useGPU:
            batch = {
                key: v.cuda() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }
        else:
            batch = {
                key: v.contiguous() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }

        image = Variable(batch['img_feat'], volatile=True)
        caption = Variable(batch['cap'], volatile=True)
        captionLens = Variable(batch['cap_len'], volatile=True)
        questions = Variable(batch['ques'], volatile=True)
        quesLens = Variable(batch['ques_len'], volatile=True)
        answers = Variable(batch['ans'], volatile=True)
        ansLens = Variable(batch['ans_len'], volatile=True)
        options = Variable(batch['opt'], volatile=True)
        optionLens = Variable(batch['opt_len'], volatile=True)

        gtRelevance = None
        round_id = None
        img_ids = None
        correctOptionInds = None
        if split != 'test':
            correctOptionInds = Variable(batch['ans_id'], volatile=True)
        if split == 'val' and useNDCG:
            # Read in gt_relevance and the round it annotates.
            gtRelevance = Variable(batch['gt_relevance'], volatile=True)
            round_id = Variable(batch['round_id'], volatile=True)
            img_ids = Variable(batch['image_id'], volatile=True)
        if split == 'test':
            img_ids = [getImgId(x) for x in batch['index']]

        aBot.reset()
        aBot.observe(-1, image=image, caption=caption, captionLens=captionLens)

        log_probs_rounds = []
        for round in range(numRounds):
            aBot.observe(
                round,
                ques=questions[:, round],
                quesLens=quesLens[:, round],
                ans=answers[:, round],
                ansLens=ansLens[:, round])
            logProbs = aBot.evalOptions(options[:, round],
                                        optionLens[:, round], scoringFunction)
            if useNDCG:
                log_probs_rounds.append(logProbs.unsqueeze(1))
            logProbsCurrent = aBot.forward()
            logProbsAll[round].append(
                scoringFunction(logProbsCurrent,
                                answers[:, round].contiguous()))
            if split != 'test':
                batchRanks = rankOptions(options[:, round],
                                         correctOptionInds[:, round], logProbs)
                ranks.append(batchRanks)

        batch['num_rounds'] = batch['num_rounds'].squeeze(1)
        output = None
        if useNDCG or split == 'test':
            output = torch.cat(log_probs_rounds, dim=1)
            ranks_cur = scores_to_ranks(output)
            for i in range(len(img_ids)):
                # Cast into types explicitly to ensure no errors in schema.
                # Round ids are 1-10, not 0-9.
                # "ranks": [rank.data[0] for rank in ranks_cur[i][batch["num_rounds"][i] - 1]]
                if split == "test":
                    ranks_json.append({
                        "image_id": img_ids[i],
                        "round_id": int(batch["num_rounds"][i]),
                        "ranks": ranks_cur[i][batch["num_rounds"][i] - 1].data.cpu().tolist()
                    })
                else:
                    for j in range(numRounds):
                        ranks_json.append({
                            "image_id": img_ids[i].data[0],
                            "round_id": int(j + 1),
                            "ranks": [rank.data[0] for rank in ranks_cur[i][j]]
                        })

        if split == "val":
            # sparse_metrics.observe(output, correctOptionInds)
            if "gt_relevance" in batch and useNDCG:
                indices = torch.arange(output.shape[0]).long().cpu().numpy()
                round_id_numpy = round_id.long().cpu().data.numpy()
                round_id_numpy = round_id_numpy.reshape(-1)
                output = output.cpu().data.numpy()
                output = output[indices, round_id_numpy - 1, :]
                output = Variable(torch.from_numpy(output), volatile=True)
                ndcg.observe(output, gtRelevance)

        end_t = timer()
        delta_t = " Rate: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Abot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()

    sys.stdout.write("\n")
    dataloader = None
    print("Sleeping for 3 seconds to let dataloader subprocesses exit...")
    dataset.split = original_split

    if split == 'test':
        # Dump the EvalAI submission file.
        dir_out = 'predictions.txt'
        json.dump(ranks_json, open(dir_out, "w"))
        return

    ranks = torch.cat(ranks, 0)
    rankMetrics = metrics.computeMetrics(ranks.cpu())

    logProbsAll = [torch.cat(lprobs, 0).mean() for lprobs in logProbsAll]
    roundwiseLogProbs = torch.cat(logProbsAll, 0).data.cpu().numpy()
    logProbsMean = roundwiseLogProbs.mean()
    rankMetrics['logProbsMean'] = logProbsMean

    if split == "val" and useNDCG:
        rankMetrics.update(ndcg.retrieve(reset=True))

    for metric_name, metric_value in rankMetrics.items():
        print(f"{metric_name}: {metric_value}")

    return rankMetrics
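# -----------------------------------------------------------------------------
# Hedged illustration (not part of the original file): how a single annotated
# round is pulled out of a (batch, rounds, options) score array before NDCG,
# mirroring the `output[indices, round_id_numpy - 1, :]` indexing in rankABot
# above. Shapes and values here are made up for the example.
# -----------------------------------------------------------------------------
import numpy as np

if __name__ == "__main__":
    batch_size, num_rounds, num_options = 2, 10, 5
    scores = np.random.rand(batch_size, num_rounds, num_options)

    # round_id is 1-based (rounds 1..10), so subtract 1 when indexing.
    round_id = np.array([3, 7])
    indices = np.arange(batch_size)

    per_round_scores = scores[indices, round_id - 1, :]
    print(per_round_scores.shape)  # (2, 5): one option distribution per image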
class Evaluation(object):
    def __init__(self, hparams, model=None, split="test"):
        self.hparams = hparams
        self.model = model
        self._logger = logging.getLogger(__name__)

        self.device = (torch.device("cuda", self.hparams.gpu_ids[0])
                       if self.hparams.gpu_ids[0] >= 0
                       else torch.device("cpu"))
        self.split = split

        do_valid, do_test = False, False
        if split == "val":
            do_valid = True
        else:
            do_test = True
        self._build_dataloader(do_valid=do_valid, do_test=do_test)
        self._dataloader = self.valid_dataloader if split == 'val' else self.test_dataloader

        if model is None:
            self._build_model()

        self.sparse_metrics = SparseGTMetrics()
        self.ndcg = NDCG()

    def _build_dataloader(self, do_valid=False, do_test=False):
        if do_valid:
            split = "train" if self.hparams.dataset_version == "0.9" else "val"
            old_split = "val" if self.hparams.dataset_version == "0.9" else None
            self.valid_dataset = VisDialDataset(self.hparams,
                                                overfit=self.hparams.overfit,
                                                split=split,
                                                old_split=old_split)
            collate_fn = None
            if "dan" in self.hparams.img_feature_type:
                collate_fn = self.valid_dataset.collate_fn

            self.valid_dataloader = DataLoader(
                self.valid_dataset,
                batch_size=self.hparams.eval_batch_size,
                num_workers=self.hparams.cpu_workers,
                drop_last=False,
                collate_fn=collate_fn)

        if do_test:
            self.test_dataset = VisDialDataset(
                self.hparams,
                overfit=self.hparams.overfit,
                split="test",
            )
            collate_fn = None
            if "dan" in self.hparams.img_feature_type:
                collate_fn = self.test_dataset.collate_fn

            self.test_dataloader = DataLoader(
                self.test_dataset,
                batch_size=self.hparams.eval_batch_size,
                num_workers=self.hparams.cpu_workers,
                drop_last=False,
                collate_fn=collate_fn)

    def _build_model(self):
        vocabulary = self.valid_dataset.vocabulary if self.split == "val" else self.test_dataset.vocabulary

        encoder = Encoder(self.hparams, vocabulary)
        decoder = Decoder(self.hparams, vocabulary)

        # Wrap encoder and decoder in a model.
        self.model = EncoderDecoderModel(encoder, decoder).to(self.device)

        # Use multiple GPUs if available.
        if -1 not in self.hparams.gpu_ids and len(self.hparams.gpu_ids) > 1:
            self.model = nn.DataParallel(self.model, self.hparams.gpu_ids)

    def run_evaluate(self,
                     evaluation_path,
                     global_iteration_step=0,
                     tb_summary_writer: SummaryWriter = None,
                     eval_json_path=None,
                     eval_seed=None):

        model_state_dict, optimizer_state_dict = load_checkpoint(evaluation_path)
        print("evaluation model loading completes! ->", evaluation_path)

        self.eval_seed = self.hparams.random_seed[0] if eval_seed is None else eval_seed

        if isinstance(self.model, nn.DataParallel):
            self.model.module.load_state_dict(model_state_dict)
        else:
            self.model.load_state_dict(model_state_dict)

        self.model.eval()
        ranks_json = []

        for i, batch in enumerate(tqdm(self._dataloader)):
            for key in batch:
                batch[key] = batch[key].to(self.device)
            with torch.no_grad():
                output = self.model(batch)

            batch_size, num_dial, _ = batch['ques'].size()
            ranks = scores_to_ranks(output)  # bs, num_dialog, num_options

            for i in range(len(batch["img_ids"])):
                # Cast into types explicitly to ensure no errors in schema.
                # Round ids are 1-10, not 0-9.
                if self.split == "test":
                    ranks_json.append({
                        "image_id": batch["img_ids"][i].item(),
                        "round_id": int(batch["num_rounds"][i].item()),
                        "ranks": [
                            rank.item()
                            for rank in ranks[i][batch["num_rounds"][i] - 1]
                        ],
                    })
                else:
                    for j in range(batch["num_rounds"][i]):
                        ranks_json.append({
                            "image_id": batch["img_ids"][i].item(),
                            "round_id": int(j + 1),
                            "ranks": [rank.item() for rank in ranks[i][j]],
                        })

            if self.split == "val":
                self.sparse_metrics.observe(output, batch["ans_ind"])
                if "gt_relevance" in batch:  # version 1.0
                    output = output[torch.arange(output.size(0)),
                                    batch["round_id"] - 1, :]
                    self.ndcg.observe(output, batch["gt_relevance"])

        if self.split == "val":
            all_metrics = {}
            all_metrics.update(self.sparse_metrics.retrieve(reset=True))
            if self.hparams.dataset_version == '1.0':
                all_metrics.update(self.ndcg.retrieve(reset=True))

            for metric_name, metric_value in all_metrics.items():
                self._logger.info(f"{metric_name}: {metric_value}")

            if tb_summary_writer:
                tb_summary_writer.add_scalars("metrics", all_metrics,
                                              global_iteration_step)

        # if not tb_summary_writer:
        print("Writing ranks to {}".format(self.hparams.root_dir))
        if eval_json_path is not None:
            json.dump(ranks_json, open(eval_json_path, "w"))
        else:
            json.dump(
                ranks_json,
                open(
                    os.path.join(
                        self.hparams.root_dir,
                        self.hparams.model_name + "_ranks_%s.json" % self.split),
                    "w"))

        if not tb_summary_writer and self.split == "val":
            for metric_name, metric_value in all_metrics.items():
                print(f"{metric_name}: {metric_value}")
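# -----------------------------------------------------------------------------
# Hedged sketch (not part of the original file): the shape of one ranks_json
# entry written above. The image id and scores are made-up toy values; the
# point is only the schema (1-based round ids, plain Python ints for every
# rank) and the explicit .item()/int() casts used before json.dump.
# -----------------------------------------------------------------------------
import json
import torch

if __name__ == "__main__":
    num_options = 5
    scores = torch.rand(num_options)
    # Rank 1 = highest-scoring option (same convention as scores_to_ranks).
    ranks = scores.argsort(descending=True).argsort() + 1

    entry = {
        "image_id": 185565,   # toy id
        "round_id": int(10),  # rounds are 1-10
        "ranks": [rank.item() for rank in ranks],
    }
    print(json.dumps([entry]))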