Example #1
    def __init__(self, hparams, model=None, split="test"):
        self.hparams = hparams
        self.model = model
        self._logger = logging.getLogger(__name__)
        self.split = split
        self.device = (torch.device("cuda", self.hparams.gpu_ids[0]) if
                       self.hparams.gpu_ids[0] >= 0 else torch.device("cpu"))

        do_valid, do_test = False, False
        if split == "val":
            do_valid = True
        else:
            do_test = True
        self._build_dataloader(do_valid=do_valid, do_test=do_test)
        self._dataloader = self.valid_dataloader if split == 'val' else self.test_dataloader

        if model is None:
            self._build_model()

        self.sparse_metrics = SparseGTMetrics()
        self.ndcg = NDCG()
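
Example #1 is the constructor of the MultiEvaluation class shown in full in Example #4. A minimal, hypothetical driver sketch; the hparams object and the checkpoint/output paths below are assumptions, only the class name and run_evaluate come from Example #4:

# Hypothetical usage sketch; `hparams` and both paths are placeholders,
# not part of the examples in this listing.
evaluation = MultiEvaluation(hparams, model=None, split="val")
evaluation.run_evaluate(
    evaluation_path="checkpoints/checkpoint_best.pth",  # assumed path
    eval_json_path="val_ranks.json",                    # assumed path
)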
Example #2
test_mode = False
if args.split == 'test':
    test_mode = True
    config['dataset']['test_feat_img_path'] = config['dataset'][
        'train_feat_img_path'].replace(
            "trainval_resnet101_faster_rcnn_genome__num_boxes",
            "test2018_resnet101_faster_rcnn_genome__num_boxes")
    config['dataset']['test_json_dialog_path'] = config['dataset'][
        'train_json_dialog_path'].replace('visdial_1.0_train.json',
                                          'visdial_1.0_test.json')

model = model.to(device)

sparse_metrics = SparseGTMetrics()
ndcg = NDCG()

dataset = VisDialDataset(config, split=args.split)
dataloader = DataLoader(dataset, batch_size=1)

model = model.eval()
ranks_json = []

for idx, batch in enumerate(tqdm(dataloader)):
    torch.cuda.empty_cache()
    for key in batch:
        batch[key] = batch[key].to(device)

    with torch.no_grad():
        output = model(batch, test_mode=test_mode)
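
The examples below turn model scores into per-option ranks with scores_to_ranks, which is imported from the project's metrics utilities and not shown in this listing. A minimal sketch of the usual double-argsort approach, assuming scores of shape (batch, rounds, options) where a higher score means a better answer:

import torch

def scores_to_ranks_sketch(scores: torch.Tensor) -> torch.Tensor:
    """Turn scores of shape (batch, rounds, options) into 1-based ranks."""
    # Sort options by descending score, then invert the permutation:
    # the highest-scoring option receives rank 1.
    sorted_idx = scores.argsort(dim=-1, descending=True)
    return sorted_idx.argsort(dim=-1) + 1

scores = torch.tensor([[[0.1, 0.7, 0.2]]])    # 1 image, 1 round, 3 options
print(scores_to_ranks_sketch(scores))         # tensor([[[3, 1, 2]]])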
Example #3
                           milestone_steps=config['solver']['milestone_steps'],
                           linear_gama=config['solver']['linear_gama'])

# =============================================================================
#   SETUP BEFORE TRAINING LOOP
# =============================================================================
summary_writer = SummaryWriter(log_dir=config['callbacks']['log_dir'])

checkpoint_manager = CheckpointManager(model,
                                       optimizer,
                                       config['callbacks']['save_dir'],
                                       config=config)
sparse_metrics = SparseGTMetrics()
disc_metrics = SparseGTMetrics()
gen_metrics = SparseGTMetrics()
ndcg = NDCG()
disc_ndcg = NDCG()
gen_ndcg = NDCG()

print("Loading checkpoints...")
start_epoch, model, optimizer = load_checkpoint_from_config(
    model, optimizer, config)

if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

# =============================================================================
#   TRAINING LOOP
# =============================================================================

iterations = len(train_dataset) // (config['solver']['batch_size'] *
Example #4
class MultiEvaluation(object):
    def __init__(self, hparams, model=None, split="test"):
        self.hparams = hparams
        self.model = model
        self._logger = logging.getLogger(__name__)
        self.split = split
        self.device = (torch.device("cuda", self.hparams.gpu_ids[0]) if
                       self.hparams.gpu_ids[0] >= 0 else torch.device("cpu"))

        do_valid, do_test = False, False
        if split == "val":
            do_valid = True
        else:
            do_test = True
        self._build_dataloader(do_valid=do_valid, do_test=do_test)
        self._dataloader = self.valid_dataloader if split == 'val' else self.test_dataloader

        if model is None:
            self._build_model()

        self.sparse_metrics = SparseGTMetrics()
        self.ndcg = NDCG()

    def _build_dataloader(self, do_valid=False, do_test=False):
        if do_valid:
            self.valid_dataset = VisDialDataset(self.hparams,
                                                overfit=self.hparams.overfit,
                                                split="val")
            collate_fn = None
            if "dan" in self.hparams.img_feature_type:
                collate_fn = self.valid_dataset.collate_fn
            self.valid_dataloader = DataLoader(
                self.valid_dataset,
                batch_size=self.hparams.eval_batch_size
                if "disc" in self.hparams.decoder else 5,
                num_workers=self.hparams.cpu_workers,
                drop_last=False,
                collate_fn=collate_fn,
            )

        if do_test:
            self.test_dataset = VisDialDataset(self.hparams,
                                               overfit=self.hparams.overfit,
                                               split="test")

            collate_fn = None
            if "dan" in self.hparams.img_feature_type:
                collate_fn = self.test_dataset.collate_fn

            self.test_dataloader = DataLoader(
                self.test_dataset,
                batch_size=self.hparams.eval_batch_size
                if "disc" in self.hparams.decoder else 5,
                num_workers=self.hparams.cpu_workers,
                drop_last=False,
                collate_fn=collate_fn)

    def _build_model(self):
        vocabulary = self.valid_dataset.vocabulary if self.split == "val" else self.test_dataset.vocabulary
        encoder = Encoder(self.hparams, vocabulary)
        disc_decoder, gen_decoder = None, None

        if "disc" in self.hparams.decoder and "disc" in self.hparams.evaluation_type:
            disc_decoder = DiscriminativeDecoder(self.hparams, vocabulary)
        if "gen" in self.hparams.decoder and "gen" in self.hparams.evaluation_type:
            gen_decoder = GenerativeDecoder(self.hparams, vocabulary)

        # Wrap encoder and decoder in a model.
        self.model = MultiEncoderDecoderModel(encoder, disc_decoder,
                                              gen_decoder).to(self.device)

        # Use Multi-GPUs
        if -1 not in self.hparams.gpu_ids and len(self.hparams.gpu_ids) > 1:
            self.model = nn.DataParallel(self.model, self.hparams.gpu_ids)

    def run_evaluate(self,
                     evaluation_path,
                     global_iteration_step=0,
                     tb_summary_writer: SummaryWriter = None,
                     eval_json_path=None,
                     eval_seed=None):

        model_state_dict, optimizer_state_dict = load_checkpoint(
            evaluation_path)
        print("evaluation model loading completes! ->", evaluation_path)

        self.eval_seed = self.hparams.random_seed[
            0] if eval_seed is None else eval_seed

        if isinstance(self.model, nn.DataParallel):
            self.model.module.load_state_dict(model_state_dict)
        else:
            self.model.load_state_dict(model_state_dict)

        print("Decoder Type : %s" % self.hparams.evaluation_type)
        self.model.eval()

        ranks_json = []
        self.prob_dist_json = []

        for i, batch in enumerate(tqdm(self._dataloader)):
            for key in batch:
                batch[key] = batch[key].to(self.device)
            with torch.no_grad():
                disc_output, gen_output = self.model(batch)

            batch_size, num_dial, _ = batch['ques'].size()
            ranks = None

            if self.hparams.evaluation_type == "disc_gen":
                if self.hparams.aggregation_type == "reciprocal":
                    disc_output = disc_output.view(batch_size, num_dial, -1)
                    disc_ranks = scores_to_ranks(disc_output)
                    gen_output = gen_output.view(batch_size, num_dial, -1)
                    gen_ranks = scores_to_ranks(gen_output)

                    # Aggregate reciprocal ranks
                    disc_reci_ranks = torch.div(
                        torch.ones_like(disc_ranks, dtype=torch.float32),
                        disc_ranks)
                    gen_reci_ranks = torch.div(
                        torch.ones_like(gen_ranks, dtype=torch.float32),
                        gen_ranks)
                    agg_reci_ranks = torch.mean(torch.stack(
                        [disc_reci_ranks, gen_reci_ranks], dim=-1),
                                                dim=-1)  # stacked: (bs, nr, 100, 2) -> mean: (bs, nr, 100)
                    ranks = scores_to_ranks(agg_reci_ranks)
                    output = agg_reci_ranks

                elif self.hparams.aggregation_type == "average":
                    # Average probability distributions
                    output = (F.log_softmax(disc_output, dim=-1) +
                              F.log_softmax(gen_output, dim=-1)) / 2
                    # output = torch.div((F.softmax(disc_output, dim=-1) + F.softmax(gen_output, dim=-1)), 2.0)
                    ranks = scores_to_ranks(output)

            elif self.hparams.evaluation_type == "disc":
                disc_output = disc_output.view(batch_size, num_dial, -1)
                disc_ranks = scores_to_ranks(disc_output)
                ranks = disc_ranks
                output = disc_output

            else:
                gen_output = gen_output.view(batch_size, num_dial, -1)
                gen_ranks = scores_to_ranks(gen_output)
                ranks = gen_ranks
                output = gen_output

            for i in range(len(batch["img_ids"])):
                # Cast into types explicitly to ensure no errors in schema.
                # Round ids are 1-10, not 0-9
                if self.split == "test":
                    ranks_json.append({
                        "image_id":
                        batch["img_ids"][i].item(),
                        "round_id":
                        int(batch["num_rounds"][i].item()),
                        "ranks": [
                            rank.item()
                            for rank in ranks[i][batch["num_rounds"][i] - 1]
                        ],
                    })
                else:
                    for j in range(batch["num_rounds"][i]):
                        ranks_json.append({
                            "image_id":
                            batch["img_ids"][i].item(),
                            "round_id":
                            int(j + 1),
                            "ranks": [rank.item() for rank in ranks[i][j]],
                        })

            if self.split == "val":
                self.sparse_metrics.observe(output, batch["ans_ind"])
                if "gt_relevance" in batch:  # version 1.0
                    output = output[torch.arange(output.size(0)),
                                    batch["round_id"] - 1, :]
                    self.ndcg.observe(output, batch["gt_relevance"])

        if self.split == "val":
            all_metrics = {}
            all_metrics.update(self.sparse_metrics.retrieve(reset=True))
            if self.hparams.dataset_version == '1.0':
                all_metrics.update(self.ndcg.retrieve(reset=True))

            for metric_name, metric_value in all_metrics.items():
                self._logger.info(f"{metric_name}: {metric_value}")

            if tb_summary_writer:
                tb_summary_writer.add_scalars("metrics", all_metrics,
                                              global_iteration_step)

        # if not tb_summary_writer:
        print("Writing ranks to {}".format(self.hparams.root_dir))
        if eval_json_path is not None:
            json.dump(ranks_json, open(eval_json_path, "w"))
        else:
            json.dump(
                ranks_json,
                open(
                    os.path.join(
                        self.hparams.root_dir, self.hparams.model_name +
                        "_ranks_%s.json" % self.split), "w"))

        if not tb_summary_writer and self.split == "val":
            for metric_name, metric_value in all_metrics.items():
                print(f"{metric_name}: {metric_value}")
Example #5
def rankABot(aBot, dataset, split, scoringFunction, exampleLimit=None, useNDCG=False):
    '''
        Evaluate A-Bot performance on ranking answer option when it is
        shown ground truth image features, captions and questions.

        Arguments:
            aBot    : A-Bot
            dataset : VisDialDataset instance
            split   : Dataset split, can be 'val' or 'test'

            scoringFunction : A function which computes negative log
                              likelihood of a sequence (answer) given log
                              probabilities under an RNN model. Currently
                              utils.maskedNll is the only such function used.
            exampleLimit    : Maximum number of data points to use from
                              the dataset split. If None, all data points.
            useNDCG         : If True, additionally compute NDCG from the
                              dense 'gt_relevance' annotations (VisDial v1.0 val).
    '''

    batchSize = dataset.batchSize
    numRounds = dataset.numRounds
    if exampleLimit is None:
        numExamples = dataset.numDataPoints[split]
    else:
        numExamples = exampleLimit

    numBatches = (numExamples - 1) // batchSize + 1

    original_split = dataset.split
    dataset.split = split
    dataloader = DataLoader(
        dataset,
        batch_size=batchSize,
        shuffle=True,
        num_workers=1,
        collate_fn=dataset.collate_fn)

    # sparse_metrics = SparseGTMetrics()
    ndcg = None
    if useNDCG:
        ndcg = NDCG()
    ranks_json = []

    totalLoss, totalTokens = 0, 0
    ranks = []
    logProbsAll = [[] for _ in range(numRounds)]
    start_t = timer()

    getImgFileName = lambda x: dataset.data['%s_img_fnames' % split][x]
    getImgId = lambda x: int(getImgFileName(x)[:-4][-12:])

    for idx, batch in enumerate(dataloader):
        if idx == numBatches:
            break

        if dataset.useGPU:
            batch = {
                key: v.cuda() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }
        else:
            batch = {
                key: v.contiguous() if hasattr(v, 'cuda') else v
                for key, v in batch.items()
            }

        image = Variable(batch['img_feat'], volatile=True)
        caption = Variable(batch['cap'], volatile=True)
        captionLens = Variable(batch['cap_len'], volatile=True)
        questions = Variable(batch['ques'], volatile=True)
        quesLens = Variable(batch['ques_len'], volatile=True)
        answers = Variable(batch['ans'], volatile=True)
        ansLens = Variable(batch['ans_len'], volatile=True)
        options = Variable(batch['opt'], volatile=True)
        optionLens = Variable(batch['opt_len'], volatile=True)

        gtRelevance = None
        round_id = None
        img_ids = None
        correctOptionInds = None

        if split != 'test':
            correctOptionInds = Variable(batch['ans_id'], volatile=True)

        if split == 'val' and useNDCG:
            # read in gtRelevance and round
            gtRelevance = Variable(batch['gt_relevance'], volatile=True)
            round_id = Variable(batch['round_id'], volatile=True)
            img_ids = Variable(batch['image_id'], volatile=True)

        if split == 'test':
            img_ids = [getImgId(x) for x in batch['index']]

        aBot.reset()
        aBot.observe(-1, image=image, caption=caption, captionLens=captionLens)
        log_probs_rounds = []
        for round in range(numRounds):
            aBot.observe(
                round,
                ques=questions[:, round],
                quesLens=quesLens[:, round],
                ans=answers[:, round],
                ansLens=ansLens[:, round])
            logProbs = aBot.evalOptions(options[:, round],
                                    optionLens[:, round], scoringFunction)
            if useNDCG:
                log_probs_rounds.append(logProbs.unsqueeze(1))
            logProbsCurrent = aBot.forward()
            logProbsAll[round].append(
                scoringFunction(logProbsCurrent,
                                answers[:, round].contiguous()))
            if split != 'test':
                batchRanks = rankOptions(options[:, round],
                                         correctOptionInds[:, round], logProbs)
                ranks.append(batchRanks)
        batch['num_rounds'] = batch['num_rounds'].squeeze(1)
        output = None
        if useNDCG or split == 'test':
            output = torch.cat(log_probs_rounds, dim=1)
            ranks_cur = scores_to_ranks(output)

            for i in range(len(img_ids)):
                # cast into types explicitly to ensure no errors in schema
                # round ids are 1-10, not 0-9
                # "ranks": [rank.data[0] for rank in ranks_cur[i][batch["num_rounds"][i] - 1]]

                if split == "test":
                    ranks_json.append({
                        "image_id": img_ids[i],
                        "round_id": int(batch["num_rounds"][i]),
                        "ranks": ranks_cur[i][batch["num_rounds"][i] - 1].data.cpu().tolist()
                    })
                else:
                    for j in range(numRounds):
                        ranks_json.append({
                            "image_id": img_ids[i].data[0],
                            "round_id": int(j + 1),
                            "ranks": [rank.data[0] for rank in ranks_cur[i][j]]
                        })

        if split == "val":
            # sparse_metrics.observe(output, correctOptionInds)
            if "gt_relevance" in batch and useNDCG:
                indices = torch.arange(output.shape[0]).long().cpu().numpy()
                round_id_numpy = round_id.long().cpu().data.numpy()
                round_id_numpy = round_id_numpy.reshape(-1)
                output = output.cpu().data.numpy()
                output = output[indices, round_id_numpy - 1, :]
                output = Variable(torch.from_numpy(output), volatile=True)
                ndcg.observe(output, gtRelevance)

        end_t = timer()
        delta_t = " Rate: %5.2fs" % (end_t - start_t)
        start_t = end_t
        progressString = "\r[Abot] Evaluating split '%s' [%d/%d]\t" + delta_t
        sys.stdout.write(progressString % (split, idx + 1, numBatches))
        sys.stdout.flush()

    sys.stdout.write("\n")
    dataloader = None
    print("Sleeping for 3 seconds to let dataloader subprocesses exit...")
    dataset.split = original_split

    if split == 'test':
        # dump eval AI file
        dir_out = 'predictions.txt'
        json.dump(ranks_json, open(dir_out, "w"))
        return

    ranks = torch.cat(ranks, 0)
    rankMetrics = metrics.computeMetrics(ranks.cpu())

    logProbsAll = [torch.cat(lprobs, 0).mean() for lprobs in logProbsAll]
    roundwiseLogProbs = torch.cat(logProbsAll, 0).data.cpu().numpy()
    logProbsMean = roundwiseLogProbs.mean()
    rankMetrics['logProbsMean'] = logProbsMean

    if split == "val" and useNDCG:
        rankMetrics.update(ndcg.retrieve(reset=True))
        for metric_name, metric_value in rankMetrics.items():
            print(f"{metric_name}: {metric_value}")
    return rankMetrics
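
metrics.computeMetrics is called above on the flat tensor of ground-truth ranks but is not part of this listing. A minimal sketch of the retrieval metrics such a helper typically reports (R@1/5/10, mean rank, mean reciprocal rank), assuming `ranks` holds the 1-based ranks of the correct options:

import torch

def compute_rank_metrics_sketch(ranks: torch.Tensor) -> dict:
    """ranks: 1-D tensor with the 1-based rank of each ground-truth answer."""
    ranks = ranks.float()
    return {
        "r@1": (ranks <= 1).float().mean().item(),
        "r@5": (ranks <= 5).float().mean().item(),
        "r@10": (ranks <= 10).float().mean().item(),
        "mean": ranks.mean().item(),
        "mrr": (1.0 / ranks).mean().item(),
    }

print(compute_rank_metrics_sketch(torch.tensor([1., 4., 12., 2.])))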
Example #6
class Evaluation(object):
    def __init__(self, hparams, model=None, split="test"):
        self.hparams = hparams
        self.model = model
        self._logger = logging.getLogger(__name__)
        self.device = (torch.device("cuda", self.hparams.gpu_ids[0]) if
                       self.hparams.gpu_ids[0] >= 0 else torch.device("cpu"))
        self.split = split

        do_valid, do_test = False, False
        if split == "val":
            do_valid = True
        else:
            do_test = True
        self._build_dataloader(do_valid=do_valid, do_test=do_test)
        self._dataloader = self.valid_dataloader if split == 'val' else self.test_dataloader

        if model is None:
            self._build_model()

        self.sparse_metrics = SparseGTMetrics()
        self.ndcg = NDCG()

    def _build_dataloader(self, do_valid=False, do_test=False):
        if do_valid:
            split = "train" if self.hparams.dataset_version == "0.9" else "val"
            old_split = "val" if self.hparams.dataset_version == "0.9" else None

            self.valid_dataset = VisDialDataset(self.hparams,
                                                overfit=self.hparams.overfit,
                                                split=split,
                                                old_split=old_split)

            collate_fn = None
            if "dan" in self.hparams.img_feature_type:
                collate_fn = self.valid_dataset.collate_fn

            self.valid_dataloader = DataLoader(
                self.valid_dataset,
                batch_size=self.hparams.eval_batch_size,
                num_workers=self.hparams.cpu_workers,
                drop_last=False,
                collate_fn=collate_fn)

        if do_test:
            self.test_dataset = VisDialDataset(
                self.hparams,
                overfit=self.hparams.overfit,
                split="test",
            )

            collate_fn = None
            if "dan" in self.hparams.img_feature_type:
                collate_fn = self.test_dataset.collate_fn

            self.test_dataloader = DataLoader(
                self.test_dataset,
                batch_size=self.hparams.eval_batch_size,
                num_workers=self.hparams.cpu_workers,
                drop_last=False,
                collate_fn=collate_fn)

    def _build_model(self):
        vocabulary = self.valid_dataset.vocabulary if self.split == "val" else self.test_dataset.vocabulary
        encoder = Encoder(self.hparams, vocabulary)
        decoder = Decoder(self.hparams, vocabulary)

        # Wrap encoder and decoder in a model.
        self.model = EncoderDecoderModel(encoder, decoder).to(self.device)

        # Use Multi-GPUs
        if -1 not in self.hparams.gpu_ids and len(self.hparams.gpu_ids) > 1:
            self.model = nn.DataParallel(self.model, self.hparams.gpu_ids)

    def run_evaluate(self,
                     evaluation_path,
                     global_iteration_step=0,
                     tb_summary_writer: SummaryWriter = None,
                     eval_json_path=None,
                     eval_seed=None):

        model_state_dict, optimizer_state_dict = load_checkpoint(
            evaluation_path)
        print("evaluation model loading completes! ->", evaluation_path)

        self.eval_seed = self.hparams.random_seed[
            0] if eval_seed is None else eval_seed

        if isinstance(self.model, nn.DataParallel):
            self.model.module.load_state_dict(model_state_dict)
        else:
            self.model.load_state_dict(model_state_dict)

        self.model.eval()
        ranks_json = []
        for i, batch in enumerate(tqdm(self._dataloader)):
            for key in batch:
                batch[key] = batch[key].to(self.device)
            with torch.no_grad():
                output = self.model(batch)
            batch_size, num_dial, _ = batch['ques'].size()
            ranks = scores_to_ranks(output)  # bs, num_dialog, num_options

            for i in range(len(batch["img_ids"])):
                # Cast into types explicitly to ensure no errors in schema.
                # Round ids are 1-10, not 0-9
                if self.split == "test":
                    ranks_json.append({
                        "image_id":
                        batch["img_ids"][i].item(),
                        "round_id":
                        int(batch["num_rounds"][i].item()),
                        "ranks": [
                            rank.item()
                            for rank in ranks[i][batch["num_rounds"][i] - 1]
                        ],
                    })
                else:
                    for j in range(batch["num_rounds"][i]):
                        ranks_json.append({
                            "image_id":
                            batch["img_ids"][i].item(),
                            "round_id":
                            int(j + 1),
                            "ranks": [rank.item() for rank in ranks[i][j]],
                        })

            if self.split == "val":
                self.sparse_metrics.observe(output, batch["ans_ind"])
                if "gt_relevance" in batch:  # version 1.0
                    output = output[torch.arange(output.size(0)),
                                    batch["round_id"] - 1, :]
                    self.ndcg.observe(output, batch["gt_relevance"])

        if self.split == "val":
            all_metrics = {}
            all_metrics.update(self.sparse_metrics.retrieve(reset=True))
            if self.hparams.dataset_version == '1.0':
                all_metrics.update(self.ndcg.retrieve(reset=True))

            for metric_name, metric_value in all_metrics.items():
                self._logger.info(f"{metric_name}: {metric_value}")

            if tb_summary_writer:
                tb_summary_writer.add_scalars("metrics", all_metrics,
                                              global_iteration_step)

        # if not tb_summary_writer:
        print("Writing ranks to {}".format(self.hparams.root_dir))
        if eval_json_path is not None:
            json.dump(ranks_json, open(eval_json_path, "w"))
        else:
            json.dump(
                ranks_json,
                open(
                    os.path.join(
                        self.hparams.root_dir, self.hparams.model_name +
                        "_ranks_%s.json" % self.split), "w"))

        if not tb_summary_writer and self.split == "val":
            for metric_name, metric_value in all_metrics.items():
                print(f"{metric_name}: {metric_value}")