Example #1
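A minimal import header under which this example should run; the stdlib modules are certain, while the evaluation and decoding helpers are project-specific names assumed to live elsewhere in the repository (the later examples assume similar headers plus torch, numpy, json, and tqdm):

import math
import os
import pickle
import time

# project-specific modules (assumed import paths; adjust to the actual repo layout)
# import lib.capeval.bleu as capbleu
# import lib.capeval.cider as capcider
# from model import Decoder
# from utils import decode_outputs, decode_attention_outputs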
def generate(encoder, decoder, dataloader, dict_idx2word, references, caption_root):
    print("generating captions...")
    candidates = {
        'train': {},
        'val': {},
        'test': {}
    }
    outputs = {
        'train': {},
        'val': {},
        'test': {}
    }
    for phase in ["train", "val", "test"]:
        print("\ngenerating {}...".format(phase))
        for batch_id, (model_ids, captions, embeddings, embeddings_interm, lengths) in enumerate(dataloader[phase]):
            start = time.time()
            if isinstance(decoder, Decoder):
                visual_inputs = embeddings.cuda()
            else:
                visual_inputs = embeddings_interm.cuda()
            caption_inputs = captions[:, :-1].cuda()
            cap_lengths = lengths.cuda()
            visual_contexts = encoder(visual_inputs)
            max_length = int(cap_lengths[0].item()) + 10
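            # cap_lengths[0] is presumably the longest caption in the (sorted) batch;
            # generation is allowed to run up to 10 tokens beyond it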
            if isinstance(decoder, Decoder):
                captions = decoder.beam_search(visual_contexts, 1, max_length)
                captions = decode_outputs(captions, None, dict_idx2word, "val")
            else:
                captions = decoder.beam_search(visual_contexts, caption_inputs, 1, max_length)
                captions = decode_attention_outputs(captions, None, dict_idx2word, "val")
            for model_id, caption in zip(model_ids, captions):
                if model_id not in candidates[phase]:
                    candidates[phase][model_id] = [caption]
                else:
                    candidates[phase][model_id].append(caption)
            
            exe_s = time.time() - start
            eta_s = exe_s * (len(dataloader[phase]) - (batch_id + 1))
            eta_m = math.floor(eta_s / 60)
            eta_s = math.floor(eta_s % 60)
            print("generated: {}/{}, ETA: {}m {}s".format(batch_id + 1, len(dataloader[phase]), eta_m, eta_s))

        print("\ncomputing metrics for {}...".format(phase))
        keys = list(references[phase].keys())
        _, bleu = capbleu.Bleu(4).compute_score(references[phase], candidates[phase])
        _, cider = capcider.Cider().compute_score(references[phase], candidates[phase])
        for i, key in enumerate(keys):
            outputs[phase][key] = (
                candidates[phase][key][0],
                [bleu[j][i] for j in range(4)],
                cider[i]
            )

    # save results
    print("\nsaving captions...")
    output_path = os.path.join(caption_root, "caption.p")
    with open(output_path, 'wb') as f:
        pickle.dump(outputs, f)
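
For reference, the scorers used above appear to follow the coco-caption convention: both `references` and `candidates` map an id to a list of caption strings, and `compute_score` returns an aggregate score plus per-id scores. A minimal sketch of the expected shapes (the ids and captions are made up for illustration):

references = {"chair_001": ["a wooden chair with four legs", "a brown chair"]}
candidates = {"chair_001": ["a wooden chair"]}
# aggregate, per_id = capbleu.Bleu(4).compute_score(references, candidates)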
Example #2
def main(args):
    embedding, keys, key2label, label2key = get_embedding(
        args.path, args.phase)
    shape_embeddings, shape_labels, text_embeddings, text_labels, text_raw = decode_embedding(
        embedding, keys, key2label)
    reference = get_reference(embedding)
    sim = shape_embeddings.dot(text_embeddings.T)
    candidate = get_candidate(sim, label2key, text_raw)

    # compute metrics
    print("computing metrics\n")
    bleu, _ = capbleu.Bleu(4).compute_score(reference, candidate)
    cider, _ = capcider.Cider().compute_score(reference, candidate)
    # report
    print("[BLEU-1]: {}".format(bleu[0]))
    print("[BLEU-2]: {}".format(bleu[1]))
    print("[BLEU-3]: {}".format(bleu[2]))
    print("[BLEU-4]: {}".format(bleu[3]))
    print("[CIDEr]: {}".format(cider))
Example #3
def eval_cap_pointgroup(model,
                        cfg,
                        epoch,
                        dataset,
                        dataloader,
                        no_detection=False,
                        no_caption=False,
                        force=True,
                        max_len=CONF.TRAIN.MAX_DES_LEN,
                        min_iou=0.25,
                        task='train'):

    am_dict = {}

    if no_caption:
        with torch.no_grad():
            #model.eval()
            #start_epoch = time.time()
            for data_dict in tqdm(dataloader):

                # move to cuda
                for key in data_dict:
                    if isinstance(data_dict[key], torch.Tensor):
                        data_dict[key] = data_dict[key].cuda()

                ##### prepare input and forward
                # is_eval=False does not matter here: the captioning module won't be used
                data_dict = model(data_dict,
                                  epoch,
                                  use_tf=False,
                                  is_eval=False)

                loss, loss_dict, visual_dict, meter_dict = get_pointgroup_cap_loss(
                    data_dict,
                    cfg,
                    epoch,
                    detection=not no_detection,
                    caption=False)

                ##### meter_dict
                for k, v in meter_dict.items():
                    if k not in am_dict.keys():
                        am_dict[k] = utils_pointgroup.AverageMeter()
                    am_dict[k].update(v[0], v[1])
    else:

        candidates, meter_dict, visual_dict = feed_pointgroup_cap(
            model, cfg, epoch, dataset, dataloader, no_detection, min_iou)
        ##TODO: equivalent steps of feed_scene_cap()

        if epoch > cfg.prepare_epochs:
            # corpus
            corpus_path = os.path.join(
                cfg.exp_path, "epoch{}".format(epoch) + "_corpus_val.json")
            if not os.path.exists(corpus_path) or force:
                print("preparing corpus...")
                corpus = prepare_corpus(dataset.val_data, max_len)
                with open(corpus_path, "w") as f:
                    json.dump(corpus, f, indent=4)
            else:
                print("loading corpus...")
                with open(corpus_path) as f:
                    corpus = json.load(f)

            pred_path = os.path.join(
                cfg.exp_path, "epoch{}".format(epoch) + "_pred_val.json")
            # check candidates
            # NOTE: make up the captions for the undetected object by "sos eos"
            candidates = check_candidates(corpus, candidates)

            candidates = organize_candidates(corpus, candidates)

            with open(pred_path, "w") as f:
                json.dump(candidates, f, indent=4)

            # compute scores
            print("computing scores...")
            bleu = capblue.Bleu(4).compute_score(corpus, candidates)
            cider = capcider.Cider().compute_score(corpus, candidates)
            rouge = caprouge.Rouge().compute_score(corpus, candidates)
            meteor = capmeteor.Meteor().compute_score(corpus, candidates)
            visual_dict['bleu'] = bleu[0][3]
            visual_dict['cider'] = cider[0]
            visual_dict['rouge'] = rouge[0]
            visual_dict['meteor'] = meteor[0]
            if 'bleu' not in am_dict.keys():
                am_dict['bleu'] = utils_pointgroup.AverageMeter()
                am_dict['cider'] = utils_pointgroup.AverageMeter()
                am_dict['rouge'] = utils_pointgroup.AverageMeter()
                am_dict['meteor'] = utils_pointgroup.AverageMeter()
            am_dict['bleu'].update(bleu[0][3], 1)
            am_dict['cider'].update(cider[0], 1)
            am_dict['rouge'].update(rouge[0], 1)
            am_dict['meteor'].update(meteor[0], 1)
        ##### meter_dict
        for k, v in meter_dict.items():
            if k not in am_dict.keys():
                am_dict[k] = utils_pointgroup.AverageMeter()
            am_dict[k].update(v[0], v[1])

    if task == 'train':
        return am_dict, visual_dict
    elif task == 'eval':
        # NOTE: the scores below are only defined when no_caption=False and
        # epoch > cfg.prepare_epochs
        return bleu, cider, rouge, meteor
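
`utils_pointgroup.AverageMeter` is not shown in this listing; a minimal sketch of the behavior the `update(val, n)` calls above assume:

class AverageMeter:
    """Running average; a guess at the interface used above."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n

    @property
    def avg(self):
        return self.sum / max(self.count, 1)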
Example #4
def eval_cap(model,
             device,
             dataset,
             dataloader,
             phase,
             folder,
             use_tf=False,
             is_eval=True,
             max_len=CONF.TRAIN.MAX_DES_LEN,
             force=False,
             mode="scene",
             save_interm=False,
             no_caption=False,
             no_classify=False,
             min_iou=CONF.TRAIN.MIN_IOU_THRESHOLD):
    if no_caption:
        bleu = 0
        cider = 0
        rouge = 0
        meteor = 0

        if no_classify:
            cls_acc = 0
        else:
            print("evaluating classification accuracy...")
            cls_acc = []
            for data_dict in tqdm(dataloader):
                # move to cuda
                for key in data_dict:
                    data_dict[key] = data_dict[key].to(device)

                with torch.no_grad():
                    data_dict = model(data_dict,
                                      use_tf=use_tf,
                                      is_eval=is_eval)

                # unpack
                preds = data_dict["enc_preds"]  # (B, num_cls)
                targets = data_dict["object_cat"]  # (B,)

                # classification acc
                preds = preds.argmax(-1)  # (B,)
                acc = (preds == targets).sum().float() / targets.shape[0]

                # dump
                cls_acc.append(acc.item())

            cls_acc = np.mean(cls_acc)
    else:
        # corpus
        corpus_path = os.path.join(CONF.PATH.OUTPUT, folder,
                                   "corpus_{}.json".format(phase))
        if not os.path.exists(corpus_path) or force:
            print("preparing corpus...")
            corpus = prepare_corpus(dataset.scanrefer, max_len)
            with open(corpus_path, "w") as f:
                json.dump(corpus, f, indent=4)
        else:
            print("loading corpus...")
            with open(corpus_path) as f:
                corpus = json.load(f)

        pred_path = os.path.join(CONF.PATH.OUTPUT, folder,
                                 "pred_{}.json".format(phase))
        # if not os.path.exists(pred_path) or force:
        # generate results
        print("generating descriptions...")
        if mode == "scene":
            candidates = feed_scene_cap(model, device, dataset, dataloader,
                                        phase, folder, use_tf, is_eval,
                                        max_len, save_interm, min_iou)
        elif mode == "object":
            pass
            #candidates, cls_acc = feed_object_cap(model, device, dataset, dataloader, phase, folder, use_tf, is_eval, max_len)
        elif mode == "oracle":
            pass
            #candidates = feed_oracle_cap(model, device, dataset, dataloader, phase, folder, use_tf, is_eval, max_len)
        else:
            raise ValueError("invalid mode: {}".format(mode))

        # check candidates
        # NOTE: make up the captions for the undetected object by "sos eos"
        candidates = check_candidates(corpus, candidates)

        candidates = organize_candidates(corpus, candidates)

        with open(pred_path, "w") as f:
            json.dump(candidates, f, indent=4)

        # compute scores
        print("computing scores...")
        bleu = capblue.Bleu(4).compute_score(corpus, candidates)
        cider = capcider.Cider().compute_score(corpus, candidates)
        rouge = caprouge.Rouge().compute_score(corpus, candidates)
        meteor = capmeteor.Meteor().compute_score(corpus, candidates)

        # update intermediates
        if save_interm:
            print("updating intermediate results...")
            interm_path = os.path.join(CONF.PATH.OUTPUT, folder, "interm.json")
            with open(interm_path) as f:
                interm = json.load(f)

            interm = update_interm(interm, candidates, bleu, cider, rouge,
                                   meteor)

            with open(interm_path, "w") as f:
                json.dump(interm, f, indent=4)

    if mode == "scene" or mode == "oracle":
        return bleu, cider, rouge, meteor
    else:
        return bleu, cider, rouge, meteor, cls_acc
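
The classification-accuracy block above reduces to an argmax comparison between logits and labels; a self-contained version with random tensors (batch size and class count are illustrative):

import torch

preds = torch.randn(4, 10)             # (B, num_cls) logits
targets = torch.randint(0, 10, (4,))   # (B,) ground-truth labels
acc = (preds.argmax(-1) == targets).float().mean().item()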
Example #5
def evaluate(encoder, decoder, dataloader, dict_idx2word, references,
             output_root):
    encoder.eval()
    decoder.eval()
    beam_size = ['1', '3', '5', '7']
    candidates = {i: {} for i in beam_size}
    outputs = {i: {} for i in beam_size}
    bleu = {i: {} for i in beam_size}
    cider = {i: {} for i in beam_size}
    rouge = {i: {} for i in beam_size}
    result_path = os.path.join(output_root, "results.json")
    if os.path.exists(result_path):
        print("loading existing results...")
        print()
        with open(result_path, 'r') as f:
            candidates = json.load(f)
    else:
        print("evaluating with beam search...")
        print()
        for _, (model_ids, captions, embeddings, embeddings_interm,
                lengths) in enumerate(dataloader[CONF.CAP.EVAL_DATASET]):
            if CONF.CAP.ATTN == 'fc':
                visual_inputs = embeddings.cuda()
            else:
                visual_inputs = embeddings_interm.cuda()
            caption_inputs = captions[:, :-1].cuda()
            cap_lengths = lengths.cuda()
            visual_contexts = encoder(visual_inputs)
            max_length = int(cap_lengths[0].item()) + 10
            for bs in beam_size:
                if CONF.CAP.ATTN == 'fc':
                    outputs[bs] = decoder.beam_search(visual_contexts, int(bs),
                                                      max_length)
                    outputs[bs] = decode_outputs(outputs[bs], None,
                                                 dict_idx2word, "val")
                else:
                    outputs[bs] = decoder.beam_search(visual_contexts,
                                                      caption_inputs, int(bs),
                                                      max_length)
                    outputs[bs] = decode_attention_outputs(
                        outputs[bs], None, dict_idx2word, "val")
                for model_id, output in zip(model_ids, outputs[bs]):
                    if model_id not in candidates[bs].keys():
                        candidates[bs][model_id] = [output]
                    else:
                        candidates[bs][model_id].append(output)
        # save results
        with open(result_path, 'w') as f:
            json.dump(candidates, f)

    score_path = os.path.join(output_root, "scores.txt")
    with open(score_path, 'w') as f:
        for bs in beam_size:
            # compute
            bleu[bs] = capbleu.Bleu(4).compute_score(
                references[CONF.CAP.EVAL_DATASET], candidates[bs])
            cider[bs] = capcider.Cider().compute_score(
                references[CONF.CAP.EVAL_DATASET], candidates[bs])
            rouge[bs] = caprouge.Rouge().compute_score(
                references[CONF.CAP.EVAL_DATASET], candidates[bs])
            # build the report once, then print it and write it to the score file
            lines = ["----------------------Beam_size: {}-----------------------".format(bs)]
            for i in range(4):
                lines.append("[BLEU-{}] Mean: {:.4f}, Max: {:.4f}, Min: {:.4f}".format(
                    i + 1, bleu[bs][0][i], max(bleu[bs][1][i]), min(bleu[bs][1][i])))
            lines.append("[CIDEr] Mean: {:.4f}, Max: {:.4f}, Min: {:.4f}".format(
                cider[bs][0], max(cider[bs][1]), min(cider[bs][1])))
            lines.append("[ROUGE-L] Mean: {:.4f}, Max: {:.4f}, Min: {:.4f}".format(
                rouge[bs][0], max(rouge[bs][1]), min(rouge[bs][1])))
            report = "\n".join(lines)
            print(report)
            print()
            f.write(report + "\n\n")
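
This example, like the two before it, caches expensive results as JSON and reloads them on later runs. A standalone version of that compute-or-load pattern (the file name is hypothetical):

import json
import os

cache_path = "results_cache.json"  # hypothetical cache location
if os.path.exists(cache_path):
    with open(cache_path) as f:
        results = json.load(f)
else:
    results = {"example_id": ["a generated caption"]}  # stand-in for the expensive step
    with open(cache_path, "w") as f:
        json.dump(results, f, indent=4)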
Example #6
    def train(self,
              encoder,
              decoder,
              dataloader,
              references,
              dict_word2idx,
              dict_idx2word,
              epoch,
              verbose,
              attention,
              beam_size=1):
        scheduler = ReduceLROnPlateau(self.optimizer,
                                      factor=0.8,
                                      patience=5,
                                      threshold=0.001)
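        # NOTE: ReduceLROnPlateau defaults to mode='min', but it is stepped below
        # with val_cider, where higher is better; mode='max' is likely intended.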
        # scheduler = StepLR(self.optimizer, gamma=0.8, step_size=3)
        best_info = {
            'epoch_id': 0,
            'loss': 0,
        }
        best_scores = {
            'bleu_1': 0.0,
            'bleu_2': 0.0,
            'bleu_3': 0.0,
            'bleu_4': 0.0,
            'cider': 0.0,
            'rouge': 0.0,
        }
        best_models = {
            'encoder': None,
            'decoder': None,
        }
        for epoch_id in range(epoch):
            print("epoch [{}/{}] starting...\n".format(epoch_id + 1, epoch))
            # scheduler.step()
            log = {
                'train_loss': [],
                'train_perplexity': [],
                'train_bleu_1': [],
                'train_bleu_2': [],
                'train_bleu_3': [],
                'train_bleu_4': [],
                'val_loss': [],
                'val_bleu_1': [],
                'val_bleu_2': [],
                'val_bleu_3': [],
                'val_bleu_4': [],
                'train_cider': [],
                'val_cider': [],
                # 'train_meteor': [],
                # 'val_meteor': [],
                'train_rouge': [],
                'val_rouge': [],
                'forward': [],
                'backward': [],
                'val_time': [],
                'eval_time': [],
                'epoch_time': []
            }
            candidates = {'train': {}, 'val': {}}
            start = time.time()
            for phase in ["train", "val"]:
                total_iter = len(dataloader[phase])
                for iter_id, (model_ids, captions, embeddings,
                              embeddings_interm,
                              lengths) in enumerate(dataloader[phase]):
                    # decoder without attention
                    if attention == "fc":
                        visual_inputs = embeddings.cuda()
                        caption_inputs = captions[:, :-1].cuda()
                        caption_targets = captions.cuda()
                        cap_lengths = lengths.cuda()

                        if phase == "train":
                            encoder.train()
                            decoder.train()
                            self.optimizer.zero_grad()
                            # forward pass
                            forward_since = time.time()
                            visual_contexts = encoder(visual_inputs)
                            # teacher forcing
                            states = decoder.init_hidden(visual_contexts)
                            outputs = decoder(visual_contexts, caption_inputs,
                                              states)
                            # # no teacher forcing
                            # outputs = decoder.sample(visual_contexts, cap_lengths)
                            outputs_packed = pack_padded_sequence(
                                outputs, [l for l in cap_lengths],
                                batch_first=True)[0]
                            targets = pack_padded_sequence(
                                caption_targets, [l for l in cap_lengths],
                                batch_first=True)[0]
                            loss = self.criterion(outputs_packed, targets)

                            # decode outputs
                            outputs = decode_outputs(outputs, cap_lengths,
                                                     dict_idx2word, phase)
                            # save to candidates
                            for model_id, output in zip(model_ids, outputs):
                                if model_id not in candidates[phase].keys():
                                    candidates[phase][model_id] = [output]
                                else:
                                    candidates[phase][model_id].append(output)
                            log['forward'].append(time.time() - forward_since)

                            # backward pass
                            # save log
                            backward_since = time.time()
                            loss.backward()
                            # clip gradients; NOTE: torch's clip_grad_value_ expects an
                            # iterable of parameters, not the optimizer itself
                            clip_grad_value_(
                                (p for group in self.optimizer.param_groups
                                 for p in group["params"]), 5)
                            self.optimizer.step()
                            log['backward'].append(time.time() -
                                                   backward_since)
                            log['train_loss'].append(loss.item())
                            log['train_perplexity'].append(np.exp(loss.item()))

                            # report
                            if verbose and (iter_id + 1) % verbose == 0:
                                print(
                                    "Epoch: [{}/{}] Iter: [{}/{}] train_loss: {:.4f} perplexity: {:.4f}"
                                    .format(epoch_id + 1, epoch, iter_id + 1,
                                            total_iter, log['train_loss'][-1],
                                            log['train_perplexity'][-1]))
                        else:
                            # validate
                            encoder.eval()
                            decoder.eval()
                            val_since = time.time()
                            visual_contexts = encoder(visual_inputs)
                            max_length = int(cap_lengths[0].item()) + 10
                            outputs = decoder.beam_search(
                                visual_contexts, beam_size, max_length)

                            # decode outputs
                            outputs = decode_outputs(outputs, None,
                                                     dict_idx2word, phase)
                            # save to candidates
                            for model_id, output in zip(model_ids, outputs):
                                if model_id not in candidates[phase].keys():
                                    candidates[phase][model_id] = [output]
                                else:
                                    candidates[phase][model_id].append(output)

                            # save log
                            log['val_time'].append(time.time() - val_since)

                    else:
                        visual_inputs = embeddings_interm.cuda()
                        caption_inputs = captions[:, :-1].cuda()
                        caption_targets = captions[:, 1:].cuda()
                        cap_lengths = lengths.cuda()

                        if phase == "train":
                            encoder.train()
                            decoder.train()
                            self.optimizer.zero_grad()
                            # forward pass
                            forward_since = time.time()
                            visual_contexts = encoder(visual_inputs)
                            # visual_contexts = (batch_size, visual_channels, visual_size, visual_size)
                            # teacher forcing
                            states = decoder.init_hidden(visual_contexts[0])
                            outputs = decoder(visual_contexts, caption_inputs,
                                              states)

                            outputs_packed = pack_padded_sequence(
                                outputs, [l - 1 for l in cap_lengths],
                                batch_first=True)[0]
                            targets = pack_padded_sequence(
                                caption_targets, [l - 1 for l in cap_lengths],
                                batch_first=True)[0]
                            loss = self.criterion(outputs_packed, targets)

                            # decode outputs
                            outputs = decode_attention_outputs(
                                outputs, cap_lengths, dict_idx2word, phase)

                            # save to candidates
                            for model_id, output in zip(model_ids, outputs):
                                if model_id not in candidates[phase].keys():
                                    candidates[phase][model_id] = [output]
                                else:
                                    candidates[phase][model_id].append(output)

                            log['forward'].append(time.time() - forward_since)
                            # backward pass
                            backward_since = time.time()
                            loss.backward()
                            # clip gradients; NOTE: torch's clip_grad_value_ expects an
                            # iterable of parameters, not the optimizer itself
                            clip_grad_value_(
                                (p for group in self.optimizer.param_groups
                                 for p in group["params"]), 5)
                            self.optimizer.step()
                            log['backward'].append(time.time() -
                                                   backward_since)
                            log['train_loss'].append(loss.item())
                            log['train_perplexity'].append(np.exp(loss.item()))

                            # report
                            if verbose and (iter_id + 1) % verbose == 0:
                                print(
                                    "Epoch: [{}/{}] Iter: [{}/{}] train_loss: {:.4f} perplexity: {:.4f}"
                                    .format(epoch_id + 1, epoch, iter_id + 1,
                                            total_iter, log['train_loss'][-1],
                                            log['train_perplexity'][-1]))
                        else:
                            # validate
                            encoder.eval()
                            decoder.eval()
                            val_since = time.time()
                            visual_contexts = encoder(visual_inputs)
                            max_length = int(cap_lengths[0].item()) + 10
                            outputs = decoder.beam_search(
                                visual_contexts, caption_inputs, beam_size,
                                max_length)

                            # decode the outputs
                            outputs = decode_attention_outputs(
                                outputs, None, dict_idx2word, phase)
                            # save to candidates
                            for model_id, output in zip(model_ids, outputs):
                                if model_id not in candidates[phase].keys():
                                    candidates[phase][model_id] = [output]
                                else:
                                    candidates[phase][model_id].append(output)
                            # save log
                            log['val_time'].append(time.time() - val_since)

            # accumulate loss
            log['train_loss'] = np.mean(log['train_loss'])
            # log['val_loss'] = np.mean(log['val_loss'])
            log['train_perplexity'] = np.mean(log['train_perplexity'])
            # evaluate bleu
            eval_since = time.time()
            train_bleu, _ = capbleu.Bleu(4).compute_score(
                references["train"], candidates["train"])
            val_bleu, _ = capbleu.Bleu(4).compute_score(
                references["val"], candidates["val"])
            # evaluate cider
            train_cider, _ = capcider.Cider().compute_score(
                references["train"], candidates["train"])
            val_cider, _ = capcider.Cider().compute_score(
                references["val"], candidates["val"])
            # reduce the learning rate on plateau if the training loss is small enough
            if log['train_loss'] <= self.threshold['schedule']:
                scheduler.step(val_cider)
            # # evaluate meteor
            # try:
            #     train_meteor, _ = capmeteor.Meteor().compute_score(references["train"], candidates["train"])
            #     val_meteor, _ = capmeteor.Meteor().compute_score(references["val"], candidates["val"])
            # except Exception:
            #     train_meteor = 0
            #     val_meteor = 0
            # evaluate rouge
            train_rouge, _ = caprouge.Rouge().compute_score(
                references["train"], candidates["train"])
            val_rouge, _ = caprouge.Rouge().compute_score(
                references["val"], candidates["val"])
            log['eval_time'] = time.time() - eval_since

            # log
            log['train_bleu_1'] = train_bleu[0]
            log['train_bleu_2'] = train_bleu[1]
            log['train_bleu_3'] = train_bleu[2]
            log['train_bleu_4'] = train_bleu[3]
            log['val_bleu_1'] = val_bleu[0]
            log['val_bleu_2'] = val_bleu[1]
            log['val_bleu_3'] = val_bleu[2]
            log['val_bleu_4'] = val_bleu[3]
            log['train_cider'] = train_cider
            log['val_cider'] = val_cider
            # log['train_meteor'] = train_meteor
            # log['val_meteor'] = val_meteor
            log['train_rouge'] = train_rouge
            log['val_rouge'] = val_rouge
            log['epoch_time'].append(time.time() - start)
            # show report
            exetime_s = np.sum(log['epoch_time'])
            eta_s = exetime_s * (epoch - (epoch_id + 1))
            eta_m = math.floor(eta_s / 60)
            print(
                "----------------------summary [{}/{}]-----------------------".
                format(epoch_id + 1, epoch))
            print("[Loss]    train_loss: %f, perplexity: %f" %
                  (log['train_loss'], log['train_perplexity']))
            print("[BLEU-1]  train_bleu: %f, val_bleu: %f" %
                  (log['train_bleu_1'], log['val_bleu_1']))
            print("[BLEU-2]  train_bleu: %f, val_bleu: %f" %
                  (log['train_bleu_2'], log['val_bleu_2']))
            print("[BLEU-3]  train_bleu: %f, val_bleu: %f" %
                  (log['train_bleu_3'], log['val_bleu_3']))
            print("[BLEU-4]  train_bleu: %f, val_bleu: %f" %
                  (log['train_bleu_4'], log['val_bleu_4']))
            print("[CIDEr]   train_cider: %f, val_cider: %f" %
                  (log['train_cider'], log['val_cider']))
            # print("[METEOR] train_meteor: %f, val_meteor: %f" % (
            #     log['train_meteor'],
            #     log['val_meteor'])
            # )
            print("[ROUGE_L] train_rouge: %f, val_rouge: %f" %
                  (log['train_rouge'], log['val_rouge']))
            print(
                "[Info]    forward_per_epoch: %fs\n[Info]    backward_per_epoch: %fs\n[Info]    val_per_epoch: %fs"
                % (np.sum(log['forward']), np.sum(
                    log['backward']), np.sum(log['val_time'])))
            print("[Info]    eval_time: %fs" % (np.mean(log['eval_time'])))
            print("[Info]    time_per_epoch: %fs\n[Info]    ETA: %dm %ds\n\n" %
                  (np.mean(log['epoch_time']), eta_m, eta_s - eta_m * 60))
            # print("[Debug] train_id: {}\n[Debug] train_ref: {}\n[Debug] train_can: {}\n".format(
            #     list(references["train"].keys())[0],
            #     references["train"][list(references["train"].keys())[0]],
            #     candidates["train"][list(references["train"].keys())[0]]
            # ))
            # print("[Debug] val_id: {}\n[Debug] val_ref: {}\n[Debug] val_can: {}\n\n".format(
            #     list(references["val"].keys())[0],
            #     references["val"][list(references["val"].keys())[0]],
            #     candidates["val"][list(references["val"].keys())[0]]
            # ))

            # save log
            self.log[epoch_id] = log

            # best
            if log['train_loss'] <= self.threshold['save'] and log[
                    'val_cider'] > best_scores["cider"]:
                print("best cider achieved:", log['val_cider'])
                print("current loss:", log['train_loss'])
                best_info['epoch_id'] = epoch_id + 1
                best_info['loss'] = log['train_loss']
                best_scores['bleu_1'] = log['val_bleu_1']
                best_scores['bleu_2'] = log['val_bleu_2']
                best_scores['bleu_3'] = log['val_bleu_3']
                best_scores['bleu_4'] = log['val_bleu_4']
                best_scores['cider'] = log['val_cider']
                best_scores['rouge'] = log['val_rouge']
                best_models['encoder'] = encoder
                best_models['decoder'] = decoder

                print("saving the best models...\n")
                model_root = os.path.join(self.output_root, "models")
                if not os.path.exists(model_root):
                    os.mkdir(model_root)
                torch.save(best_models['encoder'],
                           os.path.join(model_root, "encoder.pth"))
                torch.save(best_models['decoder'],
                           os.path.join(model_root, "decoder.pth"))

        # show the best
        print("---------------------best----------------------")
        print("[Best] Epoch_id: {}".format(best_info['epoch_id']))
        print("[Best] Loss: {}".format(best_info['loss']))
        print("[Best] BLEU-1: {}".format(best_scores['bleu_1']))
        print("[Best] BLEU-2: {}".format(best_scores['bleu_2']))
        print("[Best] BLEU-3: {}".format(best_scores['bleu_3']))
        print("[Best] BLEU-4: {}".format(best_scores['bleu_4']))
        print("[Best] CIDEr: {}".format(best_scores['cider']))
        print("[Best] ROUGE_L: {}".format(best_scores['rouge']))
        print()

        # save the best model
        if not best_models['encoder'] or not best_models['decoder']:
            best_models['encoder'] = encoder
            best_models['decoder'] = decoder

        return best_models['encoder'], best_models['decoder']
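
The teacher-forcing loss in this trainer relies on pack_padded_sequence to drop padded positions before cross-entropy. A self-contained sketch of that masking step (dimensions are illustrative):

import torch
from torch.nn.utils.rnn import pack_padded_sequence

outputs = torch.randn(2, 5, 100)           # (B, T, vocab_size) decoder logits
targets = torch.randint(0, 100, (2, 5))    # (B, T) target token ids
lengths = [5, 3]                           # valid lengths per sequence, descending

# index [0] takes PackedSequence.data: only the non-padded timesteps, flattened
flat_outputs = pack_padded_sequence(outputs, lengths, batch_first=True)[0]
flat_targets = pack_padded_sequence(targets, lengths, batch_first=True)[0]
loss = torch.nn.functional.cross_entropy(flat_outputs, flat_targets)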