Exemplo n.º 1
0
def train(net: NeuralNet,
          inputs: Tensor,
          targets: Tensor,
          num_epochs: int = 5000,
          iterator: DataIterator = BatchIterator(),
          loss: Loss = MSE(),
          optimizer: Optimizer = SGD()
          ) -> None:
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for batch in iterator(inputs, targets):
            predicted = net.forward(batch.inputs)
            epoch_loss += loss.loss(predicted, batch.targets)
            grad = loss.grad(predicted, batch.targets)
            net.backward(grad)
            optimizer.step(net)
        print(epoch, epoch_loss)
Exemplo n.º 2
0
def train(args):
    """Run model training."""

    # Get nested namespaces.
    model_args = args.model_args
    logger_args = args.logger_args
    optim_args = args.optim_args
    data_args = args.data_args

    # Get logger.
    logger = Logger(logger_args)

    if model_args.ckpt_path:
        # CL-specified args are used to load the model, rather than the
        # ones saved to args.json.
        model_args.pretrained = False
        ckpt_path = model_args.ckpt_path
        assert False
        model, ckpt_info = ModelSaver.load_model(ckpt_path=ckpt_path,
                                                 gpu_ids=args.gpu_ids,
                                                 model_args=model_args,
                                                 is_training=True)
        optim_args.start_epoch = ckpt_info['epoch'] + 1
    else:
        # If no ckpt_path is provided, instantiate a new randomly
        # initialized model.
        model_fn = models.__dict__[model_args.model]
        model = model_fn(model_args)
        model = nn.DataParallel(model, args.gpu_ids)
    # Put model on gpu or cpu and put into training mode.
    model = model.to(args.device)
    model.train()

    # Get train and valid loader objects.
    train_loader = get_loader(phase="train",
                              data_args=data_args,
                              is_training=True,
                              logger=logger)
    valid_loader = get_loader(phase="valid",
                              data_args=data_args,
                              is_training=False,
                              logger=logger)
    dense_valid_loader = get_loader(phase="dense_valid",
                                    data_args=data_args,
                                    is_training=False,
                                    logger=logger)

    # Instantiate the predictor class for obtaining model predictions.
    predictor = Predictor(model, args.device)

    # Instantiate the evaluator class for evaluating models.
    # By default, get best performance on validation set.
    evaluator = Evaluator(logger=logger, tune_threshold=True)

    # Instantiate the saver class for saving model checkpoints.
    saver = ModelSaver(save_dir=logger_args.save_dir,
                       iters_per_save=logger_args.iters_per_save,
                       max_ckpts=logger_args.max_ckpts,
                       metric_name=optim_args.metric_name,
                       maximize_metric=optim_args.maximize_metric,
                       keep_topk=True,
                       logger=logger)

    # Instantiate the optimizer class for guiding model training.
    optimizer = Optimizer(parameters=model.parameters(),
                          optim_args=optim_args,
                          batch_size=data_args.batch_size,
                          iters_per_print=logger_args.iters_per_print,
                          iters_per_visual=logger_args.iters_per_visual,
                          iters_per_eval=logger_args.iters_per_eval,
                          dataset_len=len(train_loader.dataset),
                          logger=logger)
    if model_args.ckpt_path:
        # Load the same optimizer as used in the original training.
        optimizer.load_optimizer(ckpt_path=model_args.ckpt_path,
                                 gpu_ids=args.gpu_ids)

    loss_fn = evaluator.get_loss_fn(loss_fn_name=optim_args.loss_fn)

    # Run training
    while not optimizer.is_finished_training():
        optimizer.start_epoch()

        for inputs, targets in train_loader:
            optimizer.start_iter()

            if optimizer.global_step % optimizer.iters_per_eval == 0:
                # Only evaluate every iters_per_eval examples.
                predictions, groundtruth = predictor.predict(valid_loader)
                metrics = evaluator.evaluate(groundtruth, predictions)

                # Evaluate on dense dataset
                dense_predictions, dense_groundtruth = predictor.predict(
                    dense_valid_loader)
                dense_metrics = evaluator.dense_evaluate(
                    dense_groundtruth, dense_predictions)
                # Merge the metrics dicts together
                metrics = {**metrics, **dense_metrics}

                # Log metrics to stdout.
                logger.log_metrics(metrics, phase='valid')

                # Log to tb
                logger.log_scalars(metrics,
                                   optimizer.global_step,
                                   phase='valid')

                if optimizer.global_step % logger_args.iters_per_save == 0:
                    # Only save every iters_per_save examples directly
                    # after evaluation.
                    saver.save(iteration=optimizer.global_step,
                               epoch=optimizer.epoch,
                               model=model,
                               optimizer=optimizer,
                               device=args.device,
                               metric_val=metrics[optim_args.metric_name])

                # Step learning rate scheduler.
                optimizer.step_scheduler(metrics[optim_args.metric_name])

            with torch.set_grad_enabled(True):

                # Run the minibatch through the model.
                logits = model(inputs.to(args.device))

                # Compute the minibatch loss.
                loss = loss_fn(logits, targets.to(args.device))

                # Log the data from this iteration.
                optimizer.log_iter(inputs, logits, targets, loss)

                # Perform a backward pass.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            optimizer.end_iter()

        optimizer.end_epoch(metrics)

    # Save the most recent model.
    saver.save(iteration=optimizer.global_step,
               epoch=optimizer.epoch,
               model=model,
               optimizer=optimizer,
               device=args.device,
               metric_val=metrics[optim_args.metric_name])
Exemplo n.º 3
0
def train(args):
    """Run model training."""

    print("Start Training ...")

    # Get nested namespaces.
    model_args = args.model_args
    logger_args = args.logger_args
    optim_args = args.optim_args
    data_args = args.data_args
    transform_args = args.transform_args

    # Get logger.
    print('Getting logger... log to path: {}'.format(logger_args.log_path))
    logger = Logger(logger_args.log_path, logger_args.save_dir)

    # For conaug, point to the MOCO pretrained weights.
    if model_args.ckpt_path and model_args.ckpt_path != 'None':
        print("pretrained checkpoint specified : {}".format(
            model_args.ckpt_path))
        # CL-specified args are used to load the model, rather than the
        # ones saved to args.json.
        model_args.pretrained = False
        ckpt_path = model_args.ckpt_path
        model, ckpt_info = ModelSaver.load_model(ckpt_path=ckpt_path,
                                                 gpu_ids=args.gpu_ids,
                                                 model_args=model_args,
                                                 is_training=True)

        if not model_args.moco:
            optim_args.start_epoch = ckpt_info['epoch'] + 1
        else:
            optim_args.start_epoch = 1
    else:
        print(
            'Starting without pretrained training checkpoint, random initialization.'
        )
        # If no ckpt_path is provided, instantiate a new randomly
        # initialized model.
        model_fn = models.__dict__[model_args.model]
        if data_args.custom_tasks is not None:
            tasks = NamedTasks[data_args.custom_tasks]
        else:
            tasks = model_args.__dict__[TASKS]  # TASKS = "tasks"
        print("Tasks: {}".format(tasks))
        model = model_fn(tasks, model_args)
        model = nn.DataParallel(model, args.gpu_ids)

    # Put model on gpu or cpu and put into training mode.
    model = model.to(args.device)
    model.train()

    print("========= MODEL ==========")
    print(model)

    # Get train and valid loader objects.
    train_loader = get_loader(phase="train",
                              data_args=data_args,
                              transform_args=transform_args,
                              is_training=True,
                              return_info_dict=False,
                              logger=logger)
    valid_loader = get_loader(phase="valid",
                              data_args=data_args,
                              transform_args=transform_args,
                              is_training=False,
                              return_info_dict=False,
                              logger=logger)

    # Instantiate the predictor class for obtaining model predictions.
    predictor = Predictor(model, args.device)
    # Instantiate the evaluator class for evaluating models.
    evaluator = Evaluator(logger)
    # Get the set of tasks which will be used for saving models
    # and annealing learning rate.
    eval_tasks = EVAL_METRIC2TASKS[optim_args.metric_name]

    # Instantiate the saver class for saving model checkpoints.
    saver = ModelSaver(save_dir=logger_args.save_dir,
                       iters_per_save=logger_args.iters_per_save,
                       max_ckpts=logger_args.max_ckpts,
                       metric_name=optim_args.metric_name,
                       maximize_metric=optim_args.maximize_metric,
                       keep_topk=logger_args.keep_topk)

    # TODO: JBY: handle threshold for fine tuning
    if model_args.fine_tuning == 'full':  # Fine tune all layers.
        pass
    else:
        # Freeze other layers.
        models.PretrainedModel.set_require_grad_for_fine_tuning(
            model, model_args.fine_tuning.split(','))

    # Instantiate the optimizer class for guiding model training.
    optimizer = Optimizer(parameters=model.parameters(),
                          optim_args=optim_args,
                          batch_size=data_args.batch_size,
                          iters_per_print=logger_args.iters_per_print,
                          iters_per_visual=logger_args.iters_per_visual,
                          iters_per_eval=logger_args.iters_per_eval,
                          dataset_len=len(train_loader.dataset),
                          logger=logger)

    if model_args.ckpt_path and not model_args.moco:
        # Load the same optimizer as used in the original training.
        optimizer.load_optimizer(ckpt_path=model_args.ckpt_path,
                                 gpu_ids=args.gpu_ids)

    model_uncertainty = model_args.model_uncertainty
    loss_fn = evaluator.get_loss_fn(
        loss_fn_name=optim_args.loss_fn,
        model_uncertainty=model_args.model_uncertainty,
        mask_uncertain=True,
        device=args.device)

    # Run training
    while not optimizer.is_finished_training():
        optimizer.start_epoch()

        # TODO: JBY, HACK WARNING  # What is the hack?
        metrics = None
        for inputs, targets in train_loader:
            optimizer.start_iter()
            if optimizer.global_step and optimizer.global_step % optimizer.iters_per_eval == 0 or len(
                    train_loader.dataset
            ) - optimizer.iter < optimizer.batch_size:

                # Only evaluate every iters_per_eval examples.
                predictions, groundtruth = predictor.predict(valid_loader)
                # print("predictions: {}".format(predictions))
                metrics, curves = evaluator.evaluate_tasks(
                    groundtruth, predictions)
                # Log metrics to stdout.
                logger.log_metrics(metrics)

                # Add logger for all the metrics for valid_loader
                logger.log_scalars(metrics, optimizer.global_step)

                # Get the metric used to save model checkpoints.
                average_metric = evaluator.evaluate_average_metric(
                    metrics, eval_tasks, optim_args.metric_name)

                if optimizer.global_step % logger_args.iters_per_save == 0:
                    # Only save every iters_per_save examples directly
                    # after evaluation.
                    print("Save global step: {}".format(optimizer.global_step))
                    saver.save(iteration=optimizer.global_step,
                               epoch=optimizer.epoch,
                               model=model,
                               optimizer=optimizer,
                               device=args.device,
                               metric_val=average_metric)

                # Step learning rate scheduler.
                optimizer.step_scheduler(average_metric)

            with torch.set_grad_enabled(True):
                logits, embedding = model(inputs.to(args.device))
                loss = loss_fn(logits, targets.to(args.device))
                optimizer.log_iter(inputs, logits, targets, loss)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            optimizer.end_iter()

        optimizer.end_epoch(metrics)

    logger.log('=== Training Complete ===')
Exemplo n.º 4
0
class SupervisedTrainer(object):
    def __init__(self, db):
        self.cfg = db.cfg
        self.db = db
        net = DrawModel(db, if_bert=True)

        if self.cfg.cuda:
            if self.cfg.parallel and torch.cuda.device_count() > 1:
                print("Let's use", torch.cuda.device_count(), "GPUs!")
                net = nn.DataParallel(net)
            net = net.cuda()
        self.net = net
        self.start_epoch = 0
        if self.cfg.pretrained is not None:
            self.load_pretrained_net(self.cfg.pretrained)
        self.writer = SummaryWriter(comment='Layout')
        self.xymap = CocoLocationMap(self.cfg)
        self.whmap = CocoTransformationMap(self.cfg)
        self.optimizer = None
        self.bert_optimizer = None
        self.bert_scheduler = None
        self.mse_loss = nn.MSELoss(reduction='sum')
        self.train_bucket_sampler = None
        self.val_bucket_sampler = None
        self.global_step = 0

    def get_parameter_number(self):
        net = self.net
        total_num = sum(p.numel() for p in net.parameters())
        trainable_num = sum(p.numel() for p in net.parameters()
                            if p.requires_grad)
        print('Total', total_num, 'Trainable', trainable_num)

    # def load_optimizer(self, pretrained_name):
    #     cache_dir = osp.join(self.cfg.data_dir, 'caches')
    #     pretrained_path = osp.join(cache_dir, 'bert_layout_ckpts', pretrained_name+'.pkl')
    #     bert_path = osp.join(cache_dir, 'bert_layout_ckpts', 'bert-'+pretrained_name+'.pkl')

    #     assert osp.exists(pretrained_path)
    #     if self.cfg.cuda:
    #         checkpoint = torch.load(pretrained_path)
    #         bert_checkpoint = torch.load(bert_path)
    #     else:
    #         checkpoint = torch.load(pretrained_path, map_location=lambda storage, loc: storage)
    #         bert_checkpoint = torch.load(bert_path, map_location=lambda storage, loc: storage)
    #     self.optimizer.optimizer.load_state_dict(checkpoint['optimizer'])
    #     self.bert_optimizer.load_state_dict(bert_checkpoint['optimizer'])

    def load_pretrained_net(self, pretrained_name):
        if self.cfg.cuda and self.cfg.parallel:
            net = self.net.module
        else:
            net = self.net

        # cache_dir = osp.join(self.cfg.data_dir, 'caches')
        # pretrained_path = osp.join(cache_dir, 'bert_layout_ckpts', pretrained_name+'.pkl')
        pretrained_path = pretrained_name
        # self.begin_epoch = int(pretrained_name.split('-')[1]) + 1
        print('loading ckpt from ', pretrained_path)

        assert osp.exists(pretrained_path)
        if self.cfg.cuda:
            checkpoint = torch.load(pretrained_path)
        else:
            checkpoint = torch.load(pretrained_path,
                                    map_location=lambda storage, loc: storage)
        net.load_state_dict(checkpoint['net'])
        # self.optimizer.optimizer.load_state_dict(checkpoint['optimizer'])
        self.start_epoch = checkpoint['epoch'] + 1
        print('Start training from {} epoch'.format(self.start_epoch))

    def batch_data(self, entry):
        ################################################
        # Inputs
        maxlen = max(entry['obj_cnt']).item()
        # print(entry['obj_cnt'])
        input_inds = entry['bert_inds'].long()
        input_lens = entry['bert_lens'].unsqueeze(-1).long()
        fg_inds = entry['fg_inds'].long()
        bg_imgs = entry['background'].float()
        fg_onehots = indices2onehots(fg_inds, self.cfg.output_cls_size)

        ################################################
        # Outputs
        ################################################
        gt_inds = entry['out_inds'].long()
        gt_msks = entry['out_msks'].float()
        gt_scene_inds = entry['scene_idx'].long().numpy()
        gt_boxes = entry['boxes'].float()
        box_msks = entry['box_msks'].float()

        ################################################
        # print(entry['obj_cnt'][0])
        # print(fg_onehots[0])
        # print(gt_inds[0])

        if self.cfg.cuda:
            input_inds = input_inds.cuda()
            input_lens = input_lens.cuda()
            fg_onehots = fg_onehots[:, :maxlen + 1, :].cuda()  #(bsize,11,83)
            bg_imgs = bg_imgs[:, :maxlen, :, :, :].cuda(
            )  #(bsize, 10, 83, 64, 64)
            gt_inds = gt_inds[:, :maxlen, :].cuda()  #(bsize, 10,4)
            gt_msks = gt_msks[:, :maxlen, :].cuda()  #(bsize, 10,4)
            gt_boxes = gt_boxes[:, :maxlen, :].cuda()  #(bsize, 10,5)
            box_msks = box_msks[:, :maxlen, :].cuda()  #(bsize, 10,5)
        # print(input_inds.shape, input_lens.shape, fg_onehots.shape, bg_imgs.shape,
        # gt_inds.shape, gt_msks.shape, gt_boxes.shape, box_msks.shape)

        return input_inds, input_lens, bg_imgs, fg_onehots, gt_inds, gt_msks, gt_scene_inds, gt_boxes, box_msks

    def evaluate(self, inf_outs, ref_inds, ref_msks, ref_boxes, box_msks):
        ####################################################################
        # Prediction loss
        ####################################################################
        if self.cfg.cuda and self.cfg.parallel:
            net = self.net.module
        else:
            net = self.net
        if not self.cfg.mse:
            _, _, _, enc_msks, what_wei, where_wei = inf_outs
        else:
            obj_logits, pred_boxes, enc_msks, what_wei, where_wei = inf_outs

        ####################################################################
        # doubly stochastic attn loss
        ####################################################################
        attn_loss = 0
        encoder_msks = enc_msks

        if self.cfg.what_attn:
            obj_msks = ref_msks[:, :, 0].unsqueeze(-1)
            what_att_logits = what_wei
            raw_obj_att_loss = torch.mul(what_att_logits, obj_msks)
            raw_obj_att_loss = torch.sum(raw_obj_att_loss, dim=1)
            obj_att_loss = raw_obj_att_loss - encoder_msks
            obj_att_loss = torch.sum(obj_att_loss**2, dim=-1)
            obj_att_loss = torch.mean(obj_att_loss)
            attn_loss = attn_loss + obj_att_loss

        attn_loss = self.cfg.attn_loss_weight * attn_loss

        eos_loss = 0
        if self.cfg.what_attn and self.cfg.eos_loss_weight > 0:
            # print('-------------------')
            # print('obj_msks: ', obj_msks.size())
            inds_1 = torch.sum(obj_msks, 1, keepdim=True) - 1
            # print('inds_1: ', inds_1.size())
            bsize, tlen, slen = what_att_logits.size()
            # print('what_att_logits: ', what_att_logits.size())
            inds_1 = inds_1.expand(bsize, 1, slen).long()
            local_eos_probs = torch.gather(what_att_logits, 1,
                                           inds_1).squeeze(1)
            # print('local_eos_probs: ', local_eos_probs.size())
            # print('encoder_msks: ', encoder_msks.size())
            inds_2 = torch.sum(encoder_msks, 1, keepdim=True) - 1
            # print('inds_2: ', inds_2.size())
            eos_probs = torch.gather(local_eos_probs, 1, inds_2.long())
            norm_probs = torch.gather(raw_obj_att_loss, 1, inds_2.long())
            # print('norm_probs:', norm_probs.size())
            # print('eos_probs: ', eos_probs.size())
            eos_loss = -torch.log(eos_probs.clamp(min=self.cfg.eps))
            eos_loss = torch.mean(eos_loss)
            diff = torch.sum(norm_probs) - 1.0
            norm_loss = diff * diff
            # print('obj_att_loss: ', att_loss)
            # print('eos_loss: ', eos_loss)
            # print('norm_loss: ', norm_loss)
        eos_loss = self.cfg.eos_loss_weight * eos_loss

        # torch.cuda.synchronize()
        # s = time()

        if not self.cfg.mse:
            # _, _, _, enc_msks, what_wei, where_wei = inf_outs

            logits = net.collect_logits(inf_outs, ref_inds)
            bsize, slen, _ = logits.size()
            loss_wei = [
                self.cfg.obj_loss_weight, \
                self.cfg.coord_loss_weight, \
                self.cfg.scale_loss_weight, \
                self.cfg.ratio_loss_weight
            ]
            loss_wei = torch.from_numpy(np.array(loss_wei)).float()
            if self.cfg.cuda:
                loss_wei = loss_wei.cuda()
            loss_wei = loss_wei.view(1, 1, 4)
            loss_wei = loss_wei.expand(bsize, slen, 4)

            pred_loss = -torch.log(
                logits.clamp(min=self.cfg.eps)) * loss_wei * ref_msks
            pred_loss = torch.sum(pred_loss) / (torch.sum(ref_msks) +
                                                self.cfg.eps)

            ####################################################################
            # Accuracies
            ####################################################################
            pred_accu, pred_mse = net.collect_accuracies(inf_outs, ref_inds)
            pred_accu = pred_accu * ref_msks

            mse_msks = ref_msks[:, :, -1].unsqueeze(-1).expand(-1, -1, 4)
            pred_mse = pred_mse * mse_msks

            comp_accu = torch.sum(torch.sum(pred_accu, 0), 0)
            comp_msks = torch.sum(torch.sum(ref_msks, 0), 0)
            pred_accu = comp_accu / (comp_msks + self.cfg.eps)

            comp_mse = torch.sum(torch.sum(pred_mse, 0), 0)
            comp_msks = torch.sum(torch.sum(mse_msks, 0), 0)
            pred_mse = comp_mse / (comp_msks + self.cfg.eps)

        else:
            #inf_outs = (obj_logits, pred_boxes, enc_msks, what_wei, where_wei)
            # obj_logits, pred_boxes, enc_msks, what_wei, where_wei = inf_outs
            b_size, tlen, _ = pred_boxes.size()
            ref_boxes = ref_boxes * box_msks

            # logits = net.collect_logits(inf_outs, ref_inds)
            obj_inds = ref_inds[:, :, 0].unsqueeze(-1)
            sample_obj_logits = torch.gather(obj_logits, -1, obj_inds)

            obj_loss = -torch.log(sample_obj_logits.clamp(
                min=self.cfg.eps)) * ref_msks[:, :, 0].unsqueeze(-1)
            obj_loss = torch.sum(obj_loss) / (torch.sum(ref_msks[:, :, 0]) +
                                              self.cfg.eps)

            # obj_logits = obj_logits.float() * box_msks[:,:,0].unsqueeze(-1).expand(-1,-1,83)
            pred_boxes = pred_boxes * box_msks[:, :, 1:]

            gt_obj = ref_boxes[:, :, 0].unsqueeze(-1).long()
            gt_boxes = ref_boxes[:, :, 1:]

            # loss_wei = [
            #     self.cfg.obj_loss_weight, \
            #     self.cfg.coord_loss_weight, \
            #     self.cfg.scale_loss_weight, \
            #     self.cfg.ratio_loss_weight
            # ]
            # loss_wei = torch.from_numpy(np.array(loss_wei)).float()
            # if self.cfg.cuda:
            #     loss_wei = loss_wei.cuda()
            # loss_wei = loss_wei.view(1,1,4)
            # loss_wei = loss_wei.expand(bsize, slen, 4)

            # torch.cuda.synchronize()
            # print(time()-s)
            # torch.cuda.synchronize()
            # s = time()

            all_mse = (pred_boxes - gt_boxes)**2

            comp_mse = torch.sum(torch.sum(all_mse, 0), 0)
            comp_msks = torch.sum(torch.sum(box_msks[:, :, 1:], 0), 0)
            pred_mse = comp_mse / (comp_msks + self.cfg.eps)
            coord_loss = torch.sum(pred_mse)

            pred_loss = obj_loss + coord_loss
            # pred_loss = obj_loss + x_loss + y_loss + w_loss + h_loss
            # torch.cuda.synchronize()
            # print(time()-s)
            # torch.cuda.synchronize()
            # s = time()

            ####################################################################
            # Accuracies
            ####################################################################
            coord = self.db.boxes2indices(pred_boxes.contiguous().view(
                -1, 4).detach().cpu().numpy())
            coord = torch.from_numpy(coord).view(b_size, tlen, -1).cuda()

            _, pred_obj_inds = torch.max(obj_logits, -1)
            obj_accu = torch.eq(pred_obj_inds,
                                ref_inds[:, :, 0]).float().unsqueeze(-1)
            coord_accu = torch.eq(coord, ref_inds[:, :, 1:]).float()

            # pred_accu, pred_mse = net.collect_accuracies(inf_outs, ref_inds)
            pred_accu = torch.cat([obj_accu, coord_accu], -1)
            pred_accu = pred_accu * ref_msks

            comp_accu = torch.sum(torch.sum(pred_accu, 0), 0)
            comp_msks = torch.sum(torch.sum(ref_msks, 0), 0)
            pred_accu = comp_accu / (comp_msks + self.cfg.eps)

            # pred_mse = torch.stack([x_loss, y_loss, w_loss, h_loss], -1)
            # torch.cuda.synchronize()
            # print(time()-s)
            # print('====================')

        return pred_loss, attn_loss, eos_loss, pred_accu, pred_mse

    def train(self, train_db, val_db, test_db):
        ##################################################################
        ## Optimizer
        ##################################################################
        if self.cfg.cuda and self.cfg.parallel:
            net = self.net.module
        else:
            net = self.net
        image_encoder_trainable_paras = \
            filter(lambda p: p.requires_grad, net.image_encoder.parameters())
        # raw_optimizer = optim.Adam([
        #         {'params': net.text_encoder.parameters(), 'lr': self.cfg.finetune_lr},
        #         {'params': image_encoder_trainable_paras},
        #         {'params': net.what_decoder.parameters()},
        #         {'params': net.where_decoder.parameters()}
        #     ], lr=self.cfg.lr)
        raw_optimizer = optim.Adam([{
            'params': image_encoder_trainable_paras,
            'initial_lr': self.cfg.lr
        }, {
            'params': net.what_decoder.parameters(),
            'initial_lr': self.cfg.lr
        }, {
            'params': net.where_decoder.parameters(),
            'initial_lr': self.cfg.lr
        }],
                                   lr=self.cfg.lr)
        self.optimizer = Optimizer(raw_optimizer,
                                   max_grad_norm=self.cfg.grad_norm_clipping)
        # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer.optimizer, factor=0.8, patience=3)
        scheduler = optim.lr_scheduler.StepLR(self.optimizer.optimizer,
                                              step_size=3,
                                              gamma=0.8,
                                              last_epoch=self.start_epoch - 1)
        self.optimizer.set_scheduler(scheduler)

        num_train_steps = int(
            len(train_db) / self.cfg.accumulation_steps * self.cfg.n_epochs)
        num_warmup_steps = int(num_train_steps * self.cfg.warmup)
        self.bert_optimizer = AdamW([{
            'params': net.text_encoder.parameters(),
            'initial_lr': self.cfg.finetune_lr
        }],
                                    lr=self.cfg.finetune_lr)
        self.bert_scheduler = get_linear_schedule_with_warmup(
            self.bert_optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_steps,
            last_epoch=self.start_epoch - 1)

        bucket_boundaries = [4, 8, 12, 16, 22]  # [4,8,12,16,22]
        print('preparing training bucket sampler')
        self.train_bucket_sampler = BucketSampler(
            train_db, bucket_boundaries, batch_size=self.cfg.batch_size)
        print('preparing validation bucket sampler')
        self.val_bucket_sampler = BucketSampler(val_db,
                                                bucket_boundaries,
                                                batch_size=4)

        ##################################################################
        ## LOG
        ##################################################################
        logz.configure_output_dir(self.cfg.model_dir)
        logz.save_config(self.cfg)

        ##################################################################
        ## Main loop
        ##################################################################
        start = time()

        for epoch in range(self.start_epoch, self.cfg.n_epochs):
            ##################################################################
            ## Training
            ##################################################################
            print('Training...')
            torch.cuda.empty_cache()
            train_pred_loss, train_attn_loss, train_eos_loss, train_accu, train_mse = \
                self.train_epoch(train_db, self.optimizer, epoch)

            ##################################################################
            ## Validation
            ##################################################################
            print('Validation...')
            val_loss, val_accu, val_mse, val_infos = self.validate_epoch(
                val_db)

            ##################################################################
            ## Sample
            ##################################################################
            if self.cfg.if_sample:
                print('Sample...')
                torch.cuda.empty_cache()
                self.sample(epoch, test_db, self.cfg.n_samples)
                torch.cuda.empty_cache()
            ##################################################################
            ## Logging
            ##################################################################

            # update optim scheduler
            print('Loging...')
            self.optimizer.update(np.mean(val_loss), epoch)

            logz.log_tabular("Time", time() - start)
            logz.log_tabular("Iteration", epoch)

            logz.log_tabular("TrainAverageError", np.mean(train_pred_loss))
            logz.log_tabular("TrainAverageAccu", np.mean(train_accu))
            logz.log_tabular("TrainAverageMse", np.mean(train_mse))
            logz.log_tabular("ValAverageError", np.mean(val_loss))
            logz.log_tabular("ValAverageAccu", np.mean(val_accu))
            logz.log_tabular("ValAverageObjAccu", np.mean(val_accu[:, 0]))
            logz.log_tabular("ValAverageCoordAccu", np.mean(val_accu[:, 1]))
            logz.log_tabular("ValAverageScaleAccu", np.mean(val_accu[:, 2]))
            logz.log_tabular("ValAverageRatioAccu", np.mean(val_accu[:, 3]))
            logz.log_tabular("ValAverageMse", np.mean(val_mse))
            logz.log_tabular("ValAverageXMse", np.mean(val_mse[:, 0]))
            logz.log_tabular("ValAverageYMse", np.mean(val_mse[:, 1]))
            logz.log_tabular("ValAverageWMse", np.mean(val_mse[:, 2]))
            logz.log_tabular("ValAverageHMse", np.mean(val_mse[:, 3]))
            logz.log_tabular("ValUnigramF3", np.mean(val_infos.unigram_F3()))
            logz.log_tabular("ValBigramF3", np.mean(val_infos.bigram_F3()))
            logz.log_tabular("ValUnigramP", np.mean(val_infos.unigram_P()))
            logz.log_tabular("ValUnigramR", np.mean(val_infos.unigram_R()))
            logz.log_tabular("ValBigramP", val_infos.mean_bigram_P())
            logz.log_tabular("ValBigramR", val_infos.mean_bigram_R())
            logz.log_tabular("ValUnigramScale", np.mean(val_infos.scale()))
            logz.log_tabular("ValUnigramRatio", np.mean(val_infos.ratio()))
            logz.log_tabular("ValUnigramSim",
                             np.mean(val_infos.unigram_coord()))
            logz.log_tabular("ValBigramSim", val_infos.mean_bigram_coord())

            logz.dump_tabular()

            ##################################################################
            ## Checkpoint
            ##################################################################
            print('Saving checkpoint...')
            log_info = [np.mean(val_loss), np.mean(val_accu)]
            self.save_checkpoint(epoch, log_info)
            torch.cuda.empty_cache()

    def train_epoch(self, train_db, optimizer, epoch):
        train_db.cfg.sent_group = -1

        train_loader = DataLoader(train_db,
                                  batch_size=1,
                                  num_workers=self.cfg.num_workers,
                                  batch_sampler=self.train_bucket_sampler,
                                  drop_last=False,
                                  pin_memory=False)

        train_pred_loss, train_attn_loss, train_eos_loss, train_accu, train_mse = [], [], [], [], []

        if self.cfg.cuda and self.cfg.parallel:
            net = self.net.module
        else:
            net = self.net

        get_data = 0.
        forward = 0.
        eva = 0.
        back = 0.
        opt = 0.
        start = time()
        for cnt, batched in tqdm(enumerate(train_loader)):
            ##################################################################
            ## Batched data
            #############################F#####################################
            # torch.cuda.synchronize()
            # s = time()
            input_sentences, input_lens, bg_imgs, fg_onehots, \
            gt_inds, gt_msks, gt_scene_inds, boxes, box_msks = \
                self.batch_data(batched)
            gt_scenes = [deepcopy(train_db.scenedb[x]) for x in gt_scene_inds]

            ##################################################################
            ## Train one step
            ##################################################################
            self.net.train()
            # torch.cuda.synchronize()
            # get_data += time()-s
            # torch.cuda.synchronize()
            # s = time()

            if self.cfg.teacher_forcing:
                # self.get_parameter_number()
                # net.get_parameter_number()
                inputs = (input_sentences, input_lens, bg_imgs, fg_onehots)
                # inputs = (torch.zeros(8, 25).long().cuda(), torch.zeros(8).long().cuda(), torch.zeros(8, 10, 83, 64, 64).float().cuda(), torch.zeros(8,11,83).float().cuda())
                # self.writer.add_graph(self.net, inputs, True)
                # sys.exit()
                inf_outs, _ = self.net(inputs)
            else:
                inf_outs, _ = net.inference(input_sentences, input_lens, -1,
                                            -0.1, 0, gt_inds)

            # torch.cuda.synchronize()
            # forward += time()-s
            # torch.cuda.synchronize()
            # s = time()

            pred_loss, attn_loss, eos_loss, pred_accu, pred_mse = self.evaluate(
                inf_outs, gt_inds, gt_msks, boxes, box_msks)
            # torch.cuda.synchronize()
            # eva += time()-s
            # torch.cuda.synchronize()
            # s = time()

            loss = pred_loss + attn_loss + eos_loss
            loss = loss / self.cfg.accumulation_steps

            loss.backward()
            # torch.cuda.synchronize()
            # back += time()-s
            # torch.cuda.synchronize()
            # s = time()

            if ((cnt + 1) % self.cfg.accumulation_steps) == 0:
                self.optimizer.step()
                self.bert_optimizer.step()
                self.bert_scheduler.step()
                self.net.zero_grad()
                self.global_step += 1

                # torch.cuda.synchronize()
                # opt += time()-s

                ##################################################################
                ## Collect info
                ##################################################################
                train_pred_loss.append(pred_loss.cpu().data.item())
                if attn_loss == 0:
                    attn_loss_np = 0
                else:
                    attn_loss_np = attn_loss.cpu().data.item()
                train_attn_loss.append(attn_loss_np)
                if eos_loss == 0:
                    eos_loss_np = 0
                else:
                    eos_loss_np = eos_loss.cpu().data.item()
                train_eos_loss.append(eos_loss_np)
                train_accu.append(pred_accu.cpu().data.numpy())
                train_mse.append(pred_mse.cpu().data.numpy())

                ##################################################################
                ## Print info
                ##################################################################
                if self.global_step % self.cfg.log_per_steps == 0:
                    print('Epoch %03d, iter %07d:' % (epoch, cnt))
                    print('loss: ', np.mean(train_pred_loss),
                          np.mean(train_attn_loss), np.mean(train_eos_loss))
                    print('accu: ', np.mean(np.array(train_accu), 0))
                    print('xy & wh mse: ', np.mean(np.array(train_mse), 0))
                    torch.cuda.synchronize()
                    print('-------------------------')

        return train_pred_loss, train_attn_loss, train_eos_loss, train_accu, train_mse

    def validate_epoch(self, val_db):
        val_loss, val_accu, val_mse, top1_scores = [], [], [], []
        if self.cfg.cuda and self.cfg.parallel:
            net = self.net.module
        else:
            net = self.net

        for G in range(1):
            val_db.cfg.sent_group = G

            val_loader = DataLoader(val_db,
                                    batch_size=1,
                                    num_workers=self.cfg.num_workers,
                                    batch_sampler=self.val_bucket_sampler,
                                    drop_last=False,
                                    pin_memory=False)

            for cnt, batched in tqdm(enumerate(val_loader)):
                ##################################################################
                ## Batched data
                ##################################################################
                input_inds, input_lens, bg_imgs, fg_onehots, \
                gt_inds, gt_msks, gt_scene_inds, boxes, box_msks = \
                    self.batch_data(batched)
                gt_scenes = [
                    deepcopy(val_db.scenedb[x]) for x in gt_scene_inds
                ]

                ##################################################################
                ## Validate one step
                ##################################################################
                self.net.eval()
                with torch.no_grad():
                    _, env = net.inference(input_inds, input_lens, -1, 2.0, 0,
                                           None)
                    # infos = env.batch_evaluation(gt_inds.cpu().data.numpy())
                    scores = env.batch_evaluation(gt_scenes)
                    # scores = np.stack(scores, 0)
                    # infos = eval_info(self.cfg, scores)
                    inputs = (input_inds, input_lens, bg_imgs, fg_onehots)
                    inf_outs, _ = self.net(inputs)
                    # inf_outs, _ = self.net.teacher_forcing(input_inds, input_lens, bg_imgs, fg_onehots)
                    # print('gt_inds', gt_inds)
                    pred_loss, attn_loss, eos_loss, pred_accu, pred_mse = self.evaluate(
                        inf_outs, gt_inds, gt_msks, boxes,
                        box_msks)  #self.evaluate(inf_outs, gt_inds, gt_msks)

                top1_scores.extend(scores)
                val_loss.append(pred_loss.cpu().data.item())
                val_accu.append(pred_accu.cpu().data.numpy())
                val_mse.append(pred_mse.cpu().data.numpy())

                # print(G, cnt)
                # print('pred_loss', pred_loss.data.item())
                # print('pred_accu', pred_accu)
                # print('scores', scores)
                # if cnt > 0:
                #     break

        top1_scores = np.stack(top1_scores, 0)
        val_loss = np.array(val_loss)
        val_accu = np.stack(val_accu, 0)
        val_mse = np.stack(val_mse, 0)
        infos = eval_info(self.cfg, top1_scores)

        return val_loss, val_accu, val_mse, infos

    def sample(self, epoch, test_db, N, random_or_not=False):
        ##############################################################
        # Output prefix
        ##############################################################
        output_dir = osp.join(self.cfg.model_dir, '%03d' % epoch, 'vis')
        pred_dir = osp.join(self.cfg.model_dir, '%03d' % epoch, 'pred')
        gt_dir = osp.join(self.cfg.model_dir, '%03d' % epoch, 'gt')
        img_dir = osp.join(self.cfg.model_dir, '%03d' % epoch, 'color')

        maybe_create(output_dir)
        maybe_create(pred_dir)
        maybe_create(gt_dir)
        maybe_create(img_dir)

        ##############################################################
        # Main loop
        ##############################################################
        plt.switch_backend('agg')
        if random_or_not:
            indices = np.random.permutation(range(len(test_db)))
        else:
            indices = range(len(test_db))
        indices = indices[:N]

        if self.cfg.cuda and self.cfg.parallel:
            net = self.net.module
        else:
            net = self.net

        for i in indices:
            entry = test_db[i]
            gt_scene = test_db.scenedb[i]

            gt_img = cv2.imread(entry['color_path'], cv2.IMREAD_COLOR)
            gt_img, _, _ = create_squared_image(gt_img)
            gt_img = cv2.resize(gt_img,
                                (self.cfg.draw_size[0], self.cfg.draw_size[1]))

            ##############################################################
            # Inputs
            ##############################################################
            # input_inds_np = np.array(entry['word_inds'])
            sentence = np.array(entry['bert_inds'])
            input_lens_np = np.array(entry['bert_lens'])

            # input_inds = torch.from_numpy(input_inds_np).long().unsqueeze(0)
            input_lens = torch.from_numpy(input_lens_np).long().unsqueeze(0)
            if self.cfg.cuda:
                input_inds = input_inds.cuda()
                input_lens = input_lens.cuda()

            ##############################################################
            # Inference
            ##############################################################
            self.net.eval()
            with torch.no_grad():
                inf_outs, env = net.inference(input_inds, input_lens, -1, 2.0,
                                              0, None)
            frames = env.batch_redraw(return_sequence=True)[0]
            # _, _, _, _, what_wei, where_wei = inf_outs
            what_wei, where_wei = inf_outs[-2:]

            if self.cfg.what_attn:
                what_attn_words = self.decode_attention(
                    input_inds_np, input_lens_np, what_wei.squeeze(0))
            if self.cfg.where_attn > 0:
                where_attn_words = self.decode_attention(
                    input_inds_np, input_lens_np, where_wei.squeeze(0))

            ##############################################################
            # Draw
            ##############################################################
            fig = plt.figure(figsize=(60, 30))
            plt.suptitle(entry['sentence'], fontsize=50)
            for j in range(frames.shape[0]):
                # print(attn_words[j])
                subtitle = ''
                if self.cfg.what_attn:
                    subtitle = subtitle + ' '.join(what_attn_words[j])
                if self.cfg.where_attn > 0:
                    subtitle = subtitle + '\n' + ' '.join(where_attn_words[j])

                plt.subplot(4, 4, j + 1)
                plt.title(subtitle, fontsize=30)
                plt.imshow(frames[j, :, :, ::-1])
                plt.axis('off')
            plt.subplot(4, 4, 16)
            plt.imshow(gt_img[:, :, ::-1])
            plt.axis('off')

            name = osp.splitext(osp.basename(entry['color_path']))[0]
            out_path = osp.join(output_dir, name + '.png')
            fig.savefig(out_path, bbox_inches='tight')
            plt.close(fig)

            cv2.imwrite(osp.join(pred_dir, name + '.png'), frames[-1])
            cv2.imwrite(osp.join(img_dir, name + '.png'), gt_img)
            gt_layout = self.db.render_scene_as_output(gt_scene, False, gt_img)
            cv2.imwrite(osp.join(gt_dir, name + '.png'), gt_layout)
            print('sampling: %d, %d' % (epoch, i))

    def show_metric(self, epoch, test_db, N, random_or_not=False):
        ##############################################################
        # Output prefix
        ##############################################################
        output_dir = osp.join(self.cfg.model_dir, '%03d' % epoch, 'metric')
        maybe_create(output_dir)
        ##############################################################
        # Main loop
        ##############################################################
        plt.switch_backend('agg')
        if random_or_not:
            indices = np.random.permutation(range(len(test_db)))
        else:
            indices = range(len(test_db))
        indices = indices[:N]
        test_db.cfg.sent_group = 1

        if self.cfg.cuda and self.cfg.parallel:
            net = self.net.module
        else:
            net = self.net

        ev = evaluator(self.db)

        for i in indices:
            entry = test_db[i]
            gt_scene = test_db.scenedb[i]
            scene_idx = int(gt_scene['img_idx'])
            name = osp.splitext(osp.basename(entry['color_path']))[0]

            ##############################################################
            # Inputs
            ##############################################################
            input_inds_np = np.array(entry['word_inds'])
            input_lens_np = np.array(entry['word_lens'])

            input_inds = torch.from_numpy(input_inds_np).long().unsqueeze(0)
            input_lens = torch.from_numpy(input_lens_np).long().unsqueeze(0)
            if self.cfg.cuda:
                input_inds = input_inds.cuda()
                input_lens = input_lens.cuda()

            ##############################################################
            # Inference
            ##############################################################
            self.net.eval()
            with torch.no_grad():
                inf_outs, env = self.net.inference(input_inds, input_lens, -1,
                                                   2.0, 0, None)
            frame = env.batch_redraw(return_sequence=False)[0][0]
            raw_pred_scene = env.scenes[0]

            pred_inds = deepcopy(raw_pred_scene['out_inds'])
            pred_inds = np.stack(pred_inds, 0)
            pred_scene = self.db.output_inds_to_scene(pred_inds)

            graph_1 = scene_graph(self.db, pred_scene, None, False)
            graph_2 = scene_graph(self.db, gt_scene, None, False)

            color_1 = frame.copy()
            gt_img = cv2.imread(entry['color_path'], cv2.IMREAD_COLOR)
            gt_img, _, _ = create_squared_image(gt_img)
            gt_img = cv2.resize(gt_img,
                                (self.cfg.draw_size[0], self.cfg.draw_size[1]))
            color_2 = gt_img

            cv2.imwrite('%09d_b.png' % i, color_1)
            cv2.imwrite('%09d_i.png' % i, color_2)

            color_1 = visualize_unigram(self.cfg, color_1, graph_1.unigrams,
                                        (225, 0, 0))
            color_2 = visualize_unigram(self.cfg, color_2, graph_2.unigrams,
                                        (225, 0, 0))
            color_1 = visualize_bigram(self.cfg, color_1, graph_1.bigrams,
                                       (0, 0, 255))
            color_2 = visualize_bigram(self.cfg, color_2, graph_2.bigrams,
                                       (0, 0, 255))

            scores = ev.evaluate_graph(graph_1, graph_2)

            color_1 = visualize_unigram(self.cfg, color_1,
                                        ev.common_pred_unigrams, (0, 225, 0))
            color_2 = visualize_unigram(self.cfg, color_2,
                                        ev.common_gt_unigrams, (0, 225, 0))
            color_1 = visualize_bigram(self.cfg, color_1,
                                       ev.common_pred_bigrams, (0, 255, 255))
            color_2 = visualize_bigram(self.cfg, color_2, ev.common_gt_bigrams,
                                       (0, 255, 255))

            info = eval_info(self.cfg, scores[None, ...])

            plt.switch_backend('agg')
            fig = plt.figure(figsize=(16, 10))
            title = entry['sentence']
            title += 'UR:%f,UP:%f,BR:%f,BP:%f\n' % (
                info.unigram_R()[0], info.unigram_P()[0], info.bigram_R()[0],
                info.bigram_P()[0])
            title += 'scale: %f, ratio: %f, coord: %f, b:%f \n' % (
                info.scale()[0], info.ratio()[0], info.unigram_coord()[0],
                info.bigram_coord()[0])

            plt.suptitle(title)
            plt.subplot(1, 2, 1)
            plt.imshow(color_1[:, :, ::-1])
            plt.axis('off')
            plt.subplot(1, 2, 2)
            plt.imshow(color_2[:, :, ::-1])
            plt.axis('off')

            out_path = osp.join(output_dir, name + '.png')
            fig.savefig(out_path, bbox_inches='tight')
            plt.close(fig)

    def sample_all_top1(self, test_db):
        ##############################################################
        # Output prefix
        ##############################################################
        out_dir = osp.join(self.cfg.model_dir, 'top1_scenes')
        maybe_create(out_dir)
        if self.cfg.cuda and self.cfg.parallel:
            net = self.net.module
        else:
            net = self.net
        indices = range(len(test_db))
        scores = []
        for G in range(5):
            test_db.cfg.sent_group = G
            G_dir = osp.join(out_dir, '%02d' % G)
            maybe_create(G_dir)
            img_dir = osp.join(G_dir, 'images')
            maybe_create(img_dir)
            scene_dir = osp.join(G_dir, 'scenes')
            maybe_create(scene_dir)

            for i in indices:
                entry = test_db[i]
                gt_scene = test_db.scenedb[i]
                ##############################################################
                # Inputs
                ##############################################################
                input_inds_np = np.array(entry['word_inds'])
                input_lens_np = np.array(entry['word_lens'])

                input_inds = torch.from_numpy(input_inds_np).long().unsqueeze(
                    0)
                input_lens = torch.from_numpy(input_lens_np).long().unsqueeze(
                    0)
                if self.cfg.cuda:
                    input_inds = input_inds.cuda()
                    input_lens = input_lens.cuda()

                ##############################################################
                # Inference
                ##############################################################
                self.net.eval()
                with torch.no_grad():
                    inf_outs, env = net.inference(input_inds, input_lens, -1,
                                                  2.0, 0, None)

                # for j in range(len(env.scenes)):
                #     bar_scene = env.scenes[j]
                #     bar_inds = bar_scene['out_inds']
                #     if bar_inds[0][0] > self.cfg.EOS_idx:
                #         break
                # if bar_inds[0][0] <= self.cfg.EOS_idx:
                #     continue
                # pred_scene = deepcopy(bar_scene)
                pred_scene = env.scenes[0]

                ##############################################################
                # Evaluate
                ##############################################################
                pred_scores = env.evaluate_scene(pred_scene, gt_scene)
                scores.append(pred_scores)

                ##############################################################
                # Draw
                ##############################################################
                frame = env.batch_redraw(return_sequence=False)[0][0]

                ##############################################################
                # Save scene
                ##############################################################
                pred_inds = deepcopy(pred_scene['out_inds'])
                pred_inds = np.stack(pred_inds, 0)
                foo = test_db.output_inds_to_scene(pred_inds)
                out_scene = {}
                out_scene['boxes'] = [
                    x.tolist() for x in foo['boxes'].astype(np.float64)
                ]
                out_scene['clses'] = foo['clses'].astype(np.int64).tolist()
                out_scene['caption'] = entry['sentence']
                out_scene['width'] = int(gt_scene['width'])
                out_scene['height'] = int(gt_scene['height'])
                img_idx = int(gt_scene['img_idx'])
                out_scene['img_idx'] = img_idx
                scene_path = osp.join(
                    scene_dir, '%02d_' % G + str(img_idx).zfill(12) + '.json')
                img_path = osp.join(
                    img_dir, '%02d_' % G + str(img_idx).zfill(12) + '.jpg')
                cv2.imwrite(img_path, frame)
                with open(scene_path, 'w') as fp:
                    json.dump(out_scene, fp, indent=4, sort_keys=True)
                print(G, i, img_idx)

                # if len(scores) > 5:
                #     break

        scores = np.stack(scores, 0).astype(np.float64)
        infos = eval_info(self.cfg, scores)
        info_path = osp.join(out_dir, 'eval_info_top1.json')
        log_coco_scores(infos, info_path)

    def sample_demo(self, input_sentences):
        output_dir = osp.join(self.cfg.model_dir, 'bert_layout_samples')
        print(output_dir)
        maybe_create(output_dir)
        num_sents = len(input_sentences)
        ##############################################################
        # Main loop
        ##############################################################
        plt.switch_backend('agg')
        if self.cfg.cuda and self.cfg.parallel:
            net = self.net.module
        else:
            net = self.net
        for i in range(num_sents):
            # for i in range(5):
            sentence = input_sentences[i]
            ##############################################################
            # Inputs
            ##############################################################
            word_inds, word_lens = self.db.encode_sentence(sentence)
            input_inds_np = np.array(word_inds)
            input_lens_np = np.array(word_lens)
            input_inds = torch.from_numpy(input_inds_np).long().unsqueeze(0)
            input_lens = torch.from_numpy(input_lens_np).long().unsqueeze(0)
            if self.cfg.cuda:
                input_inds = input_inds.cuda()
                input_lens = input_lens.cuda()
            ##############################################################
            # Inference
            ##############################################################
            self.net.eval()
            with torch.no_grad():
                inf_outs, env = net.inference(input_inds, input_lens, -1, 2.0,
                                              0, None)
            frames = env.batch_redraw(return_sequence=True)[0]

            _, objs = torch.max(inf_outs[0], -1)
            objs = objs[0].cpu().data
            print('------------{}------------'.format(i))
            for k in range(len(objs)):
                if objs[k] <= self.cfg.EOS_idx:
                    break

                print(self.db.classes[objs[k]])
                print(inf_outs[1][0][k])
            # _, _, _, _, what_wei, where_wei = inf_outs
            # if self.cfg.what_attn:
            #     what_attn_words = self.decode_attention(
            #         input_inds_np, input_lens_np, what_wei.squeeze(0))
            # if self.cfg.where_attn > 0:
            #     where_attn_words = self.decode_attention(
            #         input_inds_np, input_lens_np, where_wei.squeeze(0))
            ##############################################################
            # Draw
            ##############################################################
            fig = plt.figure(figsize=(60, 40))
            plt.suptitle(sentence, fontsize=40)
            for j in range(frames.shape[0]):
                # subtitle = ''
                # if self.cfg.what_attn:
                #     subtitle = subtitle + ' '.join(what_attn_words[j])
                # if self.cfg.where_attn > 0:
                #     subtitle = subtitle + '\n' + ' '.join(where_attn_words[j])
                plt.subplot(4, 3, j + 1)
                # plt.title(subtitle, fontsize=30)
                plt.imshow(frames[j, :, :, ::-1])
                plt.axis('off')
            out_path = osp.join(output_dir, '%09d.png' % i)
            fig.savefig(out_path, bbox_inches='tight')
            plt.close(fig)

    def decode_attention(self, word_inds, word_lens, att_logits):
        _, att_inds = torch.topk(att_logits, 3, -1)
        att_inds = att_inds.cpu().data.numpy()

        if len(word_inds.shape) > 1:
            lin_inds = []
            for i in range(word_inds.shape[0]):
                lin_inds.extend(word_inds[i, :word_lens[i]].tolist())
            vlen = len(lin_inds)
            npad = self.cfg.max_input_length * 3 - vlen
            lin_inds = lin_inds + [0] * npad
            # print(lin_inds)
            lin_inds = np.array(lin_inds).astype(np.int32)
        else:
            lin_inds = word_inds.copy()

        slen, _ = att_inds.shape
        attn_words = []
        for i in range(slen):
            w_inds = [lin_inds[x] for x in att_inds[i]]
            w_strs = [self.db.lang_vocab.index2word[x] for x in w_inds]
            attn_words = attn_words + [w_strs]

        return attn_words

    def save_checkpoint(self, epoch, log):
        print(" [*] Saving checkpoints...")
        if self.cfg.cuda and self.cfg.parallel:
            net = self.net.module
        else:
            net = self.net
        checkpoint_dir = osp.join(self.cfg.model_dir, 'bert_layout_ckpts')
        if not osp.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        model_name = "ckpt-%03d-%.4f-%.4f.pkl" % (epoch, log[0], log[1])
        bert_name = "bert-ckpt-%03d-%.4f-%.4f.pkl" % (epoch, log[0], log[1])

        print('saving ckpt to ', checkpoint_dir)
        state = {
            'net': net.state_dict(),
            'optimizer': self.optimizer.optimizer.state_dict(),
            'epoch': epoch
        }
        # torch.save(net.state_dict(), osp.join(checkpoint_dir, model_name))
        torch.save(state, osp.join(checkpoint_dir, model_name))

        print('saving bert to ', checkpoint_dir)
        bert_state = {
            'net': net.text_encoder.embedding.state_dict(),
            'optimizer': self.bert_optimizer.state_dict(),
            'epoch': epoch
        }
        torch.save(bert_state, osp.join(checkpoint_dir, bert_name))