Example #1
def _validate(self):
    # Run inference over the whole validation set with gradients disabled
    self.model.eval()
    results = []
    for data in tqdm(self.val_dataset):
        img = data['img'].cuda().unsqueeze(0)
        img_meta = data['img_meta']
        with torch.no_grad():
            out = self.model(img)
            result = self.model.pred(out, img_meta, self.test_cfg, rescale=True)
        results.append(result)
    # Dump the predictions to a temporary COCO-format JSON and run COCO evaluation
    tmp_json_file = 'tmp.json'
    results2json(self.val_dataset, results, tmp_json_file)
    coco_eval(tmp_json_file, self.val_dataset.coco)
Example #2
def main(args):
    # tb_summary_writer = SummaryWriter(args.checkpoint_path)
    if args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        # torch.cuda.set_device(args.gpu)
        torch.backends.cudnn.benchmark = True

    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Image Preprocessing
    # For normalization, see https://github.com/pytorch/vision#models

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Load pretrained model or build from scratch
    adaptive = Encoder2Decoder(args, len(vocab))
    if torch.cuda.is_available():
        adaptive.cuda()
    # adaptive = Encoder2Decoder(args, len(vocab), args.gpu)
    if vars(args).get('start_from', None) is not None and os.path.isfile(args.start_from):
        adaptive.load_state_dict(torch.load(args.start_from))
    # cider_scores = []

    # Start Training
    # for epoch in range(start_epoch, args.num_epochs + 1):
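    # Evaluate the (optionally restored) model once on the test split and report its CIDEr score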

    cider, metrics = coco_eval(adaptive, args, 0, split='test')
    print('Testing Model: CIDEr score %.2f' % (cider))
Example #3
def main(args):

    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image Preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build training data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Load pretrained model or build from scratch
    adaptive = get_model(args, len(vocab))

    if args.pretrained:

        adaptive.load_state_dict(torch.load(args.pretrained))
        # Get starting epoch #, note that model is named as '...your path to model/algoname-epoch#.pkl'
        # A little messy here.
        start_epoch = int(
            args.pretrained.split('/')[-1].split('-')[1].split('.')[0]) + 1

    else:
        start_epoch = 1

    # Constructing CNN parameters for optimization, only fine-tuning higher layers
    cnn_params = adaptive.cnn_params(args.fine_tune_start_layer)
    cnn_optimizer = torch.optim.Adam(cnn_params,
                                     lr=args.learning_rate_cnn,
                                     betas=(args.alpha, args.beta))

    # RNN parameters
    params = adaptive.rnn_params()

    # Will decay later
    learning_rate = args.learning_rate

    # Language Modeling Loss
    if 'mean' in args.model:
        LMcriterion = CrossEntropyLoss_mean()
    else:
        LMcriterion = nn.CrossEntropyLoss()
    test_loss_fun = nn.CrossEntropyLoss()
    # Change to GPU mode if available
    if torch.cuda.is_available():
        adaptive.cuda()
        LMcriterion.cuda()
        test_loss_fun.cuda()

    # Train the Models
    total_step = len(data_loader)

    cider_scores = []
    best_cider = 0.0
    best_epoch = 0

    # Start Training
    for epoch in range(start_epoch, args.num_epochs + 1):

        # Start Learning Rate Decay
        if epoch > args.lr_decay:

            frac = float(epoch -
                         args.lr_decay) / args.learning_rate_decay_every
            decay_factor = math.pow(0.5, frac)

            # Decay the learning rate
            learning_rate = args.learning_rate * decay_factor

        print('Learning Rate for Epoch %d: %.6f' % (epoch, learning_rate))

        optimizer = torch.optim.Adam(params,
                                     lr=learning_rate,
                                     betas=(args.alpha, args.beta))
        # Language Modeling Training
        total_collect = 0  # running count of correctly predicted words this epoch

        print('------------------Training for Epoch %d----------------' % epoch)
        for i, (images_, captions_, lengths_, _, _) in enumerate(data_loader):
            # Set mini-batch dataset

            images = to_var(images_)
            captions = to_var(captions_)
            lengths = [cap_len - 1 for cap_len in lengths_]
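            # Targets: captions shifted left by one token, packed so that padding is ignored in the loss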
            targets = pack_padded_sequence(captions[:, 1:],
                                           lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            adaptive.train()
            adaptive.zero_grad()
            packed_scores = adaptive(images, captions, lengths)

            # Count the number of correctly predicted words in this batch
            pad = pad_packed_sequence(packed_scores, batch_first=True)
            num_correct = 0
            for ids in range(len(lengths)):
                cap_len = lengths[ids]
                pred = pad[0][ids][:cap_len].max(1)[1]
                ground = captions[ids][1:cap_len + 1]
                correct = np.sum(
                    pred.data.cpu().numpy() == ground.data.cpu().numpy())
                num_correct = num_correct + correct
            total_collect += num_correct
            correct_prop = float(num_correct) / sum(lengths)
            # Compute loss and backprop
            if 'mean' in args.model:
                loss = LMcriterion(packed_scores, targets, lengths)
            else:
                loss = LMcriterion(packed_scores[0], targets)
            loss.backward()

            test_loss = test_loss_fun(packed_scores[0], targets)
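            # Project-specific clipping helper (presumably clamps gradients or weights to +/- args.clip)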
            adaptive.clip_params(args.clip)

            optimizer.step()

            # Start CNN fine-tuning
            if epoch > args.cnn_epoch:

                cnn_optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], CrossEntropy Loss: %.4f, Perplexity: %5.4f\ncorrect_prop:%5.4f,test_loss:%.4f' % (
                    epoch, args.num_epochs, i, total_step, loss.data[0],
                    np.exp(loss.data[0]), correct_prop, test_loss.data[0]))

        # Save the Adaptive Attention model after each epoch
        torch.save(adaptive.state_dict(),
                   os.path.join(args.model_path, 'adaptive-%d.pkl' % (epoch)))

        # Evaluation on validation set
        eval_result = coco_eval(adaptive, args, epoch)
        eval_result['collect_num'] = total_collect
        cider = eval_result['CIDEr']
        cider_scores.append(cider)

        # Append this epoch's evaluation result to a per-model JSON score log
        result_path = 'results-score/' + args.model
        if os.path.exists(result_path):
            with open(result_path, 'r') as f:
                re = json.load(f)
        else:
            re = {}
        re['adaptive-%d.pkl' % epoch] = eval_result
        with open(result_path, 'w') as f:
            json.dump(re, f)

        if cider > best_cider:
            best_cider = cider
            best_epoch = epoch

        if len(cider_scores) > 5 and epoch > 20:

            last_6 = cider_scores[-6:]
            last_6_max = max(last_6)

            # Test if there is improvement, if not do early stopping
            if last_6_max != best_cider:

                print('No improvement with CIDEr in the last 6 epochs...Early stopping triggered.')
                print('Model of best epoch #: %d with CIDEr score %.2f' % (
                    best_epoch, best_cider))
                break
Example #4
def main(args):
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    transform = transforms.Compose([
        transforms.Resize(args.crop_size),
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    img_list = prepare_entry(args.train_dir, args.train_cap)
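    # Flatten all captions, build the vocabulary, and wrap the entries in a shuffling data loader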
    sentences = [c for img in img_list for c in img['cap']]
    vocab = build_dictionary(sentences,
                             threshold=args.threshold,
                             dict_path=args.dict_path,
                             override=False)
    train_set = ImageCaptionSet(img_list, vocab, transform, shuffle=True)
    train_loader = get_loader(train_set,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=2,
                              drop_last=True)

    num_words = vocab.ukn_id + 1  # vocabulary size (the unknown-token id is assumed to be the highest index)
    print('num_words:', num_words)
    model = CapGenerator(args.emb_dim, num_words, args.hidden_dim)

    if args.pretrained:
        model.load_state_dict(torch.load(args.pretrained))
        start_epoch = int(args.pretrained.split('/')[-1].split('_')[1]) + 1
    else:
        start_epoch = 1

    # Only the higher ResNet blocks are fine-tuned; collect their parameters for a separate optimizer
    cnn_blocks = list(
        model.encoder.resnet_conv.children())[args.fine_tune_start_layer:]
    cnn_params = [list(sub_module.parameters()) for sub_module in cnn_blocks]
    cnn_params = [item for sublist in cnn_params for item in sublist]
    cnn_optimizer = torch.optim.Adam(cnn_params,
                                     lr=args.lr_cnn,
                                     betas=(args.alpha, args.beta))

    other_params = (list(model.encoder.ai2v.parameters()) +
                    list(model.encoder.ag2v.parameters()) +
                    list(model.decoder.parameters()))

    lr = args.lr
    criterion = nn.CrossEntropyLoss().cuda()
    model.cuda()
    iter_size = len(train_loader)
    #val_iter = len(val_loader)
    cider_scores = []
    best_cider = 0.0
    best_epoch = 0
    print('ITER size: {}'.format(iter_size))
    for epoch in range(start_epoch, args.num_epochs + 1):
        if train_set.shuffle:
            np.random.shuffle(train_set.entries)
            print('shuffle train dataset')
        if epoch > args.lr_decay_start:
            frac = float(epoch - args.lr_decay_start) / args.lr_decay_ratio
            decay_fac = np.power(0.5, frac)
            lr = lr * decay_fac

        print('learning rate for Epoch {}: {:.3e}'.format(epoch, lr))
        optimizer = torch.optim.Adam(other_params,
                                     lr=lr,
                                     betas=(args.alpha, args.beta))
        model.train()
        for i, data in enumerate(train_loader):
            inputs, _, caps, last_pos = data
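            # last_pos: per-sample caption end positions, used as lengths for sequence packing below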
            inputs, caps = Variable(inputs).cuda(), Variable(caps).cuda()
            lstm_steps = max(last_pos)
            #targets = pack_padded_sequence(caps, last_pos, batch_first=True)
            model.zero_grad()
            packed_scores = model(inputs, caps, last_pos)
            targets = pack_padded_sequence(caps[:, 1:],
                                           last_pos,
                                           batch_first=True)
            #print(caps.shape, caps[:, 1:].shape, last_pos)
            loss = criterion(packed_scores[0], targets[0])
            loss.backward()

            # Clamp the decoder LSTM weights to [-args.clip, args.clip]
            for p in model.decoder.LSTM.parameters():
                p.data.clamp_(-args.clip, args.clip)

            optimizer.step()

            cnn_lr = args.lr_cnn
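            # Note: the CNN optimizer is re-created on every iteration once fine-tuning starts (epoch > cnn_epoch)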
            if epoch > args.cnn_epoch:
                #cnn_lr = cnn_lr * decay_fac
                cnn_optimizer = torch.optim.Adam(cnn_params,
                                                 lr=cnn_lr,
                                                 betas=(args.alpha, args.beta))
                cnn_optimizer.step()

            scores = pad_packed_sequence(packed_scores, batch_first=True)[0]
            last = scores[-1]
            last_ind = list(last.max(1)[1].data)
            last_truth = list(caps[-1, 1:].data)
            print(
                'TRAIN ITER: {} / {}, lstm_steps:{}, loss: {:.4f},Perplexity:{}\r'
                .format(i, iter_size, lstm_steps, loss.data[0],
                        np.exp(loss.data[0])),
                end="")
        print("\n", end="")
        if epoch % args.save_freq == args.save_freq - 1:
            name = os.path.join(args.model_dir, 'epoch_{}'.format(epoch))
            torch.save(model.state_dict(), name)

        # Decode and print the prediction vs. ground truth for the last sample of the final batch
        # (token id 0 is treated as the end-of-sentence marker)
        scores = pad_packed_sequence(packed_scores, batch_first=True)[0]
        last = scores[-1]
        last_ind = list(last.max(1)[1].data)
        last_truth = list(caps[-1, 1:].data)
        print(last_truth, last_pos[-1])

        print('pred: ', end="")
        for ix in last_ind:
            print(vocab.ix2word(ix), end="")
            if ix == 0:
                print("")
                break
            print(' ', end="")
        if ix != 0:
            print("\b.")
        print('truth: ', end="")
        for ix in last_truth:
            print(vocab.ix2word(ix), end="")
            if ix == 0:
                print("")
                break
            print(' ', end="")
        if ix != 0:
            print("\b.")

        #cider scores
        cider = coco_eval(model, args, epoch)
        cider_scores.append(cider)

        if cider > best_cider:
            best_cider = cider
            best_epoch = epoch

        if len(cider_scores) > 5:
            last_6 = np.array(cider_scores[-6:])
            if max(last_6) < best_cider:
                print(
                    'No improvement with CIDEr in the last 6 epochs...Early stopping triggered.'
                )
                print('Model of best epoch #: %d with CIDEr score %.2f' %
                      (best_epoch, best_cider))
                break

    torch.save(model.state_dict(), os.path.join(args.model_dir,
                                                'trained_model'))
Example #5
File: train.py Project: zzy-ucas/LAM
def main(args):
    args.checkpoint_path = os.path.join(
        'log_' + args.dataset + '_' + args.pattern, args.session)
    tb_summary_writer = SummaryWriter(args.checkpoint_path)
    if args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        # torch.cuda.set_device(args.gpu)
        torch.backends.cudnn.benchmark = True

    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        print('### CUDA is available!')
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.checkpoint_path):
        os.makedirs(args.checkpoint_path)
    if not os.path.exists('data'):
        os.mkdir('data')

    # Image Preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
        args.vocab = vocab

    # Build training data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Load pretrained model or build from scratch
    adaptive = Encoder2Decoder(args, len(vocab))
    # adaptive = Encoder2Decoder(args, len(vocab), args.gpu)

    # Restore training metadata (epoch, vocab, args) and model weights when resuming from a checkpoint
    infos = {}
    if args.start_from is not None:
        with open(
                os.path.join(args.start_from,
                             'infos_' + args.dataset + '.pkl'), 'rb') as f:
            infos = cPickle.load(f)
            # saved_model_opt = infos['args']
            # need_be_same = ["caption_model", "rnn_type", "rnn_size", "num_layers"]
            # for checkme in need_be_same:
            #     assert vars(saved_model_opt)[checkme] == vars(args)[
            #         checkme], "Command line argument and saved model disagree on '%s' " % checkme
    if vars(args).get('start_from', None) is not None and os.path.isfile(
            os.path.join(args.start_from, "model.pth")):
        adaptive.load_state_dict(
            torch.load(os.path.join(args.start_from, 'model.pth')))

    epoch = infos.get('epoch', 1)

    # Constructing CNN parameters for optimization, only fine-tuning higher layers
    cnn_subs = list(adaptive.encoder.vgg_conv.children())
    cnn_params = [list(sub_module.parameters()) for sub_module in cnn_subs]
    cnn_params = [item for sublist in cnn_params for item in sublist]

    cnn_optimizer = torch.optim.Adam(cnn_params,
                                     lr=args.learning_rate_cnn,
                                     betas=(args.alpha, args.beta))
    if vars(args).get('start_from', None) is not None and os.path.isfile(
            os.path.join(args.start_from, "cnn_optimizer.pth")):
        cnn_optimizer.load_state_dict(
            torch.load(os.path.join(args.start_from, 'cnn_optimizer.pth')))

    # Other parameter optimization
    params = list(adaptive.decoder.parameters())

    # Will decay later
    learning_rate = args.learning_rate

    # Language Modeling Loss, Optimizers
    LMcriterion = nn.CrossEntropyLoss()

    # Change to GPU mode if available
    if torch.cuda.is_available():
        adaptive.cuda()
        LMcriterion.cuda()

    # Train the Models
    total_step = len(data_loader)

    cider_scores = []
    best_cider = 0.0
    best_epoch = 0
    best_cider_test = 0.0
    best_epoch_test = 0
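    # Adam over the decoder parameters; its state is restored below when resuming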
    optimizer = torch.optim.Adam(params, lr=learning_rate)
    if vars(args).get('start_from', None) is not None and os.path.isfile(
            os.path.join(args.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(args.start_from, 'optimizer.pth')))

    # Start Training
    # for epoch in range(start_epoch, args.num_epochs + 1):
    update_lr_flag = True
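    # Train indefinitely; the loop ends once the epoch counter passes 80 (see the bottom of the loop)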
    while True:
        if update_lr_flag:
            if epoch > args.lr_decay:
                frac = (epoch -
                        args.cnn_epoch) / args.learning_rate_decay_every
                decay_factor = math.pow(0.5, frac)

                # Decay the learning rate
                learning_rate = learning_rate * decay_factor
                for group in optimizer.param_groups:
                    group['lr'] = learning_rate
            update_lr_flag = False
        # Language Modeling Training
        print('------------------Training for Epoch %d----------------' %
              (epoch))
        cur_time = time.time()
        for i, (images, captions, lengths) in enumerate(data_loader):
            start_time = time.time()
            # print('### images:', images.size())
            # print('### captions:', captions.size())
            # print('### lengths:', len(lengths))
            # Set mini-batch dataset
            images = to_var(images)
            captions = to_var(captions)
            lengths = [cap_len - 1 for cap_len in lengths]
            targets = pack_padded_sequence(captions[:, 1:],
                                           lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            adaptive.train()
            adaptive.zero_grad()

            packed_scores = adaptive(images, captions, lengths, args.pattern)

            # Compute loss and backprop
            loss = LMcriterion(packed_scores[0], targets)
            loss.backward()

            # Gradient clipping for gradient exploding problem in LSTM
            for p in adaptive.decoder.lstm_cell.parameters():
                p.data.clamp_(-args.clip, args.clip)

            optimizer.step()

            # Start learning rate decay

            # Start CNN fine-tuning
            if epoch > args.cnn_epoch:
                cnn_optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], CrossEntropy Loss: %.4f, Perplexity: %5.4f, Elapsed: %.2fs' % \
                      (epoch, args.num_epochs,
                       i, total_step,
                       loss.item(),
                       np.exp(loss.item()),
                       time.time() - start_time))

                add_summary_value(tb_summary_writer, 'loss', loss.item(),
                                  epoch)
        print('##### Per Epoch Cost time: %.2fs' % (time.time() - cur_time))
        infos['epoch'] = epoch
        infos['vocab'] = vocab
        infos['args'] = args
        with open(os.path.join(args.checkpoint_path, 'infos.pkl'), 'wb') as f:
            cPickle.dump(infos, f)
        torch.save(optimizer.state_dict(),
                   os.path.join(args.checkpoint_path, 'optimizer.pth'))
        torch.save(cnn_optimizer.state_dict(),
                   os.path.join(args.checkpoint_path, 'cnn_optimizer.pth'))
        torch.save(adaptive.state_dict(),
                   os.path.join(args.checkpoint_path, 'model.pth'))
        # with open(os.path.join(args.checkpoint_path, 'histories.pkl'), 'wb') as f:
        #     cPickle.dump(infos, f)
        # Evaluation on validation set
        cider, metrics = coco_eval(adaptive, args, epoch, split='val')
        cider_scores.append(cider)
        add_summary_dict(tb_summary_writer, 'metrics', metrics, epoch)

        if cider > best_cider:
            best_cider = cider
            best_epoch = epoch

            # Save the Adaptive Attention model after each epoch
            # name = str(args.yml).split('.')[0].split('/')[-1]
            torch.save(adaptive.state_dict(),
                       os.path.join(args.checkpoint_path, 'model-best.pkl'))
            with open(os.path.join(args.checkpoint_path, 'infos-best.pkl'),
                      'wb') as f:
                cPickle.dump(infos, f)
        print('Model of best epoch #: %d with CIDEr score %.2f' %
              (best_epoch, best_cider))

        # Test on test set
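        # Temporarily point caption_val_path at the test annotations, then restore it afterwards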
        caption_val_path = args.caption_val_path
        args.caption_val_path = args.caption_val_path.replace('val', 'test')
        cider_test, metrics_test = coco_eval(adaptive,
                                             args,
                                             epoch,
                                             split='test')
        args.caption_val_path = caption_val_path
        if cider_test > best_cider_test:
            best_cider_test = cider_test
            best_epoch_test = epoch
        print('Test Phase: Model of best epoch #: %d with CIDEr score %.2f' %
              (best_epoch_test, best_cider_test))

        epoch += 1
        if epoch > 80:
            break
Example #6
def main(args):

    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image Preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build training data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Load pretrained model or build from scratch
    adaptive = Encoder2Decoder(args.embed_size, len(vocab), args.hidden_size)

    if args.pretrained:

        adaptive.load_state_dict(torch.load(args.pretrained))
        # Get starting epoch #, note that model is named as '...your path to model/algoname-epoch#.pkl'
        # A little messy here.
        start_epoch = int(
            args.pretrained.split('/')[-1].split('-')[1].split('.')[0]) + 1

    else:
        start_epoch = 1

    # Constructing CNN parameters for optimization, only fine-tuning higher layers
    cnn_subs = list(
        adaptive.encoder.resnet_conv.children())[args.fine_tune_start_layer:]
    cnn_params = [list(sub_module.parameters()) for sub_module in cnn_subs]
    cnn_params = [item for sublist in cnn_params for item in sublist]

    cnn_optimizer = torch.optim.Adam(cnn_params,
                                     lr=args.learning_rate_cnn,
                                     betas=(args.alpha, args.beta))

    # Other parameter optimization
    params = (list(adaptive.encoder.affine_a.parameters()) +
              list(adaptive.encoder.affine_b.parameters()) +
              list(adaptive.decoder.parameters()))

    # Will decay later
    learning_rate = args.learning_rate

    # Language Modeling Loss, Optimizers
    LMcriterion = nn.CrossEntropyLoss()

    # Change to GPU mode if available
    if torch.cuda.is_available():
        adaptive.cuda()
        LMcriterion.cuda()

    # Train the Models
    total_step = len(data_loader)

    cider_scores = []
    best_cider = 0.0
    best_epoch = 0

    # Start Training
    for epoch in range(start_epoch, args.num_epochs + 1):
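        # Re-create the optimizer each epoch with the current (possibly decayed) learning rate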

        optimizer = torch.optim.Adam(params, lr=learning_rate)

        # Language Modeling Training
        print('------------------Training for Epoch %d----------------' % epoch)
        for i, (images, captions, lengths, _) in enumerate(data_loader):

            # Set mini-batch dataset
            images = to_var(images)
            captions = to_var(captions)
            lengths = [cap_len - 1 for cap_len in lengths]
            targets = pack_padded_sequence(captions[:, 1:],
                                           lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            adaptive.train()
            adaptive.zero_grad()

            packed_scores = adaptive(images, captions, lengths)

            # Compute loss and backprop
            loss = LMcriterion(packed_scores[0], targets)
            loss.backward()

            # Gradient clipping for gradient exploding problem in LSTM
            for p in adaptive.decoder.LSTM.parameters():
                p.data.clamp_(-args.clip, args.clip)

            optimizer.step()

            # Start learning rate decay
            if epoch > args.lr_decay:

                frac = (epoch -
                        args.cnn_epoch) / args.learning_rate_decay_every
                decay_factor = math.pow(0.5, frac)

                # Decay the learning rate
                learning_rate = learning_rate * decay_factor

            # Start CNN fine-tuning
            if epoch > args.cnn_epoch:

                cnn_optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], CrossEntropy Loss: %.4f, Perplexity: %5.4f' % (
                    epoch, args.num_epochs, i, total_step, loss.data[0],
                    np.exp(loss.data[0])))

        # Save the Adaptive Attention model after each epoch
        torch.save(adaptive.state_dict(),
                   os.path.join(args.model_path, 'adaptive-%d.pkl' % (epoch)))

        # Evaluation on validation set
        cider = coco_eval(adaptive, args, epoch)
        cider_scores.append(cider)

        if cider > best_cider:
            best_cider = cider
            best_epoch = epoch

        if len(cider_scores) > 5:

            last_6 = cider_scores[-6:]
            last_6_max = max(last_6)

            # Test if there is improvement, if not do early stopping
            if last_6_max != best_cider:

                print('No improvement with CIDEr in the last 6 epochs...Early stopping triggered.')
                print('Model of best epoch #: %d with CIDEr score %.2f' % (
                    best_epoch, best_cider))
                break