def _validate(self):
    self.model.eval()
    results = []
    for data in tqdm(self.val_dataset):
        img = data['img'].cuda().unsqueeze(0)
        img_meta = data['img_meta']
        with torch.no_grad():
            out = self.model(img)
            result = self.model.pred(out, img_meta, self.test_cfg, rescale=True)
        results.append(result)
    tmp_json_file = 'tmp.json'
    results2json(self.val_dataset, results, tmp_json_file)
    coco_eval(tmp_json_file, self.val_dataset.coco)
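# Hedged usage note (not part of the original code): _validate is written as a
# method, so a hypothetical training runner could call it periodically from its
# epoch loop, e.g.
#
#     if (epoch + 1) % self.val_interval == 0:   # val_interval is an assumed attribute
#         self._validate()
#
# The surrounding runner class is not shown in this section.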
def main(args):
    # tb_summary_writer = SummaryWriter(args.checkpoint_path)
    if args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        # torch.cuda.set_device(args.gpu)
    torch.backends.cudnn.benchmark = True

    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Image Preprocessing
    # For normalization, see https://github.com/pytorch/vision#models

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Load pretrained model or build from scratch
    adaptive = Encoder2Decoder(args, len(vocab))
    if torch.cuda.is_available():
        adaptive.cuda()
    # adaptive = Encoder2Decoder(args, len(vocab), args.gpu)

    if vars(args).get('start_from', None) is not None and os.path.isfile(args.start_from):
        adaptive.load_state_dict(torch.load(args.start_from))

    # cider_scores = []
    # Start Training
    # for epoch in range(start_epoch, args.num_epochs + 1):
    cider, metrics = coco_eval(adaptive, args, 0, split='test')
    print('Testing Model: CIDEr score %.2f' % cider)
def main(args):
    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image Preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build training data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Load pretrained model or build from scratch
    adaptive = get_model(args, len(vocab))

    if args.pretrained:
        adaptive.load_state_dict(torch.load(args.pretrained))
        # Get starting epoch; the model is saved as '...your path to model/algoname-epoch#.pkl'
        start_epoch = int(args.pretrained.split('/')[-1].split('-')[1].split('.')[0]) + 1
    else:
        start_epoch = 1

    # Constructing CNN parameters for optimization, only fine-tuning higher layers
    cnn_params = adaptive.cnn_params(args.fine_tune_start_layer)
    cnn_optimizer = torch.optim.Adam(cnn_params, lr=args.learning_rate_cnn,
                                     betas=(args.alpha, args.beta))

    # RNN parameters
    params = adaptive.rnn_params()

    # Will decay later
    learning_rate = args.learning_rate

    # Language Modeling Loss
    if 'mean' in args.model:
        LMcriterion = CrossEntropyLoss_mean()
    else:
        LMcriterion = nn.CrossEntropyLoss()
    test_loss_fun = nn.CrossEntropyLoss()

    # Change to GPU mode if available
    if torch.cuda.is_available():
        adaptive.cuda()
        LMcriterion.cuda()
        test_loss_fun.cuda()

    # Train the Models
    total_step = len(data_loader)

    cider_scores = []
    best_cider = 0.0
    best_epoch = 0

    # Start Training
    for epoch in range(start_epoch, args.num_epochs + 1):

        # Start Learning Rate Decay
        if epoch > args.lr_decay:
            frac = float(epoch - args.lr_decay) / args.learning_rate_decay_every
            decay_factor = math.pow(0.5, frac)
            # Decay the learning rate
            learning_rate = args.learning_rate * decay_factor

        print('Learning Rate for Epoch %d: %.6f' % (epoch, learning_rate))

        optimizer = torch.optim.Adam(params, lr=learning_rate,
                                     betas=(args.alpha, args.beta))

        # Language Modeling Training
        total_collect = 0
        print('------------------Training for Epoch %d----------------' % epoch)
        for i, (images_, captions_, lengths_, _, _) in enumerate(data_loader):

            # Set mini-batch dataset
            images = to_var(images_)
            captions = to_var(captions_)
            lengths = [cap_len - 1 for cap_len in lengths_]
            targets = pack_padded_sequence(captions[:, 1:], lengths, batch_first=True)[0]

            # Forward, Backward and Optimize
            adaptive.train()
            adaptive.zero_grad()

            packed_scores = adaptive(images, captions, lengths)

            # Compute the number of correctly predicted words
            pad = pad_packed_sequence(packed_scores, batch_first=True)
            num_correct = 0
            for ids in range(len(lengths)):
                cap_len = lengths[ids]
                pred = pad[0][ids][:cap_len].max(1)[1]
                ground = captions[ids][1:cap_len + 1]
                correct = np.sum(pred.data.cpu().numpy() == ground.data.cpu().numpy())
                num_correct = num_correct + correct
            total_collect += num_correct
            correct_prop = float(num_correct) / sum(lengths)

            # Compute loss and backprop
            if 'mean' in args.model:
                loss = LMcriterion(packed_scores, targets, lengths)
            else:
                loss = LMcriterion(packed_scores[0], targets)
            loss.backward()
            test_loss = test_loss_fun(packed_scores[0], targets)

            adaptive.clip_params(args.clip)
            optimizer.step()

            # Start CNN fine-tuning
            if epoch > args.cnn_epoch:
                cnn_optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], CrossEntropy Loss: %.4f, '
                      'Perplexity: %5.4f\ncorrect_prop: %5.4f, test_loss: %.4f' % (
                          epoch, args.num_epochs, i, total_step,
                          loss.data[0], np.exp(loss.data[0]), correct_prop, test_loss))

        # Save the Adaptive Attention model after each epoch
        torch.save(adaptive.state_dict(),
                   os.path.join(args.model_path, 'adaptive-%d.pkl' % epoch))

        # Evaluation on validation set
        eval_result = coco_eval(adaptive, args, epoch)
        eval_result['collect_num'] = total_collect
        cider = eval_result['CIDEr']
        cider_scores.append(cider)

        # Record per-epoch evaluation results in a JSON file
        result_path = 'results-score/' + args.model
        if not os.path.exists(result_path):
            re = {}
        else:
            re = json.load(open(result_path, 'r'))
        re['adaptive-%d.pkl' % epoch] = eval_result
        json.dump(re, open(result_path, 'w'))

        if cider > best_cider:
            best_cider = cider
            best_epoch = epoch

        if len(cider_scores) > 5 and epoch > 20:
            last_6 = cider_scores[-6:]
            last_6_max = max(last_6)
            # Test if there is improvement; if not, do early stopping
            if last_6_max != best_cider:
                print('No improvement with CIDEr in the last 6 epochs...Early stopping triggered.')
                print('Model of best epoch #: %d with CIDEr score %.2f' % (best_epoch, best_cider))
                break
def main(args):
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    transform = transforms.Compose([
        transforms.Resize(args.crop_size),
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    img_list = prepare_entry(args.train_dir, args.train_cap)
    sentences = [c for img in img_list for c in img['cap']]
    vocab = build_dictionary(sentences, threshold=args.threshold,
                             dict_path=args.dict_path, override=False)

    train_set = ImageCaptionSet(img_list, vocab, transform, shuffle=True)
    train_loader = get_loader(train_set, batch_size=args.batch_size,
                              shuffle=True, num_workers=2, drop_last=True)

    num_words = vocab.ukn_id + 1
    print('num_words:', num_words)

    model = CapGenerator(args.emb_dim, num_words, args.hidden_dim)
    if args.pretrained:
        model.load_state_dict(torch.load(args.pretrained))
        start_epoch = int(args.pretrained.split('/')[-1].split('_')[1]) + 1
    else:
        start_epoch = 1

    # CNN parameters for optimization; only higher layers are fine-tuned
    cnn_blocks = list(model.encoder.resnet_conv.children())[args.fine_tune_start_layer:]
    cnn_params = [list(sub_module.parameters()) for sub_module in cnn_blocks]
    cnn_params = [item for sublist in cnn_params for item in sublist]
    cnn_optimizer = torch.optim.Adam(cnn_params, lr=args.lr_cnn,
                                     betas=(args.alpha, args.beta))

    other_params = (list(model.encoder.ai2v.parameters()) +
                    list(model.encoder.ag2v.parameters()) +
                    list(model.decoder.parameters()))
    lr = args.lr

    criterion = nn.CrossEntropyLoss().cuda()
    model.cuda()

    iter_size = len(train_loader)
    # val_iter = len(val_loader)

    cider_scores = []
    best_cider = 0.0
    best_epoch = 0
    print('ITER size: {}'.format(iter_size))

    for epoch in range(start_epoch, args.num_epochs + 1):
        if train_set.shuffle:
            np.random.shuffle(train_set.entries)
            print('shuffle train dataset')

        if epoch > args.lr_decay_start:
            frac = float(epoch - args.lr_decay_start) / args.lr_decay_ratio
            decay_fac = np.power(0.5, frac)
            lr = lr * decay_fac
        print('learning rate for Epoch {}: {:.3e}'.format(epoch, lr))

        optimizer = torch.optim.Adam(other_params, lr=lr,
                                     betas=(args.alpha, args.beta))

        model.train()
        for i, data in enumerate(train_loader):
            inputs, _, caps, last_pos = data
            inputs, caps = Variable(inputs).cuda(), Variable(caps).cuda()
            lstm_steps = max(last_pos)
            # targets = pack_padded_sequence(caps, last_pos, batch_first=True)

            model.zero_grad()
            packed_scores = model(inputs, caps, last_pos)
            targets = pack_padded_sequence(caps[:, 1:], last_pos, batch_first=True)
            # print(caps.shape, caps[:, 1:].shape, last_pos)
            loss = criterion(packed_scores[0], targets[0])
            loss.backward()

            # Gradient clipping on the decoder LSTM parameters
            for p in model.decoder.LSTM.parameters():
                p.data.clamp_(-args.clip, args.clip)
            optimizer.step()

            cnn_lr = args.lr_cnn
            if epoch > args.cnn_epoch:
                # cnn_lr = cnn_lr * decay_fac
                cnn_optimizer = torch.optim.Adam(cnn_params, lr=cnn_lr,
                                                 betas=(args.alpha, args.beta))
                cnn_optimizer.step()

            scores = pad_packed_sequence(packed_scores, batch_first=True)[0]
            last = scores[-1]
            last_ind = list(last.max(1)[1].data)
            last_truth = list(caps[-1, 1:].data)

            print('TRAIN ITER: {} / {}, lstm_steps: {}, loss: {:.4f}, Perplexity: {}\r'
                  .format(i, iter_size, lstm_steps, loss.data[0], np.exp(loss.data[0])),
                  end="")
        print("\n", end="")

        if epoch % args.save_freq == args.save_freq - 1:
            name = os.path.join(args.model_dir, 'epoch_{}'.format(epoch))
            torch.save(model.state_dict(), name)

        # Print the last sample's prediction and ground truth as a quick sanity check
        scores = pad_packed_sequence(packed_scores, batch_first=True)[0]
        last = scores[-1]
        last_ind = list(last.max(1)[1].data)
        last_truth = list(caps[-1, 1:].data)
        print(last_truth, last_pos[-1])
        print('pred: ', end="")
        for ix in last_ind:
            print(vocab.ix2word(ix), end="")
            if ix == 0:
                print("")
                break
            print(' ', end="")
        if ix != 0:
            print("\b.")
        print('truth: ', end="")
        for ix in last_truth:
            print(vocab.ix2word(ix), end="")
            if ix == 0:
                print("")
                break
            print(' ', end="")
        if ix != 0:
            print("\b.")

        # CIDEr scores
        cider = coco_eval(model, args, epoch)
        cider_scores.append(cider)

        if cider > best_cider:
            best_cider = cider
            best_epoch = epoch

        if len(cider_scores) > 5:
            last_6 = np.array(cider_scores[-6:])
            if max(last_6) < best_cider:
                print('No improvement with CIDEr in the last 6 epochs...Early stopping triggered.')
                print('Model of best epoch #: %d with CIDEr score %.2f' % (best_epoch, best_cider))
                break

    torch.save(model.state_dict(), os.path.join(args.model_dir, 'trained_model'))
def main(args):
    args.checkpoint_path = os.path.join('log_' + args.dataset + '_' + args.pattern, args.session)
    tb_summary_writer = SummaryWriter(args.checkpoint_path)

    if args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        # torch.cuda.set_device(args.gpu)
    torch.backends.cudnn.benchmark = True

    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        print('### CUDA is available!')
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.checkpoint_path):
        os.makedirs(args.checkpoint_path)
    if not os.path.exists('data'):
        os.mkdir('data')

    # Image Preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    args.vocab = vocab

    # Build training data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Load pretrained model or build from scratch
    adaptive = Encoder2Decoder(args, len(vocab))
    # adaptive = Encoder2Decoder(args, len(vocab), args.gpu)

    infos = {}
    if args.start_from is not None:
        with open(os.path.join(args.start_from, 'infos_' + args.dataset + '.pkl')) as f:
            infos = cPickle.load(f)
            # saved_model_opt = infos['args']
            # need_be_same = ["caption_model", "rnn_type", "rnn_size", "num_layers"]
            # for checkme in need_be_same:
            #     assert vars(saved_model_opt)[checkme] == vars(args)[checkme], \
            #         "Command line argument and saved model disagree on '%s'" % checkme

    if vars(args).get('start_from', None) is not None and os.path.isfile(
            os.path.join(args.start_from, "model.pth")):
        adaptive.load_state_dict(torch.load(os.path.join(args.start_from, 'model.pth')))

    epoch = infos.get('epoch', 1)

    # Constructing CNN parameters for optimization, only fine-tuning higher layers
    cnn_subs = list(adaptive.encoder.vgg_conv.children())
    cnn_params = [list(sub_module.parameters()) for sub_module in cnn_subs]
    cnn_params = [item for sublist in cnn_params for item in sublist]
    cnn_optimizer = torch.optim.Adam(cnn_params, lr=args.learning_rate_cnn,
                                     betas=(args.alpha, args.beta))
    if vars(args).get('start_from', None) is not None and os.path.isfile(
            os.path.join(args.start_from, "cnn_optimizer.pth")):
        cnn_optimizer.load_state_dict(
            torch.load(os.path.join(args.start_from, 'cnn_optimizer.pth')))

    # Other parameter optimization
    params = list(adaptive.decoder.parameters())

    # Will decay later
    learning_rate = args.learning_rate

    # Language Modeling Loss, Optimizers
    LMcriterion = nn.CrossEntropyLoss()

    # Change to GPU mode if available
    if torch.cuda.is_available():
        adaptive.cuda()
        LMcriterion.cuda()

    # Train the Models
    total_step = len(data_loader)

    cider_scores = []
    best_cider = 0.0
    best_epoch = 0
    best_cider_test = 0.0
    best_epoch_test = 0

    optimizer = torch.optim.Adam(params, lr=learning_rate)
    if vars(args).get('start_from', None) is not None and os.path.isfile(
            os.path.join(args.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(args.start_from, 'optimizer.pth')))

    # Start Training
    # for epoch in range(start_epoch, args.num_epochs + 1):
    update_lr_flag = True
    while True:
        if update_lr_flag:
            if epoch > args.lr_decay:
                frac = (epoch - args.cnn_epoch) / args.learning_rate_decay_every
                decay_factor = math.pow(0.5, frac)
                # Decay the learning rate
                learning_rate = learning_rate * decay_factor
                for group in optimizer.param_groups:
                    group['lr'] = learning_rate
            update_lr_flag = False

        # Language Modeling Training
        print('------------------Training for Epoch %d----------------' % epoch)

        cur_time = time.time()
        for i, (images, captions, lengths) in enumerate(data_loader):
            start_time = time.time()
            # print('### images:', images.size())
            # print('### captions:', captions.size())
            # print('### lengths:', len(lengths))

            # Set mini-batch dataset
            images = to_var(images)
            captions = to_var(captions)
            lengths = [cap_len - 1 for cap_len in lengths]
            targets = pack_padded_sequence(captions[:, 1:], lengths, batch_first=True)[0]

            # Forward, Backward and Optimize
            adaptive.train()
            adaptive.zero_grad()

            packed_scores = adaptive(images, captions, lengths, args.pattern)

            # Compute loss and backprop
            loss = LMcriterion(packed_scores[0], targets)
            loss.backward()

            # Gradient clipping for gradient exploding problem in LSTM
            for p in adaptive.decoder.lstm_cell.parameters():
                p.data.clamp_(-args.clip, args.clip)

            optimizer.step()

            # Start CNN fine-tuning
            if epoch > args.cnn_epoch:
                cnn_optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], CrossEntropy Loss: %.4f, '
                      'Perplexity: %5.4f, Elapsed: %.2fs' %
                      (epoch, args.num_epochs, i, total_step,
                       loss.item(), np.exp(loss.item()), time.time() - start_time))
                add_summary_value(tb_summary_writer, 'loss', loss.item(), epoch)

        print('##### Per Epoch Cost time: %.2fs' % (time.time() - cur_time))

        # Save checkpoint state after each epoch
        infos['epoch'] = epoch
        infos['vocab'] = vocab
        infos['args'] = args
        with open(os.path.join(args.checkpoint_path, 'infos.pkl'), 'wb') as f:
            cPickle.dump(infos, f)
        torch.save(optimizer.state_dict(),
                   os.path.join(args.checkpoint_path, 'optimizer.pth'))
        torch.save(cnn_optimizer.state_dict(),
                   os.path.join(args.checkpoint_path, 'cnn_optimizer.pth'))
        torch.save(adaptive.state_dict(),
                   os.path.join(args.checkpoint_path, 'model.pkl'))
        # with open(os.path.join(args.checkpoint_path, 'histories.pkl'), 'wb') as f:
        #     cPickle.dump(infos, f)

        # Evaluation on validation set
        cider, metrics = coco_eval(adaptive, args, epoch, split='val')
        cider_scores.append(cider)
        add_summary_dict(tb_summary_writer, 'metrics', metrics, epoch)

        if cider > best_cider:
            best_cider = cider
            best_epoch = epoch
            # Save the best model so far
            # name = str(args.yml).split('.')[0].split('/')[-1]
            torch.save(adaptive.state_dict(),
                       os.path.join(args.checkpoint_path, 'model-best.pkl'))
            with open(os.path.join(args.checkpoint_path, 'infos-best.pkl'), 'wb') as f:
                cPickle.dump(infos, f)
        print('Model of best epoch #: %d with CIDEr score %.2f' % (best_epoch, best_cider))

        # Test on test set
        caption_val_path = args.caption_val_path
        args.caption_val_path = args.caption_val_path.replace('val', 'test')
        cider_test, metrics_test = coco_eval(adaptive, args, epoch, split='test')
        args.caption_val_path = caption_val_path
        if cider_test > best_cider_test:
            best_cider_test = cider_test
            best_epoch_test = epoch
        print('Test Phase: Model of best epoch #: %d with CIDEr score %.2f' %
              (best_epoch_test, best_cider_test))

        epoch += 1
        if epoch > 80:
            break
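# Hedged sketch (not from the original code) of the TensorBoard logging helpers
# used above. Assumption: add_summary_value and add_summary_dict are thin
# wrappers over SummaryWriter.add_scalar from tensorboardX; their original
# definitions are not shown in this section.
def add_summary_value(writer, key, value, iteration):
    # Log a single scalar under `key` at the given iteration/epoch.
    if writer is not None:
        writer.add_scalar(key, value, iteration)


def add_summary_dict(writer, prefix, metrics, iteration):
    # Log every metric in a dict, namespaced under `prefix`.
    if writer is not None:
        for name, value in metrics.items():
            writer.add_scalar('{}/{}'.format(prefix, name), value, iteration)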
def main(args):
    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image Preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build training data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Load pretrained model or build from scratch
    adaptive = Encoder2Decoder(args.embed_size, len(vocab), args.hidden_size)

    if args.pretrained:
        adaptive.load_state_dict(torch.load(args.pretrained))
        # Get starting epoch; the model is saved as '...your path to model/algoname-epoch#.pkl'
        start_epoch = int(args.pretrained.split('/')[-1].split('-')[1].split('.')[0]) + 1
    else:
        start_epoch = 1

    # Constructing CNN parameters for optimization, only fine-tuning higher layers
    cnn_subs = list(adaptive.encoder.resnet_conv.children())[args.fine_tune_start_layer:]
    cnn_params = [list(sub_module.parameters()) for sub_module in cnn_subs]
    cnn_params = [item for sublist in cnn_params for item in sublist]
    cnn_optimizer = torch.optim.Adam(cnn_params, lr=args.learning_rate_cnn,
                                     betas=(args.alpha, args.beta))

    # Other parameter optimization
    params = (list(adaptive.encoder.affine_a.parameters()) +
              list(adaptive.encoder.affine_b.parameters()) +
              list(adaptive.decoder.parameters()))

    # Will decay later
    learning_rate = args.learning_rate

    # Language Modeling Loss, Optimizers
    LMcriterion = nn.CrossEntropyLoss()

    # Change to GPU mode if available
    if torch.cuda.is_available():
        adaptive.cuda()
        LMcriterion.cuda()

    # Train the Models
    total_step = len(data_loader)

    cider_scores = []
    best_cider = 0.0
    best_epoch = 0

    # Start Training
    for epoch in range(start_epoch, args.num_epochs + 1):

        optimizer = torch.optim.Adam(params, lr=learning_rate)

        # Language Modeling Training
        print('------------------Training for Epoch %d----------------' % epoch)
        for i, (images, captions, lengths, _) in enumerate(data_loader):

            # Set mini-batch dataset
            images = to_var(images)
            captions = to_var(captions)
            lengths = [cap_len - 1 for cap_len in lengths]
            targets = pack_padded_sequence(captions[:, 1:], lengths, batch_first=True)[0]

            # Forward, Backward and Optimize
            adaptive.train()
            adaptive.zero_grad()

            packed_scores = adaptive(images, captions, lengths)

            # Compute loss and backprop
            loss = LMcriterion(packed_scores[0], targets)
            loss.backward()

            # Gradient clipping for gradient exploding problem in LSTM
            for p in adaptive.decoder.LSTM.parameters():
                p.data.clamp_(-args.clip, args.clip)

            optimizer.step()

            # Start learning rate decay
            if epoch > args.lr_decay:
                frac = (epoch - args.cnn_epoch) / args.learning_rate_decay_every
                decay_factor = math.pow(0.5, frac)
                # Decay the learning rate
                learning_rate = learning_rate * decay_factor

            # Start CNN fine-tuning
            if epoch > args.cnn_epoch:
                cnn_optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], CrossEntropy Loss: %.4f, Perplexity: %5.4f' % (
                    epoch, args.num_epochs, i, total_step,
                    loss.data[0], np.exp(loss.data[0])))

        # Save the Adaptive Attention model after each epoch
        torch.save(adaptive.state_dict(),
                   os.path.join(args.model_path, 'adaptive-%d.pkl' % epoch))

        # Evaluation on validation set
        cider = coco_eval(adaptive, args, epoch)
        cider_scores.append(cider)

        if cider > best_cider:
            best_cider = cider
            best_epoch = epoch

        if len(cider_scores) > 5:
            last_6 = cider_scores[-6:]
            last_6_max = max(last_6)
            # Test if there is improvement; if not, do early stopping
            if last_6_max != best_cider:
                print('No improvement with CIDEr in the last 6 epochs...Early stopping triggered.')
                print('Model of best epoch #: %d with CIDEr score %.2f' % (best_epoch, best_cider))
                break
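# Minimal sketch (not from the original code) of the to_var helper used in the
# training loops above. Assumption: it follows the common old-PyTorch pattern of
# moving a tensor to the GPU when available and wrapping it in a Variable; the
# original definition is not shown in this section.
import torch
from torch.autograd import Variable


def to_var(x, volatile=False):
    # Move the tensor to GPU if one is available, then wrap it for autograd.
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x, volatile=volatile)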