import json

import torch as t
import tqdm

# Config and CaptionModel are project-local (not shown in this snippet).

def generate(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)

    # Data preprocessing: load the vocabulary and the precomputed test features.
    data = t.load(opt.caption_data_path)
    word2ix, ix2word = data['word2ix'], data['ix2word']
    test_datas = t.load('test_results2.pth')
    imgs = t.load('test_imgs.pth')

    # Caption model (features are precomputed, so no CNN is passed)
    model = CaptionModel(opt, None, word2ix, ix2word)
    model = model.load(opt.model_ckpt).eval()
    model.cuda()

    results = []
    for ii, (img_feat, img_id) in tqdm.tqdm(enumerate(zip(test_datas, imgs))):
        sentences = model.generate(img_feat)
        item = {
            'image_id': img_id.replace('.jpg', ''),
            'caption': sentences[0].replace('</EOS>', '')
        }
        results.append(item)
        if ii % 1000 == 0:
            print(sentences[0])

    with open('submit.json', 'w') as f:
        json.dump(results, f)
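# The train()/generate() functions in this section are plain module-level
# callables; a minimal command-line entry point for them, assuming the
# `fire` package (the entry point itself is an assumption, not part of the
# original snippet):
if __name__ == '__main__':
    import fire
    fire.Fire()
    # e.g. python main.py generate --model_ckpt=<path>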
import torch as t
import torchvision as tv
from PIL import Image
from torch.autograd import Variable

IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

def generate(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)

    # Data preprocessing
    data = t.load(opt.caption_data_path, map_location=lambda s, l: s)
    word2ix, ix2word = data['word2ix'], data['ix2word']

    normalize = tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    transforms = tv.transforms.Compose([
        tv.transforms.Scale(opt.scale_size),  # Scale was later renamed Resize
        tv.transforms.CenterCrop(opt.img_size),
        tv.transforms.ToTensor(),
        normalize
    ])
    img = Image.open(opt.test_img)
    img = transforms(img).unsqueeze(0)

    # Extract image features with a pretrained resnet50, replacing the
    # classifier head with the identity so the pooled features pass through.
    resnet50 = tv.models.resnet50(pretrained=True).eval()
    del resnet50.fc
    resnet50.fc = lambda x: x
    if opt.use_gpu:
        resnet50.cuda()
        img = img.cuda()
    img_feats = resnet50(Variable(img, volatile=True))

    # Caption model
    model = CaptionModel(opt, word2ix, ix2word)
    model = model.load(opt.model_ckpt).eval()
    if opt.use_gpu:
        model.cuda()

    results = model.generate(img_feats.data[0])
    print('\r\n'.join(results))
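# For reference, the same feature-extraction step written against newer
# torch/torchvision APIs: Scale became Resize, torch.no_grad() replaces
# volatile=True, and nn.Identity() replaces the lambda head. A sketch only;
# 256/224 stand in for opt.scale_size/opt.img_size and 'demo.jpg' for
# opt.test_img.
import torch
import torchvision as tv
from PIL import Image

preprocess = tv.transforms.Compose([
    tv.transforms.Resize(256),
    tv.transforms.CenterCrop(224),
    tv.transforms.ToTensor(),
    tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225]),
])
img = preprocess(Image.open('demo.jpg').convert('RGB')).unsqueeze(0)

resnet50 = tv.models.resnet50(pretrained=True).eval()
resnet50.fc = torch.nn.Identity()        # keep the 2048-d pooled features
with torch.no_grad():
    img_feats = resnet50(img)            # shape (1, 2048)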
import logging
import math
import os
import time
from datetime import datetime

import torch.nn as nn
from torch.autograd import Variable
from torch.backends import cudnn
from torch.nn.utils import clip_grad_norm
from torch.nn.utils.rnn import pack_padded_sequence

# parser, resnet, build_vocab, CaptionModel, get_iterator, get_coco_data,
# select_optimizer, adjust_optimizer, setup_logging and AverageMeter are
# project-local (not shown in this snippet).

def main():
    global args
    args = parser.parse_args()

    if args.save == '':
        args.save = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'))
    checkpoint_file = os.path.join(save_path, 'checkpoint_epoch_%s.pth.tar')

    logging.debug("run arguments: %s", args)
    logging.info("using pretrained cnn %s", args.cnn)
    cnn = resnet.__dict__[args.cnn](pretrained=True)

    vocab = build_vocab()
    model = CaptionModel(cnn, vocab,
                         embedding_size=args.embedding_size,
                         rnn_size=args.rnn_size,
                         num_layers=args.num_layers,
                         share_embedding_weights=args.share_weights)

    train_data = get_iterator(get_coco_data(vocab, train=True),
                              batch_size=args.batch_size,
                              max_length=args.max_length,
                              shuffle=True,
                              num_workers=args.workers)
    val_data = get_iterator(get_coco_data(vocab, train=False),
                            batch_size=args.eval_batch_size,
                            max_length=args.max_length,
                            shuffle=False,
                            num_workers=args.workers)

    if 'cuda' in args.type:
        cudnn.benchmark = True
        model.cuda()

    optimizer = select_optimizer(args.optimizer, params=model.parameters(), lr=args.lr)
    # learning-rate schedule: exponential decay per epoch
    regime = lambda e: {'lr': args.lr * (args.lr_decay ** e),
                        'momentum': args.momentum,
                        'weight_decay': args.weight_decay}
    model.finetune_cnn(False)

    def forward(model, data, training=True, optimizer=None):
        use_cuda = 'cuda' in args.type
        loss = nn.CrossEntropyLoss()
        perplexity = AverageMeter()
        batch_time = AverageMeter()
        data_time = AverageMeter()

        if training:
            model.train()
        else:
            model.eval()

        end = time.time()
        for i, (imgs, (captions, lengths)) in enumerate(data):
            data_time.update(time.time() - end)
            if use_cuda:
                imgs = imgs.cuda()
                # PyTorch <= 0.3 API; 'async' became non_blocking in 0.4+
                captions = captions.cuda(async=True)

            imgs = Variable(imgs, volatile=not training)
            captions = Variable(captions, volatile=not training)
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]

            pred, _ = model(imgs, input_captions, lengths)
            err = loss(pred, target_captions)
            perplexity.update(math.exp(err.data[0]))

            if training:
                optimizer.zero_grad()
                err.backward()
                clip_grad_norm(model.rnn.parameters(), args.grad_clip)
                optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                # `epoch` is picked up from the enclosing loop in main()
                logging.info('{phase} - Epoch: [{0}][{1}/{2}]\t'
                             'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                             'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                             'Perplexity {perp.val:.4f} ({perp.avg:.4f})'.format(
                                 epoch, i, len(data),
                                 phase='TRAINING' if training else 'EVALUATING',
                                 batch_time=batch_time, data_time=data_time,
                                 perp=perplexity))
        return perplexity.avg

    for epoch in range(args.start_epoch, args.epochs):
        if epoch >= args.finetune_epoch:
            model.finetune_cnn(True)
        optimizer = adjust_optimizer(optimizer, epoch, regime)
        # Train
        train_perp = forward(model, train_data, training=True, optimizer=optimizer)
        # Evaluate
        val_perp = forward(model, val_data, training=False)

        logging.info('\n Epoch: {0}\t'
                     'Training Perplexity {train_perp:.4f} \t'
                     'Validation Perplexity {val_perp:.4f} \n'
                     .format(epoch + 1, train_perp=train_perp, val_perp=val_perp))
        model.save_checkpoint(checkpoint_file % (epoch + 1))
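# forward() above relies on an AverageMeter with .val/.avg/.update(); the
# class itself is not shown in this snippet. A minimal sketch matching that
# usage (an assumption about the project-local helper, modeled on the
# standard PyTorch ImageNet example):
class AverageMeter(object):
    """Track the latest value and a running average."""

    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count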
import os

import ipdb
import torch as t
import torchvision as tv
import tqdm
from PIL import Image
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence
from torchnet import meter

# Config, CaptionModel, Visualizer and get_dataloader are project-local.

def train(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)

    vis = Visualizer(env=opt.env)
    dataloader = get_dataloader(opt)
    _data = dataloader.dataset._data
    word2ix, ix2word = _data['word2ix'], _data['ix2word']

    # cnn = tv.models.resnet50(True)
    model = CaptionModel(opt, None, word2ix, ix2word)
    if opt.model_ckpt:
        model.load(opt.model_ckpt)
    optimizer = model.get_optimizer(opt.lr1)
    criterion = t.nn.CrossEntropyLoss()

    model.cuda()
    criterion.cuda()

    loss_meter = meter.AverageValueMeter()
    perplexity = meter.AverageValueMeter()

    for epoch in range(opt.epoch):
        loss_meter.reset()
        perplexity.reset()
        for ii, (imgs, (captions, lengths), indexes) in tqdm.tqdm(enumerate(dataloader)):
            optimizer.zero_grad()
            imgs = imgs.cuda()
            captions = captions.cuda()
            imgs = Variable(imgs)
            captions = Variable(captions)
            # teacher forcing: feed tokens 0..T-2, score against the packed sequence
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]
            score, _ = model(imgs, input_captions, lengths)
            loss = criterion(score, target_captions)
            loss.backward()
            # clip_grad_norm(model.rnn.parameters(), opt.grad_clip)
            optimizer.step()

            loss_meter.add(loss.data[0])
            perplexity.add(t.exp(loss.data)[0])

            # visualization
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                vis.plot('loss', loss_meter.value()[0])
                vis.plot('perplexity', perplexity.value()[0])

                # visualize the raw image
                raw_img = _data['train']['ix2id'][indexes[0]]
                img_path = '/data/image/ai_cha/caption/ai_challenger_caption_train_20170902/caption_train_images_20170902/' + raw_img
                raw_img = Image.open(img_path).convert('RGB')
                raw_img = tv.transforms.ToTensor()(raw_img)
                vis.img('raw', raw_img)
                # raw_img = (imgs.data[0] * 0.25 + 0.45).clamp(max=1, min=0)
                # vis.img('raw', raw_img)

                # visualize the human-written caption
                raw_caption = captions.data[:, 0]
                raw_caption = ''.join([_data['ix2word'][int(ix)] for ix in raw_caption])
                vis.text(raw_caption, u'raw_caption')

                # visualize the caption generated by the network
                results = model.generate(imgs.data[0])
                vis.text('</br>'.join(results), u'caption')

        if (epoch + 1) % 100 == 0:
            model.save()
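# The perplexity meter above simply tracks exp(cross-entropy) per batch.
# A tiny sanity check of that relation (the loss value is hypothetical):
import math
per_token_xent = 2.3                 # nats, as returned by CrossEntropyLoss
print(math.exp(per_token_xent))      # ~9.97: like guessing among ~10 tokens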
test_loader = DataLoader(test_opt)

opt.vocab = train_loader.get_vocab()
opt.vocab_size = train_loader.get_vocab_size()
opt.seq_length = train_loader.get_seq_length()
opt.feat_dims = train_loader.get_feat_dims()
opt.history_file = opt.model_file.replace('.pth', '_history.json', 1)

logger.info('Building model...')
model = CaptionModel(opt)
xe_criterion = CrossEntropyCriterion()
rl_criterion = RewardCriterion()

if torch.cuda.is_available():
    model.cuda()
    xe_criterion.cuda()
    rl_criterion.cuda()

logger.info('Start training...')
start = datetime.now()

optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate)
infos = train(model, xe_criterion, optimizer, train_loader, val_loader,
              opt, rl_criterion=rl_criterion)

logger.info('Best val %s score: %f. Best iter: %d. Best epoch: %d',
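# The snippet above pairs CrossEntropyCriterion with a RewardCriterion for
# reinforcement-learning fine-tuning. The class is project-local and not
# shown; a minimal sketch of the usual self-critical policy-gradient form
# (an assumption, modeled on common self-critical sequence training code):
# per-token log-probabilities of the sampled caption are weighted by the
# reward (advantage) and masked once the sequence has ended.
import torch
import torch.nn as nn

class RewardCriterion(nn.Module):
    def forward(self, logprobs, seq, reward):
        # logprobs: (batch, seq_len) log-prob of each sampled token
        # seq:      (batch, seq_len) sampled token ids, 0 after <EOS>
        # reward:   (batch, seq_len) advantage for each token
        mask = (seq > 0).float()
        # keep the first step, shift the mask right so <EOS> itself is scored
        mask = torch.cat([mask.new_ones(mask.size(0), 1), mask[:, :-1]], 1)
        output = -logprobs * reward * mask
        return output.sum() / mask.sum()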
import torch as t
import tqdm
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence
from torchnet import meter

# Config, CaptionModel, Visualizer and get_dataloader are project-local.

def train(**kwargs):
    opt = Config()
    opt.caption_data_path = 'caption.pth'  # raw data
    opt.test_img = ''  # input image
    # opt.model_ckpt = 'caption_0914_1947'  # pretrained checkpoint

    # data
    vis = Visualizer(env=opt.env)
    dataloader = get_dataloader(opt)
    _data = dataloader.dataset._data
    word2ix, ix2word = _data['word2ix'], _data['ix2word']

    # model
    model = CaptionModel(opt, word2ix, ix2word)
    if opt.model_ckpt:
        model.load(opt.model_ckpt)
    optimizer = model.get_optimizer(opt.lr)
    criterion = t.nn.CrossEntropyLoss()
    if opt.use_gpu:
        model.cuda()
        criterion.cuda()

    # meters
    loss_meter = meter.AverageValueMeter()

    for epoch in range(opt.epoch):
        loss_meter.reset()
        for ii, (imgs, (captions, lengths), indexes) in tqdm.tqdm(enumerate(dataloader)):
            # training step
            optimizer.zero_grad()
            if opt.use_gpu:
                imgs = imgs.cuda()
                captions = captions.cuda()
            imgs = Variable(imgs)
            captions = Variable(captions)
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]
            score, _ = model(imgs, input_captions, lengths)
            loss = criterion(score, target_captions)
            loss.backward()
            optimizer.step()

            loss_meter.add(loss.data[0])

            # visualization block, disabled in this variant
            '''
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()
                vis.plot('loss', loss_meter.value()[0])

                # visualize the raw image and the human-written caption
                raw_img = _data['ix2id'][indexes[0]]
                img_path = opt.img_path + raw_img
                raw_img = Image.open(img_path).convert('RGB')
                raw_img = tv.transforms.ToTensor()(raw_img)
                raw_caption = captions.data[:, 0]
                raw_caption = ''.join([_data['ix2word'][int(ii)] for ii in raw_caption])
                vis.text(raw_caption, u'raw_caption')
                vis.img('raw', raw_img, caption=raw_caption)

                # visualize the caption generated by the network
                results = model.generate(imgs.data[0])
                vis.text('</br>'.join(results), u'caption')
            '''

        model.save()
import os

import ipdb
import torch as t
import torchvision as tv
import tqdm
from PIL import Image
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence
from torchnet import meter

def train(**kwargs):
    opt = Config()
    opt.caption_data_path = 'caption.pth'  # raw data
    opt.test_img = ''  # input image
    # opt.model_ckpt = 'caption_0914_1947'  # pretrained checkpoint

    # data
    vis = Visualizer(env=opt.env)
    dataloader = get_dataloader(opt)
    _data = dataloader.dataset._data
    word2ix, ix2word = _data['word2ix'], _data['ix2word']

    # model
    model = CaptionModel(opt, word2ix, ix2word)
    if opt.model_ckpt:
        model.load(opt.model_ckpt)
    optimizer = model.get_optimizer(opt.lr)
    criterion = t.nn.CrossEntropyLoss()
    if opt.use_gpu:
        model.cuda()
        criterion.cuda()

    # meters
    loss_meter = meter.AverageValueMeter()

    for epoch in range(opt.epoch):
        loss_meter.reset()
        for ii, (imgs, (captions, lengths), indexes) in tqdm.tqdm(enumerate(dataloader)):
            # training step
            optimizer.zero_grad()
            if opt.use_gpu:
                imgs = imgs.cuda()
                captions = captions.cuda()
            imgs = Variable(imgs)
            captions = Variable(captions)
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]
            score, _ = model(imgs, input_captions, lengths)
            loss = criterion(score, target_captions)
            loss.backward()
            optimizer.step()

            loss_meter.add(loss.data[0])

            # visualization
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()
                vis.plot('loss', loss_meter.value()[0])

                # visualize the raw image and the human-written caption
                raw_img = _data['ix2id'][indexes[0]]
                img_path = opt.img_path + raw_img
                raw_img = Image.open(img_path).convert('RGB')
                raw_img = tv.transforms.ToTensor()(raw_img)

                raw_caption = captions.data[:, 0]
                raw_caption = ''.join([_data['ix2word'][int(ix)] for ix in raw_caption])
                vis.text(raw_caption, u'raw_caption')
                vis.img('raw', raw_img, caption=raw_caption)

                # visualize the caption generated by the network
                results = model.generate(imgs.data[0])
                vis.text('</br>'.join(results), u'caption')

        model.save()
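# All of the train() variants above build their targets the same way:
# the input is captions[:-1] (teacher forcing) and the target is
# pack_padded_sequence(captions, lengths)[0], i.e. the caption batch with
# padding stripped out so CrossEntropyLoss never scores pad positions.
# A tiny standalone illustration with made-up token ids
# (2 = <START>, 3 = <EOS>, 0 = padding), time-major as in the code above:
import torch
from torch.nn.utils.rnn import pack_padded_sequence

captions = torch.tensor([[2, 2],
                         [5, 7],
                         [6, 3],    # the second caption ends here
                         [3, 0]])   # the first still runs; second is padded
lengths = [4, 3]                    # true lengths, sorted descending
targets = pack_padded_sequence(captions, lengths)[0]
print(targets)                      # tensor([2, 2, 5, 7, 6, 3, 3]) -- no pad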