import os

import ipdb
import torch as t
import torchvision as tv
import tqdm
from PIL import Image
from torch.nn.utils.rnn import pack_padded_sequence
from torch.optim import Adam
from torchnet import meter

# CaptionModel, Config, get_dataloader and Visualizer are project-local
# modules; import them from wherever this repo defines them.

IMAGENET_MEAN = [0.485, 0.456, 0.406]  # standard ImageNet channel statistics
IMAGENET_STD = [0.229, 0.224, 0.225]


def generate(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)
    device = t.device('cuda') if opt.use_gpu else t.device('cpu')

    # Data preprocessing: load the vocabulary and apply the same transforms
    # used at training time.
    data = t.load(opt.caption_data_path, map_location=lambda s, l: s)
    word2ix, ix2word = data['word2ix'], data['ix2word']

    normalize = tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    transforms = tv.transforms.Compose([
        tv.transforms.Resize(opt.scale_size),
        tv.transforms.CenterCrop(opt.img_size),
        tv.transforms.ToTensor(),
        normalize
    ])
    img = Image.open(opt.test_img)
    img = transforms(img).unsqueeze(0)

    # Extract image features with a pretrained ResNet-50; replacing `fc`
    # with the identity turns the classifier into a feature extractor.
    resnet50 = tv.models.resnet50(pretrained=True).eval()
    del resnet50.fc
    resnet50.fc = lambda x: x
    resnet50.to(device)
    img = img.to(device)
    img_feats = resnet50(img).detach()

    # Caption model: load the trained checkpoint and decode a caption
    # from the image features.
    model = CaptionModel(opt, word2ix, ix2word)
    model = model.load(opt.model_ckpt).eval()
    model.to(device)

    results = model.generate(img_feats.data[0])
    print('\r\n'.join(results))
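# A minimal usage sketch for generate(). The file names below are
# assumptions -- 'caption.pth' and 'caption_0914_1947' appear elsewhere in
# this document, while 'example.jpg' is a hypothetical input image.
generate(
    caption_data_path='caption.pth',  # preprocessed vocabulary + captions
    model_ckpt='caption_0914_1947',   # trained CaptionModel checkpoint
    test_img='example.jpg',           # image to caption (hypothetical)
    use_gpu=False,
)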
def train(**kwargs):
    opt = Config()  # `opt` was referenced but never created; build it here
    device = t.device('cuda') if t.cuda.is_available() else t.device('cpu')
    for k, v in kwargs.items():
        setattr(opt, k, v)

    dataloader = get_dataloader(opt)
    model = CaptionModel(opt, dataloader.dataset.word2ix, dataloader.dataset.id2ix)
    if opt.model_path:
        model.load_state_dict(t.load(opt.model_path, map_location='cpu'))
    t.backends.cudnn.enabled = False
    model = model.to(device)

    optimizer = Adam(model.parameters(), opt.lr)
    criterion = t.nn.CrossEntropyLoss()

    for epoch in range(opt.max_epoch):
        for ii, (imgs, (captions, lengths), indexes) in tqdm.tqdm(enumerate(dataloader)):
            # `Variable` is deprecated; tensors carry autograd state directly.
            imgs = imgs.to(device)
            captions = captions.to(device)
            pred, _ = model(imgs, captions, lengths)
            # Packing strips the padding so the loss only covers real tokens.
            target_captions = pack_padded_sequence(captions, lengths)[0]
            loss = criterion(pred, target_captions)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print("Current loss:", loss.item())
        if (epoch + 1) % opt.save_model == 0:
            t.save(model.state_dict(), "checkpoints/{}.pth".format(epoch))
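# A small self-contained demo of why `pack_padded_sequence(...)[0]` works as
# the training target: packing drops the pad entries, leaving one flat tensor
# of real tokens that lines up with the model's packed predictions.
# (The values below are made up for illustration.)
import torch as t
from torch.nn.utils.rnn import pack_padded_sequence

captions = t.tensor([[2, 2],
                     [3, 3],
                     [4, 0]])  # (seq_len=3, batch=2); 0 is padding
lengths = [3, 2]               # true lengths, sorted descending
packed = pack_padded_sequence(captions, lengths)
print(packed[0])               # tensor([2, 2, 3, 3, 4]) -- padding removed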
def train(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)
    device = t.device('cuda') if opt.use_gpu else t.device('cpu')

    opt.caption_data_path = 'caption.pth'  # preprocessed caption data
    opt.test_img = ''  # input image
    # opt.model_ckpt = 'caption_0914_1947'  # pretrained model

    # Data
    vis = Visualizer(env=opt.env)
    dataloader = get_dataloader(opt)
    _data = dataloader.dataset._data
    word2ix, ix2word = _data['word2ix'], _data['ix2word']

    # Model
    model = CaptionModel(opt, word2ix, ix2word)
    if opt.model_ckpt:
        model.load(opt.model_ckpt)
    optimizer = model.get_optimizer(opt.lr)
    criterion = t.nn.CrossEntropyLoss()
    model.to(device)

    # Statistics
    loss_meter = meter.AverageValueMeter()

    for epoch in range(opt.epoch):
        loss_meter.reset()
        for ii, (imgs, (captions, lengths), indexes) in tqdm.tqdm(enumerate(dataloader)):
            # Training: all tokens but the last are the input; the packed
            # full sequence (padding removed) is the target.
            optimizer.zero_grad()
            imgs = imgs.to(device)
            captions = captions.to(device)
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]
            score, _ = model(imgs, input_captions, lengths)
            loss = criterion(score, target_captions)
            loss.backward()
            optimizer.step()
            loss_meter.add(loss.item())

            # Visualization
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()
                vis.plot('loss', loss_meter.value()[0])

                # Visualize the raw image and its human-written caption
                raw_img = _data['ix2id'][indexes[0]]
                img_path = opt.img_path + raw_img
                raw_img = Image.open(img_path).convert('RGB')
                raw_img = tv.transforms.ToTensor()(raw_img)

                raw_caption = captions.data[:, 0]
                # `ix` renamed from `ii` to avoid shadowing the loop counter;
                # `.item()` converts the 0-dim tensor into a valid dict key.
                raw_caption = ''.join([_data['ix2word'][ix.item()] for ix in raw_caption])
                vis.text(raw_caption, u'raw_caption')
                vis.img('raw', raw_img, caption=raw_caption)

                # Visualize the caption generated by the network
                results = model.generate(imgs.data[0])
                vis.text('</br>'.join(results), u'caption')
        model.save()
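# A hedged sketch of a command-line entry point for the two functions above.
# Using python-fire here is an assumption (it is a common pattern for this
# kind of train(**kwargs) / generate(**kwargs) interface), not something the
# snippets themselves confirm.
if __name__ == '__main__':
    import fire
    fire.Fire({'train': train, 'generate': generate})
    # e.g. python main.py train --plot_every=10 --use_gpu=True
    # e.g. python main.py generate --test_img=example.jpg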
import torch
import torch.nn as nn
from time import time
from torch.utils.data import DataLoader
from tqdm import tqdm

train_dataset = SampleDataset(train_descriptions, train_img_features,
                              model_info['wordtoidx'], model_info['max_length'])
train_loader = DataLoader(train_dataset, batch_size, collate_fn=my_collate)

caption_model = CaptionModel(model_info['vocab_size'], embedding_dim,
                             hidden_size=hidden_size,
                             embedding_matrix=embedding_matrix,
                             embedding_train=True)
init_weights(caption_model, embedding_pretrained=True)
caption_model.to(device)

# We will ignore the pad token (index 0) in the true target set.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(caption_model.parameters(), lr=0.01)
clip = 1  # max gradient norm, presumably applied via clip_grad_norm_ inside train()

start = time()
print('Training...')
for i in tqdm(range(EPOCHS * 6)):
    loss = train(caption_model, train_loader, optimizer, criterion, clip,
                 model_info['vocab_size'])
    print(f'loss = {loss}')
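# `my_collate` is used above but not defined in this snippet. Below is a
# minimal sketch of what such a collate function could look like, under the
# assumption that each dataset item is an (image_feature, caption_indices)
# pair and that captions are padded with index 0 (the index the loss above
# ignores). The exact return signature is an assumption, not the original.
import torch
from torch.nn.utils.rnn import pad_sequence

def my_collate(batch):
    # Sort by caption length, longest first, as packed-sequence utilities expect.
    batch.sort(key=lambda pair: len(pair[1]), reverse=True)
    imgs, caps = zip(*batch)
    imgs = torch.stack(imgs)                     # (batch, feature_dim)
    lengths = torch.tensor([len(c) for c in caps])
    caps = pad_sequence(caps, batch_first=True)  # pads with 0 by default
    return imgs, caps, lengths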