def gen_example(self, data_dic):
    """Generate images and attention maps for each caption batch in ``data_dic``.

    Loads the text encoder from ``cfg.TRAIN.NET_E`` and the generator from
    ``cfg.TRAIN.NET_G`` onto the GPU, then writes PNG files below
    ``<NET_G up to '.pth'>/<key>/``:

    * ``0_s_<idx>_g<k>.png`` -- generator output ``k`` for one caption,
    * ``0_s_<idx>_a<k>.png`` -- attention visualisation ``k`` (when
      ``build_super_images2`` returns an image).

    Args:
        data_dic: mapping ``key -> (captions, cap_lens, sorted_indices)``.
            ``captions`` and ``cap_lens`` are NumPy arrays (they are handed to
            ``torch.from_numpy``); ``sorted_indices[j]`` names the files
            produced for row ``j``.

    Returns:
        None. Prints an error and does nothing when ``cfg.TRAIN.NET_G`` is
        empty.
    """
    if cfg.TRAIN.NET_G == '':
        print('Error: the path for morels is not found!')
        return

    # Build and load the text encoder.
    text_encoder = RNN_ENCODER(self.n_words, nhidden=cfg.TEXT.EMBEDDING_DIM)
    state_dict = torch.load(cfg.TRAIN.NET_E,
                            map_location=lambda storage, loc: storage)
    text_encoder.load_state_dict(state_dict)
    print('Load text encoder from:', cfg.TRAIN.NET_E)
    text_encoder = text_encoder.cuda()
    text_encoder.eval()

    # Build and load the generator.
    netG = G_DCGAN() if cfg.GAN.B_DCGAN else G_NET()
    model_dir = cfg.TRAIN.NET_G
    # Results are written next to the checkpoint, named after it.
    s_tmp = model_dir[:model_dir.rfind('.pth')]
    state_dict = torch.load(model_dir,
                            map_location=lambda storage, loc: storage)
    netG.load_state_dict(state_dict)
    print('Load G from: ', model_dir)
    netG.cuda()
    netG.eval()

    nz = cfg.GAN.Z_DIM
    # ``Variable(..., volatile=True)`` has had no effect since PyTorch 0.4, so
    # the whole inference pass is wrapped in ``no_grad`` instead.
    with torch.no_grad():
        for key in data_dic:
            save_dir = '%s/%s' % (s_tmp, key)
            mkdir_p(save_dir)
            captions, cap_lens, sorted_indices = data_dic[key]

            batch_size = captions.shape[0]
            captions = torch.from_numpy(captions).cuda()
            cap_lens = torch.from_numpy(cap_lens).cuda()
            cap_lens_np = cap_lens.cpu().numpy()
            # Token id 0 is treated as padding by the generator's attention.
            mask = (captions == 0)

            for i in range(1):  # one noise sample per caption
                noise = torch.FloatTensor(batch_size, nz).cuda()

                # (1) Extract text embeddings
                # words_embs: batch_size x nef x seq_len
                # sent_emb: batch_size x nef
                hidden = text_encoder.init_hidden(batch_size)
                words_embs, sent_emb = text_encoder(captions, cap_lens, hidden)

                # (2) Generate fake images
                noise.normal_(0, 1)
                fake_imgs, attention_maps, _, _ = netG(
                    noise, sent_emb, words_embs, mask)

                for j in range(batch_size):
                    save_name = '%s/%d_s_%d' % (save_dir, i, sorted_indices[j])

                    for k, stage in enumerate(fake_imgs):
                        im = stage[j].detach().cpu().numpy()
                        # Rescale by (x + 1) * 127.5, i.e. -1..1 -> 0..255.
                        im = ((im + 1.0) * 127.5).astype(np.uint8)
                        im = np.transpose(im, (1, 2, 0))  # CHW -> HWC
                        Image.fromarray(im).save('%s_g%d.png' % (save_name, k))

                    for k, attn_maps in enumerate(attention_maps):
                        # Attention map k is drawn over generator output k + 1
                        # when more than one output exists.
                        if len(fake_imgs) > 1:
                            im = fake_imgs[k + 1].detach().cpu()
                        else:
                            im = fake_imgs[0].detach().cpu()
                        att_sze = attn_maps.size(2)
                        img_set, _ = build_super_images2(
                            im[j].unsqueeze(0), captions[j].unsqueeze(0),
                            [cap_lens_np[j]], self.ixtoword,
                            [attn_maps[j]], att_sze)
                        if img_set is not None:
                            Image.fromarray(img_set).save(
                                '%s_a%d.png' % (save_name, k))
def gen_example(n_words, wordtoix, ixtoword, model_dir):
    """Generate images from the example sentences in ``caption.txt``.

    Every non-empty line of ``caption.txt`` is tokenised, mapped to word ids
    through ``wordtoix`` and fed through the text encoder
    (``cfg.TRAIN.NET_E``) and the generator checkpoint ``model_dir`` on the
    CPU. For each of ``image_per_caption`` noise samples the results are
    written to ``results/``:

    * ``<i>_<idx>_g<k>.png`` -- generator output ``k``,
    * ``<i>_<idx>_a<k>_attention.png`` -- attention visualisation ``k``.

    ``<idx>`` is the position of the sentence among the usable sentences.

    Args:
        n_words: vocabulary size passed to ``RNN_ENCODER``.
        wordtoix: mapping word -> integer id.
        ixtoword: mapping integer id -> word (used for the attention images).
        model_dir: path of the generator state dict.

    Returns:
        None. Prints a message and returns early when the file yields no
        usable caption.
    """
    filepath = 'caption.txt'
    with open(filepath, "r") as f:
        sentences = f.read().split('\n')

    # Build the tokenizer once instead of once per sentence.
    tokenizer = RegexpTokenizer(r'\w+')
    captions = []
    for sent in sentences:
        if len(sent) == 0:
            continue
        sent = sent.replace("\ufffd\ufffd", " ")
        tokens = tokenizer.tokenize(sent.lower())
        if len(tokens) == 0:
            print('sentence token == 0 !')
            continue

        rev = []
        for t in tokens:
            t = t.encode('ascii', 'ignore').decode('ascii')
            if len(t) > 0 and t in wordtoix:
                rev.append(wordtoix[t])
        if not rev:
            # A caption without any known word has length 0 and cannot be
            # encoded, so it is skipped rather than kept as an empty row.
            print('no known words in sentence, skipped:', sent)
            continue
        captions.append(rev)

    if not captions:
        # Previously this fell through to np.max([]) and raised ValueError.
        print('No usable captions found in', filepath)
        return

    # Sort captions by decreasing length and pad with 0 on the right.
    cap_lens = np.asarray([len(cap) for cap in captions])
    sorted_indices = np.argsort(cap_lens)[::-1]
    cap_lens = cap_lens[sorted_indices]
    cap_array = np.zeros((len(captions), int(cap_lens.max())), dtype='int64')
    for row, idx in enumerate(sorted_indices):
        cap = captions[idx]
        cap_array[row, :len(cap)] = cap

    text_encoder = RNN_ENCODER(n_words, nhidden=cfg.TEXT.EMBEDDING_DIM)
    state_dict = torch.load(cfg.TRAIN.NET_E,
                            map_location=lambda storage, loc: storage)
    text_encoder.load_state_dict(state_dict)
    print('Load text encoder from:', cfg.TRAIN.NET_E)
    text_encoder.eval()

    netG = G_NET()
    netG.apply(weights_init)
    netG.eval()
    state_dict = torch.load(model_dir,
                            map_location=lambda storage, loc: storage)
    netG.load_state_dict(state_dict)
    print('Load G from: ', model_dir)

    save_dir = 'results/'
    mkdir_p(save_dir)

    batch_size = cap_array.shape[0]
    nz = cfg.GAN.Z_DIM
    # Pure inference: keep autograd off for the complete pass, not only while
    # the input tensors are created.
    with torch.no_grad():
        captions = torch.from_numpy(cap_array)
        cap_lens = torch.from_numpy(cap_lens)
        cap_lens_np = cap_lens.numpy()
        mask = (captions == 0)  # padding positions

        # NOTE(review): ``image_per_caption`` is not defined in this function;
        # it is expected to exist at module level.
        for i in range(image_per_caption):
            noise = torch.FloatTensor(batch_size, nz)

            # (1) Extract text embeddings
            hidden = text_encoder.init_hidden(batch_size)
            words_embs, sent_emb = text_encoder(captions, cap_lens, hidden)

            # (2) Generate fake images
            noise.normal_(0, 1)
            fake_imgs, attention_maps, _, _ = netG(
                noise, sent_emb, words_embs, mask, cap_lens)

            for j in range(batch_size):
                save_name = '%s/%d_%d' % (save_dir, i, sorted_indices[j])

                for k, stage in enumerate(fake_imgs):
                    im = stage[j].detach().cpu().numpy()
                    # Rescale by (x + 1) * 127.5, i.e. -1..1 -> 0..255.
                    im = ((im + 1.0) * 127.5).astype(np.uint8)
                    im = np.transpose(im, (1, 2, 0))  # CHW -> HWC
                    Image.fromarray(im).save('%s_g%d.png' % (save_name, k))

                for k, attn_maps in enumerate(attention_maps):
                    if len(fake_imgs) > 1:
                        im = fake_imgs[k + 1]
                    else:
                        im = fake_imgs[0]
                    att_sze = attn_maps.size(2)
                    img_set, _ = build_super_images2(
                        im[j].unsqueeze(0), captions[j].unsqueeze(0),
                        [cap_lens_np[j]], ixtoword, [attn_maps[j]], att_sze)
                    if img_set is not None:
                        Image.fromarray(img_set).save(
                            '%s_a%d_attention.png' % (save_name, k))
def generate(caption, wordtoix, ixtoword, text_encoder, netG, blob_service,
             copies=2, export_onnx=False):
    """Generate images for ``caption`` and upload them to blob storage.

    Args:
        caption: text to visualise; vectorised by ``vectorize_caption``.
        wordtoix: mapping word -> integer id.
        ixtoword: mapping integer id -> word (used for the attention images).
        text_encoder: loaded text encoder (must provide ``init_hidden``).
        netG: loaded generator.
        blob_service: client exposing ``create_blob_from_stream`` and
            ``create_blob_from_path``.
        copies: forwarded to ``vectorize_caption``. With exactly 2, only the
            first sample is uploaded, together with its attention maps. With
            more than 2, every sample is uploaded under its own ``<j>/``
            sub-folder and no attention maps are produced.
        export_onnx: when true, export both networks to ONNX, upload the two
            files to the ``models`` container and return ``None`` without
            generating image URLs. This replaces a previously unreachable
            ``if False:`` branch.

    Returns:
        List of URLs of the uploaded PNG files, or ``None`` when
        ``export_onnx`` is set.
    """
    captions, cap_lens = vectorize_caption(wordtoix, caption, copies)
    batch_size = captions.shape[0]
    nz = cfg.GAN.Z_DIM

    # Pure inference: keep autograd off for the complete forward pass.
    with torch.no_grad():
        captions = torch.from_numpy(captions)
        cap_lens = torch.from_numpy(cap_lens)
        noise = torch.FloatTensor(batch_size, nz)
        if cfg.CUDA:
            captions = captions.cuda()
            cap_lens = cap_lens.cuda()
            noise = noise.cuda()

        # (1) Extract text embeddings
        hidden = text_encoder.init_hidden(batch_size)
        words_embs, sent_emb = text_encoder(captions, cap_lens, hidden)
        mask = (captions == 0)  # padding positions

        # (2) Generate fake images
        noise.normal_(0, 1)
        fake_imgs, attention_maps, _, _ = netG(noise, sent_emb, words_embs, mask)

    if export_onnx:
        print("saving text_encoder.onnx")
        torch.onnx._export(text_encoder, (captions, cap_lens, hidden),
                           "text_encoder.onnx", export_params=True)
        print("uploading text_encoder.onnx")
        blob_service.create_blob_from_path(
            'models', "text_encoder.onnx", os.path.abspath("text_encoder.onnx"))
        print("done")

        print("saving netg.onnx")
        torch.onnx._export(netG, (noise, sent_emb, words_embs, mask),
                           "netg.onnx", export_params=True)
        print("uploading netg.onnx")
        blob_service.create_blob_from_path(
            'models', "netg.onnx", os.path.abspath("netg.onnx"))
        print("done")
        return

    cap_lens_np = cap_lens.cpu().numpy()

    # storing to blob storage
    container_name = "images"
    full_path = "https://attgan.blob.core.windows.net/images/%s"
    # One timestamped "folder" per call keeps results of different requests apart.
    prefix = datetime.now().strftime('%Y/%B/%d/%H_%M_%S_%f')
    urls = []

    def _upload_png(image, blob_name):
        # Encode ``image`` as PNG in memory, upload it and record its URL.
        stream = io.BytesIO()
        image.save(stream, format="png")
        stream.seek(0)
        blob_service.create_blob_from_stream(container_name, blob_name, stream)
        urls.append(full_path % blob_name)

    for j in range(batch_size):
        for k, stage in enumerate(fake_imgs):
            im = stage[j].detach().cpu().numpy()
            # Rescale by (x + 1) * 127.5, i.e. -1..1 -> 0..255.
            im = ((im + 1.0) * 127.5).astype(np.uint8)
            im = np.transpose(im, (1, 2, 0))  # CHW -> HWC
            if copies > 2:
                blob_name = '%s/%d/%s_g%d.png' % (prefix, j, "bird", k)
            else:
                blob_name = '%s/%s_g%d.png' % (prefix, "bird", k)
            _upload_png(Image.fromarray(im), blob_name)

        if copies == 2:
            for k, attn_maps in enumerate(attention_maps):
                if len(fake_imgs) > 1:
                    im = fake_imgs[k + 1].detach().cpu()
                else:
                    im = fake_imgs[0].detach().cpu()
                att_sze = attn_maps.size(2)
                img_set, _ = build_super_images2(
                    im[j].unsqueeze(0), captions[j].unsqueeze(0),
                    [cap_lens_np[j]], ixtoword, [attn_maps[j]], att_sze)
                if img_set is not None:
                    blob_name = '%s/%s_a%d.png' % (prefix, "attmaps", k)
                    _upload_png(Image.fromarray(img_set), blob_name)
            # only look at the first sample
            break

    return urls
def save_img_results(self, netG, noise, imgs, bbox_maps_fwd, bbox_maps_bwd,
                     bbox_fmaps, hmaps, rois, num_rois, gen_iterations,
                     name='current'):
    """Render generated and reference heat maps and save them as PNG files.

    Runs ``netG`` on the given inputs, builds a caption id matrix from
    ``rois`` / ``self.cats_dict`` and writes up to four images into
    ``self.image_dir``: ``G_`` / ``D_`` (via ``build_super_images``) and
    ``G2_`` / ``D2_`` (via ``build_super_images2``), each followed by
    ``<name>_<gen_iterations>.png``. ``G*`` files show the generated heat
    maps, ``D*`` files show ``hmaps``.
    """
    render_kwargs = dict(lr_imgs=None, font_max=20, font_size=12,
                         max_word_num=cfg.ROI.BOXES_NUM)

    imgs = imgs.cpu()
    generated = netG(noise, bbox_maps_fwd, bbox_maps_bwd, bbox_fmaps)
    fake_hmaps = generated.squeeze().detach().cpu()
    hmaps = hmaps.squeeze().cpu()

    # One caption row per sample; column r holds the word id of ROI r.
    n_rows = fake_hmaps.size(0)
    captions = Variable(torch.zeros(n_rows, cfg.ROI.BOXES_NUM)).cuda()
    # NOTE(review): rows are allocated from the heat-map batch dimension but
    # filled for ``self.batch_size`` samples -- confirm these always agree.
    for b in range(self.batch_size):
        for r in range(num_rois[b]):
            cat_id = int(rois[b, r, 4])
            captions[b, r] = self.cats_dict[cat_id][0]

    att_sze = fake_hmaps.size(2)
    jobs = (
        (build_super_images, fake_hmaps, '%s/G_%s_%d.png'),
        (build_super_images, hmaps, '%s/D_%s_%d.png'),
        (build_super_images2, fake_hmaps, '%s/G2_%s_%d.png'),
        (build_super_images2, hmaps, '%s/D2_%s_%d.png'),
    )
    for builder, maps, pattern in jobs:
        img_set, _ = builder(imgs, captions, self.ixtoword, maps, att_sze,
                             **render_kwargs)
        if img_set is None:
            continue
        fullpath = pattern % (self.image_dir, name, gen_iterations)
        Image.fromarray(img_set).save(fullpath)
def gen_samples(self, idx):
    """Generate ten samples per caption for ``self.num_batches`` batches.

    Loads the text encoder (``cfg.TRAIN.NET_E``) and generator
    (``cfg.TRAIN.NET_G``) onto the GPU and writes, into
    ``<NET_G up to '.pth'>/samples``:

    * a copy of the reference image of every processed sample,
    * ``<i>_s_<n>_g<k>.png`` -- generator output ``k`` for noise draw ``i``,
    * ``<i>_s_<n>_a<k>.png`` -- attention visualisation ``k``.

    Args:
        idx: sequence mapping the running sample number ``n`` to the number
            used in the reference image file names.
    """
    text_encoder = RNN_ENCODER(self.n_words, nhidden=cfg.TEXT.EMBEDDING_DIM)
    state_dict = torch.load(cfg.TRAIN.NET_E,
                            map_location=lambda storage, loc: storage)
    text_encoder.load_state_dict(state_dict)
    print('Load text encoder from: {}'.format(cfg.TRAIN.NET_E))
    text_encoder = text_encoder.cuda()
    text_encoder.eval()

    netG = G_NET()
    state_dict = torch.load(cfg.TRAIN.NET_G,
                            map_location=lambda storage, loc: storage)
    netG.load_state_dict(state_dict)
    print('Load G from: {}'.format(cfg.TRAIN.NET_G))
    netG.cuda()
    netG.eval()

    s_tmp = cfg.TRAIN.NET_G[:cfg.TRAIN.NET_G.rfind('.pth')]
    save_dir = '%s/samples' % (s_tmp)
    mkdir_p(save_dir)

    batch_size = self.batch_size
    nz = cfg.GAN.Z_DIM

    # Pure inference: keep autograd off for the complete sampling loop.
    with torch.no_grad():
        noise = torch.FloatTensor(batch_size, nz).cuda()

        step = 0
        data_iter = iter(self.data_loader)
        while step < self.num_batches:
            # The built-in next() works on every iterator; the ``.next()``
            # method is a Python 2 leftover.
            data = next(data_iter)
            imgs, captions, cap_lens, class_ids, sorted_cap_indices = \
                self.prepare_data(data)

            hidden = text_encoder.init_hidden(batch_size)
            words_embs, sent_emb = text_encoder(captions, cap_lens, hidden)
            mask = (captions == 0)  # padding positions
            # Trim the mask to the number of word embeddings returned.
            num_words = words_embs.size(2)
            if mask.size(1) > num_words:
                mask = mask[:, :num_words]

            cap_lens_np = cap_lens.cpu().numpy()
            for i in range(10):
                noise.normal_(0, 1)
                fake_imgs, attention_maps, _, _ = netG(
                    noise, sent_emb, words_embs, mask)

                for j in range(batch_size):
                    right_idx = step * batch_size + sorted_cap_indices[j]
                    save_name = '%s/%d_s_%d' % (save_dir, i, right_idx)

                    if i == 0:
                        # The reference image does not depend on the noise
                        # draw, so it is copied once instead of ten times.
                        original_idx = idx[right_idx]
                        shutil.copyfile(
                            '/.local/AttnGAN/data/FashionSynthesis/test/original/test128_{}.png'
                            .format(original_idx + 1),
                            save_dir + '/test128_{0}_{1}.png'.format(
                                original_idx + 1, right_idx))

                    for k, stage in enumerate(fake_imgs):
                        im = stage[j].detach().cpu().numpy()
                        # Rescale by (x + 1) * 127.5, i.e. -1..1 -> 0..255.
                        im = ((im + 1.0) * 127.5).astype(np.uint8)
                        im = np.transpose(im, (1, 2, 0))  # CHW -> HWC
                        Image.fromarray(im).save('%s_g%d.png' % (save_name, k))

                    for k, attn_maps in enumerate(attention_maps):
                        if len(fake_imgs) > 1:
                            im = fake_imgs[k + 1].detach().cpu()
                        else:
                            im = fake_imgs[0].detach().cpu()
                        att_sze = attn_maps.size(2)
                        img_set, _ = build_super_images2(
                            im[j].unsqueeze(0), captions[j].unsqueeze(0),
                            [cap_lens_np[j]], self.ixtoword,
                            [attn_maps[j]], att_sze)
                        if img_set is not None:
                            Image.fromarray(img_set).save(
                                '%s_a%d.png' % (save_name, k))
            step += 1
def generate(self, caption, copies=2):
    """Generate images for ``caption`` and store them through ``self.saveable``.

    Args:
        caption: text to visualise; vectorised by ``self.vectorize_caption``.
        copies: forwarded to ``self.vectorize_caption``. With exactly 2, only
            the first sample is saved, together with its attention maps. With
            more than 2, every sample is saved under ``<prefix>/<j>`` and no
            attention maps are produced.

    Returns:
        List with whatever ``self.saveable.save`` returned for each stored
        image, in the order the images were saved.
    """
    # The third value returned by vectorize_caption is not needed here.
    captions, cap_lens, _ = self.vectorize_caption(caption, copies)
    batch_size = captions.shape[0]
    nz = cfg.GAN.Z_DIM

    # ``Variable(..., volatile=True)`` has had no effect since PyTorch 0.4;
    # ``no_grad`` is what actually disables autograd for inference.
    with torch.no_grad():
        captions = torch.from_numpy(captions)
        cap_lens = torch.from_numpy(cap_lens)
        noise = torch.FloatTensor(batch_size, nz)
        if self.cuda:
            captions = captions.cuda()
            cap_lens = cap_lens.cuda()
            noise = noise.cuda()

        # (1) Extract text embeddings
        hidden = self.text_encoder.init_hidden(batch_size)
        words_embs, sent_emb = self.text_encoder(captions, cap_lens, hidden)
        mask = (captions == 0)  # padding positions

        # (2) Generate fake images
        noise.normal_(0, 1)
        fake_imgs, attention_maps, _, _ = self.netG(
            noise, sent_emb, words_embs, mask)

    cap_lens_np = cap_lens.cpu().numpy()

    # prefix for partitioning images
    prefix = datetime.now().strftime('%Y/%B/%d/%H_%M_%S_%f')
    urls = []
    for j in range(batch_size):
        for k, stage in enumerate(fake_imgs):
            im = stage[j].detach().cpu().numpy()
            # Rescale by (x + 1) * 127.5, i.e. -1..1 -> 0..255.
            im = ((im + 1.0) * 127.5).astype(np.uint8)
            im = np.transpose(im, (1, 2, 0))  # CHW -> HWC

            birdy = 'bird_g{}'.format(k)
            if copies > 2:
                item = self.saveable.save('{}/{}'.format(prefix, j), birdy, im)
            else:
                item = self.saveable.save(prefix, birdy, im)
            urls.append(item)

        if copies == 2:
            for k, attn_maps in enumerate(attention_maps):
                if len(fake_imgs) > 1:
                    im = fake_imgs[k + 1].detach().cpu()
                else:
                    im = fake_imgs[0].detach().cpu()
                att_sze = attn_maps.size(2)
                img_set, _ = build_super_images2(
                    im[j].unsqueeze(0), captions[j].unsqueeze(0),
                    [cap_lens_np[j]], self.ixtoword, [attn_maps[j]], att_sze)
                if img_set is not None:
                    attnmap = 'attmaps_a{}'.format(k)
                    urls.append(self.saveable.save(prefix, attnmap, img_set))
            # only look at the first sample
            break

    return urls
def gen_example(self, data_dic):
    """Modify real images according to captions and save all results.

    Loads five networks onto the GPU: the text encoder (``cfg.TRAIN.NET_E``),
    the image encoder (same path with ``text_encoder`` replaced by
    ``image_encoder``), a ``VGGNet``, the main generator
    (``cfg.TRAIN.NET_G``) and the DCM (``cfg.TRAIN.NET_C``). For each key the
    following PNG files are written below ``<NET_G up to '.pth'>/<key>/``:

    * ``0_s_<idx>_g<k>.png`` -- main-module output ``k``,
    * ``0_s_<idx>_a<k>.png`` -- attention visualisation ``k``,
    * ``1_sf_<idx>_SF.png`` -- DCM output,
    * ``1_s_9_SR.png`` -- the input image ``imgs[2]``.

    Args:
        data_dic: mapping ``key -> (captions, cap_lens, sorted_indices,
            imgs)``. ``captions`` and ``cap_lens`` are NumPy arrays; ``imgs``
            is indexed by ``cfg.TREE.BRANCH_NUM - 1``.

    Returns:
        None. Prints an error and does nothing when either checkpoint path is
        empty.
    """
    if cfg.TRAIN.NET_G == '' or cfg.TRAIN.NET_C == '':
        print('Error: the path for main module or DCM is not found!')
        return

    # The text encoder
    text_encoder = RNN_ENCODER(self.n_words, nhidden=cfg.TEXT.EMBEDDING_DIM)
    state_dict = torch.load(cfg.TRAIN.NET_E,
                            map_location=lambda storage, loc: storage)
    text_encoder.load_state_dict(state_dict)
    print('Load text encoder from:', cfg.TRAIN.NET_E)
    text_encoder = text_encoder.cuda()
    text_encoder.eval()

    # The image encoder
    image_encoder = CNN_ENCODER(cfg.TEXT.EMBEDDING_DIM)
    img_encoder_path = cfg.TRAIN.NET_E.replace('text_encoder', 'image_encoder')
    state_dict = torch.load(img_encoder_path,
                            map_location=lambda storage, loc: storage)
    image_encoder.load_state_dict(state_dict)
    print('Load image encoder from:', img_encoder_path)
    image_encoder = image_encoder.cuda()
    image_encoder.eval()

    # The VGG network
    VGG = VGGNet()
    print("Load the VGG model")
    VGG.cuda()
    VGG.eval()

    # The main module
    netG = G_DCGAN() if cfg.GAN.B_DCGAN else G_NET()
    model_dir = cfg.TRAIN.NET_G
    s_tmp = model_dir[:model_dir.rfind('.pth')]
    state_dict = torch.load(model_dir,
                            map_location=lambda storage, loc: storage)
    netG.load_state_dict(state_dict)
    print('Load G from: ', model_dir)
    netG.cuda()
    netG.eval()

    # The DCM. NET_C is known to be non-empty here (see the guard above), so
    # the checkpoint is always loaded.
    netDCM = DCM_Net()
    state_dict = torch.load(cfg.TRAIN.NET_C,
                            map_location=lambda storage, loc: storage)
    netDCM.load_state_dict(state_dict)
    print('Load DCM from: ', cfg.TRAIN.NET_C)
    netDCM.cuda()
    netDCM.eval()

    nz = cfg.GAN.Z_DIM
    # ``Variable(..., volatile=True)`` has had no effect since PyTorch 0.4;
    # ``no_grad`` is what actually disables autograd for inference.
    with torch.no_grad():
        for key in data_dic:
            save_dir = '%s/%s' % (s_tmp, key)
            mkdir_p(save_dir)
            captions, cap_lens, sorted_indices, imgs = data_dic[key]

            batch_size = captions.shape[0]
            captions = torch.from_numpy(captions).cuda()
            cap_lens = torch.from_numpy(cap_lens).cuda()
            cap_lens_np = cap_lens.cpu().numpy()
            mask = (captions == 0)  # padding positions

            for i in range(1):
                noise = torch.FloatTensor(batch_size, nz).cuda()

                # (1) Extract text and image embeddings
                hidden = text_encoder.init_hidden(batch_size)
                words_embs, sent_emb = text_encoder(captions, cap_lens, hidden)
                real_img = imgs[cfg.TREE.BRANCH_NUM - 1].unsqueeze(0)
                region_features, cnn_code = image_encoder(real_img)

                # (2) Modify real images
                noise.normal_(0, 1)
                fake_imgs, attention_maps, mu, logvar, h_code, c_code = netG(
                    noise, sent_emb, words_embs, mask, cnn_code,
                    region_features)

                real_features = VGG(real_img)[0]
                fake_img = netDCM(h_code, real_features, sent_emb, words_embs,
                                  mask, c_code)

                for j in range(batch_size):
                    save_name = '%s/%d_s_%d' % (save_dir, i, sorted_indices[j])

                    for k, stage in enumerate(fake_imgs):
                        im = stage[j].detach().cpu().numpy()
                        # Rescale by (x + 1) * 127.5, i.e. -1..1 -> 0..255.
                        im = ((im + 1.0) * 127.5).astype(np.uint8)
                        im = np.transpose(im, (1, 2, 0))  # CHW -> HWC
                        Image.fromarray(im).save('%s_g%d.png' % (save_name, k))

                    for k, attn_maps in enumerate(attention_maps):
                        if len(fake_imgs) > 1:
                            im = fake_imgs[k + 1].detach().cpu()
                        else:
                            im = fake_imgs[0].detach().cpu()
                        att_sze = attn_maps.size(2)
                        img_set, _ = build_super_images2(
                            im[j].unsqueeze(0), captions[j].unsqueeze(0),
                            [cap_lens_np[j]], self.ixtoword,
                            [attn_maps[j]], att_sze)
                        if img_set is not None:
                            Image.fromarray(img_set).save(
                                '%s_a%d.png' % (save_name, k))

                    # Output of the DCM for this caption.
                    save_name = '%s/%d_sf_%d' % (save_dir, 1, sorted_indices[j])
                    im = fake_img[j].detach().cpu().numpy()
                    im = ((im + 1.0) * 127.5).astype(np.uint8)
                    im = np.transpose(im, (1, 2, 0))
                    Image.fromarray(im).save('%s_SF.png' % (save_name))

                # The input image, for reference.
                # NOTE(review): index 2 and the "1_s_9" name are hard-coded;
                # presumably index 2 equals cfg.TREE.BRANCH_NUM - 1 -- confirm.
                save_name = '%s/%d_s_%d' % (save_dir, 1, 9)
                im = imgs[2].data.cpu().numpy()
                im = ((im + 1.0) * 127.5).astype(np.uint8)
                im = np.transpose(im, (1, 2, 0))
                Image.fromarray(im).save('%s_SR.png' % (save_name))
def gen_example(self, data_dic):
    """Generate images for the captions in ``data_dic`` in batches of 16.

    Loads the text encoder (``cfg.TRAIN.NET_E``, only entries whose name and
    shape match the freshly built encoder) and the generator
    (``cfg.TRAIN.NET_G``) onto the GPU. For each key it writes below
    ``<NET_G up to '.pth'>/<key>/``:

    * ``s_<idx>_g<k>.png`` and ``stage_<k>/<idx>_g<k>.png`` -- generator
      output ``k``,
    * ``s_<idx>_a<k>.png`` -- attention visualisation ``k``.

    Args:
        data_dic: mapping ``key -> (captions, cap_lens, sorted_indices)`` with
            NumPy arrays for ``captions`` and ``cap_lens``.

    Returns:
        None. Prints an error and does nothing when ``cfg.TRAIN.NET_G`` is
        empty.
    """
    if cfg.TRAIN.NET_G == '':
        print('Error: the path for morels is not found!')
        return

    batch_size = 16
    text_encoder = RNN_ENCODER(self.n_words, nhidden=cfg.TEXT.EMBEDDING_DIM)
    print("=======self.n_words: %d" % self.n_words)
    state_dict = torch.load(cfg.TRAIN.NET_E,
                            map_location=lambda storage, loc: storage)
    # Custom restore of the text encoder: take only the checkpoint entries
    # that exist in this encoder with the same shape. Rebinding keys of the
    # dict returned by ``state_dict()`` does not touch the module, so the
    # filtered entries are loaded explicitly.
    own_state = text_encoder.state_dict()
    compatible = {
        name: param for name, param in state_dict.items()
        if name in own_state and own_state[name].shape == param.shape
    }
    text_encoder.load_state_dict(compatible, strict=False)
    print('Load text encoder from:', cfg.TRAIN.NET_E)
    text_encoder = text_encoder.cuda()
    text_encoder.eval()

    netG = G_DCGAN() if cfg.GAN.B_DCGAN else G_NET(text_encoder)
    model_dir = cfg.TRAIN.NET_G
    s_tmp = model_dir[:model_dir.rfind('.pth')]
    state_dict = torch.load(model_dir,
                            map_location=lambda storage, loc: storage)
    netG.load_state_dict(state_dict)
    print('Load G from: ', model_dir)
    netG.cuda()
    netG.eval()

    nz = cfg.GAN.Z_DIM
    with torch.no_grad():
        for key in data_dic:
            save_dir = '%s/%s' % (s_tmp, key)
            mkdir_p(save_dir)
            captions, cap_lens, sorted_indices = data_dic[key]
            # NOTE(review): captions beyond the last full batch are skipped.
            total_time = len(captions) // batch_size

            for i in range(total_time):
                start, stop = i * batch_size, (i + 1) * batch_size
                noise = torch.FloatTensor(batch_size, nz).cuda()
                caption_tmp = torch.from_numpy(captions[start:stop])
                if i < 3:
                    print(caption_tmp.data)
                cap_len_tmp = torch.from_numpy(cap_lens[start:stop])
                caption_tmp = caption_tmp.cuda()
                cap_len_tmp = cap_len_tmp.cuda()

                # (1) Extract text embeddings
                # words_embs: batch_size x nef x seq_len
                # sent_emb: batch_size x nef
                words_embs, sent_emb, _ = text_encoder(
                    caption_tmp, cap_len_tmp, None)
                mask = (caption_tmp == 0)  # padding positions

                # (2) Generate fake images
                # Re-seed from system entropy; seeding with a datetime object
                # raises TypeError on Python 3.11 and later.
                random.seed()
                rnd = random.randint(0, 1000)
                torch.cuda.manual_seed(rnd)
                noise.normal_(0, 1)
                fake_imgs, attention_maps, _, _, _ = netG(
                    noise, sent_emb, words_embs, mask, caption_tmp, cap_len_tmp)

                cap_lens_np = cap_len_tmp.cpu().numpy()
                for j in range(batch_size):
                    sample_idx = sorted_indices[start + j]
                    save_name = '%s/s_%d' % (save_dir, sample_idx)

                    for k, stage in enumerate(fake_imgs):
                        im = stage[j].detach().cpu().numpy()
                        # Rescale by (x + 1) / 2 * 255, i.e. -1..1 -> 0..255.
                        im = (((im + 1.0) / 2) * 255.0).astype(np.uint8)
                        im = np.transpose(im, (1, 2, 0))  # CHW -> HWC
                        im = Image.fromarray(im)
                        im.save('%s_g%d.png' % (save_name, k))

                        # save to separate directory
                        save_dir2 = '%s/stage_%d' % (save_dir, k)
                        mkdir_p(save_dir2)
                        im.save('%s/%d_g%d.png' % (save_dir2, sample_idx, k))

                    for k, attn_maps in enumerate(attention_maps):
                        if len(fake_imgs) > 1:
                            im = fake_imgs[k + 1].detach().cpu()
                        else:
                            im = fake_imgs[0].detach().cpu()
                        att_sze = attn_maps.size(2)
                        # Pass the caption length as a NumPy value (as the
                        # other generation helpers do) instead of a CUDA
                        # tensor element.
                        img_set, _ = build_super_images2(
                            im[j].unsqueeze(0), caption_tmp[j].unsqueeze(0),
                            [cap_lens_np[j]], self.ixtoword,
                            [attn_maps[j]], att_sze)
                        if img_set is not None:
                            Image.fromarray(img_set).save(
                                '%s_a%d.png' % (save_name, k))
def generate_image_sent(sent, model_values):
    """Generate images and attention maps for a single sentence.

    Files are written as ``output/my_img_g<k>.png`` (generator output ``k``)
    and ``output/my_img_a<k>.png`` (attention visualisation ``k``). The
    ``output`` folder is not created here and has to exist already.

    Args:
        sent: the sentence to visualise; tokenised by ``tokenize_sent``.
        model_values: tuple ``(algo, text_encoder, netG, dataset)``.
            ``dataset.wordtoix`` and ``algo.ixtoword`` supply the vocabulary.
    """
    algo, text_encoder, netG, dataset = model_values
    my_caption = tokenize_sent(sent, dataset.wordtoix)
    my_cap_len = [len(my_caption[0])]

    batch_size = 1
    nz = cfg.GAN.Z_DIM

    # ``Variable(..., volatile=True)`` has had no effect since PyTorch 0.4;
    # ``no_grad`` is what actually disables autograd for inference.
    with torch.no_grad():
        # Convert the inputs to tensors; the caption is cast to LongTensor
        # explicitly.
        my_caption = torch.from_numpy(np.array(my_caption))
        my_cap_len = torch.from_numpy(np.array(my_cap_len))
        my_caption = my_caption.type(torch.LongTensor)
        noise = torch.FloatTensor(batch_size, nz)
        if cfg.CUDA:
            my_caption = my_caption.cuda()
            my_cap_len = my_cap_len.cuda()
            noise = noise.cuda()

        # (1) Extract text embeddings
        # words_embs: batch_size x nef x seq_len
        # sent_emb: batch_size x nef
        hidden = text_encoder.init_hidden(batch_size)
        words_embs, sent_emb = text_encoder(my_caption, my_cap_len, hidden)
        mask = (my_caption == 0)  # padding positions

        # (2) Generate fake images
        noise.normal_(0, 1)
        my_fake_imgs, my_attention_maps, _, _ = netG(
            noise, sent_emb, words_embs, mask)

    # Needed to turn the tokenised caption back into text.
    my_cap_lens_np = my_cap_len.cpu().numpy()

    save_name = 'output/my_img'
    for j in range(batch_size):  # always a single sentence
        for k, stage in enumerate(my_fake_imgs):
            im = stage[j].detach().cpu().numpy()
            # Rescale by (x + 1) * 127.5, i.e. -1..1 -> 0..255.
            im = ((im + 1.0) * 127.5).astype(np.uint8)
            im = np.transpose(im, (1, 2, 0))  # CHW -> HWC
            Image.fromarray(im).save('%s_g%d.png' % (save_name, k))

        for k, attn_maps in enumerate(my_attention_maps):
            if len(my_fake_imgs) > 1:
                im = my_fake_imgs[k + 1].detach().cpu()
            else:
                im = my_fake_imgs[0].detach().cpu()
            att_sze = attn_maps.size(2)
            img_set, _ = build_super_images2(
                im[j].unsqueeze(0), my_caption[j].unsqueeze(0),
                [my_cap_lens_np[j]], algo.ixtoword, [attn_maps[j]], att_sze)
            if img_set is not None:
                Image.fromarray(img_set).save('%s_a%d.png' % (save_name, k))
def generate(caption, wordtoix, ixtoword, text_encoder, netG, blob_service,
             copies=2, export_onnx=False):
    """Generate images for ``caption`` and upload them with ``BlobClient``.

    Uploads use the module-level ``MY_CONNECTION_STRING`` and
    ``MY_CREDENTIAL`` for every blob; no account key is embedded in this
    function.

    Args:
        caption: text to visualise; vectorised by ``vectorize_caption``.
        wordtoix: mapping word -> integer id.
        ixtoword: mapping integer id -> word (used for the attention images).
        text_encoder: loaded text encoder (must provide ``init_hidden``).
        netG: loaded generator.
        blob_service: legacy client; only used for the ONNX export
            (``create_blob_from_path``).
        copies: forwarded to ``vectorize_caption``. With exactly 2, only the
            first sample is uploaded, together with its attention maps. With
            more than 2, every sample is uploaded under its own ``<j>/``
            sub-folder and no attention maps are produced.
        export_onnx: when true, export both networks to ONNX, upload the two
            files to the ``models`` container and return ``None`` without
            generating image URLs. This replaces a previously unreachable
            ``if False:`` branch.

    Returns:
        List of URLs of the uploaded PNG files, or ``None`` when
        ``export_onnx`` is set.
    """
    captions, cap_lens = vectorize_caption(wordtoix, caption, copies)
    batch_size = captions.shape[0]
    nz = cfg.GAN.Z_DIM

    # ``Variable(..., volatile=True)`` has had no effect since PyTorch 0.4;
    # ``no_grad`` is what actually disables autograd for inference.
    with torch.no_grad():
        captions = torch.from_numpy(captions)
        cap_lens = torch.from_numpy(cap_lens)
        noise = torch.FloatTensor(batch_size, nz)
        if cfg.CUDA:
            captions = captions.cuda()
            cap_lens = cap_lens.cuda()
            noise = noise.cuda()

        # (1) Extract text embeddings
        hidden = text_encoder.init_hidden(batch_size)
        words_embs, sent_emb = text_encoder(captions, cap_lens, hidden)
        mask = (captions == 0)  # padding positions

        # (2) Generate fake images
        noise.normal_(0, 1)
        fake_imgs, attention_maps, _, _ = netG(noise, sent_emb, words_embs, mask)

    if export_onnx:
        print("saving text_encoder.onnx")
        torch.onnx._export(text_encoder, (captions, cap_lens, hidden),
                           "text_encoder.onnx", export_params=True)
        print("uploading text_encoder.onnx")
        blob_service.create_blob_from_path(
            'models', "text_encoder.onnx", os.path.abspath("text_encoder.onnx"))
        print("done")

        print("saving netg.onnx")
        torch.onnx._export(netG, (noise, sent_emb, words_embs, mask),
                           "netg.onnx", export_params=True)
        print("uploading netg.onnx")
        blob_service.create_blob_from_path(
            'models', "netg.onnx", os.path.abspath("netg.onnx"))
        print("done")
        return

    cap_lens_np = cap_lens.cpu().numpy()

    # storing to blob storage
    container_name = "images"
    full_path = "https://attgan123.blob.core.windows.net/images/%s"
    # One timestamped "folder" per call keeps results of different requests apart.
    prefix = datetime.now().strftime('%Y/%B/%d/%H_%M_%S_%f')
    urls = []

    def _upload_png(image, blob_name):
        # Encode ``image`` as PNG in memory, upload it and record its URL.
        # Secrets come from configuration only; never inline a connection
        # string or account key here.
        stream = io.BytesIO()
        image.save(stream, format="png")
        stream.seek(0)
        blob = BlobClient.from_connection_string(
            MY_CONNECTION_STRING, container_name, blob_name,
            credential=MY_CREDENTIAL)
        blob.upload_blob(stream, overwrite=True)
        urls.append(full_path % blob_name)

    for j in range(batch_size):
        for k, stage in enumerate(fake_imgs):
            im = stage[j].detach().cpu().numpy()
            # Rescale by (x + 1) * 127.5, i.e. -1..1 -> 0..255.
            im = ((im + 1.0) * 127.5).astype(np.uint8)
            im = np.transpose(im, (1, 2, 0))  # CHW -> HWC
            if copies > 2:
                blob_name = '%s/%d/%s_g%d.png' % (prefix, j, "bird", k)
            else:
                blob_name = '%s/%s_g%d.png' % (prefix, "bird", k)
            _upload_png(Image.fromarray(im), blob_name)

        if copies == 2:
            for k, attn_maps in enumerate(attention_maps):
                if len(fake_imgs) > 1:
                    im = fake_imgs[k + 1].detach().cpu()
                else:
                    im = fake_imgs[0].detach().cpu()
                att_sze = attn_maps.size(2)
                img_set, _ = build_super_images2(
                    im[j].unsqueeze(0), captions[j].unsqueeze(0),
                    [cap_lens_np[j]], ixtoword, [attn_maps[j]], att_sze)
                if img_set is not None:
                    blob_name = '%s/%s_a%d.png' % (prefix, "attmaps", k)
                    _upload_png(Image.fromarray(img_set), blob_name)
            # only look at the first sample
            break

    return urls
def gen_img(sentences):
    """Generate images from example sentences and return them in memory.

    Uses the module-level ``wordtoix``, ``text_encoder``, ``netG`` and
    ``dataset`` objects and runs on the CPU.

    Args:
        sentences: iterable of caption strings. Empty strings, strings without
            word tokens and strings without any word known to ``wordtoix``
            are skipped.

    Returns:
        List of ``PIL.Image`` objects. For every usable sentence (longest
        caption first) it holds all generator outputs followed by the
        attention visualisations that ``build_super_images2`` produced. The
        list is empty when no sentence is usable.
    """
    output = []
    sentences = [sent for sent in sentences if len(sent) > 0]
    if not sentences:
        # Nothing to do; previously this ended in np.max([]) -> ValueError.
        return output

    from nltk.tokenize import RegexpTokenizer

    # Build the tokenizer once instead of once per sentence.
    tokenizer = RegexpTokenizer(r'\w+')
    captions = []
    for sent in sentences:
        sent = sent.replace("\ufffd\ufffd", " ")
        tokens = tokenizer.tokenize(sent.lower())
        if len(tokens) == 0:
            print('sent', sent)
            continue

        rev = []
        for t in tokens:
            t = t.encode('ascii', 'ignore').decode('ascii')
            if len(t) > 0 and t in wordtoix:
                rev.append(wordtoix[t])
        if not rev:
            # A caption without any known word has length 0 and cannot be
            # encoded, so it is skipped.
            print('sent', sent)
            continue
        captions.append(rev)

    if not captions:
        return output

    # Sort captions by decreasing length and pad with 0 on the right.
    cap_lens = np.asarray([len(cap) for cap in captions])
    sorted_indices = np.argsort(cap_lens)[::-1]
    cap_lens = cap_lens[sorted_indices]
    cap_array = np.zeros((len(captions), int(cap_lens.max())), dtype='int64')
    for row, idx in enumerate(sorted_indices):
        cap = captions[idx]
        cap_array[row, :len(cap)] = cap

    batch_size = cap_array.shape[0]
    nz = 100
    # Pure inference: keep autograd off for the complete pass.
    with torch.no_grad():
        captions = torch.from_numpy(cap_array).cpu()
        cap_lens = torch.from_numpy(cap_lens).cpu()
        cap_lens_np = cap_lens.numpy()
        mask = (captions == 0)  # padding positions

        for i in range(1):
            noise = torch.FloatTensor(batch_size, nz).cpu()
            hidden = text_encoder.init_hidden(batch_size)
            words_embs, sent_emb = text_encoder(captions, cap_lens, hidden)
            noise.normal_(0, 1)
            fake_imgs, attention_maps, _, _ = netG(
                noise, sent_emb, words_embs, mask)

            for j in range(batch_size):
                for stage in fake_imgs:
                    im = stage[j].detach().cpu().numpy()
                    # Rescale by (x + 1) * 127.5, i.e. -1..1 -> 0..255.
                    im = ((im + 1.0) * 127.5).astype(np.uint8)
                    im = np.transpose(im, (1, 2, 0))  # CHW -> HWC
                    output.append(Image.fromarray(im))

                for k, attn_maps in enumerate(attention_maps):
                    if len(fake_imgs) > 1:
                        im = fake_imgs[k + 1].detach().cpu()
                    else:
                        im = fake_imgs[0].detach().cpu()
                    att_sze = attn_maps.size(2)
                    img_set, _ = build_super_images2(
                        im[j].unsqueeze(0), captions[j].unsqueeze(0),
                        [cap_lens_np[j]], dataset.ixtoword,
                        [attn_maps[j]], att_sze)
                    if img_set is not None:
                        output.append(Image.fromarray(img_set))

    return output
def generate(caption, wordtoix, ixtoword, text_encoder, netG, copies=2):
    """Generate upscaled images and attention maps for ``caption``.

    Args:
        caption: text to visualise; vectorised by ``vectorize_caption``.
        wordtoix: mapping word -> integer id.
        ixtoword: mapping integer id -> word (used for the attention images).
        text_encoder: loaded text encoder (must provide ``init_hidden``).
        netG: loaded generator.
        copies: forwarded to ``vectorize_caption``. With more than 2, image
            names are prefixed with the sample index ``<j>/``.

    Returns:
        Dict mapping a file name to an image array: ``coco_g<k>.png`` (or
        ``<j>/coco_g<k>.png``) holds generator output ``k`` after two passes
        through ``rdn.predict``; ``attmaps_a<k>.png`` holds attention
        visualisation ``k``.
    """
    captions, cap_lens = vectorize_caption(wordtoix, caption, copies)
    batch_size = captions.shape[0]
    nz = cfg.GAN.Z_DIM

    # Pure inference: keep autograd off for the complete forward pass, not
    # only while the input tensors are created.
    with torch.no_grad():
        captions = torch.from_numpy(captions)
        cap_lens = torch.from_numpy(cap_lens)
        noise = torch.FloatTensor(batch_size, nz)
        if cfg.CUDA:
            captions = captions.cuda()
            cap_lens = cap_lens.cuda()
            noise = noise.cuda()

        # (1) Extract text embeddings
        hidden = text_encoder.init_hidden(batch_size)
        words_embs, sent_emb = text_encoder(captions, cap_lens, hidden)
        mask = (captions == 0)  # padding positions

        # (2) Generate fake images
        noise.normal_(0, 1)
        fake_imgs, attention_maps, _, _ = netG(noise, sent_emb, words_embs, mask)

    cap_lens_np = cap_lens.cpu().numpy()

    names2images = {}
    for j in range(batch_size):
        for k, stage in enumerate(fake_imgs):
            im = stage[j].detach().cpu().numpy()
            # Rescale by (x + 1) * 127.5, i.e. -1..1 -> 0..255.
            im = ((im + 1.0) * 127.5).astype(np.uint8)
            im = np.transpose(im, (1, 2, 0))  # CHW -> HWC
            # The image is passed through ``rdn`` twice on purpose.
            im = rdn.predict(im)
            im = rdn.predict(im)

            if copies > 2:
                blob_name = osp.join(str(j), f'coco_g{k}.png')
            else:
                blob_name = f'coco_g{k}.png'
            names2images[blob_name] = im

        # NOTE(review): attention-map names do not contain ``j``, so each
        # sample overwrites the maps of the previous one.
        for k, attn_maps in enumerate(attention_maps):
            if len(fake_imgs) > 1:
                im = fake_imgs[k + 1].detach().cpu()
            else:
                im = fake_imgs[0].detach().cpu()
            att_sze = attn_maps.size(2)
            img_set, _ = build_super_images2(
                im[j].unsqueeze(0), captions[j].unsqueeze(0),
                [cap_lens_np[j]], ixtoword, [attn_maps[j]], att_sze)
            if img_set is not None:
                names2images[f'attmaps_a{k}.png'] = img_set

    return names2images