def __getitem__(self, index):
    mode = self.mode.lower()
    if mode == 'train':
        data_path, label_path = self.train_data[index], self.train_labels[index]
    elif mode == 'valid':
        data_path, label_path = self.valid_data[index], self.valid_labels[index]
    elif mode == 'test':
        data_path, label_path = self.test_data[index], self.test_labels[index]
    else:
        raise RuntimeError(
            'Unexpected dataset mode. Supported modes are: train, valid and test')

    image, label = utils.pil_loader(data_path, label_path)

    if self.data_transform is not None:
        image = self.data_transform(image)
    if self.label_transform is not None:
        label = self.label_transform(label)

    # Perform one-hot encoding of the label
    target = utils.one_hot_encode(label)
    target = torch.FloatTensor(target)

    return image, label, target
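# For reference, a minimal sketch of a paired image/label loader like the
# `utils.pil_loader(data_path, label_path)` used above. This is an assumption
# about its behaviour, not the actual implementation from `utils`; the name
# `pil_loader_pair` is illustrative.
from PIL import Image

def pil_loader_pair(data_path, label_path):
    # Convert inside the `with` blocks so PIL's lazy file handles are fully
    # read before the files are closed.
    with open(data_path, 'rb') as f:
        image = Image.open(f).convert('RGB')
    with open(label_path, 'rb') as f:
        label = Image.open(f).convert('L')  # single-channel label/mask
    return image, label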
def extract_feature(model, image_path):
    try:
        img = pil_loader(image_path)
        img = img.resize((256, 256))
        img_data = image.img_to_array(img)
    except Exception:
        # Fall back to a black image if the file is missing or unreadable
        img_data = np.zeros((256, 256, 3))
    img_data = np.expand_dims(img_data, axis=0)
    img_data = preprocess_input(img_data)
    feature = model.predict(img_data)
    return feature
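# Hypothetical usage sketch: the source does not say which backbone or Keras
# namespace provides `image` and `preprocess_input`; VGG16 from
# `tensorflow.keras` is assumed here purely for illustration.
import numpy as np
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing import image

model = VGG16(weights='imagenet', include_top=False, pooling='avg')
feature = extract_feature(model, 'example.jpg')  # shape (1, 512) for VGG16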
def _read(self, idx=None):
    if idx is None:
        idx = np.random.randint(self.num)
    fn = os.path.join(self.prefix, self.img_lst[idx])
    lb = self.lb_lst[idx]
    try:
        # Read the raw bytes and decode them with PIL, closing the file promptly
        with open(fn, 'rb') as f:
            img = pil_loader(f.read())
        return img, lb
    except Exception as err:
        print('Read image[{}, {}] failed ({})'.format(idx, fn, err))
        # Retry with a fresh random index rather than crashing the loader
        return self._read()
def _read(self, idx=None):
    if idx is None:
        idx = np.random.randint(self.num)
    idx %= self.num
    fn = self.img_lst[idx]
    lb = self.lb_lst[idx]
    try:
        with open(fn, 'rb') as f:
            img = pil_loader(f.read())
        return img, lb, fn
    except Exception as err:
        print('Read image[{}, {}] failed ({})'.format(idx, fn, err))
        # Fall back to a random sample on failure
        return self._read()
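# Both `_read` variants above pass raw file bytes to `pil_loader`, so it
# presumably decodes in-memory data. A minimal sketch of such a variant
# (an assumption, not the project's actual helper):
import io
from PIL import Image

def pil_loader_bytes(img_bytes):
    # Decode encoded image bytes (e.g. JPEG/PNG) into an RGB PIL image
    with io.BytesIO(img_bytes) as buf:
        return Image.open(buf).convert('RGB')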
def read_image(src, line):
    line = line.strip().split()
    paths = [os.path.join(src, p) for p in line[:3]]
    target = int(line[3])

    def load_or_random(path):
        # On a failed read, fall back to a random solid-colour image;
        # the caller then flags the triplet as a negative sample.
        try:
            return utils.pil_loader(path), True
        except Exception:
            colour = tuple(random.randint(0, 255) for _ in range(3))
            return Image.new('RGB', (opt.imgH, opt.imgH), colour), False

    images = []
    for path in paths:
        img, ok = load_or_random(path)
        if not ok:
            target = 0
        images.append(img)
    img1, img2, img3 = images
    return (img1, img2, img3), target
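# Hypothetical call, inferring the list-file format from the parsing above:
# three image paths relative to `src`, followed by an integer target.
(img1, img2, img3), target = read_image('/data/images', 'a.jpg b.jpg c.jpg 1')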
def __getitem__(self, index: int) -> dict:
    """
    :param index: The index of the item to retrieve
    :return: One data pair (image and caption).
    """
    imgid = self.imgids[index]
    image_path = self.imagepaths[imgid]
    image = utils.pil_loader(image_path)
    return {
        'image': image,
        'id': imgid,
        'image_file': os.path.basename(image_path),
    }
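# Because the returned dict holds a raw PIL image, PyTorch's default collate
# cannot batch it; a list-preserving collate_fn is one workaround. Sketch only:
# `dataset` stands for an instance of the class above.
from torch.utils.data import DataLoader

def collate_dicts(batch):
    # Keep PIL images (and ids) as plain per-key lists instead of tensors
    return {key: [item[key] for item in batch] for key in batch[0]}

loader = DataLoader(dataset, batch_size=8, collate_fn=collate_dicts)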
def _load_image(self, fn):
    if self.memcached:
        try:
            # Fetch the raw bytes from memcached and decode them with PIL
            img_value = mc.pyvector()
            self.mclient.Get(fn, img_value)
            img_value_str = mc.ConvertBuffer(img_value)
            img = utils.pil_loader(img_value_str)
        except Exception:
            print('Read image failed ({})'.format(fn))
            raise Exception("Exit")
        else:
            return img
    else:
        return Image.open(fn).convert('RGB')
updown_pred = [updown_qent[ent['question_id']]['answer'] for ent in label]
import utils
utils.accuracy(gt, updown_pred)
RCN_pred = [RCN_qent[ent['question_id']] for ent in label]
utils.accuracy(gt, RCN_pred)

#%%
import os
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import utils

#%%
# Collect AMT questions where RCN and UpDown disagree, RCN is wrong,
# and UpDown is right; save the image with question/answer annotations.
entdis = []
for ent in tqdm(testset):
    qid = ent['question_id']
    if RCN_qent[qid] != int(updown_qent[qid]['answer']):
        if RCN_qent[qid] != ent['answer'] and ent['data_source'] == 'amt':
            if int(updown_qent[qid]['answer']) == ent['answer']:
                ent['disagree'] = 'RCN = {} updown = {}'.format(
                    RCN_qent[qid], int(updown_qent[qid]['answer']))
                entdis.append(ent)
                path = os.path.join('/home/manoj/', ent['image'])
                a = utils.pil_loader(path)
                plt.imshow(np.asarray(a))
                plt.title("{} {}".format(ent['question'], ent['answer']))
                plt.xlabel(ent['disagree'])
                plt.ylabel(ent['question_id'])
                l = ent['image'].split("/")[-1]
                plt.savefig("disagg/" + l, dpi=150)
                plt.close()
def generate_caption_visualization(encoder, decoder, img_path, word_dict, beam_size=3):
    '''
    Visualize the step-by-step development of the caption along with the
    corresponding attention-component visualization.

    Arguments:
        encoder: Instance of the trained Encoder for encoding of images
        decoder: Instance of the trained Decoder for caption prediction from the encoded image
        img_path (str): Complete path of the image to be visualized
        word_dict (dict): Dictionary of words (vocabulary)
        beam_size (int): Number of top candidates to consider for beam search. Default = 3
    '''
    # Load the image and transform it
    img = pil_loader(img_path)
    img = data_transforms(img)
    img = torch.FloatTensor(img)
    img = img.unsqueeze(0)

    # Get the caption and the corresponding attention weights from the trained network
    img_features = encoder(img)
    img_features = img_features.expand(beam_size, img_features.size(1), img_features.size(2))
    sentence, alpha = decoder.caption(img_features, beam_size)

    # Using the dictionary, convert the encoded caption back to words
    token_dict = {idx: word for word, idx in word_dict.items()}
    sentence_tokens = []
    for word_idx in sentence:
        sentence_tokens.append(token_dict[word_idx])
        if word_idx == word_dict['<eos>']:
            break

    # Resize and centre-crop the image to 224x224 for a standard display
    img = Image.open(img_path)
    w, h = img.size
    if w > h:
        w = w * 256 / h
        h = 256
    else:
        h = h * 256 / w
        w = 256
    left = (w - 224) / 2
    top = (h - 224) / 2
    resized_img = img.resize((int(w), int(h)), Image.BICUBIC).crop(
        (left, top, left + 224, top + 224))
    img = np.array(resized_img.convert('RGB').getdata()).reshape(224, 224, 3)
    img = img.astype('float32') / 255

    num_words = len(sentence_tokens)
    alpha = torch.tensor(alpha)

    # Plot the attention-weighted versions of the original image alongside
    # the predicted caption words
    f = plt.figure(figsize=(8, 9))
    plot_height = ceil((num_words + 3) / 4.0)
    f.add_subplot(4, plot_height, 1)
    plt.imshow(img)
    plt.axis('off')
    for idx in range(num_words):
        f.add_subplot(4, plot_height, idx + 2)
        label = sentence_tokens[idx]
        plt.text(0, 1, label, backgroundcolor='white', fontsize=13)
        plt.text(0, 1, label, color='black', fontsize=13)
        plt.imshow(img)
        # VGG-19 feature maps are 14x14; the other supported backbone is 7x7
        shape_size = 14 if encoder.network == 'vgg19' else 7
        alpha_img = skimage.transform.pyramid_expand(
            alpha[idx, :].reshape(shape_size, shape_size), upscale=16, sigma=20)
        plt.imshow(alpha_img, alpha=0.8)
        plt.set_cmap(cm.Greys_r)
        plt.axis('off')
    plt.show()
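# Illustrative invocation only: the checkpoint and vocabulary file names are
# assumptions, and the real loading code depends on how the models were saved.
import json
import torch

with open('word_dict.json') as f:
    word_dict = json.load(f)
encoder = torch.load('encoder.pth', map_location='cpu').eval()
decoder = torch.load('decoder.pth', map_location='cpu').eval()

generate_caption_visualization(encoder, decoder, 'example.jpg', word_dict, beam_size=5)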
def generate_image_caption(encoder, decoder, img_path, word_dict, beam_size=3, ax=None):
    '''
    Display the image along with the predicted caption.

    Arguments:
        encoder: Instance of the trained Encoder for encoding of images
        decoder: Instance of the trained Decoder for caption prediction from the encoded image
        img_path (str): Complete path of the image to be visualized
        word_dict (dict): Dictionary of words (vocabulary)
        beam_size (int): Number of top candidates to consider for beam search. Default = 3
        ax: axes for plotting (defaults to the current matplotlib axes)
    '''
    if ax is None:
        ax = plt.gca()

    # Load the image and transform it
    img = pil_loader(img_path)
    img = data_transforms(img)
    img = torch.FloatTensor(img)
    img = img.unsqueeze(0)

    # Get the caption from the trained network
    img_features = encoder(img)
    img_features = img_features.expand(beam_size, img_features.size(1), img_features.size(2))
    sentence, alpha = decoder.caption(img_features, beam_size)

    # Using the dictionary, convert the encoded caption back to words
    token_dict = {idx: word for word, idx in word_dict.items()}
    sentence_tokens = []
    for word_idx in sentence:
        if word_idx == word_dict['<start>']:
            continue
        if word_idx == word_dict['<eos>']:
            break
        sentence_tokens.append(token_dict[word_idx])

    # Resize and centre-crop the image to 224x224 for a standard display
    img = Image.open(img_path)
    w, h = img.size
    if w > h:
        w = w * 256 / h
        h = 256
    else:
        h = h * 256 / w
        w = 256
    left = (w - 224) / 2
    top = (h - 224) / 2
    resized_img = img.resize((int(w), int(h)), Image.BICUBIC).crop(
        (left, top, left + 224, top + 224))
    img = np.array(resized_img.convert('RGB').getdata()).reshape(224, 224, 3)
    img = img.astype('float32') / 255

    # Join the words into a sentence, ending with a full stop
    caption = ' '.join(sentence_tokens) + '.'

    ax.imshow(img)
    ax.set_title(caption.capitalize())
    ax.axis('off')
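# Since the function takes an `ax`, several captions fit in one figure.
# Sketch only: `encoder`, `decoder`, `word_dict` and the image paths are
# assumed to exist as in the example above.
import matplotlib.pyplot as plt

image_paths = ['img1.jpg', 'img2.jpg', 'img3.jpg', 'img4.jpg']  # illustrative
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for ax, path in zip(axes.flat, image_paths):
    generate_image_caption(encoder, decoder, path, word_dict, ax=ax)
plt.tight_layout()
plt.show()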
def get_image(path):
    im = pil_loader(path)
    im = im.convert("RGB")
    im = transform_train(im)
    # Add a batch dimension: (C, H, W) -> (1, C, H, W)
    im = torch.reshape(im, (1,) + im.shape)
    return im
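# Hypothetical call site, assuming `transform_train` ends in ToTensor() and
# `model` is a trained classifier; neither is defined in the source.
import torch

batch = get_image('example.jpg')  # shape (1, C, H, W)
with torch.no_grad():
    pred = model(batch).argmax(dim=1).item()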