def __getitem__(self, idx):
    """
    returns:
        dict (Tensors): contains 'images', 'segannos', 'segannos_all' and 'sentence'
    """
    query = self._query_seqs[idx]
    seqname = query.split(' ', 3)[0]

    # We require the sequence to begin with a nonempty frame, and consider all
    # objects in that frame to be tracked. A starting frame is valid if it is
    # followed by seqlen-1 frames with corresponding images.
    frame_ids = self.get_frame_ids(seqname)
    viable_starting_frame_ids = [
        idx for idx in self.get_nonempty_frame_ids(seqname)
        if idx <= frame_ids[-self._seqlen]
    ]
    frame_ids = self._select_frame_ids(frame_ids, viable_starting_frame_ids)

    images = torch.stack([
        self._image_read(self._full_image_path(seqname, idx))
        for idx in frame_ids
    ])
    segannos = torch.stack([
        self._anno_read(self._full_anno_path(seqname, idx))
        for idx in frame_ids
    ])
    if self._joint_transform is not None:
        images, segannos = self._joint_transform(images, segannos)

    # One-hot mask for the queried object id, and masks for all objects.
    object_name = int(query.split(' ', 3)[1])
    segannos_one = To_onehot(segannos, object_name)
    segannos_all = To_allhot(segannos)

    # Tokenize the referring expression (padded/truncated to 20 tokens).
    sentence = query.split(' ', 3)[3]
    txt = np.array(
        text_processing.preprocess_sentence(sentence, self.vocab_dict, 20))

    return {
        'images': images,
        'segannos': segannos_one,
        'segannos_all': segannos_all,
        'sentence': txt
    }
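# The helpers below are NOT part of the original listing. They are a minimal
# sketch, assuming `segannos` holds integer object ids of shape
# [seqlen, 1, H, W], of what To_onehot / To_allhot (called in __getitem__
# above) might do: a binary mask for one object id, and stacked binary masks
# for every non-background id.
import torch


def To_onehot(segannos, object_id):
    # Binary mask (0/1) selecting the pixels labelled with `object_id`.
    return (segannos == object_id).long()


def To_allhot(segannos):
    # One binary channel per object id present in the clip; id 0 is background.
    object_ids = [int(i) for i in segannos.unique() if int(i) != 0]
    return torch.stack([(segannos == i).long() for i in object_ids], dim=1)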
def __getitem__(self, index):
    sent = self.qdb[index][0]
    sentence = text_p.preprocess_sentence(sent, self.vocab_file, self.word_count)
    target = self.qdb[index][1]
    sentence = np.array(sentence).astype(np.int64)
    target = np.array(target).astype(np.float32)
    return sentence, target
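# Usage sketch, not part of the original source: how items shaped like the
# (sentence, target) pairs returned by __getitem__ above feed a standard
# PyTorch DataLoader. The toy dataset below stands in for the real query
# database (`qdb`) and vocabulary.
import numpy as np
from torch.utils.data import DataLoader, Dataset


class _ToySentenceTargetDataset(Dataset):
    def __init__(self, n_items=8, word_count=20):
        self.n_items = n_items
        self.word_count = word_count

    def __len__(self):
        return self.n_items

    def __getitem__(self, index):
        sentence = np.zeros(self.word_count, dtype=np.int64)  # padded token ids
        target = np.zeros(1, dtype=np.float32)                # dummy target
        return sentence, target


loader = DataLoader(_ToySentenceTargetDataset(), batch_size=4, shuffle=True)
for sentences, targets in loader:
    # The default collate stacks these into torch.int64 [B, word_count] and
    # torch.float32 [B, 1] tensors.
    break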
def __getitem__(self, idx):
    """
    returns:
        dict (Tensors): contains 'images', 'segannos', 'segannos_all' and 'sentence'
    """
    query = self._query_seqs[idx]
    # Note: only the 2017 version is supported for training as of now.
    seqname = query.split(' ', 2)[0]

    # We require the sequence to begin with a nonempty frame, and consider all
    # objects in that frame to be tracked. A starting frame is valid if it is
    # followed by seqlen-1 frames with corresponding images.
    frame_ids = self.get_frame_ids(seqname)
    viable_starting_frame_ids = [
        idx for idx in self.get_nonempty_frame_ids(seqname)
        if idx <= frame_ids[-self._seqlen]
    ]
    frame_ids = self._select_frame_ids(frame_ids, viable_starting_frame_ids)

    images = torch.stack([
        self._image_read(self._full_image_path(seqname, idx))
        for idx in frame_ids
    ])
    segannos = torch.stack([
        self._anno_read(self._full_anno_path(seqname, idx))
        for idx in frame_ids
    ])
    if self._joint_transform is not None:
        images, segannos = self._joint_transform(images, segannos)

    # One-hot mask for the queried object id, and masks for all objects.
    object_name = int(query.split(' ', 2)[1])
    segannos_one = To_onehot(segannos, object_name)
    segannos_all = To_allhot(segannos)

    # The referring expression is the quoted part of the query line.
    sentence = query.split(' ', 2)[2].split('"')[1]
    txt = np.array(
        text_processing.preprocess_sentence(sentence, self.vocab_dict, 20))

    return {
        'images': images,
        'segannos': segannos_one,
        'segannos_all': segannos_all,
        'sentence': txt
    }
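# Sketch of the `_select_frame_ids` helper used above. This is an assumption,
# not the original implementation: pick a random viable starting frame and
# return the `self._seqlen` consecutive frame ids that follow from it, matching
# the "seqlen-1 frames with corresponding images" requirement stated above.
import random


def _select_frame_ids(self, frame_ids, viable_starting_frame_ids):
    start = random.choice(viable_starting_frame_ids)
    start_pos = frame_ids.index(start)
    return frame_ids[start_pos:start_pos + self._seqlen]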
def get_video_generator(self):
    for query in self._query_seqs:
        seqname = query.split(' ', 2)[0]
        object_name = int(query.split(' ', 2)[1])
        sentence = query.split(' ', 2)[2].split('"')[1]
        txt = np.array(
            text_processing.preprocess_sentence(sentence, self.vocab_dict, 20))
        if seqname in self._all_seqs:
            yield (seqname, self._get_video(seqname, object_name, txt))
def get_video_generator(self, low=0, high=2**31):
    """Returns a video generator.

    The video generator is used to obtain parts of a sequence. Some assumptions
    are made depending on whether the train or valid split is used. For the
    train split, the first annotated frame is given and no other annotation is
    used. For the validation split, each annotation found is given.
    """
    for query in self._query_seqs:
        seqname = query.split(' ', 2)[0]
        object_name = int(query.split(' ', 2)[1])
        sentence = query.split(' ', 2)[2].split('"')[1]
        txt = np.array(
            text_processing.preprocess_sentence(sentence, self.vocab_dict, 20))
        if seqname in self._all_seqs:
            yield (seqname, self._get_video(seqname, object_name, txt))
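# Usage sketch (assumed names, not from the original source): iterating the
# generator above to evaluate one (sequence, object, expression) query at a
# time. `dataset` is an instance of the class defining get_video_generator and
# `model` is a hypothetical tracker with a `track` method.
def run_per_sequence_evaluation(dataset, model):
    for seqname, video in dataset.get_video_generator():
        # `video` is whatever _get_video yields for this sequence: the frames
        # plus the tokenized referring expression for the queried object.
        model.track(seqname, video)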
def __getitem__(self, index):
    sent = self.qdb[index]
    sentence = text_p.preprocess_sentence(sent, self.vocab_file, self.word_count)
    sentence = np.array(sentence).astype(np.int64)
    return sentence
def data_preparation(**kwargs):
    dataset = kwargs['dataset']
    data_base_dir = kwargs['data_base_dir']
    text_len = kwargs['text_len']

    if dataset == 'both':
        dataset_types = ['train', 'val']
    else:
        dataset_types = [dataset]

    caption_data_base_dir = os.path.join(data_base_dir, 'captions')
    image_data_base_dir = os.path.join(data_base_dir, 'images')
    categories = os.listdir(caption_data_base_dir)
    categories.sort()

    vocab_file = os.path.join(data_base_dir, 'vocab.txt')
    vocab_dict = load_vocab_dict_from_file(vocab_file)

    for dataset_type in dataset_types:
        data_save_split_base = os.path.join(data_base_dir, 'tfrecord', dataset_type)
        os.makedirs(data_save_split_base, exist_ok=True)

        for category_id, category_name in enumerate(categories):
            record_filename = os.path.join(data_save_split_base,
                                           category_name + '.tfrecord')
            with tf.python_io.TFRecordWriter(record_filename) as tfrecord_writer:
                json_file_path = os.path.join(caption_data_base_dir, category_name,
                                              dataset_type + '.json')
                with open(json_file_path, 'r') as fp:
                    json_data = json.loads(fp.read())
                nImgs = len(json_data)
                print(dataset_type, category_name, nImgs)

                for j in range(nImgs):
                    image_name = json_data[j]['key']
                    cartoon_path = os.path.join(image_data_base_dir, category_name,
                                                'cartoon', image_name)
                    sketch_path = os.path.join(image_data_base_dir, category_name,
                                               'edgemap', image_name)

                    cartoon_image = Image.open(cartoon_path).convert("RGB")
                    cartoon_image = np.array(cartoon_image, dtype=np.uint8)  # shape = [H, W, 3]
                    cartoon_image_raw = cartoon_image.tobytes()

                    sketch_image = Image.open(sketch_path).convert("RGB")
                    sketch_image = np.array(sketch_image, dtype=np.uint8)  # shape = [H, W, 3]
                    sketch_image_raw = sketch_image.tobytes()

                    color_text = json_data[j]['color_text']
                    vocab_indices = preprocess_sentence(color_text, vocab_dict, text_len)  # list
                    vocab_indices_raw = np.array(vocab_indices, dtype=np.uint8).tobytes()  # [text_len]

                    example = _to_tfexample_raw(
                        image_name.encode(), cartoon_image_raw, sketch_image_raw,
                        category_name.encode(), category_id, color_text.encode(),
                        vocab_indices_raw)
                    tfrecord_writer.write(example.SerializeToString())
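# Sketch of the `_to_tfexample_raw` helper used above. The feature key names
# here are assumptions; only the general pattern (packing raw bytes and ids
# into a tf.train.Example with TF1-style feature constructors) is implied by
# the calling code.
def _to_tfexample_raw(image_name, cartoon_image_raw, sketch_image_raw,
                      category_name, category_id, color_text, vocab_indices_raw):
    def _bytes(value):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    def _int64(value):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    return tf.train.Example(features=tf.train.Features(feature={
        'image_name': _bytes(image_name),
        'cartoon_image_raw': _bytes(cartoon_image_raw),
        'sketch_image_raw': _bytes(sketch_image_raw),
        'category_name': _bytes(category_name),
        'category_id': _int64(category_id),
        'color_text': _bytes(color_text),
        'vocab_indices_raw': _bytes(vocab_indices_raw),
    }))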
def inference(img_name, instruction):
    wild_data_base_dir = 'examples'
    wild_text = instruction
    wild_cate = img_name[:img_name.find('.png')]

    SIZE = {True: (64, 64), False: (192, 192)}
    T = 15  # the longest length of text
    vocab_file = 'data/vocab.txt'
    captions_base_dir = os.path.join('data', 'captions')
    categories = os.listdir(captions_base_dir)
    categories.sort()
    if wild_cate not in categories:
        wild_cate = categories[2]

    # Roll out the parameters
    batch_size = 1
    ckpt_dir = Config.ckpt_dir
    results_dir = Config.results_dir
    data_format = Config.data_format
    distance_map = Config.distance_map
    small_img = Config.small_img
    LSTM_hybrid = Config.LSTM_hybrid
    block_type = Config.block_type
    vocab_size = Config.vocab_size

    distance_map = distance_map != 0
    small = small_img != 0
    LSTM_hybrid = LSTM_hybrid != 0
    img_dim = SIZE[small]

    output_folder = results_dir
    print('output_folder:', output_folder)
    os.makedirs(output_folder, exist_ok=True)

    vocab_dict = load_vocab_dict_from_file(vocab_file)

    input_images = tf.placeholder(tf.float32,
                                  shape=[1, 3, img_dim[0], img_dim[1]])  # [1, 3, H, W]
    class_ids = tf.placeholder(tf.int32, shape=(1, ))  # (1, )
    text_vocab_indiceses = tf.placeholder(tf.int32, shape=[1, 15])  # [1, 15]

    ret_list = build_single_graph(
        input_images, input_images, None,
        class_ids, None, text_vocab_indiceses,
        batch_size=batch_size,
        training=False,
        LSTM_hybrid=LSTM_hybrid,
        vocab_size=vocab_size,
        data_format=data_format,
        distance_map=distance_map,
        block_type=block_type)  # [image_gens, images, sketches]

    snapshot_loader = tf.train.Saver()

    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        print('Restore trained model:', tf.train.latest_checkpoint(ckpt_dir))
        snapshot_loader.restore(sess, tf.train.latest_checkpoint(ckpt_dir))

        input_name = img_name
        input_category = wild_cate  # e.g. 'bus'
        input_text = wild_text  # e.g. 'A yellow bus with blue window'

        sketch_path = os.path.join(wild_data_base_dir, input_name)
        sketch_image = Image.open(sketch_path).convert("RGB")

        # Resize
        if sketch_image.width != img_dim[0] or sketch_image.height != img_dim[1]:
            margin_size = 0 if input_category in ['road'] else 10
            sketch_image = resize_and_padding_mask_image(
                sketch_image, img_dim[0],
                margin_size=margin_size).astype(np.float32)
        else:
            sketch_image = np.array(sketch_image, dtype=np.float32)  # shape = [H, W, 3]

        # Normalize to [-1, 1]
        sketch_image = sketch_image / 255.
        sketch_image = sketch_image * 2. - 1

        sketch_image = np.expand_dims(sketch_image, axis=0)  # shape = [1, H, W, 3]
        sketch_image = np.transpose(sketch_image, [0, 3, 1, 2])  # shape = [1, 3, H, W]

        class_id = categories.index(input_category)
        class_id = np.array([class_id])

        vocab_indices = preprocess_sentence(input_text, vocab_dict, T)  # list
        vocab_indices = np.array(vocab_indices, dtype=np.int32)
        vocab_indices = np.expand_dims(vocab_indices, axis=0)  # shape = [1, 15]

        try:
            generated_img, _, input_sketch = sess.run(
                [ret_list[0], ret_list[1], ret_list[2]],
                feed_dict={
                    input_images: sketch_image,
                    class_ids: class_id,
                    text_vocab_indiceses: vocab_indices
                })
        except Exception as e:
            print(e.args)
            return  # the outputs below would be undefined if the forward pass failed

        if data_format == 'NCHW':
            generated_img = np.transpose(generated_img, (0, 2, 3, 1))
            input_sketch = np.transpose(input_sketch, (0, 2, 3, 1))

        # Map from [-1, 1] back to [0, 255]
        generated_img = ((generated_img + 1) / 2.) * 255
        input_sketch = ((input_sketch + 1) / 2.) * 255
        generated_img = generated_img[:, :, :, ::-1].astype(np.uint8)  # RGB -> BGR for cv2
        input_sketch = input_sketch.astype(np.uint8)

        img_out_filename = input_name[:-4] + '_output.png'
        sketch_in_filename = input_name[:-4] + '_input.png'

        # Save files
        cv2.imwrite(os.path.join(output_folder, img_out_filename), generated_img[0])
        cv2.imwrite(os.path.join(output_folder, sketch_in_filename), input_sketch[0])
        print('Saved file %s' % img_out_filename)
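# Example invocation (a sketch): it assumes a sketch file 'examples/bus.png'
# and a trained checkpoint under Config.ckpt_dir exist; the category is
# derived from the file name, as done inside inference() above.
if __name__ == '__main__':
    inference('bus.png', 'A yellow bus with blue window')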