def testDecodePngInstanceMasks(self):
    """Checks that PNG-encoded instance masks are decoded back to arrays.

    Two random 10x10 masks with values in {0, 1} are PNG-encoded into the
    'image/object/mask' feature. The decoded groundtruth instance masks must
    equal the stacked, squeezed float32 versions of the source masks.
    """
    rgb_pixels = np.random.randint(256, size=(10, 10, 3)).astype(np.uint8)
    jpeg_bytes = self._EncodeImage(rgb_pixels)

    # Source masks carry a trailing channel axis of size one.
    first_mask = np.random.randint(0, 2, size=(10, 10, 1)).astype(np.uint8)
    second_mask = np.random.randint(0, 2, size=(10, 10, 1)).astype(np.uint8)
    source_masks = (first_mask, second_mask)

    png_bytes_list = [
        self._EncodeImage(source, encoding_type='png')
        for source in source_masks
    ]
    expected_masks = np.stack(
        [np.squeeze(source.astype(np.float32)) for source in source_masks])

    feature_map = {
        'image/encoded': dataset_util.bytes_feature(jpeg_bytes),
        'image/format': dataset_util.bytes_feature(b'jpeg'),
        'image/object/mask': dataset_util.bytes_list_feature(png_bytes_list),
    }
    serialized = tf.train.Example(
        features=tf.train.Features(feature=feature_map)).SerializeToString()

    decoder = tf_example_decoder.TfExampleDecoder(
        load_instance_masks=True,
        instance_mask_type=input_reader_pb2.PNG_MASKS)
    decoded = decoder.decode(tf.convert_to_tensor(serialized))

    with self.test_session() as sess:
        decoded = sess.run(decoded)

    masks_key = fields.InputDataFields.groundtruth_instance_masks
    self.assertAllEqual(expected_masks, decoded[masks_key])
def testDecodeAdditionalChannels(self):
    """Checks decoding of two identical additional image channels.

    The same encoded single-channel image is stored twice under
    'image/additional_channels/encoded'; the decoder output must equal the
    decoded channel concatenated with itself along axis 2.
    """
    rgb_pixels = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8)
    jpeg_bytes = self._EncodeImage(rgb_pixels)

    channel_pixels = np.random.randint(256, size=(4, 5, 1)).astype(np.uint8)
    channel_bytes = self._EncodeImage(channel_pixels)
    # Compare against the decoded bytes rather than the raw pixels.
    channel_decoded = self._DecodeImage(channel_bytes)

    feature_map = {
        'image/encoded': dataset_util.bytes_feature(jpeg_bytes),
        'image/additional_channels/encoded':
            dataset_util.bytes_list_feature([channel_bytes] * 2),
        'image/format': dataset_util.bytes_feature(b'jpeg'),
        'image/source_id': dataset_util.bytes_feature(b'image_id'),
    }
    serialized = tf.train.Example(
        features=tf.train.Features(feature=feature_map)).SerializeToString()

    decoder = tf_example_decoder.TfExampleDecoder(num_additional_channels=2)
    decoded = decoder.decode(tf.convert_to_tensor(serialized))

    with self.test_session() as sess:
        decoded = sess.run(decoded)

    expected_channels = np.concatenate([channel_decoded] * 2, axis=2)
    self.assertAllEqual(
        expected_channels,
        decoded[fields.InputDataFields.image_additional_channels])
def testDecodeEmptyPngInstanceMasks(self):
    """Checks the decoded mask shape when no PNG masks are present.

    With an empty 'image/object/mask' list and height/width features of 10,
    the decoded instance masks are expected to have shape [0, 10, 10].
    """
    rgb_pixels = np.random.randint(256, size=(10, 10, 3)).astype(np.uint8)
    jpeg_bytes = self._EncodeImage(rgb_pixels)
    no_masks = []

    feature_map = {
        'image/encoded': dataset_util.bytes_feature(jpeg_bytes),
        'image/format': dataset_util.bytes_feature(b'jpeg'),
        'image/object/mask': dataset_util.bytes_list_feature(no_masks),
        'image/height': dataset_util.int64_feature(10),
        'image/width': dataset_util.int64_feature(10),
    }
    serialized = tf.train.Example(
        features=tf.train.Features(feature=feature_map)).SerializeToString()

    decoder = tf_example_decoder.TfExampleDecoder(
        load_instance_masks=True,
        instance_mask_type=input_reader_pb2.PNG_MASKS)
    decoded = decoder.decode(tf.convert_to_tensor(serialized))

    with self.test_session() as sess:
        decoded = sess.run(decoded)

    decoded_masks = decoded[fields.InputDataFields.groundtruth_instance_masks]
    self.assertAllEqual(decoded_masks.shape, [0, 10, 10])
def create_tf_example(group):
    """Builds a tf.train.Example for one image and its annotated signs.

    Args:
      group: object exposing `filename` (image file name, joined onto the
        module-level `images_dir`) and `object`, whose `iterrows()` yields
        rows with 'xmin', 'xmax', 'ymin', 'ymax', 'sign_class' and 'sign_id'.

    Returns:
      The tf.train.Example holding the encoded image, its size as reported by
      PIL, and the normalized boxes with their class text and ids.
    """
    path = images_dir + '/' + group.filename
    with tf.gfile.GFile(path, 'rb') as image_file:
        encoded_jpg = image_file.read()
    width, height = Image.open(io.BytesIO(encoded_jpg)).size

    filename = group.filename.encode('utf8')
    image_format = b'jpg'

    rows = [row for _, row in group.object.iterrows()]

    # NOTE: coordinates are divided by 2 because the image is resized by half.
    xmins = [(row['xmin'] / 2) / width for row in rows]
    xmaxs = [(row['xmax'] / 2) / width for row in rows]
    ymins = [(row['ymin'] / 2) / height for row in rows]
    ymaxs = [(row['ymax'] / 2) / height for row in rows]
    classes_text = [row['sign_class'].encode('utf8') for row in rows]
    classes = [int(row['sign_id']) for row in rows]

    feature_map = {
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(filename),
        'image/source_id': dataset_util.bytes_feature(filename),
        'image/encoded': dataset_util.bytes_feature(encoded_jpg),
        'image/format': dataset_util.bytes_feature(image_format),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
        'image/object/class/text':
            dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label':
            dataset_util.int64_list_feature(classes),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_map))
def create_tf_example(group):
    """Builds a tf.train.Example for one image and its annotated signs.

    Boxes are given as a top-left corner ('x_from', 'y_from') plus a
    'width'/'height' extent in pixels and are converted to corner coordinates
    normalized by the image size.

    Args:
      group: object exposing `filename` (resolved to a path through
        `file_location`) and `object`, whose `iterrows()` yields rows with
        'x_from', 'y_from', 'width', 'height', 'sign_class' and
        'sign_class_id'.

    Returns:
      The tf.train.Example holding the encoded image, its size as reported by
      PIL, and the normalized boxes with their class text and ids.
    """
    path = file_location(group.filename)
    with tf.gfile.GFile(path, 'rb') as fid:
        encoded_jpg = fid.read()
    image = Image.open(io.BytesIO(encoded_jpg))
    width, height = image.size

    filename = group.filename.encode('utf8')
    image_format = b'jpg'

    xmins = []
    xmaxs = []
    ymins = []
    ymaxs = []
    classes_text = []
    classes = []

    for _, row in group.object.iterrows():
        xmins.append(row['x_from'] / width)
        # The far edge is (origin + extent); the sum must be normalized as a
        # whole, otherwise only the extent is divided by the image size.
        xmaxs.append((row['x_from'] + row['width']) / width)
        ymins.append(row['y_from'] / height)
        ymaxs.append((row['y_from'] + row['height']) / height)
        classes_text.append(row['sign_class'].encode('utf8'))
        # Coerce to a plain int so the int64 feature gets a native value.
        classes.append(int(row['sign_class_id']))

    tf_example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(filename),
        'image/source_id': dataset_util.bytes_feature(filename),
        'image/encoded': dataset_util.bytes_feature(encoded_jpg),
        'image/format': dataset_util.bytes_feature(image_format),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
        'image/object/class/text':
            dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label':
            dataset_util.int64_list_feature(classes),
    }))
    return tf_example
def testDecodeImageLabels(self):
    """Checks decoding of image-level labels given as ids and as text.

    First case: 'image/class/label' ids [1, 2] are returned unchanged.
    Second case: 'image/class/text' names are mapped through a label map
    file ('dog' -> 1, 'cat' -> 3).
    """
    classes_key = fields.InputDataFields.groundtruth_image_classes
    rgb_pixels = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8)
    jpeg_bytes = self._EncodeImage(rgb_pixels)

    # Case 1: numeric image-level labels.
    id_features = {
        'image/encoded': dataset_util.bytes_feature(jpeg_bytes),
        'image/format': dataset_util.bytes_feature(b'jpeg'),
        'image/class/label': dataset_util.int64_list_feature([1, 2]),
    }
    serialized = tf.train.Example(
        features=tf.train.Features(feature=id_features)).SerializeToString()
    decoder = tf_example_decoder.TfExampleDecoder()
    decoded = decoder.decode(tf.convert_to_tensor(serialized))
    with self.test_session() as sess:
        decoded = sess.run(decoded)
    self.assertTrue(classes_key in decoded)
    self.assertAllEqual(decoded[classes_key], np.array([1, 2]))

    # Case 2: textual image-level labels resolved through a label map.
    text_features = {
        'image/encoded': dataset_util.bytes_feature(jpeg_bytes),
        'image/format': dataset_util.bytes_feature(b'jpeg'),
        'image/class/text':
            dataset_util.bytes_list_feature([b'dog', b'cat']),
    }
    serialized = tf.train.Example(
        features=tf.train.Features(feature=text_features)).SerializeToString()
    label_map_string = """ item { id:3 name:'cat' } item { id:1 name:'dog' } """
    label_map_path = os.path.join(self.get_temp_dir(), 'label_map.pbtxt')
    with tf.gfile.Open(label_map_path, 'wb') as label_map_file:
        label_map_file.write(label_map_string)

    decoder = tf_example_decoder.TfExampleDecoder(
        label_map_proto_file=label_map_path)
    decoded = decoder.decode(tf.convert_to_tensor(serialized))
    with self.test_session() as sess:
        # Run the table initializer before evaluating the decoded tensors.
        sess.run(tf.tables_initializer())
        decoded = sess.run(decoded)
    self.assertTrue(classes_key in decoded)
    self.assertAllEqual(decoded[classes_key], np.array([1, 3]))
def create_tf_record(self, has_additional_channels=False, num_examples=1):
    """Writes a small TFRecord file of synthetic detection examples.

    Every example shares the same random 4x5 JPEG, one full-image box with
    class label 2 and an all-ones flat mask; only 'image/source_id' (the
    example index) differs.

    Args:
      has_additional_channels: when True, each example also stores the same
        encoded single-channel image twice under
        'image/additional_channels/encoded'.
      num_examples: number of examples to write.

    Returns:
      Path of the written file, 'tfrecord' inside the test's temp directory.
    """
    path = os.path.join(self.get_temp_dir(), 'tfrecord')
    writer = tf.python_io.TFRecordWriter(path)

    rgb_pixels = np.random.randint(255, size=(4, 5, 3)).astype(np.uint8)
    channel_pixels = np.random.randint(255, size=(4, 5, 1)).astype(np.uint8)
    flat_mask = [1.0] * (4 * 5)

    with self.test_session():
        encoded_jpeg = tf.image.encode_jpeg(tf.constant(rgb_pixels)).eval()
        encoded_channel_jpeg = tf.image.encode_jpeg(
            tf.constant(channel_pixels)).eval()

        for example_index in range(num_examples):
            source_id = str(example_index).encode()
            feature_map = {
                'image/source_id': dataset_util.bytes_feature(source_id),
                'image/encoded': dataset_util.bytes_feature(encoded_jpeg),
                'image/format':
                    dataset_util.bytes_feature('jpeg'.encode('utf8')),
                'image/height': dataset_util.int64_feature(4),
                'image/width': dataset_util.int64_feature(5),
                'image/object/bbox/xmin':
                    dataset_util.float_list_feature([0.0]),
                'image/object/bbox/xmax':
                    dataset_util.float_list_feature([1.0]),
                'image/object/bbox/ymin':
                    dataset_util.float_list_feature([0.0]),
                'image/object/bbox/ymax':
                    dataset_util.float_list_feature([1.0]),
                'image/object/class/label':
                    dataset_util.int64_list_feature([2]),
                'image/object/mask':
                    dataset_util.float_list_feature(flat_mask),
            }
            if has_additional_channels:
                feature_map['image/additional_channels/encoded'] = (
                    dataset_util.bytes_list_feature(
                        [encoded_channel_jpeg] * 2))
            record = tf.train.Example(
                features=tf.train.Features(feature=feature_map))
            writer.write(record.SerializeToString())
        writer.close()

    return path
def testDecodeObjectLabelWithMappingWithName(self):
    """Checks that object class names are mapped to ids via the label map.

    With a label map of 'cat' -> 3 and 'dog' -> 1, the class text
    [b'cat', b'dog'] must decode to [3, 1]; the static shape of the class
    tensor is expected to be [None].
    """
    classes_key = fields.InputDataFields.groundtruth_classes
    rgb_pixels = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8)
    jpeg_bytes = self._EncodeImage(rgb_pixels)
    class_names = [b'cat', b'dog']

    feature_map = {
        'image/encoded': dataset_util.bytes_feature(jpeg_bytes),
        'image/format': dataset_util.bytes_feature(b'jpeg'),
        'image/object/class/text':
            dataset_util.bytes_list_feature(class_names),
    }
    serialized = tf.train.Example(
        features=tf.train.Features(feature=feature_map)).SerializeToString()

    label_map_string = """ item { id:3 name:'cat' } item { id:1 name:'dog' } """
    label_map_path = os.path.join(self.get_temp_dir(), 'label_map.pbtxt')
    with tf.gfile.Open(label_map_path, 'wb') as label_map_file:
        label_map_file.write(label_map_string)

    decoder = tf_example_decoder.TfExampleDecoder(
        label_map_proto_file=label_map_path)
    decoded = decoder.decode(tf.convert_to_tensor(serialized))

    static_shape = decoded[classes_key].get_shape().as_list()
    self.assertAllEqual(static_shape, [None])

    with self.test_session() as sess:
        # Run the table initializer before evaluating the decoded tensors.
        sess.run(tf.tables_initializer())
        decoded = sess.run(decoded)

    self.assertAllEqual([3, 1], decoded[classes_key])
def create_tf_example(image,
                      image_dir,
                      bbox_annotations=None,
                      category_index=None,
                      caption_annotations=None,
                      include_masks=False):
    """Converts one image and its annotations to a tf.Example proto.

    Args:
      image: dict; the keys read here are 'height', 'width', 'file_name' and
        'id'.
      image_dir: directory containing the image files.
      bbox_annotations: list of dicts; the keys read here are 'bbox',
        'iscrowd', 'category_id', 'area' and, when include_masks is set,
        'segmentation'. Each 'bbox' is an [x, y, width, height] tuple in
        absolute coordinates; it is converted to xmin/xmax/ymin/ymax values
        normalized by the image size.
      category_index: dict keyed by category id whose values provide a 'name'.
      caption_annotations: list of dicts; the 'caption' key of each is stored.
      include_masks: whether to store PNG-encoded instance masks built from
        the 'segmentation' entries. Default: False.

    Returns:
      key: hex SHA-256 digest of the encoded image bytes.
      example: the converted tf.Example.
      num_annotations_skipped: number of box annotations that were ignored
        because their extent was non-positive or reached past the image.
    """
    image_height = image['height']
    image_width = image['width']
    filename = image['file_name']
    image_id = image['id']

    full_path = os.path.join(image_dir, filename)
    with tf.gfile.GFile(full_path, 'rb') as image_file:
        encoded_jpg = image_file.read()
    # The opened image itself is not used further.
    PIL.Image.open(io.BytesIO(encoded_jpg))
    key = hashlib.sha256(encoded_jpg).hexdigest()

    feature_dict = {
        'image/height': dataset_util.int64_feature(image_height),
        'image/width': dataset_util.int64_feature(image_width),
        'image/filename': dataset_util.bytes_feature(filename.encode('utf8')),
        'image/source_id':
            dataset_util.bytes_feature(str(image_id).encode('utf8')),
        'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')),
        'image/encoded': dataset_util.bytes_feature(encoded_jpg),
        'image/format': dataset_util.bytes_feature('jpeg'.encode('utf8')),
    }

    num_annotations_skipped = 0
    if bbox_annotations:
        xmin, xmax, ymin, ymax = [], [], [], []
        is_crowd, category_names, category_ids, area = [], [], [], []
        encoded_mask_png = []

        for annotation in bbox_annotations:
            (x, y, width, height) = tuple(annotation['bbox'])
            degenerate = width <= 0 or height <= 0
            if degenerate or (x + width > image_width or
                              y + height > image_height):
                num_annotations_skipped += 1
                continue

            xmin.append(float(x) / image_width)
            xmax.append(float(x + width) / image_width)
            ymin.append(float(y) / image_height)
            ymax.append(float(y + height) / image_height)
            is_crowd.append(annotation['iscrowd'])
            category_id = int(annotation['category_id'])
            category_ids.append(category_id)
            category_names.append(
                category_index[category_id]['name'].encode('utf8'))
            area.append(annotation['area'])

            if include_masks:
                run_len_encoding = mask.frPyObjects(
                    annotation['segmentation'], image_height, image_width)
                binary_mask = mask.decode(run_len_encoding)
                if not annotation['iscrowd']:
                    # Collapse the last axis with a max for non-crowd masks.
                    binary_mask = np.amax(binary_mask, axis=2)
                png_buffer = io.BytesIO()
                PIL.Image.fromarray(binary_mask).save(png_buffer, format='PNG')
                encoded_mask_png.append(png_buffer.getvalue())

        feature_dict.update({
            'image/object/bbox/xmin': dataset_util.float_list_feature(xmin),
            'image/object/bbox/xmax': dataset_util.float_list_feature(xmax),
            'image/object/bbox/ymin': dataset_util.float_list_feature(ymin),
            'image/object/bbox/ymax': dataset_util.float_list_feature(ymax),
            'image/object/class/text':
                dataset_util.bytes_list_feature(category_names),
            'image/object/class/label':
                dataset_util.int64_list_feature(category_ids),
            'image/object/is_crowd':
                dataset_util.int64_list_feature(is_crowd),
            'image/object/area': dataset_util.float_list_feature(area),
        })
        if include_masks:
            feature_dict['image/object/mask'] = (
                dataset_util.bytes_list_feature(encoded_mask_png))

    if caption_annotations:
        captions = [
            caption_annotation['caption'].encode('utf8')
            for caption_annotation in caption_annotations
        ]
        feature_dict.update(
            {'image/caption': dataset_util.bytes_list_feature(captions)})

    example = tf.train.Example(
        features=tf.train.Features(feature=feature_dict))
    return key, example, num_annotations_skipped
def prepare_tfexample(image_path, annotations, label_map_dict):
    """Builds a tf.train.Example for one image and its box annotations.

    Box coordinates are normalized by the image size, min/max pairs given in
    the wrong order are swapped, boxes whose minimum lies at or beyond the
    image edge are dropped, and maxima beyond the edge are clipped to 1.0.
    Class entries are dropped together with their boxes so that boxes and
    labels always have the same length.

    Args:
      image_path: path of the image file; also stored as filename and
        source id.
      annotations: mapping with 'xmin', 'ymin', 'xmax', 'ymax' (array-like
        pixel coordinates supporting boolean-mask indexing) and 'class'
        (iterable of keys of the module-level `sign_name_carolo_dict`).
      label_map_dict: currently unused; numeric labels are derived from the
        enumeration order of `sign_name_carolo_dict` (starting at 1).

    Returns:
      The tf.train.Example for the image.
    """
    image = np.asarray(pil.open(image_path))
    width = int(image.shape[1])
    height = int(image.shape[0])

    xmin_norm = annotations['xmin'] / float(width)
    ymin_norm = annotations['ymin'] / float(height)
    xmax_norm = annotations['xmax'] / float(width)
    ymax_norm = annotations['ymax'] / float(height)
    # Tracked alongside the boxes so that filtering keeps both in step.
    classes = list(annotations['class'])

    # The swap mask is computed once: recomputing it after the first
    # assignment would yield an empty mask and a shape-mismatch error.
    swap_x = xmin_norm > xmax_norm
    if np.any(swap_x):
        logging.warning(
            'Image {}, xmin and xmax are replaced: {} - {} / {} - {}'.format(
                image_path, xmin_norm, xmax_norm, annotations['xmin'],
                annotations['xmax']))
        previous_min = xmin_norm[swap_x]
        xmin_norm[swap_x] = xmax_norm[swap_x]
        xmax_norm[swap_x] = previous_min

    swap_y = ymin_norm > ymax_norm
    if np.any(swap_y):
        logging.warning(
            'Image {}, ymin and ymax are replaced: {} - {} / {} - {}'.format(
                image_path, ymin_norm, ymax_norm, annotations['ymin'],
                annotations['ymax']))
        previous_min = ymin_norm[swap_y]
        ymin_norm[swap_y] = ymax_norm[swap_y]
        ymax_norm[swap_y] = previous_min

    if np.any(xmin_norm > 1.0) or np.any(xmin_norm < 0.0):
        logging.warning(
            'Image {}, x_min out of bounds: {} / {} - bound: {}'.format(
                image_path, xmin_norm, annotations['xmin'], width))
        # Broken annotation: drop boxes whose minimum is not below 1.0.
        # Negative minima are only logged, not removed.
        keep = xmin_norm < 1.0
        xmin_norm = xmin_norm[keep]
        xmax_norm = xmax_norm[keep]
        ymin_norm = ymin_norm[keep]
        ymax_norm = ymax_norm[keep]
        classes = [cls for cls, kept in zip(classes, keep) if kept]

    if np.any(xmax_norm > 1.0) or np.any(xmax_norm < 0.0):
        logging.warning(
            'Image {}, x_max out of bounds: {} / {} - bound: {}'.format(
                image_path, xmax_norm, annotations['xmax'], width))
        # Clip maxima beyond the image edge to 1.0.
        xmax_norm[xmax_norm > 1.0] = 1.0

    if np.any(ymin_norm > 1.0) or np.any(ymin_norm < 0.0):
        logging.warning(
            'Image {}, y_min out of bounds: {} / {} - bound: {}'.format(
                image_path, ymin_norm, annotations['ymin'], height))
        # Broken annotation: drop boxes whose minimum is not below 1.0.
        keep = ymin_norm < 1.0
        ymin_norm = ymin_norm[keep]
        xmin_norm = xmin_norm[keep]
        ymax_norm = ymax_norm[keep]
        xmax_norm = xmax_norm[keep]
        classes = [cls for cls, kept in zip(classes, keep) if kept]

    if np.any(ymax_norm > 1.0) or np.any(ymax_norm < 0.0):
        logging.warning(
            'Image {}, y_max out of bounds: {} / {} - bound: {}'.format(
                image_path, ymax_norm, annotations['ymax'], height))
        # Clip maxima beyond the image edge to 1.0.
        ymax_norm[ymax_norm > 1.0] = 1.0

    # The "difficult object" labels are ignored for now.
    difficult_obj = [0] * len(xmin_norm)

    with tf.gfile.GFile(image_path, 'rb') as fid:
        encoded_png = fid.read()
    key = hashlib.sha256(encoded_png).hexdigest()

    class_to_key_map = {
        name: idx + 1
        for idx, name in enumerate(sign_name_carolo_dict.keys())
    }

    example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename':
            dataset_util.bytes_feature(image_path.encode('utf8')),
        'image/source_id':
            dataset_util.bytes_feature(image_path.encode('utf8')),
        'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')),
        'image/encoded': dataset_util.bytes_feature(encoded_png),
        'image/format': dataset_util.bytes_feature('jpg'.encode('utf8')),
        'image/object/bbox/xmin':
            dataset_util.float_list_feature(xmin_norm),
        'image/object/bbox/xmax':
            dataset_util.float_list_feature(xmax_norm),
        'image/object/bbox/ymin':
            dataset_util.float_list_feature(ymin_norm),
        'image/object/bbox/ymax':
            dataset_util.float_list_feature(ymax_norm),
        'image/object/class/text': dataset_util.bytes_list_feature(
            [sign_name_carolo_dict[cls].encode('utf8') for cls in classes]),
        'image/object/class/label': dataset_util.int64_list_feature(
            [class_to_key_map[cls] for cls in classes]),
        'image/object/difficult':
            dataset_util.int64_list_feature(difficult_obj),
    }))
    return example
def dict_to_tf_example(data,
                       mask_path,
                       label_map_dict,
                       image_subdirectory,
                       ignore_difficult_instances=False,
                       faces_only=True,
                       mask_type='png'):
    """Converts an XML-derived dict and its mask to a tf.Example proto.

    Bounding box coordinates are normalized by the width and height given in
    data['size'].

    Args:
      data: dict holding the XML fields for a single image; the keys read
        here are 'filename', 'size' and, optionally, 'object'.
      mask_path: path to the PNG-encoded mask. Pixels whose value differs
        from 2 are treated as non-background.
      label_map_dict: map from class name to integer id.
      image_subdirectory: directory that data['filename'] is joined onto.
      ignore_difficult_instances: whether to skip objects flagged as
        difficult. Default: False.
      faces_only: if True, boxes are taken from each object's 'bndbox'.
        Otherwise boxes are the extent of the non-background mask pixels and
        the mask is stored as well.
      mask_type: 'numerical' stores masks as a flat float list, 'png' stores
        PNG-encoded masks. Only consulted when faces_only is False.

    Returns:
      example: the converted tf.Example.

    Raises:
      ValueError: if the image is not in JPEG format or the mask is not in
        PNG format.
    """
    img_path = os.path.join(image_subdirectory, data['filename'])
    with tf.gfile.GFile(img_path, 'rb') as image_file:
        encoded_jpg = image_file.read()
    if PIL.Image.open(io.BytesIO(encoded_jpg)).format != 'JPEG':
        raise ValueError('Image format not JPEG')
    key = hashlib.sha256(encoded_jpg).hexdigest()

    with tf.gfile.GFile(mask_path, 'rb') as mask_file:
        encoded_mask_png = mask_file.read()
    mask_image = PIL.Image.open(io.BytesIO(encoded_mask_png))
    if mask_image.format != 'PNG':
        raise ValueError('Mask format not PNG')

    mask_np = np.asarray(mask_image)
    # Columns / rows containing at least one non-background pixel.
    nonzero_x_indices = np.where(np.any(mask_np != 2, axis=0))
    nonzero_y_indices = np.where(np.any(mask_np != 2, axis=1))

    width = int(data['size']['width'])
    height = int(data['size']['height'])

    xmins, ymins, xmaxs, ymaxs = [], [], [], []
    classes, classes_text = [], []
    truncated, poses, difficult_obj = [], [], []
    masks = []

    annotated_objects = data['object'] if 'object' in data else []
    for obj in annotated_objects:
        difficult = bool(int(obj['difficult']))
        if ignore_difficult_instances and difficult:
            continue
        difficult_obj.append(int(difficult))

        if faces_only:
            box = obj['bndbox']
            xmin = float(box['xmin'])
            xmax = float(box['xmax'])
            ymin = float(box['ymin'])
            ymax = float(box['ymax'])
        else:
            xmin = float(np.min(nonzero_x_indices))
            xmax = float(np.max(nonzero_x_indices))
            ymin = float(np.min(nonzero_y_indices))
            ymax = float(np.max(nonzero_y_indices))

        xmins.append(xmin / width)
        ymins.append(ymin / height)
        xmaxs.append(xmax / width)
        ymaxs.append(ymax / height)

        class_name = get_class_name_from_filename(data['filename'])
        classes_text.append(class_name.encode('utf8'))
        classes.append(label_map_dict[class_name])
        truncated.append(int(obj['truncated']))
        poses.append(obj['pose'].encode('utf8'))

        if not faces_only:
            masks.append((mask_np != 2).astype(np.uint8))

    encoded_name = data['filename'].encode('utf8')
    feature_dict = {
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(encoded_name),
        'image/source_id':
            dataset_util.bytes_feature(data['filename'].encode('utf8')),
        'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')),
        'image/encoded': dataset_util.bytes_feature(encoded_jpg),
        'image/format': dataset_util.bytes_feature('jpeg'.encode('utf8')),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
        'image/object/class/text':
            dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label':
            dataset_util.int64_list_feature(classes),
        'image/object/difficult':
            dataset_util.int64_list_feature(difficult_obj),
        'image/object/truncated':
            dataset_util.int64_list_feature(truncated),
        'image/object/view': dataset_util.bytes_list_feature(poses),
    }

    if not faces_only:
        if mask_type == 'numerical':
            flattened = np.reshape(
                np.stack(masks).astype(np.float32), [-1])
            feature_dict['image/object/mask'] = (
                dataset_util.float_list_feature(flattened.tolist()))
        elif mask_type == 'png':
            encoded_mask_png_list = []
            for instance_mask in masks:
                png_buffer = io.BytesIO()
                PIL.Image.fromarray(instance_mask).save(
                    png_buffer, format='PNG')
                encoded_mask_png_list.append(png_buffer.getvalue())
            feature_dict['image/object/mask'] = (
                dataset_util.bytes_list_feature(encoded_mask_png_list))

    return tf.train.Example(
        features=tf.train.Features(feature=feature_dict))
def create_tf_example(image_df, image2idx):
    """Converts the annotation rows of one image to a tf.Example proto.

    The image '<ImageID>.jpg' is looked up in FLAGS.image_dir and, if it does
    not exist there, in FLAGS.image_dir2. Box coordinates are stored exactly
    as found in the rows (no normalization is applied here), and the stored
    area is |XMax - XMin| * |YMax - YMin| in those same units.

    Args:
      image_df: annotation rows for a single image (presumably a pandas
        DataFrame; verify against the caller). It is read through
        `.ImageID.values[0]` and `.itertuples()`, and every row must expose
        XMin, XMax, YMin, YMax, IsGroupOf and LabelName.
      image2idx: mapping from image id to the value stored as
        'image/source_id'.

    Returns:
      example: the converted tf.Example.
    """
    image_id = image_df.ImageID.values[0]
    filename = image_id + '.jpg'
    full_path = os.path.join(FLAGS.image_dir, filename)
    if not os.path.exists(full_path):
        full_path = os.path.join(FLAGS.image_dir2, filename)

    with tf.gfile.GFile(full_path, 'rb') as fid:
        encoded_jpg = fid.read()
    # Read the size from the bytes already in memory; the context manager
    # releases the PIL image instead of leaving a file handle open.
    with PIL.Image.open(io.BytesIO(encoded_jpg)) as pil_image:
        image_width, image_height = pil_image.size
    key = hashlib.sha256(encoded_jpg).hexdigest()

    xmin = []
    xmax = []
    ymin = []
    ymax = []
    is_crowd = []
    category_names = []
    category_ids = []
    area = []
    for ann in image_df.itertuples():
        xmin.append(ann.XMin)
        xmax.append(ann.XMax)
        ymin.append(ann.YMin)
        ymax.append(ann.YMax)
        # Stored in an int64 list, so convert the flag to 0/1 explicitly.
        is_crowd.append(int(bool(ann.IsGroupOf)))
        category_ids.append(class_indices[ann.LabelName])
        category_names.append(class_labels[ann.LabelName].encode('utf8'))
        area.append(abs((ann.XMax - ann.XMin) * (ann.YMax - ann.YMin)))

    feature_dict = {
        'image/height': dataset_util.int64_feature(image_height),
        'image/width': dataset_util.int64_feature(image_width),
        'image/filename': dataset_util.bytes_feature(filename.encode('utf8')),
        'image/source_id': dataset_util.bytes_feature(
            str(image2idx[image_id]).encode('utf8')),
        'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')),
        'image/encoded': dataset_util.bytes_feature(encoded_jpg),
        'image/format': dataset_util.bytes_feature('jpeg'.encode('utf8')),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmin),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmax),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymin),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymax),
        'image/object/class/text':
            dataset_util.bytes_list_feature(category_names),
        'image/object/class/label':
            dataset_util.int64_list_feature(category_ids),
        'image/object/is_crowd': dataset_util.int64_list_feature(is_crowd),
        'image/object/area': dataset_util.float_list_feature(area),
    }

    example = tf.train.Example(
        features=tf.train.Features(feature=feature_dict))
    return example
def create_tf_example(line):
    """Builds a tf.train.Example from one whitespace-separated annotation line.

    The first token is the image path relative to FLAGS.data_root; every
    further token is a comma-separated box 'xmin,ymin,xmax,ymax,class_id' in
    pixel coordinates. Boxes are normalized by the image size.

    Args:
      line: annotation line as described above.

    Returns:
      The tf.train.Example for the image.

    Raises:
      ValueError: if OpenCV cannot decode the image.
    """
    image_format = b'jpeg'  # b'jpeg' or b'png'

    # split() without an argument ignores the trailing newline and repeated
    # blanks, which would otherwise produce unparsable box tokens.
    elements = line.split()
    image_path = os.path.join(FLAGS.data_root, elements[0])
    with open(image_path, 'rb') as image_file:
        encoded_image_data = image_file.read()

    filename = elements[0].split('/')[-1].replace('.jpg', '').encode('utf-8')
    key = hashlib.sha256(encoded_image_data).hexdigest().encode('utf8')

    # TODO: Find a way to get the size from the bytes already read so the
    # image is not opened twice.
    img = cv2.imread(image_path, 0)
    if img is None:
        raise ValueError('Could not decode image: {}'.format(image_path))
    height, width = img.shape[:2]

    xmins = []
    xmaxs = []
    ymins = []
    ymaxs = []
    classes_text = []
    classes = []
    for box in elements[1:]:
        parts = box.split(',')
        # Divide by floats so the result is fractional on every Python
        # version.
        xmins.append(int(parts[0]) / float(width))
        ymins.append(int(parts[1]) / float(height))
        xmaxs.append(int(parts[2]) / float(width))
        ymaxs.append(int(parts[3]) / float(height))
        class_id = int(parts[4])
        classes.append(class_id)
        # NOTE(review): `dict` is presumably a module-level id -> name mapping
        # that shadows the builtin; confirm and consider renaming it.
        classes_text.append(dict[class_id].encode('utf-8'))

    tf_example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(filename),
        'image/source_id': dataset_util.bytes_feature(filename),
        'image/encoded': dataset_util.bytes_feature(encoded_image_data),
        'image/format': dataset_util.bytes_feature(image_format),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
        'image/object/class/text':
            dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label':
            dataset_util.int64_list_feature(classes),
        'image/key/sha256': dataset_util.bytes_feature(key),
    }))
    return tf_example
def dict_to_tf_example(data,
                       dataset_directory,
                       label_map_dict,
                       ignore_difficult_instances=False,
                       image_subdirectory='JPEGImages'):
    """Build a tf.Example proto from a parsed PASCAL-style annotation dict.

    Bounding box coordinates are divided by the image width/height taken
    from data['size'], so the stored boxes are normalized.

    Args:
      data: dict holding PASCAL XML fields for a single image (obtained by
        running dataset_util.recursive_parse_xml_to_dict).
      dataset_directory: Path to the root directory holding the dataset.
      label_map_dict: A map from string label names to integer ids.
      ignore_difficult_instances: Whether to skip objects flagged as
        difficult (default: False).
      image_subdirectory: Subdirectory, below data['folder'], that holds the
        image files.

    Returns:
      example: The converted tf.Example.

    Raises:
      ValueError: if the image pointed to by data['filename'] is not a JPEG.
    """
    folder = data['folder']
    raw_filename = data['filename']
    # The filename may arrive as bytes; decode it for path building only.
    if hasattr(raw_filename, 'decode'):
        path_filename = raw_filename.decode()
    else:
        path_filename = raw_filename
    relative_path = os.path.join(folder, image_subdirectory, path_filename)
    full_path = os.path.join(dataset_directory, relative_path)

    with tf.gfile.GFile(full_path, 'rb') as fid:
        encoded_jpg = fid.read()
    decoded_image = PIL.Image.open(io.BytesIO(encoded_jpg))
    if decoded_image.format != 'JPEG':
        raise ValueError('Image format not JPEG')
    key = hashlib.sha256(encoded_jpg).hexdigest()

    width = int(data['size']['width'])
    height = int(data['size']['height'])

    xmin, ymin, xmax, ymax = [], [], [], []
    classes, classes_text = [], []
    truncated, poses, difficult_obj = [], [], []

    for obj in data.get('object', ()):
        is_difficult = bool(int(obj['difficult']))
        if is_difficult and ignore_difficult_instances:
            continue

        difficult_obj.append(int(is_difficult))

        xmin.append(float(obj['bndbox']['xmin']) / width)
        ymin.append(float(obj['bndbox']['ymin']) / height)
        xmax.append(float(obj['bndbox']['xmax']) / width)
        ymax.append(float(obj['bndbox']['ymax']) / height)

        label_name = obj['name']
        classes_text.append(label_name.encode('utf8'))
        classes.append(label_map_dict[label_name])
        truncated.append(int(obj['truncated']))
        poses.append(obj['pose'].encode('utf8'))

    feature = {
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(raw_filename),
        'image/source_id': dataset_util.bytes_feature(raw_filename),
        'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')),
        'image/encoded': dataset_util.bytes_feature(encoded_jpg),
        'image/format': dataset_util.bytes_feature('jpeg'.encode('utf8')),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmin),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmax),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymin),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymax),
        'image/object/class/text':
            dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label':
            dataset_util.int64_list_feature(classes),
        'image/object/difficult':
            dataset_util.int64_list_feature(difficult_obj),
        'image/object/truncated':
            dataset_util.int64_list_feature(truncated),
        'image/object/view': dataset_util.bytes_list_feature(poses),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))
def create_tf_example(
    image,
    image_dir,
    bbox_annotations=None,
    category_index=None,
    caption_annotations=None,
    include_masks=False,
    num_attributes=None,
):
    """Converts image and annotations to a tf.Example proto.

    The image is loaded, converted to RGB, resized to the width/height stated
    in the `image` dict and re-encoded as JPEG before being stored.

    Args:
      image: dict with keys: [u'license', u'file_name', u'coco_url',
        u'height', u'width', u'date_captured', u'flickr_url', u'id']
      image_dir: directory containing the image files.
      bbox_annotations: list of dicts with keys: [u'segmentation', u'area',
        u'iscrowd', u'image_id', u'bbox', u'category_id', u'id'] (plus
        u'attribute_ids' when `num_attributes` is set). Bounding box
        coordinates in the official COCO dataset are given as
        [x, y, width, height] tuples using absolute coordinates where x, y
        represent the top-left (0-indexed) corner. This function converts
        them to the format expected by the Tensorflow Object Detection API
        (which is [ymin, xmin, ymax, xmax] with coordinates normalized
        relative to image size).
      category_index: a dict containing COCO category information keyed by
        the 'id' field of each category. See the
        label_map_util.create_category_index function.
      caption_annotations: list of dicts, each providing a u'caption' string.
      include_masks: Whether to include instance segmentation masks (PNG
        encoded) in the result. default: False.
      num_attributes: Width of the per-object multi-hot attribute vector.
        When truthy, each kept annotation's u'attribute_ids' are set in a
        boolean row and all rows are stored as raw bytes under
        "image/object/attributes/labels". When None or 0 that feature is
        omitted.

    Returns:
      key: hex SHA-256 digest of the encoded JPEG bytes.
      example: The converted tf.Example.
      num_annotations_skipped: Number of (invalid) annotations that were
        ignored.

    Raises:
      ValueError: if include_masks is set and an annotation's segmentation is
        in an unsupported format.
    """
    image_height = image["height"]
    image_width = image["width"]
    filename = image["file_name"]
    image_id = image["id"]

    full_path = os.path.join(image_dir, filename)
    # Context manager releases the underlying file once the pixels are loaded.
    with PIL.Image.open(full_path) as source_image:
        rgb_image = source_image.convert("RGB")
    print(
        f"Resize image {filename}: ({rgb_image.width}, {rgb_image.height}) -> ({image_width}, {image_height})"
    )
    resized_image = rgb_image.resize((image_width, image_height))

    # Encode in memory instead of round-tripping through a named temp file
    # (re-opening an open NamedTemporaryFile by name is not portable).
    jpeg_buffer = io.BytesIO()
    resized_image.save(jpeg_buffer, format="JPEG")
    encoded_jpg = jpeg_buffer.getvalue()

    key = hashlib.sha256(encoded_jpg).hexdigest()

    # Hashing can't work.
    # c.f., https://github.com/tensorflow/tpu/issues/917
    # image_id = hash_image_id(image_id)

    feature_dict = {
        "image/height":
            dataset_util.int64_feature(image_height),
        "image/width":
            dataset_util.int64_feature(image_width),
        "image/filename":
            dataset_util.bytes_feature(filename.encode("utf8")),
        # source_id must be integer string
        # c.f., https://github.com/tensorflow/tpu/issues/516
        # c.f., process_source_id in
        # tf_tpu_models/official/deteciton/utils/dataloader_utils.py
        "image/source_id":
            dataset_util.bytes_feature(str(image_id).encode("utf-8")),
        "image/key/sha256":
            dataset_util.bytes_feature(key.encode("utf8")),
        "image/encoded":
            dataset_util.bytes_feature(encoded_jpg),
        "image/format":
            dataset_util.bytes_feature("jpeg".encode("utf8")),
    }

    num_annotations_skipped = 0
    if bbox_annotations:
        xmin = []
        xmax = []
        ymin = []
        ymax = []
        is_crowd = []
        category_names = []
        category_ids = []
        area = []
        encoded_mask_png = []
        # Allocated for the worst case (nothing skipped); trimmed to the
        # number of kept annotations before serialization. The builtin
        # `bool` dtype replaces the `np.bool` alias removed from NumPy.
        attributes_multi_hot = (np.zeros(
            (len(bbox_annotations), num_attributes), dtype=bool)
                                if num_attributes else None)

        for object_annotations in bbox_annotations:
            (x, y, width, height) = tuple(object_annotations["bbox"])
            if width <= 0 or height <= 0:
                num_annotations_skipped += 1
                continue
            if x + width > image_width or y + height > image_height:
                num_annotations_skipped += 1
                continue

            xmin.append(float(x) / image_width)
            xmax.append(float(x + width) / image_width)
            ymin.append(float(y) / image_height)
            ymax.append(float(y + height) / image_height)
            is_crowd.append(object_annotations["iscrowd"])
            category_id = int(object_annotations["category_id"])
            category_ids.append(category_id)
            category_names.append(
                category_index[category_id]["name"].encode("utf8"))
            area.append(object_annotations["area"])

            if include_masks:
                binary_mask = _segmentation_to_binary_mask(
                    object_annotations["segmentation"],
                    object_annotations["iscrowd"],
                    image_height,
                    image_width,
                )
                pil_image = PIL.Image.fromarray(binary_mask)
                output_io = io.BytesIO()
                pil_image.save(output_io, format="PNG")
                encoded_mask_png.append(output_io.getvalue())

            if num_attributes:
                # Index by position among the KEPT annotations so attribute
                # rows stay aligned with the box/class lists when some
                # annotations are skipped.
                kept_row = len(xmin) - 1
                attributes_multi_hot[
                    kept_row, object_annotations["attribute_ids"]] = 1

        feature_dict.update({
            "image/object/bbox/xmin":
                dataset_util.float_list_feature(xmin),
            "image/object/bbox/xmax":
                dataset_util.float_list_feature(xmax),
            "image/object/bbox/ymin":
                dataset_util.float_list_feature(ymin),
            "image/object/bbox/ymax":
                dataset_util.float_list_feature(ymax),
            "image/object/class/text":
                dataset_util.bytes_list_feature(category_names),
            "image/object/class/label":
                dataset_util.int64_list_feature(category_ids),
            "image/object/is_crowd":
                dataset_util.int64_list_feature(is_crowd),
            "image/object/area":
                dataset_util.float_list_feature(area),
        })
        if num_attributes:
            # Only emitted when attributes were requested; without this guard
            # the default num_attributes=None would call .tobytes() on None.
            feature_dict["image/object/attributes/labels"] = (
                dataset_util.bytes_feature(
                    attributes_multi_hot[:len(xmin)].tobytes()))
        if include_masks:
            feature_dict["image/object/mask"] = (
                dataset_util.bytes_list_feature(encoded_mask_png))

    if caption_annotations:
        captions = [
            caption_annotation["caption"].encode("utf8")
            for caption_annotation in caption_annotations
        ]
        feature_dict.update(
            {"image/caption": dataset_util.bytes_list_feature(captions)})

    example = tf.train.Example(features=tf.train.Features(
        feature=feature_dict))
    return key, example, num_annotations_skipped


def _segmentation_to_binary_mask(segmentation, is_crowd, image_height,
                                 image_width):
    """Decodes one annotation's segmentation into a binary mask array.

    Args:
      segmentation: either a flat list starting with an int (handed to
        _get_binary_mask), a list of lists (converted with mask.frPyObjects),
        or a dict with "counts" and "size" keys (decoded with mask.decode).
      is_crowd: the annotation's "iscrowd" flag.
      image_height: mask height passed to the decoders.
      image_width: mask width passed to the decoders.

    Returns:
      The decoded mask array.

    Raises:
      ValueError: if the segmentation matches none of the supported formats.
    """
    if isinstance(segmentation, list) and segmentation:
        first_item = segmentation[0]
        if isinstance(first_item, int):
            return _get_binary_mask(segmentation, image_height, image_width)
        if not isinstance(first_item, list):
            # Previously this case fell through silently, leaving the mask
            # unbound or reusing the previous annotation's mask.
            raise ValueError(
                f"not supported format annotation: {segmentation}")
        run_len_encoding = mask.frPyObjects(segmentation, image_height,
                                            image_width)
        binary_mask = mask.decode(run_len_encoding)
    elif (isinstance(segmentation, dict) and "counts" in segmentation
          and "size" in segmentation):
        binary_mask = mask.decode(segmentation)
    else:
        raise ValueError(f"not supported format annotation: {segmentation}")

    # A non-crowd object decoded into several mask planes is merged into one.
    if not is_crowd and len(binary_mask.shape) > 2:
        binary_mask = np.amax(binary_mask, axis=2)
    return binary_mask
def prepare_example(image_path, annotations, label_map_dict):
    """Builds a tf.Example proto from an image file and its annotations.

    Args:
      image_path: The complete path to the image.
      annotations: A dictionary of per-object annotation fields for the
        image (2D box edges, 'type', 'truncated', 'alpha' and the 3D box
        fields read below).
      label_map_dict: A map from string label names to integer ids.

    Returns:
      example: The converted tf.Example.
    """
    with tf.gfile.GFile(image_path, 'rb') as fid:
        encoded_png = fid.read()
    pixels = np.asarray(pil.open(io.BytesIO(encoded_png)))

    key = hashlib.sha256(encoded_png).hexdigest()

    width = int(pixels.shape[1])
    height = int(pixels.shape[0])
    width_f = float(width)
    height_f = float(height)

    # Normalize the 2D box edges by the image size.
    xmin_norm = annotations['2d_bbox_left'] / width_f
    ymin_norm = annotations['2d_bbox_top'] / height_f
    xmax_norm = annotations['2d_bbox_right'] / width_f
    ymax_norm = annotations['2d_bbox_bottom'] / height_f

    # Every object is written as "not difficult".
    difficult_obj = [0] * len(xmin_norm)

    encoded_path = image_path.encode('utf8')
    feature = {
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(encoded_path),
        'image/source_id': dataset_util.bytes_feature(encoded_path),
        'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')),
        'image/encoded': dataset_util.bytes_feature(encoded_png),
        'image/format': dataset_util.bytes_feature('png'.encode('utf8')),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmin_norm),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmax_norm),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymin_norm),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymax_norm),
        'image/object/class/text': dataset_util.bytes_list_feature(
            [label.encode('utf8') for label in annotations['type']]),
        'image/object/class/label': dataset_util.int64_list_feature(
            [label_map_dict[label] for label in annotations['type']]),
        'image/object/difficult':
            dataset_util.int64_list_feature(difficult_obj),
    }

    # Remaining per-object fields are copied through as float lists.
    float_fields = (
        ('image/object/truncated', 'truncated'),
        ('image/object/alpha', 'alpha'),
        ('image/object/3d_bbox/height', '3d_bbox_height'),
        ('image/object/3d_bbox/width', '3d_bbox_width'),
        ('image/object/3d_bbox/length', '3d_bbox_length'),
        ('image/object/3d_bbox/x', '3d_bbox_x'),
        ('image/object/3d_bbox/y', '3d_bbox_y'),
        ('image/object/3d_bbox/z', '3d_bbox_z'),
        ('image/object/3d_bbox/rot_y', '3d_bbox_rot_y'),
    )
    for feature_name, annotation_key in float_fields:
        feature[feature_name] = dataset_util.float_list_feature(
            annotations[annotation_key])

    return tf.train.Example(features=tf.train.Features(feature=feature))
def tf_example_from_annotations_data_frame(annotations_data_frame, label_map,
                                           encoded_image):
    """Populates a TF Example message with image annotations from a data frame.

    Rows whose LabelName is not in label_map are dropped. Of the remaining
    rows, those with a non-null YMin are written as box annotations and those
    with a null YMin are treated as image-level labels.

    Args:
      annotations_data_frame: Data frame containing the annotations for a
        single image.
      label_map: String to integer label map.
      encoded_image: The encoded image, as bytes or as a string that is
        encoded to bytes before being stored.

    Returns:
      The populated TF Example. An Example is always returned; when no label
      of the image is present in label_map its annotation lists are empty.
    """

    def _as_bytes(value):
        # Bytes features only accept bytes; encode text, pass bytes through.
        return value if isinstance(value, bytes) else value.encode()

    filtered_data_frame = annotations_data_frame[
        annotations_data_frame.LabelName.isin(label_map)]
    filtered_data_frame_boxes = filtered_data_frame[
        ~filtered_data_frame.YMin.isnull()]
    filtered_data_frame_labels = filtered_data_frame[
        filtered_data_frame.YMin.isnull()]
    image_id = annotations_data_frame.ImageID.iloc[0]

    feature_map = {
        standard_fields.TfExampleFields.object_bbox_ymin:
            dataset_util.float_list_feature(
                filtered_data_frame_boxes.YMin.values),
        standard_fields.TfExampleFields.object_bbox_xmin:
            dataset_util.float_list_feature(
                filtered_data_frame_boxes.XMin.values),
        standard_fields.TfExampleFields.object_bbox_ymax:
            dataset_util.float_list_feature(
                filtered_data_frame_boxes.YMax.values),
        standard_fields.TfExampleFields.object_bbox_xmax:
            dataset_util.float_list_feature(
                filtered_data_frame_boxes.XMax.values),
        standard_fields.TfExampleFields.object_class_text:
            dataset_util.bytes_list_feature([
                _as_bytes(label_text)
                for label_text in filtered_data_frame_boxes.LabelName.values
            ]),
        standard_fields.TfExampleFields.object_class_label:
            dataset_util.int64_list_feature(
                filtered_data_frame_boxes.LabelName.map(
                    lambda x: label_map[x]).values),
        standard_fields.TfExampleFields.filename:
            dataset_util.bytes_feature('{}.jpg'.format(image_id).encode()),
        standard_fields.TfExampleFields.source_id:
            dataset_util.bytes_feature(image_id.encode()),
        standard_fields.TfExampleFields.image_encoded:
            dataset_util.bytes_feature(_as_bytes(encoded_image)),
    }

    # Optional per-box flags are only written when the column exists.
    if 'IsGroupOf' in filtered_data_frame.columns:
        feature_map[standard_fields.TfExampleFields.object_group_of] = (
            dataset_util.int64_list_feature(
                filtered_data_frame_boxes.IsGroupOf.values.astype(int)))
    if 'IsOccluded' in filtered_data_frame.columns:
        feature_map[standard_fields.TfExampleFields.object_occluded] = (
            dataset_util.int64_list_feature(
                filtered_data_frame_boxes.IsOccluded.values.astype(int)))
    if 'IsTruncated' in filtered_data_frame.columns:
        feature_map[standard_fields.TfExampleFields.object_truncated] = (
            dataset_util.int64_list_feature(
                filtered_data_frame_boxes.IsTruncated.values.astype(int)))
    if 'IsDepiction' in filtered_data_frame.columns:
        feature_map[standard_fields.TfExampleFields.object_depiction] = (
            dataset_util.int64_list_feature(
                filtered_data_frame_boxes.IsDepiction.values.astype(int)))

    if 'ConfidenceImageLabel' in filtered_data_frame_labels.columns:
        feature_map[standard_fields.TfExampleFields.image_class_label] = (
            dataset_util.int64_list_feature(
                filtered_data_frame_labels.LabelName.map(
                    lambda x: label_map[x]).values))
        # No trailing comma here: it would store a 1-tuple instead of a
        # Feature and make tf.train.Features reject the map.
        feature_map[standard_fields.TfExampleFields.image_class_text] = (
            dataset_util.bytes_list_feature([
                _as_bytes(label_text)
                for label_text in filtered_data_frame_labels.LabelName.values
            ]))

    return tf.train.Example(features=tf.train.Features(feature=feature_map))