def preprocess(self, inputs, audio_flag=False):
  """Feature-extractor specific preprocessing.

  See base class.

  Args:
    inputs: a [batch, height_in, width_in, channels] float tensor representing
      a batch of images with values between 0 and 255.0.
    audio_flag: whether `inputs` holds audio features, in which case the
      per-image resize is skipped.

  Returns:
    preprocessed_inputs: a [batch, height_out, width_out, channels] float
      tensor representing a batch of images.

  Raises:
    ValueError: if inputs tensor does not have type tf.float32
  """
  if inputs.dtype is not tf.float32:
    raise ValueError('`preprocess` expects a tf.float32 tensor')
  with tf.name_scope('Preprocessor'):
    # TODO: revisit whether to always use batch size as the number of
    # parallel iterations vs allow for dynamic batching.
    if not audio_flag:
      resized_inputs = tf.map_fn(
          self._image_resizer_fn, elems=inputs, dtype=tf.float32)
      return self._feature_extractor.preprocess(
          resized_inputs, normalized=False)
    print('audio shape in preprocess', inputs.get_shape())
    # No resize of the audio content itself.
    # TODO: the hard-coded 20x9 resize only converts the dynamic tensor shape
    # to a static one; this needs to be made configurable later.
    resized_inputs, _ = preprocessor.resize_image(
        inputs, new_height=20, new_width=9)
    return self._feature_extractor.preprocess(
        resized_inputs, normalized=False)
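# Minimal, self-contained sketch (not part of the original module) of the
# image branch above: resizing a variable-sized batch per-image with
# tf.map_fn, assuming TF 1.x. `_resize_fn` and the 300x300 target are
# illustrative stand-ins for the injected self._image_resizer_fn.
import tensorflow as tf

def _resize_fn(image):
  # Fixed-shape bilinear resize of a single [H, W, C] image.
  return tf.image.resize_images(image, [300, 300])

images = tf.placeholder(tf.float32, [None, None, None, 3])
# map_fn applies the resizer to each image in the batch independently.
resized = tf.map_fn(_resize_fn, elems=images, dtype=tf.float32)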
def build(input_reader_config,
          model_config,
          lstm_config,
          unroll_length,
          data_augmentation_options=None,
          batch_size=1):
  """Builds a tensor dictionary based on the InputReader config.

  Args:
    input_reader_config: An input_reader_builder.InputReader object.
    model_config: A model.proto object containing the config for the desired
      DetectionModel.
    lstm_config: LSTM specific configs.
    unroll_length: Unrolled length for LSTM training.
    data_augmentation_options: A list of tuples, where each tuple contains a
      data augmentation function and a dictionary containing arguments and
      their values (see preprocessor.py).
    batch_size: Batch size for queue outputs.

  Returns:
    A dictionary of tensors based on items in the input_reader_config.

  Raises:
    ValueError: On invalid input reader proto.
    ValueError: If no input paths are specified.
  """
  if not isinstance(input_reader_config, input_reader_pb2.InputReader):
    raise ValueError('input_reader_config not of type '
                     'input_reader_pb2.InputReader.')
  external_reader_config = input_reader_config.external_input_reader
  google_input_reader_config = external_reader_config.Extensions[
      input_reader_google_pb2.GoogleInputReader.google_input_reader]
  input_reader_type = google_input_reader_config.WhichOneof('input_reader')

  if input_reader_type == 'tf_record_video_input_reader':
    config = google_input_reader_config.tf_record_video_input_reader
    reader_type_class = tf.TFRecordReader
  else:
    raise ValueError(
        'Unsupported reader in input_reader_config: %s' % input_reader_type)

  if not config.input_path:
    raise ValueError('At least one input path must be specified in '
                     '`input_reader_config`.')
  key, value = parallel_reader.parallel_read(
      config.input_path[:],  # Convert `RepeatedScalarContainer` to list.
      reader_class=reader_type_class,
      num_epochs=(input_reader_config.num_epochs
                  if input_reader_config.num_epochs else None),
      num_readers=input_reader_config.num_readers,
      shuffle=input_reader_config.shuffle,
      dtypes=[tf.string, tf.string],
      capacity=input_reader_config.queue_capacity,
      min_after_dequeue=input_reader_config.min_after_dequeue)

  # TODO(yinxiao): Add loading instance mask option.
  decoder = tf_sequence_example_decoder.TFSequenceExampleDecoder()
  keys_to_decode = [
      fields.InputDataFields.image, fields.InputDataFields.groundtruth_boxes,
      fields.InputDataFields.groundtruth_classes
  ]
  tensor_dict = decoder.decode(value, items=keys_to_decode)

  tensor_dict['image'].set_shape([None, None, None, 3])
  tensor_dict['groundtruth_boxes'].set_shape([None, None, 4])

  height = model_config.ssd.image_resizer.fixed_shape_resizer.height
  width = model_config.ssd.image_resizer.fixed_shape_resizer.width

  # If data augmentation is specified in the config file, the preprocessor
  # will be called here to augment the data as specified. Most common
  # augmentations include horizontal flip and cropping.
  if data_augmentation_options:
    images_pre = tf.split(tensor_dict['image'], config.video_length, axis=0)
    bboxes_pre = tf.split(
        tensor_dict['groundtruth_boxes'], config.video_length, axis=0)
    labels_pre = tf.split(
        tensor_dict['groundtruth_classes'], config.video_length, axis=0)
    images_proc, bboxes_proc, labels_proc = [], [], []
    cache = preprocessor_cache.PreprocessorCache()

    for i, _ in enumerate(images_pre):
      image_dict = {
          fields.InputDataFields.image:
              images_pre[i],
          fields.InputDataFields.groundtruth_boxes:
              tf.squeeze(bboxes_pre[i], axis=0),
          fields.InputDataFields.groundtruth_classes:
              tf.squeeze(labels_pre[i], axis=0),
      }
      image_dict = preprocessor.preprocess(
          image_dict,
          data_augmentation_options,
          func_arg_map=preprocessor.get_default_func_arg_map(),
          preprocess_vars_cache=cache)
      # Pads detection count to _PADDING_SIZE.
      image_dict[fields.InputDataFields.groundtruth_boxes] = tf.pad(
          image_dict[fields.InputDataFields.groundtruth_boxes],
          [[0, _PADDING_SIZE], [0, 0]])
      image_dict[fields.InputDataFields.groundtruth_boxes] = tf.slice(
          image_dict[fields.InputDataFields.groundtruth_boxes], [0, 0],
          [_PADDING_SIZE, -1])
      image_dict[fields.InputDataFields.groundtruth_classes] = tf.pad(
          image_dict[fields.InputDataFields.groundtruth_classes],
          [[0, _PADDING_SIZE]])
      image_dict[fields.InputDataFields.groundtruth_classes] = tf.slice(
          image_dict[fields.InputDataFields.groundtruth_classes], [0],
          [_PADDING_SIZE])
      images_proc.append(image_dict[fields.InputDataFields.image])
      bboxes_proc.append(image_dict[fields.InputDataFields.groundtruth_boxes])
      labels_proc.append(
          image_dict[fields.InputDataFields.groundtruth_classes])
    tensor_dict['image'] = tf.concat(images_proc, axis=0)
    tensor_dict['groundtruth_boxes'] = tf.stack(bboxes_proc, axis=0)
    tensor_dict['groundtruth_classes'] = tf.stack(labels_proc, axis=0)
  else:
    # Pads detection count to _PADDING_SIZE per frame.
    tensor_dict['groundtruth_boxes'] = tf.pad(
        tensor_dict['groundtruth_boxes'],
        [[0, 0], [0, _PADDING_SIZE], [0, 0]])
    tensor_dict['groundtruth_boxes'] = tf.slice(
        tensor_dict['groundtruth_boxes'], [0, 0, 0], [-1, _PADDING_SIZE, -1])
    tensor_dict['groundtruth_classes'] = tf.pad(
        tensor_dict['groundtruth_classes'], [[0, 0], [0, _PADDING_SIZE]])
    tensor_dict['groundtruth_classes'] = tf.slice(
        tensor_dict['groundtruth_classes'], [0, 0], [-1, _PADDING_SIZE])

  tensor_dict['image'], _ = preprocessor.resize_image(
      tensor_dict['image'], new_height=height, new_width=width)

  # Integer division so the step count and the LSTM state shapes stay
  # integers under Python 3.
  num_steps = config.video_length // unroll_length
  init_states = {
      'lstm_state_c':
          tf.zeros([height // 32, width // 32, lstm_config.lstm_state_depth]),
      'lstm_state_h':
          tf.zeros([height // 32, width // 32, lstm_config.lstm_state_depth]),
      'lstm_state_step':
          tf.constant(num_steps, shape=[]),
  }
  batch = sqss.batch_sequences_with_states(
      input_key=key,
      input_sequences=tensor_dict,
      input_context={},
      input_length=None,
      initial_states=init_states,
      num_unroll=unroll_length,
      batch_size=batch_size,
      num_threads=batch_size,
      make_keys_unique=True,
      capacity=batch_size * batch_size)
  return _build_training_batch_dict(batch, unroll_length, batch_size)
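# Hedged sketch (not from the original module) of the pad-then-slice idiom
# used in build() to force a fixed per-frame detection count: short box lists
# are zero-padded and long ones truncated, so downstream shapes are static.
# PADDING_SIZE is an assumed stand-in for the module-level _PADDING_SIZE.
import tensorflow as tf

PADDING_SIZE = 30  # assumed value, for illustration only

boxes = tf.placeholder(tf.float32, [None, 4])  # variable detection count
padded = tf.pad(boxes, [[0, PADDING_SIZE], [0, 0]])  # append zero rows
fixed = tf.slice(padded, [0, 0], [PADDING_SIZE, -1])  # keep exactly PADDING_SIZE rows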
def resize_detection_fixed_size(tensors, input_size, for_testing=False):
  tensors_out = tensors.copy()
  # Ignore regions are currently not supported in this resize mode.
  if Constants.IGNORE_REGIONS in tensors_out:
    del tensors_out[Constants.IGNORE_REGIONS]
  img = tensors[Constants.UNNORMALIZED_IMG]
  original_img = img
  bboxes = tensors[Constants.BBOXES]
  # Remove the padding.
  n_real_detections = tf.reduce_sum(
      tf.cast(tensors[Constants.IDS] > 0, tf.int32))
  bboxes = bboxes[:n_real_detections]
  classes = tensors[Constants.CLASSES][:n_real_detections]
  # Permute (y1, y2, x1, x2) -> (y1, x1, y2, x2).
  bboxes = tf.stack(
      [bboxes[..., 0], bboxes[..., 2], bboxes[..., 1], bboxes[..., 3]],
      axis=-1)
  # Normalize bboxes to [0, 1].
  height = tf.shape(img)[0]
  width = tf.shape(img)[1]
  bboxes = tf.cast(bboxes, tf.float32) / tf.cast(
      tf.stack([height, width, height, width], axis=0), tf.float32)
  import object_detection.core.preprocessor as preproc
  if not for_testing:
    # Crop (SSD style).
    img, bboxes, classes = preproc.ssd_random_crop(img, bboxes, classes)
    # Alternative:
    # img, bboxes, classes = preproc.random_crop_image(img, bboxes, classes)
    # Random horizontal flip augmentation.
    img, bboxes = preproc.random_horizontal_flip(img, bboxes)
  # Resize the image; the boxes do not need resizing since they are in
  # relative coordinates.
  img = preproc.resize_image(
      img, new_height=input_size[0], new_width=input_size[1])
  if for_testing:
    _, bboxes = preproc.scale_boxes_to_pixel_coordinates(original_img, bboxes)
  else:
    _, bboxes = preproc.scale_boxes_to_pixel_coordinates(img, bboxes)
  # Permute back (y1, x1, y2, x2) -> (y1, y2, x1, x2).
  bboxes = tf.stack(
      [bboxes[..., 0], bboxes[..., 2], bboxes[..., 1], bboxes[..., 3]],
      axis=-1)
  # Pad everything that needs padding back to the maximum size.
  padded_size = smart_shape(tensors[Constants.CLASSES])[0]
  n_real_detections_after_crop = smart_shape(bboxes)[0]
  pad_size = padded_size - n_real_detections_after_crop
  paddings_bboxes = [[0, pad_size], [0, 0]]
  bboxes = tf.pad(bboxes, paddings=paddings_bboxes)
  paddings_classes_ids = [[0, pad_size]]
  classes = tf.pad(classes, paddings=paddings_classes_ids)
  ids = tf.pad(tf.range(n_real_detections_after_crop) + 1,
               paddings=paddings_classes_ids)
  if isinstance(padded_size, int):
    bboxes.set_shape((padded_size, 4))
    classes.set_shape((padded_size,))
    ids.set_shape((padded_size,))
  else:
    bboxes.set_shape((None, 4))
  # Note that we do not retain the original ids, but this does not matter
  # since this resize mode is only meant for isolated frames.
  tensors_out[Constants.UNNORMALIZED_IMG] = img
  tensors_out[Constants.BBOXES] = bboxes
  tensors_out[Constants.CLASSES] = classes
  tensors_out[Constants.IDS] = ids
  tensors_out[Constants.RESIZED_SIZES] = tf.shape(img)[:2]
  if for_testing:
    tensors_out[Constants.ORIGINAL_SIZES] = tf.shape(original_img)[:2]
  return tensors_out
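# Hedged sketch (not from the original module) of the coordinate handling in
# resize_detection_fixed_size: permute stored (y1, y2, x1, x2) boxes into the
# (y1, x1, y2, x2) order that object_detection's preprocessor expects, then
# normalize them to relative [0, 1] coordinates by the image size.
import tensorflow as tf

boxes = tf.placeholder(tf.float32, [None, 4])  # (y1, y2, x1, x2)
img = tf.placeholder(tf.float32, [None, None, 3])

boxes = tf.stack(
    [boxes[..., 0], boxes[..., 2], boxes[..., 1], boxes[..., 3]], axis=-1)
h = tf.cast(tf.shape(img)[0], tf.float32)
w = tf.cast(tf.shape(img)[1], tf.float32)
normalized = boxes / tf.stack([h, w, h, w], axis=0)  # relative coordinates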