def test_sample_sequence(self):
  """Checks `sample_sequence` in non-random and random modes on 100 steps."""
  frames = tf.range(100)

  # Build all three samples first: two non-random ones (stride 1 and 2) and
  # one random one (stride 1), each 10 steps long.
  fixed_stride_1 = preprocess_ops_3d.sample_sequence(frames, 10, False, 1)
  fixed_stride_2 = preprocess_ops_3d.sample_sequence(frames, 10, False, 2)
  random_stride_1 = preprocess_ops_3d.sample_sequence(frames, 10, True, 1)

  # Non-random sampling must yield the window centred in the sequence.
  self.assertAllEqual(fixed_stride_1, range(45, 55))
  self.assertAllEqual(fixed_stride_2, range(40, 60, 2))

  # Random sampling must yield 10 consecutive values starting at an offset
  # that lies inside the sequence.
  start = random_stride_1[0]
  self.assertBetween(start, 0, 99)
  self.assertAllEqual(random_stride_1, range(start, start + 10))
def _parse_eval_data(
    self, decoded_tensors: Dict[str, tf.Tensor]
) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
  """Builds evaluation features and label from the decoded example.

  Args:
    decoded_tensors: Mapping from feature key to decoded tensor. It is indexed
      with `self._image_key`, `self._label_key` and, when audio output is
      enabled, `self._audio_feature`.

  Returns:
    A `(features, label)` tuple. `features` always holds `'image'` and also
    holds `'audio'` when `self._output_audio` is set.
  """
  frames = _process_image(
      image=decoded_tensors[self._image_key],
      is_training=False,
      num_frames=self._num_frames,
      stride=self._stride,
      num_test_clips=self._num_test_clips,
      min_resize=self._min_resize,
      crop_size=self._crop_size,
      num_crops=self._num_crops)
  features = {'image': tf.cast(frames, dtype=self._dtype)}

  label = _process_label(
      decoded_tensors[self._label_key], self._one_hot_label,
      self._num_classes)

  if not self._output_audio:
    return features, label

  # Audio: cast, take a non-random 20-step window, then pin the static shape.
  audio_clip = tf.cast(decoded_tensors[self._audio_feature], dtype=self._dtype)
  audio_clip = preprocess_ops_3d.sample_sequence(
      audio_clip, 20, random=False, stride=1)
  features['audio'] = tf.ensure_shape(audio_clip, [20, 2048])
  return features, label
def _parse_train_data(
    self, decoded_tensors: Dict[str, tf.Tensor]
) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
  """Builds training features and label from the decoded example.

  Args:
    decoded_tensors: Mapping from feature key to decoded tensor. It is indexed
      with `self._image_key`, `self._label_key` and, when audio output is
      enabled, `self._audio_feature`.

  Returns:
    A `(features, label)` tuple. `features` always holds `'image'` and also
    holds `'audio'` when `self._output_audio` is set.
  """
  # Video frames: sampling and augmentation are delegated to `process_image`
  # in training mode, then the result is cast to the configured dtype.
  frames = process_image(
      image=decoded_tensors[self._image_key],
      is_training=True,
      num_frames=self._num_frames,
      stride=self._stride,
      random_stride_range=self._random_stride_range,
      num_test_clips=self._num_test_clips,
      min_resize=self._min_resize,
      crop_size=self._crop_size,
      num_channels=self._num_channels,
      min_aspect_ratio=self._min_aspect_ratio,
      max_aspect_ratio=self._max_aspect_ratio,
      min_area_ratio=self._min_area_ratio,
      max_area_ratio=self._max_area_ratio,
      augmenter=self._augmenter,
      zero_centering_image=self._zero_centering_image)
  features = {'image': tf.cast(frames, dtype=self._dtype)}

  label = process_label(
      decoded_tensors[self._label_key], self._one_hot_label,
      self._num_classes, self._label_dtype)

  if not self._output_audio:
    return features, label

  audio_clip = tf.cast(decoded_tensors[self._audio_feature], dtype=self._dtype)
  # TODO(yeqing): synchronize audio/video sampling. Especially randomness.
  audio_clip = preprocess_ops_3d.sample_sequence(
      audio_clip, self._audio_shape[0], random=False, stride=1)
  features['audio'] = tf.ensure_shape(audio_clip, self._audio_shape)
  return features, label
def _process_image(image: tf.Tensor,
                   is_training: bool = True,
                   is_ssl: bool = False,
                   num_frames: int = 32,
                   stride: int = 1,
                   num_test_clips: int = 1,
                   min_resize: int = 256,
                   crop_size: int = 224,
                   num_crops: int = 1,
                   zero_centering_image: bool = False,
                   seed: Optional[int] = None) -> tf.Tensor:
  """Processes a serialized image tensor.

  Args:
    image: Input Tensor of shape [timesteps] and type tf.string of serialized
      frames.
    is_training: Whether or not in training mode. If True, random sample, crop
      and left right flip is used.
    is_ssl: Whether or not in self-supervised pre-training mode. Only has an
      effect when `is_training` is True.
    num_frames: Number of frames per subclip.
    stride: Temporal stride to sample frames.
    num_test_clips: Number of test clips (1 by default). If more than 1, this
      will sample multiple linearly spaced clips within each video at test
      time. If 1, then a single clip in the middle of the video is sampled. The
      clips are aggregated in the batch dimension. Ignored (with a warning)
      when `is_training` is True.
    min_resize: Frames are resized so that min(height, width) is min_resize.
      Only used at evaluation time.
    crop_size: Final size of the frame after cropping the resized frames. Both
      height and width are the same.
    num_crops: Number of crops to perform on the resized frames. Only used at
      evaluation time.
    zero_centering_image: If True, frames are normalized to values in [-1, 1].
      If False, values in [0, 1].
    seed: A deterministic seed to use when sampling. It is forwarded to the
      random clip sampler of the non-SSL training path and to the random
      left/right flips.

  Returns:
    Processed frames. Tensor of shape
    [num_frames * num_test_clips, crop_size, crop_size, 3]. In self-supervised
    training mode the two augmented clips are concatenated along axis 0 and
    the result is clipped to [0, 1].
  """
  # Validate parameters.
  if is_training and num_test_clips != 1:
    logging.warning(
        '`num_test_clips` %d is ignored since `is_training` is `True`.',
        num_test_clips)

  # Temporal sampler.
  if is_training:
    # Sampler for training.
    if is_ssl:
      # Sample two clips from linear decreasing distribution.
      image = video_ssl_preprocess_ops.sample_ssl_sequence(
          image, num_frames, True, stride)
    else:
      # Sample random clip. `seed` is forwarded so that a caller-supplied seed
      # also controls the temporal offset, not only the flips below.
      image = preprocess_ops_3d.sample_sequence(image, num_frames, True,
                                                stride, seed)
  else:
    # Sampler for evaluation.
    if num_test_clips > 1:
      # Sample linspace clips.
      image = preprocess_ops_3d.sample_linspace_sequence(
          image, num_test_clips, num_frames, stride)
    else:
      # Sample middle clip.
      image = preprocess_ops_3d.sample_sequence(image, num_frames, False,
                                                stride)

  # Decode JPEG string to tf.uint8.
  image = preprocess_ops_3d.decode_jpeg(image, 3)

  if is_training:
    # Standard image data augmentation: random resized crop and random flip.
    if is_ssl:
      # The SSL sampler output is split in two halves along axis 0; each half
      # is cropped and flipped on its own.
      image_1, image_2 = tf.split(image, num_or_size_splits=2, axis=0)
      image_1 = preprocess_ops_3d.random_crop_resize(
          image_1, crop_size, crop_size, num_frames, 3, (0.5, 2), (0.3, 1))
      image_1 = preprocess_ops_3d.random_flip_left_right(image_1, seed)
      image_2 = preprocess_ops_3d.random_crop_resize(
          image_2, crop_size, crop_size, num_frames, 3, (0.5, 2), (0.3, 1))
      image_2 = preprocess_ops_3d.random_flip_left_right(image_2, seed)
    else:
      image = preprocess_ops_3d.random_crop_resize(
          image, crop_size, crop_size, num_frames, 3, (0.5, 2), (0.3, 1))
      image = preprocess_ops_3d.random_flip_left_right(image, seed)
  else:
    # Resize images (resize happens only if necessary to save compute).
    image = preprocess_ops_3d.resize_smallest(image, min_resize)
    # Crop the frames; `num_crops` selects how many crops are taken.
    image = preprocess_ops_3d.crop_image(image, crop_size, crop_size, False,
                                         num_crops)

  # Cast the frames in float32, normalizing according to zero_centering_image.
  if is_training and is_ssl:
    image_1 = preprocess_ops_3d.normalize_image(image_1, zero_centering_image)
    image_2 = preprocess_ops_3d.normalize_image(image_2, zero_centering_image)
  else:
    image = preprocess_ops_3d.normalize_image(image, zero_centering_image)

  # Self-supervised pre-training augmentations.
  if is_training and is_ssl:
    # Temporally consistent color jittering.
    image_1 = video_ssl_preprocess_ops.random_color_jitter_3d(image_1)
    image_2 = video_ssl_preprocess_ops.random_color_jitter_3d(image_2)
    # Temporally consistent gaussian blurring. The two clips get different
    # final arguments (1.0 vs 0.1), and only the second clip is solarized.
    image_1 = video_ssl_preprocess_ops.random_blur(image_1, crop_size,
                                                   crop_size, 1.0)
    image_2 = video_ssl_preprocess_ops.random_blur(image_2, crop_size,
                                                   crop_size, 0.1)
    image_2 = video_ssl_preprocess_ops.random_solarization(image_2)
    image = tf.concat([image_1, image_2], axis=0)
    # NOTE(review): clipping to [0, 1] presumably assumes
    # `zero_centering_image=False` on this path -- confirm with callers.
    image = tf.clip_by_value(image, 0., 1.)

  return image
def process_image(image: tf.Tensor,
                  is_training: bool = True,
                  num_frames: int = 32,
                  stride: int = 1,
                  random_stride_range: int = 0,
                  num_test_clips: int = 1,
                  min_resize: int = 256,
                  crop_size: int = 224,
                  num_crops: int = 1,
                  zero_centering_image: bool = False,
                  min_aspect_ratio: float = 0.5,
                  max_aspect_ratio: float = 2,
                  min_area_ratio: float = 0.49,
                  max_area_ratio: float = 1.0,
                  augmenter: Optional[augment.ImageAugment] = None,
                  seed: Optional[int] = None,
                  num_channels: int = 3) -> tf.Tensor:
  """Processes a serialized image tensor.

  Args:
    image: Input Tensor of shape [timesteps] and type tf.string of serialized
      frames. Tensors of any other dtype skip the JPEG decoding step.
    is_training: Whether or not in training mode. If True, random sample, crop
      and left right flip is used.
    num_frames: Number of frames per subclip.
    stride: Temporal stride to sample frames.
    random_stride_range: An int indicating the min and max bounds to uniformly
      sample different strides from the video. E.g., a value of 1 with stride=2
      will uniformly sample a stride in {1, 2, 3} for each video in a batch.
      The lower bound is clamped to 1. Only used during training, for the
      purposes of frame-rate augmentation. Defaults to 0, which disables
      random sampling.
    num_test_clips: Number of test clips (1 by default). If more than 1, this
      will sample multiple linearly spaced clips within each video at test
      time. If 1, then a single clip in the middle of the video is sampled. The
      clips are aggregated in the batch dimension. Ignored (with a warning)
      when `is_training` is True.
    min_resize: Frames are resized so that min(height, width) is min_resize.
      Only used at evaluation time.
    crop_size: Final size of the frame after cropping the resized frames. Both
      height and width are the same.
    num_crops: Number of crops to perform on the resized frames. Only used at
      evaluation time.
    zero_centering_image: If True, frames are normalized to values in [-1, 1].
      If False, values in [0, 1].
    min_aspect_ratio: The minimum aspect range for cropping.
    max_aspect_ratio: The maximum aspect range for cropping.
    min_area_ratio: The minimum area range for cropping.
    max_area_ratio: The maximum area range for cropping.
    augmenter: Image augmenter to distort each image. Only applied during
      training.
    seed: A deterministic seed to use when sampling. It is forwarded to the
      random stride sampler, the random clip sampler and the random flip.
    num_channels: Number of channels of the decoded frames. Defaults to 3.

  Returns:
    Processed frames. Tensor of shape
      [num_frames * num_test_clips, crop_size, crop_size, num_channels].

  Raises:
    ValueError: If `random_stride_range` is negative.
  """
  # Validate parameters.
  if is_training and num_test_clips != 1:
    logging.warning(
        '`num_test_clips` %d is ignored since `is_training` is `True`.',
        num_test_clips)

  if random_stride_range < 0:
    raise ValueError('Random stride range should be >= 0, got {}'.format(
        random_stride_range))

  # Temporal sampler.
  if is_training:
    if random_stride_range > 0:
      # Uniformly sample different frame-rates. For integer dtypes
      # `tf.random.uniform` excludes `maxval`, hence the `+ 1` so that
      # `stride + random_stride_range` itself can be drawn.
      stride = tf.random.uniform(
          [],
          tf.maximum(stride - random_stride_range, 1),
          stride + random_stride_range + 1,
          dtype=tf.int32,
          seed=seed)

    # Sample random clip.
    image = preprocess_ops_3d.sample_sequence(image, num_frames, True, stride,
                                              seed)
  elif num_test_clips > 1:
    # Sample linspace clips.
    image = preprocess_ops_3d.sample_linspace_sequence(
        image, num_test_clips, num_frames, stride)
  else:
    # Sample middle clip.
    image = preprocess_ops_3d.sample_sequence(image, num_frames, False, stride)

  # Decode JPEG string to tf.uint8.
  if image.dtype == tf.string:
    image = preprocess_ops_3d.decode_jpeg(image, num_channels)

  if is_training:
    # Standard image data augmentation: random resized crop and random flip.
    image = preprocess_ops_3d.random_crop_resize(
        image, crop_size, crop_size, num_frames, num_channels,
        (min_aspect_ratio, max_aspect_ratio),
        (min_area_ratio, max_area_ratio))
    image = preprocess_ops_3d.random_flip_left_right(image, seed)

    if augmenter is not None:
      image = augmenter.distort(image)
  else:
    # Resize images (resize happens only if necessary to save compute).
    image = preprocess_ops_3d.resize_smallest(image, min_resize)
    # Crop of the frames.
    image = preprocess_ops_3d.crop_image(image, crop_size, crop_size, False,
                                         num_crops)

  # Cast the frames in float32, normalizing according to zero_centering_image.
  return preprocess_ops_3d.normalize_image(image, zero_centering_image)