def __init__(self, dataset_id, params):
  """Sets up the test-subset audio loader for `dataset_id`.

  Settings are read from `params.eval.input`. The factory obtained from
  `ds_fctr.get_ds_factory` is configured with `is_training=False`,
  `processing.remove_vision` is registered as a post-processor and, unless
  raw audio is requested, `processing.raw_audio_to_spectrogram` followed by
  `processing.normalize_spectrogram` are registered as pre-processors.

  Args:
    dataset_id: Dataset name; also passed to the parent as the loader name.
    params: Experiment configuration exposing `eval.input`.

  Raises:
    AssertionError: If `params.eval.input.has_data` is falsy.
  """
  eval_cfg = params.eval.input
  assert eval_cfg.has_data, 'Please provide a dataset name.'

  # Generic parameters
  self._num_frames = eval_cfg.num_frames
  self._video_stride = eval_cfg.video_stride
  self._audio_stride = eval_cfg.audio_stride
  self._raw_audio = eval_cfg.raw_audio
  self._stft_length = eval_cfg.stft_length
  self._stft_step = eval_cfg.stft_step
  self._mel_bins = eval_cfg.mel_bins

  # The configured stride is rescaled by the integer DEFAULT_FPS / REF_FPS
  # ratio before being handed to the factory.
  fps_ratio = int(DEFAULT_FPS // REF_FPS)
  self._video_stride = self._video_stride * fps_ratio

  # Number of audio samples spanned by `num_frames` frames at REF_FPS.
  clip_seconds = self._num_frames / REF_FPS
  self._num_samples = int(REF_SR * clip_seconds)
  self._num_windows_test = eval_cfg.num_windows_test

  factory_kwargs = dict(
      is_training=False,
      num_samples=self._num_samples,
      stride=self._video_stride,
      audio_stride=self._audio_stride,
      num_test_clips=self._num_windows_test,
  )

  factory_cls = ds_fctr.get_ds_factory(dataset_name=dataset_id)
  ds_factory = factory_cls(subset='test').configure(**factory_kwargs)
  ds_factory.postprocessor_builder.add_fn(processing.remove_vision)

  # Add audio preprocessing.
  if not self._raw_audio:
    to_spectrogram = functools.partial(
        processing.raw_audio_to_spectrogram,
        sample_rate=REF_SR,
        stft_length=self._stft_length,
        stft_step=self._stft_step,
        mel_bins=self._mel_bins,
        rm_audio=True,
    )
    ds_factory.preprocessor_builder.add_fn(to_spectrogram)
    ds_factory.preprocessor_builder.add_fn(
        processing.normalize_spectrogram,
        feature_name=FeatureNames.AUDIO_MEL,
        fn_name='normalize_mel',
    )

  super(AudioEvalLoader, self).__init__(
      dmvr_factory=ds_factory,
      params=eval_cfg,
      postprocess_fns=None,
      num_epochs=1,
      mode='eval',
      name=dataset_id,
  )
def __init__(self, dataset_id, params):
  """Sets up the pre-training loader over one or more datasets.

  `dataset_id` is split on '+' and one factory is configured per resulting
  name (train subset, `is_training=True`). Audio pre-processing, optional
  video scale jitter and batch-level post-processing are registered from
  the values found in `params.train.input`.

  Args:
    dataset_id: '+'-separated dataset names; must contain 'howto100m'.
    params: Experiment configuration exposing `train.input`.

  Raises:
    AssertionError: If 'howto100m' does not occur in `dataset_id`.
  """
  train_cfg = params.train.input

  # Generic parameters
  self._num_frames = train_cfg.num_frames
  self._frame_size = train_cfg.frame_size
  self._video_stride = train_cfg.video_stride
  self._raw_audio = train_cfg.raw_audio
  self._stft_length = train_cfg.stft_length
  self._stft_step = train_cfg.stft_step
  self._mel_bins = train_cfg.mel_bins
  self._zero_centering_image = train_cfg.zero_centering_image
  self._max_num_words = train_cfg.max_num_words
  self._max_context_sentences = train_cfg.max_context_sentences
  self._space_to_depth = train_cfg.space_to_depth
  self._linearize_vision = train_cfg.linearize_vision

  # Augmentation parameters
  self._min_resize = train_cfg.min_resize
  self._min_area_ratio = train_cfg.min_area_ratio
  self._max_area_ratio = train_cfg.max_area_ratio
  self._min_aspect_ratio = train_cfg.min_aspect_ratio
  self._max_aspect_ratio = train_cfg.max_aspect_ratio
  self._crop_resize_style = train_cfg.crop_resize_style
  self._scale_jitter = train_cfg.scale_jitter
  self._audio_noise = train_cfg.audio_noise
  self._audio_mixup = train_cfg.audio_mixup
  self._mixup_alpha = train_cfg.mixup_alpha
  self._mixup_beta = train_cfg.mixup_beta

  dataset_names = dataset_id.split('+')
  assert 'howto100m' in dataset_id, 'Only HT+ is supported'

  ds_factories = []
  for ds_name in dataset_names:
    is_howto = ds_name == 'howto100m'

    # howto100m is treated as being at REF_FPS, every other dataset at
    # DEFAULT_FPS; the stride is scaled by the integer ratio to REF_FPS.
    source_fps = REF_FPS if is_howto else DEFAULT_FPS
    clip_seconds = self._num_frames / REF_FPS
    self._num_audio_samples = int(REF_SR * clip_seconds)

    factory_kwargs = {
        'is_training': True,
        'num_frames': self._num_frames,
        'stride': self._video_stride * int(source_fps // REF_FPS),
        'crop_size': self._frame_size,
        'min_resize': self._min_resize,
        'zero_centering_image': self._zero_centering_image,
        'min_area_ratio': self._min_area_ratio,
        'max_area_ratio': self._max_area_ratio,
        'min_aspect_ratio': self._min_aspect_ratio,
        'max_aspect_ratio': self._max_aspect_ratio,
        'crop_resize_style': self._crop_resize_style,
        'num_samples': self._num_audio_samples,
    }
    if is_howto:
      factory_kwargs['output_audio'] = True
      factory_kwargs['max_num_words'] = self._max_num_words
      factory_kwargs['max_context_sentences'] = self._max_context_sentences

    # Get the factory.
    factory_cls = ds_fctr.get_ds_factory(dataset_name=ds_name)
    ds_factory = factory_cls(subset='train').configure(**factory_kwargs)

    # Add zeros to audio and/or text if the dataset does not have audio
    # or text already. Also add a boolean to whether audio and/or text
    # are valid and should be used
    fill_missing_modalities = functools.partial(
        processing.add_audio_text_if_empty,
        has_valid_text=is_howto,
        has_valid_audio=True,
        num_audio_samples=self._num_audio_samples,
        max_context_sentences=self._max_context_sentences,
        max_num_words=self._max_num_words,
    )
    ds_factory.sampler_builder.add_fn(fill_missing_modalities)

    # Remove labels from inputs
    if ds_name == 'audioset':
      ds_factory.postprocessor_builder.add_fn(processing.remove_label)

    # Add audio preprocessing.
    if self._audio_noise > 0.:
      # Add gaussian noise
      ds_factory.preprocessor_builder.add_fn(
          functools.partial(processing.add_gaussian, gamma=self._audio_noise),
          feature_name=FeatureNames.AUDIO,
          fn_name='volume_gaussian',
      )

    if not self._raw_audio:
      to_spectrogram = functools.partial(
          processing.raw_audio_to_spectrogram,
          sample_rate=REF_SR,
          stft_length=self._stft_length,
          stft_step=self._stft_step,
          mel_bins=self._mel_bins,
          rm_audio=True,
      )
      ds_factory.preprocessor_builder.add_fn(to_spectrogram)
      ds_factory.preprocessor_builder.add_fn(
          processing.normalize_spectrogram,
          feature_name=FeatureNames.AUDIO_MEL,
          fn_name='normalize_mel',
      )
    else:
      ds_factory.preprocessor_builder.add_fn(
          processing.extend_waveform_dim,
          feature_name=FeatureNames.AUDIO,
          fn_name='extend_waveform',
      )

    # Extra data augmentation on video.
    # scale jitter is applied only when crop+resize is VGG-style
    use_jitter = self._scale_jitter and self._crop_resize_style == 'VGG'
    if use_jitter:
      ds_factory.preprocessor_builder.add_fn(
          functools.partial(processing.scale_jitter_augm, prob=0.8),
          feature_name=FeatureNames.VISION,
          fn_name=f'{FeatureNames.VISION}_jitter_scale',
          add_before_fn_name=f'{FeatureNames.VISION}_resize_smallest',
      )

    ds_factories.append(ds_factory)

  # Add batch-level data-agnostic post-processing functions
  postprocess_fns = []

  if self._space_to_depth:
    postprocess_fns.append(
        functools.partial(
            processing.space_to_depth,
            temporal_block_size=2,
            spatial_block_size=2,
            feature_name=FeatureNames.VISION,
        )
    )

  if self._linearize_vision:
    postprocess_fns.append(
        functools.partial(
            processing.linearize,
            feature_name=FeatureNames.VISION,
        )
    )

  if self._audio_mixup:
    # Mix whichever audio representation the pre-processing left in place.
    if self._raw_audio:
      mixed_feature = FeatureNames.AUDIO
    else:
      mixed_feature = FeatureNames.AUDIO_MEL
    postprocess_fns.append(
        functools.partial(
            processing.batched_mixup,
            feature_name=mixed_feature,
            alpha=self._mixup_alpha,
            beta=self._mixup_beta,
            mixup_labels=False,
        )
    )

  # The parent receives None rather than an empty list.
  if not postprocess_fns:
    postprocess_fns = None

  super(PreTrainLoader, self).__init__(
      dmvr_factory=ds_factories,
      params=train_cfg,
      postprocess_fns=postprocess_fns,
      num_epochs=-1,
      mode='train',
      name=dataset_id,
  )
def __init__(self, dataset_id, params):
  """Sets up the train-subset audio loader used for fine-tuning.

  Settings are read from `params.train.input`. The factory is configured
  with `is_training=True`, `processing.remove_vision` is registered as a
  post-processor, and optional Gaussian noise, spectrogram conversion and
  batched mixup are registered according to the configuration.

  Args:
    dataset_id: Dataset name; also passed to the parent as the loader name.
    params: Experiment configuration exposing `train.input`.

  Raises:
    AssertionError: If `params.train.input.has_data` is falsy.
  """
  train_cfg = params.train.input
  assert train_cfg.has_data, 'Please provide a dataset name.'

  # Generic parameters
  self._num_frames = train_cfg.num_frames
  self._video_stride = train_cfg.video_stride
  self._audio_stride = train_cfg.audio_stride
  self._raw_audio = train_cfg.raw_audio
  self._stft_length = train_cfg.stft_length
  self._stft_step = train_cfg.stft_step
  self._mel_bins = train_cfg.mel_bins

  # Number of audio samples spanned by `num_frames` frames at REF_FPS.
  clip_seconds = self._num_frames / REF_FPS
  self._num_samples = int(REF_SR * clip_seconds)

  # Augmentation parameters
  self._audio_noise = train_cfg.audio_noise
  self._mixup = train_cfg.mixup
  self._mixup_alpha = train_cfg.mixup_alpha

  factory_kwargs = dict(
      is_training=True,
      num_samples=self._num_samples,
      stride=self._video_stride,
      audio_stride=self._audio_stride,
  )

  with tf.name_scope('input_{}_train'.format(dataset_id)):
    # Get the factory.
    factory_cls = ds_fctr.get_ds_factory(dataset_name=dataset_id)
    ds_factory = factory_cls(subset='train').configure(**factory_kwargs)

  ds_factory.postprocessor_builder.add_fn(processing.remove_vision)

  # Add audio preprocessing.
  if self._audio_noise > 0.:
    # Add gaussian noise
    ds_factory.preprocessor_builder.add_fn(
        functools.partial(processing.add_gaussian, gamma=self._audio_noise),
        feature_name=FeatureNames.AUDIO,
        fn_name='volume_gaussian',
    )

  if not self._raw_audio:
    to_spectrogram = functools.partial(
        processing.raw_audio_to_spectrogram,
        sample_rate=REF_SR,
        stft_length=self._stft_length,
        stft_step=self._stft_step,
        mel_bins=self._mel_bins,
        rm_audio=True,
    )
    ds_factory.preprocessor_builder.add_fn(to_spectrogram)
    ds_factory.preprocessor_builder.add_fn(
        processing.normalize_spectrogram,
        feature_name=FeatureNames.AUDIO_MEL,
        fn_name='normalize_mel',
    )

  postprocess_fns = []
  if self._mixup:
    if self._raw_audio:
      mixed_feature = FeatureNames.AUDIO
    else:
      mixed_feature = FeatureNames.AUDIO_MEL
    # `mixup_alpha` is passed for both `alpha` and `beta`.
    postprocess_fns.append(
        functools.partial(
            processing.batched_mixup,
            feature_name=mixed_feature,
            alpha=self._mixup_alpha,
            beta=self._mixup_alpha,
            mixup_labels=True,
        )
    )

  # The parent receives None rather than an empty list.
  if not postprocess_fns:
    postprocess_fns = None

  super(AudioFineTuneLoader, self).__init__(
      dmvr_factory=ds_factory,
      params=train_cfg,
      postprocess_fns=postprocess_fns,
      num_epochs=-1,
      mode='train',
      name=dataset_id,
  )
def __init__(self, dataset_id, params):
  """Sets up the vision-only evaluation loader for `dataset_id`.

  Settings are read from `params.eval.input`. The 'test' subset is used,
  except for dataset names starting with 'kinetics' (case-insensitive),
  which use 'valid'. Frame/stride/multi-crop options are only read and
  forwarded when `dataset_id` is in `VID_CLS_DS`.

  Args:
    dataset_id: Dataset name; also passed to the parent as the loader name.
    params: Experiment configuration exposing `eval.input`.

  Raises:
    AssertionError: If `params.eval.input.has_data` is falsy.
  """
  eval_cfg = params.eval.input
  assert eval_cfg.has_data, 'Please provide a dataset name.'

  is_video_ds = dataset_id in VID_CLS_DS

  # Generic parameters
  self._frame_size = eval_cfg.frame_size
  self._zero_centering_image = eval_cfg.zero_centering_image
  self._space_to_depth = eval_cfg.space_to_depth
  self._linearize_vision = eval_cfg.linearize_vision
  if is_video_ds:
    self._num_frames = eval_cfg.num_frames
    self._video_stride = eval_cfg.video_stride
    self._multi_crop = eval_cfg.multi_crop
    self._num_windows_test = eval_cfg.num_windows_test

  with tf.name_scope('input_{}_test'.format(dataset_id)):
    # `frame_size` is used for both the resize and the crop size.
    factory_kwargs = {
        'is_training': False,
        'min_resize': self._frame_size,
        'crop_size': self._frame_size,
        'zero_centering_image': self._zero_centering_image,
    }
    if is_video_ds:
      factory_kwargs.update({
          'num_frames': self._num_frames,
          'stride': self._video_stride,
          'num_test_clips': self._num_windows_test,
          'multi_crop': self._multi_crop,
      })

    is_kinetics = dataset_id.lower().startswith('kinetics')
    subset_name = 'valid' if is_kinetics else 'test'

    factory_cls = ds_fctr.get_ds_factory(dataset_name=dataset_id)
    ds_factory = factory_cls(subset=subset_name).configure(**factory_kwargs)

  ds_factory.postprocessor_builder.add_fn(processing.remove_audio)

  postprocess_fns = []

  if self._space_to_depth and is_video_ds:
    postprocess_fns.append(
        functools.partial(
            processing.space_to_depth,
            temporal_block_size=2,
            spatial_block_size=2,
            feature_name=FeatureNames.VISION,
        )
    )

  if self._linearize_vision:
    postprocess_fns.append(
        functools.partial(
            processing.linearize,
            feature_name=FeatureNames.VISION,
        )
    )

  # The parent receives None rather than an empty list.
  if not postprocess_fns:
    postprocess_fns = None

  super(VisionEvalLoader, self).__init__(
      dmvr_factory=ds_factory,
      params=eval_cfg,
      postprocess_fns=postprocess_fns,
      num_epochs=1,
      mode='eval',
      name=dataset_id,
  )
def __init__(self, dataset_id, params):
  """Sets up the train-subset vision loader used for fine-tuning.

  Settings are read from `params.train.input`. The factory is configured
  with `is_training=True` and 'Inception' crop/resize style,
  `processing.remove_audio` is registered as a post-processor, and label
  smoothing, mixup, space-to-depth and linearization are added as
  batch-level post-processing when enabled.

  Args:
    dataset_id: Dataset name; also passed to the parent as the loader name.
    params: Experiment configuration exposing `train.input`.

  Raises:
    AssertionError: If `params.train.input.has_data` is falsy, or if
      label smoothing is enabled with a factor greater than 1.0.
  """
  train_cfg = params.train.input
  assert train_cfg.has_data, 'Please provide a dataset name.'

  is_video_ds = dataset_id in VID_CLS_DS

  # Generic parameters
  self._frame_size = train_cfg.frame_size
  self._zero_centering_image = train_cfg.zero_centering_image
  self._space_to_depth = train_cfg.space_to_depth
  self._linearize_vision = train_cfg.linearize_vision
  if is_video_ds:
    self._num_frames = train_cfg.num_frames
    self._video_stride = train_cfg.video_stride

  # Augmentation parameters
  self._mixup = train_cfg.mixup
  self._mixup_alpha = train_cfg.mixup_alpha
  self._min_area_ratio = train_cfg.min_area_ratio
  self._max_area_ratio = train_cfg.max_area_ratio
  self._min_aspect_ratio = train_cfg.min_aspect_ratio
  self._max_aspect_ratio = train_cfg.max_aspect_ratio
  self._color_augment = train_cfg.color_augment
  self._label_smoothing = train_cfg.label_smoothing

  factory_kwargs = dict(
      is_training=True,
      crop_size=self._frame_size,
      crop_resize_style='Inception',
      min_area_ratio=self._min_area_ratio,
      max_area_ratio=self._max_area_ratio,
      min_aspect_ratio=self._min_aspect_ratio,
      max_aspect_ratio=self._max_aspect_ratio,
      zero_centering_image=self._zero_centering_image,
  )
  if is_video_ds:
    factory_kwargs.update(
        num_frames=self._num_frames,
        stride=self._video_stride,
    )

  # Get the factory.
  factory_cls = ds_fctr.get_ds_factory(dataset_name=dataset_id)
  ds_factory = factory_cls(subset='train').configure(**factory_kwargs)
  ds_factory.postprocessor_builder.add_fn(processing.remove_audio)

  # Add batch-level data-agnostic post-processing functions
  postprocess_fns = []

  if self._label_smoothing > 0.0:
    smoothing = self._label_smoothing
    assert smoothing <= 1.0, 'Please provide a valid smoothing factor'
    postprocess_fns.append(
        functools.partial(
            processing.label_smoothing,
            alpha=smoothing,
            multi_label=False,
        )
    )

  if self._mixup:
    # `mixup_alpha` is passed for both `alpha` and `beta`.
    postprocess_fns.append(
        functools.partial(
            processing.batched_mixup,
            feature_name=FeatureNames.VISION,
            alpha=self._mixup_alpha,
            beta=self._mixup_alpha,
            mixup_labels=True,
        )
    )

  if self._space_to_depth and is_video_ds:
    postprocess_fns.append(
        functools.partial(
            processing.space_to_depth,
            temporal_block_size=2,
            spatial_block_size=2,
            feature_name=FeatureNames.VISION,
        )
    )

  if self._linearize_vision:
    postprocess_fns.append(
        functools.partial(
            processing.linearize,
            feature_name=FeatureNames.VISION,
        )
    )

  # The parent receives None rather than an empty list.
  if not postprocess_fns:
    postprocess_fns = None

  super(VisionFineTuneLoader, self).__init__(
      dmvr_factory=ds_factory,
      params=train_cfg,
      postprocess_fns=postprocess_fns,
      num_epochs=-1,
      mode='train',
      name=dataset_id,
  )
def __init__(self,
             dataset_id,
             subset,
             params,
             split=None,
             ):
  """Sets up a loader over one subset/split of `dataset_id`.

  Settings are read from `params.eval.input`. With `subset == 'train'` the
  loader runs in 'train' mode for `num_augmentation` epochs; augmentation
  (`is_training`) stays enabled only when that count differs from 1. Any
  other subset runs in 'test' mode for a single epoch.

  Args:
    dataset_id: Dataset name. Membership in `AUD_CLS_DS`, `TEXT_DS`,
      `VID_CLS_DS` and `CLS_DS` selects which factory options are set.
    subset: Subset name forwarded to the dataset factory; 'train' selects
      the train-mode behaviour described above.
    params: Experiment configuration exposing `eval.input`.
    split: Optional split identifier. Forwarded to the factory only when
      `dataset_id` is in `CLS_DS`; also used in the loader name, where
      None is rendered as '0'.
  """
  # Generic parameters
  input_params = params.eval.input
  self._num_frames = input_params.num_frames
  self._frame_size = input_params.frame_size
  self._video_stride = input_params.video_stride
  self._audio_stride = input_params.audio_stride
  # The resize target is taken from `frame_size`, same as the crop size.
  self._min_resize = input_params.frame_size
  self._raw_audio = input_params.raw_audio
  self._stft_length = input_params.stft_length
  self._stft_step = input_params.stft_step
  self._mel_bins = input_params.mel_bins
  self._multi_crop = input_params.multi_crop
  self._zero_centering_image = input_params.zero_centering_image
  self._max_num_words = input_params.max_num_words
  self._space_to_depth = input_params.space_to_depth

  if subset == 'train':
    self._mode = 'train'
    self._is_training = True
    self._num_epochs = input_params.num_augmentation
    self._color_augment = input_params.color_augment
    self._audio_mixup = input_params.audio_mixup
    self._num_windows_test = 1
    if self._num_epochs == 1:
      # A single pass over the train subset disables all augmentation.
      self._is_training = False
      self._color_augment = False
      self._audio_mixup = False
    else:
      self._min_area_ratio = input_params.min_area_ratio
      self._max_area_ratio = input_params.max_area_ratio
      self._min_aspect_ratio = input_params.min_aspect_ratio
      self._max_aspect_ratio = input_params.max_aspect_ratio
      self._mixup_alpha = input_params.mixup_alpha
      self._mixup_beta = input_params.mixup_beta
  else:
    self._mode = 'test'
    self._is_training = False
    self._num_epochs = 1
    self._num_windows_test = input_params.num_windows_test

  is_audio_ds = dataset_id in AUD_CLS_DS

  params_factory = {
      'is_training': self._is_training,
  }

  ref_fps = REF_FPS  # assume all train_ds were used
  if is_audio_ds:
    # Number of audio samples spanned by `num_frames` frames at `ref_fps`.
    n_audio_secs = self._num_frames / ref_fps
    params_factory['num_samples'] = int(DEFAULT_SR * n_audio_secs)
    params_factory['audio_stride'] = self._audio_stride
  else:
    params_factory.update({
        'num_frames': self._num_frames,
        'stride': self._video_stride,
        'min_resize': self._min_resize,
        'crop_size': self._frame_size,
        'zero_centering_image': self._zero_centering_image,
    })

  if dataset_id in TEXT_DS:
    params_factory['max_num_words'] = self._max_num_words

  if self._mode == 'test':
    params_factory['num_test_clips'] = self._num_windows_test
    if dataset_id in VID_CLS_DS:
      params_factory['multi_crop'] = self._multi_crop

  # add augmentation-related parameters
  if self._is_training and not is_audio_ds:
    params_factory.update({
        'crop_resize_style': 'Inception',
        'min_area_ratio': self._min_area_ratio,
        'max_area_ratio': self._max_area_ratio,
        'min_aspect_ratio': self._min_aspect_ratio,
        'max_aspect_ratio': self._max_aspect_ratio,
    })

  factory_args = {'subset': subset}
  if dataset_id in CLS_DS:
    factory_args['split'] = split

  factory_class = ds_fctr.get_ds_factory(
      dataset_name=dataset_id,
  )(**factory_args)
  ds_factory = factory_class.configure(**params_factory)

  if is_audio_ds:
    if self._raw_audio:
      ds_factory.preprocessor_builder.add_fn(
          functools.partial(
              processing.extend_waveform_dim,
              num_windows=self._num_windows_test,
          ),
          feature_name=FeatureNames.AUDIO,
          fn_name='extend_waveform',
      )
    else:
      ds_factory.preprocessor_builder.add_fn(
          functools.partial(
              processing.raw_audio_to_spectrogram,
              sample_rate=DEFAULT_SR,
              stft_length=self._stft_length,
              stft_step=self._stft_step,
              mel_bins=self._mel_bins,
              num_windows=self._num_windows_test,
              specaugment=None,
              rm_audio=False,
          )
      )
      ds_factory.preprocessor_builder.add_fn(
          processing.normalize_spectrogram,
          feature_name=FeatureNames.AUDIO_MEL,
          fn_name='normalize_mel',
      )

  # Add batch-level data-agnostic post-processing functions
  postprocess_fns = []

  if self._space_to_depth:
    postprocess_fns.append(
        functools.partial(
            processing.space_to_depth,
            temporal_block_size=2,
            spatial_block_size=2,
            feature_name=FeatureNames.VISION,
        )
    )

  # `_audio_mixup` only exists when subset == 'train'; the `_is_training`
  # check must come first so it short-circuits in test mode.
  if self._is_training and self._audio_mixup and is_audio_ds:
    # Append the callable itself: every entry of `postprocess_fns` is a
    # function, so wrapping it in a list would add a non-callable entry.
    postprocess_fns.append(
        functools.partial(
            processing.batched_mixup,
            feature_name=(FeatureNames.AUDIO if self._raw_audio
                          else FeatureNames.AUDIO_MEL),
            alpha=self._mixup_alpha,
            beta=self._mixup_beta,
            mixup_labels=False,
        )
    )

  split = '0' if split is None else str(split)
  name = dataset_id + '@' + split

  super(EvalLoader, self).__init__(
      dmvr_factory=ds_factory,
      params=input_params,
      postprocess_fns=postprocess_fns,
      num_epochs=self._num_epochs,
      mode=self._mode,
      name=name,
  )