def random_stretch_squeeze(inputs, resample_offset, seed=None):
    """Stretches and squeezes audio data in time dim.

    It can be useful for augmenting training data with random stretches and
    squeezes in time dim, making a model more robust to input audio sampling
    frequency and human speech frequency.

    Each batch item is resized along time by its own factor drawn with
    tf.random.uniform from 1-resample_offset...1+resample_offset, and then
    cropped or padded back to the original sequence length with
    tf.image.resize_with_crop_or_pad.

    Args:
      inputs: input tensor [batch_size, time]; its static shape is used for
        the batch size and the sequence length
      resample_offset: defines stretch squeeze range:
        1-resample_offset...1+resample_offset
      seed: random seed

    Returns:
      tensor with the same static shape and dtype as `inputs`, holding the
      stretched/squeezed data

    Raises:
      ValueError: if inputs.shape.rank != 2
    """
    if inputs.shape.rank != 2:
        raise ValueError('inputs.shape.rank:%d must be 2' % inputs.shape.rank)

    inputs_shape = inputs.shape.as_list()
    batch_size, sequence_length = inputs_shape

    # tf.image ops work on [height, width, channels] items: time plays the
    # role of height, with dummy feature (width) and channel dims appended.
    image = tf.expand_dims(inputs, 2)  # feature
    image = tf.expand_dims(image, 3)  # channels

    resample = 1.0  # when it is equal to 1 - no stretching or squeezing
    time_stretch_squeeze = tf.random.uniform(
        shape=[batch_size],
        minval=resample - resample_offset,
        maxval=resample + resample_offset,
        dtype=tf.float32,
        seed=seed)

    # Loop invariant: original time length as float, ready to be scaled.
    time_length = tf.cast(tf.shape(inputs)[1], tf.float32)

    outputs = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
    for i in tf.range(batch_size):
        stretched_length = tf.cast(
            time_length * time_stretch_squeeze[i], tf.int32)
        image_resized = tf.image.resize(
            images=image[i],
            size=(stretched_length, 1),
            preserve_aspect_ratio=False)
        image_resized_cropped = tf.image.resize_with_crop_or_pad(
            image_resized,
            target_height=sequence_length,
            target_width=1,
        )
        # tf.image.resize yields float32 for its default method; cast back so
        # the write matches the TensorArray dtype for non-float32 inputs.
        outputs = outputs.write(
            i, tf.cast(image_resized_cropped, inputs.dtype))

    # Drop the dummy feature and channel dims: back to [batch, time].
    outputs = tf.squeeze(outputs.stack(), axis=[2, 3])
    outputs.set_shape(inputs_shape)
    return outputs
def random_shift(inputs, time_shift, seed=None):
    """Shifts every batch item by its own random amount along the time dim.

    Meant as a training-data augmentation that makes a model more robust to
    shifts of the input audio. Each item is padded on one side (constant
    padding) and sliced back to the original length, so samples moved out of
    the window are dropped.

    Args:
      inputs: input tensor [batch_size, time]
      time_shift: defines time shift range: -time_shift...time_shift,
        it is defined in samples
      seed: random seed

    Returns:
      tensor with the same static shape as `inputs`, holding shifted data

    Raises:
      ValueError: if inputs.shape.rank != 2
    """
    if inputs.shape.rank != 2:
        raise ValueError('inputs.shape.rank:%d must be 2' % inputs.shape.rank)

    static_shape = inputs.shape.as_list()
    num_items, num_samples = static_shape

    # tf.pad / tf.slice below operate on 2D items, so go from
    # [batch, time] to [batch, time, dummy].
    expanded = tf.expand_dims(inputs, 2)

    shifts = tf.random.uniform(
        shape=[num_items],
        minval=-time_shift,
        maxval=time_shift,
        dtype=tf.int32,
        seed=seed)

    shifted = tf.TensorArray(expanded.dtype, 0, dynamic_size=True)
    for idx in tf.range(num_items):
        shift = shifts[idx]
        moves_right = shift > 0
        # pylint: disable=cell-var-from-loop
        # Positive shift: pad the front and keep the head of the padded item.
        # Otherwise: pad the back and start reading |shift| samples in.
        paddings = tf.cond(
            moves_right,
            lambda: [[shift, 0], [0, 0]],
            lambda: [[0, -shift], [0, 0]])
        begin = tf.cond(
            moves_right,
            lambda: [0, 0],
            lambda: [-shift, 0])
        # pylint: enable=cell-var-from-loop
        padded_item = tf.pad(
            tensor=expanded[idx], paddings=paddings, mode='CONSTANT')
        window = tf.slice(padded_item, begin, [num_samples, -1])
        shifted = shifted.write(idx, window)

    # Remove the dummy dim: back to [batch, time].
    result = tf.squeeze(shifted.stack(), axis=[2])
    result.set_shape(static_shape)
    return result
def random_cutout(
    inputs,
    mask_size,
    mask_value=0,
    seed=None,
    data_format='channels_last',
):
    """Applies cutout (https://arxiv.org/abs/1708.04552) to inputs.

    It is based on addons/tensorflow_addons/image/cutout_ops.py and is kept
    here for backward compatibility.

    For every batch item a random center (time, feature) is drawn. The region
    reaching mask_size // 2 on each side of that center, clipped to the
    tensor bounds, is replaced with mask_value across all channels.

    Args:
      inputs: input tensor [batch_size, time, feature, channels]
      mask_size: mask size (time feature); a scalar is used for both dims
      mask_value: mask will be filled with this value
      seed: random seed
      data_format: dimensions order; 'channels_last' reads time and feature
        from axes 1 and 2, any other value reads them from axes 2 and 3 and
        treats axis 1 as channels

    Returns:
      masked image

    Raises:
      ValueError: if inputs.shape.rank != 4
    """
    if inputs.shape.rank != 4:
        raise ValueError('inputs.shape.rank:%d must be 4' % inputs.shape.rank)

    channels_last = data_format == 'channels_last'

    mask_size = tf.convert_to_tensor(mask_size)
    if tf.rank(mask_size) == 0:
        # Scalar size: use it for both the time and the feature dim.
        mask_size = tf.stack([mask_size, mask_size])

    time_axis, feature_axis = (1, 2) if channels_last else (2, 3)
    time_size = tf.shape(inputs)[time_axis]
    feature_size = tf.shape(inputs)[feature_axis]
    batch_size = tf.shape(inputs)[0]

    center_time = tf.random.uniform(
        shape=[batch_size], minval=0, maxval=time_size, dtype=tf.int32,
        seed=seed)
    center_feature = tf.random.uniform(
        shape=[batch_size], minval=0, maxval=feature_size, dtype=tf.int32,
        seed=seed)

    half_size = mask_size // 2

    # Untouched margins on every side of the cutout, clipped at the borders.
    pad_before_time = tf.maximum(0, center_time - half_size[0])
    pad_after_time = tf.maximum(0, time_size - center_time - half_size[0])
    pad_before_feature = tf.maximum(0, center_feature - half_size[1])
    pad_after_feature = tf.maximum(
        0, feature_size - center_feature - half_size[1])

    # Per-item cutout extents, shape [batch, 2] as (time, feature).
    cutout_extents = tf.stack(
        [
            time_size - (pad_before_time + pad_after_time),
            feature_size - (pad_before_feature + pad_after_feature),
        ],
        axis=1,
    )

    # Build one [time, feature] mask per item: zeros inside the cutout,
    # ones (the padding value) everywhere else.
    masks = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
    for i in tf.range(tf.shape(cutout_extents)[0]):
        margins = [
            [pad_before_time[i], pad_after_time[i]],
            [pad_before_feature[i], pad_after_feature[i]],
        ]
        item_mask = tf.pad(
            tf.zeros(cutout_extents[i], dtype=inputs.dtype),
            margins,
            constant_values=1,
        )
        masks = masks.write(i, item_mask)

    # Broadcast the 2D masks over the channel axis.
    if channels_last:
        channel_axis = -1
        multiples = [1, 1, 1, tf.shape(inputs)[-1]]
    else:
        channel_axis = 1
        multiples = [1, tf.shape(inputs)[1], 1, 1]
    full_mask = tf.tile(tf.expand_dims(masks.stack(), channel_axis), multiples)

    fill = tf.ones_like(inputs, dtype=inputs.dtype) * mask_value
    masked = tf.where(tf.equal(full_mask, 0), fill, inputs)
    masked.set_shape(inputs.shape)
    return masked