def _non_streaming(self, inputs):
  # Depthwise 1D convolution in non-streaming mode;
  # it is used for training or non-streaming inference.
  # Zero pad inputs from the left to make conv1d causal.
  # [batch_size, time_steps, feature_dim]
  if self.pad:
    inputs_pad = tf.keras.backend.temporal_padding(
        inputs, padding=(self.memory_size - 1, 0))
  else:
    inputs_pad = inputs

  # expand dimensionality for depthwise_conv2d
  # to [memory_size, 1, feature_dim, 1]
  time_kernel_exp = tf.expand_dims(tf.expand_dims(self.time_kernel, 1), -1)

  # run convolution
  depthwise_conv1d = tf.nn.depthwise_conv2d(
      tf.expand_dims(inputs_pad, -2),
      time_kernel_exp,
      strides=[1, 1, 1, 1],
      padding='VALID')  # [batch_size, time_steps, 1, feature_dim]

  # [batch_size, time_steps, feature_dim]
  depthwise_conv1d = tf.squeeze(depthwise_conv1d, [2])

  if self.use_bias:
    depthwise_conv1d = depthwise_conv1d + self.bias
  return depthwise_conv1d
def _non_streaming(self, inputs):
  # Depthwise 1D convolution in non-streaming mode;
  # it is used for training or non-streaming inference.
  # pad input data
  inputs_pad = temporal_padding.TemporalPadding(
      padding=self.pad, padding_size=self.memory_size - 1)(inputs)

  # expand dimensionality for depthwise_conv2d
  # to [memory_size, 1, feature_dim, 1]
  time_kernel_exp = tf.expand_dims(tf.expand_dims(self.time_kernel, 1), -1)

  # run convolution
  depthwise_conv1d = tf.nn.depthwise_conv2d(
      tf.expand_dims(inputs_pad, -2),
      time_kernel_exp,
      strides=[1, 1, 1, 1],
      padding='VALID')  # [batch_size, time_steps, 1, feature_dim]

  # [batch_size, time_steps, feature_dim]
  depthwise_conv1d = tf.squeeze(depthwise_conv1d, [2])

  if self.use_bias:
    depthwise_conv1d = depthwise_conv1d + self.bias
  return depthwise_conv1d
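# A minimal standalone sketch (not the layer itself) of why the method above
# implements a causal depthwise 1D convolution: left padding by
# memory_size - 1 followed by a VALID depthwise_conv2d makes output step t
# depend only on input steps <= t. All shapes and names below are
# illustrative assumptions, not part of the layer.
import tensorflow as tf

batch_size, time_steps, feature_dim, memory_size = 2, 8, 4, 3
inputs = tf.random.normal([batch_size, time_steps, feature_dim])
time_kernel = tf.random.normal([memory_size, feature_dim])

# zero pad on the left only, so no future samples leak into the output
inputs_pad = tf.keras.backend.temporal_padding(
    inputs, padding=(memory_size - 1, 0))

# reshape kernel to [memory_size, 1, feature_dim, 1] for depthwise_conv2d
kernel = tf.expand_dims(tf.expand_dims(time_kernel, 1), -1)

out = tf.nn.depthwise_conv2d(
    tf.expand_dims(inputs_pad, -2), kernel,
    strides=[1, 1, 1, 1], padding='VALID')
out = tf.squeeze(out, [2])
print(out.shape)  # (2, 8, 4): the time length is preserved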
def random_stretch_squeeze(inputs, resample_offset, seed=None):
  """Stretches and squeezes audio data in the time dim.

  It can be useful for augmenting training data with random
  stretches/squeezes in the time dim to make the model more robust to
  variations in input audio sampling frequency and human speech frequency.

  Args:
    inputs: input tensor [batch_size, time]
    resample_offset: defines stretch/squeeze range:
      1 - resample_offset ... 1 + resample_offset
    seed: random seed

  Returns:
    audio tensor stretched or squeezed in the time dim [batch_size, time]

  Raises:
    ValueError: if inputs.shape.rank != 2
  """
  if inputs.shape.rank != 2:
    raise ValueError('inputs.shape.rank:%d must be 2' % inputs.shape.rank)

  inputs_shape = inputs.shape.as_list()
  batch_size = inputs_shape[0]
  sequence_length = inputs_shape[1]

  image = tf.expand_dims(inputs, 2)  # feature
  image = tf.expand_dims(image, 3)  # channels

  resample = 1.0  # when it is equal to 1 - no stretching or squeezing
  time_stretch_squeeze = tf.random.uniform(
      shape=[batch_size],
      minval=resample - resample_offset,
      maxval=resample + resample_offset,
      dtype=tf.float32,
      seed=seed)
  shape = tf.shape(inputs)
  outputs = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
  for i in tf.range(batch_size):
    image_resized = tf.image.resize(
        images=image[i],
        size=(tf.cast((tf.cast(shape[1], tf.float32) *
                       time_stretch_squeeze[i]), tf.int32), 1),
        preserve_aspect_ratio=False)
    image_resized_cropped = tf.image.resize_with_crop_or_pad(
        image_resized,
        target_height=sequence_length,
        target_width=1,
    )
    outputs = outputs.write(i, image_resized_cropped)

  outputs = tf.squeeze(outputs.stack(), axis=[2, 3])
  outputs.set_shape(inputs_shape)
  return outputs
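# Hypothetical usage sketch for random_stretch_squeeze above; the clip
# length, batch size, and offset are illustrative assumptions.
import tensorflow as tf

audio = tf.random.normal([4, 16000])  # batch of 1 s clips at 16 kHz
# a resample factor is drawn per example from [0.9, 1.1)
augmented = random_stretch_squeeze(audio, resample_offset=0.1, seed=1)
print(augmented.shape)  # (4, 16000): crop/pad restores the input length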
def random_shift(inputs, time_shift, seed=None):
  """Shifts input data randomly in the time dim.

  It can be useful for augmenting training data with random shifts in
  the time dim to make the model more robust to input audio shifts.

  Args:
    inputs: input tensor [batch_size, time]
    time_shift: defines time shift range: -time_shift ... time_shift;
      it is defined in samples
    seed: random seed

  Returns:
    audio tensor randomly shifted in the time dim [batch_size, time]

  Raises:
    ValueError: if inputs.shape.rank != 2
  """
  if inputs.shape.rank != 2:
    raise ValueError('inputs.shape.rank:%d must be 2' % inputs.shape.rank)

  inputs_shape = inputs.shape.as_list()
  batch_size = inputs_shape[0]
  sequence_length = inputs_shape[1]

  # the function below processes 2D arrays, so convert input
  # to [batch, time, dummy]
  inputs = tf.expand_dims(inputs, 2)

  time_shift_amounts = tf.random.uniform(
      shape=[batch_size],
      minval=-time_shift,
      maxval=time_shift,
      dtype=tf.int32,
      seed=seed)

  outputs = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
  for i in tf.range(batch_size):
    time_shift_amount = time_shift_amounts[i]

    # pylint: disable=cell-var-from-loop
    time_shift_padding = tf.cond(time_shift_amount > 0,
                                 lambda: [[time_shift_amount, 0], [0, 0]],
                                 lambda: [[0, -time_shift_amount], [0, 0]])
    time_shift_offset = tf.cond(time_shift_amount > 0,
                                lambda: [0, 0],
                                lambda: [-time_shift_amount, 0])
    # pylint: enable=cell-var-from-loop

    padded = tf.pad(
        tensor=inputs[i], paddings=time_shift_padding, mode='CONSTANT')
    padded_sliced = tf.slice(padded, time_shift_offset,
                             [sequence_length, -1])
    outputs = outputs.write(i, padded_sliced)

  # convert it back to [batch, time]
  outputs = tf.squeeze(outputs.stack(), axis=[2])
  outputs.set_shape(inputs_shape)
  return outputs
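# Hypothetical usage sketch for random_shift above; the shift range and
# shapes are illustrative assumptions.
import tensorflow as tf

audio = tf.random.normal([4, 16000])  # [batch_size, time]
# every example is shifted by a random amount in [-100, 100) samples,
# with zero padding filling the vacated positions
shifted = random_shift(audio, time_shift=100, seed=1)
print(shifted.shape)  # (4, 16000)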
def call(self, inputs):
  # inputs [batch_size, time1, feature1, feature2]
  time_kernel_exp = tf.expand_dims(self.filters, -1)

  # it can be replaced by AveragePooling2D with temporal padding
  # and optimized for streaming mode
  # output will be [batch_size, time1, feature1, feature2]
  return tf.nn.depthwise_conv2d(
      inputs,
      time_kernel_exp,
      strides=self.strides,
      padding=self.padding.upper(),
      dilations=self.dilation_rate,
      name=self.name + '_averPool2D')
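# Standalone sketch (not the layer above) illustrating the comment that
# this depthwise conv can be replaced by AveragePooling2D: with a constant
# 1/(kh*kw) kernel, both take the per-channel mean over each kh x kw
# window. The constant kernel, sizes, and shapes are illustrative
# assumptions about the layer's filters.
import tensorflow as tf

kh, kw, channels = 3, 3, 2
x = tf.random.normal([1, 8, 8, channels])

avg_kernel = tf.fill([kh, kw, channels, 1], 1.0 / (kh * kw))
conv_out = tf.nn.depthwise_conv2d(
    x, avg_kernel, strides=[1, 1, 1, 1], padding='VALID')
pool_out = tf.keras.layers.AveragePooling2D(
    pool_size=(kh, kw), strides=(1, 1), padding='valid')(x)

print(float(tf.reduce_max(tf.abs(conv_out - pool_out))))  # ~0.0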
def random_cutout(
    inputs,
    mask_size,
    mask_value=0,
    seed=None,
    data_format='channels_last',
):
  """Applies cutout (https://arxiv.org/abs/1708.04552) to inputs.

  It is based on addons/tensorflow_addons/image/cutout_ops.py,
  kept here for backward compatibility.

  Args:
    inputs: input tensor [batch_size, time, feature, channels]
    mask_size: mask size (time, feature)
    mask_value: mask will be filled with this value
    seed: random seed
    data_format: dimensions order, 'channels_last' or 'channels_first'

  Returns:
    masked image

  Raises:
    ValueError: if inputs.shape.rank != 4
  """
  if inputs.shape.rank != 4:
    raise ValueError('inputs.shape.rank:%d must be 4' % inputs.shape.rank)

  mask_size = tf.convert_to_tensor(mask_size)
  if tf.rank(mask_size) == 0:
    mask_size = tf.stack([mask_size, mask_size])

  if data_format == 'channels_last':
    time_size, feature_size = tf.shape(inputs)[1], tf.shape(inputs)[2]
  else:
    time_size, feature_size = tf.shape(inputs)[2], tf.shape(inputs)[3]

  batch_size = tf.shape(inputs)[0]

  cutout_center_time = tf.random.uniform(
      shape=[batch_size], minval=0, maxval=time_size, dtype=tf.int32,
      seed=seed)
  cutout_center_feature = tf.random.uniform(
      shape=[batch_size], minval=0, maxval=feature_size, dtype=tf.int32,
      seed=seed)
  offset = tf.transpose([cutout_center_time, cutout_center_feature], [1, 0])

  origin_shape = inputs.shape
  offset = tf.convert_to_tensor(offset)

  mask_size = mask_size // 2
  cutout_center_time = offset[:, 0]
  cutout_center_feature = offset[:, 1]

  lower_pads = tf.maximum(0, cutout_center_time - mask_size[0])
  upper_pads = tf.maximum(0, time_size - cutout_center_time - mask_size[0])
  left_pads = tf.maximum(0, cutout_center_feature - mask_size[1])
  right_pads = tf.maximum(
      0, feature_size - cutout_center_feature - mask_size[1])

  cutout_shape = tf.transpose(
      [
          time_size - (lower_pads + upper_pads),
          feature_size - (left_pads + right_pads),
      ],
      [1, 0],
  )

  masks = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
  for i in tf.range(tf.shape(cutout_shape)[0]):
    padding_dims = [
        [lower_pads[i], upper_pads[i]],
        [left_pads[i], right_pads[i]],
    ]
    mask = tf.pad(
        tf.zeros(cutout_shape[i], dtype=inputs.dtype),
        padding_dims,
        constant_values=1,
    )
    masks = masks.write(i, mask)

  if data_format == 'channels_last':
    mask = tf.expand_dims(masks.stack(), -1)
    mask = tf.tile(mask, [1, 1, 1, tf.shape(inputs)[-1]])
  else:
    mask = tf.expand_dims(masks.stack(), 1)
    mask = tf.tile(mask, [1, tf.shape(inputs)[1], 1, 1])

  inputs = tf.where(
      tf.equal(mask, 0),
      tf.ones_like(inputs, dtype=inputs.dtype) * mask_value,
      inputs,
  )
  inputs.set_shape(origin_shape)
  return inputs
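# Hypothetical usage sketch for random_cutout above on a batch of
# spectrograms; all shapes and the mask size are illustrative assumptions.
import tensorflow as tf

spectrograms = tf.random.normal([4, 98, 40, 1])  # [batch, time, feature, channels]
# zero out one randomly centered 10 x 8 (time x feature) patch per example
masked = random_cutout(spectrograms, mask_size=(10, 8), seed=1)
print(masked.shape)  # (4, 98, 40, 1)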