def _non_streaming(self, inputs):
  """Depthwise 1D convolution in non-streaming mode.

  Used for training or non-streaming inference.

  Args:
    inputs: tensor [batch_size, time_steps, feature_dim].

  Returns:
    Tensor [batch_size, time_steps, feature_dim] — the depthwise
    convolution in time, plus bias when enabled.
  """
  # Pad the time dimension so the 'VALID' convolution preserves time_steps.
  padded = temporal_padding.TemporalPadding(
      padding=self.pad, padding_size=self.memory_size - 1)(inputs)

  # Reshape the time kernel to [memory_size, 1, feature_dim, 1] as
  # expected by depthwise_conv2d.
  kernel_4d = tf.expand_dims(tf.expand_dims(self.time_kernel, 1), -1)

  # Insert a dummy spatial axis, convolve, then drop that axis again:
  # [batch, time, 1, feature] -> [batch, time, feature].
  conv_out = tf.nn.depthwise_conv2d(
      tf.expand_dims(padded, -2),
      kernel_4d,
      strides=[1, 1, 1, 1],
      padding='VALID')
  conv_out = tf.squeeze(conv_out, [2])

  return conv_out + self.bias if self.use_bias else conv_out
def conv_model_no_stream_wrapper(flags, conv_cell, cnn_filters,
                                 cnn_kernel_size, cnn_act, cnn_dilation_rate,
                                 cnn_strides, cnn_use_bias):
  """Toy example of convolutional model.

  It has the same model topology as in conv_model() above, but without
  wrapping conv cell by Stream layer, so that all parameters set manually.

  Args:
    flags: model and data settings
    conv_cell: cell for streaming, for example: tf.keras.layers.Conv1D
    cnn_filters: list of filters in conv layer
    cnn_kernel_size: list of kernel_size in conv layer
    cnn_act: list of activation functions in conv layer
    cnn_dilation_rate: list of dilation_rate in conv layer
    cnn_strides: list of strides in conv layer
    cnn_use_bias: list of use_bias in conv layer

  Returns:
    Keras model

  Raises:
    ValueError: if the per-layer parameter lists differ in length.
  """
  layer_params = [
      cnn_filters, cnn_kernel_size, cnn_act, cnn_dilation_rate, cnn_strides,
      cnn_use_bias
  ]
  if any(len(param_list) != len(cnn_filters) for param_list in layer_params):
    raise ValueError('all input lists have to be the same length')

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)
  net = tf.keras.backend.expand_dims(input_audio)

  for (filters, kernel_size, activation, dilation_rate, strides,
       use_bias) in zip(*layer_params):
    time_buffer = dilation_rate * (kernel_size - 1)
    # Ring buffer in streaming mode; identity during training.
    net = stream.Stream(
        cell=tf.identity,
        ring_buffer_size_in_time_dim=time_buffer,
        use_one_step=False,
        pad_time_dim=None)(net)
    # Causal padding is applied explicitly, decoupled from the conv cell.
    net = temporal_padding.TemporalPadding(
        padding='causal', padding_size=time_buffer)(net)
    net = conv_cell(
        filters=filters,
        kernel_size=kernel_size,
        activation=activation,
        dilation_rate=dilation_rate,
        strides=strides,
        use_bias=use_bias,
        padding='valid')(net)  # padding has to be valid!
  return tf.keras.Model(input_audio, net)
def test_padding_and_cropping(self, padding, padding_size):
  """Checks TemporalPadding against Keras padding/cropping references."""
  batch_size, time_dim, feature_dim = 1, 10, 3

  inputs = tf.keras.layers.Input(
      shape=(time_dim, feature_dim), batch_size=batch_size)
  outputs = temporal_padding.TemporalPadding(
      padding=padding, padding_size=padding_size)(inputs)
  model = tf.keras.Model(inputs, outputs)

  np.random.seed(1)
  input_signal = np.random.rand(batch_size, time_dim, feature_dim)
  output_signal = model.predict(input_signal)

  if padding_size >= 0:
    # Non-negative padding_size: reference is keras temporal_padding.
    pad_map = {
        'causal': (padding_size, 0),
        'same': (padding_size // 2, padding_size - padding_size // 2),
        'future': (0, padding_size),
    }
    output_reference = tf.keras.backend.temporal_padding(
        input_signal, padding=pad_map[padding])
  else:
    # Negative padding_size means cropping: reference is Cropping1D.
    crop = -padding_size
    crop_map = {
        'causal': (crop, 0),
        'same': (crop // 2, crop - crop // 2),
        'future': (0, crop),
    }
    output_reference = tf.keras.layers.Cropping1D(crop_map[padding])(
        input_signal)

  self.assertAllClose(output_signal, output_reference)
  self.assertAllEqual(output_signal.shape,
                      [batch_size, time_dim + padding_size, feature_dim])
def test_no_padding_or_cropping_in_streaming(self, padding, padding_size):
  """In streaming mode the layer must leave the time dimension untouched."""
  batch_size = 1
  feature_dim = 3
  streaming_mode = modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE

  inputs = tf.keras.layers.Input(
      shape=(1, feature_dim), batch_size=batch_size)
  net = temporal_padding.TemporalPadding(
      padding=padding, padding_size=padding_size, mode=streaming_mode)(inputs)

  # Output shape is identical to the input shape: no pad, no crop.
  self.assertAllEqual(
      tf.keras.backend.int_shape(net), [batch_size, 1, feature_dim])
def __init__(self,
             mode=modes.Modes.TRAINING,
             inference_batch_size=1,
             frame_size=400,
             frame_step=160,
             use_one_step=True,
             padding=None,
             **kwargs):
  """Initializes the framing layer and (if needed) its streaming state.

  Args:
    mode: one of modes.Modes; selects training vs. streaming behavior.
    inference_batch_size: batch size used during streaming inference.
    frame_size: length of each output frame in samples.
    frame_step: hop between consecutive frames in samples.
    use_one_step: if True, one frame is produced per call in streaming mode.
    padding: None or 'causal'; anything else is rejected.
    **kwargs: forwarded to the base layer.

  Raises:
    ValueError: if frame_step exceeds frame_size with use_one_step, or if a
      non-causal padding is requested.
  """
  super(DataFrame, self).__init__(**kwargs)

  # Validate arguments before storing them.
  if use_one_step and frame_step > frame_size:
    raise ValueError('frame_step:%d must be <= frame_size:%d' %
                     (frame_step, frame_size))
  if padding and padding != 'causal':
    raise ValueError('only causal padding is supported')

  self.mode = mode
  self.inference_batch_size = inference_batch_size
  self.frame_size = frame_size
  self.frame_step = frame_step
  self.use_one_step = use_one_step
  self.padding = padding

  # Size of the ring buffer that holds past samples in streaming mode.
  self.ring_buffer_size_in_time_dim = (
      frame_size if self.use_one_step else frame_size - 1)

  # Optional causal padding layer; pass-through lambda when disabled.
  if self.padding:
    self.padding_layer = temporal_padding.TemporalPadding(
        padding_size=self.ring_buffer_size_in_time_dim, padding=self.padding)
  else:
    self.padding_layer = tf.keras.layers.Lambda(lambda x: x)

  if self.mode == modes.Modes.STREAM_INTERNAL_STATE_INFERENCE:
    # Streaming with internal state: the buffer lives inside the layer as a
    # non-trainable variable.
    self.states = self.add_weight(
        name='frame_states',
        shape=[self.inference_batch_size, self.ring_buffer_size_in_time_dim],
        trainable=False,
        initializer=tf.zeros_initializer)
  elif self.mode == modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE:
    # Streaming with external state: the buffer travels through
    # input/output placeholders of the model.
    self.input_state = tf.keras.layers.Input(
        shape=(self.ring_buffer_size_in_time_dim,),
        batch_size=self.inference_batch_size,
        name=self.name + 'input_state')
    self.output_state = None
def test_padding(self, padding_size):
  """Causal padding output must match keras temporal_padding."""
  batch_size, time_dim, feature_dim = 1, 3, 3
  padding = 'causal'

  inputs = tf.keras.layers.Input(
      shape=(time_dim, feature_dim), batch_size=batch_size)
  net = temporal_padding.TemporalPadding(
      padding=padding, padding_size=padding_size)(inputs)
  model = tf.keras.Model(inputs, net)

  np.random.seed(1)
  input_signal = np.random.rand(batch_size, time_dim, feature_dim)
  output_signal = model.predict(input_signal)

  # Causal padding prepends padding_size zeros along the time dimension.
  output_reference = tf.keras.backend.temporal_padding(
      input_signal, padding=(padding_size, 0))
  self.assertAllClose(output_signal, output_reference)
  self.assertAllEqual(output_signal.shape,
                      [batch_size, time_dim + padding_size, feature_dim])
def setUp(self):
  """Builds a reference (non-streaming) tf.signal.stft output to compare against."""
  super(STFTTest, self).setUp()
  test_utils.set_seed(123)
  self.frame_size = 40
  self.frame_step = 10
  # layer definition: STFT layer under test, in training mode with
  # causal padding so frames only depend on past samples.
  stft_layer = stft.STFT(
      self.frame_size,
      self.frame_step,
      mode=modes.Modes.TRAINING,
      inference_batch_size=1,
      padding='causal')
  # Match the window used by the layer; None means rectangular window.
  if stft_layer.window_type == 'hann_tf':
    synthesis_window_fn = tf.signal.hann_window
  else:
    synthesis_window_fn = None
  # prepare input data (batch of 1, 120 samples)
  self.input_signal = np.random.rand(1, 120)
  # prepare default tf stft: pad explicitly with the same scheme as the
  # layer (frame_size - 1 samples), then run tf.signal.stft with
  # pad_end=False so padding is fully controlled by the padding layer.
  padding_layer = temporal_padding.TemporalPadding(
      padding_size=stft_layer.frame_size - 1, padding=stft_layer.padding)
  # pylint: disable=g-long-lambda
  stft_default_layer = tf.keras.layers.Lambda(
      lambda x: tf.signal.stft(
          x,
          stft_layer.frame_size,
          stft_layer.frame_step,
          fft_length=stft_layer.fft_size,
          window_fn=synthesis_window_fn,
          pad_end=False))
  # pylint: enable=g-long-lambda
  input_tf = tf.keras.layers.Input(
      shape=(self.input_signal.shape[1],), batch_size=1)
  net = padding_layer(input_tf)
  net = stft_default_layer(net)
  model_stft = tf.keras.models.Model(input_tf, net)
  # Reference output used by the test methods of this class.
  self.stft_out = model_stft.predict(self.input_signal)
def residual_model(flags,
                   cnn_filters,
                   cnn_kernel_size,
                   cnn_act,
                   cnn_use_bias,
                   cnn_padding,
                   delay_also_in_non_streaming,
                   dilation=1):
  """Toy deep convolutional model with residual connections.

  It can be used for speech enhancement.

  Args:
    flags: model and data settings
    cnn_filters: list of filters in conv layer
    cnn_kernel_size: list of kernel_size in conv layer
    cnn_act: list of activation functions in conv layer
    cnn_use_bias: list of use_bias in conv layer
    cnn_padding: list of padding in conv layer
    delay_also_in_non_streaming: Whether to apply delay also in non-streaming.
    dilation: dilation applied on all conv layers

  Returns:
    Keras model and sum delay

  Raises:
    ValueError: if any of input list has different length from any other
      or padding in not [same, causal]
  """
  param_lists = [
      cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding
  ]
  if any(len(lst) != len(cnn_filters) for lst in param_lists):
    raise ValueError('all input lists have to be the same length')

  # Example of a deep conv model for speech enhancement which can be
  # trained in non-streaming mode and converted to streaming mode.
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)
  net = tf.keras.backend.expand_dims(input_audio)
  sum_delay = 0

  for filters, kernel_size, activation, use_bias, padding in zip(*param_lists):
    time_buffer_size = dilation * (kernel_size - 1)

    if padding == 'causal':
      # Residual connection is simple with 'causal' padding.
      net_residual = net
    elif padding == 'same':
      # With 'same' padding the residual branch must be delayed in
      # streaming mode to stay aligned with the conv output.
      delay_val = time_buffer_size // 2
      net_residual = delay.Delay(
          delay=delay_val,
          also_in_non_streaming=delay_also_in_non_streaming)(net)
      sum_delay += delay_val
    else:
      raise ValueError('wrong padding mode ', padding)

    # Padding is decoupled from the conv layer to ease streaming conversion.
    net = temporal_padding.TemporalPadding(
        padding='causal' if delay_also_in_non_streaming else padding,
        padding_size=time_buffer_size)(net)
    # Ring buffer in streaming mode; lambda x (identity) during training.
    net = stream.Stream(
        cell=tf.identity,
        ring_buffer_size_in_time_dim=time_buffer_size,
        use_one_step=False,
        pad_time_dim=None)(net)
    net = tf.keras.layers.Conv1D(
        filters=filters,
        kernel_size=kernel_size,
        activation=activation,
        use_bias=use_bias,
        padding='valid')(net)  # padding has to be valid!
    net = tf.keras.layers.Add()([net, net_residual])

  return tf.keras.Model(input_audio, net), sum_delay
def residual_model(flags, cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias,
                   cnn_padding):
  """Toy deep convolutional model with residual connections.

  It can be used for speech enhancement.

  Args:
    flags: model and data settings
    cnn_filters: list of filters in conv layer
    cnn_kernel_size: list of kernel_size in conv layer
    cnn_act: list of activation functions in conv layer
    cnn_use_bias: list of use_bias in conv layer
    cnn_padding: list of padding in conv layer

  Returns:
    Keras model

  Raises:
    ValueError: if any of input list has different length from any other
  """
  param_lists = [
      cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding
  ]
  if any(len(lst) != len(cnn_filters) for lst in param_lists):
    raise ValueError('all input lists have to be the same length')

  # Example of a deep conv model for speech enhancement which can be
  # trained in non-streaming mode and converted to streaming mode.
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)
  net = tf.keras.backend.expand_dims(input_audio)

  for filters, kernel_size, activation, use_bias, padding in zip(*param_lists):
    time_buffer = kernel_size - 1

    # Ring buffer in streaming mode; lambda x (identity) during training.
    net = stream.Stream(
        cell=tf.identity,
        ring_buffer_size_in_time_dim=time_buffer,
        use_one_step=False,
        pad_time_dim=None)(net)

    # Residual connection in streaming mode needs:
    # * kernel size in the time dim of the conv layer
    # * the padding mode that was used to pad data in the time dim
    net_residual = residual.Residual(
        padding=padding, kernel_size_time=time_buffer + 1)(net)

    # Padding is decoupled from the conv layer to ease streaming conversion.
    net = temporal_padding.TemporalPadding(
        padding=padding, padding_size=time_buffer)(net)
    net = tf.keras.layers.Conv1D(
        filters=filters,
        kernel_size=kernel_size,
        activation=activation,
        use_bias=use_bias,
        padding='valid')(net)  # padding has to be valid!
    net = tf.keras.layers.Add()([net, net_residual])

  return tf.keras.Model(input_audio, net)