Example #1
0
 def __init__(self,
              dmodel=144,
              reduction_factor=4,
              num_blocks=16,
              head_size=36,
              num_heads=4,
              kernel_size=32,
              fc_factor=0.5,
              dropout=0.0,
              add_wav_info=False,
              hop_size=80,
              name="conformer_encoder",
              **kwargs):
     """Assemble the Conformer encoder: a convolutional subsampler, an
     optional raw-waveform branch, and `num_blocks` Conformer blocks."""
     super(ConformerEncoder_, self).__init__(name=name, **kwargs)
     self.dmodel = dmodel
     self.reduction_factor = reduction_factor
     self.add_wav_info = add_wav_info
     self.conv_subsampling = ConvSubsampling(
         odim=dmodel, reduction_factor=reduction_factor, dropout=dropout)
     if add_wav_info:
         # Extra branch that picks features straight from the waveform.
         self.wav_layer = WavePickModel(dmodel, hop_size)
     # All blocks share the same hyper-parameters; only the name differs.
     self.conformer_blocks = [
         ConformerBlock(input_dim=dmodel,
                        dropout=dropout,
                        fc_factor=fc_factor,
                        head_size=head_size,
                        num_heads=num_heads,
                        kernel_size=kernel_size,
                        name=f"conformer_block_{idx}")
         for idx in range(num_blocks)
     ]
    def __init__(self,
                 dmodel=144,
                 reduction_factor=4,
                 num_blocks=4,
                 cell_nums=4,
                 head_size=36,
                 num_heads=4,
                 kernel_size=32,
                 fc_factor=0.5,
                 dropout=0.0,
                 add_wav_info=False,
                 hop_size=80,
                 name="streaming_conformer_encoder",
                 **kwargs):
        """Initial variables."""
        super(StreamingConformerEncoder, self).__init__()
        self.dmodel = dmodel
        self.reduction_factor = reduction_factor
        self.conv_subsampling = ConvSubsampling(
            odim=dmodel, reduction_factor=reduction_factor, dropout=dropout)
        self.dropout = dropout
        self.cell_nums = cell_nums
        self.add_wav_info = add_wav_info
        if self.add_wav_info:
            self.wav_layer = WavePickModel(dmodel, hop_size)
        cells = []
        for i in range(cell_nums):
            cells.append(
                StreamingEncoderCell(
                    dmodel=dmodel,
                    num_blocks=num_blocks,
                    head_size=head_size,
                    num_heads=num_heads,
                    kernel_size=kernel_size,
                    fc_factor=fc_factor,
                    dropout=dropout,
                    name=name + 'cell_%s' % i,
                ))

        self.custom_layer = tf.keras.layers.RNN(cells,
                                                return_sequences=True,
                                                return_state=True,
                                                name='customer_rnn')
Example #3
0
    def __init__(self,arch_config,**kwargs):
        """Build the DeepSpeech2-style CNN and RNN(+FC) feature extractors.

        Args:
            arch_config: dict with optional "conv_conf", "rnn_conf" and
                "fc_conf" sub-dicts; missing keys are filled in from the
                module-level DEFAULT_CONV / DEFAULT_RNN / DEFAULT_FC dicts.
            **kwargs: must provide "add_wav_info"; when truthy, must also
                provide "hop_size" (consumed by the WavePickModel branch).
        """
        super(DeepSpeech2, self).__init__()
        # Overlay the user-supplied sub-configs on the module defaults.
        conv_conf = append_default_keys_dict(DEFAULT_CONV, arch_config.get("conv_conf", {}))
        rnn_conf = append_default_keys_dict(DEFAULT_RNN, arch_config.get("rnn_conf", {}))
        fc_conf = append_default_keys_dict(DEFAULT_FC, arch_config.get("fc_conf", {}))
        # Sanity checks: one stride/kernel per filter entry, known conv and
        # rnn types, non-negative dropout rates.
        assert len(conv_conf["conv_strides"]) == \
               len(conv_conf["conv_filters"]) == len(conv_conf["conv_kernels"])
        assert conv_conf["conv_type"] in [1, 2]
        assert rnn_conf["rnn_type"] in ["lstm", "gru", "rnn"]
        assert conv_conf["conv_dropout"] >= 0.0 and rnn_conf["rnn_dropout"] >= 0.0
        layer=[]
        if conv_conf["conv_type"] == 2:
            conv = tf.keras.layers.Conv2D
        else:
            # Conv1D path: merge the last two input dims first, and require
            # the kernel/stride/filter configs to be flat 1-D lists.
            layer += [Merge2LastDims("conv1d_features")]
            conv = tf.keras.layers.Conv1D
            ker_shape = np.shape(conv_conf["conv_kernels"])
            stride_shape = np.shape(conv_conf["conv_strides"])
            filter_shape = np.shape(conv_conf["conv_filters"])
            assert len(ker_shape) == 1 and len(stride_shape) == 1 and len(filter_shape) == 1
        # One Conv -> BatchNorm -> ReLU -> Dropout group per filter entry.
        for i, fil in enumerate(conv_conf["conv_filters"]):
            layer += [conv(filters=fil, kernel_size=conv_conf["conv_kernels"][i],
                           strides=conv_conf["conv_strides"][i], padding="same",
                           activation=None, dtype=tf.float32, name=f"cnn_{i}")]
            layer += [tf.keras.layers.BatchNormalization(name=f"cnn_bn_{i}")]
            layer += [tf.keras.layers.ReLU(name=f"cnn_relu_{i}")]
            layer += [tf.keras.layers.Dropout(conv_conf["conv_dropout"],
                                              name=f"cnn_dropout_{i}")]
        # NOTE(review): `fil` is the leftover loop variable — this raises
        # NameError if "conv_filters" is empty; confirm configs always list
        # at least one conv layer.
        last_dim=fil
        if conv_conf["conv_type"] == 2:
            # Conv2D output still has separate freq/channel dims; flatten
            # them before the projection/RNN stage.
            layer += [Merge2LastDims("reshape_conv2d_to_rnn")]
        layer+=[tf.keras.layers.Dense(last_dim,name='feature_projector')]
        self.Cnn_feature_extractor=tf.keras.Sequential(layer)

        self.add_wav_info=kwargs['add_wav_info']
        if kwargs['add_wav_info']:
            hop_size=kwargs['hop_size']
            # Grow the hop by each conv layer's time stride so the wave
            # branch output lines up with the conv features.
            # NOTE(review): strides[i][0] assumes each stride is a pair
            # (e.g. (time, freq)), which only holds on the conv_type==2
            # path — confirm add_wav_info is never used with conv_type==1.
            for i, fil in enumerate(conv_conf["conv_filters"]):
                hop_size*=conv_conf["conv_strides"][i][0]
            self.wav_layer=WavePickModel(last_dim,hop_size)
        layer=[]
        rnn = get_rnn(rnn_conf["rnn_type"])

        # To time major
        if rnn_conf["rnn_bidirectional"]:
            layer += [TransposeTimeMajor("transpose_to_time_major")]

        # RNN layers
        for i in range(rnn_conf["rnn_layers"]):
            if rnn_conf["rnn_bidirectional"]:
                layer += [tf.keras.layers.Bidirectional(
                    rnn(rnn_conf["rnn_units"], activation=rnn_conf["rnn_activation"],
                        time_major=True, dropout=rnn_conf["rnn_dropout"],
                        return_sequences=True, use_bias=True),
                    name=f"b{rnn_conf['rnn_type']}_{i}")]
                layer += [SequenceBatchNorm(time_major=True, name=f"sequence_wise_bn_{i}")]
            else:
                layer += [rnn(rnn_conf["rnn_units"], activation=rnn_conf["rnn_activation"],
                              dropout=rnn_conf["rnn_dropout"], return_sequences=True, use_bias=True,
                              name=f"{rnn_conf['rnn_type']}_{i}")]
                layer += [SequenceBatchNorm(time_major=False, name=f"sequence_wise_bn_{i}")]
                # Row convolution (lookahead) only on unidirectional stacks.
                if rnn_conf["rnn_rowconv"]:
                    layer += [RowConv1D(filters=rnn_conf["rnn_units"],
                                        future_context=rnn_conf["rnn_rowconv_context"],
                                        name=f"row_conv_{i}")]

        # To batch major
        if rnn_conf["rnn_bidirectional"]:
            layer += [TransposeTimeMajor("transpose_to_batch_major")]

        # FC Layers
        if fc_conf["fc_units"]:
            assert fc_conf["fc_dropout"] >= 0.0

            for idx, units in enumerate(fc_conf["fc_units"]):
                layer += [tf.keras.layers.Dense(units=units, activation=None,
                                                use_bias=True, name=f"hidden_fc_{idx}")]
                layer += [tf.keras.layers.BatchNormalization(name=f"hidden_fc_bn_{idx}")]
                layer += [tf.keras.layers.ReLU(name=f"hidden_fc_relu_{idx}")]
                layer += [tf.keras.layers.Dropout(fc_conf["fc_dropout"],
                                                  name=f"hidden_fc_dropout_{idx}")]
        self.Rnn_feature_extractor=tf.keras.Sequential(layer)
class StreamingConformerEncoder(tf.keras.Model):
    def __init__(self,
                 dmodel=144,
                 reduction_factor=4,
                 num_blocks=4,
                 cell_nums=4,
                 head_size=36,
                 num_heads=4,
                 kernel_size=32,
                 fc_factor=0.5,
                 dropout=0.0,
                 add_wav_info=False,
                 hop_size=80,
                 name="streaming_conformer_encoder",
                 **kwargs):
        """Streaming Conformer encoder: conv subsampling plus stacked cells.

        Args:
            dmodel: encoder feature dimension.
            reduction_factor: time reduction of the conv subsampler.
            num_blocks: Conformer blocks inside each streaming cell.
            cell_nums: number of stacked StreamingEncoderCell layers.
            head_size: attention head size passed to each cell.
            num_heads: number of attention heads per cell.
            kernel_size: depthwise conv kernel width in each cell.
            fc_factor: feed-forward residual scaling factor.
            dropout: dropout rate used throughout.
            add_wav_info: when True, add a WavePickModel raw-wave branch.
            hop_size: hop length used by the wave branch.
            name: Keras model name.
            **kwargs: forwarded to tf.keras.Model.
        """
        # BUGFIX: forward `name`/**kwargs to tf.keras.Model — previously the
        # accepted `name` was silently dropped and kwargs swallowed, unlike
        # ConformerEncoder_ which does forward them.
        super(StreamingConformerEncoder, self).__init__(name=name, **kwargs)
        self.dmodel = dmodel
        self.reduction_factor = reduction_factor
        self.conv_subsampling = ConvSubsampling(
            odim=dmodel, reduction_factor=reduction_factor, dropout=dropout)
        self.dropout = dropout
        self.cell_nums = cell_nums
        self.add_wav_info = add_wav_info
        if self.add_wav_info:
            # Auxiliary branch extracting features from the raw waveform.
            self.wav_layer = WavePickModel(dmodel, hop_size)
        # One StreamingEncoderCell per stack level; the list of cells is
        # stacked by tf.keras.layers.RNN.
        cells = [
            StreamingEncoderCell(
                dmodel=dmodel,
                num_blocks=num_blocks,
                head_size=head_size,
                num_heads=num_heads,
                kernel_size=kernel_size,
                fc_factor=fc_factor,
                dropout=dropout,
                name=name + 'cell_%s' % i,
            )
            for i in range(cell_nums)
        ]
        self.custom_layer = tf.keras.layers.RNN(cells,
                                                return_sequences=True,
                                                return_state=True,
                                                name='customer_rnn')

    @tf.function(experimental_relax_shapes=True)
    def call(self, inputs, states=None, training=None, mask=None):
        """Encode `inputs` and return (sequence outputs, new RNN states)."""
        if self.add_wav_info:
            mel_inputs, wav_inputs = inputs
        else:
            mel_inputs = inputs
        # The first two axes are folded together for the front-end layers
        # and restored afterwards — presumably (batch, chunk); TODO confirm.
        dim0 = tf.shape(mel_inputs)[0]
        dim1 = tf.shape(mel_inputs)[1]
        features = self.conv_subsampling(merge_two_first_dims(mel_inputs),
                                         training=training)
        if self.add_wav_info:
            # Fuse the waveform branch by simple addition.
            features += self.wav_layer(merge_two_first_dims(wav_inputs),
                                       training=training)
        features = split_two_first_dims(features, dim0, dim1)

        if states is None:
            states = self.custom_layer.get_initial_state(features)
        rnn_out = self.custom_layer(features, initial_state=states)
        # First element is the sequence output; the rest are the states.
        return rnn_out[0], rnn_out[1:]

    def get_init_states(self, inputs):
        """Build fresh initial RNN states for `inference`, sized from `inputs`."""
        rnn_layer = self.custom_layer
        return rnn_layer.get_initial_state(inputs)

    def inference(self, inputs, states):
        """One streaming step: encode a single chunk and advance `states`."""
        if self.add_wav_info:
            mel_inputs, wav_inputs = inputs
            features = self.conv_subsampling(mel_inputs, training=False)
            # Fuse the waveform branch by simple addition.
            features += self.wav_layer(wav_inputs, training=False)
        else:
            features = self.conv_subsampling(inputs, training=False)
        # Give the RNN a length-1 time axis, run one step, then drop it.
        stepped = self.custom_layer(tf.expand_dims(features, 1),
                                    initial_state=states)
        new_states = stepped[1:]
        result = tf.squeeze(stepped[0], 1)
        return result, new_states

    def get_config(self):
        conf = super(StreamingConformerEncoder, self).get_config()
        conf.update(self.conv_subsampling.get_config())
        if self.add_wav_info:
            conf.update(self.wav_layer.get_config())
        conf.update(self.custom_layer.get_config())