Example #1
    def _decode(self, input_dict):
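        """Decodes encoder outputs into target logits.

        Runs a teacher-forced decode_pass when 'target_tensors' is present in
        input_dict; otherwise falls back to autoregressive self.predict().
        """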
        if 'target_tensors' in input_dict:
            targets = input_dict['target_tensors'][0]
        else:
            targets = None
        encoder_outputs = input_dict['encoder_output']['outputs']
        inputs_attention_bias = (
            input_dict['encoder_output']['inputs_attention_bias'])
        self.embedding_softmax_layer = (
            input_dict['encoder_output']['embedding_softmax_layer'])

        with tf.name_scope("decode"):
            # prepare decoder layers
            if len(self.layers) == 0:
                for _ in range(self.params["num_hidden_layers"]):
                    self_attention_layer = attention_layer.SelfAttention(
                        self.params["hidden_size"],
                        self.params["num_heads"],
                        self.params["attention_dropout"],
                        self.mode == "train",
                    )
                    enc_dec_attention_layer = attention_layer.Attention(
                        self.params["hidden_size"],
                        self.params["num_heads"],
                        self.params["attention_dropout"],
                        self.mode == "train",
                    )
                    feed_forward_network = ffn_layer.FeedFowardNetwork(
                        self.params["hidden_size"],
                        self.params["filter_size"],
                        self.params["relu_dropout"],
                        self.mode == "train",
                    )

                    self.layers.append([
                        PrePostProcessingWrapper(self_attention_layer,
                                                 self.params,
                                                 self.mode == "train"),
                        PrePostProcessingWrapper(enc_dec_attention_layer,
                                                 self.params,
                                                 self.mode == "train"),
                        PrePostProcessingWrapper(feed_forward_network,
                                                 self.params,
                                                 self.mode == "train")
                    ])

                self.output_normalization = LayerNormalization(
                    self.params["hidden_size"])

            if targets is None:
                return self.predict(encoder_outputs, inputs_attention_bias)
            else:
                logits = self.decode_pass(targets, encoder_outputs,
                                          inputs_attention_bias)
                return {
                    "logits": logits,
                    "outputs": [tf.argmax(logits, axis=-1)],
                    "final_state": None,
                    "final_sequence_lengths": None
                }
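
A minimal sketch of the input_dict this method consumes, assuming the encoder has populated the keys read above. The shapes, the decoder instance, and the placeholder embedding layer are illustrative assumptions, not taken from the source:

    import tensorflow as tf

    # Hypothetical shapes: batch 8, source length 20, hidden size 512.
    embedding_softmax_layer = tf.layers.Dense(512)  # placeholder; the real shared embedding layer comes from the encoder
    input_dict = {
        "encoder_output": {
            "outputs": tf.zeros([8, 20, 512]),                 # encoder hidden states
            "inputs_attention_bias": tf.zeros([8, 1, 1, 20]),  # padding bias for enc-dec attention
            "embedding_softmax_layer": embedding_softmax_layer,
        },
        # With "target_tensors" present, _decode runs a teacher-forced pass and
        # returns logits; without it, it calls self.predict() instead.
        "target_tensors": [tf.zeros([8, 25], dtype=tf.int32)],
    }
    # outputs = decoder._decode(input_dict)  # decoder: an assumed instance of the class above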
Example #2
    def __init__(self,
                 hidden_size,
                 attention_dropout,
                 layer_postprocess_dropout,
                 training,
                 cnn_dropout_prob,
                 regularizer=None,
                 conv_params=None,
                 n_heads=1,
                 window_size=None,
                 back_step_size=None,
                 name="attention_block"):
        """
    Attention block constructor.

    Args:
      hidden_size: dimensionality of hidden embeddings.
      attention_dropout: dropout rate for attention layer.
      layer_postprocess_dropout:  dropout rate for sublayer.
      training: whether it is training mode.
      cnn_dropout_prob: dropout probabilty for cnn layers.
      regularizer: regularizer for the convolution kernel.
      conv_params: description of convolutional layer.
      n_heads: number of attention heads. Defaults to 1.
      window_size: size of attention window for forcing
        monotonic attention during the inference. Defaults to None.
      back_step_size: number of steps attention is allowed to
        go back during the inference. Defaults to 0.
      name: name of the block.
    """

        self.name = name
        self.conv = None

        if conv_params:
            self.conv = ConvBlock.create(index=0,
                                         conv_params=conv_params,
                                         regularizer=regularizer,
                                         bn_momentum=0.95,
                                         bn_epsilon=1e-8,
                                         cnn_dropout_prob=cnn_dropout_prob,
                                         training=training)
            self.conv.name = "conv"

        attention = attention_layer.Attention(
            hidden_size=hidden_size,
            num_heads=n_heads,
            attention_dropout=attention_dropout,
            regularizer=regularizer,
            train=training,
            window_size=window_size,
            back_step_size=back_step_size,
        )

        feed_forward = tf.layers.Dense(units=hidden_size,
                                       use_bias=True,
                                       kernel_regularizer=regularizer)

        wrapper_params = {
            "hidden_size": hidden_size,
            "layer_postprocess_dropout": layer_postprocess_dropout
        }

        self.attention = PrePostProcessingWrapper(layer=attention,
                                                  params=wrapper_params,
                                                  training=training)

        self.feed_forward = PrePostProcessingWrapper(layer=feed_forward,
                                                     params=wrapper_params,
                                                     training=training)
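
For context, PrePostProcessingWrapper applies the standard Transformer pre-norm residual pattern around each wrapped layer. Below is a standalone sketch of that pattern under the usual layer-norm -> layer -> dropout -> residual ordering; the wrapper's exact internals are not shown in this snippet, so treat this as an assumption:

    import tensorflow as tf

    def pre_post_process(x, layer_fn, postprocess_dropout, training, scope):
        # Assumed ordering: layer norm, wrapped layer, dropout, residual add.
        with tf.variable_scope(scope):
            y = tf.contrib.layers.layer_norm(x, begin_norm_axis=-1)
            y = layer_fn(y)
            if training:
                y = tf.nn.dropout(y, keep_prob=1.0 - postprocess_dropout)
            return x + y

    # Usage mirroring the feed_forward wrapping in the constructor above:
    inputs = tf.zeros([8, 20, 256])
    dense = tf.layers.Dense(units=256, use_bias=True)
    outputs = pre_post_process(inputs, dense, postprocess_dropout=0.1,
                               training=True, scope="feed_forward")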
Example #3
    def _embed_style(self, style_spec, style_len):
        """
    Code that implements the reference encoder as described in "Towards
    end-to-end prosody transfer for expressive speech synthesis with Tacotron",
    and "Style Tokens: Unsupervised Style Modeling, Control and Transfer in
    End-to-End Speech Synthesis"

    Config parameters:

    * **conv_layers** (list) --- See the conv_layers parameter for the
      Tacotron-2 model.
    * **num_rnn_layers** (int) --- Number of rnn layers in the reference encoder
    * **rnn_cell_dim** (int) --- Size of rnn layer
    * **rnn_unidirectional** (bool) --- Uni- or bi-directional rnn.
    * **rnn_type** --- Must be a valid tf rnn cell class
    * **emb_size** (int) --- Size of gst
    * **attention_layer_size** (int) --- Size of linear layers in attention
    * **num_tokens** (int) --- Number of tokens for gst
    * **num_heads** (int) --- Number of attention heads
    """
        training = (self._mode == "train")
        regularizer = self.params.get('regularizer', None)
        data_format = self.params.get('data_format', 'channels_last')
        batch_size = style_spec.get_shape().as_list()[0]

        top_layer = tf.expand_dims(style_spec, -1)
        params = self.params['style_embedding_params']
        if "conv_layers" in params:
            for i, conv_params in enumerate(params['conv_layers']):
                ch_out = conv_params['num_channels']
                kernel_size = conv_params['kernel_size']  # [time, freq]
                strides = conv_params['stride']
                padding = conv_params['padding']

                if padding == "VALID":
                    style_len = (style_len - kernel_size[0] +
                                 strides[0]) // strides[0]
                else:
                    style_len = (style_len + strides[0] - 1) // strides[0]

                top_layer = conv_bn_actv(
                    layer_type="conv2d",
                    name="conv{}".format(i + 1),
                    inputs=top_layer,
                    filters=ch_out,
                    kernel_size=kernel_size,
                    activation_fn=self.params['activation_fn'],
                    strides=strides,
                    padding=padding,
                    regularizer=regularizer,
                    training=training,
                    data_format=data_format,
                    bn_momentum=self.params.get('bn_momentum', 0.1),
                    bn_epsilon=self.params.get('bn_epsilon', 1e-5),
                )

            if data_format == 'channels_first':
                # conv2d output is 4-D; move channels back to the last axis
                top_layer = tf.transpose(top_layer, [0, 2, 3, 1])

        # collapse the frequency axis into channels: [B, T, F', C] -> [B, T, F'*C]
        top_layer = tf.concat(tf.unstack(top_layer, axis=2), axis=-1)

        num_rnn_layers = params['num_rnn_layers']
        if num_rnn_layers > 0:
            cell_params = {}
            cell_params["num_units"] = params['rnn_cell_dim']
            rnn_type = params['rnn_type']
            rnn_input = top_layer
            rnn_vars = []

            multirnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell([
                single_cell(cell_class=rnn_type,
                            cell_params=cell_params,
                            training=training,
                            residual_connections=False)
                for _ in range(num_rnn_layers)
            ])
            rnn_vars += multirnn_cell_fw.trainable_variables
            if params['rnn_unidirectional']:
                top_layer, final_state = tf.nn.dynamic_rnn(
                    cell=multirnn_cell_fw,
                    inputs=rnn_input,
                    sequence_length=style_len,
                    dtype=rnn_input.dtype,
                    time_major=False,
                )
                final_state = final_state[0]  # final state of the first RNN layer
            else:
                multirnn_cell_bw = tf.nn.rnn_cell.MultiRNNCell([
                    single_cell(cell_class=rnn_type,
                                cell_params=cell_params,
                                training=training,
                                residual_connections=False)
                    for _ in range(num_rnn_layers)
                ])
                top_layer, final_state = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=multirnn_cell_fw,
                    cell_bw=multirnn_cell_bw,
                    inputs=rnn_input,
                    sequence_length=style_len,
                    dtype=rnn_input.dtype,
                    time_major=False)
                # concat fw and bw final hidden states:
                # two [B, n_cell_dim] --> [B, 2*n_cell_dim]
                final_state = tf.concat(
                    (final_state[0][0].h, final_state[1][0].h), 1)
                rnn_vars += multirnn_cell_bw.trainable_variables

            top_layer = final_state
            # Apply linear layer
            top_layer = tf.layers.dense(top_layer,
                                        128,
                                        activation=tf.nn.tanh,
                                        kernel_regularizer=regularizer,
                                        name="reference_activation")
            if regularizer and training:
                cell_weights = rnn_vars
                for weights in cell_weights:
                    if "bias" not in weights.name:
                        # print("Added regularizer to {}".format(weights.name))
                        if weights.dtype.base_dtype == tf.float16:
                            tf.add_to_collection('REGULARIZATION_FUNCTIONS',
                                                 (weights, regularizer))
                        else:
                            tf.add_to_collection(
                                ops.GraphKeys.REGULARIZATION_LOSSES,
                                regularizer(weights))

        num_units = params["num_tokens"]
        att_size = params["attention_layer_size"]

        # Randomly initialized tokens
        gst_embedding = tf.get_variable(
            "token_embeddings",
            shape=[num_units, params["emb_size"]],
            dtype=self.params["dtype"],
            initializer=tf.random_uniform_initializer(
                minval=-1., maxval=1., dtype=self.params["dtype"]),
            trainable=False)

        attention = attention_layer.Attention(params["attention_layer_size"],
                                              params["num_heads"],
                                              0.,
                                              training,
                                              mode="bahdanau")

        top_layer = tf.expand_dims(top_layer, 1)  # query: [B, 1, 128]
        gst_embedding = tf.nn.tanh(gst_embedding)
        gst_embedding = tf.expand_dims(gst_embedding, 0)
        gst_embedding = tf.tile(gst_embedding, [batch_size, 1, 1])  # keys: [B, num_tokens, emb_size]
        token_embeddings = attention(top_layer, gst_embedding, None)
        token_embeddings = tf.squeeze(token_embeddings, 1)  # per-utterance style embedding

        return token_embeddings
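
A hypothetical style_embedding_params block wiring together the config keys documented and read above; the concrete values are illustrative only and do not come from any shipped config:

    import tensorflow as tf

    style_embedding_params = {
        "conv_layers": [
            {"num_channels": 32, "kernel_size": [3, 3],
             "stride": [2, 2], "padding": "SAME"},
            {"num_channels": 32, "kernel_size": [3, 3],
             "stride": [2, 2], "padding": "SAME"},
        ],
        "num_rnn_layers": 1,
        "rnn_cell_dim": 128,
        "rnn_unidirectional": True,
        "rnn_type": tf.nn.rnn_cell.GRUCell,  # any valid tf rnn cell class
        "emb_size": 512,                     # size of each GST token embedding
        "attention_layer_size": 128,
        "num_tokens": 10,
        "num_heads": 4,
    }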