Example #1
def create_attention_mechanism(attention_option, num_units, memory,
                               source_sequence_length):
    """Create attention mechanism based on the attention_option."""
    if attention_option == "luong":
        attention_mechanism = LuongAttention(
            num_units, memory, memory_sequence_length=source_sequence_length)
    elif attention_option == "scaled_luong":
        attention_mechanism = LuongAttention(
            num_units,
            memory,
            memory_sequence_length=source_sequence_length,
            scale=True)
    elif attention_option == "bahdanau":
        attention_mechanism = BahdanauAttention(
            num_units, memory, memory_sequence_length=source_sequence_length)
    elif attention_option == "normed_bahdanau":
        attention_mechanism = BahdanauAttention(
            num_units,
            memory,
            memory_sequence_length=source_sequence_length,
            normalize=True)
    elif attention_option == "multi_head":
        attention_mechanism = MultiHeadAttention(
            num_units,
            memory,
            memory_sequence_length=source_sequence_length,
            num_heads=4)
    else:
        raise ValueError("Unknown attention option %s" % attention_option)

    return attention_mechanism
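
# --- Hedged usage sketch (not part of the snippet above) ---
# The mechanism returned by create_attention_mechanism is typically attached to a decoder
# cell with tf.contrib.seq2seq.AttentionWrapper. LuongAttention/BahdanauAttention come from
# tf.contrib.seq2seq; "MultiHeadAttention" above is assumed to be a project-specific class.
import tensorflow as tf
from tensorflow.contrib.seq2seq import AttentionWrapper

def wrap_decoder_cell(decoder_cell, attention_option, num_units, memory,
                      source_sequence_length):
    """Attach the selected attention mechanism to an existing decoder cell."""
    mechanism = create_attention_mechanism(attention_option, num_units, memory,
                                           source_sequence_length)
    return AttentionWrapper(decoder_cell,
                            attention_mechanism=mechanism,
                            attention_layer_size=num_units)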
Example #2
    def _create_decoder_cell(self):
        enc_outputs, enc_states, enc_seq_len = self.enc_outputs, self.enc_states, self.enc_seq_len
        batch_size = self.batch_size * self.cfg.beam_size if self.use_beam_search else self.batch_size
        with tf.variable_scope("attention"):
            if self.cfg.attention == "luong":  # Luong attention mechanism
                attention_mechanism = LuongAttention(
                    num_units=self.cfg.num_units,
                    memory=enc_outputs,
                    memory_sequence_length=enc_seq_len)
            else:  # default using Bahdanau attention mechanism
                attention_mechanism = BahdanauAttention(
                    num_units=self.cfg.num_units,
                    memory=enc_outputs,
                    memory_sequence_length=enc_seq_len)

        def cell_input_fn(
            inputs, attention
        ):  # define cell input function to keep input/output dimension same
            # reference: https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/AttentionWrapper
            if not self.cfg.use_attention_input_feeding:
                return inputs
            input_project = tf.layers.Dense(self.cfg.num_units,
                                            dtype=tf.float32,
                                            name='attn_input_feeding')
            return input_project(tf.concat([inputs, attention], axis=-1))

        if self.cfg.top_attention:  # apply attention mechanism only on the top decoder layer
            cells = [
                self._create_rnn_cell() for _ in range(self.cfg.num_layers)
            ]
            cells[-1] = AttentionWrapper(
                cells[-1],
                attention_mechanism=attention_mechanism,
                name="Attention_Wrapper",
                attention_layer_size=self.cfg.num_units,
                initial_cell_state=enc_states[-1],
                cell_input_fn=cell_input_fn)
            initial_state = [state for state in enc_states]
            initial_state[-1] = cells[-1].zero_state(batch_size=batch_size,
                                                     dtype=tf.float32)
            dec_init_states = tuple(initial_state)
            cells = MultiRNNCell(cells)
        else:
            cells = MultiRNNCell(
                [self._create_rnn_cell() for _ in range(self.cfg.num_layers)])
            cells = AttentionWrapper(cells,
                                     attention_mechanism=attention_mechanism,
                                     name="Attention_Wrapper",
                                     attention_layer_size=self.cfg.num_units,
                                     initial_cell_state=enc_states,
                                     cell_input_fn=cell_input_fn)
            dec_init_states = cells.zero_state(
                batch_size=batch_size,
                dtype=tf.float32).clone(cell_state=enc_states)
        return cells, dec_init_states
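
# --- Hedged usage sketch (assumed names, not from the original model) ---
# The (cells, dec_init_states) pair returned by _create_decoder_cell is normally consumed by
# a training decoder; dec_inputs, dec_seq_len and vocab_size are illustrative placeholders,
# and the beam-search path would instead feed the tiled initial state to a BeamSearchDecoder.
helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_inputs,            # [batch, time, embed]
                                           sequence_length=dec_seq_len)
decoder = tf.contrib.seq2seq.BasicDecoder(cell=cells,
                                          helper=helper,
                                          initial_state=dec_init_states,
                                          output_layer=tf.layers.Dense(vocab_size))
outputs, final_state, _ = tf.contrib.seq2seq.dynamic_decode(decoder)
logits = outputs.rnn_output  # [batch, time, vocab_size]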
Example #3
def apply_attention(cell_dec,
                    enc_hidden_states,
                    enc_final_state,
                    input_length,
                    batch_size,
                    attention_probability_fn,
                    dropout,
                    alignment_history=True):

    if attention_probability_fn == 'softmax':
        probability_fn = tf.nn.softmax
        score_mask_value = float('-inf')
    elif attention_probability_fn == 'hardmax':
        probability_fn = tf.contrib.seq2seq.hardmax
        score_mask_value = float('-inf')
    elif attention_probability_fn == 'sparsemax':

        def sparsemax(attentionscores):
            attentionscores = tf.contrib.sparsemax.sparsemax(attentionscores)
            with tf.control_dependencies([
                    tf.assert_non_negative(attentionscores),
                    tf.assert_less_equal(attentionscores, 1., summarize=60)
            ]):
                return tf.identity(attentionscores)

        probability_fn = sparsemax
        # sparsemax does not deal with -inf properly, and has significant numerical stability issues
        # with large numbers (positive or negative)
        score_mask_value = -1e+5
    else:
        raise ValueError("Invalid attention_probability_fn " +
                         str(attention_probability_fn))

    with tf.variable_scope(
            'attention',
            initializer=tf.initializers.identity(dtype=tf.float32)):
        attention = LuongAttention(int(cell_dec.output_size),
                                   enc_hidden_states,
                                   memory_sequence_length=input_length,
                                   probability_fn=probability_fn,
                                   score_mask_value=score_mask_value)
    cell_dec = AttentionWrapper(cell_dec,
                                attention,
                                cell_input_fn=lambda inputs, _: inputs,
                                attention_layer_size=int(cell_dec.output_size),
                                alignment_history=alignment_history,
                                initial_cell_state=enc_final_state)
    enc_final_state = cell_dec.zero_state(batch_size, dtype=tf.float32)

    cell_dec = ActivationWrapper(cell_dec, activation=tf.tanh)
    cell_dec = NotBrokenDropoutWrapper(cell_dec, output_keep_prob=dropout)

    return cell_dec, enc_final_state
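
# --- Hedged sketch of why score_mask_value is finite for sparsemax (toy tensors only) ---
# The attention mechanism replaces scores at padded memory positions with score_mask_value
# before probability_fn is applied; sparsemax does not handle -inf well, so a large finite
# negative value keeps padded positions at (near) zero weight without producing NaNs.
scores = tf.constant([[2.0, 1.0, 0.5, 0.3]])
valid = tf.sequence_mask([2], maxlen=4)                                  # first 2 positions are real
masked = tf.where(valid, scores, tf.fill(tf.shape(scores), -1e5))
sparse_probs = tf.contrib.sparsemax.sparsemax(masked)                    # padded positions get ~0 weight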
Example #4
    def add_decoder_op(self, enc_final_state, enc_hidden_states,
                       output_embed_matrix, training):
        cell_dec = tf.contrib.rnn.MultiRNNCell([
            self.make_rnn_cell(i, True) for i in range(self.config.rnn_layers)
        ])

        encoder_hidden_size = int(enc_hidden_states.get_shape()[-1])
        decoder_hidden_size = int(cell_dec.output_size)

        # if encoder and decoder have different sizes, add a projection layer
        if encoder_hidden_size != decoder_hidden_size:
            assert False, (encoder_hidden_size, decoder_hidden_size)
            with tf.variable_scope('hidden_projection'):
                kernel = tf.get_variable(
                    'kernel', (encoder_hidden_size, decoder_hidden_size),
                    dtype=tf.float32)

                # apply a relu to the projection for good measure
                enc_final_state = nest.map_structure(
                    lambda x: tf.nn.relu(tf.matmul(x, kernel)),
                    enc_final_state)
                enc_hidden_states = tf.nn.relu(
                    tf.tensordot(enc_hidden_states, kernel, [[2], [1]]))
        else:
            # flatten and repack the state
            enc_final_state = nest.pack_sequence_as(
                cell_dec.state_size, nest.flatten(enc_final_state))

        cell_dec = ParentFeedingCellWrapper(cell_dec, enc_final_state)
        if self.config.apply_attention:
            attention = LuongAttention(self.config.decoder_hidden_size,
                                       enc_hidden_states,
                                       self.input_length_placeholder,
                                       probability_fn=tf.nn.softmax)
            cell_dec = AttentionWrapper(
                cell_dec,
                attention,
                cell_input_fn=lambda inputs, _: inputs,
                attention_layer_size=self.config.decoder_hidden_size,
                initial_cell_state=enc_final_state)
            enc_final_state = cell_dec.zero_state(self.batch_size,
                                                  dtype=tf.float32)
        decoder = Seq2SeqDecoder(self.config, self.input_placeholder,
                                 self.input_length_placeholder,
                                 self.output_placeholder,
                                 self.output_length_placeholder,
                                 self.batch_number_placeholder)
        return decoder.decode(cell_dec, enc_final_state,
                              self.config.grammar.output_size,
                              output_embed_matrix, training)
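
# --- Hedged sketch of the nest flatten/repack trick used above (assumed toy shapes) ---
# nest.flatten turns a nested state into a flat list of tensors and nest.pack_sequence_as
# rebuilds it with the structure the decoder cell expects; this only works when both
# structures flatten to the same number of compatible tensors.
from tensorflow.python.util import nest

lstm_state = tf.nn.rnn_cell.LSTMStateTuple(c=tf.zeros([8, 128]), h=tf.zeros([8, 128]))
flat = nest.flatten(lstm_state)                      # [c, h]
repacked = nest.pack_sequence_as((128, 128), flat)   # plain 2-tuple holding the same tensors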
Example #5
    def add_decoder_op(self, enc_final_state, enc_hidden_states, output_embed_matrix, training):
        cell_dec = tf.contrib.rnn.MultiRNNCell([self.make_rnn_cell(i, for_decoder=True) for i in range(self.config.rnn_layers)])

        encoder_hidden_size = int(enc_hidden_states.get_shape()[-1])
        decoder_hidden_size = int(cell_dec.output_size)
        
        # if encoder and decoder have different sizes, add a projection layer
        if encoder_hidden_size != decoder_hidden_size:
            assert False, (encoder_hidden_size, decoder_hidden_size)
            with tf.variable_scope('hidden_projection'):
                kernel = tf.get_variable('kernel', (encoder_hidden_size, decoder_hidden_size), dtype=tf.float32)
            
                # apply a relu to the projection for good measure
                enc_final_state = nest.map_structure(lambda x: tf.nn.relu(tf.matmul(x, kernel)), enc_final_state)
                enc_hidden_states = tf.nn.relu(tf.tensordot(enc_hidden_states, kernel, [[2], [1]]))
        else:
            # flatten and repack the state
            enc_final_state = nest.pack_sequence_as(cell_dec.state_size, nest.flatten(enc_final_state))

        # to use these we need to tile the final encoder state / the memory
        # but that conflicts with our use of cell_dec on untiled inputs for the gold
        #cell_dec = ParentFeedingCellWrapper(cell_dec, tf.contrib.seq2seq.tile_batch(enc_final_state, self.config.beam_size))
        if self.config.apply_attention and False:
            attention = LuongAttention(decoder_hidden_size, enc_hidden_states, self.input_length_placeholder,
                                       probability_fn=tf.nn.softmax)
            cell_dec = AttentionWrapper(cell_dec, attention,
                                        cell_input_fn=lambda inputs, _: inputs,
                                        attention_layer_size=decoder_hidden_size,
                                        initial_cell_state=enc_final_state)
            enc_final_state = cell_dec.zero_state(self.batch_size, dtype=tf.float32)
        
        print('enc_final_state', enc_final_state)
        linear_layer = tf_core_layers.Dense(self.config.output_size)
        go_vector = tf.ones((self.batch_size,), dtype=tf.int32) * self.config.grammar.start
        decoder = BeamSearchOptimizationDecoder(training, cell_dec, output_embed_matrix, go_vector, self.config.grammar.end,
                                                enc_final_state,
                                                beam_width=self.config.beam_size, output_layer=linear_layer,
                                                gold_sequence=self.output_placeholder if training else None,
                                                gold_sequence_length=(self.output_length_placeholder+1) if training else None)
        
        if self.config.use_grammar_constraints:
            raise NotImplementedError("Grammar constraints are not implemented for the beam search yet")
        
        # dynamic_decode craps itself if we pass output_time_major=False, as it tries to transpose
        # the loss vector
        final_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=True, maximum_iterations=self.config.max_length)
        return final_outputs
Example #6
    def build_decoder_cell(self, encoder_outputs, encoder_state):
        """
        Build the decoder cell.
        :param encoder_outputs:
        :param encoder_state:
        :return:
        """
        encoder_input_length = self.encoder_inputs_length
        batch_size = self.batch_size

        if self.bidirection:
            encoder_state = encoder_state[-self.depth:]

        if self.time_major:
            encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

        if self.use_beamsearch_decode:
            # Tile each tensor beam_width times
            encoder_outputs = seq2seq.tile_batch(
                encoder_outputs, multiplier=self.beam_width
            )
            encoder_state = seq2seq.tile_batch(
                encoder_state, multiplier=self.beam_width
            )
            encoder_input_length = seq2seq.tile_batch(
                self.encoder_inputs_length, multiplier=self.beam_width
            )
            batch_size *= self.beam_width

        if self.attention_type.lower() == 'luong':
            self.attention_mechanism = LuongAttention(
                num_units=self.hidden_size,
                memory=encoder_outputs,
                memory_sequence_length=encoder_input_length
            )
        else:
            self.attention_mechanism = BahdanauAttention(
                num_units=self.hidden_size,
                memory=encoder_outputs,
                memory_sequence_length=encoder_input_length
            )

        cell = MultiRNNCell([
            self.build_single_cell(
                self.hidden_size,
                use_residual=self.use_residual)
            for _ in range(self.depth)
        ])

        alignment_history = (
            self.mode != 'train' and not self.use_beamsearch_decode
        )

        def cell_input_fn(inputs, attention):
            if not self.use_residual:
                return array_ops.concat([inputs, attention], -1)

            attn_projection = layers.Dense(self.hidden_size,
                                           dtype=tf.float32,
                                           use_bias=False,
                                           name='attention_cell_input_fn')
            return attn_projection(array_ops.concat([inputs, attention], -1))

        cell = AttentionWrapper(
                                cell=cell,
                                attention_mechanism=self.attention_mechanism,
                                attention_layer_size=self.hidden_size,
                                alignment_history=alignment_history,
                                cell_input_fn=cell_input_fn,
                                name='Attention_Wrapper'
        )

        decoder_initial_state = cell.zero_state(
            batch_size, tf.float32)

        # Pass the encoder state to the decoder
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=encoder_state
        )

        return cell, decoder_initial_state
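
# --- Hedged sketch of the beam-search decoding path implied by use_beamsearch_decode ---
# (not part of the class above; embedding, start_token, end_token, output_layer and
# max_decode_len are illustrative placeholders). Because memory, state and lengths were
# already tiled with seq2seq.tile_batch, the returned cell and initial state can feed a
# BeamSearchDecoder directly.
cell, decoder_initial_state = self.build_decoder_cell(encoder_outputs, encoder_state)
bs_decoder = seq2seq.BeamSearchDecoder(cell=cell,
                                       embedding=embedding,
                                       start_tokens=tf.fill([self.batch_size], start_token),
                                       end_token=end_token,
                                       initial_state=decoder_initial_state,
                                       beam_width=self.beam_width,
                                       output_layer=output_layer)
outputs, _, _ = seq2seq.dynamic_decode(bs_decoder, maximum_iterations=max_decode_len)
predicted_ids = outputs.predicted_ids  # [batch, time, beam_width]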
Example #7
    def build_decoder_cell(self, encoder_outputs, encoder_state):
        """Build the decoder cell."""

        encoder_inputs_length = self.encoder_inputs_length
        batch_size = self.batch_size
        
        # The encoder state is used to initialize the decoder; with a bidirectional encoder only the last 'depth' layers are needed
        if self.bidirectional:
            encoder_state = encoder_state[-self.depth:]

        if self.time_major:
            encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

        # When using BeamSearchDecoder, several tensors must be tiled by beam_width

        if self.use_beamsearch_decode:
            # Tile encoder_outputs by the multiplier
            encoder_outputs = seq2seq.tile_batch(
                encoder_outputs, multiplier=self.beam_width)
            encoder_state = seq2seq.tile_batch(
                encoder_state, multiplier=self.beam_width)
            encoder_inputs_length = seq2seq.tile_batch(
                self.encoder_inputs_length, multiplier=self.beam_width)
            # With beam search the effective batch size is beam_width times larger
            batch_size *= self.beam_width

        # Two different attention mechanisms are supported
        # https://blog.csdn.net/u010960155/article/details/82853632
        if self.attention_type.lower() == 'luong':
            # 'Luong' style attention: https://arxiv.org/abs/1508.04025
            self.attention_mechanism = LuongAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length
            )
        else: # Default Bahdanau
            # 'Bahdanau' style attention: https://arxiv.org/abs/1409.0473
            self.attention_mechanism = BahdanauAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length
            )

        # Define the multi-layer RNN
        cell = MultiRNNCell([
            self.build_single_cell(
                self.hidden_units,
                use_residual=self.use_residual
            )
            for _ in range(self.depth)
        ])

        # Keep the attention alignment history only in inference mode without beam search
        alignment_history = (
            self.mode != 'train' and not self.use_beamsearch_decode
        )

        def cell_input_fn(inputs, attention):
            """Decide, based on the attn_input_feeding setting, whether to project
            the input before the attention computation; with residual connections
            a projection is needed first.
            """
            # Without residual connections, just concatenate
            if not self.use_residual:
                return array_ops.concat([inputs, attention], -1)
            # With residual connections, project first
            attn_projection = layers.Dense(self.hidden_units,
                                           dtype=tf.float32,
                                           use_bias=False,
                                           name='attention_cell_input_fn')
            return attn_projection(array_ops.concat([inputs, attention], -1))
        # Wrap the base cell with the attention mechanism
        cell = AttentionWrapper(
            cell=cell,
            attention_mechanism=self.attention_mechanism,  # attention mechanism type
            attention_layer_size=self.hidden_units,  # number of hidden units
            alignment_history=alignment_history,
            cell_input_fn=cell_input_fn,  # how inputs are fed into the cell
            name='Attention_Wrapper')

        # Start from an all-zero decoder state
        decoder_initial_state = cell.zero_state(
            batch_size, tf.float32)
        
        
        # Make sure decoder_initial_state and encoder_state have the same dtype
        # Use the encoder state to initialize the decoder
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=encoder_state)
        
        # Return the decoder cell and its initial state
        return cell, decoder_initial_state
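
# --- Hedged sketch of greedy (non-beam) inference with the cell returned above ---
# (not part of the class; embedding, start_token, end_token, self.vocab_size and
# self.max_decode_step are illustrative placeholders).
cell, decoder_initial_state = self.build_decoder_cell(encoder_outputs, encoder_state)
helper = seq2seq.GreedyEmbeddingHelper(embedding=embedding,
                                       start_tokens=tf.fill([self.batch_size], start_token),
                                       end_token=end_token)
decoder = seq2seq.BasicDecoder(cell=cell,
                               helper=helper,
                               initial_state=decoder_initial_state,
                               output_layer=layers.Dense(self.vocab_size))
outputs, _, _ = seq2seq.dynamic_decode(decoder, maximum_iterations=self.max_decode_step)
predicted_ids = outputs.sample_id  # [batch, time]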
Example #8
    def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
    ):

        is_training2 = linear_targets is not None  # this also becomes True at test time; is that intended???
        is_training = not rnn_decoder_test_mode

        self.is_randomly_initialized = is_randomly_initialized

        with tf.variable_scope('inference') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings(256)
            char_embed_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            zero_pad = True
            if zero_pad:  # logic borrowed from a transformer implementation
                # The <PAD> (index 0) embedding is fixed to zero and is not trained, i.e. the
                # first row of the variable created by get_variable above is effectively unused.
                char_embed_table = tf.concat(
                    (tf.zeros(shape=[1, hp.embedding_size]),
                     char_embed_table[1:, :]), 0)

            # [N, T_in, embedding_size]
            char_embedded_inputs = tf.nn.embedding_lookup(
                char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:  # speaker_embedding_size = f(16)
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(
                            speaker_id, self.num_speakers,
                            hp.enc_prenet_sizes[-1], "before_highway"
                        )  # 'enc_prenet_sizes': [f(256), f(128)]
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [
                            get_embed(
                                speaker_id, self.num_speakers, hp.dec_rnn_size,
                                "decoder_rnn_init_states{}".format(idx + 1))
                            for idx in range(hp.dec_layer_num)
                        ]
                    else:
                        deep_dense = lambda x, dim: tf.layers.dense(
                            x, dim, activation=tf.nn.softsign
                        )  # softsign: x / (abs(x) + 1)

                        before_highway = deep_dense(speaker_embed,
                                                    hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]

                    speaker_embed = None  # deepvoice does not use speaker_embed directly
                elif hp.model_type == 'simple':
                    # The simple model feeds speaker_embed into DecoderPrenetWrapper and ConcatOutputAndAttentionWrapper and concatenates it there.
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None
                else:
                    raise Exception(
                        " [!] Unknown multi-speaker model type: {}".format(
                            hp.model_type))
            else:
                # single-speaker case (self.num_speakers == 1)
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None  # init state of the bidirectional GRU
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            ##############
            # Encoder
            ##############

            # [N, T_in, enc_prenet_sizes[-1]]
            prenet_outputs = prenet(
                char_embedded_inputs,
                is_training,
                hp.enc_prenet_sizes,
                hp.dropout_prob,
                scope='prenet'
            )  # 'enc_prenet_sizes': [f(256), f(128)],  dropout_prob = 0.5
            # ==> (N, T_in, 128)

            # enc_rnn_size = 128
            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            ##############
            # Attention
            ##############

            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )

            # single: attention_size = 128
            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=False)
            elif hp.attention_type == 'bah_mon_norm':  # added by hccho
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loc_sen':  # Location Sensitivity Attention
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    hp.attention_size,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah_mon_norm_hccho':
                attention_mechanism = BahdanauMonotonicAttention_hccho(
                    hp.attention_size, encoder_outputs, normalize=True)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            else:
                raise Exception(" [!] Unknown attention type: {}".format(
                    hp.attention_type))

            # Combine DecoderPrenetWrapper and attention_mechanism into an AttentionWrapper.
            # carpedm20 re-implemented AttentionWrapper from the TensorFlow source code, whereas Keith Ito simply used TensorFlow's AttentionWrapper.
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_state_size),
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False
            )  # Note output_attention=False and that attention_layer_size is left unset, so the attention output equals the context vector.

            # attention_state_size = 256
            dec_prenet_outputs = DecoderPrenetWrapper(
                attention_cell, speaker_embed, is_training,
                hp.dec_prenet_sizes,
                hp.dropout_prob)  # dec_prenet_sizes =  [f(256), f(128)]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            # [N, T_in, attention_size+attention_state_size]

            # From the AttentionWrapperState members (attention, cell_state, ...) that dec_prenet_outputs
            # passes to the next cell, the attention and the output are concatenated and emitted as the new output.
            # Because the output equals the cell_state, the result is concat[ output(=cell_state) | attention ]
            concat_cell = ConcatOutputAndAttentionWrapper(
                dec_prenet_outputs, embed_to_concat=speaker_embed
            )  # builds a new output via concat(output, attention, speaker_embed)

            # Decoder (layers specified bottom to top):  dec_rnn_size= 256
            cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)
                     ]  # OutputProjectionWrapper does not seem to be mentioned in the paper...
            for _ in range(hp.dec_layer_num):  # hp.dec_layer_num = 2
                cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

            # [N, T_in, 256]
            decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor
            )  # could this be modified to also emit a stop token, i.e. (hp.num_mels+1) * hp.reduction_factor?
            decoder_init_state = output_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32
            )  # zero_state here already contains the initial values supplied to the AttentionWrapper above

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied via the AttentionWrapper's initial_cell_state)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(
                            " [!] Shape {} and {} should be equal".format(
                                shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training2:
                # rnn_decoder_test_mode = True in test mode, False in train mode
                helper = TacoTrainingHelper(
                    inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                    rnn_decoder_test_mode)  # inputs is only used to compute batch_size
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs, _), final_decoder_state, _ = \
                    tf.contrib.seq2seq.dynamic_decode(BasicDecoder(output_cell, helper, decoder_init_state),maximum_iterations=hp.max_iters)  # max_iters=200

            # [N, T_out, M]
            mel_outputs = tf.reshape(decoder_outputs,
                                     [batch_size, -1, hp.num_mels])

            # Add post-processing CBHG:
            # [N, T_out, 256]
            #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = tf.concat(
                    [tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            # Because the MultiRNNCell has three layers, final_decoder_state is a length-3 tuple ==> final_decoder_state[0]
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(),
                [1, 2, 0
                 ])  # batch_size, text length(encoder), target length(decoder)

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('    prenet out:               %d' % prenet_outputs.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    concat attn & out:        %d' % concat_cell.output_size)
            log('    decoder cell out:         %d' % decoder_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder out (1 frame):    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
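
# --- Hedged sketch (assumed helper, not part of the model) ---
# self.alignments above has shape [batch, encoder_length, decoder_length], so a single
# attention map can be visualized after fetching it in a session run.
import matplotlib.pyplot as plt

def plot_alignment(alignment, path='alignment.png'):
    """alignment: [encoder_length, decoder_length] numpy array, e.g. alignments[0]."""
    plt.figure(figsize=(8, 6))
    plt.imshow(alignment, aspect='auto', origin='lower', interpolation='none')
    plt.xlabel('Decoder timestep')
    plt.ylabel('Encoder timestep')
    plt.colorbar()
    plt.savefig(path)
    plt.close()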
Example #9
    def build_decoder_cell(self, encoder_outputs, encoder_state):
        """Build the decoder cell."""

        encoder_inputs_length = self.encoder_inputs_length
        batch_size = self.batch_size

        if self.bidirectional:
            encoder_state = encoder_state[-self.depth:]

        if self.time_major:
            encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

        # When using BeamSearchDecoder, several tensors must be tiled by beam_width
        # encoder_outputs, encoder_state, encoder_inputs_length
        # needs to be tiled so that:
        # [batch_size, .., ..] -> [batch_size x beam_width, .., ..]
        if self.use_beamsearch_decode:
            encoder_outputs = seq2seq.tile_batch(encoder_outputs,
                                                 multiplier=self.beam_width)
            encoder_state = seq2seq.tile_batch(encoder_state,
                                               multiplier=self.beam_width)
            encoder_inputs_length = seq2seq.tile_batch(
                self.encoder_inputs_length, multiplier=self.beam_width)
            # With beam search the effective batch size is beam_width times larger
            batch_size *= self.beam_width

        # Two different attention mechanisms are supported
        if self.attention_type.lower() == 'luong':
            # 'Luong' style attention: https://arxiv.org/abs/1508.04025
            self.attention_mechanism = LuongAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length)
        else:  # Default Bahdanau
            # 'Bahdanau' style attention: https://arxiv.org/abs/1409.0473
            self.attention_mechanism = BahdanauAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length)

        # Building decoder_cell
        cell = MultiRNNCell([
            self.build_single_cell(self.hidden_units,
                                   use_residual=self.use_residual)
            for _ in range(self.depth)
        ])

        # Keep the attention alignment history only in inference mode without beam search
        alignment_history = (self.mode != 'train'
                             and not self.use_beamsearch_decode)

        def cell_input_fn(inputs, attention):
            """Decide, based on the attn_input_feeding setting, whether to project the input before the attention computation.
            """
            if not self.use_residual:
                return array_ops.concat([inputs, attention], -1)

            attn_projection = layers.Dense(self.hidden_units,
                                           dtype=tf.float32,
                                           use_bias=False,
                                           name='attention_cell_input_fn')
            return attn_projection(array_ops.concat([inputs, attention], -1))

        cell = AttentionWrapper(cell=cell,
                                attention_mechanism=self.attention_mechanism,
                                attention_layer_size=self.hidden_units,
                                alignment_history=alignment_history,
                                cell_input_fn=cell_input_fn,
                                name='Attention_Wrapper')

        # Start from a zero state
        decoder_initial_state = cell.zero_state(batch_size, tf.float32)

        # Pass the encoder state to the decoder
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=encoder_state)

        # if self.use_beamsearch_decode:
        #     decoder_initial_state = seq2seq.tile_batch(
        #         decoder_initial_state, multiplier=self.beam_width)

        return cell, decoder_initial_state
Example #10
    def initialize(self,
                   inputs,
                   input_lengths,
                   num_speakers,
                   speaker_id=None,
                   mel_targets=None,
                   linear_targets=None,
                   is_training=False,
                   loss_coeff=None,
                   stop_token_targets=None):

        with tf.variable_scope('Eembedding') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings(256)
            char_embed_table = tf.get_variable(
                'inputs_embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            zero_pad = True
            if zero_pad:  # logic borrowed from a transformer implementation
                # The <PAD> (index 0) embedding is fixed to zero and is not trained, i.e. the
                # first row of the variable created by get_variable above is effectively unused.
                char_embed_table = tf.concat(
                    (tf.zeros(shape=[1, hp.embedding_size]),
                     char_embed_table[1:, :]), 0)

            # [N, T_in, embedding_size]
            char_embedded_inputs = tf.nn.embedding_lookup(
                char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table,
                                                       speaker_id)

                deep_dense = lambda x, dim, name: tf.layers.dense(
                    x, dim, activation=tf.nn.softsign, name=name
                )  # softsign: x / (abs(x) + 1)

                encoder_rnn_init_state = deep_dense(
                    speaker_embed, hp.encoder_lstm_units * 4,
                    'encoder_init_dense')  # hp.encoder_lstm_units = 256

                decoder_rnn_init_states = [
                    deep_dense(speaker_embed, hp.decoder_lstm_units * 2,
                               'decoder_init_dense_{}'.format(i))
                    for i in range(hp.decoder_layers)
                ]  # hp.decoder_lstm_units = 1024

                speaker_embed = None
            else:
                # single-speaker case (self.num_speakers == 1)
                speaker_embed = None
                encoder_rnn_init_state = None  # init state of the bidirectional GRU
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

        with tf.variable_scope('Encoder') as scope:
            ##############
            # Encoder
            ##############
            x = char_embedded_inputs
            for i in range(hp.enc_conv_num_layers):
                x = tf.layers.conv1d(x,
                                     filters=hp.enc_conv_channels,
                                     kernel_size=hp.enc_conv_kernel_size,
                                     padding='same',
                                     activation=tf.nn.relu,
                                     name='Encoder_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=hp.dropout_prob,
                                      training=is_training,
                                      name='dropout_{}'.format(i))

            if encoder_rnn_init_state is not None:
                initial_state_fw_c, initial_state_fw_h, initial_state_bw_c, initial_state_bw_h = tf.split(
                    encoder_rnn_init_state, 4, 1)
                initial_state_fw = LSTMStateTuple(initial_state_fw_c,
                                                  initial_state_fw_h)
                initial_state_bw = LSTMStateTuple(initial_state_bw_c,
                                                  initial_state_bw_h)
            else:  # single mode
                initial_state_fw, initial_state_bw = None, None

            cell_fw = ZoneoutLSTMCell(
                hp.encoder_lstm_units,
                is_training,
                zoneout_factor_cell=hp.tacotron_zoneout_rate,
                zoneout_factor_output=hp.tacotron_zoneout_rate,
                name='encoder_fw_LSTM')
            cell_bw = ZoneoutLSTMCell(
                hp.encoder_lstm_units,
                is_training,
                zoneout_factor_cell=hp.tacotron_zoneout_rate,
                zoneout_factor_output=hp.tacotron_zoneout_rate,
                name='encoder_bw_LSTM')
            encoder_conv_output = x
            outputs, states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                encoder_conv_output,
                sequence_length=input_lengths,
                initial_state_fw=initial_state_fw,
                initial_state_bw=initial_state_bw,
                dtype=tf.float32)

            # encoder_outputs = [N,T,2*encoder_lstm_units] = [N,T,512]
            encoder_outputs = tf.concat(
                outputs,
                axis=2)  # Concat and return forward + backward outputs

        with tf.variable_scope('Decoder') as scope:

            ##############
            # Attention
            ##############
            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=False)
            elif hp.attention_type == 'bah_mon_norm':  # added by hccho
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loc_sen':  # Location Sensitivity Attention
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_size,
                    encoder_outputs,
                    hparams=hp,
                    is_training=is_training,
                    mask_encoder=hp.mask_encoder,
                    memory_sequence_length=input_lengths,
                    smoothing=hp.smoothing,
                    cumulate_weights=hp.cumulative_weights)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    hp.attention_size,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            else:
                raise Exception(" [!] Unknown attention type: {}".format(
                    hp.attention_type))

            decoder_lstm = [
                ZoneoutLSTMCell(hp.decoder_lstm_units,
                                is_training,
                                zoneout_factor_cell=hp.tacotron_zoneout_rate,
                                zoneout_factor_output=hp.tacotron_zoneout_rate,
                                name='decoder_LSTM_{}'.format(i + 1))
                for i in range(hp.decoder_layers)
            ]

            decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm,
                                                       state_is_tuple=True)
            decoder_init_state = decoder_lstm.zero_state(
                batch_size=batch_size, dtype=tf.float32
            )  # zero_state here already contains the initial values supplied to the AttentionWrapper above

            if hp.model_type == "multi-speaker":

                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx][0].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1[1] * 2 != shape2[1]:
                        raise Exception(
                            " [!] Shape {} and {} should be equal".format(
                                shape1, shape2))
                    c, h = tf.split(cell, 2, 1)
                    decoder_init_state[idx] = LSTMStateTuple(c, h)

                decoder_init_state = tuple(decoder_init_state)

            attention_cell = AttentionWrapper(
                decoder_lstm,
                attention_mechanism,
                initial_cell_state=decoder_init_state,
                alignment_history=True,
                output_attention=False
            )  # Note output_attention=False and that attention_layer_size is left unset, so the attention output equals the context vector.

            # attention_state_size = 256
            # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
            dec_prenet_outputs = DecoderWrapper(attention_cell, is_training,
                                                hp.dec_prenet_sizes,
                                                hp.dropout_prob,
                                                hp.inference_prenet_dropout)

            dec_outputs_cell = OutputProjectionWrapper(
                dec_prenet_outputs, (hp.num_mels + 1) * hp.reduction_factor)

            if is_training:
                helper = TacoTrainingHelper(
                    mel_targets, hp.num_mels,
                    hp.reduction_factor)  # inputs is only used to compute batch_size
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            decoder_init_state = dec_outputs_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)
            (decoder_outputs, _), final_decoder_state, _ = \
                    tf.contrib.seq2seq.dynamic_decode(BasicDecoder(dec_outputs_cell, helper, decoder_init_state),maximum_iterations=int(hp.max_n_frame/hp.reduction_factor))  # max_iters=200

            decoder_mel_outputs = tf.reshape(
                decoder_outputs[:, :, :hp.num_mels * hp.reduction_factor],
                [batch_size, -1, hp.num_mels
                 ])  # [N,iters,400] -> [N,5*iters,80]
            stop_token_outputs = tf.reshape(
                decoder_outputs[:, :, hp.num_mels * hp.reduction_factor:],
                [batch_size, -1])  # [N,iters]

            # Postnet
            x = decoder_mel_outputs
            for i in range(hp.postnet_num_layers):
                activation = tf.nn.tanh if i != (hp.postnet_num_layers -
                                                 1) else None
                x = tf.layers.conv1d(x,
                                     filters=hp.postnet_channels,
                                     kernel_size=hp.postnet_kernel_size,
                                     padding='same',
                                     activation=activation,
                                     name='Postnet_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=hp.dropout_prob,
                                      training=is_training,
                                      name='Postnet_dropout_{}'.format(i))

            residual = tf.layers.dense(x,
                                       hp.num_mels,
                                       name='residual_projection')
            mel_outputs = decoder_mel_outputs + residual

            # Add post-processing CBHG:
            # mel_outputs: (N,T,num_mels)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq,
                name='linear_spectogram_projection')  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(),
                [1, 2, 0
                 ])  # batch_size, text length(encoder), target length(decoder)

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.decoder_mel_outputs = decoder_mel_outputs
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state
            self.stop_token_targets = stop_token_targets
            self.stop_token_outputs = stop_token_outputs
            self.all_vars = tf.trainable_variables()
            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            log('    encoder conv out:               %d' %
                encoder_conv_output.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    decoder prenet lstm concat out :        %d' %
                dec_prenet_outputs.output_size)
            log('    decoder cell out:         %d' %
                dec_outputs_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder mel out:    %d' % decoder_mel_outputs.shape[-1])
            log('    mel out:    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
            log('  Tacotron Parameters       {:.3f} Million.'.format(
                np.sum(
                    [np.prod(v.get_shape().as_list())
                     for v in self.all_vars]) / 1000000))
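
# --- Hedged sketch of losses that typically accompany the outputs above ---
# (not taken from the original file; Tacotron 2-style L1 spectrogram losses plus a sigmoid
# cross-entropy stop-token loss, assuming targets are padded to matching lengths).
decoder_mel_loss = tf.reduce_mean(tf.abs(self.mel_targets - self.decoder_mel_outputs))
mel_loss = tf.reduce_mean(tf.abs(self.mel_targets - self.mel_outputs))
linear_loss = tf.reduce_mean(tf.abs(self.linear_targets - self.linear_outputs))
stop_token_loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=self.stop_token_targets,
                                            logits=self.stop_token_outputs))
total_loss = decoder_mel_loss + mel_loss + linear_loss + stop_token_loss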
Example #11
    def add_decoder_op(self, enc_final_state, enc_hidden_states,
                       output_embed_matrix, training):
        original_enc_final_state = enc_final_state
        flat_enc_final_state = nest.flatten(enc_final_state)
        enc_final_state = tf.concat(flat_enc_final_state, axis=1)
        enc_final_size = int(enc_final_state.get_shape()[1])

        part_logit_preds = dict()
        part_token_preds = dict()
        part_logit_sequence_preds = dict()
        part_token_sequence_preds = dict()
        part_layers = []
        grammar = self.config.grammar
        for i, part in enumerate(('trigger', 'query', 'action')):
            with tf.variable_scope('decode_function_' + part):
                activation = getattr(
                    tf.nn, self.config.function_nonlinearity) if hasattr(
                        tf.nn, self.config.function_nonlinearity) else getattr(
                            tf, self.config.function_nonlinearity)
                layer = tf.contrib.layers.fully_connected(
                    enc_final_state,
                    self.config.function_hidden_size,
                    activation_fn=activation)
                part_layers.append(layer)
                layer_with_dropout = tf.nn.dropout(
                    layer, keep_prob=self.dropout_placeholder, seed=443 * i)
                part_logit_preds[part] = tf.layers.dense(
                    layer_with_dropout, len(grammar.functions[part]))
                part_token_preds[part] = tf.cast(tf.argmax(
                    part_logit_preds[part], axis=1),
                                                 dtype=tf.int32)

        first_value_token = grammar.num_functions + grammar.num_begin_tokens + grammar.num_control_tokens
        num_value_tokens = grammar.output_size - first_value_token
        output_embed_matrix = tf.concat(
            (output_embed_matrix[0:grammar.num_control_tokens],
             output_embed_matrix[first_value_token:]),
            axis=0)

        adjusted_trigger = part_token_preds['trigger'] + (
            grammar.num_control_tokens + grammar.num_begin_tokens)
        adjusted_query = part_token_preds['query'] + (
            grammar.num_control_tokens + grammar.num_begin_tokens +
            len(grammar.functions['trigger']))
        adjusted_action = part_token_preds['action'] + (
            grammar.num_control_tokens + grammar.num_begin_tokens +
            len(grammar.functions['trigger']) +
            len(grammar.functions['query']))

        layer_concat = tf.concat(part_layers, axis=1)
        for i, part in enumerate(('trigger', 'query', 'action')):
            with tf.variable_scope('decode_sequence_' + part):

                def one_decoder_input(i, like):
                    with tf.variable_scope(str(i)):
                        return tf.layers.dense(layer_concat,
                                               like.get_shape()[1])

                flat_decoder_initial_state = [
                    one_decoder_input(i, like)
                    for i, like in enumerate(flat_enc_final_state)
                ]
                decoder_initial_state = nest.pack_sequence_as(
                    original_enc_final_state, flat_decoder_initial_state)
                cell_dec = tf.contrib.rnn.MultiRNNCell([
                    self.make_rnn_cell(i, True)
                    for i in range(self.config.rnn_layers)
                ])

                # uncompress function tokens (to look them up in the grammar)
                if training:
                    adjusted_function_token = self.part_function_placeholders[
                        part]
                else:
                    if part == 'trigger':
                        adjusted_function_token = adjusted_trigger
                    elif part == 'query':
                        adjusted_function_token = adjusted_query
                    elif part == 'action':
                        adjusted_function_token = adjusted_action

                # adjust the sequence to "skip" function tokens
                output_size = grammar.num_control_tokens + num_value_tokens
                output = self.part_sequence_placeholders[part]
                adjusted_output = tf.where(
                    output >= grammar.num_control_tokens,
                    output - (first_value_token - grammar.num_control_tokens),
                    output)

                if self.config.apply_attention:
                    attention = LuongAttention(self.config.decoder_hidden_size,
                                               enc_hidden_states,
                                               self.input_length_placeholder,
                                               probability_fn=tf.nn.softmax)
                    cell_dec = AttentionWrapper(
                        cell_dec,
                        attention,
                        cell_input_fn=lambda inputs, _: inputs,
                        attention_layer_size=self.config.decoder_hidden_size,
                        initial_cell_state=decoder_initial_state)
                    decoder_initial_state = cell_dec.zero_state(
                        self.batch_size, dtype=tf.float32)
                decoder = Seq2SeqDecoder(
                    self.config,
                    self.input_placeholder,
                    self.input_length_placeholder,
                    adjusted_output,
                    self.part_sequence_length_placeholders[part],
                    self.batch_number_placeholder,
                    max_length=MAX_PRIMITIVE_LENGTH)
                rnn_output, sample_ids = decoder.decode(
                    cell_dec,
                    decoder_initial_state,
                    output_size,
                    output_embed_matrix,
                    training,
                    grammar_helper=PrimitiveSequenceGrammarHelper(
                        grammar, adjusted_function_token))
                part_logit_sequence_preds[part] = rnn_output
                part_token_sequence_preds[part] = tf.cast(sample_ids,
                                                          dtype=tf.int32)

        with tf.variable_scope('top_classifier'):
            top_hidden = tf.contrib.layers.fully_connected(
                enc_final_state,
                self.config.first_token_hidden_size,
                activation_fn=tf.tanh)
            top_hidden_with_dropout = tf.nn.dropout(
                top_hidden, keep_prob=self.dropout_placeholder, seed=127)
            top_logits = tf.layers.dense(top_hidden_with_dropout,
                                         grammar.num_begin_tokens)
            top_token = tf.cast(tf.argmax(top_logits, axis=1), dtype=tf.int32)

        with tf.variable_scope('decode_special'):
            output_size = grammar.num_control_tokens + num_value_tokens
            output = self.special_label_placeholder
            adjusted_output = tf.where(
                output >= grammar.num_control_tokens,
                output - (first_value_token - grammar.num_control_tokens),
                output)
            cell_dec = tf.contrib.rnn.MultiRNNCell([
                self.make_rnn_cell(i, True)
                for i in range(self.config.rnn_layers)
            ])

            sequence_length = tf.ones(
                (self.batch_size, ), dtype=tf.int32) * MAX_SPECIAL_LENGTH
            decoder_initial_state = original_enc_final_state
            if self.config.apply_attention:
                attention = LuongAttention(self.config.decoder_hidden_size,
                                           enc_hidden_states,
                                           self.input_length_placeholder,
                                           probability_fn=tf.nn.softmax)
                cell_dec = AttentionWrapper(
                    cell_dec,
                    attention,
                    cell_input_fn=lambda inputs, _: inputs,
                    attention_layer_size=self.config.decoder_hidden_size,
                    initial_cell_state=original_enc_final_state)
                decoder_initial_state = cell_dec.zero_state(self.batch_size,
                                                            dtype=tf.float32)
            decoder = Seq2SeqDecoder(self.config,
                                     self.input_placeholder,
                                     self.input_length_placeholder,
                                     adjusted_output,
                                     sequence_length,
                                     self.batch_number_placeholder,
                                     max_length=MAX_SPECIAL_LENGTH)
            rnn_output, sample_ids = decoder.decode(
                cell_dec,
                decoder_initial_state,
                output_size,
                output_embed_matrix,
                training,
                grammar_helper=SpecialSequenceGrammarHelper(grammar))
            logit_special_sequence = rnn_output
            token_special_sequence = tf.cast(sample_ids, dtype=tf.int32)

        # adjust tokens back to their output code
        adjusted_top = tf.expand_dims(top_token + grammar.num_control_tokens,
                                      axis=1)

        adjusted_special_sequence = tf.where(
            token_special_sequence >= grammar.num_control_tokens,
            token_special_sequence +
            (first_value_token - grammar.num_control_tokens),
            token_special_sequence)

        adjusted_token_sequences = dict()
        for part in ('trigger', 'query', 'action'):
            token_sequence = part_token_sequence_preds[part]
            adjusted_token_sequence = tf.where(
                token_sequence >= grammar.num_control_tokens, token_sequence +
                (first_value_token - grammar.num_control_tokens),
                token_sequence)
            adjusted_token_sequences[part] = adjusted_token_sequence
        # remove EOS from the middle of the sentence
        adjusted_token_sequences['trigger'] = tf.where(
            tf.equal(adjusted_token_sequences['trigger'], grammar.end),
            tf.zeros_like(adjusted_token_sequences['trigger']),
            adjusted_token_sequences['trigger'])
        adjusted_token_sequences['query'] = tf.where(
            tf.equal(adjusted_token_sequences['query'], grammar.end),
            tf.zeros_like(adjusted_token_sequences['query']),
            adjusted_token_sequences['query'])

        adjusted_trigger = tf.expand_dims(adjusted_trigger, axis=1)
        adjusted_query = tf.expand_dims(adjusted_query, axis=1)
        adjusted_action = tf.expand_dims(adjusted_action, axis=1)

        program_sequence = tf.concat(
            (adjusted_top, adjusted_trigger,
             adjusted_token_sequences['trigger'], adjusted_query,
             adjusted_token_sequences['query'], adjusted_action,
             adjusted_token_sequences['action']),
            axis=1)
        full_special_sequence = tf.concat(
            (adjusted_top, adjusted_special_sequence), axis=1)
        # full special sequence is smaller than program sequence, so we need to pad it all the way to the same shape
        full_special_sequence = pad_up_to(full_special_sequence,
                                          tf.shape(program_sequence)[1],
                                          rank=1)

        rule_token = grammar.dictionary['rule'] - grammar.num_control_tokens
        full_sequence = tf.where(tf.equal(top_token, rule_token),
                                 program_sequence, full_special_sequence)

        return ThreePartAlignerResult(top_logits, part_logit_preds,
                                      part_logit_sequence_preds,
                                      logit_special_sequence, full_sequence)
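
The compress/expand steps above all hinge on one tf.where pattern: ids below grammar.num_control_tokens stay put, everything else is shifted by (first_value_token - num_control_tokens). A minimal, self-contained sketch of that pattern with made-up constants (num_control_tokens=3, first_value_token=10), not the project's real grammar:

import tensorflow as tf

num_control_tokens = 3    # illustrative value, not the real grammar
first_value_token = 10    # illustrative value, not the real grammar
offset = first_value_token - num_control_tokens

tokens = tf.constant([[0, 2, 12, 15],
                      [1, 11, 13, 0]], dtype=tf.int32)

# "Compress": shift value tokens down so the decoder vocabulary only spans
# control tokens plus value tokens.
compressed = tf.where(tokens >= num_control_tokens, tokens - offset, tokens)

# "Expand": after decoding, shift predicted value tokens back to the full output code.
restored = tf.where(compressed >= num_control_tokens, compressed + offset, compressed)
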
示例#12
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   stop_token_targets=None):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
          stop_token_targets: float32 Tensor of per-frame stop-token targets. Not used by this variant.
        '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

        with tf.variable_scope('Encoder') as scope:

            x = embedded_inputs

            #3 Conv Layers
            for i in range(3):
                x = tf.layers.conv1d(x,
                                     filters=512,
                                     kernel_size=5,
                                     padding='same',
                                     activation=tf.nn.relu,
                                     name='Encoder_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=0.5,
                                      training=is_training,
                                      name='dropout_{}'.format(i))
            encoder_conv_output = x

            #bi-directional LSTM
            cell_fw = ZoneoutLSTMCell(256,
                                      is_training,
                                      zoneout_factor_cell=0.1,
                                      zoneout_factor_output=0.1,
                                      name='encoder_fw_LSTM')
            cell_bw = ZoneoutLSTMCell(256,
                                      is_training,
                                      zoneout_factor_cell=0.1,
                                      zoneout_factor_output=0.1,
                                      name='encoder_bw_LSTM')

            outputs, states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                encoder_conv_output,
                sequence_length=input_lengths,
                dtype=tf.float32)

            # encoder_outputs = [N, T, 2*encoder_lstm_units] = [N, T, 512]
            encoder_outputs = tf.concat(
                outputs,
                axis=2)  # Concat and return forward + backward outputs

        with tf.variable_scope('Decoder') as scope:

            if hp.attention_type == 'loc_sen':  # Location Sensitive Attention
                attention_mechanism = LocationSensitiveAttention(
                    128,
                    encoder_outputs,
                    hparams=hp,
                    is_training=is_training,
                    mask_encoder=True,
                    memory_sequence_length=input_lengths,
                    smoothing=False,
                    cumulate_weights=True)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    128,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'step_bah':
                attention_mechanism = BahdanauStepwiseMonotonicAttention(
                    128,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    mode="parallel")
            elif hp.attention_type == 'mon_bah':
                attention_mechanism = BahdanauMonotonicAttention(
                    128,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loung':
                attention_mechanism = LuongAttention(
                    128, encoder_outputs, memory_sequence_length=input_lengths)
            else:
                raise ValueError('Unknown attention type: %s' % hp.attention_type)

            # attention_mechanism = LocationSensitiveAttention(128, encoder_outputs, hparams=hp, is_training=is_training, mask_encoder=True, memory_sequence_length = input_lengths, smoothing=False, cumulate_weights=True)
            #mask_encoder: whether to mask encoder padding while computing location sensitive attention. Set to True for better prosody but slower convergence.
            #cumulate_weights: Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True)

            decoder_lstm = [
                ZoneoutLSTMCell(1024,
                                is_training,
                                zoneout_factor_cell=0.1,
                                zoneout_factor_output=0.1,
                                name='decoder_LSTM_{}'.format(i + 1))
                for i in range(2)
            ]

            decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm,
                                                       state_is_tuple=True)
            # decoder_init_state = decoder_lstm.zero_state(batch_size=batch_size, dtype=tf.float32)  # not in the TensorFlow 1 version

            attention_cell = AttentionWrapper(decoder_lstm,
                                              attention_mechanism,
                                              alignment_history=True,
                                              output_attention=False)

            # attention_state_size = 256
            # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
            dec_outputs = DecoderPrenetWrapper(attention_cell, is_training,
                                               hp.prenet_depths)
            dec_outputs_cell = OutputProjectionWrapper(
                dec_outputs, (hp.num_mels) * hp.outputs_per_step)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            decoder_init_state = dec_outputs_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)
            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            decoder_mel_outputs = tf.reshape(
                decoder_outputs[:, :, :hp.num_mels * hp.outputs_per_step],
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]
            #stop_token_outputs = tf.reshape(decoder_outputs[:,:,hp.num_mels * hp.outputs_per_step:], [batch_size, -1]) # [N,iters]

            # Postnet
            x = decoder_mel_outputs
            for i in range(5):
                activation = tf.nn.tanh if i != (4) else None
                x = tf.layers.conv1d(x,
                                     filters=512,
                                     kernel_size=5,
                                     padding='same',
                                     activation=activation,
                                     name='Postnet_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=0.5,
                                      training=is_training,
                                      name='Postnet_dropout_{}'.format(i))

            residual = tf.layers.dense(x,
                                       hp.num_mels,
                                       name='residual_projection')
            mel_outputs = decoder_mel_outputs + residual

            # Add post-processing CBHG:
            # mel_outputs: (N,T,num_mels)
            post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training,
                                     hp.postnet_depth)
            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(),
                [1, 2, 0
                 ])  # batch_size, text length(encoder), target length(decoder)

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_mel_outputs = decoder_mel_outputs
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            #self.stop_token_targets = stop_token_targets
            #self.stop_token_outputs = stop_token_outputs
            self.all_vars = tf.trainable_variables()
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            # log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            #log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % dec_outputs_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
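
The [N, T_out/r, M*r] -> [N, T_out, M] reshape above is the only place where the reduction factor (outputs_per_step) is undone. A stand-alone sketch with purely illustrative sizes (batch 2, 5 decoder steps, 80 mels, r=3), assuming nothing from the surrounding model:

import tensorflow as tf

batch_size, dec_steps, num_mels, r = 2, 5, 80, 3
decoder_outputs = tf.zeros([batch_size, dec_steps, num_mels * r])     # [N, T_out/r, M*r]

# Unpack the r frames emitted per decoder step into one mel frame per time step.
mel_frames = tf.reshape(decoder_outputs, [batch_size, -1, num_mels])  # [N, T_out, M], T_out = dec_steps * r
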
示例#13
0
            def __init__(self,
                         name,
                         input_reprs,
                         roll_direction=0,
                         activate=True,
                         is_translate=False,
                         word_in=None,
                         encoder_reprs=encoder.bi_reprs):
                self.name = name
                with tf.variable_scope(name + '/predictions'):
                    #decoder_state = tf.layers.dense(input_reprs, config.projection_size, name='encoder_to_decoder')
                    decoder_state = input_reprs

                    with tf.variable_scope('word_embeddings_vi'):
                        word_embedding_matrix = tf.get_variable(
                            'word_embedding_matrix_vi',
                            initializer=pretrained_embeddings_vi)
                        if is_translate:
                            word_embeddings = tf.nn.embedding_lookup(
                                word_embedding_matrix, word_in)
                        else:
                            word_embeddings = tf.nn.embedding_lookup(
                                word_embedding_matrix, words_tgt_in)
                        word_embeddings = tf.nn.dropout(
                            word_embeddings, inputs.keep_prob)
                        word_embeddings *= tf.get_variable('emb_scale',
                                                           initializer=1.0)

                    decoder_lstm = model_helpers.lstm_cell(
                        config.bidirectional_sizes[0], inputs.keep_prob,
                        config.projection_size)

                    decoder_output_layer = tf.layers.Dense(n_classes,
                                                           name='predict')

                    if not is_translate:
                        attention_mechanism = LuongAttention(
                            num_units=config.attention_units,
                            memory=encoder_reprs,
                            memory_sequence_length=size_sr,
                            scale=True)
                        attention_cell = AttentionWrapper(
                            decoder_lstm,
                            attention_mechanism,
                            attention_layer_size=config.attention_units)

                        batch_size = tf.shape(words_tgt_in)[0]
                        decoder_initial_state = attention_cell.zero_state(
                            dtype=tf.float32,
                            batch_size=batch_size * config.beam_width)
                        decoder_state = decoder_initial_state.clone(
                            cell_state=decoder_state)

                        helper = tf.contrib.seq2seq.TrainingHelper(
                            word_embeddings, size_tgt)

                        decoder = tf.contrib.seq2seq.BasicDecoder(
                            attention_cell, helper, decoder_state,
                            decoder_output_layer)

                        outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(
                            decoder)
                        # swap_memory=True)

                        self.logits = outputs.rnn_output
                    else:
                        if config.decode_mode == 'greedy':
                            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                                word_embedding_matrix,
                                [embeddings.START, embeddings.START],
                                embeddings.END)

                            decoder = tf.contrib.seq2seq.BasicDecoder(
                                decoder_lstm, helper, decoder_state,
                                decoder_output_layer)
                        elif config.decode_mode == 'beam':
                            encoder_reprs = tf.contrib.seq2seq.tile_batch(
                                encoder_reprs, multiplier=config.beam_width)
                            decoder_state = tf.contrib.seq2seq.tile_batch(
                                decoder_state, multiplier=config.beam_width)
                            size_src = tf.contrib.seq2seq.tile_batch(
                                size_sr, multiplier=config.beam_width)

                            attention_mechanism = LuongAttention(
                                num_units=config.attention_units,
                                memory=encoder_reprs,
                                memory_sequence_length=size_src,
                                scale=True)
                            attention_cell = AttentionWrapper(
                                decoder_lstm,
                                attention_mechanism,
                                attention_layer_size=config.attention_units)

                            batch_size = 2
                            decoder_initial_state = attention_cell.zero_state(
                                dtype=tf.float32,
                                batch_size=batch_size * config.beam_width)
                            decoder_state = decoder_initial_state.clone(
                                cell_state=decoder_state)

                            #decoder_state = tf.contrib.seq2seq.tile_batch(
                            #  decoder_state, multiplier=config.beam_width)

                            decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                                cell=attention_cell,
                                embedding=word_embedding_matrix,
                                start_tokens=[
                                    embeddings.START, embeddings.START
                                ],
                                end_token=embeddings.END,
                                initial_state=decoder_state,
                                beam_width=config.beam_width,
                                output_layer=decoder_output_layer)

                        outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(
                            decoder,
                            maximum_iterations=config.max_translate_length)
                        #swap_memory=True)

                        if config.decode_mode == 'greedy':
                            self.sample_ids = outputs.sample_id
                        elif config.decode_mode == 'beam':
                            self.sample_ids = outputs.predicted_ids
                    '''
          outputs, state = tf.nn.dynamic_rnn(
            model_helpers.lstm_cell(config.bidirectional_sizes[0], inputs.keep_prob,
                                    config.projection_size),
            word_embeddings,
            initial_state=decoder_state,
            dtype=tf.float32,
            sequence_length=size_tgt,
            scope='predictlstm'
          )
          '''

                    self.state = state

                    #self.logits = tf.layers.dense(outputs, n_classes, name='predict')
                    #self.logits = tf.layers.dense(outputs.rnn_output, n_classes, name='predict')

                if is_translate:
                    return

                targets = words_tgt_out
                targets *= (1 - inputs.label_smoothing)
                targets += inputs.label_smoothing / n_classes
                self.loss = model_helpers.masked_ce_loss(
                    self.logits, targets, inputs.mask)
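
In the 'beam' branch above, everything attention reads (encoder_reprs, source lengths, decoder state) is tiled beam_width times before the BeamSearchDecoder is built. A self-contained sketch of that tiling step, assuming TF 1.x with tf.contrib available and illustrative shapes only:

import tensorflow as tf
from tensorflow.contrib import seq2seq

beam_width = 4
encoder_reprs = tf.zeros([2, 7, 256])                 # [batch, time, units]
source_lengths = tf.constant([7, 5], dtype=tf.int32)  # [batch]

# Repeat every batch entry beam_width times along axis 0.
tiled_reprs = seq2seq.tile_batch(encoder_reprs, multiplier=beam_width)     # [8, 7, 256]
tiled_lengths = seq2seq.tile_batch(source_lengths, multiplier=beam_width)  # [8]
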
示例#14
0
File: model.py  Project: zl007700/Seq2Seq
    def buildModel(self):
        T_in = self.args.T_in
        T_out = self.args.T_out
        D_in = self.args.D_in
        D_out = self.args.D_out
        E = self.args.embedding_dim
        H = self.args.hidden_dim
        SOS = self.args.SOS
        EOS = self.args.EOS
        PAD = self.args.PAD
        beam_width = 3

        # Input
        with tf.name_scope('input'):
            x = tf.placeholder(shape=(None, T_in),
                               dtype=tf.int32,
                               name='encoder_inputs')
            # N, T_out
            y = tf.placeholder(shape=(None, T_out),
                               dtype=tf.int32,
                               name='decoder_inputs')
            # N
            x_len = tf.placeholder(shape=(None, ), dtype=tf.int32)
            # N
            y_len = tf.placeholder(shape=(None, ), dtype=tf.int32)
            # dynamic sample num
            batch_size = tf.shape(x)[0]

            # symbol mask
            sos = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * SOS
            eos = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * EOS
            pad = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * PAD

            # input mask
            x_mask = tf.sequence_mask(x_len, T_in, dtype=tf.float32)
            y_with_sos_mask = tf.sequence_mask(y_len,
                                               T_out + 1,
                                               dtype=tf.float32)
            y_with_pad = tf.concat([y, pad], axis=1)
            eos_mask = tf.one_hot(y_len, depth=T_out + 1, dtype=tf.int32) * EOS

            # masked inputs
            y_with_eos = y_with_pad + eos_mask
            y_with_sos = tf.concat([sos, y], axis=1)

        ## Embedding
        with tf.name_scope('embedding'):
            if self.args.use_pretrained:
                embedding_pretrained = np.fromfile(self.args.pretrained_file,
                                                   dtype=np.float32).reshape(
                                                       (-1, E))
                embedding = tf.Variable(embedding_pretrained, trainable=False)
            else:
                embedding = tf.get_variable(name='embedding',
                                            shape=(D_in, E),
                                            dtype=tf.float32,
                                            initializer=xavier_initializer())
            e_x = tf.nn.embedding_lookup(embedding, x)
            e_y = tf.nn.embedding_lookup(embedding, y_with_sos)
            if self.args.mode == 'train':
                e_x = tf.nn.dropout(e_x, self.args.keep_prob)

        ## Encoder
        with tf.name_scope('encoder'):
            ## Multi-BiLSTM
            fw_cell = rnn.MultiRNNCell([
                rnn.BasicLSTMCell(num_units=H)
                for i in range(self.args.layer_size)
            ])
            bw_cell = rnn.MultiRNNCell([
                rnn.BasicLSTMCell(num_units=H)
                for i in range(self.args.layer_size)
            ])
            bi_encoder_output, bi_encoder_state = tf.nn.bidirectional_dynamic_rnn(
                fw_cell,
                bw_cell,
                e_x,
                sequence_length=x_len,
                dtype=tf.float32,
                time_major=False,
                scope=None)
            encoder_output = bi_encoder_output[0] + bi_encoder_output[1]
            encoder_final_state = bi_encoder_state[0]

        ## Decoder
        with tf.name_scope('decoder'):
            decoder_cell = rnn.MultiRNNCell([
                rnn.BasicLSTMCell(num_units=H)
                for i in range(self.args.layer_size)
            ])
            decoder_lengths = tf.ones(shape=[batch_size],
                                      dtype=tf.int32) * (T_out + 1)

            ## Training decoder
            with tf.variable_scope('attention'):
                attention_mechanism = LuongAttention(
                    num_units=H,
                    memory=encoder_output,
                    memory_sequence_length=x_len,
                    name='attention_fn')
            projection_layer = Dense(units=D_out,
                                     kernel_initializer=xavier_initializer())

            train_decoder_cell = AttentionWrapper(
                cell=decoder_cell,
                attention_mechanism=attention_mechanism,
                attention_layer_size=H)
            train_decoder_init_state = train_decoder_cell.zero_state(
                batch_size=batch_size,
                dtype=tf.float32).clone(cell_state=encoder_final_state)
            training_helper = TrainingHelper(e_y,
                                             decoder_lengths,
                                             time_major=False)
            train_decoder = BasicDecoder(
                cell=train_decoder_cell,
                helper=training_helper,
                initial_state=train_decoder_init_state,
                output_layer=projection_layer)
            train_decoder_outputs, _, _ = dynamic_decode(
                train_decoder,
                impute_finished=True,
                maximum_iterations=T_out + 1)
            # N, T_out+1, D_out
            train_decoder_outputs = ln(train_decoder_outputs.rnn_output)

            ## Beam_search decoder
            beam_memory = tile_batch(encoder_output, beam_width)
            beam_memory_state = tile_batch(encoder_final_state, beam_width)
            beam_memory_length = tile_batch(x_len, beam_width)

            with tf.variable_scope('attention', reuse=True):
                beam_attention_mechanism = LuongAttention(
                    num_units=H,
                    memory=beam_memory,
                    memory_sequence_length=beam_memory_length,
                    name='attention_fn')
            beam_decoder_cell = AttentionWrapper(
                cell=decoder_cell,
                attention_mechanism=beam_attention_mechanism,
                attention_layer_size=None)
            beam_decoder_init_state = beam_decoder_cell.zero_state(
                batch_size=batch_size * beam_width,
                dtype=tf.float32).clone(cell_state=beam_memory_state)
            start_tokens = tf.ones((batch_size), dtype=tf.int32) * SOS
            beam_decoder = BeamSearchDecoder(
                cell=beam_decoder_cell,
                embedding=embedding,
                start_tokens=start_tokens,
                end_token=EOS,
                initial_state=beam_decoder_init_state,
                beam_width=beam_width,
                output_layer=projection_layer)
            beam_decoder_outputs, _, _ = dynamic_decode(
                beam_decoder,
                scope=tf.get_variable_scope(),
                maximum_iterations=T_out + 1)
            beam_decoder_result_ids = beam_decoder_outputs.predicted_ids

        with tf.name_scope('loss'):
            # Note: despite the name, 'logits' here holds softmax probabilities;
            # sparse_categorical_crossentropy expects probabilities with its
            # default from_logits=False.
            logits = tf.nn.softmax(train_decoder_outputs)
            cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
                y_with_eos, logits)
            loss_mask = tf.sequence_mask(y_len + 1,
                                         T_out + 1,
                                         dtype=tf.float32)
            loss = tf.reduce_sum(cross_entropy * loss_mask) / tf.cast(
                batch_size, dtype=tf.float32)
            prediction = tf.argmax(logits, 2)

        ## train_op
        with tf.name_scope('train'):
            global_step = tf.train.get_or_create_global_step()
            lr = noam_scheme(self.args.lr, global_step, self.args.warmup_steps)
            optimizer = tf.train.AdamOptimizer(lr)

            ## gradient clips
            trainable_params = tf.trainable_variables()
            gradients = tf.gradients(loss, trainable_params)
            clip_gradients, _ = tf.clip_by_global_norm(
                gradients, self.args.gradient_clip_num)
            train_op = optimizer.apply_gradients(zip(clip_gradients,
                                                     trainable_params),
                                                 global_step=global_step)

        # Summary
        with tf.name_scope('summary'):
            tf.summary.scalar('lr', lr)
            tf.summary.scalar('loss', loss)
            tf.summary.scalar('global_step', global_step)
            summaries = tf.summary.merge_all()
        return x, y, x_len, y_len, logits, loss, prediction, beam_decoder_result_ids, global_step, train_op, summaries
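
The loss block above first applies tf.nn.softmax and then Keras' sparse_categorical_crossentropy (which expects probabilities by default). An equivalent, hedged sketch that keeps raw scores and uses tf.nn.sparse_softmax_cross_entropy_with_logits instead, with illustrative shapes and names:

import tensorflow as tf

raw_scores = tf.zeros([2, 6, 1000])                  # [N, T_out + 1, D_out], pre-softmax scores
targets = tf.zeros([2, 6], dtype=tf.int32)           # [N, T_out + 1]
target_lengths = tf.constant([6, 4], dtype=tf.int32)

cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=targets, logits=raw_scores)                # [N, T_out + 1]
loss_mask = tf.sequence_mask(target_lengths, 6, dtype=tf.float32)
loss = tf.reduce_sum(cross_entropy * loss_mask) / tf.cast(tf.shape(raw_scores)[0], tf.float32)
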
示例#15
0
    def build_decoder_cell(self, encoder_outputs, encoder_state):
        """ 构建解码器cell """
        encoder_inputs_length = self.encoder_inputs_length
        batch_size = self.batch_size

        if self.bidirectional:
            encoder_state = encoder_state[-self.depth:]

        if self.time_major:
            encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

        if self.use_beamsearch_decode:
            encoder_outputs = seq2seq.tile_batch(encoder_outputs,
                                                 multiplier=self.beam_width)
            encoder_state = seq2seq.tile_batch(encoder_state,
                                               multiplier=self.beam_width)
            encoder_inputs_length = seq2seq.tile_batch(
                self.encoder_inputs_length, multiplier=self.beam_width)
            # When beam search is used, the effective batch size is batch_size * beam_width
            batch_size *= self.beam_width

        if self.attention_type.lower() == 'luong':
            self.attention_mechanism = LuongAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length)
        else:
            # BahdanauAttention is constructed with num_units and the encoder outputs;
            # calling it with a query at each step yields the attention weights (alignments).
            self.attention_mechanism = BahdanauAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length)

        cell = MultiRNNCell([
            self.build_signle_cell(self.hidden_units,
                                   use_residual=self.use_residual)
            for _ in range(self.depth)
        ])
        # Keep the attention history only in inference (non-training) mode and when beam search is disabled
        alignment_history = (self.mode != 'train'
                             and not self.use_beamsearch_decode)

        def cell_input_fn(inputs, attention):
            """ 根据attn_input_feeding属性来判断是否在attention计算前进行一次投影的计算"""
            if not self.use_residual:
                return array_ops.concat([inputs, attention], -1)

            attn_projection = layers.Dense(self.hidden_units,
                                           dtype=tf.float32,
                                           use_bias=False,
                                           name='attention_cell_input_fn')
            return attn_projection(array_ops.concat([inputs, attention], -1))

        attention_cell = AttentionWrapper(
            cell=cell,
            attention_mechanism=self.attention_mechanism,
            attention_layer_size=self.hidden_units,
            alignment_history=alignment_history,
            cell_input_fn=cell_input_fn,
            name='AttentionWrapper')
        # Zero initial state
        decoder_initial_state = attention_cell.zero_state(
            batch_size, tf.float32)

        # Pass the encoder state through: initialize the decoder state from the encoder's final hidden state
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=encoder_state)
        return attention_cell, decoder_initial_state
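
The cell_input_fn above either feeds the raw concatenation of decoder input and attention context to the cell, or projects it back to hidden_units when residual connections require matching sizes. A stand-alone sketch of both branches with made-up sizes:

import tensorflow as tf

hidden_units = 8
inputs = tf.zeros([4, hidden_units])        # decoder input at one step
attention = tf.zeros([4, hidden_units])     # attention context at one step

concatenated = tf.concat([inputs, attention], axis=-1)     # [4, 2 * hidden_units]

# Residual branch: project back to hidden_units so cell input/output sizes match.
attn_projection = tf.layers.Dense(hidden_units, use_bias=False,
                                  name='attention_cell_input_fn')
projected = attn_projection(concatenated)                   # [4, hidden_units]
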
示例#16
0
    def build_graph(self, values, values_mask):
        with vs.variable_scope(self.name):
            lens = tf.reduce_sum(values_mask, axis=1)
            attention_mechanism = LuongAttention(self.attention_dim, values, lens)
            encoder = RNNEncoder(self.attention_dim, self.keep_prob, attention_mechanism)
            return encoder.build_graph(values, values_mask)
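
build_graph derives the memory_sequence_length that LuongAttention needs from the 0/1 padding mask. That one step in isolation, with illustrative values:

import tensorflow as tf

values_mask = tf.constant([[1, 1, 1, 0, 0],
                           [1, 1, 1, 1, 1]], dtype=tf.int32)
lens = tf.reduce_sum(values_mask, axis=1)   # -> [3, 5], one length per batch entry
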
示例#17
0
    def build_decoder_cell(self):
        """构建解码器cell"""

        encoder_outputs = self.encoder_outputs
        encoder_last_state = self.encoder_last_state
        encoder_inputs_length = self.encoder_inputs_length

        if self.time_major:
            encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

        # When using BeamSearchDecoder, a few tensors must be tiled by beam_width:
        # encoder_outputs, encoder_last_state, encoder_inputs_length
        # needs to be tiled so that:
        # [batch_size, .., ..] -> [batch_size x beam_width, .., ..]
        if self.use_beamsearch_decode:

            encoder_outputs = seq2seq.tile_batch(encoder_outputs,
                                                 multiplier=self.beam_width)
            encoder_last_state = nest.map_structure(
                lambda s: seq2seq.tile_batch(s, self.beam_width),
                self.encoder_last_state)
            encoder_inputs_length = seq2seq.tile_batch(
                self.encoder_inputs_length, multiplier=self.beam_width)

        # Compute the decoder hidden size; if the encoder is bidirectional,
        # the decoder hidden size must be doubled
        num_units = self.hidden_units
        if self.bidirectional:
            num_units *= 2

        # Two alternative attention mechanisms
        if self.attention_type.lower() == 'luong':
            # 'Luong' style attention: https://arxiv.org/abs/1508.04025
            self.attention_mechanism = LuongAttention(
                num_units=num_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length)
        else:  # Default Bahdanau
            # 'Bahdanau' style attention: https://arxiv.org/abs/1409.0473
            self.attention_mechanism = BahdanauAttention(
                num_units=num_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length)

        # Building decoder_cell
        self.decoder_cell_list = [
            self.build_single_cell(num_units, use_residual=self.use_residual)
            for i in range(self.depth)
        ]

        decoder_initial_state = encoder_last_state

        def attn_decoder_input_fn(inputs, attention):
            """根据attn_input_feeding属性来判断是否在attention计算前进行一次投影计算
            """
            if not self.attn_input_feeding:
                return inputs

            # Essential when use_residual=True
            hidden_units = self.hidden_units
            if self.bidirectional:
                hidden_units *= 2
            attn_projection = layers.Dense(
                hidden_units,
                dtype=tf.float32,
                # use_bias=False,
                name='attn_input_feeding')
            return attn_projection(array_ops.concat([inputs, attention], -1))

        # AttentionWrapper wraps RNNCell with the attention_mechanism
        # Note: We implement Attention mechanism only on the top decoder layer
        self.decoder_cell_list[-1] = AttentionWrapper(
            cell=self.decoder_cell_list[-1],
            attention_mechanism=self.attention_mechanism,
            # attention_layer_size=self.hidden_units,
            attention_layer_size=int(num_units / 2),
            cell_input_fn=attn_decoder_input_fn,
            initial_cell_state=encoder_last_state[-1],
            alignment_history=self.alignment_history,
            name='Attention_Wrapper')

        # To be compatible with AttentionWrapper, the encoder last state
        # of the top layer should be converted
        # into the AttentionWrapperState form
        # We can easily do this by calling AttentionWrapper.zero_state

        # Also if beamsearch decoding is used,
        # the batch_size argument in .zero_state
        # should be beam_width times the original batch_size
        # (i.e. with beam search the batch dimension is batch_size * beam_width)
        batch_size = self.batch_size if not self.use_beamsearch_decode \
                     else self.batch_size * self.beam_width
        initial_state = [state for state in encoder_last_state]

        initial_state[-1] = self.decoder_cell_list[-1].zero_state(
            batch_size=batch_size, dtype=tf.float32)
        decoder_initial_state = tuple(initial_state)

        return MultiRNNCell(self.decoder_cell_list), decoder_initial_state
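
Unlike the flat tensors, the nested encoder_last_state above is tiled with nest.map_structure so every LSTMStateTuple inside the per-layer tuple gets the beam_width treatment. A small sketch under assumed shapes (2 layers, batch 2, 16 units), relying on tf.contrib as the surrounding code does:

import tensorflow as tf
from tensorflow.contrib import seq2seq
from tensorflow.contrib.rnn import LSTMStateTuple
from tensorflow.python.util import nest

beam_width = 3
layer_state = LSTMStateTuple(c=tf.zeros([2, 16]), h=tf.zeros([2, 16]))
encoder_last_state = (layer_state, layer_state)     # one LSTMStateTuple per layer

tiled_state = nest.map_structure(
    lambda s: seq2seq.tile_batch(s, multiplier=beam_width), encoder_last_state)
# every c / h tensor inside the tuple is now [2 * beam_width, 16]
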
示例#18
0
    def build_decoder_cell(self,encoder_outputs,encoder_state):
        '''
        Build the decoder cell.
        :param encoder_outputs:
        :param encoder_state:
        :return:
        '''
        encoder_input_length = self.encoder_inputs_length
        batch_size = self.batch_size

        if self.bidirectional:
            encoder_state = encoder_state[-self.depth:]

        if self.time_major:
            encoder_outputs = tf.transpose(encoder_outputs,(1,0,2))

        if self.use_beamsearch_decode:
            '''tile_batch replicates each tensor self.beam_width times,
            so the batch effectively becomes self.beam_width times larger.
            '''
            encoder_outputs = seq2seq.tile_batch(
                encoder_outputs, multiplier=self.beam_width
            )
            encoder_state = seq2seq.tile_batch(
                encoder_state, multiplier=self.beam_width
            )
            encoder_input_length = seq2seq.tile_batch(
                self.encoder_inputs_length, multiplier=self.beam_width
            )

            # When beam search is used, the effective batch size is batch_size * beam_width
            batch_size *= self.beam_width


        if self.attention_type.lower() == 'luong':
            self.attention_mechanism = LuongAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_input_length
            )
        else:
            self.attention_mechanism = BahdanauAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_input_length
            )  # Note: memory is intentionally the encoder outputs (the sequence attended over), not encoder_state

        cell = MultiRNNCell(
            [
                self.build_single_cell(
                    self.hidden_units,
                    use_residual=self.use_residual
                )

                for _ in range(self.depth)
            ])

        alignment_history = (
            self.mode != 'train' and not self.use_beamsearch_decode
        )

        def cell_input_fn(inputs, attention):
            '''
            Optionally project [inputs; attention] back to hidden_units before
            feeding the cell (only needed when residual connections are used).
            :param inputs:
            :param attention:
            :return:
            '''

            if not self.use_residual:
                return array_ops.concat([inputs, attention], -1)

            attn_projection = layers.Dense(self.hidden_units,
                                           dtype=tf.float32,
                                           use_bias=False,
                                           name='attention_cell_input_fn')

            '''
            attn_projection(array_ops.concat([inputs, attention], -1)) is simply
            layers.Dense(...)(array_ops.concat([inputs, attention], -1)):
            Dense defines __call__, so the layer object can be applied directly.
            '''
            return attn_projection(array_ops.concat([inputs, attention], -1))


        cell = AttentionWrapper(
            cell=cell,
            attention_mechanism=self.attention_mechanism,
            attention_layer_size=self.hidden_units,
            alignment_history=alignment_history,  # keep the attention history
            cell_input_fn=cell_input_fn,  # how the attention context is combined with the input
            name='Attention_Wrapper'
        )  # AttentionWrapper wraps the cell with the attention mechanism

        decoder_initial_state = cell.zero_state(
            batch_size, tf.float32
        )  # initialize the decoder_initial_state

        # pass the encoder state through
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=encoder_state
        )

        return cell,decoder_initial_state
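
As the comment inside cell_input_fn notes, building a Dense layer and applying it in one expression is the same computation as the two-step form, because Dense implements __call__. A minimal demonstration with illustrative sizes:

import tensorflow as tf

x = tf.zeros([4, 32])

# two-step form: build the layer, then apply it
dense = tf.layers.Dense(16, use_bias=False, name='proj_two_step')
y1 = dense(x)

# single-expression form, as used inside cell_input_fn
y2 = tf.layers.Dense(16, use_bias=False, name='proj_one_shot')(x)   # same computation, separate variables
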
示例#19
0
    def build_decoder_cell(self, encoder_outputs, encoder_state):
        """
        构建解码器cell
        :param encoder_outputs: 编码输出
        :param encoder_state: 编码final state
        :return: cell: 带attention机制的rnn解码单元,
                 decoder_initial_state:decoder隐藏状态h0输入
        """
        encoder_inputs_length = self.encoder_inputs_length
        batch_size = self.batch_size

        if self.bidirectional:
            encoder_state = encoder_state[-self.depth:]

        if self.time_major:
            encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

        # BeamSearchDecoder
        if self.use_beamsearch_decode:
            encoder_outputs = seq2seq.tile_batch(encoder_outputs,
                                                 multiplier=self.beam_width)
            encoder_state = seq2seq.tile_batch(encoder_state,
                                               multiplier=self.beam_width)
            encoder_inputs_length = seq2seq.tile_batch(
                self.encoder_inputs_length, multiplier=self.beam_width)

        if self.attention_type.lower() == 'luong':
            # 'Luong' style attention: https://arxiv.org/abs/1508.04025
            self.attention_mechanism = LuongAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length)
        else:  # Default Bahdanau
            # 'Bahdanau' style attention: https://arxiv.org/abs/1409.0473
            self.attention_mechanism = BahdanauAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length)

        # Building decoder_cell
        if self.bidirectional:
            cell = MultiRNNCell([
                self.build_single_cell(self.hidden_units * 2,
                                       use_residual=self.use_residual)
                for _ in range(self.depth)
            ])
        else:
            cell = MultiRNNCell([
                self.build_single_cell(self.hidden_units,
                                       use_residual=self.use_residual)
                for _ in range(self.depth)
            ])

        # Keep the attention history only in inference mode and when beam search is disabled
        alignment_history = (self.mode != 'train'
                             and not self.use_beamsearch_decode)

        def cell_input_fn(inputs, attention):
            """根据attn_input_feeding属性来判断是否在attention计算前进行一次投影计算
            """
            if not self.use_residual:
                return array_ops.concat([inputs, attention], -1)
            mul = 2 if self.bidirectional else 1
            attn_projection = layers.Dense(self.hidden_units * mul,
                                           dtype=tf.float32,
                                           use_bias=False,
                                           name='attention_cell_input_fn')

            return attn_projection(array_ops.concat([inputs, attention], -1))

        cell = AttentionWrapper(cell,
                                self.attention_mechanism,
                                attention_layer_size=self.hidden_units,
                                alignment_history=alignment_history,
                                cell_input_fn=cell_input_fn,
                                name='Attention_Wrapper')

        if self.use_beamsearch_decode:
            # With beam search the batch dimension must be beam_width times batch_size
            # batch_size *= self.beam_width
            decoder_initial_state = cell.zero_state(batch_size=batch_size *
                                                    self.beam_width,
                                                    dtype=tf.float32)
            decoder_initial_state = decoder_initial_state.clone(
                cell_state=encoder_state)
        else:
            # zero initial state
            decoder_initial_state = cell.zero_state(batch_size, tf.float32)
            # pass the encoder state through
            decoder_initial_state = decoder_initial_state.clone(
                cell_state=encoder_state)

        return cell, decoder_initial_state

    def build_decoder_cell(self, encoder_outputs, encoder_states):
        '''
        Build the decoder cell; returns the decoder cell and its initial state.
        :param encoder_outputs:
        :param encoder_states:
        :return:
        '''
        encoder_input_length = self.encoder_inputs_length
        batch_size = self.batch_size

        if self.bidirectional:
            encoder_states = encoder_states[-self.depth:]

        if self.time_major:
            encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

        assert encoder_input_length is not None, 'encoder_inputs_length must not be None'
        assert isinstance(batch_size, int), 'batch_size must be an int'
        assert encoder_outputs is not None, 'encoder_outputs must not be None'
        assert encoder_states is not None, 'encoder_states must not be None'
        ######################### beam search case #####################################################
        if self.use_beamsearch_decode:
            '''tile_batch replicates each tensor self.beam_width times,
            so the batch effectively becomes self.beam_width times larger.
            '''
            encoder_outputs = seq2seq.tile_batch(
                encoder_outputs, multiplier=self.beam_width
            )
            encoder_states = seq2seq.tile_batch(
                encoder_states, multiplier=self.beam_width
            )
            encoder_input_length = seq2seq.tile_batch(
                self.encoder_inputs_length, multiplier=self.beam_width
            )
            # When beam search is used, the effective batch size is batch_size * beam_width
            batch_size *= self.beam_width
        ######################### beam search case #####################################################

        ######################### attention mechanism ###################################################
        if self.attention_type.lower() == 'luong':
            self.attention_mechanism = LuongAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_input_length
            )
        else:
            self.attention_mechanism = BahdanauAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_input_length
            )  # for a bidirectional LSTM the memory is still the encoder outputs (its per-step hidden states)
        ######################### attention mechanism ###################################################

        cell = MultiRNNCell(
            [
                self.build_single_cell(
                    self.hidden_units,
                    use_residual=self.use_residual
                )
                for _ in range(self.depth)
            ])
        # cell is now a multi-layer RNN cell

        alignment_history = (
                self.mode != 'train' and not self.use_beamsearch_decode
        )

        # alignment_history is enabled only outside training and when beam search is not used.

        def cell_input_fn(inputs, attention):
            '''
            Optionally project [inputs; attention] back to hidden_units before
            feeding the cell; this projection is only needed when attention is
            combined with residual connections.
            :param inputs:
            :param attention:
            :return:
            '''

            if not self.use_residual:
                print(inputs.get_shape(), 'inputs shape')
                print(attention.get_shape(), 'attention shape')
                print(array_ops.concat([inputs, attention], -1), 'shape of [inputs; attention] after concatenation')
                return array_ops.concat([inputs, attention], -1)

            attn_projection = layers.Dense(self.hidden_units,
                                           dtype=tf.float32,
                                           use_bias=False,
                                           name='attention_cell_input_fn')

            '''
            attn_projection(array_ops.concat([inputs, attention], -1)) is simply
            layers.Dense(...)(array_ops.concat([inputs, attention], -1)):
            Dense inherits from Layer, which defines __call__; __call__ runs
            pre-processing, then call() (the fully connected computation itself),
            then post-processing, so the layer object can be applied directly.
            '''
            return attn_projection(array_ops.concat([inputs, attention], -1))

        cell = AttentionWrapper(
            cell=cell,
            attention_mechanism=self.attention_mechanism,
            attention_layer_size=self.hidden_units,
            alignment_history=alignment_history,  # keep the attention history
            cell_input_fn=cell_input_fn,  # how the attention context is combined with the input
            name='Attention_Wrapper'
        )  # AttentionWrapper wraps the cell with the attention mechanism

        decoder_initial_state = cell.zero_state(
            batch_size, tf.float32
        )  # initialize the decoder_initial_state

        # pass the encoder state through
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=encoder_states
        )

        return cell, decoder_initial_state