def create_attention_mechanism(attention_option, num_units, memory, source_sequence_length):
    """Create attention mechanism based on the attention_option."""
    if attention_option == "luong":
        attention_mechanism = LuongAttention(
            num_units, memory, memory_sequence_length=source_sequence_length)
    elif attention_option == "scaled_luong":
        attention_mechanism = LuongAttention(
            num_units, memory, memory_sequence_length=source_sequence_length, scale=True)
    elif attention_option == "bahdanau":
        attention_mechanism = BahdanauAttention(
            num_units, memory, memory_sequence_length=source_sequence_length)
    elif attention_option == "normed_bahdanau":
        attention_mechanism = BahdanauAttention(
            num_units, memory, memory_sequence_length=source_sequence_length, normalize=True)
    elif attention_option == "multi_head":
        attention_mechanism = MultiHeadAttention(
            num_units, memory, memory_sequence_length=source_sequence_length, num_heads=4)
    else:
        raise ValueError("Unknown attention option %s" % attention_option)
    return attention_mechanism
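# A minimal usage sketch (not from the original source): wrap a decoder cell with the
# mechanism built above. `decoder_cell`, `encoder_outputs`, `source_lengths`,
# `encoder_final_state` and `batch_size` are assumed to be provided by the surrounding
# model code.
attention_mechanism = create_attention_mechanism(
    "scaled_luong", num_units=512, memory=encoder_outputs,
    source_sequence_length=source_lengths)
attention_cell = tf.contrib.seq2seq.AttentionWrapper(
    decoder_cell, attention_mechanism,
    attention_layer_size=512, alignment_history=False)
decoder_initial_state = attention_cell.zero_state(batch_size, tf.float32).clone(
    cell_state=encoder_final_state)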
def _create_decoder_cell(self):
    enc_outputs, enc_states, enc_seq_len = self.enc_outputs, self.enc_states, self.enc_seq_len
    batch_size = self.batch_size * self.cfg.beam_size if self.use_beam_search else self.batch_size

    with tf.variable_scope("attention"):
        if self.cfg.attention == "luong":  # Luong attention mechanism
            attention_mechanism = LuongAttention(
                num_units=self.cfg.num_units, memory=enc_outputs,
                memory_sequence_length=enc_seq_len)
        else:  # default: Bahdanau attention mechanism
            attention_mechanism = BahdanauAttention(
                num_units=self.cfg.num_units, memory=enc_outputs,
                memory_sequence_length=enc_seq_len)

    def cell_input_fn(inputs, attention):
        # cell input function that keeps the input/output dimensions the same
        # reference: https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/AttentionWrapper
        if not self.cfg.use_attention_input_feeding:
            return inputs
        input_project = tf.layers.Dense(self.cfg.num_units, dtype=tf.float32,
                                        name='attn_input_feeding')
        return input_project(tf.concat([inputs, attention], axis=-1))

    if self.cfg.top_attention:  # apply attention mechanism only on the top decoder layer
        cells = [self._create_rnn_cell() for _ in range(self.cfg.num_layers)]
        cells[-1] = AttentionWrapper(cells[-1],
                                     attention_mechanism=attention_mechanism,
                                     name="Attention_Wrapper",
                                     attention_layer_size=self.cfg.num_units,
                                     initial_cell_state=enc_states[-1],
                                     cell_input_fn=cell_input_fn)
        initial_state = [state for state in enc_states]
        initial_state[-1] = cells[-1].zero_state(batch_size=batch_size, dtype=tf.float32)
        dec_init_states = tuple(initial_state)
        cells = MultiRNNCell(cells)
    else:
        cells = MultiRNNCell([self._create_rnn_cell() for _ in range(self.cfg.num_layers)])
        cells = AttentionWrapper(cells,
                                 attention_mechanism=attention_mechanism,
                                 name="Attention_Wrapper",
                                 attention_layer_size=self.cfg.num_units,
                                 initial_cell_state=enc_states,
                                 cell_input_fn=cell_input_fn)
        dec_init_states = cells.zero_state(
            batch_size=batch_size, dtype=tf.float32).clone(cell_state=enc_states)
    return cells, dec_init_states
def apply_attention(cell_dec, enc_hidden_states, enc_final_state, input_length, batch_size,
                    attention_probability_fn, dropout, alignment_history=True):
    if attention_probability_fn == 'softmax':
        probability_fn = tf.nn.softmax
        score_mask_value = float('-inf')
    elif attention_probability_fn == 'hardmax':
        probability_fn = tf.contrib.seq2seq.hardmax
        score_mask_value = float('-inf')
    elif attention_probability_fn == 'sparsemax':
        def sparsemax(attentionscores):
            attentionscores = tf.contrib.sparsemax.sparsemax(attentionscores)
            with tf.control_dependencies([
                    tf.assert_non_negative(attentionscores),
                    tf.assert_less_equal(attentionscores, 1., summarize=60)
            ]):
                return tf.identity(attentionscores)

        probability_fn = sparsemax
        # sparsemax does not deal with -inf properly, and has significant numerical stability
        # issues with large numbers (positive or negative)
        score_mask_value = -1e+5
    else:
        raise ValueError("Invalid attention_probability_fn " + str(attention_probability_fn))

    with tf.variable_scope('attention',
                           initializer=tf.initializers.identity(dtype=tf.float32)):
        attention = LuongAttention(int(cell_dec.output_size), enc_hidden_states,
                                   memory_sequence_length=input_length,
                                   probability_fn=probability_fn,
                                   score_mask_value=score_mask_value)
        cell_dec = AttentionWrapper(cell_dec, attention,
                                    cell_input_fn=lambda inputs, _: inputs,
                                    attention_layer_size=int(cell_dec.output_size),
                                    alignment_history=alignment_history,
                                    initial_cell_state=enc_final_state)
        enc_final_state = cell_dec.zero_state(batch_size, dtype=tf.float32)

    cell_dec = ActivationWrapper(cell_dec, activation=tf.tanh)
    cell_dec = NotBrokenDropoutWrapper(cell_dec, output_keep_prob=dropout)
    return cell_dec, enc_final_state
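# Hypothetical usage sketch (not part of the original code): the wrapped cell and initial
# state returned by apply_attention() can be fed directly into a training decoder.
# `dec_cell`, `enc_outputs`, `enc_state`, `input_length`, `batch_size`,
# `dec_embedded_inputs` and `dec_lengths` are assumed to come from the surrounding model.
cell_dec, dec_init_state = apply_attention(
    dec_cell, enc_outputs, enc_state, input_length, batch_size,
    attention_probability_fn='softmax', dropout=0.8)
helper = tf.contrib.seq2seq.TrainingHelper(dec_embedded_inputs, dec_lengths)
decoder = tf.contrib.seq2seq.BasicDecoder(cell_dec, helper, dec_init_state)
outputs, final_state, _ = tf.contrib.seq2seq.dynamic_decode(decoder)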
def add_decoder_op(self, enc_final_state, enc_hidden_states, output_embed_matrix, training):
    cell_dec = tf.contrib.rnn.MultiRNNCell([
        self.make_rnn_cell(i, True) for i in range(self.config.rnn_layers)
    ])

    encoder_hidden_size = int(enc_hidden_states.get_shape()[-1])
    decoder_hidden_size = int(cell_dec.output_size)

    # if encoder and decoder have different sizes, add a projection layer
    if encoder_hidden_size != decoder_hidden_size:
        assert False, (encoder_hidden_size, decoder_hidden_size)
        with tf.variable_scope('hidden_projection'):
            kernel = tf.get_variable('kernel',
                                     (encoder_hidden_size, decoder_hidden_size),
                                     dtype=tf.float32)
            # apply a relu to the projection for good measure
            enc_final_state = nest.map_structure(
                lambda x: tf.nn.relu(tf.matmul(x, kernel)), enc_final_state)
            enc_hidden_states = tf.nn.relu(
                tf.tensordot(enc_hidden_states, kernel, [[2], [1]]))
    else:
        # flatten and repack the state
        enc_final_state = nest.pack_sequence_as(cell_dec.state_size,
                                                nest.flatten(enc_final_state))

    cell_dec = ParentFeedingCellWrapper(cell_dec, enc_final_state)
    if self.config.apply_attention:
        attention = LuongAttention(self.config.decoder_hidden_size, enc_hidden_states,
                                   self.input_length_placeholder,
                                   probability_fn=tf.nn.softmax)
        cell_dec = AttentionWrapper(cell_dec, attention,
                                    cell_input_fn=lambda inputs, _: inputs,
                                    attention_layer_size=self.config.decoder_hidden_size,
                                    initial_cell_state=enc_final_state)
        enc_final_state = cell_dec.zero_state(self.batch_size, dtype=tf.float32)

    decoder = Seq2SeqDecoder(self.config, self.input_placeholder,
                             self.input_length_placeholder, self.output_placeholder,
                             self.output_length_placeholder, self.batch_number_placeholder)
    return decoder.decode(cell_dec, enc_final_state, self.config.grammar.output_size,
                          output_embed_matrix, training)
def add_decoder_op(self, enc_final_state, enc_hidden_states, output_embed_matrix, training):
    cell_dec = tf.contrib.rnn.MultiRNNCell([
        self.make_rnn_cell(i, for_decoder=True) for i in range(self.config.rnn_layers)
    ])

    encoder_hidden_size = int(enc_hidden_states.get_shape()[-1])
    decoder_hidden_size = int(cell_dec.output_size)

    # if encoder and decoder have different sizes, add a projection layer
    if encoder_hidden_size != decoder_hidden_size:
        assert False, (encoder_hidden_size, decoder_hidden_size)
        with tf.variable_scope('hidden_projection'):
            kernel = tf.get_variable('kernel',
                                     (encoder_hidden_size, decoder_hidden_size),
                                     dtype=tf.float32)
            # apply a relu to the projection for good measure
            enc_final_state = nest.map_structure(
                lambda x: tf.nn.relu(tf.matmul(x, kernel)), enc_final_state)
            enc_hidden_states = tf.nn.relu(
                tf.tensordot(enc_hidden_states, kernel, [[2], [1]]))
    else:
        # flatten and repack the state
        enc_final_state = nest.pack_sequence_as(cell_dec.state_size,
                                                nest.flatten(enc_final_state))

    # to use these we need to tile the final encoder state / the memory
    # but that conflicts with our use of cell_dec on untiled inputs for the gold
    #cell_dec = ParentFeedingCellWrapper(cell_dec, tf.contrib.seq2seq.tile_batch(enc_final_state, self.config.beam_size))
    if self.config.apply_attention and False:
        attention = LuongAttention(decoder_hidden_size, enc_hidden_states,
                                   self.input_length_placeholder,
                                   probability_fn=tf.nn.softmax)
        cell_dec = AttentionWrapper(cell_dec, attention,
                                    cell_input_fn=lambda inputs, _: inputs,
                                    attention_layer_size=decoder_hidden_size,
                                    initial_cell_state=enc_final_state)
        enc_final_state = cell_dec.zero_state(self.batch_size, dtype=tf.float32)

    print('enc_final_state', enc_final_state)
    linear_layer = tf_core_layers.Dense(self.config.output_size)
    go_vector = tf.ones((self.batch_size,), dtype=tf.int32) * self.config.grammar.start
    decoder = BeamSearchOptimizationDecoder(
        training, cell_dec, output_embed_matrix, go_vector, self.config.grammar.end,
        enc_final_state, beam_width=self.config.beam_size, output_layer=linear_layer,
        gold_sequence=self.output_placeholder if training else None,
        gold_sequence_length=(self.output_length_placeholder + 1) if training else None)

    if self.config.use_grammar_constraints:
        raise NotImplementedError("Grammar constraints are not implemented for the beam search yet")

    # dynamic_decode craps itself if we pass output_time_major=False, as it tries to transpose
    # the loss vector
    final_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder, output_time_major=True, maximum_iterations=self.config.max_length)
    return final_outputs
def build_decoder_cell(self, encoder_outputs, encoder_state):
    """Build the decoder cell.
    :param encoder_outputs:
    :param encoder_state:
    :return:
    """
    encoder_input_length = self.encoder_inputs_length
    batch_size = self.batch_size

    if self.bidirection:
        encoder_state = encoder_state[-self.depth:]

    if self.time_major:
        encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

    if self.use_beamsearch_decode:
        # tile copies of the encoder tensors for beam search
        encoder_outputs = seq2seq.tile_batch(
            encoder_outputs, multiplier=self.beam_width)
        encoder_state = seq2seq.tile_batch(
            encoder_state, multiplier=self.beam_width)
        encoder_input_length = seq2seq.tile_batch(
            self.encoder_inputs_length, multiplier=self.beam_width)
        batch_size *= self.beam_width

    if self.attention_type.lower() == 'luong':
        self.attention_mechanism = LuongAttention(
            num_units=self.hidden_size,
            memory=encoder_outputs,
            memory_sequence_length=encoder_input_length)
    else:
        self.attention_mechanism = BahdanauAttention(
            num_units=self.hidden_size,
            memory=encoder_outputs,
            memory_sequence_length=encoder_input_length)

    cell = MultiRNNCell([
        self.build_single_cell(self.hidden_size, use_residual=self.use_residual)
        for _ in range(self.depth)
    ])

    alignment_history = (self.mode != 'train' and not self.use_beamsearch_decode)

    def cell_input_fn(inputs, attention):
        if not self.use_residual:
            return array_ops.concat([inputs, attention], -1)
        attn_projection = layers.Dense(self.hidden_size,
                                       dtype=tf.float32,
                                       use_bias=False,
                                       name='attention_cell_input_fn')
        return attn_projection(array_ops.concat([inputs, attention], -1))

    cell = AttentionWrapper(
        cell=cell,
        attention_mechanism=self.attention_mechanism,
        attention_layer_size=self.hidden_size,
        alignment_history=alignment_history,
        cell_input_fn=cell_input_fn,
        name='Attention_Wrapper')

    decoder_initial_state = cell.zero_state(batch_size, tf.float32)
    # pass the encoder state to the decoder
    decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
    return cell, decoder_initial_state
def build_decoder_cell(self, encoder_outputs, encoder_state):
    """Build the decoder cell."""
    encoder_inputs_length = self.encoder_inputs_length
    batch_size = self.batch_size

    # The encoder state can be used to initialize the decoder; with a bidirectional
    # encoder only the last `depth` layers are needed.
    if self.bidirectional:
        encoder_state = encoder_state[-self.depth:]

    if self.time_major:
        encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

    # When using BeamSearchDecoder, several tensors must be tiled beam_width times.
    if self.use_beamsearch_decode:
        # tile encoder_outputs `multiplier` times
        encoder_outputs = seq2seq.tile_batch(
            encoder_outputs, multiplier=self.beam_width)
        encoder_state = seq2seq.tile_batch(
            encoder_state, multiplier=self.beam_width)
        encoder_inputs_length = seq2seq.tile_batch(
            self.encoder_inputs_length, multiplier=self.beam_width)
        # with beam search the effective batch size is beam_width * batch_size
        batch_size *= self.beam_width

    # Two different attention mechanisms
    # https://blog.csdn.net/u010960155/article/details/82853632
    if self.attention_type.lower() == 'luong':
        # 'Luong' style attention: https://arxiv.org/abs/1508.04025
        self.attention_mechanism = LuongAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)
    else:
        # Default Bahdanau
        # 'Bahdanau' style attention: https://arxiv.org/abs/1409.0473
        self.attention_mechanism = BahdanauAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)

    # Build the multi-layer RNN.
    cell = MultiRNNCell([
        self.build_single_cell(self.hidden_units, use_residual=self.use_residual)
        for _ in range(self.depth)
    ])

    # Keep the attention (alignment) history only in inference mode and when beam search
    # is disabled.
    alignment_history = (self.mode != 'train' and not self.use_beamsearch_decode)

    def cell_input_fn(inputs, attention):
        """Decide, based on attn_input_feeding, whether to project before the attention
        computation; when residual connections are used, a projection is required first.
        """
        # no residual connections: just concatenate
        if not self.use_residual:
            return array_ops.concat([inputs, attention], -1)
        # with residual connections, project first
        attn_projection = layers.Dense(self.hidden_units,
                                       dtype=tf.float32,
                                       use_bias=False,
                                       name='attention_cell_input_fn')
        return attn_projection(array_ops.concat([inputs, attention], -1))

    # Wrap the base cell with the attention mechanism.
    cell = AttentionWrapper(
        cell=cell,
        attention_mechanism=self.attention_mechanism,  # the attention mechanism
        attention_layer_size=self.hidden_units,        # number of hidden units
        alignment_history=alignment_history,
        cell_input_fn=cell_input_fn,                   # how inputs are fed to the cell
        name='Attention_Wrapper')

    # Start from a zero decoder state ...
    decoder_initial_state = cell.zero_state(batch_size, tf.float32)
    # ... then initialize it from the encoder state (dtypes must match).
    decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)

    # Return the decoder cell and its initial state.
    return cell, decoder_initial_state
def initialize( self, inputs, input_lengths, num_speakers, speaker_id, mel_targets=None, linear_targets=None, loss_coeff=None, rnn_decoder_test_mode=False, is_randomly_initialized=False, ): is_training2 = linear_targets is not None # test에서 이게 True로 되는데, 이게 의도한 것인가??? is_training = not rnn_decoder_test_mode self.is_randomly_initialized = is_randomly_initialized with tf.variable_scope('inference') as scope: hp = self._hparams batch_size = tf.shape(inputs)[0] # Embeddings(256) char_embed_table = tf.get_variable( 'embedding', [len(symbols), hp.embedding_size], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) zero_pad = True if zero_pad: # transformer에 구현되어 있는 거 보고, 가져온 로직. # <PAD> 0 은 embedding이 0으로 고정되고, train으로 변하지 않는다. 즉, 위의 get_variable에서 잡았던 변수의 첫번째 행(<PAD>)에 대응되는 것은 사용되지 않는 것이다) char_embed_table = tf.concat( (tf.zeros(shape=[1, hp.embedding_size]), char_embed_table[1:, :]), 0) # [N, T_in, embedding_size] char_embedded_inputs = tf.nn.embedding_lookup( char_embed_table, inputs) self.num_speakers = num_speakers if self.num_speakers > 1: if hp.speaker_embedding_size != 1: # speaker_embedding_size = f(16) speaker_embed_table = tf.get_variable( 'speaker_embedding', [self.num_speakers, hp.speaker_embedding_size], dtype=tf.float32, initializer=tf.truncated_normal_initializer( stddev=0.5)) # [N, T_in, speaker_embedding_size] speaker_embed = tf.nn.embedding_lookup( speaker_embed_table, speaker_id) if hp.model_type == 'deepvoice': if hp.speaker_embedding_size == 1: before_highway = get_embed( speaker_id, self.num_speakers, hp.enc_prenet_sizes[-1], "before_highway" ) # 'enc_prenet_sizes': [f(256), f(128)] encoder_rnn_init_state = get_embed( speaker_id, self.num_speakers, hp.enc_rnn_size * 2, "encoder_rnn_init_state") attention_rnn_init_state = get_embed( speaker_id, self.num_speakers, hp.attention_state_size, "attention_rnn_init_state") decoder_rnn_init_states = [ get_embed( speaker_id, self.num_speakers, hp.dec_rnn_size, "decoder_rnn_init_states{}".format(idx + 1)) for idx in range(hp.dec_layer_num) ] else: deep_dense = lambda x, dim: tf.layers.dense( x, dim, activation=tf.nn.softsign ) # softsign: x / (abs(x) + 1) before_highway = deep_dense(speaker_embed, hp.enc_prenet_sizes[-1]) encoder_rnn_init_state = deep_dense( speaker_embed, hp.enc_rnn_size * 2) attention_rnn_init_state = deep_dense( speaker_embed, hp.attention_state_size) decoder_rnn_init_states = [ deep_dense(speaker_embed, hp.dec_rnn_size) for _ in range(hp.dec_layer_num) ] speaker_embed = None # deepvoice does not use speaker_embed directly elif hp.model_type == 'simple': # simple model은 speaker_embed를 DecoderPrenetWrapper,ConcatOutputAndAttentionWrapper에 각각 넣어서 concat하는 방식이다. before_highway = None encoder_rnn_init_state = None attention_rnn_init_state = None decoder_rnn_init_states = None else: raise Exception( " [!] 
Unkown multi-speaker model type: {}".format( hp.model_type)) else: # self.num_speakers =1인 경우 speaker_embed = None before_highway = None encoder_rnn_init_state = None # bidirectional GRU의 init state attention_rnn_init_state = None decoder_rnn_init_states = None ############## # Encoder ############## # [N, T_in, enc_prenet_sizes[-1]] prenet_outputs = prenet( char_embedded_inputs, is_training, hp.enc_prenet_sizes, hp.dropout_prob, scope='prenet' ) # 'enc_prenet_sizes': [f(256), f(128)], dropout_prob = 0.5 # ==> (N, T_in, 128) # enc_rnn_size = 128 encoder_outputs = cbhg( prenet_outputs, input_lengths, is_training, hp.enc_bank_size, hp.enc_bank_channel_size, hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size, hp.enc_proj_sizes, hp.enc_proj_width, scope="encoder_cbhg", before_highway=before_highway, encoder_rnn_init_state=encoder_rnn_init_state) ############## # Attention ############## # For manaul control of attention self.is_manual_attention = tf.placeholder( tf.bool, shape=(), name='is_manual_attention', ) self.manual_alignments = tf.placeholder( tf.float32, shape=[None, None, None], name="manual_alignments", ) # single: attention_size = 128 if hp.attention_type == 'bah_mon': attention_mechanism = BahdanauMonotonicAttention( hp.attention_size, encoder_outputs, memory_sequence_length=input_lengths, normalize=False) elif hp.attention_type == 'bah_mon_norm': # hccho 추가 attention_mechanism = BahdanauMonotonicAttention( hp.attention_size, encoder_outputs, memory_sequence_length=input_lengths, normalize=True) elif hp.attention_type == 'loc_sen': # Location Sensitivity Attention attention_mechanism = LocationSensitiveAttention( hp.attention_size, encoder_outputs, memory_sequence_length=input_lengths) elif hp.attention_type == 'gmm': # GMM Attention attention_mechanism = GmmAttention( hp.attention_size, memory=encoder_outputs, memory_sequence_length=input_lengths) elif hp.attention_type == 'bah_mon_norm_hccho': attention_mechanism = BahdanauMonotonicAttention_hccho( hp.attention_size, encoder_outputs, normalize=True) elif hp.attention_type == 'bah_norm': attention_mechanism = BahdanauAttention( hp.attention_size, encoder_outputs, memory_sequence_length=input_lengths, normalize=True) elif hp.attention_type == 'luong_scaled': attention_mechanism = LuongAttention( hp.attention_size, encoder_outputs, memory_sequence_length=input_lengths, scale=True) elif hp.attention_type == 'luong': attention_mechanism = LuongAttention( hp.attention_size, encoder_outputs, memory_sequence_length=input_lengths) elif hp.attention_type == 'bah': attention_mechanism = BahdanauAttention( hp.attention_size, encoder_outputs, memory_sequence_length=input_lengths) else: raise Exception(" [!] Unkown attention type: {}".format( hp.attention_type)) # DecoderPrenetWrapper, attention_mechanism을 결합하여 AttentionWrapper를 만든다. # carpedm20은 tensorflow 소스를코드를 가져와서 AttentionWrapper를 새로 구현했지만, keith Ito는 tensorflow AttentionWrapper를 그냥 사용했다. attention_cell = AttentionWrapper( GRUCell(hp.attention_state_size), attention_mechanism, self.is_manual_attention, self.manual_alignments, initial_cell_state=attention_rnn_init_state, alignment_history=True, output_attention=False ) # output_attention=False 에 주목, attention_layer_size에 값을 넣지 않았다. 그래서 attention = contex vector가 된다. 
# attention_state_size = 256 dec_prenet_outputs = DecoderPrenetWrapper( attention_cell, speaker_embed, is_training, hp.dec_prenet_sizes, hp.dropout_prob) # dec_prenet_sizes = [f(256), f(128)] # Concatenate attention context vector and RNN cell output into a 512D vector. # [N, T_in, attention_size+attention_state_size] #dec_prenet_outputs의 다음 cell에 전달하는 AttentionWrapperState의 member (attention,cell_state, ...)에서 attention과 output을 concat하여 output으로 내보낸다. # output이 output은 cell_state와 같기 때문에, concat [ output(=cell_state) | attention ] concat_cell = ConcatOutputAndAttentionWrapper( dec_prenet_outputs, embed_to_concat=speaker_embed ) # concat(output,attention,speaker_embed)해서 새로운 output을 만든다. # Decoder (layers specified bottom to top): dec_rnn_size= 256 cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size) ] # OutputProjectionWrapper는 논문에 언급이 없는 것 같은데... for _ in range(hp.dec_layer_num): # hp.dec_layer_num = 2 cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size))) # [N, T_in, 256] decoder_cell = MultiRNNCell(cells, state_is_tuple=True) # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, hp.num_mels * hp.reduction_factor ) # 여기에 stop token도 나올 수 있도록...수정하면 되지 않을까??? (hp.num_mels+1) * hp.reduction_factor decoder_init_state = output_cell.zero_state( batch_size=batch_size, dtype=tf.float32 ) # 여기서 zero_state를 부르면, 위의 AttentionWrapper에서 이미 넣은 준 값도 포함되어 있다. if hp.model_type == "deepvoice": # decoder_init_state[0] : AttentionWrapperState # = cell_state + attention + time + alignments + alignment_history # decoder_init_state[0][0] = attention_rnn_init_state (already applied: AttentionWrapper의 initial_cell_state를 이미 넣어 주었다. ) decoder_init_state = list(decoder_init_state) for idx, cell in enumerate(decoder_rnn_init_states): shape1 = decoder_init_state[idx + 1].get_shape().as_list() shape2 = cell.get_shape().as_list() if shape1 != shape2: raise Exception( " [!] Shape {} and {} should be equal".format( shape1, shape2)) decoder_init_state[idx + 1] = cell decoder_init_state = tuple(decoder_init_state) if is_training2: # rnn_decoder_test_mode = True if test mode, train mode에서는 False helper = TacoTrainingHelper( inputs, mel_targets, hp.num_mels, hp.reduction_factor, rnn_decoder_test_mode) # inputs은 batch_size 계산에만 사용됨 else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor) (decoder_outputs, _), final_decoder_state, _ = \ tf.contrib.seq2seq.dynamic_decode(BasicDecoder(output_cell, helper, decoder_init_state),maximum_iterations=hp.max_iters) # max_iters=200 # [N, T_out, M] mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels]) # Add post-processing CBHG: # [N, T_out, 256] #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training) post_outputs = cbhg(mel_outputs, None, is_training, hp.post_bank_size, hp.post_bank_channel_size, hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size, hp.post_proj_sizes, hp.post_proj_width, scope='post_cbhg') if speaker_embed is not None and hp.model_type == 'simple': expanded_speaker_emb = tf.expand_dims(speaker_embed, [1]) tiled_speaker_embedding = tf.tile( expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1]) # [N, T_out, 256 + alpha] post_outputs = tf.concat( [tiled_speaker_embedding, post_outputs], axis=-1) linear_outputs = tf.layers.dense( post_outputs, hp.num_freq) # [N, T_out, F(1025)] # Grab alignments from the final decoder state: # MultiRNNCell이 3단이기 때문에, final_decoder_state는 len 3 tuple이다. 
==> final_decoder_state[0] alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0 ]) # batch_size, text length(encoder), target length(decoder) self.inputs = inputs self.speaker_id = speaker_id self.input_lengths = input_lengths self.loss_coeff = loss_coeff self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets self.final_decoder_state = final_decoder_state log('=' * 40) log(' model_type: %s' % hp.model_type) log('=' * 40) log('Initialized Tacotron model. Dimensions: ') log(' embedding: %d' % char_embedded_inputs.shape[-1]) if speaker_embed is not None: log(' speaker embedding: %d' % speaker_embed.shape[-1]) else: log(' speaker embedding: None') log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
def build_decoder_cell(self, encoder_outputs, encoder_state):
    """Build the decoder cell."""
    encoder_inputs_length = self.encoder_inputs_length
    batch_size = self.batch_size

    if self.bidirectional:
        encoder_state = encoder_state[-self.depth:]

    if self.time_major:
        encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

    # When using BeamSearchDecoder, several tensors must be tiled by beam_width:
    # encoder_outputs, encoder_state, encoder_inputs_length
    # needs to be tiled so that:
    # [batch_size, .., ..] -> [batch_size x beam_width, .., ..]
    if self.use_beamsearch_decode:
        encoder_outputs = seq2seq.tile_batch(encoder_outputs,
                                             multiplier=self.beam_width)
        encoder_state = seq2seq.tile_batch(encoder_state,
                                           multiplier=self.beam_width)
        encoder_inputs_length = seq2seq.tile_batch(
            self.encoder_inputs_length, multiplier=self.beam_width)
        # with beam search the effective batch size is beam_width * batch_size
        batch_size *= self.beam_width

    # Two different attention mechanisms
    if self.attention_type.lower() == 'luong':
        # 'Luong' style attention: https://arxiv.org/abs/1508.04025
        self.attention_mechanism = LuongAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)
    else:
        # Default Bahdanau
        # 'Bahdanau' style attention: https://arxiv.org/abs/1409.0473
        self.attention_mechanism = BahdanauAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)

    # Building decoder_cell
    cell = MultiRNNCell([
        self.build_single_cell(self.hidden_units, use_residual=self.use_residual)
        for _ in range(self.depth)
    ])

    # Keep the attention (alignment) history only in inference mode and when beam search
    # is disabled.
    alignment_history = (self.mode != 'train' and not self.use_beamsearch_decode)

    def cell_input_fn(inputs, attention):
        """Decide, based on attn_input_feeding, whether to project before the attention computation."""
        if not self.use_residual:
            return array_ops.concat([inputs, attention], -1)
        attn_projection = layers.Dense(self.hidden_units,
                                       dtype=tf.float32,
                                       use_bias=False,
                                       name='attention_cell_input_fn')
        return attn_projection(array_ops.concat([inputs, attention], -1))

    cell = AttentionWrapper(cell=cell,
                            attention_mechanism=self.attention_mechanism,
                            attention_layer_size=self.hidden_units,
                            alignment_history=alignment_history,
                            cell_input_fn=cell_input_fn,
                            name='Attention_Wrapper')

    # zero state ...
    decoder_initial_state = cell.zero_state(batch_size, tf.float32)
    # ... initialized from the encoder state
    decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)

    # if self.use_beamsearch_decode:
    #     decoder_initial_state = seq2seq.tile_batch(
    #         decoder_initial_state, multiplier=self.beam_width)

    return cell, decoder_initial_state
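# Illustrative sketch only (not part of the original model): how the cell and initial
# state returned by build_decoder_cell() are typically consumed, assuming `seq2seq` is
# tf.contrib.seq2seq. Names such as `self.embeddings`, `self.decoder_inputs_embedded`,
# `self.decoder_inputs_length`, `self.start_token`, `self.end_token`, `self.vocab_size`
# and `self.max_decode_step` are assumptions, named here for illustration.
cell, decoder_initial_state = self.build_decoder_cell(encoder_outputs, encoder_state)
output_layer = tf.layers.Dense(self.vocab_size, name='decoder_output_projection')
if not self.use_beamsearch_decode:
    helper = seq2seq.TrainingHelper(self.decoder_inputs_embedded,
                                    self.decoder_inputs_length)
    decoder = seq2seq.BasicDecoder(cell, helper, decoder_initial_state, output_layer)
else:
    start_tokens = tf.fill([self.batch_size], self.start_token)
    decoder = seq2seq.BeamSearchDecoder(cell=cell,
                                        embedding=self.embeddings,
                                        start_tokens=start_tokens,
                                        end_token=self.end_token,
                                        initial_state=decoder_initial_state,
                                        beam_width=self.beam_width,
                                        output_layer=output_layer)
outputs, final_state, _ = seq2seq.dynamic_decode(decoder,
                                                 maximum_iterations=self.max_decode_step)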
def initialize(self,
               inputs,
               input_lengths,
               num_speakers,
               speaker_id=None,
               mel_targets=None,
               linear_targets=None,
               is_training=False,
               loss_coeff=None,
               stop_token_targets=None):

    with tf.variable_scope('Eembedding') as scope:
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]

        # Embeddings(256)
        char_embed_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        zero_pad = True
        if zero_pad:
            # Logic borrowed from a transformer implementation:
            # <PAD> (index 0) is pinned to a zero embedding and is never trained, i.e. the
            # first row of the variable created above (corresponding to <PAD>) is not used.
            char_embed_table = tf.concat(
                (tf.zeros(shape=[1, hp.embedding_size]), char_embed_table[1:, :]), 0)

        # [N, T_in, embedding_size]
        char_embedded_inputs = tf.nn.embedding_lookup(char_embed_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            speaker_embed_table = tf.get_variable(
                'speaker_embedding',
                [self.num_speakers, hp.speaker_embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            # [N, T_in, speaker_embedding_size]
            speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

            deep_dense = lambda x, dim, name: tf.layers.dense(
                x, dim, activation=tf.nn.softsign, name=name)  # softsign: x / (abs(x) + 1)

            encoder_rnn_init_state = deep_dense(
                speaker_embed, hp.encoder_lstm_units * 4,
                'encoder_init_dense')  # hp.encoder_lstm_units = 256
            decoder_rnn_init_states = [
                deep_dense(speaker_embed, hp.decoder_lstm_units * 2,
                           'decoder_init_dense_{}'.format(i))
                for i in range(hp.decoder_layers)
            ]  # hp.decoder_lstm_units = 1024

            speaker_embed = None
        else:  # single-speaker case (self.num_speakers == 1)
            speaker_embed = None
            encoder_rnn_init_state = None  # init state of the bidirectional GRU
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

    with tf.variable_scope('Encoder') as scope:
        ##############
        # Encoder
        ##############
        x = char_embedded_inputs
        for i in range(hp.enc_conv_num_layers):
            x = tf.layers.conv1d(x,
                                 filters=hp.enc_conv_channels,
                                 kernel_size=hp.enc_conv_kernel_size,
                                 padding='same',
                                 activation=tf.nn.relu,
                                 name='Encoder_{}'.format(i))
            x = tf.layers.batch_normalization(x, training=is_training)
            x = tf.layers.dropout(x,
                                  rate=hp.dropout_prob,
                                  training=is_training,
                                  name='dropout_{}'.format(i))

        if encoder_rnn_init_state is not None:
            initial_state_fw_c, initial_state_fw_h, initial_state_bw_c, initial_state_bw_h = tf.split(
                encoder_rnn_init_state, 4, 1)
            initial_state_fw = LSTMStateTuple(initial_state_fw_c, initial_state_fw_h)
            initial_state_bw = LSTMStateTuple(initial_state_bw_c, initial_state_bw_h)
        else:  # single mode
            initial_state_fw, initial_state_bw = None, None

        cell_fw = ZoneoutLSTMCell(hp.encoder_lstm_units,
                                  is_training,
                                  zoneout_factor_cell=hp.tacotron_zoneout_rate,
                                  zoneout_factor_output=hp.tacotron_zoneout_rate,
                                  name='encoder_fw_LSTM')
        cell_bw = ZoneoutLSTMCell(hp.encoder_lstm_units,
                                  is_training,
                                  zoneout_factor_cell=hp.tacotron_zoneout_rate,
                                  zoneout_factor_output=hp.tacotron_zoneout_rate,
                                  name='encoder_bw_LSTM')

        encoder_conv_output = x
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw,
            cell_bw,
            encoder_conv_output,
            sequence_length=input_lengths,
            initial_state_fw=initial_state_fw,
            initial_state_bw=initial_state_bw,
            dtype=tf.float32)

        # encoder_outputs = [N, T, 2*encoder_lstm_units] = [N, T, 512]
        encoder_outputs = tf.concat(outputs, axis=2)  # Concat and return forward + backward outputs

    with tf.variable_scope('Decoder') as scope:
        ##############
        # Attention
        ##############
        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths,
                normalize=False)
        elif hp.attention_type == 'bah_mon_norm':  # added by hccho
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths,
                normalize=True)
        elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_size,
                encoder_outputs,
                hparams=hp,
                is_training=is_training,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
        elif hp.attention_type == 'gmm':  # GMM Attention
            attention_mechanism = GmmAttention(
                hp.attention_size,
                memory=encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths,
                normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths,
                scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths)
        else:
            raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

        decoder_lstm = [
            ZoneoutLSTMCell(hp.decoder_lstm_units,
                            is_training,
                            zoneout_factor_cell=hp.tacotron_zoneout_rate,
                            zoneout_factor_output=hp.tacotron_zoneout_rate,
                            name='decoder_LSTM_{}'.format(i + 1))
            for i in range(hp.decoder_layers)
        ]
        decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm, state_is_tuple=True)

        # Calling zero_state here also includes the values already supplied to the
        # AttentionWrapper.
        decoder_init_state = decoder_lstm.zero_state(batch_size=batch_size, dtype=tf.float32)

        if hp.model_type == "multi-speaker":
            decoder_init_state = list(decoder_init_state)
            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx][0].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1[1] * 2 != shape2[1]:
                    raise Exception(" [!] Shape {} and {} should be equal".format(shape1, shape2))
                c, h = tf.split(cell, 2, 1)
                decoder_init_state[idx] = LSTMStateTuple(c, h)
            decoder_init_state = tuple(decoder_init_state)

        attention_cell = AttentionWrapper(
            decoder_lstm,
            attention_mechanism,
            initial_cell_state=decoder_init_state,
            alignment_history=True,
            output_attention=False)
        # Note output_attention=False and that attention_layer_size is not set,
        # so the attention output is the raw context vector.

        # attention_state_size = 256
        # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
        dec_prenet_outputs = DecoderWrapper(attention_cell, is_training,
                                            hp.dec_prenet_sizes, hp.dropout_prob,
                                            hp.inference_prenet_dropout)

        dec_outputs_cell = OutputProjectionWrapper(
            dec_prenet_outputs, (hp.num_mels + 1) * hp.reduction_factor)

        if is_training:
            helper = TacoTrainingHelper(mel_targets, hp.num_mels,
                                        hp.reduction_factor)  # inputs is only used to compute batch_size
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor)

        decoder_init_state = dec_outputs_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
            maximum_iterations=int(hp.max_n_frame / hp.reduction_factor))  # max_iters=200

        decoder_mel_outputs = tf.reshape(
            decoder_outputs[:, :, :hp.num_mels * hp.reduction_factor],
            [batch_size, -1, hp.num_mels])  # [N, iters, 400] -> [N, 5*iters, 80]
        stop_token_outputs = tf.reshape(
            decoder_outputs[:, :, hp.num_mels * hp.reduction_factor:],
            [batch_size, -1])  # [N, iters]

        # Postnet
        x = decoder_mel_outputs
        for i in range(hp.postnet_num_layers):
            activation = tf.nn.tanh if i != (hp.postnet_num_layers - 1) else None
            x = tf.layers.conv1d(x,
                                 filters=hp.postnet_channels,
                                 kernel_size=hp.postnet_kernel_size,
                                 padding='same',
                                 activation=activation,
                                 name='Postnet_{}'.format(i))
            x = tf.layers.batch_normalization(x, training=is_training)
            x = tf.layers.dropout(x,
                                  rate=hp.dropout_prob,
                                  training=is_training,
                                  name='Postnet_dropout_{}'.format(i))

        residual = tf.layers.dense(x, hp.num_mels, name='residual_projection')
        mel_outputs = decoder_mel_outputs + residual

        # Add post-processing CBHG:
        # mel_outputs: (N, T, num_mels)
        post_outputs = cbhg(mel_outputs,
                            None,
                            is_training,
                            hp.post_bank_size,
                            hp.post_bank_channel_size,
                            hp.post_maxpool_width,
                            hp.post_highway_depth,
                            hp.post_rnn_size,
                            hp.post_proj_sizes,
                            hp.post_proj_width,
                            scope='post_cbhg')

        linear_outputs = tf.layers.dense(
            post_outputs, hp.num_freq,
            name='linear_spectogram_projection')  # [N, T_out, F(1025)]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(
            final_decoder_state.alignment_history.stack(),
            [1, 2, 0])  # batch_size, text length (encoder), target length (decoder)

    self.inputs = inputs
    self.speaker_id = speaker_id
    self.input_lengths = input_lengths
    self.loss_coeff = loss_coeff
    self.decoder_mel_outputs = decoder_mel_outputs
    self.mel_outputs = mel_outputs
    self.linear_outputs = linear_outputs
    self.alignments = alignments
    self.mel_targets = mel_targets
    self.linear_targets = linear_targets
    self.final_decoder_state = final_decoder_state
    self.stop_token_targets = stop_token_targets
    self.stop_token_outputs = stop_token_outputs
    self.all_vars = tf.trainable_variables()

    log('=' * 40)
    log(' model_type: %s' % hp.model_type)
    log('=' * 40)
    log('Initialized Tacotron model. Dimensions: ')
    log(' embedding: %d' % char_embedded_inputs.shape[-1])
    log(' encoder conv out: %d' % encoder_conv_output.shape[-1])
    log(' encoder out: %d' % encoder_outputs.shape[-1])
    log(' attention out: %d' % attention_cell.output_size)
    log(' decoder prenet lstm concat out : %d' % dec_prenet_outputs.output_size)
    log(' decoder cell out: %d' % dec_outputs_cell.output_size)
    log(' decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
    log(' decoder mel out: %d' % decoder_mel_outputs.shape[-1])
    log(' mel out: %d' % mel_outputs.shape[-1])
    log(' postnet out: %d' % post_outputs.shape[-1])
    log(' linear out: %d' % linear_outputs.shape[-1])
    log(' Tacotron Parameters {:.3f} Million.'.format(
        np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
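# Hypothetical usage sketch (not part of the original file): after initialize() has run,
# the attention alignments collected via alignment_history can be fetched and saved as a
# quick sanity check. `session`, `feed_dict`, `step` and `plot_alignment` are assumed to
# exist in the surrounding training script.
alignment, = session.run([model.alignments], feed_dict=feed_dict)  # [N, T_in, T_out]
plot_alignment(alignment[0], path='alignment_step_{}.png'.format(step))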
def add_decoder_op(self, enc_final_state, enc_hidden_states, output_embed_matrix, training): original_enc_final_state = enc_final_state flat_enc_final_state = nest.flatten(enc_final_state) enc_final_state = tf.concat(flat_enc_final_state, axis=1) enc_final_size = int(enc_final_state.get_shape()[1]) part_logit_preds = dict() part_token_preds = dict() part_logit_sequence_preds = dict() part_token_sequence_preds = dict() part_layers = [] grammar = self.config.grammar for i, part in enumerate(('trigger', 'query', 'action')): with tf.variable_scope('decode_function_' + part): activation = getattr( tf.nn, self.config.function_nonlinearity) if hasattr( tf.nn, self.config.function_nonlinearity) else getattr( tf, self.config.function_nonlinearity) layer = tf.contrib.layers.fully_connected( enc_final_state, self.config.function_hidden_size, activation_fn=activation) part_layers.append(layer) layer_with_dropout = tf.nn.dropout( layer, keep_prob=self.dropout_placeholder, seed=443 * i) part_logit_preds[part] = tf.layers.dense( layer_with_dropout, len(grammar.functions[part])) part_token_preds[part] = tf.cast(tf.argmax( part_logit_preds[part], axis=1), dtype=tf.int32) first_value_token = grammar.num_functions + grammar.num_begin_tokens + grammar.num_control_tokens num_value_tokens = grammar.output_size - first_value_token output_embed_matrix = tf.concat( (output_embed_matrix[0:grammar.num_control_tokens], output_embed_matrix[first_value_token:]), axis=0) adjusted_trigger = part_token_preds['trigger'] + ( grammar.num_control_tokens + grammar.num_begin_tokens) adjusted_query = part_token_preds['query'] + ( grammar.num_control_tokens + grammar.num_begin_tokens + len(grammar.functions['trigger'])) adjusted_action = part_token_preds['action'] + ( grammar.num_control_tokens + grammar.num_begin_tokens + len(grammar.functions['trigger']) + len(grammar.functions['query'])) layer_concat = tf.concat(part_layers, axis=1) for i, part in enumerate(('trigger', 'query', 'action')): with tf.variable_scope('decode_sequence_' + part): def one_decoder_input(i, like): with tf.variable_scope(str(i)): return tf.layers.dense(layer_concat, like.get_shape()[1]) flat_decoder_initial_state = [ one_decoder_input(i, like) for i, like in enumerate(flat_enc_final_state) ] decoder_initial_state = nest.pack_sequence_as( original_enc_final_state, flat_decoder_initial_state) cell_dec = tf.contrib.rnn.MultiRNNCell([ self.make_rnn_cell(i, True) for i in range(self.config.rnn_layers) ]) # uncompress function tokens (to look them up in the grammar) if training: adjusted_function_token = self.part_function_placeholders[ part] else: if part == 'trigger': adjusted_function_token = adjusted_trigger elif part == 'query': adjusted_function_token = adjusted_query elif part == 'action': adjusted_function_token = adjusted_action # adjust the sequence to "skip" function tokens output_size = grammar.num_control_tokens + num_value_tokens output = self.part_sequence_placeholders[part] adjusted_output = tf.where( output >= grammar.num_control_tokens, output - (first_value_token - grammar.num_control_tokens), output) if self.config.apply_attention: attention = LuongAttention(self.config.decoder_hidden_size, enc_hidden_states, self.input_length_placeholder, probability_fn=tf.nn.softmax) cell_dec = AttentionWrapper( cell_dec, attention, cell_input_fn=lambda inputs, _: inputs, attention_layer_size=self.config.decoder_hidden_size, initial_cell_state=decoder_initial_state) decoder_initial_state = cell_dec.zero_state( self.batch_size, dtype=tf.float32) decoder = 
Seq2SeqDecoder( self.config, self.input_placeholder, self.input_length_placeholder, adjusted_output, self.part_sequence_length_placeholders[part], self.batch_number_placeholder, max_length=MAX_PRIMITIVE_LENGTH) rnn_output, sample_ids = decoder.decode( cell_dec, decoder_initial_state, output_size, output_embed_matrix, training, grammar_helper=PrimitiveSequenceGrammarHelper( grammar, adjusted_function_token)) part_logit_sequence_preds[part] = rnn_output part_token_sequence_preds[part] = tf.cast(sample_ids, dtype=tf.int32) with tf.variable_scope('top_classifier'): top_hidden = tf.contrib.layers.fully_connected( enc_final_state, self.config.first_token_hidden_size, activation_fn=tf.tanh) top_hidden_with_dropout = tf.nn.dropout( top_hidden, keep_prob=self.dropout_placeholder, seed=127) top_logits = tf.layers.dense(top_hidden_with_dropout, grammar.num_begin_tokens) top_token = tf.cast(tf.argmax(top_logits, axis=1), dtype=tf.int32) with tf.variable_scope('decode_special'): output_size = grammar.num_control_tokens + num_value_tokens output = self.special_label_placeholder adjusted_output = tf.where( output >= grammar.num_control_tokens, output - (first_value_token - grammar.num_control_tokens), output) cell_dec = tf.contrib.rnn.MultiRNNCell([ self.make_rnn_cell(i, True) for i in range(self.config.rnn_layers) ]) sequence_length = tf.ones( (self.batch_size, ), dtype=tf.int32) * MAX_SPECIAL_LENGTH decoder_initial_state = original_enc_final_state if self.config.apply_attention: attention = LuongAttention(self.config.decoder_hidden_size, enc_hidden_states, self.input_length_placeholder, probability_fn=tf.nn.softmax) cell_dec = AttentionWrapper( cell_dec, attention, cell_input_fn=lambda inputs, _: inputs, attention_layer_size=self.config.decoder_hidden_size, initial_cell_state=original_enc_final_state) decoder_initial_state = cell_dec.zero_state(self.batch_size, dtype=tf.float32) decoder = Seq2SeqDecoder(self.config, self.input_placeholder, self.input_length_placeholder, adjusted_output, sequence_length, self.batch_number_placeholder, max_length=MAX_SPECIAL_LENGTH) rnn_output, sample_ids = decoder.decode( cell_dec, decoder_initial_state, output_size, output_embed_matrix, training, grammar_helper=SpecialSequenceGrammarHelper(grammar)) logit_special_sequence = rnn_output token_special_sequence = tf.cast(sample_ids, dtype=tf.int32) # adjust tokens back to their output code adjusted_top = tf.expand_dims(top_token + grammar.num_control_tokens, axis=1) adjusted_special_sequence = tf.where( token_special_sequence >= grammar.num_control_tokens, token_special_sequence + (first_value_token - grammar.num_control_tokens), token_special_sequence) adjusted_token_sequences = dict() for part in ('trigger', 'query', 'action'): token_sequence = part_token_sequence_preds[part] adjusted_token_sequence = tf.where( token_sequence >= grammar.num_control_tokens, token_sequence + (first_value_token - grammar.num_control_tokens), token_sequence) adjusted_token_sequences[part] = adjusted_token_sequence # remove EOS from the middle of the sentence adjusted_token_sequences['trigger'] = tf.where( tf.equal(adjusted_token_sequences['trigger'], grammar.end), tf.zeros_like(adjusted_token_sequences['trigger']), adjusted_token_sequences['trigger']) adjusted_token_sequences['query'] = tf.where( tf.equal(adjusted_token_sequences['query'], grammar.end), tf.zeros_like(adjusted_token_sequences['query']), adjusted_token_sequences['query']) adjusted_trigger = tf.expand_dims(adjusted_trigger, axis=1) adjusted_query = 
tf.expand_dims(adjusted_query, axis=1) adjusted_action = tf.expand_dims(adjusted_action, axis=1) program_sequence = tf.concat( (adjusted_top, adjusted_trigger, adjusted_token_sequences['trigger'], adjusted_query, adjusted_token_sequences['query'], adjusted_action, adjusted_token_sequences['action']), axis=1) full_special_sequence = tf.concat( (adjusted_top, adjusted_special_sequence), axis=1) # full special sequence is smaller than program sequence, so we need to pad it all the way to the same shape full_special_sequence = pad_up_to(full_special_sequence, tf.shape(program_sequence)[1], rank=1) rule_token = grammar.dictionary['rule'] - grammar.num_control_tokens full_sequence = tf.where(tf.equal(top_token, rule_token), program_sequence, full_special_sequence) return ThreePartAlignerResult(top_logits, part_logit_preds, part_logit_sequence_preds, logit_special_sequence, full_sequence)
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, stop_token_targets=None): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. ''' with tf.variable_scope('inference') as scope: is_training = linear_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup( embedding_table, inputs) # [N, T_in, embed_depth=256] with tf.variable_scope('Encoder') as scope: x = embedded_inputs #3 Conv Layers for i in range(3): x = tf.layers.conv1d(x, filters=512, kernel_size=5, padding='same', activation=tf.nn.relu, name='Encoder_{}'.format(i)) x = tf.layers.batch_normalization(x, training=is_training) x = tf.layers.dropout(x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) encoder_conv_output = x #bi-directional LSTM cell_fw = ZoneoutLSTMCell(256, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1, name='encoder_fw_LSTM') cell_bw = ZoneoutLSTMCell(256, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1, name='encoder_bw_LSTM') outputs, states = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, encoder_conv_output, sequence_length=input_lengths, dtype=tf.float32) # envoder_outpust = [N,T,2*encoder_lstm_units] = [N,T,512] encoder_outputs = tf.concat( outputs, axis=2) # Concat and return forward + backward outputs with tf.variable_scope('Decoder') as scope: if hp.attention_type == 'loc_sen': # Location Sensitivity Attention attention_mechanism = LocationSensitiveAttention( 128, encoder_outputs, hparams=hp, is_training=is_training, mask_encoder=True, memory_sequence_length=input_lengths, smoothing=False, cumulate_weights=True) elif hp.attention_type == 'gmm': # GMM Attention attention_mechanism = GmmAttention( 128, memory=encoder_outputs, memory_sequence_length=input_lengths) elif hp.attention_type == 'step_bah': attention_mechanism = BahdanauStepwiseMonotonicAttention( 128, encoder_outputs, memory_sequence_length=input_lengths, mode="parallel") elif hp.attention_type == 'mon_bah': attention_mechanism = BahdanauMonotonicAttention( 128, encoder_outputs, memory_sequence_length=input_lengths, normalize=True) elif hp.attention_type == 'loung': attention_mechanism = LuongAttention( 128, encoder_outputs, memory_sequence_length=input_lengths) # attention_mechanism = LocationSensitiveAttention(128, encoder_outputs, hparams=hp, is_training=is_training, mask_encoder=True, memory_sequence_length = input_lengths, smoothing=False, cumulate_weights=True) #mask_encoder: whether to mask encoder padding while computing location sensitive attention. 
Set to True for better prosody but slower convergence. #cumulate_weights: Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True) decoder_lstm = [ ZoneoutLSTMCell(1024, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1, name='decoder_LSTM_{}'.format(i + 1)) for i in range(2) ] decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm, state_is_tuple=True) # decoder_init_state = decoder_lstm.zero_state(batch_size=batch_size, dtype=tf.float32) #tensorflow1에는 없음 attention_cell = AttentionWrapper(decoder_lstm, attention_mechanism, alignment_history=True, output_attention=False) # attention_state_size = 256 # Decoder input -> prenet -> decoder_lstm -> concat[output, attention] dec_outputs = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths) dec_outputs_cell = OutputProjectionWrapper( dec_outputs, (hp.num_mels) * hp.outputs_per_step) if is_training: helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) decoder_init_state = dec_outputs_cell.zero_state( batch_size=batch_size, dtype=tf.float32) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(dec_outputs_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry decoder_mel_outputs = tf.reshape( decoder_outputs[:, :, :hp.num_mels * hp.outputs_per_step], [batch_size, -1, hp.num_mels]) # [N, T_out, M] #stop_token_outputs = tf.reshape(decoder_outputs[:,:,hp.num_mels * hp.outputs_per_step:], [batch_size, -1]) # [N,iters] # Postnet x = decoder_mel_outputs for i in range(5): activation = tf.nn.tanh if i != (4) else None x = tf.layers.conv1d(x, filters=512, kernel_size=5, padding='same', activation=activation, name='Postnet_{}'.format(i)) x = tf.layers.batch_normalization(x, training=is_training) x = tf.layers.dropout(x, rate=0.5, training=is_training, name='Postnet_dropout_{}'.format(i)) residual = tf.layers.dense(x, hp.num_mels, name='residual_projection') mel_outputs = decoder_mel_outputs + residual # Add post-processing CBHG: # mel_outputs: (N,T,num_mels) post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training, hp.postnet_depth) linear_outputs = tf.layers.dense( post_outputs, hp.num_freq) # [N, T_out, F(1025)] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0 ]) # batch_size, text length(encoder), target length(decoder) self.inputs = inputs self.input_lengths = input_lengths self.decoder_mel_outputs = decoder_mel_outputs self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets #self.stop_token_targets = stop_token_targets #self.stop_token_outputs = stop_token_outputs self.all_vars = tf.trainable_variables() log('Initialized Tacotron model. 
Dimensions: ') log(' embedding: %d' % embedded_inputs.shape[-1]) # log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) #log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % dec_outputs_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
def __init__(self,
             name,
             input_reprs,
             roll_direction=0,
             activate=True,
             is_translate=False,
             word_in=None,
             encoder_reprs=encoder.bi_reprs):
    self.name = name
    with tf.variable_scope(name + '/predictions'):
        #decoder_state = tf.layers.dense(input_reprs, config.projection_size, name='encoder_to_decoder')
        decoder_state = input_reprs

        with tf.variable_scope('word_embeddings_vi'):
            word_embedding_matrix = tf.get_variable(
                'word_embedding_matrix_vi', initializer=pretrained_embeddings_vi)
            if is_translate:
                word_embeddings = tf.nn.embedding_lookup(word_embedding_matrix, word_in)
            else:
                word_embeddings = tf.nn.embedding_lookup(word_embedding_matrix, words_tgt_in)
            word_embeddings = tf.nn.dropout(word_embeddings, inputs.keep_prob)
            word_embeddings *= tf.get_variable('emb_scale', initializer=1.0)

        decoder_lstm = model_helpers.lstm_cell(config.bidirectional_sizes[0],
                                               inputs.keep_prob,
                                               config.projection_size)
        decoder_output_layer = tf.layers.Dense(n_classes, name='predict')

        if not is_translate:
            attention_mechanism = LuongAttention(num_units=config.attention_units,
                                                 memory=encoder_reprs,
                                                 memory_sequence_length=size_sr,
                                                 scale=True)
            attention_cell = AttentionWrapper(
                decoder_lstm, attention_mechanism,
                attention_layer_size=config.attention_units)

            batch_size = tf.shape(words_tgt_in)[0]
            decoder_initial_state = attention_cell.zero_state(
                dtype=tf.float32, batch_size=batch_size * config.beam_width)
            decoder_state = decoder_initial_state.clone(cell_state=decoder_state)

            helper = tf.contrib.seq2seq.TrainingHelper(word_embeddings, size_tgt)
            decoder = tf.contrib.seq2seq.BasicDecoder(attention_cell, helper,
                                                      decoder_state,
                                                      decoder_output_layer)
            outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(decoder)  # swap_memory=True)
            self.logits = outputs.rnn_output
        else:
            if config.decode_mode == 'greedy':
                helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    word_embedding_matrix,
                    [embeddings.START, embeddings.START],
                    embeddings.END)
                decoder = tf.contrib.seq2seq.BasicDecoder(decoder_lstm, helper,
                                                          decoder_state,
                                                          decoder_output_layer)
            elif config.decode_mode == 'beam':
                encoder_reprs = tf.contrib.seq2seq.tile_batch(
                    encoder_reprs, multiplier=config.beam_width)
                decoder_state = tf.contrib.seq2seq.tile_batch(
                    decoder_state, multiplier=config.beam_width)
                size_src = tf.contrib.seq2seq.tile_batch(
                    size_sr, multiplier=config.beam_width)

                attention_mechanism = LuongAttention(num_units=config.attention_units,
                                                     memory=encoder_reprs,
                                                     memory_sequence_length=size_src,
                                                     scale=True)
                attention_cell = AttentionWrapper(
                    decoder_lstm, attention_mechanism,
                    attention_layer_size=config.attention_units)

                batch_size = 2
                decoder_initial_state = attention_cell.zero_state(
                    dtype=tf.float32, batch_size=batch_size * config.beam_width)
                decoder_state = decoder_initial_state.clone(cell_state=decoder_state)
                #decoder_state = tf.contrib.seq2seq.tile_batch(
                #    decoder_state, multiplier=config.beam_width)

                decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=attention_cell,
                    embedding=word_embedding_matrix,
                    start_tokens=[embeddings.START, embeddings.START],
                    end_token=embeddings.END,
                    initial_state=decoder_state,
                    beam_width=config.beam_width,
                    output_layer=decoder_output_layer)

            outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder, maximum_iterations=config.max_translate_length)  # swap_memory=True)

            if config.decode_mode == 'greedy':
                self.sample_ids = outputs.sample_id
            elif config.decode_mode == 'beam':
                self.sample_ids = outputs.predicted_ids

        '''
        outputs, state = tf.nn.dynamic_rnn(
            model_helpers.lstm_cell(config.bidirectional_sizes[0], inputs.keep_prob,
                                    config.projection_size),
            word_embeddings,
            initial_state=decoder_state,
            dtype=tf.float32,
            sequence_length=size_tgt,
            scope='predictlstm'
        )
        '''

        self.state = state
        #self.logits = tf.layers.dense(outputs, n_classes, name='predict')
        #self.logits = tf.layers.dense(outputs.rnn_output, n_classes, name='predict')

    if is_translate:
        return

    targets = words_tgt_out
    targets *= (1 - inputs.label_smoothing)
    targets += inputs.label_smoothing / n_classes
    self.loss = model_helpers.masked_ce_loss(self.logits, targets, inputs.mask)
def buildModel(self): T_in = self.args.T_in T_out = self.args.T_out D_in = self.args.D_in D_out = self.args.D_out E = self.args.embedding_dim H = self.args.hidden_dim SOS = self.args.SOS EOS = self.args.EOS PAD = self.args.PAD beam_width = 3 # Input with tf.name_scope('input'): x = tf.placeholder(shape=(None, T_in), dtype=tf.int32, name='encoder_inputs') # N, T_out y = tf.placeholder(shape=(None, T_out), dtype=tf.int32, name='decoder_inputs') # N x_len = tf.placeholder(shape=(None, ), dtype=tf.int32) # N y_len = tf.placeholder(shape=(None, ), dtype=tf.int32) # dynamic sample num batch_size = tf.shape(x)[0] # symbol mask sos = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * SOS eos = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * EOS pad = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * PAD # input mask x_mask = tf.sequence_mask(x_len, T_in, dtype=tf.float32) y_with_sos_mask = tf.sequence_mask(y_len, T_out + 1, dtype=tf.float32) y_with_pad = tf.concat([y, pad], axis=1) eos_mask = tf.one_hot(y_len, depth=T_out + 1, dtype=tf.int32) * EOS # masked inputs y_with_eos = y_with_pad + eos_mask y_with_sos = tf.concat([sos, y], axis=1) ## Embedding with tf.name_scope('embedding'): if self.args.use_pretrained: embedding_pretrained = np.fromfile(self.args.pretrained_file, dtype=np.float32).reshape( (-1, E)) embedding = tf.Variable(embedding_pretrained, trainable=False) else: embedding = tf.get_variable(name='embedding', shape=(D_in, E), dtype=tf.float32, initializer=xavier_initializer()) e_x = tf.nn.embedding_lookup(embedding, x) e_y = tf.nn.embedding_lookup(embedding, y_with_sos) if self.args.mode == 'train': e_x = tf.nn.dropout(e_x, self.args.keep_prob) ## Encoder with tf.name_scope('encoder'): ## Multi-BiLSTM fw_cell = rnn.MultiRNNCell([ rnn.BasicLSTMCell(num_units=H) for i in range(self.args.layer_size) ]) bw_cell = rnn.MultiRNNCell([ rnn.BasicLSTMCell(num_units=H) for i in range(self.args.layer_size) ]) bi_encoder_output, bi_encoder_state = tf.nn.bidirectional_dynamic_rnn( fw_cell, bw_cell, e_x, sequence_length=x_len, dtype=tf.float32, time_major=False, scope=None) encoder_output = bi_encoder_output[0] + bi_encoder_output[1] encoder_final_state = bi_encoder_state[0] ## Decoder with tf.name_scope('decoder'): decoder_cell = rnn.MultiRNNCell([ rnn.BasicLSTMCell(num_units=H) for i in range(self.args.layer_size) ]) decoder_lengths = tf.ones(shape=[batch_size], dtype=tf.int32) * (T_out + 1) ## Trainning decoder with tf.variable_scope('attention'): attention_mechanism = LuongAttention( num_units=H, memory=encoder_output, memory_sequence_length=x_len, name='attention_fn') projection_layer = Dense(units=D_out, kernel_initializer=xavier_initializer()) train_decoder_cell = AttentionWrapper( cell=decoder_cell, attention_mechanism=attention_mechanism, attention_layer_size=H) train_decoder_init_state = train_decoder_cell.zero_state( batch_size=batch_size, dtype=tf.float32).clone(cell_state=encoder_final_state) training_helper = TrainingHelper(e_y, decoder_lengths, time_major=False) train_decoder = BasicDecoder( cell=train_decoder_cell, helper=training_helper, initial_state=train_decoder_init_state, output_layer=projection_layer) train_decoder_outputs, _, _ = dynamic_decode( train_decoder, impute_finished=True, maximum_iterations=T_out + 1) # N, T_out+1, D_out train_decoder_outputs = ln(train_decoder_outputs.rnn_output) ## Beam_search decoder beam_memory = tile_batch(encoder_output, beam_width) beam_memory_state = tile_batch(encoder_final_state, beam_width) beam_memory_length = tile_batch(x_len, beam_width) 
        with tf.variable_scope('attention', reuse=True):
            beam_attention_mechanism = LuongAttention(
                num_units=H,
                memory=beam_memory,
                memory_sequence_length=beam_memory_length,
                name='attention_fn')
        # NOTE: the training wrapper above uses attention_layer_size=H; with
        # None here the beam decoder emits the raw attention context instead
        # of going through the learned attention layer, so the train and beam
        # paths are not strictly identical.
        beam_decoder_cell = AttentionWrapper(
            cell=decoder_cell,
            attention_mechanism=beam_attention_mechanism,
            attention_layer_size=None)
        beam_decoder_init_state = beam_decoder_cell.zero_state(
            batch_size=batch_size * beam_width,
            dtype=tf.float32).clone(cell_state=beam_memory_state)
        start_tokens = tf.ones([batch_size], dtype=tf.int32) * SOS
        beam_decoder = BeamSearchDecoder(
            cell=beam_decoder_cell,
            embedding=embedding,
            start_tokens=start_tokens,
            end_token=EOS,
            initial_state=beam_decoder_init_state,
            beam_width=beam_width,
            output_layer=projection_layer)
        beam_decoder_outputs, _, _ = dynamic_decode(
            beam_decoder,
            scope=tf.get_variable_scope(),
            maximum_iterations=T_out + 1)
        beam_decoder_result_ids = beam_decoder_outputs.predicted_ids

    with tf.name_scope('loss'):
        # softmax over the decoder outputs; these are probabilities rather
        # than raw logits, which matches the default from_logits=False of
        # sparse_categorical_crossentropy below
        logits = tf.nn.softmax(train_decoder_outputs)
        cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
            y_with_eos, logits)
        loss_mask = tf.sequence_mask(y_len + 1, T_out + 1, dtype=tf.float32)
        loss = tf.reduce_sum(cross_entropy * loss_mask) / tf.cast(
            batch_size, dtype=tf.float32)
        prediction = tf.argmax(logits, 2)

    ## train_op
    with tf.name_scope('train'):
        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.args.lr, global_step, self.args.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        ## gradient clipping
        trainable_params = tf.trainable_variables()
        gradients = tf.gradients(loss, trainable_params)
        clip_gradients, _ = tf.clip_by_global_norm(
            gradients, self.args.gradient_clip_num)
        train_op = optimizer.apply_gradients(zip(clip_gradients,
                                                 trainable_params),
                                             global_step=global_step)

    # Summary
    with tf.name_scope('summary'):
        tf.summary.scalar('lr', lr)
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('global_step', global_step)
        summaries = tf.summary.merge_all()

    return (x, y, x_len, y_len, logits, loss, prediction,
            beam_decoder_result_ids, global_step, train_op, summaries)
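# Usage sketch (not part of the original model): a minimal, hedged example of
# how the tensors returned by buildModel() would typically be driven. `model`
# and `next_batch()` are hypothetical stand-ins, not names defined above.
import tensorflow as tf

(x, y, x_len, y_len, logits, loss, prediction,
 beam_ids, global_step, train_op, summaries) = model.buildModel()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter('./logs', sess.graph)
    for _ in range(1000):
        bx, by, bx_len, by_len = next_batch()  # hypothetical batch helper
        _, step, cur_loss, summ = sess.run(
            [train_op, global_step, loss, summaries],
            feed_dict={x: bx, y: by, x_len: bx_len, y_len: by_len})
        writer.add_summary(summ, step)
    # Beam-search decoding only needs the encoder-side feeds.
    best_ids = sess.run(beam_ids, feed_dict={x: bx, x_len: bx_len})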
def build_decoder_cell(self, encoder_outputs, encoder_state):
    """Build the decoder cell."""
    encoder_inputs_length = self.encoder_inputs_length
    batch_size = self.batch_size

    if self.bidirectional:
        encoder_state = encoder_state[-self.depth:]

    if self.time_major:
        encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

    if self.use_beamsearch_decode:
        encoder_outputs = seq2seq.tile_batch(encoder_outputs,
                                             multiplier=self.beam_width)
        encoder_state = seq2seq.tile_batch(encoder_state,
                                           multiplier=self.beam_width)
        encoder_inputs_length = seq2seq.tile_batch(
            self.encoder_inputs_length, multiplier=self.beam_width)
        # With beam search the effective batch fed to the decoder is
        # beam_width times the original batch_size.
        batch_size *= self.beam_width

    if self.attention_type.lower() == 'luong':
        self.attention_mechanism = LuongAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)
    else:
        # BahdanauAttention is constructed with num_units and the encoder
        # outputs; at call time it is given a query and returns the
        # alignment weights.
        self.attention_mechanism = BahdanauAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)

    cell = MultiRNNCell([
        self.build_signle_cell(self.hidden_units,
                               use_residual=self.use_residual)
        for _ in range(self.depth)
    ])

    # Keep the attention alignment history only outside training (inference)
    # and when beam search is not used.
    alignment_history = (self.mode != 'train'
                         and not self.use_beamsearch_decode)

    def cell_input_fn(inputs, attention):
        """Optionally project the concatenated [inputs, attention] back to
        hidden_units before feeding the cell (input feeding).
        Reference: https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/AttentionWrapper
        """
        if not self.use_residual:
            return array_ops.concat([inputs, attention], -1)
        attn_projection = layers.Dense(self.hidden_units,
                                       dtype=tf.float32,
                                       use_bias=False,
                                       name='attention_cell_input_fn')
        return attn_projection(array_ops.concat([inputs, attention], -1))

    attention_cell = AttentionWrapper(
        cell=cell,
        attention_mechanism=self.attention_mechanism,
        attention_layer_size=self.hidden_units,
        alignment_history=alignment_history,
        cell_input_fn=cell_input_fn,
        name='AttentionWrapper')

    # Start from the wrapper's zero state, then use the encoder's final
    # hidden state as the initial cell state of the decoder.
    decoder_initial_state = attention_cell.zero_state(batch_size, tf.float32)
    decoder_initial_state = decoder_initial_state.clone(
        cell_state=encoder_state)

    return attention_cell, decoder_initial_state
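# The zero_state(...).clone(cell_state=...) pattern above recurs in the other
# decoder builders in this file. Below is a minimal, self-contained sketch of
# that pattern; all names and sizes here are illustrative assumptions, not
# taken from the class above.
import tensorflow as tf
from tensorflow.contrib.rnn import LSTMCell
from tensorflow.contrib.seq2seq import LuongAttention, AttentionWrapper

batch_size, max_time, input_dim, hidden_units = 32, 20, 50, 64  # illustrative

# Assumed encoder: a single LSTM over already-embedded inputs.
encoder_inputs = tf.placeholder(tf.float32, [batch_size, max_time, input_dim])
encoder_lengths = tf.placeholder(tf.int32, [batch_size])
encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
    LSTMCell(hidden_units), encoder_inputs,
    sequence_length=encoder_lengths, dtype=tf.float32)

attention_mechanism = LuongAttention(num_units=hidden_units,
                                     memory=encoder_outputs,
                                     memory_sequence_length=encoder_lengths)
decoder_cell = AttentionWrapper(LSTMCell(hidden_units),
                                attention_mechanism,
                                attention_layer_size=hidden_units)

# zero_state builds a well-formed AttentionWrapperState (attention vector,
# time step, alignments, ...); clone() then swaps the encoder's final state in
# as the wrapped cell's state while keeping the attention bookkeeping intact.
decoder_initial_state = decoder_cell.zero_state(batch_size, tf.float32).clone(
    cell_state=encoder_state)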
def build_graph(self, values, values_mask):
    with vs.variable_scope(self.name):
        lens = tf.reduce_sum(values_mask, axis=1)
        attention_mechanism = LuongAttention(self.attention_dim, values, lens)
        encoder = RNNEncoder(self.attention_dim, self.keep_prob,
                             attention_mechanism)
        return encoder.build_graph(values, values_mask)
def build_decoder_cell(self):
    """Build the decoder cell."""
    encoder_outputs = self.encoder_outputs
    encoder_last_state = self.encoder_last_state
    encoder_inputs_length = self.encoder_inputs_length

    if self.time_major:
        encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

    # When using BeamSearchDecoder, encoder_outputs, encoder_last_state and
    # encoder_inputs_length need to be tiled so that:
    # [batch_size, .., ..] -> [batch_size x beam_width, .., ..]
    if self.use_beamsearch_decode:
        encoder_outputs = seq2seq.tile_batch(encoder_outputs,
                                             multiplier=self.beam_width)
        encoder_last_state = nest.map_structure(
            lambda s: seq2seq.tile_batch(s, self.beam_width),
            self.encoder_last_state)
        encoder_inputs_length = seq2seq.tile_batch(
            self.encoder_inputs_length, multiplier=self.beam_width)

    # Number of decoder hidden units; doubled when the encoder is
    # bidirectional.
    num_units = self.hidden_units
    if self.bidirectional:
        num_units *= 2

    # Two different attention mechanisms
    if self.attention_type.lower() == 'luong':
        # 'Luong' style attention: https://arxiv.org/abs/1508.04025
        self.attention_mechanism = LuongAttention(
            num_units=num_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)
    else:
        # Default 'Bahdanau' style attention: https://arxiv.org/abs/1409.0473
        self.attention_mechanism = BahdanauAttention(
            num_units=num_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)

    # Building decoder_cell
    self.decoder_cell_list = [
        self.build_single_cell(num_units, use_residual=self.use_residual)
        for _ in range(self.depth)
    ]

    def attn_decoder_input_fn(inputs, attention):
        """Decide, based on attn_input_feeding, whether to project the
        concatenated [inputs, attention] before feeding the cell."""
        if not self.attn_input_feeding:
            return inputs

        # Essential when use_residual=True
        hidden_units = self.hidden_units
        if self.bidirectional:
            hidden_units *= 2
        attn_projection = layers.Dense(hidden_units,
                                       dtype=tf.float32,
                                       # use_bias=False,
                                       name='attn_input_feeding')
        return attn_projection(array_ops.concat([inputs, attention], -1))

    # AttentionWrapper wraps an RNNCell with the attention_mechanism.
    # Note: attention is applied only on the top decoder layer.
    self.decoder_cell_list[-1] = AttentionWrapper(
        cell=self.decoder_cell_list[-1],
        attention_mechanism=self.attention_mechanism,
        # attention_layer_size=self.hidden_units,
        attention_layer_size=int(num_units / 2),
        cell_input_fn=attn_decoder_input_fn,
        initial_cell_state=encoder_last_state[-1],
        alignment_history=self.alignment_history,
        name='Attention_Wrapper')

    # To be compatible with AttentionWrapper, the encoder last state of the
    # top layer has to be converted into the AttentionWrapperState form,
    # which is easily done by calling AttentionWrapper.zero_state. If beam
    # search decoding is used, the batch_size argument to .zero_state must
    # be beam_width times the original batch_size.
    batch_size = self.batch_size if not self.use_beamsearch_decode \
        else self.batch_size * self.beam_width
    initial_state = [state for state in encoder_last_state]
    initial_state[-1] = self.decoder_cell_list[-1].zero_state(
        batch_size=batch_size, dtype=tf.float32)
    decoder_initial_state = tuple(initial_state)

    return MultiRNNCell(self.decoder_cell_list), decoder_initial_state
def build_decoder_cell(self, encoder_outputs, encoder_state):
    """Build the decoder cell.

    :param encoder_outputs:
    :param encoder_state:
    :return:
    """
    encoder_input_length = self.encoder_inputs_length
    batch_size = self.batch_size

    if self.bidirectional:
        encoder_state = encoder_state[-self.depth:]

    if self.time_major:
        encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

    if self.use_beamsearch_decode:
        # tile_batch replicates the tensors beam_width times, so the batch
        # effectively becomes beam_width times its original size.
        encoder_outputs = seq2seq.tile_batch(encoder_outputs,
                                             multiplier=self.beam_width)
        encoder_state = seq2seq.tile_batch(encoder_state,
                                           multiplier=self.beam_width)
        encoder_input_length = seq2seq.tile_batch(
            self.encoder_inputs_length, multiplier=self.beam_width)
        # With beam search the decoder batch is beam_width * batch_size.
        batch_size *= self.beam_width

    if self.attention_type.lower() == 'luong':
        self.attention_mechanism = LuongAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_input_length)
    else:
        # Note: memory is the full sequence of encoder outputs (hidden
        # states over time), not the final encoder_state.
        self.attention_mechanism = BahdanauAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_input_length)

    cell = MultiRNNCell([
        self.build_single_cell(self.hidden_units,
                               use_residual=self.use_residual)
        for _ in range(self.depth)
    ])

    alignment_history = (self.mode != 'train'
                         and not self.use_beamsearch_decode)

    def cell_input_fn(inputs, attention):
        """Optionally project the concatenated [inputs, attention] back to
        hidden_units before it is fed to the cell.

        :param inputs:
        :param attention:
        :return:
        """
        if not self.use_residual:
            return array_ops.concat([inputs, attention], -1)
        # Dense defines __call__, so the layer object can be applied
        # directly to the concatenated tensor.
        attn_projection = layers.Dense(self.hidden_units,
                                       dtype=tf.float32,
                                       use_bias=False,
                                       name='attention_cell_input_fn')
        return attn_projection(array_ops.concat([inputs, attention], -1))

    # AttentionWrapper wraps the stacked cell with the attention mechanism;
    # cell_input_fn controls how attention is combined with the input and
    # alignment_history stores the attention weights over time.
    cell = AttentionWrapper(cell=cell,
                            attention_mechanism=self.attention_mechanism,
                            attention_layer_size=self.hidden_units,
                            alignment_history=alignment_history,
                            cell_input_fn=cell_input_fn,
                            name='Attention_Wrapper')

    # Initialise decoder_initial_state from the wrapper's zero state, then
    # pass the encoder's final state in as the initial cell state.
    decoder_initial_state = cell.zero_state(batch_size, tf.float32)
    decoder_initial_state = decoder_initial_state.clone(
        cell_state=encoder_state)

    return cell, decoder_initial_state
def build_decoder_cell(self, encoder_outputs, encoder_state):
    """Build the decoder cell.

    :param encoder_outputs: encoder outputs
    :param encoder_state: encoder final state
    :return: cell: attention-wrapped RNN decoder cell,
             decoder_initial_state: initial decoder hidden state h0
    """
    encoder_inputs_length = self.encoder_inputs_length
    batch_size = self.batch_size

    if self.bidirectional:
        encoder_state = encoder_state[-self.depth:]

    if self.time_major:
        encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

    # BeamSearchDecoder
    if self.use_beamsearch_decode:
        encoder_outputs = seq2seq.tile_batch(encoder_outputs,
                                             multiplier=self.beam_width)
        encoder_state = seq2seq.tile_batch(encoder_state,
                                           multiplier=self.beam_width)
        encoder_inputs_length = seq2seq.tile_batch(
            self.encoder_inputs_length, multiplier=self.beam_width)

    if self.attention_type.lower() == 'luong':
        # 'Luong' style attention: https://arxiv.org/abs/1508.04025
        self.attention_mechanism = LuongAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)
    else:
        # Default 'Bahdanau' style attention: https://arxiv.org/abs/1409.0473
        self.attention_mechanism = BahdanauAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)

    # Building decoder_cell
    if self.bidirectional:
        cell = MultiRNNCell([
            self.build_single_cell(self.hidden_units * 2,
                                   use_residual=self.use_residual)
            for _ in range(self.depth)
        ])
    else:
        cell = MultiRNNCell([
            self.build_single_cell(self.hidden_units,
                                   use_residual=self.use_residual)
            for _ in range(self.depth)
        ])

    # Keep the attention alignment history only in inference mode and when
    # beam search is not used.
    alignment_history = (self.mode != 'train'
                         and not self.use_beamsearch_decode)

    def cell_input_fn(inputs, attention):
        """Optionally project the concatenated [inputs, attention] before
        the attention computation."""
        if not self.use_residual:
            return array_ops.concat([inputs, attention], -1)
        mul = 2 if self.bidirectional else 1
        attn_projection = layers.Dense(self.hidden_units * mul,
                                       dtype=tf.float32,
                                       use_bias=False,
                                       name='attention_cell_input_fn')
        return attn_projection(array_ops.concat([inputs, attention], -1))

    # Note: when bidirectional is True the wrapped cells output
    # hidden_units * 2 features while the attention mechanism above is built
    # with num_units=hidden_units; Luong attention expects the query (the
    # cell output) depth to equal num_units.
    cell = AttentionWrapper(cell,
                            self.attention_mechanism,
                            attention_layer_size=self.hidden_units,
                            alignment_history=alignment_history,
                            cell_input_fn=cell_input_fn,
                            name='Attention_Wrapper')

    if self.use_beamsearch_decode:
        # With beam search the zero state must be built for
        # beam_width * batch_size entries.
        decoder_initial_state = cell.zero_state(
            batch_size=batch_size * self.beam_width, dtype=tf.float32)
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=encoder_state)
    else:
        # Start from the zero state, then pass in the encoder state.
        decoder_initial_state = cell.zero_state(batch_size, tf.float32)
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=encoder_state)

    return cell, decoder_initial_state
def build_decoder_cell(self, encoder_outputs, encoder_states):
    """Build the decoder cell; return the decoder cell and its initial state.

    :param encoder_outputs:
    :param encoder_states:
    :return:
    """
    encoder_input_length = self.encoder_inputs_length
    batch_size = self.batch_size

    if self.bidirectional:
        encoder_states = encoder_states[-self.depth:]

    if self.time_major:
        encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

    assert encoder_input_length is not None, 'encoder_inputs_length must not be None'
    assert isinstance(batch_size, int), 'batch_size must be an int'
    assert encoder_outputs is not None, 'encoder_outputs must not be None'
    assert encoder_states is not None, 'encoder_states must not be None'

    ######################### beam search case #########################
    if self.use_beamsearch_decode:
        # tile_batch replicates the tensors beam_width times, so the batch
        # effectively becomes beam_width times its original size.
        encoder_outputs = seq2seq.tile_batch(encoder_outputs,
                                             multiplier=self.beam_width)
        encoder_states = seq2seq.tile_batch(encoder_states,
                                            multiplier=self.beam_width)
        encoder_input_length = seq2seq.tile_batch(
            self.encoder_inputs_length, multiplier=self.beam_width)
        # With beam search the decoder batch is beam_width * batch_size.
        batch_size *= self.beam_width
    ######################### beam search case #########################

    ######################### attention mechanism ######################
    if self.attention_type.lower() == 'luong':
        self.attention_mechanism = LuongAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_input_length)
    else:
        # For a bidirectional LSTM encoder, memory is the sequence of
        # hidden states over time, not the final encoder state.
        self.attention_mechanism = BahdanauAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_input_length)
    ######################### attention mechanism ######################

    # Multi-layer decoder cell.
    cell = MultiRNNCell([
        self.build_single_cell(self.hidden_units,
                               use_residual=self.use_residual)
        for _ in range(self.depth)
    ])

    # alignment_history is enabled only outside training and when beam
    # search is not used.
    alignment_history = (self.mode != 'train'
                         and not self.use_beamsearch_decode)

    def cell_input_fn(inputs, attention):
        """Optionally project the concatenated [inputs, attention] before
        the attention computation (only relevant when attention is used).

        :param inputs:
        :param attention:
        :return:
        """
        if not self.use_residual:
            concat = array_ops.concat([inputs, attention], -1)
            print(inputs.get_shape(), 'inputs shape')
            print(attention.get_shape(), 'attention shape')
            print(concat, 'inputs concatenated with attention')
            return concat
        # Dense implements __call__ (via Layer), so the layer object can be
        # applied directly to the concatenated tensor.
        attn_projection = layers.Dense(self.hidden_units,
                                       dtype=tf.float32,
                                       use_bias=False,
                                       name='attention_cell_input_fn')
        return attn_projection(array_ops.concat([inputs, attention], -1))

    # AttentionWrapper wraps the stacked cell with the attention mechanism;
    # cell_input_fn controls how attention is combined with the input and
    # alignment_history stores the attention weights over time.
    cell = AttentionWrapper(cell=cell,
                            attention_mechanism=self.attention_mechanism,
                            attention_layer_size=self.hidden_units,
                            alignment_history=alignment_history,
                            cell_input_fn=cell_input_fn,
                            name='Attention_Wrapper')

    # Initialise decoder_initial_state from the wrapper's zero state, then
    # pass the encoder's final state in as the initial cell state.
    decoder_initial_state = cell.zero_state(batch_size, tf.float32)
    decoder_initial_state = decoder_initial_state.clone(
        cell_state=encoder_states)

    return cell, decoder_initial_state
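# Usage sketch (not from the snippets above): all build_decoder_cell variants
# return the same (cell, decoder_initial_state) pair. This hedged example
# shows how such a pair is typically consumed for teacher-forced decoding.
# `model`, `encoder_outputs`, `encoder_state`, and the placeholders below are
# assumptions for illustration only.
import tensorflow as tf
from tensorflow.contrib.seq2seq import TrainingHelper, BasicDecoder, dynamic_decode
from tensorflow.python.layers.core import Dense

decoder_embedded_inputs = tf.placeholder(tf.float32, [None, None, 128])
decoder_lengths = tf.placeholder(tf.int32, [None])
vocab_size = 10000  # illustrative

# Any of the builders above could stand behind this call.
cell, decoder_initial_state = model.build_decoder_cell(encoder_outputs,
                                                       encoder_state)

helper = TrainingHelper(inputs=decoder_embedded_inputs,
                        sequence_length=decoder_lengths,
                        time_major=False)
decoder = BasicDecoder(cell=cell,
                       helper=helper,
                       initial_state=decoder_initial_state,
                       output_layer=Dense(vocab_size))
outputs, final_state, final_lengths = dynamic_decode(decoder,
                                                     impute_finished=True)
logits = outputs.rnn_output  # [batch, time, vocab_size]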