def r_net(self):
    hps = self._hps
    size = hps.size
    q_rep = self.question_inputs
    c_rep = self.context_inputs
    with tf.variable_scope('embedding_encoder_layer'):
        with tf.variable_scope('stacked_embedding_encoder_block'):
            # Question encoding.
            q_rep = encoder_block_v1(q_rep, self.batch_size,
                                     self.max_q_length, hps.dropout_rate,
                                     4, 7, 3, size, self.dropout)
        tf.get_variable_scope().reuse_variables()
        with tf.variable_scope('stacked_embedding_encoder_block'):
            # Context encoding (shares weights with the question encoder).
            c_rep = encoder_block_v1(c_rep, self.batch_size,
                                     self.max_c_length, hps.dropout_rate,
                                     4, 7, 3, size, self.dropout)
    with tf.variable_scope('context_question_attention_layer'):
        with tf.variable_scope('question_aware_context'):
            with tf.variable_scope('context'):
                context_c = multihead_attention(q_rep, c_rep)
            with tf.variable_scope('question_semantic_fusion'):
                q_rep = tf.concat([q_rep, context_c, q_rep * context_c], axis=-1)
                q_rep = encoder_block_v1(q_rep, self.batch_size,
                                         self.max_q_length, hps.dropout_rate,
                                         2, 7, 3, size, self.dropout)
        with tf.variable_scope('context_aware_question'):
            with tf.variable_scope('context'):
                context_q = multihead_attention(c_rep, q_rep)
            with tf.variable_scope('context_semantic_fusion'):
                c_rep = tf.concat([c_rep, context_q, c_rep * context_q], axis=-1)
                for i in xrange(hps.num_stacks):
                    with tf.variable_scope('stack_%d' % i):
                        c_rep = encoder_block_v1(c_rep, self.batch_size,
                                                 self.max_c_length,
                                                 hps.dropout_rate,
                                                 2, 7, 3, size, self.dropout)
                    # with tf.variable_scope('residual_drop_%d' % i):
                    #     death_rate = self.set_death_rate(i, hps.num_stacks, hps.last_rate)
                    #     rand = tf.random_uniform([], minval=0.0, maxval=1.0)
                    #     gate = tf.Variable(rand > death_rate, trainable=False)
                    #     c_rep = tf.cond(self.dropout,
                    #                     lambda: residual_drop_train(c_rep, c_rep_new, gate),
                    #                     lambda: residual_drop_test(c_rep, c_rep_new, 1.0 - death_rate))
    with tf.variable_scope('memory_based_answer_pointer'):
        with tf.variable_scope('init_state'):
            z_s = tf.reduce_mean(q_rep, axis=1, keep_dims=True)
            z_s = tf.cond(self.dropout,
                          lambda: tf.nn.dropout(z_s, keep_prob=1.0 - hps.dropout_rate),
                          lambda: z_s)
        for i in xrange(hps.T):
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            with tf.variable_scope('start_position'):
                start_pos_scores, u_s = fn(c_rep, z_s)
            with tf.variable_scope('start_pos_memory_semantic_fusion_unit'):
                z_e = sfu(z_s, u_s)
                z_e = tf.cond(self.dropout,
                              lambda: tf.nn.dropout(z_e, keep_prob=1.0 - hps.dropout_rate),
                              lambda: z_e)
            with tf.variable_scope('end_position'):
                end_pos_scores, u_e = fn(c_rep, z_e)
            with tf.variable_scope('end_pos_memory_semantic_fusion_unit'):
                z_s = sfu(z_e, u_e)
                z_s = tf.cond(self.dropout,
                              lambda: tf.nn.dropout(z_s, keep_prob=1.0 - hps.dropout_rate),
                              lambda: z_s)
    self.pos_scores = [start_pos_scores, end_pos_scores]
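# `sfu` (semantic fusion unit) is called above but not defined in this file.
# A minimal sketch, assuming the formulation from the Reinforced Mnemonic
# Reader (compose o = tanh(W[r; f]), gate it against the original input r).
# The name `sfu_sketch` and its internals are illustrative, not the project's
# actual implementation:
def sfu_sketch(r, f):
    """Fuse vector(s) f into input r: g * o + (1 - g) * r."""
    d = r.get_shape()[-1].value
    x = tf.concat([r, f], axis=-1)
    o = tf.layers.dense(x, units=d, activation=tf.tanh, name='sfu_compose')
    g = tf.layers.dense(x, units=d, activation=tf.sigmoid, name='sfu_gate')
    return g * o + (1.0 - g) * r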
def encoder_impl(self, encoder_input, is_training):
    attention_dropout_rate = self._config.attention_dropout_rate if is_training else 0.0
    residual_dropout_rate = self._config.residual_dropout_rate if is_training else 0.0
    # Mask: positions whose input vectors are all-zero are padding.
    encoder_padding = tf.equal(
        tf.reduce_sum(tf.abs(encoder_input), axis=-1), 0.0)
    encoder_output = dense(encoder_input,
                           self._config.hidden_units,
                           activation=tf.identity,
                           use_bias=True,
                           name="src_change")
    encoder_output = tf.contrib.layers.layer_norm(encoder_output,
                                                  center=True,
                                                  scale=True,
                                                  trainable=True)
    # Add positional signal
    encoder_output = common_attention.add_timing_signal_1d(encoder_output)
    # Dropout
    encoder_output = tf.layers.dropout(encoder_output,
                                       rate=residual_dropout_rate,
                                       training=is_training)
    # Blocks
    for i in range(self._config.num_blocks_enc):
        with tf.variable_scope("block_{}".format(i)):
            # Multihead Attention
            encoder_output = residual(
                encoder_output,
                multihead_attention(
                    query_antecedent=encoder_output,
                    memory_antecedent=None,
                    bias=common_attention.attention_bias_ignore_padding(
                        encoder_padding),
                    total_key_depth=self._config.hidden_units,
                    total_value_depth=self._config.hidden_units,
                    output_depth=self._config.hidden_units,
                    num_heads=self._config.num_heads,
                    dropout_rate=attention_dropout_rate,
                    name='encoder_self_attention',
                    summaries=True),
                dropout_rate=residual_dropout_rate)
            # Feed Forward
            encoder_output = residual(
                encoder_output,
                ff_hidden(inputs=encoder_output,
                          hidden_size=4 * self._config.hidden_units,
                          output_size=self._config.hidden_units,
                          activation=self._ff_activation),
                dropout_rate=residual_dropout_rate)
    # Mask padded positions to zeros.
    encoder_output *= tf.expand_dims(1.0 - tf.to_float(encoder_padding),
                                     axis=-1)
    return encoder_output
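# `residual` above is assumed to be the usual post-norm Transformer wrapper:
# dropout on the sublayer output, add the shortcut, then layer-normalize.
# A minimal sketch under that assumption (not necessarily this project's
# exact helper):
def residual_sketch(x, y, dropout_rate):
    """Residual connection: layer_norm(x + dropout(y))."""
    y = tf.nn.dropout(y, keep_prob=1.0 - dropout_rate)
    return tf.contrib.layers.layer_norm(x + y)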
def transformer_encoder(encoder_input, hps):
    # `hps` is now an explicit argument; it was a free variable before.
    x = encoder_input
    with tf.variable_scope('encoder'):
        with tf.variable_scope('self_attention'):
            y = multihead_attention(x, None)
            if hps.mode == 'train':
                y = tf.nn.dropout(y, 1.0 - hps.dropout_rate)
        with tf.variable_scope('shortcut_norm_1'):
            # Residual connection followed by layer normalization, matching
            # the scope name (the norm was missing in the original).
            x = tf.contrib.layers.layer_norm(x + y)
    return x
def encoder_impl(self, encoder_input, is_training):
    attention_dropout_rate = self._config.attention_dropout_rate if is_training else 0.0
    residual_dropout_rate = self._config.residual_dropout_rate if is_training else 0.0
    # Mask: token id 0 marks padding.
    encoder_padding = tf.equal(encoder_input, 0)
    # Embedding
    encoder_output = embedding(encoder_input,
                               vocab_size=self._config.src_vocab_size,
                               dense_size=self._config.hidden_units,
                               multiplier=self._config.hidden_units**0.5
                               if self._config.scale_embedding else 1.0,
                               name="src_embedding")
    # Add positional signal
    encoder_output = common_attention.add_timing_signal_1d(encoder_output)
    # Dropout (use the is_training-gated local rate, as the other blocks do;
    # the original read the raw config value here)
    encoder_output = tf.layers.dropout(encoder_output,
                                       rate=residual_dropout_rate,
                                       training=is_training)
    # Blocks
    for i in range(self._config.num_blocks):
        with tf.variable_scope("block_{}".format(i)):
            # Multihead Attention
            encoder_output = residual(
                encoder_output,
                multihead_attention(
                    query_antecedent=encoder_output,
                    memory_antecedent=None,
                    bias=common_attention.attention_bias_ignore_padding(
                        encoder_padding),
                    total_key_depth=self._config.hidden_units,
                    total_value_depth=self._config.hidden_units,
                    output_depth=self._config.hidden_units,
                    num_heads=self._config.num_heads,
                    dropout_rate=attention_dropout_rate,
                    name='encoder_self_attention',
                    summaries=True),
                dropout_rate=residual_dropout_rate)
            # Feed Forward
            encoder_output = residual(
                encoder_output,
                common_layers.conv_hidden_relu(
                    inputs=encoder_output,
                    hidden_size=4 * self._config.hidden_units,
                    output_size=self._config.hidden_units,
                    summaries=True),
                dropout_rate=residual_dropout_rate)
    # Mask padded positions to zeros.
    encoder_output *= tf.expand_dims(1.0 - tf.to_float(encoder_padding),
                                     axis=-1)
    return encoder_output
def __call__(self, inputs, mask):
    '''
    Args:
        inputs: sequence embeddings (item_embeddings + pos_embeddings),
            shape (batch_size, max_len, embedding_size)
        mask: padding mask, shape (batch_size, max_len, 1)
    Returns:
        Output sequences with the same shape as inputs.
    '''
    if self.pos_fixed:
        # Use the fixed sin/cos positional encoding.
        position_encoding = self.get_position_encoding(inputs)  # (batch_size, max_len, num_units)
        inputs += position_encoding
    inputs *= mask
    for i in range(self.num_blocks):
        with tf.variable_scope("num_blocks_%d" % i):
            # Self-attention (pre-norm on the queries)
            inputs = multihead_attention(
                queries=layer_normalization(inputs),
                keys=inputs,
                num_units=self.num_units,
                num_heads=self.num_heads,
                dropout_keep_prob=self.dropout_keep_prob,
                causality=False,
                scope="self_attention")
            # Feed forward
            inputs = feedforward(
                layer_normalization(inputs),
                num_units=[self.num_units, self.num_units],
                dropout_keep_prob=self.dropout_keep_prob)
            inputs *= mask
    outputs = layer_normalization(inputs)  # (batch_size, max_len, num_units)
    return outputs
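# `get_position_encoding` above is assumed to build the fixed sin/cos table
# from "Attention Is All You Need". A hedged sketch, with shapes inferred
# from the docstring (assumes an even channel count; the real method may
# differ in detail):
import math

def get_position_encoding_sketch(inputs):
    """Returns a (1, max_len, num_units) sinusoidal table, broadcast over batch."""
    length = tf.shape(inputs)[1]
    channels = inputs.get_shape()[-1].value          # static channel count
    num_timescales = channels // 2
    position = tf.to_float(tf.range(length))         # (len,)
    log_increment = math.log(10000.0) / max(num_timescales - 1, 1)
    inv_timescales = tf.exp(
        tf.to_float(tf.range(num_timescales)) * -log_increment)      # (ch/2,)
    scaled = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
    signal = tf.concat([tf.sin(scaled), tf.cos(scaled)], axis=1)     # (len, ch)
    return tf.expand_dims(signal, 0)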
def decoder_with_caching_impl(self, decoder_input, decoder_cache,
                              encoder_output, is_training):
    # decoder_input: [batch_size * beam_size, step]; step grows by one per
    #     decode iteration (1, 2, 3, ...).
    # decoder_cache: [batch_size * beam_size, 0, num_blocks, hidden_units]
    # encoder_output: [batch_size * beam_size, time_step, hidden_units]
    attention_dropout_rate = self._config.attention_dropout_rate if is_training else 0.0
    residual_dropout_rate = self._config.residual_dropout_rate if is_training else 0.0
    encoder_padding = tf.equal(
        tf.reduce_sum(tf.abs(encoder_output), axis=-1), 0.0)
    encoder_attention_bias = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    decoder_output = embedding(decoder_input,
                               vocab_size=self._config.dst_vocab_size,
                               dense_size=self._config.hidden_units,
                               multiplier=self._config.hidden_units**0.5
                               if self._config.scale_embedding else 1.0,
                               name="dst_embedding")
    # Positional Encoding
    decoder_output += common_attention.add_timing_signal_1d(decoder_output)
    # Dropout
    decoder_output = tf.layers.dropout(decoder_output,
                                       rate=residual_dropout_rate,
                                       training=is_training)
    new_cache = []
    # Blocks
    for i in range(self._config.num_blocks):
        with tf.variable_scope("block_{}".format(i)):
            # Multihead Attention (self-attention); only the last position is
            # computed, earlier positions come from the cache.
            decoder_output = residual(
                decoder_output[:, -1:, :],
                multihead_attention(
                    query_antecedent=decoder_output,
                    memory_antecedent=None,
                    bias=None,
                    total_key_depth=self._config.hidden_units,
                    total_value_depth=self._config.hidden_units,
                    num_heads=self._config.num_heads,
                    dropout_rate=attention_dropout_rate,
                    reserve_last=True,
                    output_depth=self._config.hidden_units,
                    name="decoder_self_attention",
                    summaries=True),
                dropout_rate=residual_dropout_rate)
            # Multihead Attention (vanilla encoder-decoder attention)
            decoder_output = residual(
                decoder_output,
                multihead_attention(
                    query_antecedent=decoder_output,
                    memory_antecedent=encoder_output,
                    bias=encoder_attention_bias,
                    total_key_depth=self._config.hidden_units,
                    total_value_depth=self._config.hidden_units,
                    output_depth=self._config.hidden_units,
                    num_heads=self._config.num_heads,
                    dropout_rate=attention_dropout_rate,
                    reserve_last=True,
                    name="decoder_vanilla_attention",
                    summaries=True),
                dropout_rate=residual_dropout_rate)
            # Feed Forward
            decoder_output = residual(
                decoder_output,
                ff_hidden(decoder_output,
                          hidden_size=4 * self._config.hidden_units,
                          output_size=self._config.hidden_units,
                          activation=self._ff_activation),
                dropout_rate=residual_dropout_rate)
            # Prepend this block's cached states, then store the extended
            # sequence back into the new cache.
            decoder_output = tf.concat(
                [decoder_cache[:, :, i, :], decoder_output], axis=1)
            new_cache.append(decoder_output[:, :, None, :])
    new_cache = tf.concat(
        new_cache,
        axis=2)  # [batch_size * beam_size, step, num_blocks, hidden_units]
    return decoder_output, new_cache
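# Hedged usage sketch for the caching decoder above: the cache starts empty
# along the time axis and grows by one position per step. `batch_size`,
# `beam_size`, `config`, `bos_id`, `model`, `output_projection`,
# `encoder_output`, and `max_decode_steps` are all assumed names standing in
# for the real beam-search driver, which is not shown in this file:
cache = tf.zeros([batch_size * beam_size, 0,
                  config.num_blocks, config.hidden_units])
ids = tf.fill([batch_size * beam_size, 1], bos_id)   # hypothetical BOS id
for _ in range(max_decode_steps):
    last_states, cache = model.decoder_with_caching_impl(
        ids, cache, encoder_output, is_training=False)
    logits = output_projection(last_states[:, -1, :])  # assumed projection
    next_ids = tf.argmax(logits, axis=-1, output_type=tf.int32)
    ids = tf.concat([ids, next_ids[:, None]], axis=1)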
def decoder_impl(self, decoder_input, encoder_output, is_training):
    # decoder_input: [batch_size, step]
    # encoder_output: [batch_size, time_step, hidden_units]
    attention_dropout_rate = self._config.attention_dropout_rate if is_training else 0.0
    residual_dropout_rate = self._config.residual_dropout_rate if is_training else 0.0
    encoder_padding = tf.equal(
        tf.reduce_sum(tf.abs(encoder_output), axis=-1), 0.0)
    encoder_attention_bias = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    decoder_output = embedding(decoder_input,
                               vocab_size=self._config.dst_vocab_size,
                               dense_size=self._config.hidden_units,
                               multiplier=self._config.hidden_units**0.5
                               if self._config.scale_embedding else 1.0,
                               name="dst_embedding")
    # Positional Encoding
    decoder_output += common_attention.add_timing_signal_1d(decoder_output)
    # Dropout
    decoder_output = tf.layers.dropout(decoder_output,
                                       rate=residual_dropout_rate,
                                       training=is_training)
    # Bias that prevents attending to future positions.
    self_attention_bias = common_attention.attention_bias_lower_triangle(
        tf.shape(decoder_input)[1])
    # Blocks
    for i in range(self._config.num_blocks_dec):
        with tf.variable_scope("block_{}".format(i)):
            # Multihead Attention (self-attention)
            decoder_output = residual(
                decoder_output,
                multihead_attention(
                    query_antecedent=decoder_output,
                    memory_antecedent=None,
                    bias=self_attention_bias,
                    total_key_depth=self._config.hidden_units,
                    total_value_depth=self._config.hidden_units,
                    num_heads=self._config.num_heads,
                    dropout_rate=attention_dropout_rate,
                    output_depth=self._config.hidden_units,
                    name="decoder_self_attention",
                    summaries=True),
                dropout_rate=residual_dropout_rate)
            # Multihead Attention (vanilla encoder-decoder attention)
            decoder_output = residual(
                decoder_output,
                multihead_attention(
                    query_antecedent=decoder_output,
                    memory_antecedent=encoder_output,
                    bias=encoder_attention_bias,
                    total_key_depth=self._config.hidden_units,
                    total_value_depth=self._config.hidden_units,
                    output_depth=self._config.hidden_units,
                    num_heads=self._config.num_heads,
                    dropout_rate=attention_dropout_rate,
                    name="decoder_vanilla_attention",
                    summaries=True),
                dropout_rate=residual_dropout_rate)
            # Feed Forward
            decoder_output = residual(
                decoder_output,
                ff_hidden(decoder_output,
                          hidden_size=4 * self._config.hidden_units,
                          output_size=self._config.hidden_units,
                          activation=self._ff_activation),
                dropout_rate=residual_dropout_rate)
    return decoder_output
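# `common_attention.attention_bias_lower_triangle` (tensor2tensor) produces a
# [1, 1, length, length] additive bias with a large negative value above the
# diagonal, so softmax assigns ~0 weight to future positions. An equivalent
# hand-rolled construction, for reference:
def lower_triangle_bias_sketch(length):
    band = tf.matrix_band_part(tf.ones([length, length]), -1, 0)  # lower triangle
    bias = -1e9 * (1.0 - band)                                    # block the rest
    return tf.reshape(bias, [1, 1, length, length])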
            # (fragment: the enclosing `if` branch and the positional-embedding
            # call that these keyword arguments belong to are truncated above)
                num_units=hidden_units,
                zero_pad=False,
                scale=False,
                scope="enc_pe")
            embeds *= key_masks
            # Dropout
            embeds = tf.nn.dropout(embeds, keep_prob=dropout_keep_prob)
            enc = embeds
            # Blocks
            for i in range(conf.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    # Multihead Attention
                    enc = utils.multihead_attention(
                        queries=enc,
                        keys=embeds,
                        num_units=hidden_units,
                        num_heads=10,
                        # `dropout_rate` expects a drop probability; convert
                        # from the keep probability used elsewhere (the
                        # original passed the keep probability directly).
                        dropout_rate=1.0 - dropout_keep_prob,
                        causality=False)
                    # Feed Forward
                    enc = utils.feedforward(
                        enc, num_units=[4 * hidden_units, hidden_units])
            text_embeddings = tf.reduce_mean(enc, axis=1)
        else:
            tf.logging.info("1D Convolution Model")
            sizes = range(2, 5)
            result_tensors = []
            for ngram_size in sizes:
                # 256 -> 2,3 best yet.
                text_conv1d = tf.layers.conv1d(
                    inputs=embeds,
                    # (fragment truncated here)
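# The conv branch above is truncated mid-call. The surrounding code (per-ngram
# loop, `result_tensors` list, the "256" comment) suggests the usual
# conv1d -> max-pool-over-time -> concat pattern; a hedged sketch of one
# plausible completion, with `filters=256` and all other details assumed:
for ngram_size in range(2, 5):
    conv = tf.layers.conv1d(inputs=embeds, filters=256,
                            kernel_size=ngram_size, activation=tf.nn.relu,
                            name='conv_%d' % ngram_size)
    result_tensors.append(tf.reduce_max(conv, axis=1))  # max over time
text_embeddings = tf.concat(result_tensors, axis=-1)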
def r_net(self):
    hps = self._hps
    with tf.variable_scope('question_encoding'):
        q_rep = self.question_inputs
        q_states = []
        for i in xrange(hps.num_layers):
            with tf.variable_scope('layer%d' % i):
                q_cell = tf.contrib.rnn.GRUCell(hps.size)
                q_rep, q_state = tf.nn.bidirectional_dynamic_rnn(
                    q_cell, q_cell, q_rep,
                    sequence_length=self.question_lens,
                    dtype=self.dtype)
                q_rep = tf.concat(q_rep, axis=-1)
                q_states.append(q_state)
        assert q_rep.get_shape()[-1].value == 2 * hps.size
    with tf.variable_scope('context_encoding'):
        c_rep = self.context_inputs
        for i in xrange(hps.num_layers):
            with tf.variable_scope('layer%d' % i):
                c_cell = tf.contrib.rnn.GRUCell(hps.size)
                c_rep, c_state = tf.nn.bidirectional_dynamic_rnn(
                    c_cell, c_cell, c_rep,
                    initial_state_fw=q_states[i][0],
                    initial_state_bw=q_states[i][1],
                    sequence_length=self.context_lens)
                c_rep = tf.concat(c_rep, axis=-1)
        assert c_rep.get_shape()[-1].value == 2 * hps.size
    with tf.variable_scope('question_aware'):
        q_a_cell = tf.contrib.rnn.GRUCell(hps.size)
        context_q = multihead_attention(c_rep, q_rep)
        inputs = sfu(c_rep, context_q)
        c_rep, state = tf.nn.bidirectional_dynamic_rnn(
            q_a_cell, q_a_cell, inputs,
            sequence_length=self.context_lens,
            dtype=self.dtype)
        c_rep = tf.concat(c_rep, axis=-1)
    with tf.variable_scope('self_attention'):
        s_a_cell = tf.contrib.rnn.GRUCell(hps.size)
        context_c = multihead_attention(c_rep, c_rep)
        inputs = sfu(c_rep, context_c)
        c_rep, state = tf.nn.bidirectional_dynamic_rnn(
            s_a_cell, s_a_cell, inputs,
            sequence_length=self.context_lens,
            dtype=self.dtype)
        c_rep = tf.concat(c_rep, axis=-1)
        # if hps.mode == 'train':
        #     c_rep = tf.nn.dropout(c_rep, 1.0 - hps.dropout_rate)
        assert c_rep.get_shape()[-1].value == 2 * hps.size
    with tf.variable_scope('output_layer'):
        answer_cell = tf.contrib.rnn.GRUCell(2 * hps.size)
        with tf.variable_scope('pointer'):
            v_q = tf.get_variable('question_parameters',
                                  [hps.batch_size, 2 * hps.size],
                                  self.dtype,
                                  tf.truncated_normal_initializer())
            _, state = pointer(q_rep, v_q, answer_cell)
            tf.get_variable_scope().reuse_variables()
            start_pos_scores, state = pointer(c_rep, state, answer_cell)
            end_pos_scores, state = pointer(c_rep, state, answer_cell)
    self.pos_scores = [start_pos_scores, end_pos_scores]
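# `pointer` is not defined in this file. A minimal sketch of a
# pointer-network step in the spirit of R-Net: attend over the sequence with
# the current state, return the attention logits as position scores, and feed
# the attended vector through the GRU cell. Illustrative only; the argument
# order mirrors the calls above:
def pointer_sketch(seq, state, cell):
    d = seq.get_shape()[-1].value
    s = tf.layers.dense(seq, units=d, use_bias=False, name='w_seq') + \
        tf.expand_dims(tf.layers.dense(state, units=d, use_bias=False,
                                       name='w_state'), axis=1)
    logits = tf.squeeze(tf.layers.dense(tf.tanh(s), units=1,
                                        use_bias=False, name='v'), axis=-1)
    probs = tf.nn.softmax(logits)                         # (batch, len)
    attended = tf.reduce_sum(tf.expand_dims(probs, -1) * seq, axis=1)
    _, new_state = cell(attended, state)
    return logits, new_state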
def mnemonic_reader(self):
    hps = self._hps
    with tf.variable_scope('question_encoding'):
        q_rep = self.question_inputs
        q_states = []
        for i in xrange(hps.num_layers):
            with tf.variable_scope('layer%d' % i):
                q_cell = tf.contrib.rnn.GRUCell(hps.size)
                q_rep, q_state = tf.nn.bidirectional_dynamic_rnn(
                    q_cell, q_cell, q_rep,
                    sequence_length=self.question_lens,
                    dtype=self.dtype)
                q_rep = tf.concat(q_rep, axis=-1)
                q_states.append(q_state)
        assert q_rep.get_shape()[-1].value == 2 * hps.size
    with tf.variable_scope('context_encoding'):
        c_rep = self.context_inputs
        for i in xrange(hps.num_layers):
            with tf.variable_scope('layer%d' % i):
                c_cell = tf.contrib.rnn.GRUCell(hps.size)
                c_rep, c_state = tf.nn.bidirectional_dynamic_rnn(
                    c_cell, c_cell, c_rep,
                    initial_state_fw=q_states[i][0],
                    initial_state_bw=q_states[i][1],
                    sequence_length=self.context_lens)
                c_rep = tf.concat(c_rep, axis=-1)
        assert c_rep.get_shape()[-1].value == 2 * hps.size
    with tf.variable_scope('iterative_aligner'):
        for i in xrange(hps.T):
            with tf.variable_scope('question_aware_%d' % i):
                q_a_cell = tf.contrib.rnn.GRUCell(hps.size)
                context_q = multihead_attention(c_rep, q_rep)
                inputs = sfu(c_rep, context_q)
                c_rep, state = tf.nn.bidirectional_dynamic_rnn(
                    q_a_cell, q_a_cell, inputs,
                    sequence_length=self.context_lens,
                    dtype=self.dtype)
                c_rep = tf.concat(c_rep, axis=-1)
            with tf.variable_scope('self_attention_%d' % i):
                s_a_cell = tf.contrib.rnn.GRUCell(hps.size)
                context_c = multihead_attention(c_rep, c_rep)
                inputs = sfu(c_rep, context_c)
                c_rep, state = tf.nn.bidirectional_dynamic_rnn(
                    s_a_cell, s_a_cell, inputs,
                    sequence_length=self.context_lens,
                    dtype=self.dtype)
                c_rep = tf.concat(c_rep, axis=-1)
        # if hps.mode == 'train':
        #     c_rep = tf.nn.dropout(c_rep, 1.0 - hps.dropout_rate)
        assert c_rep.get_shape()[-1].value == 2 * hps.size
    with tf.variable_scope('memory_based_answer_pointer'):
        z_s = tf.expand_dims(tf.concat(q_state, axis=1), axis=1)
        for i in xrange(hps.L):
            with tf.variable_scope('start_position_%d' % i):
                start_pos_scores, u_s = fn(c_rep, z_s)
            with tf.variable_scope(
                    'start_pos_memory_semantic_fusion_unit_%d' % i):
                z_e = sfu(z_s, u_s)
            with tf.variable_scope('end_position_%d' % i):
                end_pos_scores, u_e = fn(c_rep, z_e)
            with tf.variable_scope(
                    'end_pos_memory_semantic_fusion_unit_%d' % i):
                z_s = sfu(z_e, u_e)
    self.pos_scores = [start_pos_scores, end_pos_scores]
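# `fn` scores every context position against the memory vector z and also
# returns the attended evidence vector u. A hedged sketch, loosely following
# the Mnemonic Reader answer pointer; all names here are illustrative:
def fn_sketch(c_rep, z):
    d = c_rep.get_shape()[-1].value
    s = tf.tanh(tf.layers.dense(c_rep, units=d, use_bias=False, name='w_c') +
                tf.layers.dense(z, units=d, use_bias=False, name='w_z'))
    scores = tf.squeeze(tf.layers.dense(s, units=1, use_bias=False, name='v'),
                        axis=-1)                          # (batch, c_len)
    probs = tf.nn.softmax(scores)
    u = tf.reduce_sum(tf.expand_dims(probs, -1) * c_rep,
                      axis=1, keep_dims=True)             # (batch, 1, d)
    return scores, u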
def transformer(self):
    hps = self._hps
    with tf.variable_scope('question_convolution_encoding'):
        q_rep = self.question_inputs
        q_output = conv_glu_v2(q_rep, 3, 1, hps.size, self.batch_size)
        q_output = tf.cond(self.dropout,
                           lambda: tf.nn.dropout(q_output, keep_prob=1.0 - hps.dropout_rate),
                           lambda: q_output)
        q_rep = short_cut(q_rep, q_output, q_output.get_shape()[-1].value)
    with tf.variable_scope('context_convolution_encoding'):
        c_rep = self.context_inputs
        c_output = conv_glu_v2(c_rep, 3, 1, hps.size, self.batch_size)
        c_output = tf.cond(self.dropout,
                           lambda: tf.nn.dropout(c_output, keep_prob=1.0 - hps.dropout_rate),
                           lambda: c_output)
        c_rep = short_cut(c_rep, c_output, c_output.get_shape()[-1].value)
    with tf.variable_scope('question_encoding'):
        for i in xrange(hps.num_layers):
            with tf.variable_scope('layer%d' % i):
                q_rep, q_state = bi_sru(x=q_rep,
                                        output_size=hps.size,
                                        sequence_length=self.question_lens,
                                        dtype=self.dtype)
                q_rep = tf.concat(q_rep, axis=-1)
                q_rep = tf.layers.dense(q_rep, units=hps.size,
                                        use_bias=False, name='q_rep')
                q_rep = tf.cond(self.dropout,
                                lambda: tf.nn.dropout(q_rep, keep_prob=1.0 - hps.dropout_rate),
                                lambda: q_rep)
        assert q_rep.get_shape()[-1].value == hps.size
    with tf.variable_scope('context_encoding'):
        for i in xrange(hps.num_layers):
            with tf.variable_scope('layer%d' % i):
                c_rep, c_state = bi_sru(x=c_rep,
                                        output_size=hps.size,
                                        sequence_length=self.context_lens,
                                        dtype=self.dtype)
                c_rep = tf.concat(c_rep, axis=-1)
                c_rep = tf.layers.dense(c_rep, units=hps.size,
                                        use_bias=False, name='c_rep')
                c_rep = tf.cond(self.dropout,
                                lambda: tf.nn.dropout(c_rep, keep_prob=1.0 - hps.dropout_rate),
                                lambda: c_rep)
        assert c_rep.get_shape()[-1].value == hps.size
    with tf.variable_scope('iterative_aligner'):
        for i in xrange(hps.T):
            with tf.variable_scope('question_aware_%d' % i):
                with tf.variable_scope('multihead_attention'):
                    context_q = multihead_attention(c_rep, q_rep)
                with tf.variable_scope('gate'):
                    inputs = gate(c_rep, context_q)
                with tf.variable_scope('GRU'):
                    c_rep, c_state = bi_sru(x=inputs,
                                            output_size=hps.size,
                                            sequence_length=self.context_lens,
                                            dtype=self.dtype)
                    c_rep = tf.concat(c_rep, axis=-1)
                    c_rep = tf.layers.dense(c_rep, units=hps.size,
                                            use_bias=False, name='c_rep')
                    c_rep = tf.cond(self.dropout,
                                    lambda: tf.nn.dropout(c_rep, keep_prob=1.0 - hps.dropout_rate),
                                    lambda: c_rep)
            with tf.variable_scope('self_attention_%d' % i):
                with tf.variable_scope('multihead_attention'):
                    context_c = multihead_attention(c_rep, c_rep)
                with tf.variable_scope('semantic_fusion_unit'):
                    inputs = gate(c_rep, context_c)
                with tf.variable_scope('GRU'):
                    c_rep, c_state = bi_sru(x=inputs,
                                            output_size=hps.size,
                                            sequence_length=self.context_lens,
                                            dtype=self.dtype)
                    c_rep = tf.concat(c_rep, axis=-1)
                    c_rep = tf.layers.dense(c_rep, units=hps.size,
                                            use_bias=False, name='c_rep')
                    c_rep = tf.cond(self.dropout,
                                    lambda: tf.nn.dropout(c_rep, keep_prob=1.0 - hps.dropout_rate),
                                    lambda: c_rep)
        assert c_rep.get_shape()[-1].value == hps.size
    with tf.variable_scope('output_layer'):
        with tf.variable_scope('init_state'):
            z_s = tf.layers.dense(tf.concat(q_state, axis=-1), units=hps.size,
                                  use_bias=False, name='z_s')
            z_s = tf.expand_dims(z_s, axis=1)
            z_s = tf.cond(self.dropout,
                          lambda: tf.nn.dropout(z_s, keep_prob=1.0 - hps.dropout_rate),
                          lambda: z_s)
        for i in xrange(hps.T):
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            with tf.variable_scope('start_position'):
                start_pos_scores, u_s = fn(c_rep, z_s)
            with tf.variable_scope('start_pos_memory_semantic_fusion_unit'):
                z_e = sfu(z_s, u_s)
                z_e = tf.cond(self.dropout,
                              lambda: tf.nn.dropout(z_e, keep_prob=1.0 - hps.dropout_rate),
                              lambda: z_e)
            with tf.variable_scope('end_position'):
                end_pos_scores, u_e = fn(c_rep, z_e)
            with tf.variable_scope('end_pos_memory_semantic_fusion_unit'):
                z_s = sfu(z_e, u_e)
                z_s = tf.cond(self.dropout,
                              lambda: tf.nn.dropout(z_s, keep_prob=1.0 - hps.dropout_rate),
                              lambda: z_s)
    self.pos_scores = [start_pos_scores, end_pos_scores]
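# Hedged usage note: `pos_scores` holds unnormalized start/end logits over
# context positions. A simple greedy span prediction (illustrative; `model`
# is an assumed instance of the class above) is:
start_pred = tf.argmax(model.pos_scores[0], axis=-1)
end_pred = tf.argmax(model.pos_scores[1], axis=-1)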