def naive_decode(encoding, state_size, document_length):
    """Decodes encoding to answer span logits.

    Args:
        encoding: Document representation, shape [N, D, xH].
        state_size: Number of hidden units for the dense ReLU layers.
        document_length: Length of each document in the batch, shape [N].
            Positions past a document's length are masked out.

    Returns:
        A tuple containing
            Logit for answer span start position, shape [N, D].
            Logit for answer span end position, shape [N, D].
    """
    with tf.variable_scope('decode_start'):
        start_relu = tf.layers.dense(encoding, state_size, activation=tf.nn.relu)
        start_logit = tf.layers.dense(start_relu, 1)
        start_logit = tf.squeeze(start_logit, -1)  # explicit axis so a batch of one is not squeezed away
        start_logit = _maybe_mask_score(start_logit, document_length, -1e30)

    with tf.variable_scope('decode_end'):
        end_relu = tf.layers.dense(encoding, state_size, activation=tf.nn.relu)
        end_logit = tf.layers.dense(end_relu, 1)
        end_logit = tf.squeeze(end_logit, -1)
        end_logit = _maybe_mask_score(end_logit, document_length, -1e30)

    return start_logit, end_logit
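# `_maybe_mask_score` is referenced throughout but not defined in this listing.
# A minimal sketch, assuming it behaves like the helper of the same name in
# tf.contrib.seq2seq: scores at positions past each sequence's true length are
# replaced with `score_mask_value` so padding can never be selected. The exact
# implementation in the original code may differ.
def _maybe_mask_score(score, sequence_length, score_mask_value):
    """Replaces scores at padded positions with `score_mask_value`."""
    if sequence_length is None:
        return score
    # Boolean mask of valid positions, shape [N, D].
    score_mask = tf.sequence_mask(sequence_length, maxlen=tf.shape(score)[1])
    return tf.where(score_mask, score, score_mask_value * tf.ones_like(score))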
def highway_maxout_network(answer):
    # Closure over the enclosing decoder scope: `encoding`, `state`,
    # `keep_prob`, `max_len`, `state_size`, `pool_size` and `document_length`
    # are free variables captured from the surrounding function.
    span_encoding = start_and_end_encoding(encoding, answer)
    r_input = tf.concat([state, span_encoding], axis=1)
    r_input = tf.nn.dropout(r_input, keep_prob)
    r = tf.layers.dense(r_input, state_size, use_bias=False, activation=tf.tanh)
    r = tf.expand_dims(r, 1)
    r = tf.tile(r, (1, max_len, 1))
    highway_input = tf.concat([encoding, r], 2)
    logit = highway_maxout(highway_input, state_size, pool_size, keep_prob)
    logit = _maybe_mask_score(logit, document_length, -1e30)
    return logit
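# `start_and_end_encoding` is used above but not shown. A minimal sketch,
# assuming it gathers each example's document encoding at the current start
# and end position estimates and concatenates them; the helper body and the
# layout of `answer` (a pair of [N] index tensors) follow the call site and
# are assumptions, not taken from the original code.
def start_and_end_encoding(encoding, answer):
    """Concatenates encoding vectors at the start/end indices, shape [N, 2*xH]."""
    start = tf.cast(answer[0], tf.int32)
    end = tf.cast(answer[1], tf.int32)
    batch_idx = tf.range(tf.shape(encoding)[0])
    # Build (batch, position) index pairs for gather_nd.
    start_encoding = tf.gather_nd(encoding, tf.stack([batch_idx, start], axis=1))
    end_encoding = tf.gather_nd(encoding, tf.stack([batch_idx, end], axis=1))
    return tf.concat([start_encoding, end_encoding], axis=1)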
def highway_maxout(encoding, hidden_state, start, end, context_length,
                   batch_size, state_size):
    """Highway maxout network.

    Defined in the original DCN paper: https://arxiv.org/pdf/1611.01604.pdf
    """
    start_encoding = get_encoding_at_index(encoding, start, batch_size)
    end_encoding = get_encoding_at_index(encoding, end, batch_size)
    # Non-linear projection of the decoder state and the current span estimate.
    r = tf.layers.dense(
        tf.concat([hidden_state, start_encoding, end_encoding], axis=1),
        units=state_size, use_bias=False, activation=tf.tanh)
    r = tf.expand_dims(r, 1)
    r = tf.tile(r, (1, tf.shape(encoding)[1], 1))
    # First layer: this variant substitutes a sparse mixture-of-experts layer
    # for the paper's first maxout layer.
    layer_1 = sparse_mixture_of_experts_layer(tf.concat([encoding, r], axis=2))
    layer_2 = maxout_layer(layer_1)
    # Highway connection: the final maxout sees both intermediate layers.
    output = maxout_layer(tf.concat([layer_1, layer_2], axis=2), 1)
    logit = tf.squeeze(output, -1)
    return _maybe_mask_score(logit, context_length, float('-inf'))
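# `maxout_layer` is called above but not defined in this listing. A minimal
# sketch of a standard maxout layer (Goodfellow et al., 2013), assuming it
# projects to `units * pool_size` features and takes the max over each pool
# of `pool_size` affine pieces; the default values below are illustrative,
# not taken from the original code. TF 1.x also ships a pooling-only variant
# as tf.contrib.layers.maxout.
def maxout_layer(inputs, units=None, pool_size=16):
    """Dense projection followed by a max over `pool_size` affine pieces."""
    if units is None:
        units = inputs.shape[-1].value
    projected = tf.layers.dense(inputs, units * pool_size)
    # Reshape the last dimension into [units, pool_size] and max-pool it.
    shape = tf.concat([tf.shape(projected)[:-1], [units, pool_size]], axis=0)
    return tf.reduce_max(tf.reshape(projected, shape), axis=-1)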
def __init__(self, pretrained_embeddings, hparams):
    self.hparams = copy.copy(hparams)
    self.pretrained_embeddings = pretrained_embeddings

    # Setup placeholders
    self.question = tf.placeholder(tf.int32, (None, None), name='question')
    self.question_length = tf.placeholder(tf.int32, (None,), name='question_length')
    self.paragraph = tf.placeholder(tf.int32, (None, None), name='paragraph')
    self.paragraph_length = tf.placeholder(tf.int32, (None,), name='paragraph_length')
    self.answer_span = tf.placeholder(tf.int32, (None, 2), name='answer_span')
    self.is_training = tf.placeholder(tf.bool, shape=(), name='is_training')  # TODO replace with tf.placeholder_with_default

    # Word embeddings
    with tf.variable_scope('embeddings'):
        embedded_vocab = tf.Variable(self.pretrained_embeddings,
                                     name='shared_embedding',
                                     trainable=hparams['trainable_embeddings'],
                                     dtype=tf.float32)
        q_embeddings = tf.nn.embedding_lookup(embedded_vocab, self.question)
        p_embeddings = tf.nn.embedding_lookup(embedded_vocab, self.paragraph)

    # Character embeddings to word vectors
    if hparams['use_char_cnn']:
        self.question_chars = tf.placeholder(
            tf.int32, (None, None, self.hparams['max_word_length']),
            name='question_chars')
        self.paragraph_chars = tf.placeholder(
            tf.int32, (None, None, self.hparams['max_word_length']),
            name='paragraph_chars')

        with tf.variable_scope('char_cnn', reuse=tf.AUTO_REUSE):
            filter_widths = [5]   # TODO add as comma-separated FLAGS argument
            num_filters = [100]   # TODO add as comma-separated FLAGS argument
            char_embeddings = tf.get_variable(
                'char_embeddings',
                shape=[self.hparams['char_vocab_size'], self.hparams['char_embedding_size']],
                dtype=tf.float32)
            q_word_vectors = char_cnn_word_vectors(self.question_chars, char_embeddings,
                                                   filter_widths, num_filters)
            # Filters are reused between question and paragraph (AUTO_REUSE).
            p_word_vectors = char_cnn_word_vectors(self.paragraph_chars, char_embeddings,
                                                   filter_widths, num_filters)
        q_embeddings = tf.concat([q_embeddings, q_word_vectors], axis=2)
        p_embeddings = tf.concat([p_embeddings, p_word_vectors], axis=2)

    # Setup RNN cells
    cell = lambda: cell_factory(hparams['cell'], hparams['state_size'], self.is_training,
                                hparams['input_keep_prob'], hparams['output_keep_prob'],
                                hparams['state_keep_prob'])
    final_cell = lambda: cell_factory(hparams['cell'], hparams['state_size'], self.is_training,
                                      hparams['final_input_keep_prob'], hparams['output_keep_prob'],
                                      hparams['state_keep_prob'])  # TODO TEMP

    # Setup encoders
    with tf.variable_scope('prediction'):
        if hparams['model'] == 'baseline':
            self.encode = baseline_encode
        elif hparams['model'] == 'dcn':
            self.encode = dcn_encode
        else:
            self.encode = dcnplus_encode
        encoding = self.encode(cell, final_cell,
                               q_embeddings, self.question_length,
                               p_embeddings, self.paragraph_length,
                               keep_prob=maybe_dropout(hparams['keep_prob'], self.is_training))
        encoding = tf.nn.dropout(encoding,
                                 keep_prob=maybe_dropout(hparams['encoding_keep_prob'], self.is_training))

    # Decoder, loss and prediction mechanisms differ between baseline/mixed and dcn/dcnplus
    if hparams['model'] in ('baseline', 'mixed'):
        with tf.variable_scope('prediction'):
            start_logit, end_logit = naive_decode(encoding, hparams['state_size'], self.paragraph_length)
            start_prob, end_prob = tf.nn.softmax(start_logit), tf.nn.softmax(end_logit)
            self.answer = max_product_span(start_prob, end_prob, self.paragraph_length)
        with tf.variable_scope('loss'):
            start_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=start_logit, labels=self.answer_span[:, 0], name='start_loss')
            end_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=end_logit, labels=self.answer_span[:, 1], name='end_loss')
            loss_per_example = start_loss + end_loss
            self.loss = tf.reduce_mean(loss_per_example)
    elif hparams['model'] in ('dcn', 'dcnplus'):
        with tf.variable_scope('prediction'):
            logits = dcn_decode(encoding, self.paragraph_length,
                                hparams['state_size'], hparams['pool_size'], hparams['max_iter'],
                                keep_prob=maybe_dropout(hparams['keep_prob'], self.is_training))
            last_iter_logit = logits.read(hparams['max_iter'] - 1)
            start_logit, end_logit = last_iter_logit[:, :, 0], last_iter_logit[:, :, 1]
            start = tf.argmax(start_logit, axis=1, name='answer_start')
            if hparams['force_end_gt_start']:
                # Mask end scores at positions before the predicted start.
                end_logit = _maybe_mask_to_start(end_logit, start, -1e30)
            if hparams['max_answer_length'] > 0:
                # Mask end scores beyond the maximum allowed answer length.
                end_logit = _maybe_mask_score(end_logit, start + hparams['max_answer_length'], -1e30)
            self.answer = (start, tf.argmax(end_logit, axis=1, name='answer_end'))
        with tf.variable_scope('loss'):
            self.loss = dcn_loss(logits, self.answer_span, max_iter=hparams['max_iter'])
            # Solely for diagnostic purposes
            with tf.variable_scope('last_iter_loss'):
                start_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=start_logit, labels=self.answer_span[:, 0], name='start_loss')
                end_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=end_logit, labels=self.answer_span[:, 1], name='end_loss')
                last_loss = tf.reduce_mean(start_loss + end_loss)
            tf.summary.scalar('cross_entropy_last_iter', last_loss)

    global_step = tf.train.get_or_create_global_step()
    with tf.variable_scope('train'):
        if hparams['exponential_decay']:
            lr = tf.train.exponential_decay(learning_rate=hparams['learning_rate'],
                                            global_step=global_step,
                                            decay_steps=hparams['decay_steps'],
                                            decay_rate=hparams['decay_rate'],
                                            staircase=hparams['staircase'])
        else:
            lr = hparams['learning_rate']
        optimizer = tf.train.AdamOptimizer(lr)
        grad, tvars = zip(*optimizer.compute_gradients(self.loss))
        if hparams['clip_gradients']:
            grad, _ = tf.clip_by_global_norm(grad, hparams['max_gradient_norm'], name='gradient_clipper')
        grad_norm = tf.global_norm(grad)
        self.train = optimizer.apply_gradients(zip(grad, tvars), global_step=global_step, name='apply_grads')

    tf.summary.scalar('cross_entropy', self.loss)
    tf.summary.scalar('learning_rate', lr)
    tf.summary.scalar('grad_norm', grad_norm)
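# A minimal usage sketch for the graph built above. The class name `Model`,
# the fake data, and the vocabulary/embedding sizes are illustrative
# assumptions, not taken from the original code; `hparams` stands for a dict
# with the keys referenced in __init__ (model, cell, state_size, ...).
import numpy as np
import tensorflow as tf

embeddings = np.random.randn(1000, 50).astype(np.float32)  # stand-in for pretrained vectors
model = Model(embeddings, hparams)  # `Model` is an assumed class name

batch = {
    model.question: np.random.randint(1000, size=(4, 20)),      # [batch, max_question_len]
    model.question_length: np.full(4, 20),
    model.paragraph: np.random.randint(1000, size=(4, 200)),    # [batch, max_paragraph_len]
    model.paragraph_length: np.full(4, 200),
    model.answer_span: np.stack([np.full(4, 5), np.full(4, 9)], axis=1),  # [batch, 2]
    model.is_training: True,
}
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, loss = sess.run([model.train, model.loss], feed_dict=batch)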
def __init__(self, word_level_num_units, uttr_level_num_units, n_hidden_units,
             memory, memory_sequence_length, hist_length):
    """Construct the utterance level attention mechanism.

    Args:
        word_level_num_units: Word level attention depth.
        uttr_level_num_units: Utterance level attention depth.
        n_hidden_units: Number of hidden units for utterance-level encoder.
        memory: The memory to query; the output of the bidirectional RNNs.
            This tensor should be shaped
            `[max_hist_len, batch_size, max_uttr_len, 2*n_hidden_units]`.
        memory_sequence_length: Sequence lengths for the batch entries in
            memory. Shaped `[max_hist_len, batch_size]`.
        hist_length: Lengths for the utterances in history. Shaped
            `[batch_size]`.
    """
    self._query_layer = tf.layers.Dense(uttr_level_num_units,
                                        name='uttr_level_query_layer',
                                        use_bias=False, dtype=tf.float32)
    self._memory_layer = tf.layers.Dense(uttr_level_num_units,
                                         name='uttr_level_memory_layer',
                                         use_bias=False, dtype=tf.float32)
    self._uttr_enc_cell = tf.nn.rnn_cell.GRUCell(n_hidden_units)
    self._uttr_level_num_units = uttr_level_num_units

    word_level_dec_query_layer = tf.layers.Dense(
        word_level_num_units, name='word_level_dec_query_layer',
        use_bias=False, dtype=tf.float32)
    word_level_enc_query_layer = tf.layers.Dense(
        word_level_num_units, name='word_level_enc_query_layer',
        use_bias=False, dtype=tf.float32)
    word_level_memory_layer = tf.layers.Dense(
        word_level_num_units, name='word_level_memory_layer',
        use_bias=False, dtype=tf.float32)

    self._attention_v_ul = tf.Variable(
        tf.truncated_normal([uttr_level_num_units], stddev=0.1),
        name='attention_v_ul')
    self._attention_v_wl = tf.Variable(
        tf.truncated_normal([word_level_num_units], stddev=0.1),
        name='attention_v_wl')

    self._word_level_attns = []
    for i in range(memory.shape[0].value):
        self._word_level_attns.append(
            WordLevelAttentionMechanism(
                attention_v=self._attention_v_wl,
                dec_query_layer=word_level_dec_query_layer,
                enc_query_layer=word_level_enc_query_layer,
                memory_layer=word_level_memory_layer,
                memory=memory[i, :, :, :],
                memory_sequence_length=memory_sequence_length[i, :]))

    self.dtype = tf.float32
    self._memory = memory
    self._hist_length = hist_length
    self._batch_size = memory.shape[1].value
    self._alignments_size = memory.shape[0].value
    self._alignments_w_size = (self._alignments_size *
                               self._word_level_attns[0].alignments_size)

    score_mask_value = tf.as_dtype(self.dtype).as_numpy_dtype(-np.inf)
    self._probability_fn = lambda score: tf.nn.softmax(
        _maybe_mask_score(score, self._hist_length, score_mask_value))

    # self._values changes each time self.__call__() is called.
    self._values = tf.zeros(
        [self._batch_size, self._alignments_size, n_hidden_units])
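# The utterance-level score computation itself is not part of this listing.
# A minimal sketch of the additive (Bahdanau-style) scoring that the layers
# above suggest: `keys` is assumed to be the memory already projected by
# `self._memory_layer`, and the method name is hypothetical; the original
# __call__ may combine word- and utterance-level alignments differently.
def _uttr_level_score(self, query, keys):
    """Additive score v^T tanh(W_q q + W_m m); returns shape [batch, max_hist_len]."""
    # [batch, 1, uttr_level_num_units], broadcast against every memory slot.
    processed_query = tf.expand_dims(self._query_layer(query), 1)
    return tf.reduce_sum(
        self._attention_v_ul * tf.tanh(keys + processed_query), axis=2)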