Example No. 1
def naive_decode(encoding, state_size, document_length):
    """Decodes the document encoding into answer span logits.

    Args:
        encoding: Document representation, shape [N, D, xH].
        state_size: Number of units in the hidden ReLU layers.
        document_length: Length of each document in the batch, shape [N].

    Returns:
        A tuple containing:
            Logit for the answer span start position, shape [N, D].
            Logit for the answer span end position, shape [N, D].
    """

    with tf.variable_scope('decode_start'):
        start_relu = tf.layers.dense(encoding,
                                     state_size,
                                     activation=tf.nn.relu)
        start_logit = tf.layers.dense(start_relu, 1)
        start_logit = tf.squeeze(start_logit, -1)
        start_logit = _maybe_mask_score(start_logit, document_length, -1e30)

    with tf.variable_scope('decode_end'):
        end_relu = tf.layers.dense(encoding, state_size, activation=tf.nn.relu)
        end_logit = tf.layers.dense(end_relu, 1)
        end_logit = tf.squeeze(end_logit, -1)
        end_logit = _maybe_mask_score(end_logit, document_length, -1e30)
    return start_logit, end_logit
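
These decoders rely on a _maybe_mask_score helper that pushes scores past the valid document length to a very negative value, so padded positions can never win the softmax or argmax. The helper itself is not shown in these examples; a minimal sketch of what it presumably does, assuming score has shape [N, D] and the length tensor has shape [N], is:

import tensorflow as tf  # TF 1.x API, matching the snippets above

def _maybe_mask_score(score, sequence_length, score_mask_value):
    """Replaces scores beyond sequence_length with score_mask_value (sketch)."""
    if sequence_length is None:
        return score
    score_mask = tf.sequence_mask(sequence_length, maxlen=tf.shape(score)[1])
    return tf.where(score_mask, score, score_mask_value * tf.ones_like(score))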
Example No. 2
def highway_maxout_network(answer):
    """Computes answer span logits with a highway maxout network.

    This is a closure: encoding, state, keep_prob, state_size, max_len,
    pool_size and document_length are taken from the enclosing scope.
    """
    span_encoding = start_and_end_encoding(encoding, answer)
    r_input = tf.concat([state, span_encoding], axis=1)
    r_input = tf.nn.dropout(r_input, keep_prob)
    r = tf.layers.dense(r_input,
                        state_size,
                        use_bias=False,
                        activation=tf.tanh)
    r = tf.expand_dims(r, 1)
    r = tf.tile(r, (1, max_len, 1))
    highway_input = tf.concat([encoding, r], 2)
    logit = highway_maxout(highway_input, state_size, pool_size, keep_prob)
    logit = _maybe_mask_score(logit, document_length, -1e30)
    return logit
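
The closure's first step, start_and_end_encoding, gathers the document encoding at the current start and end guesses and concatenates the two vectors. That helper is also not shown here; a rough sketch, assuming encoding is [N, D, xH] and answer is [N, 2] holding (start, end) indices, could be:

import tensorflow as tf

def start_and_end_encoding(encoding, answer):
    """Concatenates the encoding vectors at the guessed start and end (sketch)."""
    batch_range = tf.range(tf.shape(encoding)[0])
    start_idx = tf.stack([batch_range, answer[:, 0]], axis=1)   # [N, 2]
    end_idx = tf.stack([batch_range, answer[:, 1]], axis=1)     # [N, 2]
    start_encoding = tf.gather_nd(encoding, start_idx)          # [N, xH]
    end_encoding = tf.gather_nd(encoding, end_idx)              # [N, xH]
    return tf.concat([start_encoding, end_encoding], axis=1)    # [N, 2xH]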
Example No. 3
def highway_maxout(encoding, hidden_state, start, end, context_length,
                   batch_size, state_size):
    """
  Highway maxout network
  Defined in original DCN paper: https://arxiv.org/pdf/1611.01604.pdf
  """
    start_encoding = get_encoding_at_index(encoding, start, batch_size)
    end_encoding = get_encoding_at_index(encoding, end, batch_size)
    r = tf.layers.dense(tf.concat([hidden_state, start_encoding, end_encoding],
                                  axis=1),
                        use_bias=False,
                        activation=tf.tanh,
                        units=state_size)
    r = tf.expand_dims(r, 1)
    r = tf.tile(r, (1, tf.shape(encoding)[1], 1))
    layer_1 = sparse_mixture_of_experts_layer(tf.concat([encoding, r], axis=2))
    layer_2 = maxout_layer(layer_1)
    output = maxout_layer(tf.concat([layer_1, layer_2], axis=2), 1)
    logit = tf.squeeze(output, -1)
    return _maybe_mask_score(logit, context_length, float('-inf'))
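
The maxout_layer calls above follow the maxout pattern from the DCN paper: a linear projection into pool_size pieces per output unit, followed by a max over the pieces. A standalone sketch of such a layer (the original helper's exact signature and defaults are assumptions) is:

import tensorflow as tf

def maxout_layer(inputs, units=None, pool_size=16):
    """Dense projection to units * pool_size, then max over the pool axis (sketch)."""
    if units is None:
        units = inputs.shape[-1].value  # assumption: default to the input width
    projected = tf.layers.dense(inputs, units * pool_size)
    new_shape = tf.concat([tf.shape(inputs)[:-1], [units, pool_size]], axis=0)
    pieces = tf.reshape(projected, new_shape)
    return tf.reduce_max(pieces, axis=-1)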
Example No. 4
    def __init__(self, pretrained_embeddings, hparams):
        self.hparams = copy.copy(hparams)
        self.pretrained_embeddings = pretrained_embeddings

        # Setup placeholders
        self.question = tf.placeholder(tf.int32, (None, None), name='question')
        self.question_length = tf.placeholder(tf.int32, (None,), name='question_length')
        self.paragraph = tf.placeholder(tf.int32, (None, None), name='paragraph')
        self.paragraph_length = tf.placeholder(tf.int32, (None,), name='paragraph_length')
        self.answer_span = tf.placeholder(tf.int32, (None, 2), name='answer_span')
        self.is_training = tf.placeholder(tf.bool, shape=(), name='is_training')   # replace with tf.placeholder_with_default

        # Word embeddings
        with tf.variable_scope('embeddings'):
            embedded_vocab = tf.Variable(self.pretrained_embeddings, name='shared_embedding', trainable=hparams['trainable_embeddings'], dtype=tf.float32)  
            q_embeddings = tf.nn.embedding_lookup(embedded_vocab, self.question)
            p_embeddings = tf.nn.embedding_lookup(embedded_vocab, self.paragraph)
        
        # Character embeddings to word vectors
        if hparams['use_char_cnn']:
            self.question_chars = tf.placeholder(tf.int32, (None, None, self.hparams['max_word_length']), name='question_chars')
            self.paragraph_chars = tf.placeholder(tf.int32, (None, None, self.hparams['max_word_length']), name='paragraph_chars')
        
            with tf.variable_scope('char_cnn', reuse=tf.AUTO_REUSE):
                filter_widths = [5]  # TODO add as comma separated FLAGS argument
                num_filters = [100]  # TODO add as comma separated FLAGS argument
                char_embeddings = tf.get_variable('char_embeddings', shape=[self.hparams['char_vocab_size'], self.hparams['char_embedding_size']], dtype=tf.float32)
                q_word_vectors = char_cnn_word_vectors(self.question_chars, char_embeddings, filter_widths, num_filters)
                p_word_vectors = char_cnn_word_vectors(self.paragraph_chars, char_embeddings, filter_widths, num_filters)  # reusing filters
                q_embeddings = tf.concat([q_embeddings, q_word_vectors], axis=2)
                p_embeddings = tf.concat([p_embeddings, p_word_vectors], axis=2)

        # Setup RNN Cells
        cell = lambda: cell_factory(hparams['cell'], hparams['state_size'], self.is_training, hparams['input_keep_prob'], hparams['output_keep_prob'], hparams['state_keep_prob'])
        final_cell = lambda: cell_factory(hparams['cell'], hparams['state_size'], self.is_training, hparams['final_input_keep_prob'], hparams['output_keep_prob'], hparams['state_keep_prob'])  # TODO TEMP

        # Setup Encoders
        with tf.variable_scope('prediction'):
            if hparams['model'] == 'baseline':
                self.encode = baseline_encode
            elif hparams['model'] == 'dcn':
                self.encode = dcn_encode
            else:
                self.encode = dcnplus_encode
            encoding = self.encode(cell, final_cell, q_embeddings, self.question_length, p_embeddings, self.paragraph_length, keep_prob=maybe_dropout(hparams['keep_prob'], self.is_training))
            encoding = tf.nn.dropout(encoding, keep_prob=maybe_dropout(hparams['encoding_keep_prob'], self.is_training))
        
        # Decoder, loss and prediction mechanisms differ between baseline/mixed and dcn/dcnplus
        if hparams['model'] in ('baseline', 'mixed'):
            with tf.variable_scope('prediction'):
                start_logit, end_logit = naive_decode(encoding, hparams['state_size'], self.paragraph_length)
                start_prob, end_prob = tf.nn.softmax(start_logit), tf.nn.softmax(end_logit)
                self.answer = max_product_span(start_prob, end_prob, self.paragraph_length)

            with tf.variable_scope('loss'):
                start_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=start_logit, labels=self.answer_span[:, 0], name='start_loss')
                end_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=end_logit, labels=self.answer_span[:, 1], name='end_loss')
                loss_per_example = start_loss + end_loss
                self.loss = tf.reduce_mean(loss_per_example)

        elif hparams['model'] in ('dcn', 'dcnplus'):
            with tf.variable_scope('prediction'):
                logits = dcn_decode(encoding, self.paragraph_length, hparams['state_size'], hparams['pool_size'], hparams['max_iter'], keep_prob=maybe_dropout(hparams['keep_prob'], self.is_training))
                last_iter_logit = logits.read(hparams['max_iter']-1)
                start_logit, end_logit = last_iter_logit[:,:,0], last_iter_logit[:,:,1]
                start = tf.argmax(start_logit, axis=1, name='answer_start')
                if hparams['force_end_gt_start']:
                    end_logit = _maybe_mask_to_start(end_logit, start, -1e30)
                if hparams['max_answer_length'] > 0:
                    end_logit = _maybe_mask_score(end_logit, start+hparams['max_answer_length'], -1e30)
                self.answer = (start, tf.argmax(end_logit, axis=1, name='answer_end'))

            with tf.variable_scope('loss'):
                self.loss = dcn_loss(logits, self.answer_span, max_iter=hparams['max_iter'])

            # Solely for diagnostic purposes
            with tf.variable_scope('last_iter_loss'):
                start_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=start_logit, labels=self.answer_span[:, 0], name='start_loss')
                end_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=end_logit, labels=self.answer_span[:, 1], name='end_loss')
                last_loss = tf.reduce_mean(start_loss + end_loss)
            tf.summary.scalar('cross_entropy_last_iter', last_loss)

        global_step = tf.train.get_or_create_global_step()
        with tf.variable_scope('train'):
            if hparams['exponential_decay']:
                lr = tf.train.exponential_decay(learning_rate=hparams['learning_rate'], 
                                                global_step=global_step, 
                                                decay_steps=hparams['decay_steps'], 
                                                decay_rate=hparams['decay_rate'], 
                                                staircase=hparams['staircase']) 
            else:
                lr = hparams['learning_rate']
            optimizer = tf.train.AdamOptimizer(lr)
            grad, tvars = zip(*optimizer.compute_gradients(self.loss))
            if hparams['clip_gradients']:
                grad, _ = tf.clip_by_global_norm(grad, hparams['max_gradient_norm'], name='gradient_clipper')  
            grad_norm = tf.global_norm(grad)
            self.train = optimizer.apply_gradients(zip(grad, tvars), global_step=global_step, name='apply_grads')
        
        tf.summary.scalar('cross_entropy', self.loss)
        tf.summary.scalar('learning_rate', lr)
        tf.summary.scalar('grad_norm', grad_norm)
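
The encoder and decoder calls above wrap their dropout rates in maybe_dropout, which presumably switches the keep probability to 1.0 outside of training so dropout becomes a no-op at evaluation time. A minimal sketch, assuming is_training is the scalar boolean placeholder defined above, would be:

import tensorflow as tf

def maybe_dropout(keep_prob, is_training):
    """Returns keep_prob during training and 1.0 otherwise (sketch)."""
    return tf.cond(is_training,
                   lambda: tf.constant(keep_prob, dtype=tf.float32),
                   lambda: tf.constant(1.0, dtype=tf.float32))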
Example No. 5
    def __init__(self, word_level_num_units, uttr_level_num_units,
                 n_hidden_units, memory, memory_sequence_length, hist_length):
        """Construct the utterance level attention mechanism.
        Args:
            word_level_num_units: Word level attention depth.
            uttr_level_num_units: Utterance level attention depth.
            n_hidden_units: Number of hidden units for utterance-level encoder.
            memory: The memory to query; the output of the bidirectional RNNs.  This
                tensor should be shaped `[max_hist_len, batch_size, max_uttr_len, 2*n_hidden_units]`.
            memory_sequence_length: Sequence lengths for the batch entries in memory.
                Shaped `[max_hist_len, batch_size]`.
            hist_length: Lengths for the utterances in history. Shaped `[batch_size]`.
        """
        self._query_layer = tf.layers.Dense(uttr_level_num_units,
                                            name='uttr_level_query_layer',
                                            use_bias=False,
                                            dtype=tf.float32)
        self._memory_layer = tf.layers.Dense(uttr_level_num_units,
                                             name='uttr_level_memory_layer',
                                             use_bias=False,
                                             dtype=tf.float32)
        self._uttr_enc_cell = tf.nn.rnn_cell.GRUCell(n_hidden_units)
        self._uttr_level_num_units = uttr_level_num_units

        word_level_dec_query_layer = tf.layers.Dense(
            word_level_num_units,
            name='word_level_dec_query_layer',
            use_bias=False,
            dtype=tf.float32)
        word_level_enc_query_layer = tf.layers.Dense(
            word_level_num_units,
            name='word_level_enc_query_layer',
            use_bias=False,
            dtype=tf.float32)
        word_level_memory_layer = tf.layers.Dense(
            word_level_num_units,
            name='word_level_memory_layer',
            use_bias=False,
            dtype=tf.float32)

        self._attention_v_ul = tf.Variable(
            tf.truncated_normal([uttr_level_num_units], stddev=0.1),
            name='attention_v_ul')
        self._attention_v_wl = tf.Variable(
            tf.truncated_normal([word_level_num_units], stddev=0.1),
            name='attention_v_wl')

        self._word_level_attns = []
        for i in range(memory.shape[0].value):
            self._word_level_attns.append(
                WordLevelAttentionMechanism(
                    attention_v=self._attention_v_wl,
                    dec_query_layer=word_level_dec_query_layer,
                    enc_query_layer=word_level_enc_query_layer,
                    memory_layer=word_level_memory_layer,
                    memory=memory[i, :, :, :],
                    memory_sequence_length=memory_sequence_length[i, :]))

        self.dtype = tf.float32

        self._memory = memory
        self._hist_length = hist_length
        self._batch_size = memory.shape[1].value
        self._alignments_size = memory.shape[0].value

        self._alignments_w_size = (
            self._alignments_size * self._word_level_attns[0].alignments_size)

        score_mask_value = tf.as_dtype(self.dtype).as_numpy_dtype(-np.inf)
        self._probability_fn = lambda score: tf.nn.softmax(
            _maybe_mask_score(score, self._hist_length, score_mask_value))

        # self._values changes each time self.__call__() is called
        self._values = tf.zeros(
            [self._batch_size, self._alignments_size, n_hidden_units])
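
The query layer, memory layer and attention_v variables set up here are the standard ingredients of Bahdanau-style additive attention, so the utterance-level score that would eventually be fed through self._probability_fn presumably has the following form. This is only a sketch under that assumption; the mechanism's actual __call__ is not shown above, and the argument names are illustrative.

import tensorflow as tf

def additive_score(processed_query, keys, attention_v):
    """Bahdanau-style additive score: sum(v * tanh(keys + query)) (sketch).

    processed_query: query projected by the query layer, [batch_size, num_units].
    keys: memory projected by the memory layer, [batch_size, max_hist_len, num_units].
    attention_v: a [num_units] vector such as self._attention_v_ul above.
    """
    processed_query = tf.expand_dims(processed_query, 1)  # [batch_size, 1, num_units]
    return tf.reduce_sum(attention_v * tf.tanh(keys + processed_query), axis=2)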