示例#1
0
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        The attention mechanism is selected by self.attention, which must be
        either 'Baseline' (BasicAttn) or 'BiDAF' (BiDAFAttn).

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.

        Raises:
          ValueError: if self.attention is not one of the supported attention types.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the same encoder object
        # builds both the context and the question representations).
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        if self.attention == 'Baseline':
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)

            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4)

        elif self.attention == 'BiDAF':
            attn_layer = BiDAFAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            # attn_output_c2q is shape (batch_size, context_len, hidden_size*2);
            # attn_output_q2c is shape (batch_size, 1, hidden_size*2)
            _, attn_output_c2q, attn_output_q2c = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)

            # Concat the context states with both attention outputs and their
            # element-wise products with the context states to get blended_reps.
            # The q2c product relies on broadcasting over the context_len axis.
            blended_reps = tf.concat(
                [context_hiddens,
                 attn_output_c2q,
                 context_hiddens * attn_output_c2q,
                 context_hiddens * attn_output_q2c],
                axis=2) # (batch_size, context_len, hidden_size*8)

        else:
            # Without this branch an unknown value would leave blended_reps
            # unbound and surface later as a confusing UnboundLocalError.
            raise ValueError(
                "Unsupported attention type %r; expected 'Baseline' or 'BiDAF'"
                % (self.attention,))

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
    def build_graph(self):
        """Assembles the model graph from the input embeddings up to the answer-span distributions.

        The graph is built in four stages, each under its own variable scope:
        a stacked embedding encoder applied to context and question, a
        context-query (BiDAF) attention layer, three passes through one model
        encoder, and two softmax output layers.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            The pre-softmax scores for the start and end positions.
            Important: these are -large in the pad locations, which is required
            when they are fed into the cross entropy function.
          self.pdist_start, self.pdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            The (masked) softmax of logits_start and logits_end.
        """
        flags = self.flags
        dropout_keep = self.keep_prob

        # Stage 1: one EncoderBlock encodes both context and question; the
        # second call passes reuse=True.
        with tf.variable_scope("StackedEmbeddingEncoder"):
            emb_encoder = EncoderBlock(
                flags.num_blocks_enc,
                dropout_keep,
                flags.kernel_size_enc,
                flags.d_model,
                flags.num_conv_enc,
                flags.num_heads,
                flags.d_ff,
                l2_lambda=flags.l2_lambda)
            c_enc = emb_encoder.build_graph(
                self.c_embs, self.c_longest, self.c_mask,
                reduce_input_dim=True, reuse=None)
            q_enc = emb_encoder.build_graph(
                self.q_embs, self.q_longest, self.q_mask,
                reduce_input_dim=True, reuse=True)

        # Stage 2: bidirectional context-query attention.
        with tf.variable_scope("ContextQueryAttention"):
            attention = BiDAFAttn(dropout_keep, l2_lambda=flags.l2_lambda)
            # Shape: [batch_size, context_len, vec_size*8].
            attn_outputs = attention.build_graph(
                c_enc, self.c_mask, self.c_longest,
                q_enc, self.q_mask, self.q_longest)

        # Stage 3: run the same EncoderBlock three times, each pass consuming
        # the output of the previous one.
        with tf.variable_scope("ModelEncoder"):
            model_encoder = EncoderBlock(
                flags.num_blocks_mod,
                dropout_keep,
                flags.kernel_size_mod,
                flags.d_model,
                flags.num_conv_mod,
                flags.num_heads,
                flags.d_ff,
                l2_lambda=flags.l2_lambda)
            passes = [
                model_encoder.build_graph(
                    attn_outputs, self.c_longest, self.c_mask,
                    reduce_input_dim=True)
            ]
            for _ in range(2):
                passes.append(
                    model_encoder.build_graph(
                        passes[-1], self.c_longest, self.c_mask, reuse=True))
            model_1, model_2, model_3 = passes

        # Stage 4: simple softmax output layers. The start distribution reads
        # passes 1 and 2; the end distribution reads passes 1 and 3.
        with tf.variable_scope("Output"):
            with tf.variable_scope("StartDistribution"):
                start_inputs = tf.concat([model_1, model_2], axis=-1)
                start_softmax = SimpleSoftmaxLayer(l2_lambda=flags.l2_lambda)
                start_logits, start_pdist = start_softmax.build_graph(
                    start_inputs, self.c_mask)
                self.logits_start, self.pdist_start = start_logits, start_pdist

            with tf.variable_scope("EndDistribution"):
                end_inputs = tf.concat([model_1, model_3], axis=-1)
                end_softmax = SimpleSoftmaxLayer(l2_lambda=flags.l2_lambda)
                end_logits, end_pdist = end_softmax.build_graph(
                    end_inputs, self.c_mask)
                self.logits_end, self.pdist_end = end_logits, end_pdist
示例#3
0
文件: qa_model.py 项目: xuwd11/QANet
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        The architecture is selected by three flags:
          FLAGS.cell_type: 'rnn_gru' / 'rnn_lstm' (RNNEncoder) or 'qanet' (QAEncoder)
            for the shared context/question encoder.
          FLAGS.attention: 'basic' (BasicAttn) or 'bidaf' (BiDAFAttn).
          FLAGS.modeling_layer: 'basic', 'rnn', 'qanet' or 'qanet2'.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.

        Raises:
          ValueError: if any of the three flags holds an unsupported value.
            Previously such values either hit an unbound local variable or
            silently left the logits/probdist attributes undefined.
        """

        # Build the encoder that produces hidden states for the context and
        # the question. The same encoder object is used for both inputs.
        if self.FLAGS.cell_type in ['rnn_gru', 'rnn_lstm']:
            encoder = RNNEncoder(self.FLAGS.hidden_size,
                                 self.keep_prob,
                                 cell_type=self.FLAGS.cell_type)
        elif self.FLAGS.cell_type == 'qanet':
            encoder = QAEncoder(num_blocks=self.FLAGS.emb_num_blocks,
                                num_layers=self.FLAGS.emb_num_layers,
                                num_heads=self.FLAGS.emb_num_heads,
                                filters=self.FLAGS.hidden_size,
                                kernel_size=self.FLAGS.emb_kernel_size,
                                keep_prob=self.keep_prob,
                                input_mapping=True)
        else:
            raise ValueError(
                "Unsupported cell_type %r; expected 'rnn_gru', 'rnn_lstm' or 'qanet'"
                % (self.FLAGS.cell_type,))

        # For the RNN encoders the original notes give the output shapes as
        # (batch_size, context_len, hidden_size*2) and
        # (batch_size, question_len, hidden_size*2).
        context_hiddens = encoder.build_graph(self.context_embs,
                                              self.context_mask)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)

        if self.FLAGS.attention == 'basic':
            # Use context hidden states to attend to question hidden states
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                                   self.FLAGS.hidden_size * 2)
            _, attn_output = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens
            )  # attn_output is shape (batch_size, context_len, hidden_size*2)

            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat(
                [context_hiddens, attn_output],
                axis=2)  # (batch_size, context_len, hidden_size*4)

        elif self.FLAGS.attention == 'bidaf':
            # BiDAFAttn returns the blended representation directly.
            attn_layer = BiDAFAttn(self.keep_prob)
            blended_reps = attn_layer.build_graph(context_hiddens,
                                                  self.context_mask,
                                                  question_hiddens,
                                                  self.qn_mask)

        else:
            raise ValueError(
                "Unsupported attention %r; expected 'basic' or 'bidaf'"
                % (self.FLAGS.attention,))

        if self.FLAGS.modeling_layer == 'basic':
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            blended_reps_final = tf.contrib.layers.fully_connected(
                blended_reps,
                num_outputs=self.FLAGS.hidden_size,
                weights_initializer=initializer_relu()
            )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with tf.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    blended_reps_final, self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with tf.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    blended_reps_final, self.context_mask)

        elif self.FLAGS.modeling_layer == 'rnn':
            # Two stacked RNN encoders: m1 feeds the start distribution,
            # m2 (built on top of m1) feeds the end distribution.
            encoder_start = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob,
                                       cell_type=self.FLAGS.cell_type, name='m1')
            m1 = encoder_start.build_graph(blended_reps, self.context_mask)
            encoder_end = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob,
                                     cell_type=self.FLAGS.cell_type, name='m2')
            m2 = encoder_end.build_graph(m1, self.context_mask)
            with tf.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    tf.concat([blended_reps, m1], -1), self.context_mask)
            with tf.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    tf.concat([blended_reps, m2], -1), self.context_mask)

        elif self.FLAGS.modeling_layer == 'qanet':
            modeling_encoder = QAEncoder(num_blocks=self.FLAGS.model_num_blocks,
                                         num_layers=self.FLAGS.model_num_layers,
                                         num_heads=self.FLAGS.model_num_heads,
                                         filters=self.FLAGS.hidden_size,
                                         kernel_size=self.FLAGS.model_kernel_size,
                                         keep_prob=self.keep_prob, input_mapping=False,
                                         name='modeling_encoder')
            # Kernel-size-1 convolution maps the attention output to
            # hidden_size filters before the modeling encoder.
            m0 = tf.layers.conv1d(blended_reps, filters=self.FLAGS.hidden_size,
                                  kernel_size=1, padding='SAME', name='attn_mapping')
            # The same encoder object is applied three times in sequence.
            m1 = modeling_encoder.build_graph(m0, self.context_mask)
            m2 = modeling_encoder.build_graph(m1, self.context_mask)
            m3 = modeling_encoder.build_graph(m2, self.context_mask)
            with tf.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    tf.concat([m1, m2], -1), self.context_mask)
            with tf.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    tf.concat([m1, m3], -1), self.context_mask)

        elif self.FLAGS.modeling_layer == 'qanet2':
            modeling_encoder1 = QAEncoder(num_blocks=self.FLAGS.model_num_blocks,
                                          num_layers=self.FLAGS.model_num_layers,
                                          num_heads=self.FLAGS.model_num_heads,
                                          filters=self.FLAGS.hidden_size,
                                          kernel_size=self.FLAGS.model_kernel_size,
                                          keep_prob=self.keep_prob, input_mapping=False,
                                          name='modeling_encoder1')
            m0 = tf.layers.conv1d(blended_reps, filters=self.FLAGS.hidden_size,
                                  kernel_size=1, padding='SAME', name='attn_mapping')
            # Two passes through the same encoder object; unlike 'qanet', the
            # output layers also read the mapped attention output m0.
            m1 = modeling_encoder1.build_graph(m0, self.context_mask)
            m2 = modeling_encoder1.build_graph(m1, self.context_mask)
            with tf.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    tf.concat([m0, m1], -1), self.context_mask)
            with tf.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    tf.concat([m0, m2], -1), self.context_mask)

        else:
            raise ValueError(
                "Unsupported modeling_layer %r; expected 'basic', 'rnn', 'qanet' or 'qanet2'"
                % (self.FLAGS.modeling_layer,))
示例#4
0
    def build_graph(self):
        """Builds the main part of the graph for the model, from the input embeddings to the final answer-span distributions.

        Pipeline: a three-layer character-level CNN is run over the context and
        question character embeddings, its output is concatenated with the word
        embeddings, both sequences go through one RNNEncoder, BiDAF attention
        blends them, three stacked RNNModelEncoders refine the result, and a
        fully connected layer feeds the start/end softmax layers.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """
        conv_channels = self.FLAGS.CONV_SHAPE

        def char_cnn(prefix, char_embs, pool_widths, seq_len):
            """Runs conv -> bias -> ReLU -> max-pool once per entry of pool_widths.

            Scope, variable and op names are derived from `prefix` as
            '<prefix>_conv<i>', '<prefix>_conv<i>_filter', '<prefix>_conv<i>_bias'
            and '<prefix>_pool_layer<i>'. The pooled output is reshaped to
            [-1, seq_len, 2 * CONV_SHAPE].
            """
            layer_input = char_embs
            # The first filter expects 50 input channels (presumably the
            # character-embedding size; confirm against the embedding setup).
            in_channels = 50
            for layer_idx, pool_width in enumerate(pool_widths, 1):
                conv_name = '%s_conv%d' % (prefix, layer_idx)
                with tf.variable_scope(conv_name):
                    kernel = truncated_normal_var(
                        name=conv_name + '_filter',
                        shape=[1, 3, in_channels, conv_channels],
                        dtype=tf.float32)
                    convolved = tf.nn.conv2d(layer_input,
                                             kernel,
                                             [1, 1, 1, 1],
                                             padding='SAME')
                    bias = zero_var(name=conv_name + '_bias',
                                    shape=[conv_channels],
                                    dtype=tf.float32)
                    activated = tf.nn.relu(tf.nn.bias_add(convolved, bias))
                # Pooling is created outside the conv variable scope and only
                # reduces the third axis.
                window = [1, 1, pool_width, 1]
                layer_input = tf.nn.max_pool(
                    activated,
                    ksize=window,
                    strides=window,
                    padding='SAME',
                    name='%s_pool_layer%d' % (prefix, layer_idx))
                in_channels = conv_channels
            # The reshape requires the pooled third axis to have collapsed to
            # exactly 2 entries per word.
            return tf.reshape(layer_input, [-1, seq_len, 2 * conv_channels])

        # Character-level features for the context (pool widths 2, 3, 4),
        # appended to the word embeddings along the feature axis.
        context_char_feats = char_cnn('context', self.context_character_embs,
                                      (2, 3, 4), self.FLAGS.context_len)
        context_final = tf.concat([self.context_embs, context_char_feats],
                                  axis=2)

        # Same stack for the question; only the last pool width differs (3).
        qn_char_feats = char_cnn('qn', self.qn_character_embs,
                                 (2, 3, 3), self.FLAGS.question_len)
        qn_final = tf.concat([self.qn_embs, qn_char_feats], axis=2)

        # One RNNEncoder object encodes both the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)

        # Debug output of the static shapes fed to the encoder.
        print("context_final final shape %s" % (context_final.get_shape()))
        print("context_mask final shape %s" % (self.context_mask.get_shape()))
        print("qn_final final shape %s" % (qn_final.get_shape()))
        print("qn_mask final shape %s" % (self.qn_mask.get_shape()))

        context_hiddens = encoder.build_graph(context_final, self.context_mask)
        question_hiddens = encoder.build_graph(qn_final, self.qn_mask)

        # Use context hidden states to attend to question hidden states.
        attn_layer = BiDAFAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                               self.FLAGS.hidden_size * 2)
        attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask,
                                             context_hiddens,
                                             self.context_mask)

        # Concat attn_output to context_hiddens to get blended_reps.
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)

        # Three stacked modeling encoders, each with its own model_name and
        # each consuming the previous encoder's output.
        modeled = blended_reps
        for model_name in ("RNNModelEncoder1", "RNNModelEncoder2",
                           "RNNModelEncoder3"):
            model_encoder = RNNModelEncoder(self.FLAGS.hidden_size,
                                            self.keep_prob,
                                            model_name=model_name)
            modeled = model_encoder.build_graph(modeled, self.context_mask)

        # Apply fully connected layer to each modeled representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(
            modeled, num_outputs=self.FLAGS.hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            start_softmax = SimpleSoftmaxLayer()
            start_logits, start_probs = start_softmax.build_graph(
                blended_reps_final, self.context_mask)
            self.logits_start, self.probdist_start = start_logits, start_probs

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            end_softmax = SimpleSoftmaxLayer()
            end_logits, end_probs = end_softmax.build_graph(
                blended_reps_final, self.context_mask)
            self.logits_end, self.probdist_end = end_logits, end_probs
示例#5
0
    def build_graph(self):
        """Assemble the model graph from the input embeddings up to the answer-span distributions.

        Stages, in order: highway network, shared biLSTM contextual encoder,
        BiDAF attention, modeling-layer biLSTM, then the start and end softmax
        layers (the end branch goes through one extra output biLSTM first).

        Side effects:
          self.context_embs and self.qn_embs are overwritten with the outputs of the highway network.

        Defines:
          self.logits_start, self.logits_end: Tensors of shape (batch_size, context_len)
            holding the pre-softmax scores of the start and end distributions.
            Pad locations hold -large values, which the cross entropy function depends on.
          self.probdist_start, self.probdist_end: Tensors of shape (batch_size, context_len)
            whose rows each sum to 1; the (masked) softmax of logits_start and logits_end.
        """
        hidden_size = self.FLAGS.hidden_size

        # ---------------------------------------------------------------
        # Highway network
        # ---------------------------------------------------------------
        # A single HighWayNetwork object transforms both embedding tensors,
        # and the results replace the embeddings stored on the model.
        highway = HighWayNetwork()
        self.context_embs = highway.build_graph(self.context_embs)
        self.qn_embs = highway.build_graph(self.qn_embs)

        # ---------------------------------------------------------------
        # Contextual embedding layer
        # ---------------------------------------------------------------
        # The same biLSTM encoder object (hence the same weights) is applied to
        # the context and to the question, so each word's embedding is refined
        # with clues from its neighbours.
        contextual_encoder = biLSTMEncoder(hidden_size, self.keep_prob)
        # (batch_size, context_len, hidden_size*2)
        context_hiddens = contextual_encoder.build_graph(self.context_embs, self.context_mask)
        # (batch_size, question_len, hidden_size*2)
        question_hiddens = contextual_encoder.build_graph(self.qn_embs, self.qn_mask)

        # ---------------------------------------------------------------
        # Attention flow layer
        # ---------------------------------------------------------------
        # Ties question and context together, yielding question-aware feature
        # vectors for every context word.
        attn_dim = hidden_size * 2
        bidaf_attn = BiDAFAttn(self.keep_prob, attn_dim, attn_dim)
        # c2q is (batch_size, context_len, 2h); q2c is (batch_size, 1, 2h)
        _, c2q, _, q2c = bidaf_attn.build_graph(
            question_hiddens, self.qn_mask, context_hiddens, self.context_mask)

        # Blend the context states with both attention outputs along the feature axis.
        context_times_c2q = tf.multiply(context_hiddens, c2q)
        context_times_q2c = tf.multiply(context_hiddens, q2c)
        blended_reps = tf.concat(
            [context_hiddens, c2q, context_times_c2q, context_times_q2c],
            axis=2,
            name="blended_reps",
        )  # (batch_size, context_len, hidden_size*8)

        # ---------------------------------------------------------------
        # Modeling layer
        # ---------------------------------------------------------------
        # Scans across the blended context representation.
        modeling_encoder = Modeling_layer_biLSTM_Encoder(hidden_size, self.keep_prob)
        modeled_reps = modeling_encoder.build_graph(blended_reps)

        # ---------------------------------------------------------------
        # Output layer
        # ---------------------------------------------------------------
        # Start position: softmax over [blended_reps ; modeled_reps].
        # Yields self.logits_start and self.probdist_start, each (batch_size, context_len).
        with vs.variable_scope("StartDist"):
            start_softmax = SimpleSoftmaxLayer()
            start_features = tf.concat([blended_reps, modeled_reps], 2)
            self.logits_start, self.probdist_start = start_softmax.build_graph(
                start_features, self.context_mask)

        # End position: the modeled representation first goes through another
        # biLSTM, then softmax over [blended_reps ; that biLSTM's output].
        # Yields self.logits_end and self.probdist_end, each (batch_size, context_len).
        output_encoder = Output_layer_biLSTM_Encoder(hidden_size, self.keep_prob)
        end_hiddens = output_encoder.build_graph(modeled_reps)  # (batch_size, context_len, hidden_size*2)

        with vs.variable_scope("EndDist"):
            end_softmax = SimpleSoftmaxLayer()
            end_features = tf.concat([blended_reps, end_hiddens], 2)
            self.logits_end, self.probdist_end = end_softmax.build_graph(
                end_features, self.context_mask)
示例#6
0
    def build_graph(self, multi_lstm=False, bidaf=False):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Args:
          multi_lstm: If truthy, the context and question are encoded with
            MultiLSTMEncoder; otherwise with RNNEncoder.
          bidaf: If truthy, BiDAFAttn is used and the blended representation is
            [context; c2q; context*c2q; context*q2c]; otherwise BasicAttn is
            used and the blended representation is [context; attn_output].

        Also reads two flags:
          self.FLAGS.start_lstm_decode: If truthy, the start distribution comes
            from StartDecodeLayer instead of SimpleSoftmaxLayer.
          self.FLAGS.cond_pred: If truthy, the end distribution comes from
            ConditionalOutputLayer, which additionally receives the start logits.

        All four switches are evaluated by truthiness, so None/0 mean "off".

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """
        hidden_size = self.FLAGS.hidden_size

        # Use a RNN to get hidden states for the context and the question
        # Note: here the encoder is shared (i.e. the weights are the same)
        # between the context and the question.
        if multi_lstm:
            encoder = MultiLSTMEncoder(hidden_size, self.keep_prob)
        else:
            encoder = RNNEncoder(hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        if bidaf:
            attn_layer = BiDAFAttn(self.keep_prob, hidden_size * 2,
                                   hidden_size * 2)
            c2q_attn, q2c_attn = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens,
                self.context_mask)
            # Adding zeros shaped like one c2q_attn example broadcasts q2c_attn
            # to c2q_attn's (context_len, feature) dimensions.
            # NOTE(review): presumably q2c_attn is (batch_size, 1, hidden_size*2) - confirm against BiDAFAttn.
            q2c_attn = q2c_attn + tf.zeros(
                shape=[1, c2q_attn.shape[1], c2q_attn.shape[2]])
            context_c2q = tf.multiply(context_hiddens, c2q_attn)
            context_q2c = tf.multiply(context_hiddens, q2c_attn)
            blended_reps = tf.concat(
                [context_hiddens, c2q_attn, context_c2q, context_q2c],
                axis=2)  # (batch_size, context_len, hidden_size*8)
        else:
            # Use context hidden states to attend to question hidden states
            attn_layer = BasicAttn(self.keep_prob, hidden_size * 2,
                                   hidden_size * 2)
            _, attn_output = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens
            )  # attn_output is shape (batch_size, context_len, hidden_size*2)
            blended_reps = tf.concat(
                [context_hiddens, attn_output],
                axis=2)  # (batch_size, context_len, hidden_size*4)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(
            blended_reps, num_outputs=hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Compute the probability distribution for the start location.
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            if self.FLAGS.start_lstm_decode:
                start_layer = StartDecodeLayer(hidden_size, self.keep_prob)
            else:
                start_layer = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = start_layer.build_graph(
                blended_reps_final, self.context_mask)

        # Compute the probability distribution for the end location.
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        if self.FLAGS.cond_pred:
            # Condition the end prediction on the start logits: lift them to
            # (batch_size, context_len, 1), broadcast across the feature axis of
            # blended_reps_final by adding zeros, then append them as extra features.
            logits_start_float32 = tf.expand_dims(
                tf.cast(self.logits_start, dtype=tf.float32), axis=2)
            logits_start_float32 = logits_start_float32 + tf.zeros(
                shape=(1, blended_reps_final.shape[1],
                       blended_reps_final.shape[2]),
                dtype=tf.float32)
            comb_blended_reps = tf.concat(
                [blended_reps_final, logits_start_float32],
                axis=2)  # (batch_size, context_len, hidden_size*2)
            with vs.variable_scope("EndDist"):
                conditional_output_layer = ConditionalOutputLayer(
                    hidden_size, self.keep_prob)
                self.logits_end, self.probdist_end = conditional_output_layer.build_graph(
                    comb_blended_reps, self.context_mask)
        else:
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    blended_reps_final, self.context_mask)