Example #1
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.
        """

        # Use a RNN to get hidden states for the context and the question
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        _, context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
        _, question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4)

        # Apply fully connected layer to each blended representation
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask, False)

        # Use softmax layer to compute probability distribution for end location
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask, False)
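
Note: SimpleSoftmaxLayer itself is not shown in these examples. Below is a minimal NumPy sketch of the masked softmax such a layer presumably applies (an assumption based on how the masks are used here and on the later docstrings, which say the logits are set to a large negative value at padded positions); the masked_softmax function is illustrative, not the actual class.

import numpy as np

def masked_softmax(logits, mask):
    # logits: (batch_size, context_len); mask: 1.0 for real tokens, 0.0 for padding.
    # Adding a very large negative number at the padded positions drives their
    # probability to ~0 after the softmax.
    masked_logits = logits + (1.0 - mask) * -1e30
    shifted = masked_logits - masked_logits.max(axis=-1, keepdims=True)
    probs = np.exp(shifted) / np.exp(shifted).sum(axis=-1, keepdims=True)
    return masked_logits, probs  # each row of probs sums to 1

The masked logits (with the -1e30 entries) are what would be fed into a cross-entropy loss; probs is the per-row distribution over context positions.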
Example #2
    def build_graph(self):
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2)
        _, attn_output = attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens
        )  # attn_output is shape (batch_size, context_len, hidden_size*2)

        attn_layer = R_Net_Attn(self.keep_prob, self.FLAGS.hidden_size * 2,
                                self.FLAGS)
        output = attn_layer.build_graph(
            attn_output, self.context_mask
        )  # output is shape (batch_size, context_len, hidden_size*2)

        blended_reps_final = tf.contrib.layers.fully_connected(
            tf.concat([attn_output, output], 2),
            num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
Example #3
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
       """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        print(self.context_embs.shape)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        biattn_layer = AttentionFlowLayer(self.keep_prob, self.FLAGS.l2_lambda)
        biattn_output = biattn_layer.build_graph(context_hiddens,
                                                 self.context_mask,
                                                 question_hiddens,
                                                 self.qn_mask,
                                                 scope="AttnFlow")

        # RNNEncoder layer
        model_layer = Model_Layer(self.FLAGS.hidden_size, self.keep_prob)
        model_output = model_layer.build_graph(biattn_output,
                                               self.context_mask)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(
            model_output, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        # The masks mark which positions are real tokens vs padding, so the softmax can ignore padded positions.

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
Example #4
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
          self.c2q_dist: (batch_size, context_len, question_len). Context-to-question attention probabilities.
            Each row sums to 1, except where the context word is masked.
          self.q2c_dist: (batch_size, context_len). Question-to-context attention probabilities. Each row sums to 1.
        """
        print("Building BIDAF")
        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size,
                             self.keep_prob,
                             num_layers=self.FLAGS.num_layers,
                             mode=self.FLAGS.rnn_cell)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = BidirectionAttn(self.keep_prob, self.FLAGS.hidden_size)
        self.c2q_attn_dist, self.q2c_attn_dist, attn_output = \
            attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*4)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat(
            [context_hiddens, attn_output],
            axis=2)  # (batch_size, context_len, hidden_size*8)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(
            blended_reps, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
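
Note: BidirectionAttn is also not shown. The sketch below is a rough NumPy illustration of how the c2q and q2c distributions described in the docstring are typically computed in BiDAF-style attention; the dot-product similarity is a simplification (BiDAF proper uses a trilinear similarity), so treat the function and its names as illustrative only.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def bidaf_distributions(context_hiddens, question_hiddens, context_mask, qn_mask):
    # context_hiddens: (batch, context_len, d); question_hiddens: (batch, question_len, d)
    # masks are 1.0 for real tokens and 0.0 for padding.
    sim = np.einsum('bcd,bqd->bcq', context_hiddens, question_hiddens)  # (batch, context_len, question_len)
    sim = sim + (1.0 - qn_mask[:, None, :]) * -1e30   # mask out padded question words
    c2q_dist = softmax(sim, axis=-1)                  # rows sum to 1 over question_len
    m = sim.max(axis=-1)                              # best-matching question word per context word
    m = m + (1.0 - context_mask) * -1e30              # mask out padded context words
    q2c_dist = softmax(m, axis=-1)                    # one distribution over context_len
    return c2q_dist, q2c_dist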
Example #5
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        with vs.variable_scope("Encoder"):
        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = CoAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        _, _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # shapes are U_tilde: (batch_size, context_len, 2h), H_tilde: (batch_size, context_len, 1)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output, context_hiddens * attn_output], axis=2) # (batch_size, context_len, hidden_size*6)

        with vs.variable_scope("M1_init"):
            # Bidirectional GRU M1
            modeling_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps_1_init = modeling_layer.build_graph(blended_reps, self.context_mask) # (batch_size, N, 2h)

        with vs.variable_scope("M1"):
            # Bidirectional GRU M1
            modeling_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps_1 = modeling_layer.build_graph(blended_reps_1_init, self.context_mask) # (batch_size, N, 2h)

        with vs.variable_scope("M2"):
            # Bidirectional GRU M2
            modeling_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps_2 = modeling_layer.build_graph(blended_reps_1, self.context_mask) # (batch_size, N, 2h)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(tf.concat([blended_reps, blended_reps_1], axis=2), self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(tf.concat([blended_reps, blended_reps_2], axis=2), self.context_mask)
Example #6
    def build_graph(self):
        context_embs_concat = tf.concat(
            [self.elmo_context_input, self.context_embs],
            2)  # (batch_size, context_len, 1024+self.FLAGS.embedding_size)

        context_embs_concat.set_shape(
            (None, None, 1024 + self.FLAGS.embedding_size))
        #qn_embs_concat.set_shape((None, None, 1024+self.FLAGS.embedding_size))
        self.qn_mask.set_shape((None, None))
        self.context_mask.set_shape((None, None))

        with tf.variable_scope("start"):
            softmax_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_start.build_graph(
                context_embs_concat, self.context_mask)
        with tf.variable_scope("end"):
            softmax_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_end.build_graph(
                context_embs_concat, self.context_mask)
Example #7
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
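
Note: the "apply fully connected layer to each blended representation" step is position-wise: the same weights are applied at every context position, with a ReLU by default (as the comments note). A small NumPy sketch of that behaviour, with hypothetical W and b standing in for the layer's variables:

import numpy as np

def positionwise_dense(blended_reps, W, b):
    # blended_reps: (batch, context_len, hidden_size*4)
    # W: (hidden_size*4, hidden_size), b: (hidden_size,)
    # The same W and b are applied independently at every context position,
    # followed by a ReLU (fully_connected's default activation).
    return np.maximum(np.einsum('bcd,dh->bch', blended_reps, W) + b, 0.0)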
Example #8
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.

        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        encoderQ = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask, "rnnencoder1") # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoderQ.build_graph(self.qn_embs, self.qn_mask, "rnnencoderQ") # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        _, attn_output, new_attn = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, 2*self.FLAGS.hidden_size) # attn_output is shape (batch_size, context_len, hidden_size*2)

        _, _, blended_reps_final = build_graph_middle(self, new_attn, attn_output, context_hiddens, question_hiddens)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
Example #9
    def build_graph_coattention(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.
        """

        # Use a RNN to get hidden states for the context and the question
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        _, context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
        _, question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Compute both sided attention
        coatt = Coattention()
        co_att = coatt.build_graph(self.FLAGS.batch_size, question_hiddens, context_hiddens, self.FLAGS.question_len, self.FLAGS.context_len, 2*self.FLAGS.hidden_size, self.keep_prob)
        
        co_att_final = tf.contrib.layers.fully_connected(co_att, num_outputs=self.FLAGS.hidden_size)
        # Use softmax layer to compute probability distribution for start location
        with vs.variable_scope("StartDist") as scp:
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(co_att_final, self.context_mask, True)
            scp.reuse_variables()
        # Use softmax layer to compute probability distribution for end location
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(co_att_final, self.context_mask, True)
Example #10
    def build_graph(self):
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        if self.FLAGS.max_word_len:
            context_hiddens = encoder.build_graph(
                tf.concat([self.context_embs, self.context_char_hidden], 2),
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(
                tf.concat([self.qn_embs, self.qn_char_hidden], 2),
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)
        else:
            context_hiddens = encoder.build_graph(
                self.context_embs,
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(
                self.qn_embs,
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        attn_layer = BiDAF_Attn(self.keep_prob, self.FLAGS.hidden_size * 2, [
            self.FLAGS.batch_size, self.FLAGS.context_len,
            self.FLAGS.question_len
        ])
        output = attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens, self.context_mask
        )  # output is shape (batch_size, context_len, hidden_size*2)

        blended_reps_final = tf.contrib.layers.fully_connected(
            output, num_outputs=self.FLAGS.hidden_size)

        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
Example #11
    def build_graph(self):
        attn_layer = DynamicAttention_Attn(self.keep_prob, self.FLAGS)
        output = attn_layer.build_graph(
            self.qn_embs, self.qn_mask, self.context_embs, self.context_mask
        )  # output is shape (batch_size, context_len, hidden_size*2)

        encoder = RNNEncoder(self.FLAGS.embedding_size * 2, self.keep_prob)
        context_hiddens = encoder.build_graph(
            output,
            self.context_mask)  # (batch_size, context_len, embedding_size*4)

        blended_reps_final = tf.contrib.layers.fully_connected(
            context_hiddens, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
Example #12
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        with vs.variable_scope("e1c"):
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens = encoder.build_graph(
                self.context_embs,
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
        with vs.variable_scope("e1q"):
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            question_hiddens = encoder.build_graph(
                self.qn_embs,
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)
            # con_qn_hiddens = encoder.build_graph(self.con_qn_embs, self.con_qn_mask)

        # context_hiddens = con_qn_hiddens[:, :self.FLAGS.context_len, :]
        # question_hiddens = con_qn_hiddens[:, self.FLAGS.context_len:, :]
        # with vs.variable_scope("e2"):
        #     encoder1 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        #     context_hiddens = encoder1.build_graph(context_hiddens, self.context_mask) # (batch_size, context_len, hidden_size*2)
        #     question_hiddens = encoder1.build_graph(question_hiddens, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        # attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        # _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)
        with vs.variable_scope("a1"):
            attn_layer = BidirectionalAttnNew(self.keep_prob,
                                              self.FLAGS.hidden_size * 2,
                                              self.FLAGS.hidden_size * 2)
            _, attn_output = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens,
                self.context_mask
            )  # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        # blended_reps_c = tf.concat([context_hiddens, attn_output_val], axis=2) # (batch_size, context_len, hidden_size*4)
        # blended_reps_q = tf.concat([question_hiddens, attn_output_key], axis=2)

        with vs.variable_scope("e2_1c"):
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens_f = encoder.build_graph(
                attn_output,
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
        # with vs.variable_scope("a2"):
        #     attn_layer1 = BidirectionalAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        #     _, _, attn_output_val, attn_output_key = attn_layer1.build_graph(question_hiddens,
        #                                                                 self.qn_mask,
        #                                                                 context_hiddens,
        #                                                                 self.context_mask)

        blended_reps_st = tf.concat([context_hiddens_f, attn_output], axis=2)

        with vs.variable_scope("e3c"):
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens_f_end = encoder.build_graph(
                context_hiddens_f, self.context_mask)

        blended_reps_end = tf.concat([context_hiddens_f_end, attn_output],
                                     axis=2)

        # with vs.variable_scope("AnsPoiStRNN"):
        #     encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        #     start_hidden = encoder.build_graph(blended_reps, self.context_mask)
        # print "OK1"
        # with vs.variable_scope("AnsPoiStATT"):
        #     attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        #     start_att_dis,  start_att_out = attn_layer.build_graph(question_hiddens, self.qn_mask, start_hidden)
        # print start_att_dis.shape, start_att_out.shape
        # print "OK2"
        # with vs.variable_scope("AnsPoiEnRNN"):
        #     encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        #     end_hidden = encoder.build_graph(start_att_out, self.context_mask)
        # print "OK3"
        # with vs.variable_scope("AnsPoiStATT"):
        #     attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        #     end_att_dis, _ = attn_layer.build_graph(end_hidden, self.context_mask, question_hiddens)
        # print "OK4"

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final_st = tf.contrib.layers.fully_connected(
            blended_reps_st, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)
        blended_reps_final_end = tf.contrib.layers.fully_connected(
            blended_reps_end, num_outputs=self.FLAGS.hidden_size)

        # print "###", blended_reps_final.shape
        # print start_att_dis.shape, end_att_dis.shape
        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final_st, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final_end, self.context_mask)
Example #13
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.h_hidden_size,
                             self.keep_prob,
                             num_layers=self.FLAGS.h_num_layers,
                             combiner=self.FLAGS.h_combiner,
                             cell_type=self.FLAGS.h_cell_type)
        if self.FLAGS.share_encoder:
            question_hiddens, question_states_fw, question_states_bw = encoder.build_graph(
                self.qn_embs,
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)
        else:
            question_encoder = RNNEncoder(self.FLAGS.h_hidden_size,
                                          self.keep_prob,
                                          num_layers=self.FLAGS.h_num_layers,
                                          combiner=self.FLAGS.h_combiner,
                                          cell_type=self.FLAGS.h_cell_type,
                                          scope='question_encoder')
            question_hiddens, question_states_fw, question_states_bw = question_encoder.build_graph(
                self.qn_embs,
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)
        if not self.FLAGS.reuse_question_states:
            question_states_fw, question_states_bw = None, None
        context_hiddens, _, _ = encoder.build_graph(
            self.context_embs,
            self.context_mask,
            initial_states_fw=question_states_fw,
            initial_states_bw=question_states_bw
        )  # (batch_size, context_len, hidden_size*2)

        if self.FLAGS.use_bidaf:
            attn_layer = BiDAF(self.keep_prob)
            context_att, question_att = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens,
                self.context_mask)
            blended_reps = tf.concat([
                context_hiddens, context_att, context_hiddens * context_att,
                context_hiddens * question_att
            ],
                                     axis=2)
        else:
            # Use context hidden states to attend to question hidden states
            attn_layer = BasicAttn(self.keep_prob)
            _, attn_output = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens
            )  # attn_output is shape (batch_size, context_len, hidden_size*2)
            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat(
                [context_hiddens, attn_output, context_hiddens * attn_output],
                axis=2)  # (batch_size, context_len, hidden_size*4)

        if self.FLAGS.modeling_layer_uses_rnn:
            modelling_encoder = RNNEncoder(
                self.FLAGS.h_model_size,
                self.keep_prob,
                num_layers=self.FLAGS.h_model_layers,
                combiner=self.FLAGS.h_combiner,
                cell_type=self.FLAGS.h_cell_type,
                scope='blended_reps_scope')
            blended_reps_final, model_states_fw, model_states_bw = modelling_encoder.build_graph(
                blended_reps, self.context_mask)
        else:
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            blended_reps_final = tf.contrib.layers.fully_connected(
                blended_reps, num_outputs=self.FLAGS.h_hidden_size
            )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            if self.FLAGS.use_rnn_for_ends:
                end_encoder = RNNEncoder(self.FLAGS.h_model_size,
                                         self.keep_prob,
                                         num_layers=self.FLAGS.h_model_layers,
                                         combiner=self.FLAGS.h_combiner,
                                         cell_type=self.FLAGS.h_cell_type,
                                         scope='blended_reps_final')
                blended_reps_combined = tf.concat([
                    blended_reps_final,
                    tf.expand_dims(self.probdist_start, 2)
                ], 2)
                blended_reps_final, _, _ = end_encoder.build_graph(
                    blended_reps_combined,
                    self.context_mask,
                    initial_states_fw=model_states_fw,
                    initial_states_bw=model_states_bw)
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
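
Note: the RNNEncoder in Example #13 additionally returns the forward/backward final states and accepts initial_states_fw / initial_states_bw, an interface not shown here. Below is a hedged TF 1.x sketch of the underlying idea (separate GRU cells, illustrative placeholder names and sizes): the context encoder is initialised with the question encoder's final states, so the context encoding is conditioned on a summary of the question.

import tensorflow as tf  # assumes TensorFlow 1.x

hidden_size = 64
qn_embs = tf.placeholder(tf.float32, [None, None, 100])       # (batch, question_len, emb)
context_embs = tf.placeholder(tf.float32, [None, None, 100])  # (batch, context_len, emb)
qn_lens = tf.placeholder(tf.int32, [None])
context_lens = tf.placeholder(tf.int32, [None])

with tf.variable_scope("question_encoder"):
    # Keep only the final forward/backward states of the question encoder.
    _, (q_state_fw, q_state_bw) = tf.nn.bidirectional_dynamic_rnn(
        tf.nn.rnn_cell.GRUCell(hidden_size), tf.nn.rnn_cell.GRUCell(hidden_size),
        qn_embs, sequence_length=qn_lens, dtype=tf.float32)

with tf.variable_scope("context_encoder"):
    # Start the context RNN from the question's final states.
    (c_fw, c_bw), _ = tf.nn.bidirectional_dynamic_rnn(
        tf.nn.rnn_cell.GRUCell(hidden_size), tf.nn.rnn_cell.GRUCell(hidden_size),
        context_embs, sequence_length=context_lens,
        initial_state_fw=q_state_fw, initial_state_bw=q_state_bw)
context_hiddens = tf.concat([c_fw, c_bw], axis=2)  # (batch, context_len, hidden_size*2)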
Example #14
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)

        context_embs = self.context_embs
        qn_embs = self.qn_embs
        if self.FLAGS.enable_cnn:
            context_embs = tf.concat(
                [self.context_embs, self.context_char_embs], axis=2)
            qn_embs = tf.concat([self.qn_embs, self.qn_char_embs], axis=2)

        context_hiddens = encoder.build_graph(
            context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Encode query-aware representations of the context words
        bidaf_attn_layer = BidafAttn(self.keep_prob, self.FLAGS.context_len,
                                     self.FLAGS.hidden_size * 2)
        bidaf_out = bidaf_attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens,
            self.context_mask)  # (batch_size, context_len, hidden_size*8)

        # Condense the information: hidden_size*8 --> hidden_size*2
        bidaf_out = tf.contrib.layers.fully_connected(
            bidaf_out,
            num_outputs=self.FLAGS.hidden_size * 2,
            normalizer_fn=tf.contrib.layers.batch_norm
        )  # (batch_size, context_len, hidden_size*2)

        # Co-attention
        co_attn_layer = CoAttnLite(self.keep_prob, self.FLAGS.hidden_size,
                                   self.FLAGS.hidden_size * 2)
        co_out = co_attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)

        bico_out = tf.concat([bidaf_out, co_out],
                             2)  # (batch_size, context_len, hidden_size*4)

        # Capture interactions among context words conditioned on the query.
        gru_layer1 = RNNEncoder(
            self.FLAGS.hidden_size, self.keep_prob
        )  # params: (hidden_size*4 + hidden_size) * hidden_size * 2 * 3
        model_reps1 = gru_layer1.build_graph(
            bico_out, self.context_mask, variable_scope='ModelGRU1'
        )  # (batch_size, context_len, hidden_size*2)

        gru_layer2 = RNNEncoder(
            self.FLAGS.hidden_size, self.keep_prob
        )  # params: (2*hidden_size + hidden_size) * hidden_size * 2 * 3
        model_reps2 = gru_layer2.build_graph(
            model_reps1, self.context_mask, variable_scope='ModelGRU2'
        )  # (batch_size, context_len, hidden_size*2)

        # Self Attention & GRU layer parallel to GRU layer2.
        with tf.variable_scope('SelfAttnGRU'):
            self_attn_layer = MulAttn(self.keep_prob,
                                      self.FLAGS.hidden_size * 2,
                                      self.FLAGS.hidden_size * 2)
            se_attn = self_attn_layer.build_graph(
                model_reps1, self.context_mask, model_reps1,
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
            se_gru_layer = RNNEncoder(
                self.FLAGS.hidden_size, self.keep_prob
            )  # params: (2*hidden_size + hidden_size) * hidden_size * 2 * 3
            se_out = se_gru_layer.build_graph(
                se_attn, self.context_mask, variable_scope='SelfGRU'
            )  # (batch_size, context_len, hidden_size*2)

        model_reps = tf.concat([model_reps2, se_out],
                               2)  # (batch_size, context_len, hidden_size*4)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            start_reps = tf.concat(
                [bico_out, model_reps],
                2)  # (batch_size, context_len, hidden_size*10)
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                start_reps, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            gru_end_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            model_end_reps = gru_end_layer.build_graph(
                model_reps, self.context_mask, variable_scope='EndGRU'
            )  # (batch_size, context_len, hidden_size*2)
            end_reps = tf.concat(
                [bico_out, model_end_reps],
                2)  # (batch_size, context_len, hidden_size*10)

            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                end_reps, self.context_mask)

        for variable in tf.trainable_variables():
            tf.summary.histogram(variable.name.replace(':', '/'), variable)

    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.
        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        if self.FLAGS.model == "baseline":
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        elif self.FLAGS.model in ("bidaf", "bidaf_dynamic", "bidaf_self_attn", "bidaf_dynamic_self_attn"):
            print("INSIDE the BIDAF model")
            encoder = RNNEncoder_LSTM(self.FLAGS.hidden_size, self.keep_prob)
        elif self.FLAGS.model in ("coatt", "coatt_dynamic", "coatt_dynamic_self_attn"):
            encoder = LSTMEncoder(self.FLAGS.hidden_size, self.keep_prob)

        if self.FLAGS.model not in ("coatt", "coatt_dynamic", "coatt_dynamic_self_attn"):
            context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Attention model
        # Use context hidden states to attend to question hidden states
        if self.FLAGS.model == "baseline":
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
            _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)
            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final,self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final,self.context_mask)

        # Attention model
        # Use context hidden states to attend to question hidden states
        if self.FLAGS.model == "coatt":
            #context_hiddens = encoder.build_graph(self.context_embs, self.context_mask, "context") # (batch_size, context_len, hidden_size*2)
            #question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask, "question") # (batch_size, question_len, hidden_size*2)
            context_hiddens, question_hiddens = encoder.build_graph1(self.context_embs, self.qn_embs, self.context_mask, self.qn_mask)

            attn_layer = CoAttention(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
            blended_reps_final = attn_output
            #blended_reps = tf.concat([context_hiddens, attn_output], axis=2)
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            #blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final,self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                contextLen = tf.reduce_sum(self.context_mask, axis=1)
                cell = tf.contrib.rnn.LSTMBlockCell(2 * self.FLAGS.hidden_size)
                (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(cell, cell, attn_output, contextLen, dtype = tf.float32)
                U_1 = tf.concat([fw_out, bw_out], axis=2)
                out = tf.nn.dropout(U_1, self.keep_prob)
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(out,self.context_mask)


        elif self.FLAGS.model in ("bidaf", "bidaf_self_attn"):
            attn_layer = BiDafAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            attn_output_tmp = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8)
            # Produces a set of query-aware feature vectors for each word in the context
            #blended_reps = attn_output  #(batch_size, num_keys, 4*value_vec_size)

            if self.FLAGS.model == "bidaf_self_attn":
                self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
                _, self_attn_output = self_attn_layer.build_graph(attn_output_tmp, self.context_mask)  # (batch_size, context_len, 8*hidden_size)
                attn_output = tf.concat([attn_output_tmp, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size)
            else:
                attn_output = attn_output_tmp


            # In BIDAF the attention output is fed to a modeling layer
            # The modeling layer is a 2-layer LSTM
            mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob)
            mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask)  # (batch_size, context_len, hidden_size*2)
            blended_reps_start = tf.concat([attn_output,mod_layer_out], axis=2)  # (batch_size, context_len, hidden_size*10)


            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_start, self.context_mask)



            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)


            with vs.variable_scope("EndDist"):
                # Concatenate the start logits with the modelling layer output to get the input to the
                # end word lstm
                #self.logits_start has a shape of #(batch_size, context_len)
                logits_start_expand = tf.expand_dims(self.logits_start, axis=2) #(batch_size, context_len, 1)
                end_lstm_input = tf.concat([logits_start_expand, mod_layer_out], axis=2) #(batch_size, context_len, 1 + hidden_size*2)

                # LSTM
                end_layer = END_WORD_LAYER(self.FLAGS.hidden_size, self.keep_prob)
                blended_reps_end = end_layer.build_graph(end_lstm_input, self.context_mask)

                blended_reps_end_final = tf.concat([attn_output, blended_reps_end], axis=2)
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_end_final, self.context_mask)

        elif self.FLAGS.model in ("bidaf_dynamic", "bidaf_dynamic_self_attn"):
            attn_layer = BiDafAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            attn_output_tmp = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8)

            if self.FLAGS.model == "bidaf_dynamic_self_attn":
                self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
                _, self_attn_output = self_attn_layer.build_graph(attn_output_tmp, self.context_mask)  # (batch_size, context_len, 8*hidden_size)
                attn_output = tf.concat([attn_output_tmp, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size)
            else:
                attn_output = attn_output_tmp

            # Produces a set of query-aware feature vectors for each word in the context
            #blended_reps = attn_output  #(batch_size, num_keys, 4*value_vec_size)

            # In BIDAF the attention output is fed to a modeling layer
            # The modeling layer is a 2-layer LSTM
            mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob)
            mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask)  # (batch_size, context_len, hidden_size*2)
            blended_reps_start = tf.concat([attn_output,mod_layer_out], axis=2)  # (batch_size, context_len, hidden_size*10)

            # We now feed this to the dynamic decoder module coded in the answer decoder
            # The outputs of the decoder are start, end, alpha_logits and beta_logits
            # start and end have a shape of (batch_size, num_iterations)
            # alpha_logits and beta_logits have a shape of (batch_size, num_iterations, input_dim)
            decoder = ANSWER_DECODER(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations, self.FLAGS.max_pool, self.FLAGS.batch_size)

            u_s_init = mod_layer_out[:,0,:]
            u_e_init = mod_layer_out[:,0,:]
            start_location, end_location, alpha_logits, beta_logits = decoder.build_graph(mod_layer_out, self.context_mask, u_s_init, u_e_init)


            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                #softmax_layer_start = SimpleSoftmaxLayer()
                logits_start_tmp = [masked_softmax(logits, self.context_mask, 1) for logits in alpha_logits]
                self.alpha_logits, alpha_logits_probs = zip(*logits_start_tmp)
                self.logits_start, self.probdist_start = self.alpha_logits[self.FLAGS.num_iterations - 1], alpha_logits_probs[self.FLAGS.num_iterations - 1]

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                logits_end_tmp = [masked_softmax(logits, self.context_mask, 1) for logits in beta_logits]
                self.beta_logits, beta_logits_probs = zip(*logits_end_tmp)
                self.logits_end, self.probdist_end = self.beta_logits[self.FLAGS.num_iterations - 1], beta_logits_probs[self.FLAGS.num_iterations - 1]

        elif self.FLAGS.model =="coatt_dynamic" or self.FLAGS.model == "coatt_dynamic_self_attn":
            context_hiddens, question_hiddens = encoder.build_graph1(self.context_embs, self.qn_embs, self.context_mask, self.qn_mask)

            attn_layer = CoAttention(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)

            if self.FLAGS.model == "coatt_dynamic_self_attn":
                CoATT = attn_layer.build_graph1(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
                self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
                _, self_attn_output = self_attn_layer.build_graph(CoATT, self.context_mask)  # (batch_size, context_len, 8*hidden_size)
                attn_output = tf.concat([CoATT, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size)
            else:
                U = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
                attn_output = U
            #blended_reps = tf.concat([context_hiddens, attn_output], axis=2)
            # Feed the (optionally self-attended) coattention output to the dynamic answer decoder
            decoder = ANSWER_DECODER(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations, self.FLAGS.max_pool, self.FLAGS.batch_size)

            u_s_init = attn_output[:,0,:]
            u_e_init = attn_output[:,0,:]
            start_location, end_location, alpha_logits, beta_logits = decoder.build_graph(attn_output, self.context_mask, u_s_init, u_e_init)


            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                #softmax_layer_start = SimpleSoftmaxLayer()
                logits_start_tmp = [masked_softmax(logits, self.context_mask, 1) for logits in alpha_logits]
                self.alpha_logits, alpha_logits_probs = zip(*logits_start_tmp)
                self.logits_start, self.probdist_start = self.alpha_logits[self.FLAGS.num_iterations - 1], alpha_logits_probs[self.FLAGS.num_iterations - 1]

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                logits_end_tmp = [masked_softmax(logits, self.context_mask, 1) for logits in beta_logits]
                self.beta_logits, beta_logits_probs = zip(*logits_end_tmp)
                self.logits_end, self.probdist_end = self.beta_logits[self.FLAGS.num_iterations - 1], beta_logits_probs[self.FLAGS.num_iterations - 1]
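
# A minimal NumPy sketch of what the masked_softmax call above computes, assuming the
# helper adds a large negative constant to padded positions before normalizing (the
# real masked_softmax in this codebase may differ in details).
import numpy as np

def masked_softmax_np(logits, mask):
    """logits, mask: (batch_size, context_len); mask is 1 for real tokens, 0 for padding."""
    masked_logits = logits + (1.0 - mask) * (-1e30)            # padding becomes -large
    exps = np.exp(masked_logits - masked_logits.max(axis=1, keepdims=True))
    return masked_logits, exps / exps.sum(axis=1, keepdims=True)

logits = np.array([[2.0, 1.0, 0.5, 0.0]])
mask = np.array([[1.0, 1.0, 1.0, 0.0]])                        # last position is padding
_, probs = masked_softmax_np(logits, mask)
print(probs)                                                    # padding gets ~0 probability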
Example #16
0
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        print "Running Attention Model with... %s" % self.FLAGS.attention
        if self.FLAGS.attention == "BiDAF":

            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens = encoder.build_graph(
                self.context_embs,
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(
                self.qn_embs,
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)

            bidaf_attn_layer = BiDirectionalAttn(self.keep_prob,
                                                 self.FLAGS.hidden_size * 2,
                                                 self.FLAGS.hidden_size * 2,
                                                 self.FLAGS.question_len,
                                                 self.FLAGS.context_len)
            _, context_to_question, _, question_to_context = bidaf_attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens,
                self.context_mask)

            # Combine attention vectors and hidden context vector
            context_c2q = tf.multiply(context_hiddens, context_to_question)
            context_q2c = tf.multiply(context_hiddens, question_to_context)
            blended_reps = tf.concat(
                [
                    context_hiddens, context_to_question, context_c2q,
                    context_q2c
                ],
                axis=2)  # (batch_size, context_len, hidden_size*8)

            # Modeling layers (2 layers of bidirectional LSTM) encode the query-aware representations of the context words.
            modeling_layer = BiRNN(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps_1 = modeling_layer.build_graph(
                blended_reps,
                self.context_mask)  # (batch_size, context_len, hidden_size*2).

            modeling_layer_2 = BiRNN2(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps_final = modeling_layer_2.build_graph(
                blended_reps_1,
                self.context_mask)  # (batch_size, context_len, hidden_size*2).

        else:  # Default: self.FLAGS.attention == "BasicAttn"

            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens = encoder.build_graph(
                self.context_embs,
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(
                self.qn_embs,
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)

            # Use context hidden states to attend to question hidden states
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                                   self.FLAGS.hidden_size * 2)
            _, attn_output = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens
            )  # attn_output is shape (batch_size, context_len, hidden_size*2)

            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat(
                [context_hiddens, attn_output],
                axis=2)  # (batch_size, context_len, hidden_size*4)

            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            blended_reps_final = tf.contrib.layers.fully_connected(
                blended_reps, num_outputs=self.FLAGS.hidden_size
            )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
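
# A small NumPy sketch of how the BiDAF branch above blends its outputs: the context
# hiddens, the context-to-question attention and their elementwise products are
# concatenated along the feature axis (shapes here are purely illustrative).
import numpy as np

batch, context_len, h2 = 2, 5, 8                    # h2 stands in for hidden_size*2
context_hiddens = np.random.randn(batch, context_len, h2)
context_to_question = np.random.randn(batch, context_len, h2)
question_to_context = np.random.randn(batch, context_len, h2)

blended_reps = np.concatenate(
    [context_hiddens,
     context_to_question,
     context_hiddens * context_to_question,         # context_c2q in the example
     context_hiddens * question_to_context],        # context_q2c in the example
    axis=2)
print(blended_reps.shape)                           # (2, 5, 32), i.e. hidden_size*8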
Example #17
0
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """
        # character-level CNN to get hybrid word embeddings
        charCnn = CharCNN(self.FLAGS.word_len, self.FLAGS.char_embedding_size,
                          self.FLAGS.num_filters, self.FLAGS.kernel_size)
        # (batch_size, context_len, num_filters)
        char_context_hiddens = charCnn.build_graph(self.char_context_embs,
                                                   self.char_context_mask,
                                                   self.FLAGS.context_len)
        # (batch_size, question_len, num_filters)
        char_qn_hiddens = charCnn.build_graph(self.char_qn_embs,
                                              self.char_qn_mask,
                                              self.FLAGS.question_len)

        # hybrid word embeddings
        hybrid_context_embs = tf.concat(
            [self.context_embs, char_context_hiddens],
            axis=-1)  # (batch_size, context_len, emb_size+char_emb_size)
        hybrid_qn_embs = tf.concat(
            [self.qn_embs, char_qn_hiddens],
            axis=-1)  # (batch_size, question_len, emb_size+char_emb_size)

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, "GRU")
        context_hiddens = encoder.build_graph(
            hybrid_context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            hybrid_qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # coattention has been the best attention model I've found
        attn_layer = CoAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                            self.FLAGS.hidden_size * 2)
        u = attn_layer.build_graph(
            question_hiddens, self.qn_mask,
            context_hiddens)  # shape (batch_size, context_len, 8*hidden_size)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(
            u, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
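
# A rough NumPy sketch of the character-CNN idea used above: convolve a set of filters
# over the character embeddings of one word and max-pool over the positions, giving a
# single num_filters-dimensional vector per word (the real CharCNN layer is assumed to
# follow this pattern but may differ).
import numpy as np

word_len, char_emb, num_filters, kernel_size = 6, 4, 3, 3
char_embs = np.random.randn(word_len, char_emb)              # characters of one word
filters = np.random.randn(kernel_size, char_emb, num_filters)

conv = np.stack([                                             # 'valid' 1-D convolution
    np.einsum('kc,kcf->f', char_embs[i:i + kernel_size], filters)
    for i in range(word_len - kernel_size + 1)])              # (word_len-k+1, num_filters)
word_vector = conv.max(axis=0)                                # max-pool over positions
print(word_vector.shape)                                      # (num_filters,)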
Example #18
0
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        if self.FLAGS.cell_type in ['rnn_gru', 'rnn_lstm']:
            encoder = RNNEncoder(self.FLAGS.hidden_size,
                                 self.keep_prob,
                                 cell_type=self.FLAGS.cell_type)
            context_hiddens = encoder.build_graph(
                self.context_embs,
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(
                self.qn_embs,
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)
        elif self.FLAGS.cell_type == 'qanet':
            encoder = QAEncoder(num_blocks=self.FLAGS.emb_num_blocks, num_layers=self.FLAGS.emb_num_layers, \
                                num_heads=self.FLAGS.emb_num_heads, \
                                filters=self.FLAGS.hidden_size, kernel_size=self.FLAGS.emb_kernel_size, \
                                keep_prob=self.keep_prob, input_mapping=True)
            context_hiddens = encoder.build_graph(self.context_embs,
                                                  self.context_mask)
            question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)

        if self.FLAGS.attention == 'basic':
            # Use context hidden states to attend to question hidden states
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                                   self.FLAGS.hidden_size * 2)
            _, attn_output = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens
            )  # attn_output is shape (batch_size, context_len, hidden_size*2)

            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat(
                [context_hiddens, attn_output],
                axis=2)  # (batch_size, context_len, hidden_size*4)

        elif self.FLAGS.attention == 'bidaf':
            attn_layer = BiDAFAttn(self.keep_prob)
            blended_reps = attn_layer.build_graph(context_hiddens,
                                                  self.context_mask,
                                                  question_hiddens,
                                                  self.qn_mask)

        if self.FLAGS.modeling_layer == 'basic':
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            blended_reps_final = tf.contrib.layers.fully_connected(
                blended_reps,
                num_outputs=self.FLAGS.hidden_size,
                weights_initializer=initializer_relu()
            )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with tf.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    blended_reps_final, self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with tf.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    blended_reps_final, self.context_mask)

        elif self.FLAGS.modeling_layer == 'rnn':
            encoder_start = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, \
                                       cell_type=self.FLAGS.cell_type, name='m1')
            m1 = encoder_start.build_graph(blended_reps, self.context_mask)
            encoder_end = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, \
                                     cell_type=self.FLAGS.cell_type, name='m2')
            m2 = encoder_end.build_graph(m1, self.context_mask)
            with tf.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    tf.concat([blended_reps, m1], -1), self.context_mask)
            with tf.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    tf.concat([blended_reps, m2], -1), self.context_mask)

        elif self.FLAGS.modeling_layer == 'qanet':
            modeling_encoder = QAEncoder(num_blocks=self.FLAGS.model_num_blocks, \
                                         num_layers=self.FLAGS.model_num_layers, \
                                         num_heads=self.FLAGS.model_num_heads, \
                                         filters=self.FLAGS.hidden_size, \
                                         kernel_size=self.FLAGS.model_kernel_size, \
                                         keep_prob=self.keep_prob, input_mapping=False, \
                                         name='modeling_encoder')
            m0 = tf.layers.conv1d(blended_reps, filters=self.FLAGS.hidden_size, \
                                  kernel_size=1, padding='SAME', name='attn_mapping')
            m1 = modeling_encoder.build_graph(m0, self.context_mask)
            m2 = modeling_encoder.build_graph(m1, self.context_mask)
            m3 = modeling_encoder.build_graph(m2, self.context_mask)
            with tf.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    tf.concat([m1, m2], -1), self.context_mask)
            with tf.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    tf.concat([m1, m3], -1), self.context_mask)

        elif self.FLAGS.modeling_layer == 'qanet2':
            modeling_encoder1 = QAEncoder(num_blocks=self.FLAGS.model_num_blocks, \
                                          num_layers=self.FLAGS.model_num_layers, \
                                          num_heads=self.FLAGS.model_num_heads, \
                                          filters=self.FLAGS.hidden_size, \
                                          kernel_size=self.FLAGS.model_kernel_size, \
                                          keep_prob=self.keep_prob, input_mapping=False, \
                                          name='modeling_encoder1')
            '''
            modeling_encoder2 = QAEncoder(num_blocks=self.FLAGS.model_num_blocks, \
                                          num_layers=self.FLAGS.model_num_layers, \
                                          num_heads=self.FLAGS.model_num_heads, \
                                          filters=self.FLAGS.hidden_size, \
                                          kernel_size=self.FLAGS.model_kernel_size, \
                                          keep_prob=self.keep_prob, input_mapping=False, \
                                          name='modeling_encoder2')
            '''
            m0 = tf.layers.conv1d(blended_reps, filters=self.FLAGS.hidden_size, \
                                  kernel_size=1, padding='SAME', name='attn_mapping')
            m1 = modeling_encoder1.build_graph(m0, self.context_mask)
            m2 = modeling_encoder1.build_graph(m1, self.context_mask)
            with tf.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    tf.concat([m0, m1], -1), self.context_mask)
            with tf.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    tf.concat([m0, m2], -1), self.context_mask)
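
# A compact NumPy sketch of the 'basic' attention option above: every context position
# attends over the question positions via dot products, with padded question positions
# masked out before the softmax (shapes and the -1e30 masking constant are illustrative).
import numpy as np

batch, c_len, q_len, h2 = 1, 4, 3, 6
context_hiddens = np.random.randn(batch, c_len, h2)
question_hiddens = np.random.randn(batch, q_len, h2)
qn_mask = np.array([[1.0, 1.0, 0.0]])                         # last question token is padding

scores = np.einsum('bch,bqh->bcq', context_hiddens, question_hiddens)
scores += (1.0 - qn_mask)[:, None, :] * (-1e30)               # never attend to padding
scores -= scores.max(axis=2, keepdims=True)
weights = np.exp(scores) / np.exp(scores).sum(axis=2, keepdims=True)
attn_output = np.einsum('bcq,bqh->bch', weights, question_hiddens)
print(attn_output.shape)                                      # (1, 4, 6): (batch, context_len, hidden_size*2)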
Example #19
0
    def build_graph(self):
        """
        Builds the main part of the graph for the model
        
         Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # NOTE CHANGE: concatenate the GloVe and ELMo embeddings
        # How to handle the ELMo / GloVe context_len mismatch?
        # Just build context_ids with no max context_len
        context_embs_concat = tf.concat(
            [self.elmo_context_input, self.context_embs],
            2)  #(batch_size, context_len, 1024+self.FLAGS.embedding_size)
        qn_embs_concat = tf.concat(
            [self.elmo_question_input, self.qn_embs],
            2)  #(batch_size, qn_len, 1024+self.FLAGS.embedding_size)

        # set shapes so they can be passed to the dynamic LSTM
        context_embs_concat.set_shape(
            (None, None, 1024 + self.FLAGS.embedding_size))
        qn_embs_concat.set_shape(
            (None, None, 1024 + self.FLAGS.embedding_size))
        self.qn_mask.set_shape((None, None))
        self.context_mask.set_shape((None, None))

        with tf.variable_scope("biLSTM"):
            Encoder = RNNEncoder(self.FLAGS.hidden_size,
                                 keep_prob=self.keep_prob,
                                 cell_type="lstm",
                                 input_size=1024 + self.FLAGS.embedding_size)
            #shared weights (same scope)
            context_hiddens = Encoder.build_graph(
                context_embs_concat,
                self.context_mask,
                scope="context_question_encoder"
            )  #(batch_size, context_len, hidden_size*2)
            question_hiddens = Encoder.build_graph(
                qn_embs_concat, self.qn_mask, scope="context_question_encoder"
            )  #(batch_size, question_len, hidden_size*2)

        with tf.variable_scope("bidaf"):
            bidaf_object = Bidaf(self.FLAGS.hidden_size * 2, self.keep_prob)
            b = bidaf_object.build_graph(
                context_hiddens, question_hiddens, self.context_mask,
                self.qn_mask)  #(batch_size, context_len, hidden_size*8)

        with tf.variable_scope("self_attn_layer"):
            SelfAttn_object = SelfAttn(self.FLAGS.hidden_size,
                                       self.FLAGS.hidden_size * 2,
                                       self.keep_prob,
                                       input_size=self.FLAGS.hidden_size * 2)
            M = SelfAttn_object.build_graph(
                b, self.context_mask,
                cell_type="lstm")  #(batch_size, context_len, hidden_size*2)

        #Make prediction
        with tf.variable_scope('prediction_layer'):
            #Encode the self-attended context first
            with tf.variable_scope("final_lstm_layer"):
                final_lstm_object = RNNEncoder(
                    self.FLAGS.hidden_size,
                    keep_prob=self.keep_prob,
                    cell_type="lstm",
                    input_size=self.FLAGS.hidden_size * 2)
                M_prime = final_lstm_object.build_graph(
                    M, self.context_mask,
                    scope="final_lstm")  #(batch_size, context_len, h*2)

            #Get start distribution
            with tf.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    M_prime,
                    self.context_mask)  #both are (batch_size, context_len)

            with tf.variable_scope("EndDist"):
                logit_start_expand = tf.expand_dims(
                    self.logits_start, axis=2)  #(batch_size, context_len, 1)
                blended_end_rnn_input = tf.concat(
                    [logit_start_expand, M_prime],
                    axis=2)  #(batch_size, context_len, hidden_size*2 + 1); see the sketch after this example
                end_dist_rnn = RNNEncoder(self.FLAGS.hidden_size,
                                          keep_prob=self.keep_prob,
                                          direction="unidirectional")
                end_rnn_output = end_dist_rnn.build_graph(
                    blended_end_rnn_input,
                    self.context_mask,
                    scope="end_dist_rnn")

                # Get the end dist
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    end_rnn_output, self.context_mask)
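
# A minimal NumPy sketch of the end-pointer conditioning in the EndDist block above:
# the start logits are expanded to (batch, context_len, 1) and concatenated onto the
# encoded context before the final end-distribution RNN (sizes are illustrative).
import numpy as np

batch, context_len, h2 = 2, 7, 10
logits_start = np.random.randn(batch, context_len)
M_prime = np.random.randn(batch, context_len, h2)

logit_start_expand = logits_start[:, :, None]                 # (batch, context_len, 1)
blended_end_rnn_input = np.concatenate([logit_start_expand, M_prime], axis=2)
print(blended_end_rnn_input.shape)                            # (2, 7, 11) == hidden_size*2 + 1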
Example #20
0
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """
        
        # ***********************
        # *** Highway network ***
        # ***********************
        
        highway_net = HighWayNetwork()
        self.context_embs = highway_net.build_graph(self.context_embs)
        self.qn_embs = highway_net.build_graph(self.qn_embs)
        
        # **********************************
        # *** Contextual Embedding layer ***
        # **********************************
        # Use a biLSTM to get hidden states for the context and the question
        # Note: here the biLSTMEncoder is shared (i.e. the weights are the same) between the context and the question.
        # biLSTM encoding utilizes contextual clues from surrounding words to refine the embeddings.
        encoder = biLSTMEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        
        # ****************************
        # *** Attention Flow layer ***
        # ****************************
        # Couples the query and context vectors and produces a set of query-aware feature vectors for each word in the document
        attn_layer = BiDAFAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        _, c2q_attn_output, _, q2c_attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, 
                                                                        context_hiddens, self.context_mask) 
        # c2q_attn_output is shape (batch_size, context_len, 2h), q2c_attn_output is (batch_size, 1, 2h)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, c2q_attn_output, tf.multiply(context_hiddens, c2q_attn_output), 
                                  tf.multiply(context_hiddens, q2c_attn_output)], axis=2,
                                    name="blended_reps") # (batch_size, context_len, hidden_size*8)
        
        # **********************
        # *** Modeling layer ***
        # **********************
        # Scans the context
        Modeling_layer = Modeling_layer_biLSTM_Encoder(self.FLAGS.hidden_size, self.keep_prob)
        blended_reps_final = Modeling_layer.build_graph(blended_reps)
        
        # ********************
        # *** Output layer ***
        # ********************
        # Provide an answer to the query

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(tf.concat([blended_reps, blended_reps_final], 2),
                                                                                     self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        encoder_out = Output_layer_biLSTM_Encoder(self.FLAGS.hidden_size, self.keep_prob)
        blended_reps_final_hiddens = encoder_out.build_graph(blended_reps_final) # (batch_size, context_len, hidden_size*2)

        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(tf.concat([blended_reps, blended_reps_final_hiddens], 2),
                                                                               self.context_mask)
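
# A toy NumPy sketch of a single highway layer like the HighWayNetwork used above: a
# sigmoid gate mixes a transformed input with the raw input. The exact parameterization
# and nonlinearity of the real layer are assumptions here.
import numpy as np

def highway_layer(x, W_h, b_h, W_t, b_t):
    """x: (..., d). Returns gate * relu(x W_h + b_h) + (1 - gate) * x."""
    transform = np.maximum(x @ W_h + b_h, 0.0)                # candidate update
    gate = 1.0 / (1.0 + np.exp(-(x @ W_t + b_t)))             # sigmoid transform gate
    return gate * transform + (1.0 - gate) * x                # carry the rest through

d = 4
x = np.random.randn(2, 3, d)                                  # (batch, seq_len, emb_size)
out = highway_layer(x, np.random.randn(d, d), np.zeros(d),
                    np.random.randn(d, d), np.zeros(d))
print(out.shape)                                              # same shape as the input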
Example #21
0
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """
        if self.FLAGS.use_char_cnn:
            with vs.variable_scope('char_encoding'):
                self.context_char_encodings, self.cnn_filters1 = char_encoder2(
                    self.context_char_embs, self.FLAGS.context_len,
                    self.FLAGS.word_len, self.FLAGS.cnn_filter_width,
                    self.FLAGS.char_embedding_size, self.FLAGS.n_cnn_filters)
                #self.context_char_encodings = tf.nn.dropout(self.context_char_encodings, self.keep_prob)
                tf.get_variable_scope().reuse_variables()
                self.qn_char_encodings, self.cnn_filters2 = char_encoder2(
                    self.qn_char_embs, self.FLAGS.question_len,
                    self.FLAGS.word_len, self.FLAGS.cnn_filter_width,
                    self.FLAGS.char_embedding_size, self.FLAGS.n_cnn_filters)
                #self.qn_char_encodings = tf.nn.dropout(self.qn_char_encodings, self.keep_prob)

            joined_context_embs = tf.concat(
                [self.context_embs, self.context_char_encodings], axis=2)
            joined_qn_embs = tf.concat([self.qn_embs, self.qn_char_encodings],
                                       axis=2)
            assert joined_context_embs.shape[
                2] == self.FLAGS.embedding_size + self.FLAGS.n_cnn_filters
            assert joined_qn_embs.shape[
                2] == self.FLAGS.embedding_size + self.FLAGS.n_cnn_filters
        else:
            joined_context_embs = self.context_embs
            joined_qn_embs = self.qn_embs

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        with vs.variable_scope('embedding_layer'):
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens = encoder.build_graph(
                joined_context_embs,
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(
                joined_qn_embs,
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        attn_layer = BDAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                            self.FLAGS.hidden_size * 2)
        attn_output = attn_layer.build_graph(
            question_hiddens,
            self.qn_mask,
            context_hiddens,
            self.context_mask,
            q2c=self.FLAGS.use_q2c_attention
        )  # attn_output is shape (batch_size, context_len, hidden_size*6)
        blended_reps = tf.concat(
            [context_hiddens, attn_output],
            axis=2)  # (batch_size, context_len, hidden_size*8)
        # Use context hidden states to attend to question hidden states
        #attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        #_, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)
        with vs.variable_scope('layer1'):
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            layer1_reps = encoder.build_graph(blended_reps, self.context_mask)
        with vs.variable_scope('layer2'):
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            layer2_reps = encoder.build_graph(layer1_reps, self.context_mask)
        # Apply fully connected layer to each blended representation
        # Note, final_reps corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        final_reps = tf.contrib.layers.fully_connected(
            layer2_reps, num_outputs=self.FLAGS.hidden_size
        )  # final_reps is shape (batch_size, context_len, hidden_size)
        #final_reps = layer2_reps

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_start.build_graph(
                final_reps, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            #encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            #end_hiddens = encoder.build_graph(final_reps, self.context_mask) # (batch_size, context_len, hidden_size*2)
            softmax_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_end.build_graph(
                final_reps, self.context_mask)
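
# A short NumPy sketch of the position-wise fully connected projection used above
# (what tf.contrib.layers.fully_connected with its default ReLU does): one weight
# matrix maps every (batch, position) vector down to hidden_size.
import numpy as np

batch, context_len, in_dim, hidden_size = 2, 5, 8, 3
layer2_reps = np.random.randn(batch, context_len, in_dim)
W = np.random.randn(in_dim, hidden_size)
b = np.zeros(hidden_size)

final_reps = np.maximum(layer2_reps @ W + b, 0.0)             # dense + ReLU, per position
print(final_reps.shape)                                       # (2, 5, 3)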
Example #22
0
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        context_input_lens = tf.reshape(
            tf.reduce_sum(tf.cast(tf.cast(self.context_char_ids, tf.bool),
                                  tf.int32),
                          axis=2), [-1])
        qn_input_lens = tf.reshape(
            tf.reduce_sum(tf.cast(tf.cast(self.qn_char_ids, tf.bool),
                                  tf.int32),
                          axis=2), [-1])
        cell_fw = rnn_cell.GRUCell(self.FLAGS.hidden_size)
        cell_bw = rnn_cell.GRUCell(self.FLAGS.hidden_size)
        _, (state_fw,
            state_bw) = tf.nn.bidirectional_dynamic_rnn(cell_fw,
                                                        cell_bw,
                                                        self.context_char_embs,
                                                        context_input_lens,
                                                        dtype=tf.float32)
        ch_emb = tf.reshape(
            tf.concat([state_fw, state_bw], axis=1),
            [-1, self.FLAGS.context_len, 2 * self.FLAGS.hidden_size])
        self.context_embs = tf.concat([self.context_embs, ch_emb], axis=2)

        _, (state_fw,
            state_bw) = tf.nn.bidirectional_dynamic_rnn(cell_fw,
                                                        cell_bw,
                                                        self.qn_char_embs,
                                                        qn_input_lens,
                                                        dtype=tf.float32)
        qh_emb = tf.reshape(
            tf.concat([state_fw, state_bw], axis=1),
            [-1, self.FLAGS.question_len, 2 * self.FLAGS.hidden_size])
        self.qn_embs = tf.concat([self.qn_embs, qh_emb], axis=2)

        # ToDo Deep encoder
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                               self.FLAGS.hidden_size * 2)
        _, attn_output = attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens
        )  # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat(
            [context_hiddens, attn_output],
            axis=2)  # (batch_size, context_len, hidden_size*4)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(
            blended_reps, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
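
# A tiny NumPy sketch of the length computation at the top of the example above: padded
# character ids are 0, so casting to bool and summing over the character axis counts the
# real characters in each word, and the batch and word axes are then flattened.
import numpy as np

context_char_ids = np.array([[[5, 2, 0, 0],                   # word with 2 characters
                              [7, 0, 0, 0]],                  # word with 1 character
                             [[1, 1, 1, 1],                   # word with 4 characters
                              [0, 0, 0, 0]]])                 # fully padded word
context_input_lens = (context_char_ids != 0).astype(np.int32).sum(axis=2).reshape(-1)
print(context_input_lens)                                     # [2 1 4 0]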
Example #23
0
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        #####  Calculate multiple attention models  #####

        ###  Use context hidden states to attend to question hidden states  ###
        """
        basicAttn_layer = BasicAttn(self.keep_prob, 
            self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        # attn_output is shape (batch_size, context_len, hidden_size*2)
        _, basicAttn_output = basicAttn_layer.build_graph(question_hiddens, 
            self.qn_mask, context_hiddens) 
        """

        ###  Bidirectional attention flow  ###
        biDirAttn_layer = bidirectionalAttn(self.keep_prob, self.FLAGS.context_len,
                                            self.FLAGS.hidden_size*2, self.FLAGS.question_len, self.FLAGS.hidden_size*2)
        # attn_output is shape (batch_size, context_len, 2*hidden_size)
        biDirAttn_output = biDirAttn_layer.build_graph(question_hiddens,
                                                       self.qn_mask, context_hiddens, self.context_mask)
        biDir_output = tf.contrib.layers.fully_connected(
            biDirAttn_output, num_outputs=self.FLAGS.hidden_size)

        ###  Coattention  ###
        coAttn_layer = coattention(self.keep_prob, self.FLAGS.context_len,
                                   self.FLAGS.hidden_size*2, self.FLAGS.question_len, self.FLAGS.hidden_size*2)
        # attn_output is shape (batch_size, context_len, 2*hidden_size)
        coAttn_output = coAttn_layer.build_graph(question_hiddens, self.qn_mask,
                                                 context_hiddens, self.context_mask)
        coAttn_output = tf.contrib.layers.fully_connected(
            coAttn_output, num_outputs=self.FLAGS.hidden_size)

        ###  Combine attention models  ###
        # Weight attentions
        attentions = tf.concat([biDir_output, coAttn_output], axis=2)
        attn_weight_calc = get_attn_weights(2, self.FLAGS.question_len, self.FLAGS.context_len, 
            self.FLAGS.hidden_size, self.keep_prob)
        attn_weights = attn_weight_calc.build_graph(question_hiddens, self.qn_mask, attentions)

        gatedAttns = tf.multiply(attentions, attn_weights) 
        print("Gattn", gatedAttns.shape.as_list())

        # Concat the gated attentions to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, gatedAttns], axis=2) # (batch_size, context_len, hidden_size*4)

        """
        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_layer1 = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size*2) 
        blended_reps_layer1_DO = tf.nn.dropout(blended_reps_layer1, self.keep_prob) 
        blended_reps_layer2 = tf.contrib.layers.fully_connected(blended_reps_layer1_DO, num_outputs=self.FLAGS.hidden_size)
        blended_reps_layer2_DO = tf.nn.dropout(blended_reps_layer2, self.keep_prob) 
        blended_reps_final = tf.layers.dense(blended_reps_layer2_DO, self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size)
        """


        attnSize    = gatedAttns.shape.as_list()[2]
        FClayer1    = tf.contrib.layers.fully_connected(gatedAttns, attnSize, scope="FC1") 
        FClayer1_DO = tf.nn.dropout(FClayer1, self.keep_prob) 

        wordStart = tf.concat([FClayer1_DO[:,0,:],   FClayer1_DO[:,0,:],    FClayer1_DO[:,1,:]],  axis=1, name="wStart")
        wordMidd  = tf.concat([FClayer1_DO[:,:-2,:], FClayer1_DO[:,1:-1,:], FClayer1_DO[:,2:,:]], axis=2, name="wMidd")
        wordEnd   = tf.concat([FClayer1_DO[:,-2,:],  FClayer1_DO[:,-1,:],   FClayer1_DO[:,-1,:]], axis=1, name="wEnd")
        wordCC    = tf.concat([tf.expand_dims(wordStart, 1), wordMidd, tf.expand_dims(wordEnd, 1)], axis=1, name="wCC")
        print("wordCC", wordCC.shape.as_list())
        FClayer2    = tf.contrib.layers.fully_connected(wordCC, 3*attnSize, scope="FC2")
        FClayer2_DO = tf.nn.dropout(FClayer2, self.keep_prob)

        conv1     = tf.layers.conv1d(FClayer2_DO, self.FLAGS.hidden_size, kernel_size=5, padding='same')
        conv1_DO  = tf.nn.dropout(conv1, self.keep_prob)
        conv2     = tf.layers.conv1d(conv1_DO, self.FLAGS.hidden_size, kernel_size=5, padding='same')
        conv2_DO  = tf.nn.dropout(conv2, self.keep_prob) 
        print("conv2", conv1.shape.as_list())

        lstmInp = tf.concat([conv1_DO, conv2_DO, attentions], axis=2)

        with vs.variable_scope("outputLSTM"):
           lstmCell = rnn_cell.LSTMCell(self.FLAGS.hidden_size*2)
           lstmDO   = DropoutWrapper(lstmCell, input_keep_prob=self.keep_prob)
           lstmOutputs,states = tf.nn.dynamic_rnn(lstmDO, lstmInp, dtype=tf.float32)

        lstmOut_layer1    = tf.contrib.layers.fully_connected(lstmOutputs, num_outputs=self.FLAGS.hidden_size*2) 
        lstmOut_layer1_DO = tf.nn.dropout(lstmOut_layer1, self.keep_prob) 
        lstmOut_layer2    = tf.contrib.layers.fully_connected(lstmOut_layer1_DO, num_outputs=self.FLAGS.hidden_size)
        lstmOut_layer2_DO = tf.nn.dropout(lstmOut_layer2, self.keep_prob)
        lstmOut_final     = tf.layers.dense(lstmOut_layer2_DO, self.FLAGS.hidden_size) 

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(lstmOut_final, self.context_mask)
            #self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(lstmOut_final, self.context_mask)
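
# A NumPy sketch of the neighbour-window trick (wordStart / wordMidd / wordEnd) in the
# example above: each position is concatenated with its left and right neighbours along
# the feature axis, repeating the edge positions at the boundaries (an equivalent
# formulation, not the author's exact code).
import numpy as np

batch, context_len, d = 1, 4, 2
x = np.arange(batch * context_len * d, dtype=float).reshape(batch, context_len, d)

left = np.concatenate([x[:, :1, :], x[:, :-1, :]], axis=1)    # left neighbour (repeat first)
right = np.concatenate([x[:, 1:, :], x[:, -1:, :]], axis=1)   # right neighbour (repeat last)
wordCC = np.concatenate([left, x, right], axis=2)             # (batch, context_len, 3*d)
print(wordCC.shape)                                           # (1, 4, 6)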
Example #24
0
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """
        print("Building Pointer Model")
        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size,
                             self.keep_prob,
                             num_layers=self.FLAGS.num_layers,
                             mode=self.FLAGS.rnn_cell)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states

        # BIDAF LAYER
        bidaf_layer = BidirectionAttn(self.keep_prob, self.FLAGS.hidden_size)
        _, _, bidaf_output = bidaf_layer.build_graph(question_hiddens,
                                                     self.qn_mask,
                                                     context_hiddens,
                                                     self.context_mask)
        # bidaf_output is shape (batch_size, context_len, hidden_size*6)
        bidaf_output = tf.concat([context_hiddens, bidaf_output],
                                 axis=2)  # bs, c_l, 8h

        #SELF ATTENTION LAYER
        self_attn_layer = SelfAttn(self.keep_prob, 8 * self.FLAGS.hidden_size,
                                   self.FLAGS.selfattn_size)
        _, self_attn_output = self_attn_layer.build_graph(
            bidaf_output,
            self.context_mask)  # batch_size, context_len, 8 * hidden_size

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat(
            [bidaf_output, self_attn_output],
            axis=2)  # (batch_size, context_len, hidden_size*16)
        self_attention_encoder = RNNEncoder(self.FLAGS.hidden_size,
                                            self.keep_prob,
                                            num_layers=self.FLAGS.num_layers,
                                            name="AttentionEncoder")
        blended_reps = self_attention_encoder.build_graph(
            blended_reps,
            self.context_mask)  # batch_size, context_len, hidden_size * 2

        # MODELING LAYER
        modeling_encoder = RNNEncoder(self.FLAGS.hidden_size,
                                      self.keep_prob,
                                      num_layers=self.FLAGS.num_layers,
                                      name="ModelingEncoder")
        modeling_output = modeling_encoder.build_graph(blended_reps,
                                                       self.context_mask)
        modeling_encoder_two = RNNEncoder(self.FLAGS.hidden_size,
                                          self.keep_prob,
                                          num_layers=self.FLAGS.num_layers,
                                          name="ModelingEncoder2")
        modeling_output_two = modeling_encoder_two.build_graph(
            modeling_output, self.context_mask)

        total_reps_start = tf.concat([blended_reps, modeling_output], axis=2)
        total_reps_end = tf.concat([blended_reps, modeling_output_two], axis=2)

        # OUTPUT LAYER
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                total_reps_start, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                total_reps_end, self.context_mask)
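
# A bare-bones NumPy sketch of the self-attention idea used above: each context position
# attends over every other context position, with padded positions masked out. The real
# SelfAttn layer's projections and gating are assumptions not shown here.
import numpy as np

batch, c_len, d = 1, 4, 6
x = np.random.randn(batch, c_len, d)
context_mask = np.array([[1.0, 1.0, 1.0, 0.0]])               # last position is padding

scores = np.einsum('bid,bjd->bij', x, x)                      # pairwise position similarities
scores += (1.0 - context_mask)[:, None, :] * (-1e30)          # never attend to padding
scores -= scores.max(axis=2, keepdims=True)
weights = np.exp(scores) / np.exp(scores).sum(axis=2, keepdims=True)
self_attn_output = np.einsum('bij,bjd->bid', weights, x)      # (batch, c_len, d)
print(self_attn_output.shape)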
Example #25
0
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """
        with tf.variable_scope('context_conv1') as scope:
            context_conv1_filter = truncated_normal_var(
                name='context_conv1_filter',
                shape=[1, 3, 50, self.FLAGS.CONV_SHAPE],
                dtype=tf.float32)
            strides = [1, 1, 1, 1]
            context_conv1 = tf.nn.conv2d(self.context_character_embs,
                                         context_conv1_filter,
                                         strides,
                                         padding='SAME')
            context_conv1_bias = zero_var(name='context_conv1_bias',
                                          shape=[self.FLAGS.CONV_SHAPE],
                                          dtype=tf.float32)
            context_conv1_add_bias = tf.nn.bias_add(context_conv1,
                                                    context_conv1_bias)
            context_relu_conv1 = tf.nn.relu(context_conv1_add_bias)
        pool_size = [1, 1, 2, 1]
        context_pool1 = tf.nn.max_pool(context_relu_conv1,
                                       ksize=pool_size,
                                       strides=pool_size,
                                       padding='SAME',
                                       name='context_pool_layer1')

        with tf.variable_scope('context_conv2') as scope:
            context_conv2_filter = truncated_normal_var(
                name='context_conv2_filter',
                shape=[1, 3, self.FLAGS.CONV_SHAPE, self.FLAGS.CONV_SHAPE],
                dtype=tf.float32)
            strides = [1, 1, 1, 1]
            context_conv2 = tf.nn.conv2d(context_pool1,
                                         context_conv2_filter,
                                         strides,
                                         padding='SAME')
            context_conv2_bias = zero_var(name='context_conv2_bias',
                                          shape=[self.FLAGS.CONV_SHAPE],
                                          dtype=tf.float32)
            context_conv2_add_bias = tf.nn.bias_add(context_conv2,
                                                    context_conv2_bias)
            context_relu_conv2 = tf.nn.relu(context_conv2_add_bias)

        pool_size = [1, 1, 3, 1]
        context_pool2 = tf.nn.max_pool(context_relu_conv2,
                                       ksize=pool_size,
                                       strides=pool_size,
                                       padding='SAME',
                                       name='context_pool_layer2')

        with tf.variable_scope('context_conv3') as scope:
            context_conv3_filter = truncated_normal_var(
                name='context_conv3_filter',
                shape=[1, 3, self.FLAGS.CONV_SHAPE, self.FLAGS.CONV_SHAPE],
                dtype=tf.float32)
            strides = [1, 1, 1, 1]
            context_conv3 = tf.nn.conv2d(context_pool2,
                                         context_conv3_filter,
                                         strides,
                                         padding='SAME')
            context_conv3_bias = zero_var(name='context_conv3_bias',
                                          shape=[self.FLAGS.CONV_SHAPE],
                                          dtype=tf.float32)
            context_conv3_add_bias = tf.nn.bias_add(context_conv3,
                                                    context_conv3_bias)
            context_relu_conv3 = tf.nn.relu(context_conv3_add_bias)
        pool_size = [1, 1, 4, 1]
        context_pool3 = tf.nn.max_pool(context_relu_conv3,
                                       ksize=pool_size,
                                       strides=pool_size,
                                       padding='SAME',
                                       name='context_pool_layer3')
        context_flattened_layer = tf.reshape(
            context_pool3,
            [-1, self.FLAGS.context_len, 2 * self.FLAGS.CONV_SHAPE
             ])  # (batch_size, context_len, 2*CONV_SHAPE)
        context_final = tf.concat([self.context_embs, context_flattened_layer],
                                  axis=2)

        with tf.variable_scope('qn_conv1') as scope:
            qn_conv1_filter = truncated_normal_var(
                name='qn_conv1_filter',
                shape=[1, 3, 50, self.FLAGS.CONV_SHAPE],
                dtype=tf.float32)
            strides = [1, 1, 1, 1]
            qn_conv1 = tf.nn.conv2d(self.qn_character_embs,
                                    qn_conv1_filter,
                                    strides,
                                    padding='SAME')
            qn_conv1_bias = zero_var(name='qn_conv1_bias',
                                     shape=[self.FLAGS.CONV_SHAPE],
                                     dtype=tf.float32)
            qn_conv1_add_bias = tf.nn.bias_add(qn_conv1, qn_conv1_bias)
            qn_relu_conv1 = tf.nn.relu(qn_conv1_add_bias)
        pool_size = [1, 1, 2, 1]
        qn_pool1 = tf.nn.max_pool(qn_relu_conv1,
                                  ksize=pool_size,
                                  strides=pool_size,
                                  padding='SAME',
                                  name='qn_pool_layer1')

        with tf.variable_scope('qn_conv2') as scope:
            qn_conv2_filter = truncated_normal_var(
                name='qn_conv2_filter',
                shape=[1, 3, self.FLAGS.CONV_SHAPE, self.FLAGS.CONV_SHAPE],
                dtype=tf.float32)
            strides = [1, 1, 1, 1]
            qn_conv2 = tf.nn.conv2d(qn_pool1,
                                    qn_conv2_filter,
                                    strides,
                                    padding='SAME')
            qn_conv2_bias = zero_var(name='qn_conv2_bias',
                                     shape=[self.FLAGS.CONV_SHAPE],
                                     dtype=tf.float32)
            qn_conv2_add_bias = tf.nn.bias_add(qn_conv2, qn_conv2_bias)
            qn_relu_conv2 = tf.nn.relu(qn_conv2_add_bias)

        pool_size = [1, 1, 3, 1]
        qn_pool2 = tf.nn.max_pool(qn_relu_conv2,
                                  ksize=pool_size,
                                  strides=pool_size,
                                  padding='SAME',
                                  name='qn_pool_layer2')

        with tf.variable_scope('qn_conv3') as scope:
            qn_conv3_filter = truncated_normal_var(
                name='qn_conv3_filter',
                shape=[1, 3, self.FLAGS.CONV_SHAPE, self.FLAGS.CONV_SHAPE],
                dtype=tf.float32)
            strides = [1, 1, 1, 1]
            qn_conv3 = tf.nn.conv2d(qn_pool2,
                                    qn_conv3_filter,
                                    strides,
                                    padding='SAME')
            qn_conv3_bias = zero_var(name='qn_conv3_bias',
                                     shape=[self.FLAGS.CONV_SHAPE],
                                     dtype=tf.float32)
            qn_conv3_add_bias = tf.nn.bias_add(qn_conv3, qn_conv3_bias)
            qn_relu_conv3 = tf.nn.relu(qn_conv3_add_bias)
        pool_size = [1, 1, 3, 1]
        qn_pool3 = tf.nn.max_pool(qn_relu_conv3,
                                  ksize=pool_size,
                                  strides=pool_size,
                                  padding='SAME',
                                  name='qn_pool_layer3')
        qn_flattened_layer = tf.reshape(
            qn_pool3, [-1, self.FLAGS.question_len, 2 * self.FLAGS.CONV_SHAPE
                       ])  # (batch_size, question_len, 2*CONV_SHAPE)
        qn_final = tf.concat([self.qn_embs, qn_flattened_layer], axis=2)

        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)

        print("context_final final shape %s" % (context_final.get_shape()))
        print("context_mask final shape %s" % (self.context_mask.get_shape()))
        print("qn_final final shape %s" % (qn_final.get_shape()))
        print("qn_mask final shape %s" % (self.qn_mask.get_shape()))
        context_hiddens = encoder.build_graph(
            context_final,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            qn_final,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        #encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        #context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
        #question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = BiDAFAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                               self.FLAGS.hidden_size * 2)
        #_, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens,self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        #blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4)
        #blended_reps=attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens,self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8)
        #print("blended_reps shape %s" % (blended_reps.get_shape()))
        #model_encoder_1 = RNNModelEncoder(self.FLAGS.hidden_size, self.keep_prob,'model_layer_1')
        #model_layer_1=model_encoder_1.build_graph(blended_reps,self.qn_mask)
        #model_encoder_2= RNNModelEncoder(self.FLAGS.hidden_size, self.keep_prob, 'model_layer_2')
        #model_layer_2=model_encoder_2.build_graph(model_layer_1,self.context_mask)

        attn_output = attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens, self.context_mask
        )  # attn_output is shape (batch_size, context_len, hidden_size*8)
        blended_reps = tf.concat(
            [context_hiddens, attn_output],
            axis=2)  # (batch_size, context_len, hidden_size*10)
        model_encoder1 = RNNModelEncoder(self.FLAGS.hidden_size,
                                         self.keep_prob,
                                         model_name="RNNModelEncoder1")
        blended_reps_thro_model_layer1 = model_encoder1.build_graph(
            blended_reps,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)

        model_encoder2 = RNNModelEncoder(self.FLAGS.hidden_size,
                                         self.keep_prob,
                                         model_name="RNNModelEncoder2")
        blended_reps_thro_model_layer2 = model_encoder2.build_graph(
            blended_reps_thro_model_layer1,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)

        model_encoder3 = RNNModelEncoder(self.FLAGS.hidden_size,
                                         self.keep_prob,
                                         model_name="RNNModelEncoder3")
        blended_reps_thro_model_layer3 = model_encoder3.build_graph(
            blended_reps_thro_model_layer2,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        #blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size)
        blended_reps_final = tf.contrib.layers.fully_connected(
            blended_reps_thro_model_layer3, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)
        #blended_reps_final = tf.contrib.layers.fully_connected(model_layer_1,num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
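
The character-CNN example above calls two helpers, truncated_normal_var and zero_var, that are not shown in this listing. A plausible minimal sketch (the initializer settings are assumptions):

import tensorflow as tf

def truncated_normal_var(name, shape, dtype, stddev=0.05):
    # Trainable weight initialized from a truncated normal distribution (stddev is an assumption).
    return tf.get_variable(name, shape=shape, dtype=dtype,
                           initializer=tf.truncated_normal_initializer(stddev=stddev))

def zero_var(name, shape, dtype):
    # Trainable bias initialized to zeros.
    return tf.get_variable(name, shape=shape, dtype=dtype,
                           initializer=tf.zeros_initializer())
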
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings
        to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.pdist_start, self.pdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Apply EncoderBlock for the stacked embedding encoder layer
        with tf.variable_scope("StackedEmbeddingEncoder"):
            emb_encoder = EncoderBlock(self.flags.num_blocks_enc,
                                       self.keep_prob,
                                       self.flags.kernel_size_enc,
                                       self.flags.d_model,
                                       self.flags.num_conv_enc,
                                       self.flags.num_heads,
                                       self.flags.d_ff,
                                       l2_lambda=self.flags.l2_lambda)
            c_enc = emb_encoder.build_graph(self.c_embs,
                                            self.c_longest,
                                            self.c_mask,
                                            reduce_input_dim=True,
                                            reuse=None)
            q_enc = emb_encoder.build_graph(self.q_embs,
                                            self.q_longest,
                                            self.q_mask,
                                            reduce_input_dim=True,
                                            reuse=True)

        # Apply bidirectional attention for the context-query attention layer
        with tf.variable_scope("ContextQueryAttention"):
            bidaf = BiDAFAttn(self.keep_prob, l2_lambda=self.flags.l2_lambda)
            # Shape: [batch_size, context_len, vec_size*8].
            attn_outputs = bidaf.build_graph(c_enc, self.c_mask,
                                             self.c_longest, q_enc,
                                             self.q_mask, self.q_longest)

        # Apply EncoderBlock x3 for the modeling layer
        with tf.variable_scope("ModelEncoder"):
            model_encoder = EncoderBlock(self.flags.num_blocks_mod,
                                         self.keep_prob,
                                         self.flags.kernel_size_mod,
                                         self.flags.d_model,
                                         self.flags.num_conv_mod,
                                         self.flags.num_heads,
                                         self.flags.d_ff,
                                         l2_lambda=self.flags.l2_lambda)
            model_1 = model_encoder.build_graph(attn_outputs,
                                                self.c_longest,
                                                self.c_mask,
                                                reduce_input_dim=True)
            model_2 = model_encoder.build_graph(model_1,
                                                self.c_longest,
                                                self.c_mask,
                                                reuse=True)
            model_3 = model_encoder.build_graph(model_2,
                                                self.c_longest,
                                                self.c_mask,
                                                reuse=True)

        # Use a simple softmax output layer to compute start and end probability distributions
        with tf.variable_scope("Output"):
            with tf.variable_scope("StartDistribution"):
                start_inputs = tf.concat([model_1, model_2], axis=-1)
                softmax_layer_start = SimpleSoftmaxLayer(
                    l2_lambda=self.flags.l2_lambda)
                self.logits_start, self.pdist_start = softmax_layer_start.build_graph(
                    start_inputs, self.c_mask)

            with tf.variable_scope("EndDistribution"):
                end_inputs = tf.concat([model_1, model_3], axis=-1)
                softmax_layer_end = SimpleSoftmaxLayer(
                    l2_lambda=self.flags.l2_lambda)
                self.logits_end, self.pdist_end = softmax_layer_end.build_graph(
                    end_inputs, self.c_mask)
Example #27
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
	with vs.variable_scope("encoder_layer1", reuse=tf.AUTO_REUSE):
            if self.FLAGS.use_stacked_encoder:
                encoder = StackedRNNEncoder(self.FLAGS.hidden_size, self.FLAGS.num_encoding_layers, self.keep_prob)
            else:
                encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)
	if self.FLAGS.num_encoding_layers > 1:
	    with vs.variable_scope("encoder_layer2", reuse=tf.AUTO_REUSE):
		encoder2 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
	        context_hiddens = encoder2.build_graph(context_hiddens, self.context_mask)
	        question_hiddens = encoder2.build_graph(question_hiddens, self.qn_mask)

        # Use context hidden states to attend to question hidden states
        if self.FLAGS.bidaf:
            attn_layer = BiDirAttnFlow(self.keep_prob, self.FLAGS.hidden_size*2)
            blended_reps = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # (batch_size, context_len, hidden_size*8)
        else:
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)
            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4)

        # Self-attention layer
        if self.FLAGS.self_attend:
            self_attn_layer = SelfAttn(self.keep_prob, blended_reps.shape[-1], self.FLAGS.self_attend_hidden_sz)
            blended_reps = self_attn_layer.build_graph(blended_reps, self.context_mask) # (batch_size, context_len, 2*self_attend_hidden_sz)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # TODO: Modeling layer from BiDAF. We can add another RNN (two stacked
        #       from BiDAF paper) to the hidden states from the attention layer.
        if self.FLAGS.modeling_layer:
            with vs.variable_scope("Model_Layer", reuse=tf.AUTO_REUSE):
                model_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
                blended_reps_final = model_layer.build_graph(blended_reps_final, self.context_mask)
        if self.FLAGS.modeling_layer and self.FLAGS.num_model_rnn_layers > 1:
            with vs.variable_scope("Model_layer2", reuse=tf.AUTO_REUSE):
                model_layer2 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
                blended_reps_final = model_layer2.build_graph(blended_reps_final, self.context_mask)
            # modeling_layer = StackedRNNEncoder(blended_reps_final.shape[-1], self.FLAGS.num_model_rnn_layers, self.keep_prob)
            # blended_reps_final = modeling_layer.build_graph(blended_reps_final, self.context_mask)

        if self.FLAGS.pointer_network: #TODO: define flag
            with vs.variable_scope("OutputLayer", reuse=tf.AUTO_REUSE):
                pointer_network = PointerNetwork(self.keep_prob, blended_reps_final.shape[-1].value, self.FLAGS.hidden_size)
                (self.logits_start, self.probdist_start, _, self.logits_end, self.probdist_end, _) = \
                    pointer_network.build_graph(blended_reps_final, self.context_mask)
        else:
            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
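
The flag-driven variant above consults a number of FLAGS beyond the usual hidden_size. A hedged sketch of how they might be declared with tf.app.flags (the flag names come from the code above; the defaults and help strings are assumptions):

import tensorflow as tf

tf.app.flags.DEFINE_boolean("use_stacked_encoder", False, "Use StackedRNNEncoder in the encoding layer")
tf.app.flags.DEFINE_integer("num_encoding_layers", 1, "Number of stacked encoding layers")
tf.app.flags.DEFINE_boolean("bidaf", True, "Use BiDirAttnFlow instead of BasicAttn")
tf.app.flags.DEFINE_boolean("self_attend", False, "Add a self-attention layer over the blended reps")
tf.app.flags.DEFINE_integer("self_attend_hidden_sz", 100, "Hidden size of the self-attention layer")
tf.app.flags.DEFINE_boolean("modeling_layer", True, "Add a BiDAF-style modeling layer")
tf.app.flags.DEFINE_integer("num_model_rnn_layers", 1, "Number of modeling-layer RNNs")
tf.app.flags.DEFINE_boolean("pointer_network", False, "Use a pointer network for the output layer")

FLAGS = tf.app.flags.FLAGS
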
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.

        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        question_variation = tf.layers.dense(question_hiddens,
                                             question_hiddens.get_shape()[2],
                                             activation=tf.tanh)

        # In[]

        #question_length = tf.placeholder(tf.int32, (None,), name='question_length')
        #document_length = tf.placeholder(tf.int32, (None,), name='paragraph_length')

        unmasked_affinity = tf.einsum(
            'ndh,nqh->ndq', context_hiddens,
            question_variation)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
        affinity = maybe_mask_affinity(unmasked_affinity, self.document_length)
        attention_p = tf.nn.softmax(affinity, dim=1)
        unmasked_affinity_t = tf.transpose(
            unmasked_affinity,
            [0, 2, 1])  # [N, Q, D] or [N, 1+Q, 1+D] if sentinel
        affinity_t = maybe_mask_affinity(unmasked_affinity_t,
                                         self.question_length)
        attention_q = tf.nn.softmax(affinity_t, dim=1)
        summary_q = tf.einsum(
            'ndh,ndq->nqh', context_hiddens,
            attention_p)  # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel
        summary_d = tf.einsum(
            'nqh,nqd->ndh', question_variation,
            attention_q)  # [N, D, 2H] or [N, 1+D, 2H] if sentinel
        coattention_d = tf.einsum('nqh,nqd->ndh', summary_q, attention_q)

        encoder1 = RNNEncoder1(self.FLAGS.hidden_size, self.keep_prob)
        context2 = encoder1.build_graph(
            summary_d,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question2 = encoder1.build_graph(
            summary_q,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        unmasked_affinity1 = tf.einsum(
            'ndh,nqh->ndq', context2,
            question2)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
        affinity1 = maybe_mask_affinity(unmasked_affinity1,
                                        self.document_length)
        attention_p1 = tf.nn.softmax(affinity1, dim=1)
        unmasked_affinity_t1 = tf.transpose(
            unmasked_affinity1,
            [0, 2, 1])  # [N, Q, D] or [N, 1+Q, 1+D] if sentinel
        affinity_t1 = maybe_mask_affinity(unmasked_affinity_t1,
                                          self.question_length)
        attention_q1 = tf.nn.softmax(affinity_t1, dim=1)
        summary_q1 = tf.einsum(
            'ndh,ndq->nqh', context2,
            attention_p1)  # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel
        summary_d1 = tf.einsum(
            'nqh,nqd->ndh', question2,
            attention_q1)  # [N, D, 2H] or [N, 1+D, 2H] if sentinel
        coattention_d1 = tf.einsum('nqh,nqd->ndh', summary_q1, attention_q1)

        # In[]
        document_representations = [
            context_hiddens,  # E^D_1
            context2,  # E^D_2
            summary_d,  # S^D_1
            summary_d1,  # S^D_2
            coattention_d,  # C^D_1
            coattention_d1,  # C^D_2
        ]

        document_representation = tf.concat(document_representations, 2)
        encoder2 = RNNEncoder2(self.FLAGS.hidden_size, self.keep_prob)
        U = encoder2.build_graph(document_representation, self.context_mask)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat(
            [context_hiddens, U],
            axis=2)  # (batch_size, context_len, hidden_size*4)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(
            blended_reps, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
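
maybe_mask_affinity is called repeatedly above but not defined in this listing. A minimal sketch, assuming it fills the affinity scores at padded positions with a large negative value so the subsequent softmax ignores them (the signature is inferred from the calls above):

import tensorflow as tf

def maybe_mask_affinity(affinity, sequence_length, mask_value=-1e30):
    # affinity: (batch, rows, cols); sequence_length gives the true length along the rows axis.
    if sequence_length is None:
        return affinity
    row_mask = tf.sequence_mask(sequence_length, maxlen=tf.shape(affinity)[1])  # (batch, rows)
    full_mask = tf.tile(tf.expand_dims(row_mask, 2),
                        tf.stack([1, 1, tf.shape(affinity)[2]]))  # (batch, rows, cols)
    return tf.where(full_mask, affinity, mask_value * tf.ones_like(affinity))
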
Example #29
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        with vs.variable_scope("EmbedLayer"):
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens = encoder.build_graph(
                self.context_embs,
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(
                self.qn_embs,
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = BiDAFAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                               self.FLAGS.hidden_size * 2)
        _, a, c = attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens, self.context_mask
        )  # a: (batch_size, context_len, hidden_size*2); c: (batch_size, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        c = tf.expand_dims(c, 1)
        blended_reps = tf.concat(
            [context_hiddens, a, context_hiddens * a, context_hiddens * c],
            axis=2)  # (batch_size, context_len, hidden_size*8)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        #blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size)
        with vs.variable_scope("startModelLayer"):
            modeling_layer_encoder = RNNEncoder(self.FLAGS.hidden_size,
                                                self.keep_prob)
            m1 = modeling_layer_encoder.build_graph(blended_reps,
                                                    self.context_mask)
        with vs.variable_scope("endModelLayer"):
            modeling_layer_encoder = RNNEncoder(self.FLAGS.hidden_size,
                                                self.keep_prob)
            m2 = modeling_layer_encoder.build_graph(blended_reps,
                                                    self.context_mask)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            blended_reps_final_start = tf.concat([m1, blended_reps], axis=2)
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final_start, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            blended_reps_final_end = tf.concat([m2, blended_reps], axis=2)
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final_end, self.context_mask)
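
The BiDAFAttn layer in the example above returns both a context-to-question attention output a and a single question-to-context summary vector c, but the class itself is not shown. The sketch below illustrates one common way such a pair is computed, using a plain dot-product similarity instead of the trilinear function from the BiDAF paper; all names and shapes here are assumptions:

import tensorflow as tf

def bidaf_attention(q, q_mask, c, c_mask):
    # q: (batch, question_len, 2h), c: (batch, context_len, 2h); masks are 1 at real tokens.
    S = tf.matmul(c, q, transpose_b=True)  # similarity: (batch, context_len, question_len)
    S = S + tf.expand_dims((1.0 - tf.cast(q_mask, tf.float32)) * (-1e30), 1)
    a = tf.matmul(tf.nn.softmax(S, -1), q)  # context-to-question: (batch, context_len, 2h)
    b_logits = tf.reduce_max(S, axis=2)  # (batch, context_len)
    b_logits = b_logits + (1.0 - tf.cast(c_mask, tf.float32)) * (-1e30)
    beta = tf.nn.softmax(b_logits, -1)  # (batch, context_len)
    c_summary = tf.squeeze(tf.matmul(tf.expand_dims(beta, 1), c), 1)  # question-to-context: (batch, 2h)
    return S, a, c_summary
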
Example #30
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        if self.FLAGS.more_single_dir_rnn:
            # Use a RNN to get hidden states for the context and the question
            # Note: here the RNNEncoder is shared (i.e. the weights are the same)
            # between the context and the question.
            encoder0 = RNNEncoder0(self.FLAGS.hidden_size, self.keep_prob)
            # (batch_size, context_len, hidden_size)
            context_hiddens0 = encoder0.build_graph(self.context_embs, self.context_mask) 
            # (batch_size, question_len, hidden_size)
            question_hiddens0 = encoder0.build_graph(self.qn_embs, self.qn_mask) 
    
            encoder1 = RNNEncoder1(self.FLAGS.hidden_size, self.keep_prob)
            # (batch_size, context_len, hidden_size*2)
            context_hiddens1 = encoder1.build_graph(context_hiddens0, self.context_mask) 
            # (batch_size, question_len, hidden_size*2)
            question_hiddens1 = encoder1.build_graph(question_hiddens0, self.qn_mask) 
        else:
            encoder1 = RNNEncoder1(self.FLAGS.hidden_size, self.keep_prob)
            # (batch_size, context_len, hidden_size*2)
            context_hiddens1 = encoder1.build_graph(self.context_embs, self.context_mask) 
            # (batch_size, question_len, hidden_size*2)
            question_hiddens1 = encoder1.build_graph(self.qn_embs, self.qn_mask) 

        # Use context hidden states to attend to question hidden states
        basic_attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2, self.FLAGS.advanced_basic_attn)
        # attn_output is shape (batch_size, context_len, hidden_size*2)
        _, basic_attn_output = basic_attn_layer.build_graph(question_hiddens1, self.qn_mask, context_hiddens1) 

        # Concat basic_attn_output to context_hiddens to get blended_reps0
        blended_reps0 = tf.concat([context_hiddens1, basic_attn_output], axis=2) # (batch_size, context_len, hidden_size*4)

        if self.FLAGS.more_single_dir_rnn:
            rnnBasicAttn = RNNBasicAttn(self.FLAGS.hidden_size*4, self.keep_prob)
            rnn_basic_attn_reps = rnnBasicAttn.build_graph(blended_reps0, self.context_mask) # (batch_size, context_len, hidden_size*4)
        else:
            rnn_basic_attn_reps = blended_reps0
        
        # Gang: adding self attention (R-NET)
        # self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size*4)
        # # (batch_size, context_len, hidden_size*4)
        # _, self_attn_output = self_attn_layer.build_graph(basic_attn_output, self.context_mask) 

        # Gang: adding dot attention (Attention Is All You Need)
        dot_attn_layer = DotAttn(self.keep_prob, self.FLAGS.hidden_size*4, self.FLAGS.advanced_dot_attn)
        # (batch_size, context_len, hidden_size*4)
        _, dot_attn_output = dot_attn_layer.build_graph(rnn_basic_attn_reps, self.context_mask) 
        
        # Concat dot_attn_output to blended_reps0 to get blended_reps1
        blended_reps1 = tf.concat([rnn_basic_attn_reps, dot_attn_output], axis=2) # (batch_size, context_len, hidden_size*8)

        # Gang: adding gated representation (R-NET)
        if self.FLAGS.gated_reps:
            gated_reps_layer = GatedReps(self.FLAGS.hidden_size*8)
            gated_blended_reps = gated_reps_layer.build_graph(blended_reps1)
        else:
            gated_blended_reps = blended_reps1

        rnnDotAttn = RNNDotAttn(self.FLAGS.hidden_size*8, self.keep_prob)
        # (batch_size, context_len, hidden_size*16)
        rnn_dot_attn_reps = rnnDotAttn.build_graph(gated_blended_reps, self.context_mask) 

        if self.FLAGS.use_answer_pointer:
            # blended_reps_final = tf.contrib.layers.fully_connected(rnn_dot_attn_reps, 
            #                      num_outputs = self.FLAGS.hidden_size*2) 

            pointer_layer_start = AnswerPointerLayerStart(self.keep_prob, self.FLAGS.hidden_size, self.FLAGS.hidden_size*16)
            rQ, self.logits_start, self.probdist_start = pointer_layer_start.build_graph(question_hiddens1, self.qn_mask, 
                                                                                         rnn_dot_attn_reps, self.context_mask)

            pointer_layer_end = AnswerPointerLayerEnd(self.keep_prob, self.FLAGS.hidden_size*16)
            self.logits_end, self.probdist_end = pointer_layer_end.build_graph(self.probdist_start, rQ, 
                                                                               rnn_dot_attn_reps, self.context_mask)
        else:
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            # blended_reps_final is shape (batch_size, context_len, hidden_size)
            blended_reps_final = tf.contrib.layers.fully_connected(rnn_dot_attn_reps, num_outputs=self.FLAGS.hidden_size) 

            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)
    
            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
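
The GatedReps layer used above (R-NET style gating) is not included in this listing. A minimal sketch, assuming it scales the blended representation element-wise by a learned sigmoid gate (the class layout and variable names are assumptions):

import tensorflow as tf

class GatedReps(object):
    def __init__(self, rep_size):
        self.rep_size = rep_size

    def build_graph(self, reps):
        # gate = sigmoid(W * reps); the representation is rescaled element-wise by the gate.
        with tf.variable_scope("GatedReps"):
            gate = tf.contrib.layers.fully_connected(reps,
                                                     num_outputs=self.rep_size,
                                                     activation_fn=tf.sigmoid,
                                                     biases_initializer=None)
            return gate * reps
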
Example #31
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        if self.FLAGS.self_attention:
            encoder = RNNEncoder(self.FLAGS.hidden_size_encoder,
                                 self.keep_prob)
        else:
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)

        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        if self.FLAGS.simple_attention:
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                                   self.FLAGS.hidden_size * 2)
            _, attn_output = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens
            )  # attn_output is shape (batch_size, context_len, hidden_size*2)

            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat(
                [context_hiddens, attn_output],
                axis=2)  # (batch_size, context_len, hidden_size*4)

            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            blended_reps_final = tf.contrib.layers.fully_connected(
                blended_reps, num_outputs=self.FLAGS.hidden_size
            )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        if self.FLAGS.co_attention:
            # Send the question hidden states through a fully connected layer to allow for variation between the question and document embedding spaces
            question_hiddens_t = tf.transpose(
                question_hiddens,
                perm=[0, 2, 1])  #(batch_size,hidden_size*2,question_len)
            trans_question_hiddens_t = tf.contrib.layers.fully_connected(
                question_hiddens_t,
                num_outputs=self.FLAGS.question_len,
                activation_fn=tf.nn.tanh
            )  #(batch_size,hidden_size*2,question_len)
            trans_question_hiddens = tf.transpose(
                trans_question_hiddens_t,
                perm=[0, 2, 1])  #(batch_size,question_len,hidden_size*2)

            #Computing the coattention context
            co_attn_layer = CoAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                                   self.FLAGS.hidden_size * 2)
            co_attn_output = co_attn_layer.build_graph(
                trans_question_hiddens, self.qn_mask, self.context_mask,
                context_hiddens)  #(batch_size,context_len,6*hidden_size)

            # Fuse temporal information into the coattention context via a bidirectional LSTM
            with tf.variable_scope("co-attn-encoder"):
                co_attn_encoder = LSTMEncoder(self.FLAGS.hidden_size,
                                              self.keep_prob)
                blended_reps_final = co_attn_encoder.build_graph(
                    co_attn_output, self.context_mask)

        if self.FLAGS.self_attention:
            # Implementation of self-attention from the R-Net paper

            self_attention_encoder = SelfAttn(self.FLAGS.hidden_size_encoder,
                                              self.FLAGS.hidden_size_qp,
                                              self.FLAGS.hidden_size_pp,
                                              self.keep_prob)
            v_p = self_attention_encoder.build_graph_qp(
                context_hiddens, question_hiddens, self.context_mask,
                self.qn_mask, self.FLAGS.context_len, self.FLAGS.question_len)
            h_p = self_attention_encoder.build_graph_pp(
                context_hiddens, question_hiddens, self.context_mask,
                self.qn_mask, v_p, self.FLAGS.context_len,
                self.FLAGS.question_len)
            blended_reps_final = tf.concat(
                [context_hiddens, v_p, h_p],
                axis=2)  # (batch_size, context_len, 2*hidden_size_encoder + hidden_size_qp + 2*hidden_size_pp)

        if self.FLAGS.answer_pointer:
            # Implementation of the answer pointer as used in the R-Net paper
            if self.FLAGS.co_attention:
                hidden_size_attn = self.FLAGS.hidden_size * 2
            elif self.FLAGS.self_attention:
                hidden_size_attn = 2 * self.FLAGS.hidden_size_encoder + self.FLAGS.hidden_size_qp + 2 * self.FLAGS.hidden_size_pp
            else:
                hidden_size_attn = self.FLAGS.hidden_size

            answer_decoder = AnswerPointer(self.FLAGS.hidden_size_encoder,
                                           hidden_size_attn,
                                           self.FLAGS.question_len,
                                           self.keep_prob)
            p, logits = answer_decoder.build_graph_answer_pointer(
                question_hiddens, context_hiddens, blended_reps_final,
                self.FLAGS.question_len, self.FLAGS.context_len, self.qn_mask,
                self.context_mask)

            self.logits_start = logits[0]
            self.probdist_start = p[0]

            self.logits_end = logits[1]
            self.probdist_end = p[1]

        if self.FLAGS.simple_softmax:
            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    blended_reps_final, self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    blended_reps_final, self.context_mask)
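
None of the examples above show how the start/end distributions are consumed at test time. A hedged sketch of the usual span-selection step (not taken from any of the listings): pick the (start, end) pair with start <= end, within a length limit, that maximizes probdist_start[start] * probdist_end[end].

def best_span(pdist_start, pdist_end, max_answer_len=30):
    # pdist_start / pdist_end: 1-D arrays over context positions for a single example.
    best, best_score = (0, 0), -1.0
    for i, p_start in enumerate(pdist_start):
        for j in range(i, min(i + max_answer_len, len(pdist_end))):
            score = p_start * pdist_end[j]
            if score > best_score:
                best_score, best = score, (i, j)
    return best
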