Example #1
    def build_graph(self):
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2)
        _, attn_output = attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens
        )  # attn_output is shape (batch_size, context_len, hidden_size*2)

        attn_layer = R_Net_Attn(self.keep_prob, self.FLAGS.hidden_size * 2,
                                self.FLAGS)
        output = attn_layer.build_graph(
            attn_output, self.context_mask
        )  # output is shape (batch_size, context_len, hidden_size*2)

        blended_reps_final = tf.contrib.layers.fully_connected(
            tf.concat([attn_output, output], 2),
            num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
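
None of these examples include the RNNEncoder class itself. For orientation, here is a minimal sketch of an encoder matching the interface used above; the single bidirectional GRU layer, the dropout placement, and the variable-scope handling are assumptions, not the original implementation.

import tensorflow as tf

class RNNEncoder(object):
    """Sketch: one bidirectional GRU layer whose forward and backward outputs
    are concatenated, giving (batch_size, seq_len, hidden_size*2)."""

    def __init__(self, hidden_size, keep_prob):
        self.hidden_size = hidden_size
        self.keep_prob = keep_prob

    def build_graph(self, inputs, masks, scope_name="RNNEncoder"):
        # AUTO_REUSE lets the same encoder weights serve both context and question.
        with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE):
            seq_lens = tf.reduce_sum(masks, axis=1)  # true lengths, pads excluded
            cell_fw = tf.nn.rnn_cell.DropoutWrapper(
                tf.nn.rnn_cell.GRUCell(self.hidden_size),
                input_keep_prob=self.keep_prob)
            cell_bw = tf.nn.rnn_cell.DropoutWrapper(
                tf.nn.rnn_cell.GRUCell(self.hidden_size),
                input_keep_prob=self.keep_prob)
            (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, inputs, seq_lens, dtype=tf.float32)
            out = tf.concat([fw_out, bw_out], 2)  # (batch, seq_len, hidden_size*2)
            return tf.nn.dropout(out, self.keep_prob)
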
Example #2
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.
        """

        # Use a RNN to get hidden states for the context and the question
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        _, context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
        _, question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4)

        # Apply fully connected layer to each blended representation
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask, False)

        # Use softmax layer to compute probability distribution for end location
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask, False)
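
BasicAttn is likewise not shown. Below is a minimal sketch consistent with the three-argument constructor and the build_graph(values, values_mask, keys) call above, assuming plain dot-product attention with a masked softmax over question positions; the masked_softmax helper is reused by the later sketches.

import tensorflow as tf

def masked_softmax(logits, mask, dim):
    # Add -large where mask is 0 so pad positions get ~zero probability.
    exp_mask = (1.0 - tf.cast(mask, tf.float32)) * (-1e30)
    masked_logits = logits + exp_mask
    prob_dist = tf.nn.softmax(masked_logits, dim)
    return masked_logits, prob_dist

class BasicAttn(object):
    """Sketch: attend from keys (context) to values (question) via dot products."""

    def __init__(self, keep_prob, key_vec_size, value_vec_size):
        self.keep_prob = keep_prob
        self.key_vec_size = key_vec_size
        self.value_vec_size = value_vec_size

    def build_graph(self, values, values_mask, keys):
        with tf.variable_scope("BasicAttn"):
            # (batch, num_keys, num_values) similarity matrix
            attn_logits = tf.matmul(keys, tf.transpose(values, perm=[0, 2, 1]))
            attn_logits_mask = tf.expand_dims(values_mask, 1)  # (batch, 1, num_values)
            _, attn_dist = masked_softmax(attn_logits, attn_logits_mask, 2)
            output = tf.matmul(attn_dist, values)  # (batch, num_keys, value_vec_size)
            output = tf.nn.dropout(output, self.keep_prob)
            return attn_dist, output
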
Example #3
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
       """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        print(self.context_embs.shape)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        biattn_layer = AttentionFlowLayer(self.keep_prob, self.FLAGS.l2_lambda)
        biattn_output = biattn_layer.build_graph(context_hiddens,
                                                 self.context_mask,
                                                 question_hiddens,
                                                 self.qn_mask,
                                                 scope="AttnFlow")

        # RNNEncoder layer
        model_layer = Model_Layer(self.FLAGS.hidden_size, self.keep_prob)
        model_output = model_layer.build_graph(biattn_output,
                                               self.context_mask)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(
            model_output, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        # The masks mark the real (non-pad) positions: the softmax layers below
        # add -large to the logits wherever the mask is 0, so pad positions get
        # ~zero probability (see the SimpleSoftmaxLayer sketch after this example).

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
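
SimpleSoftmaxLayer is not shown in any of these examples. A minimal sketch under the assumption stated in the docstrings above: per-position scalar logits followed by a masked softmax, so logits are -large at pad locations and each probability row sums to 1. It reuses the masked_softmax helper from the BasicAttn sketch; some examples pass an extra flag to build_graph, and this covers only the two-argument form.

import tensorflow as tf

class SimpleSoftmaxLayer(object):
    """Sketch: reduce each hidden vector to a scalar logit, then take a
    masked softmax over the context positions."""

    def build_graph(self, inputs, masks):
        with tf.variable_scope("SimpleSoftmaxLayer"):
            logits = tf.contrib.layers.fully_connected(
                inputs, num_outputs=1, activation_fn=None)  # (batch, context_len, 1)
            logits = tf.squeeze(logits, axis=[2])            # (batch, context_len)
            masked_logits, prob_dist = masked_softmax(logits, masks, 1)
            return masked_logits, prob_dist
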
Example #4
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
          self.c2q_dist: (batch_size, context_len, question_len). Context-to-question attention distribution.
            Each row sums to 1, except for rows corresponding to masked context words.
          self.q2c_dist: (batch_size, context_len). Question-to-context attention distribution. Each row sums to 1.
        """
        print("Building BIDAF")
        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size,
                             self.keep_prob,
                             num_layers=self.FLAGS.num_layers,
                             mode=self.FLAGS.rnn_cell)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = BidirectionAttn(self.keep_prob, self.FLAGS.hidden_size)
        self.c2q_attn_dist, self.q2c_attn_dist, attn_output = \
            attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*6)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat(
            [context_hiddens, attn_output],
            axis=2)  # (batch_size, context_len, hidden_size*8)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(
            blended_reps, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
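
BidirectionAttn itself is not included. The sketch below is an assumption-laden reconstruction: it follows the BiDAF paper's trilinear similarity and matches the documented shapes above (c2q_dist over question positions, q2c_dist over context positions) as well as the hidden_size*6 output that Example #17 below reports for the same class. It reuses the masked_softmax helper from the BasicAttn sketch.

import tensorflow as tf

class BidirectionAttn(object):
    """Sketch of BiDAF-style bidirectional attention."""

    def __init__(self, keep_prob, hidden_size):
        self.keep_prob = keep_prob
        self.hidden_size = hidden_size

    def build_graph(self, question_hiddens, qn_mask, context_hiddens, context_mask):
        with tf.variable_scope("BidirectionAttn"):
            h = 2 * self.hidden_size
            w_c = tf.get_variable("w_c", [h])
            w_q = tf.get_variable("w_q", [h])
            w_cq = tf.get_variable("w_cq", [h])

            # Trilinear similarity S[b,i,j] = w_c.c_i + w_q.q_j + w_cq.(c_i*q_j)
            s_c = tf.tensordot(context_hiddens, w_c, axes=1)   # (batch, M)
            s_q = tf.tensordot(question_hiddens, w_q, axes=1)  # (batch, N)
            s_cq = tf.matmul(context_hiddens * w_cq,
                             tf.transpose(question_hiddens, perm=[0, 2, 1]))  # (batch, M, N)
            S = tf.expand_dims(s_c, 2) + tf.expand_dims(s_q, 1) + s_cq

            # Context-to-question: softmax over question positions.
            _, c2q_dist = masked_softmax(S, tf.expand_dims(qn_mask, 1), 2)
            c2q = tf.matmul(c2q_dist, question_hiddens)  # (batch, M, 2h)

            # Question-to-context: max over questions, then softmax over contexts.
            qn_pad = tf.expand_dims((1.0 - tf.cast(qn_mask, tf.float32)) * (-1e30), 1)
            m = tf.reduce_max(S + qn_pad, axis=2)                 # (batch, M)
            _, q2c_dist = masked_softmax(m, context_mask, 1)      # (batch, M)
            q2c = tf.matmul(tf.expand_dims(q2c_dist, 1), context_hiddens)  # (batch, 1, 2h)
            q2c = tf.tile(q2c, [1, tf.shape(context_hiddens)[1], 1])

            # (batch, M, 6h), consistent with the concat-to-8h comment above.
            attn_output = tf.concat(
                [c2q, context_hiddens * c2q, context_hiddens * q2c], axis=2)
            return c2q_dist, q2c_dist, attn_output
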
Example #5
    def __init__(self, config):
        super(RNNEncDec, self).__init__()
        self.vocab_size = config['vocab_size']
        self.maxlen = config['max_sent_len']
        self.temp = config['temp']

        # Below is the original RNN structure
        self.desc_embedder = nn.Embedding(self.vocab_size,
                                          config['emb_dim'],
                                          padding_idx=PAD_ID)
        self.api_embedder = nn.Embedding(self.vocab_size,
                                         config['emb_dim'],
                                         padding_idx=PAD_ID)

        self.encoder = RNNEncoder(
            self.desc_embedder, None, config['emb_dim'], config['n_hidden'],
            True, config['n_layers'],
            config['noise_radius'])  # utter encoder: encode response to vector
        self.ctx2dec = nn.Sequential(  # from context to decoder initial hidden
            nn.Linear(2 * config['n_hidden'], config['n_hidden']),
            nn.Tanh(),
        )
        self.ctx2dec.apply(self.init_weights)
        self.decoder = RNNDecoder(self.api_embedder, config['emb_dim'],
                                  config['n_hidden'], self.vocab_size,
                                  config['attention'], 1,
                                  config['dropout'])  # decoder: P(x|c,z)
Example #6
File: seq2seq.py Project: caidw/deepAPI
    def __init__(self, config):
        super(RNNSeq2Seq, self).__init__()
        self.vocab_size = config['vocab_size']
        self.maxlen = config['max_sent_len']
        self.clip = config['clip']
        self.temp = config['temp']

        self.desc_embedder = nn.Embedding(self.vocab_size,
                                          config['emb_dim'],
                                          padding_idx=PAD_ID)
        self.api_embedder = nn.Embedding(self.vocab_size,
                                         config['emb_dim'],
                                         padding_idx=PAD_ID)

        self.encoder = RNNEncoder(
            self.desc_embedder, None, config['emb_dim'], config['n_hidden'],
            True, config['n_layers'],
            config['noise_radius'])  # utter encoder: encode response to vector
        self.ctx2dec = nn.Sequential(  # from context to decoder initial hidden
            nn.Linear(2 * config['n_hidden'], config['n_hidden']),
            nn.Tanh(),
        )
        self.ctx2dec.apply(self.init_weights)
        self.decoder = RNNDecoder(self.api_embedder, config['emb_dim'],
                                  config['n_hidden'], self.vocab_size,
                                  config['attention'], 1,
                                  config['dropout'])  # decoder: P(x|c,z)

        self.optimizer = ScheduledOptim(
            optim.Adam(filter(lambda x: x.requires_grad, self.parameters()),
                       betas=(0.9, 0.98),
                       eps=1e-09), config['n_hidden'],
            config['n_warmup_steps'])

        self.criterion_ce = nn.CrossEntropyLoss()
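
The ctx2dec bridge in Examples #5 and #6 exists because the bidirectional encoder summarizes each input into a 2*n_hidden vector while the unidirectional decoder expects an n_hidden initial state. A self-contained sketch of just that bridging step (the tensors are dummies and the names illustrative):

import torch
import torch.nn as nn

n_hidden, batch = 512, 4
ctx2dec = nn.Sequential(nn.Linear(2 * n_hidden, n_hidden), nn.Tanh())

# Final forward/backward hidden states from a bidirectional encoder:
h_fw = torch.randn(batch, n_hidden)
h_bw = torch.randn(batch, n_hidden)
summary = torch.cat([h_fw, h_bw], dim=-1)   # (batch, 2*n_hidden)
init_state = ctx2dec(summary).unsqueeze(0)  # (1, batch, n_hidden) for a 1-layer decoder
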
Example #7
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)
        
        if self.FLAGS.attention_type == 'dot_product':
            print("<<<<<<<< Adding dot_poduct attention >>>")         
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)
    
            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4)
        
        elif self.FLAGS.attention_type == 'self_attention':
            print("<<<<<<<<< Adding Self attention over basic attention >>>>>>>")
            basic_attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            _, basic_attn_output = basic_attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)
            
            self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.self_attn_zsize, self.FLAGS.hidden_size*2)
            _, self_attn_output = self_attn_layer.build_graph(basic_attn_output, self.context_mask)
            concated_basic_self = tf.concat([basic_attn_output,self_attn_output], axis=2) #(bs,N,4h)
            
            self_attn_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps = self_attn_encoder.build_graph(concated_basic_self, self.context_mask, scope_name="self_attn_encoder") # (batch_size, N, hidden_size*2)
        
        elif self.FLAGS.attention_type == 'bidaf':
            print("<<<<<<<<< Adding BIDAF attention >>>>>>>")
            attn_layer = BidafAttn(self.keep_prob, self.FLAGS.hidden_size*2)
            c2q_attention, q2c_attention = attn_layer.build_graph(context_hiddens, question_hiddens, self.qn_mask, self.context_mask)
            
            # Combine tensors to get the final output
            body_c2q_attention_mult = context_hiddens*c2q_attention # (batch_size, num_keys(N), 2h)
            q2c_expanded = tf.expand_dims(q2c_attention, 1) #(bs,1,2h)
            body_q2c_attention_mult = context_hiddens*q2c_expanded # (batch_size, num_keys(N), 2h)
            blended_reps = tf.concat([c2q_attention, body_c2q_attention_mult, body_q2c_attention_mult], axis=2) #(bs,N,6h) # context_hiddens removed
            blended_reps = tf.nn.dropout(blended_reps, self.keep_prob)
        
        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size)
        
        with vs.variable_scope("ClassProb"):
            softmax_layer_class = CustomSimpleSoftmaxLayer()
            
            # Both have shape (batch_size, 4)
            self.logits_class, self.probdist_class = softmax_layer_class.build_graph(blended_reps_final, self.context_mask, self.FLAGS.reduction_type)
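
SelfAttn is also undefined here. A sketch consistent with the constructor arguments (keep_prob, z_size, vec_size) and the (distribution, output) return pair, assuming additive self-attention in the style of R-Net; masked_softmax is the helper from the BasicAttn sketch. Note the (batch, N, N) score tensor makes this memory-hungry for long contexts.

import tensorflow as tf

class SelfAttn(object):
    """Sketch: each context position attends over every context position."""

    def __init__(self, keep_prob, z_size, vec_size):
        self.keep_prob = keep_prob
        self.z_size = z_size
        self.vec_size = vec_size

    def build_graph(self, inputs, mask):
        with tf.variable_scope("SelfAttn"):
            W1 = tf.get_variable("W1", [self.vec_size, self.z_size])
            W2 = tf.get_variable("W2", [self.vec_size, self.z_size])
            v = tf.get_variable("v", [self.z_size])
            p1 = tf.tensordot(inputs, W1, axes=1)  # (batch, N, z)
            p2 = tf.tensordot(inputs, W2, axes=1)  # (batch, N, z)
            # e[b,i,j] = v . tanh(W1 x_i + W2 x_j)
            e = tf.tensordot(
                tf.tanh(tf.expand_dims(p1, 2) + tf.expand_dims(p2, 1)),
                v, axes=1)  # (batch, N, N)
            _, attn_dist = masked_softmax(e, tf.expand_dims(mask, 1), 2)
            output = tf.matmul(attn_dist, inputs)  # (batch, N, vec_size)
            return attn_dist, tf.nn.dropout(output, self.keep_prob)
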
Example #8
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
Example #9
    def __init__(self,
                 batch_size,
                 num_classes,
                 glove_path,
                 vocab_size,
                 input_dim,
                 hidden_dim,
                 embedding_dim=300,
                 dropout_rate=0.1,
                 entity_path=None,
                 rel_size=None,
                 rel_dim=None):
        super(rnet, self).__init__()
        self.batch_size = batch_size
        self.dropout_rate = dropout_rate
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        # Embedding
        self.glove_path = glove_path
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = GloveEmbedding(glove_path=self.glove_path,
                                        vocab_size=self.vocab_size,
                                        embedding_dim=self.embedding_dim)
        # These three attributes were referenced but never set in the original
        # snippet; wiring them through the constructor (see above) fixes that.
        self.entity_path = entity_path
        self.rel_size = rel_size
        self.rel_dim = rel_dim
        self.entity_embedding = EntityEmbedding(entity_path=self.entity_path,
                                                rel_size=self.rel_size,
                                                rel_dim=self.rel_dim)

        # LSTM - Encode
        self.encoder = RNNEncoder(self.batch_size,
                                  self.input_dim,
                                  self.hidden_dim,
                                  self.dropout_rate,
                                  mode='LSTM',
                                  bidirectional=True)
        # mLSTM
        self.m_lstm_attention = mLstm(2 * self.hidden_dim,
                                      self.hidden_dim,
                                      self.batch_size,
                                      IsGate=True,
                                      MatchWeightFunction='Bi')

        # selfMatching
        self.self_matching = mLstm(self.hidden_dim,
                                   self.hidden_dim,
                                   self.batch_size,
                                   IsGate=True,
                                   MatchWeightFunction='Bi')

        # fully connect
        self.num_classes = num_classes
        self.classifier = nn.Sequential(
            nn.Dropout(p=self.dropout_rate),
            nn.Linear(self.hidden_dim, self.hidden_dim), nn.Tanh(),
            nn.Dropout(p=self.dropout_rate), nn.Linear(self.hidden_dim, 1))
Example #10
    def build_graph_coattention(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.
        """

        # Use a RNN to get hidden states for the context and the question
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        _, context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
        _, question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Compute both sided attention
        coatt = Coattention()
        co_att = coatt.build_graph(self.FLAGS.batch_size, question_hiddens, context_hiddens,
                                   self.FLAGS.question_len, self.FLAGS.context_len,
                                   2 * self.FLAGS.hidden_size, self.keep_prob)
        
        co_att_final = tf.contrib.layers.fully_connected(co_att, num_outputs=self.FLAGS.hidden_size)
        # Use softmax layer to compute probability distribution for start location
        with vs.variable_scope("StartDist") as scp:
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(co_att_final, self.context_mask, True)
            scp.reuse_variables()
        # Use softmax layer to compute probability distribution for end location
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(co_att_final, self.context_mask, True)
Example #11
    def build_graph(self):
        """Builds the main part of the graph for the model.
        """
        with vs.variable_scope("context"):
            context_encoder = RNNEncoder(self.FLAGS.hidden_size,
                                         self.keep_prob)
            context_hiddens = context_encoder.build_graph(
                self.context_embs,
                self.context_mask)  # (batch_size, context_len, hidden_size*2)

        with vs.variable_scope("question"):
            question_encoder = RNNEncoder(self.FLAGS.hidden_size,
                                          self.keep_prob)
            question_hiddens = question_encoder.build_graph(
                self.qn_embs,
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)
            question_last_hidden = tf.reshape(question_hiddens[:, -1, :],
                                              (-1, 2 * self.FLAGS.hidden_size))
            question_last_hidden = tf.contrib.layers.fully_connected(
                question_last_hidden, num_outputs=self.FLAGS.hidden_size)
        # Use context hidden states to attend to question hidden states

        # attn_output is shape (batch_size, context_len, hidden_size*2)
        # The following is BiDAF attention
        if self.FLAGS.use_bidaf:
            attn_layer = BiDAF(self.keep_prob, self.FLAGS.hidden_size * 2,
                               self.FLAGS.hidden_size * 2)
            attn_output = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens,
                self.context_mask)  # (batch_size, context_len, hidden_size * 6)
        else:  # otherwise, basic attention
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                                   self.FLAGS.hidden_size * 2)
            _, attn_output = attn_layer.build_graph(question_hiddens,
                                                    self.qn_mask,
                                                    context_hiddens)
        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat(
            [context_hiddens, attn_output],
            axis=2)  # (batch_size, context_len, hidden_size*4 with BasicAttn, *8 with BiDAF)

        blended_reps_final = tf.contrib.layers.fully_connected(
            blended_reps, num_outputs=self.FLAGS.hidden_size)

        decoder = RNNDecoder(self.FLAGS.batch_size,
                             self.FLAGS.hidden_size,
                             self.ans_vocab_size,
                             self.FLAGS.answer_len,
                             self.ans_embedding_matrix,
                             self.keep_prob,
                             sampling_prob=self.sampling_prob,
                             schedule_embed=self.FLAGS.schedule_embed,
                             pred_method=self.FLAGS.pred_method)
        (self.train_logits, self.train_translations, _), \
        (self.dev_logits, self.dev_translations, self.attention_results) = decoder.build_graph(blended_reps_final, question_last_hidden,
                                                                       self.ans_embs, self.ans_mask, self.ans_ids,
                                                                       self.context_mask)
Example #12
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        blended_reps_start, blended_reps_end = self.computeBlendedReps(
            context_hiddens, question_hiddens, False, BasicAttn)
        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_start, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_end, self.context_mask)
Example #13
    def build_graph(self):
        attn_layer = DynamicAttention_Attn(self.keep_prob, self.FLAGS)
        output = attn_layer.build_graph(
            self.qn_embs, self.qn_mask, self.context_embs, self.context_mask
        )  # output is shape (batch_size, context_len, hidden_size*2)

        encoder = RNNEncoder(self.FLAGS.embedding_size * 2, self.keep_prob)
        context_hiddens = encoder.build_graph(
            output,
            self.context_mask)  # (batch_size, context_len, embedding_size*4)

        blended_reps_final = tf.contrib.layers.fully_connected(
            context_hiddens, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
Example #14
    def __init__(self, config, use_cuda):
        super(HGFU, self).__init__()
        self.config = config
        hidden_size = config['hidden_size']
        self.hidden_size = hidden_size
        self.encoder = RNNEncoder(rnn_type=config['rnn_type'],
                                  input_size=config['input_size'],
                                  hidden_size=config['hidden_size'],
                                  num_layers=config['encoder_num_layers'],
                                  dropout=config['dropout'],
                                  bidirectional=config['bidirectional'])
        self.decoder = getattr(nn, config['rnn_type'])(
            input_size=config['input_size'],
            hidden_size=config['hidden_size'],
            num_layers=config['decoder_num_layers'],
            dropout=config['dropout'])
        self.cue_decoder = getattr(nn, config['rnn_type'])(
            input_size=config['input_size'],
            hidden_size=config['hidden_size'],
            num_layers=config['decoder_num_layers'],
            dropout=config['dropout'])
        self.embedding = nn.Embedding(config['vocab_size'],
                                      config['input_size'],
                                      padding_idx=config['padding_idx'])
        # self.cue_decoder = RNNDecoder(rnn_type=config['rnn_type'],
        #                           attn_type=config['attn_type'],
        #                           input_size=config['input_size'],
        #                           hidden_size=config['hidden_size'],
        #                           num_layers=config['decoder_num_layers'],
        #                           dropout=config['dropout'])
        if config['attn_type'] != 'none':
            self.attn = GlobalAttention(hidden_size, config['attn_type'])

        self._cuda = use_cuda
        self.W1 = nn.Linear(hidden_size, hidden_size)
        self.W2 = nn.Linear(hidden_size, hidden_size)
        self.Wk = nn.Linear(hidden_size * 2, hidden_size)
        self.presm_layer = nn.Linear(hidden_size, config['vocab_size'])
        self.sm = nn.LogSoftmax(dim=-1)
        self.dropout = nn.Dropout(p=config['dropout'])
Example #15
    def build_graph(self):
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        if self.FLAGS.max_word_len:
            context_hiddens = encoder.build_graph(
                tf.concat([self.context_embs, self.context_char_hidden], 2),
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(
                tf.concat([self.qn_embs, self.qn_char_hidden], 2),
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)
        else:
            context_hiddens = encoder.build_graph(
                self.context_embs,
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(
                self.qn_embs,
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        attn_layer = BiDAF_Attn(self.keep_prob, self.FLAGS.hidden_size * 2, [
            self.FLAGS.batch_size, self.FLAGS.context_len,
            self.FLAGS.question_len
        ])
        output = attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens, self.context_mask
        )  # output is shape (batch_size, context_len, hidden_size*2)

        blended_reps_final = tf.contrib.layers.fully_connected(
            output, num_outputs=self.FLAGS.hidden_size)

        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
Example #16
File: qa_model.py Project: xuwd11/QANet
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        if self.FLAGS.cell_type in ['rnn_gru', 'rnn_lstm']:
            encoder = RNNEncoder(self.FLAGS.hidden_size,
                                 self.keep_prob,
                                 cell_type=self.FLAGS.cell_type)
            context_hiddens = encoder.build_graph(
                self.context_embs,
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(
                self.qn_embs,
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)
        elif self.FLAGS.cell_type == 'qanet':
            encoder = QAEncoder(num_blocks=self.FLAGS.emb_num_blocks, num_layers=self.FLAGS.emb_num_layers, \
                                num_heads=self.FLAGS.emb_num_heads, \
                                filters=self.FLAGS.hidden_size, kernel_size=self.FLAGS.emb_kernel_size, \
                                keep_prob=self.keep_prob, input_mapping=True)
            context_hiddens = encoder.build_graph(self.context_embs,
                                                  self.context_mask)
            question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)

        if self.FLAGS.attention == 'basic':
            # Use context hidden states to attend to question hidden states
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                                   self.FLAGS.hidden_size * 2)
            _, attn_output = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens
            )  # attn_output is shape (batch_size, context_len, hidden_size*2)

            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat(
                [context_hiddens, attn_output],
                axis=2)  # (batch_size, context_len, hidden_size*4)

        elif self.FLAGS.attention == 'bidaf':
            attn_layer = BiDAFAttn(self.keep_prob)
            blended_reps = attn_layer.build_graph(context_hiddens,
                                                  self.context_mask,
                                                  question_hiddens,
                                                  self.qn_mask)

        if self.FLAGS.modeling_layer == 'basic':
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            blended_reps_final = tf.contrib.layers.fully_connected(
                blended_reps,
                num_outputs=self.FLAGS.hidden_size,
                weights_initializer=initializer_relu()
            )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with tf.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    blended_reps_final, self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with tf.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    blended_reps_final, self.context_mask)

        elif self.FLAGS.modeling_layer == 'rnn':
            encoder_start = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, \
                                       cell_type=self.FLAGS.cell_type, name='m1')
            m1 = encoder_start.build_graph(blended_reps, self.context_mask)
            encoder_end = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, \
                                     cell_type=self.FLAGS.cell_type, name='m2')
            m2 = encoder_end.build_graph(m1, self.context_mask)
            with tf.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    tf.concat([blended_reps, m1], -1), self.context_mask)
            with tf.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    tf.concat([blended_reps, m2], -1), self.context_mask)

        elif self.FLAGS.modeling_layer == 'qanet':
            modeling_encoder = QAEncoder(num_blocks=self.FLAGS.model_num_blocks, \
                                         num_layers=self.FLAGS.model_num_layers, \
                                         num_heads=self.FLAGS.model_num_heads, \
                                         filters=self.FLAGS.hidden_size, \
                                         kernel_size=self.FLAGS.model_kernel_size, \
                                         keep_prob=self.keep_prob, input_mapping=False, \
                                         name='modeling_encoder')
            m0 = tf.layers.conv1d(blended_reps, filters=self.FLAGS.hidden_size, \
                                  kernel_size=1, padding='SAME', name='attn_mapping')
            m1 = modeling_encoder.build_graph(m0, self.context_mask)
            m2 = modeling_encoder.build_graph(m1, self.context_mask)
            m3 = modeling_encoder.build_graph(m2, self.context_mask)
            with tf.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    tf.concat([m1, m2], -1), self.context_mask)
            with tf.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    tf.concat([m1, m3], -1), self.context_mask)

        elif self.FLAGS.modeling_layer == 'qanet2':
            modeling_encoder1 = QAEncoder(num_blocks=self.FLAGS.model_num_blocks, \
                                          num_layers=self.FLAGS.model_num_layers, \
                                          num_heads=self.FLAGS.model_num_heads, \
                                          filters=self.FLAGS.hidden_size, \
                                          kernel_size=self.FLAGS.model_kernel_size, \
                                          keep_prob=self.keep_prob, input_mapping=False, \
                                          name='modeling_encoder1')
            '''
            modeling_encoder2 = QAEncoder(num_blocks=self.FLAGS.model_num_blocks, \
                                          num_layers=self.FLAGS.model_num_layers, \
                                          num_heads=self.FLAGS.model_num_heads, \
                                          filters=self.FLAGS.hidden_size, \
                                          kernel_size=self.FLAGS.model_kernel_size, \
                                          keep_prob=self.keep_prob, input_mapping=False, \
                                          name='modeling_encoder2')
            '''
            m0 = tf.layers.conv1d(blended_reps, filters=self.FLAGS.hidden_size, \
                                  kernel_size=1, padding='SAME', name='attn_mapping')
            m1 = modeling_encoder1.build_graph(m0, self.context_mask)
            m2 = modeling_encoder1.build_graph(m1, self.context_mask)
            with tf.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    tf.concat([m0, m1], -1), self.context_mask)
            with tf.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    tf.concat([m0, m2], -1), self.context_mask)
Example #17
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """
        print("Building Pointer Model")
        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size,
                             self.keep_prob,
                             num_layers=self.FLAGS.num_layers,
                             mode=self.FLAGS.rnn_cell)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states

        # BIDAF LAYER
        bidaf_layer = BidirectionAttn(self.keep_prob, self.FLAGS.hidden_size)
        _, _, bidaf_output = bidaf_layer.build_graph(question_hiddens,
                                                     self.qn_mask,
                                                     context_hiddens,
                                                     self.context_mask)
        # attn_output is shape (batch_size, context_len, hidden_size*6)
        bidaf_output = tf.concat([context_hiddens, bidaf_output],
                                 axis=2)  # bs, c_l, 8h

        #SELF ATTENTION LAYER
        self_attn_layer = SelfAttn(self.keep_prob, 8 * self.FLAGS.hidden_size,
                                   self.FLAGS.selfattn_size)
        _, self_attn_output = self_attn_layer.build_graph(
            bidaf_output,
            self.context_mask)  # batch_size, context_len, 8 * hidden_size

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat(
            [bidaf_output, self_attn_output],
            axis=2)  # (batch_size, context_len, hidden_size*16)
        self_attention_encoder = RNNEncoder(self.FLAGS.hidden_size,
                                            self.keep_prob,
                                            num_layers=self.FLAGS.num_layers,
                                            name="AttentionEncoder")
        blended_reps = self_attention_encoder.build_graph(
            blended_reps,
            self.context_mask)  # batch_size, context_len, hidden_size * 2

        # MODELING LAYER
        modeling_encoder = RNNEncoder(self.FLAGS.hidden_size,
                                      self.keep_prob,
                                      num_layers=self.FLAGS.num_layers,
                                      name="ModelingEncoder")
        modeling_output = modeling_encoder.build_graph(blended_reps,
                                                       self.context_mask)
        modeling_encoder_two = RNNEncoder(self.FLAGS.hidden_size,
                                          self.keep_prob,
                                          num_layers=self.FLAGS.num_layers,
                                          name="ModelingEncoder2")
        modeling_output_two = modeling_encoder_two.build_graph(
            modeling_output, self.context_mask)

        total_reps_start = tf.concat([blended_reps, modeling_output], axis=2)
        total_reps_end = tf.concat([blended_reps, modeling_output_two], axis=2)

        # OUTPUT LAYER
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                total_reps_start, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                total_reps_end, self.context_mask)
Example #18
# In[]
import tensorflow as tf
from tensorflow.python.ops import variable_scope as vs
from tensorflow.python.ops import embedding_ops


# In[]
with vs.variable_scope("embeddings"):
    # Note: the embedding matrix is a tf.constant which means it's not a trainable parameter
    embedding_matrix = tf.constant(emb_matrix, dtype=tf.float32, name="emb_matrix") # shape (400002, embedding_size)
    # Get the word embeddings for the context and question,
    # using the placeholders self.context_ids and self.qn_ids
    context_embs = embedding_ops.embedding_lookup(embedding_matrix, context_ids) # shape (batch_size, context_len, embedding_size)
    qn_embs = embedding_ops.embedding_lookup(embedding_matrix, qn_ids) # shape (batch_size, question_len, embedding_size)


encoder = RNNEncoder(FLAGS.hidden_size, keep_prob)
context_hiddens = encoder.build_graph(context_embs, context_mask) # (batch_size, context_len, hidden_size*2)
question_hiddens = encoder.build_graph(qn_embs, qn_mask) # (batch_size, question_len, hidden_size*2)

question_variation = tf.layers.dense(question_hiddens, question_hiddens.get_shape()[2], activation=tf.tanh)
        

# In[]

#question_length = tf.placeholder(tf.int32, (None,), name='question_length')
#document_length = tf.placeholder(tf.int32, (None,), name='paragraph_length')
question_length = tf.reduce_sum(qn_mask, axis=1) # shape (batch_size)
document_length = tf.reduce_sum(context_mask, axis=1) # shape (batch_size)

unmasked_affinity = tf.einsum('ndh,nqh->ndq', context_hiddens, question_variation)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
affinity = maybe_mask_affinity(unmasked_affinity, document_length)
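
maybe_mask_affinity is not defined in this snippet. A plausible sketch, assuming it overwrites affinity scores beyond each document's true length with a large negative value so a subsequent softmax effectively ignores pad rows:

import tensorflow as tf

def maybe_mask_affinity(affinity, sequence_length, mask_value=-1e30):
    """Sketch: mask pad rows of an affinity tensor.

    affinity: (batch, doc_len, question_len); sequence_length: (batch,) ints.
    """
    if sequence_length is None:
        return affinity
    score_mask = tf.sequence_mask(sequence_length, maxlen=tf.shape(affinity)[1])  # (batch, doc_len)
    score_mask = tf.tile(tf.expand_dims(score_mask, 2), [1, 1, tf.shape(affinity)[2]])
    return tf.where(score_mask, affinity, mask_value * tf.ones_like(affinity))
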
Example #19
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        with vs.variable_scope("Encoder"):
            # Use a RNN to get hidden states for the context and the question
            # Note: here the RNNEncoder is shared (i.e. the weights are the same)
            # between the context and the question.
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = CoAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        _, _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # shapes are U_tilde: (batch_size, context_len, 2h), H_tilde: (batch_size, context_len, 1)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output, context_hiddens * attn_output], axis=2) # (batch_size, context_len, hidden_size*6)

        with vs.variable_scope("M1_init"):
            # Bidirectional GRU M1
            modeling_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps_1_init = modeling_layer.build_graph(blended_reps, self.context_mask) # (batch_size, N, 2h)

        with vs.variable_scope("M1"):
            # Bidirectional GRU M1
            modeling_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps_1 = modeling_layer.build_graph(blended_reps_1_init, self.context_mask) # (batch_size, N, 2h)

        with vs.variable_scope("M2"):
            # Bidirectional GRU M2
            modeling_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps_2 = modeling_layer.build_graph(blended_reps_1, self.context_mask) # (batch_size, N, 2h)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(tf.concat([blended_reps, blended_reps_1], axis=2), self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(tf.concat([blended_reps, blended_reps_2], axis=2), self.context_mask)
Example #20
File: qa_model.py Project: Gregory06/SQuAD
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        with vs.variable_scope("e1c"):
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens = encoder.build_graph(
                self.context_embs,
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
        with vs.variable_scope("e1q"):
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            question_hiddens = encoder.build_graph(
                self.qn_embs,
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)
            # con_qn_hiddens = encoder.build_graph(self.con_qn_embs, self.con_qn_mask)

        # context_hiddens = con_qn_hiddens[:, :self.FLAGS.context_len, :]
        # question_hiddens = con_qn_hiddens[:, self.FLAGS.context_len:, :]
        # with vs.variable_scope("e2"):
        #     encoder1 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        #     context_hiddens = encoder1.build_graph(context_hiddens, self.context_mask) # (batch_size, context_len, hidden_size*2)
        #     question_hiddens = encoder1.build_graph(question_hiddens, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        # attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        # _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)
        with vs.variable_scope("a1"):
            attn_layer = BidirectionalAttnNew(self.keep_prob,
                                              self.FLAGS.hidden_size * 2,
                                              self.FLAGS.hidden_size * 2)
            _, attn_output = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens,
                self.context_mask
            )  # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        # blended_reps_c = tf.concat([context_hiddens, attn_output_val], axis=2) # (batch_size, context_len, hidden_size*4)
        # blended_reps_q = tf.concat([question_hiddens, attn_output_key], axis=2)

        with vs.variable_scope("e2_1c"):
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens_f = encoder.build_graph(
                attn_output,
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
        # with vs.variable_scope("a2"):
        #     attn_layer1 = BidirectionalAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        #     _, _, attn_output_val, attn_output_key = attn_layer1.build_graph(question_hiddens,
        #                                                                 self.qn_mask,
        #                                                                 context_hiddens,
        #                                                                 self.context_mask)

        blended_reps_st = tf.concat([context_hiddens_f, attn_output], axis=2)

        with vs.variable_scope("e3c"):
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens_f_end = encoder.build_graph(
                context_hiddens_f, self.context_mask)

        blended_reps_end = tf.concat([context_hiddens_f_end, attn_output],
                                     axis=2)

        # with vs.variable_scope("AnsPoiStRNN"):
        #     encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        #     start_hidden = encoder.build_graph(blended_reps, self.context_mask)
        # print "OK1"
        # with vs.variable_scope("AnsPoiStATT"):
        #     attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        #     start_att_dis,  start_att_out = attn_layer.build_graph(question_hiddens, self.qn_mask, start_hidden)
        # print start_att_dis.shape, start_att_out.shape
        # print "OK2"
        # with vs.variable_scope("AnsPoiEnRNN"):
        #     encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        #     end_hidden = encoder.build_graph(start_att_out, self.context_mask)
        # print "OK3"
        # with vs.variable_scope("AnsPoiStATT"):
        #     attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        #     end_att_dis, _ = attn_layer.build_graph(end_hidden, self.context_mask, question_hiddens)
        # print "OK4"

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final_st = tf.contrib.layers.fully_connected(
            blended_reps_st, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final_st is shape (batch_size, context_len, hidden_size)
        blended_reps_final_end = tf.contrib.layers.fully_connected(
            blended_reps_end, num_outputs=self.FLAGS.hidden_size)

        # print "###", blended_reps_final.shape
        # print start_att_dis.shape, end_att_dis.shape
        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final_st, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final_end, self.context_mask)
Example #21
File: qa_model.py  Project: raksitov/SQuAD
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.h_hidden_size,
                             self.keep_prob,
                             num_layers=self.FLAGS.h_num_layers,
                             combiner=self.FLAGS.h_combiner,
                             cell_type=self.FLAGS.h_cell_type)
        if self.FLAGS.share_encoder:
            question_hiddens, question_states_fw, question_states_bw = encoder.build_graph(
                self.qn_embs,
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)
        else:
            question_encoder = RNNEncoder(self.FLAGS.h_hidden_size,
                                          self.keep_prob,
                                          num_layers=self.FLAGS.h_num_layers,
                                          combiner=self.FLAGS.h_combiner,
                                          cell_type=self.FLAGS.h_cell_type,
                                          scope='question_encoder')
            question_hiddens, question_states_fw, question_states_bw = question_encoder.build_graph(
                self.qn_embs,
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)
        if not self.FLAGS.reuse_question_states:
            question_states_fw, question_states_bw = None, None
        context_hiddens, _, _ = encoder.build_graph(
            self.context_embs,
            self.context_mask,
            initial_states_fw=question_states_fw,
            initial_states_bw=question_states_bw
        )  # (batch_size, context_len, hidden_size*2)
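        # Note (added): with reuse_question_states set, the question encoder's final
        # forward/backward states become the context encoder's initial states, so the
        # context pass starts from a summary of the question.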

        if self.FLAGS.use_bidaf:
            attn_layer = BiDAF(self.keep_prob)
            context_att, question_att = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens,
                self.context_mask)
            blended_reps = tf.concat([
                context_hiddens, context_att, context_hiddens * context_att,
                context_hiddens * question_att
            ],
                                     axis=2)
        else:
            # Use context hidden states to attend to question hidden states
            attn_layer = BasicAttn(self.keep_prob)
            _, attn_output = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens
            )  # attn_output is shape (batch_size, context_len, hidden_size*2)
            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat(
                [context_hiddens, attn_output, context_hiddens * attn_output],
                axis=2)  # (batch_size, context_len, hidden_size*6)

        if self.FLAGS.modeling_layer_uses_rnn:
            modelling_encoder = RNNEncoder(
                self.FLAGS.h_model_size,
                self.keep_prob,
                num_layers=self.FLAGS.h_model_layers,
                combiner=self.FLAGS.h_combiner,
                cell_type=self.FLAGS.h_cell_type,
                scope='blended_reps_scope')
            blended_reps_final, model_states_fw, model_states_bw = modelling_encoder.build_graph(
                blended_reps, self.context_mask)
        else:
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            blended_reps_final = tf.contrib.layers.fully_connected(
                blended_reps, num_outputs=self.FLAGS.h_hidden_size
            )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            if self.FLAGS.use_rnn_for_ends:
                end_encoder = RNNEncoder(self.FLAGS.h_model_size,
                                         self.keep_prob,
                                         num_layers=self.FLAGS.h_model_layers,
                                         combiner=self.FLAGS.h_combiner,
                                         cell_type=self.FLAGS.h_cell_type,
                                         scope='blended_reps_final')
                blended_reps_combined = tf.concat([
                    blended_reps_final,
                    tf.expand_dims(self.probdist_start, 2)
                ], 2)
                blended_reps_final, _, _ = end_encoder.build_graph(
                    blended_reps_combined,
                    self.context_mask,
                    initial_states_fw=model_states_fw,
                    initial_states_bw=model_states_bw)
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
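
All of these variants end the same way: probdist_start and probdist_end are rows that each sum to 1 over context positions. A hypothetical post-processing helper (not part of qa_model.py; the name and the max_span_len cutoff are illustrative) that picks the answer span maximizing p_start[i] * p_end[j] with i <= j < i + max_span_len:

    import numpy as np

    def best_span(probdist_start, probdist_end, max_span_len=15):
        # probdist_start, probdist_end: 1-D arrays of length context_len,
        # e.g. one row each of the start/end distributions above.
        best_score, best = -1.0, (0, 0)
        for i in range(len(probdist_start)):
            window = probdist_end[i:i + max_span_len]
            j = i + int(np.argmax(window))           # best end for this start
            score = probdist_start[i] * probdist_end[j]
            if score > best_score:
                best_score, best = score, (i, j)
        return best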
Example #22
    def build_graph_middle(self, new_attn, attn_output, context_hiddens, question_hiddens):

        matrix_dimensions_answer = context_hiddens.get_shape().as_list()
        batch_size_answer, matrix_size_answer, hidden_size_answer = matrix_dimensions_answer

        matrix_dimensions_question = question_hiddens.get_shape().as_list()
        batch_size_question, matrix_size_question, hidden_size_question = matrix_dimensions_question

        print(matrix_dimensions_answer, matrix_dimensions_question)

        # Attention-over-attention
        print("question", question_hiddens.get_shape().as_list())
        print("paragraph", context_hiddens.get_shape().as_list())
        print("attention matrix", new_attn.get_shape().as_list())


        P2Q = tf.nn.softmax(new_attn, 1)                    # (batch, paragraph, question)
        QTilda = tf.matmul(P2Q, question_hiddens)           # (batch, paragraph, hidden*2), aligned with the paragraph

        Q2P = tf.nn.softmax(new_attn, 2)
        Q2PTranspose = tf.transpose(Q2P, perm=[0, 2, 1])
        PTilda = tf.matmul(Q2PTranspose, context_hiddens)   # (batch, question, hidden*2), aligned with the question

        print("P2Q", P2Q.get_shape().as_list())
        print("QTilda", QTilda.get_shape().as_list())
        print("Q2P", Q2P.get_shape().as_list())
        print("PTilda", PTilda.get_shape().as_list())


        # Fusion layer below
        #variable_temp = self.Fuse(QTilda,context_hiddens,"paragraphGate","paragraphMatch",context_hiddens)
        #print(variable_temp.get_shape().as_list())

        paragraphNew = self.Fuse(QTilda, context_hiddens, "paragraphGate", "paragraphMatchYOYO", context_hiddens)  # (batch, paragraph, hidden)
        paragraphNew.set_shape([None, matrix_size_answer, hidden_size_answer])
        questionNew = self.Fuse(PTilda, question_hiddens, "questionGate", "questionMatch", question_hiddens)       # (batch, question, hidden)
        questionNew.set_shape([None, matrix_size_question, hidden_size_question])

        #paragraphNew = tf.Print(paragraphNew,[tf.shape(paragraphNew)])
        #questionNew = tf.Print(questionNew,[tf.shape(questionNew)])



        print(paragraphNew)
        print(questionNew)


        #paragraphNewMask  = tf.placeholder(tf.int32, shape=[None, 1])
        #questionNewMask  = tf.placeholder(tf.int32, shape=[None, 1])

        encoder2 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        encoder2Q = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)

        context_hiddens_new = encoder2.build_graph(paragraphNew, self.context_mask, "rnnencoder2")   # (batch, paragraph, hidden*2)
        question_hiddens_new = encoder2Q.build_graph(questionNew, self.qn_mask, "rnnencoder2Q")      # (batch, question, hidden*2)

        #context_hiddens_new = paragraphNew
        #question_hiddens_new = questionNew

        #context_hiddens_new = tf.Print(context_hiddens_new,[tf.shape(context_hiddens_new)])
        #question_hiddens_new = tf.Print(question_hiddens_new,[tf.shape(question_hiddens_new)])


        print(context_hiddens_new.get_shape().as_list())

        matrix_dimensions = tf.shape(context_hiddens)
        batch_size, matrix_size, hidden_size = matrix_dimensions[0], matrix_dimensions[1], matrix_dimensions[2]



        # Second fusion layer and softmax layer: a new learnable matrix
        W1 = tf.get_variable("W1", shape=[matrix_size_answer, matrix_size_answer], trainable=True)   # (matrix_size, matrix_size)


        #paragraphNewReshape = tf.reshape(context_hiddens_new,[batch_size*matrix_size,hidden_size])
        paragraphNewTranspose = tf.transpose(context_hiddens_new, perm=[0, 2, 1])
        paragraphNewReshape = tf.reshape(paragraphNewTranspose, [batch_size * hidden_size, matrix_size])   # (B*H, P)

        paragraphTempRep = tf.matmul(paragraphNewReshape, W1)                                              # (B*H, P)
        paragraphTempRep2 = tf.reshape(paragraphTempRep, [batch_size, hidden_size, matrix_size])
        paragraphTempRep3 = tf.matmul(paragraphTempRep2, context_hiddens_new)
        paragraphTempSoftmax = tf.nn.softmax(paragraphTempRep3)                                            # (batch, hidden_size, hidden_size)

        paragraphSelfAllign = tf.matmul(paragraphTempSoftmax, tf.transpose(context_hiddens_new, perm=[0, 2, 1]))

        paragraphContextual = self.Fuse(tf.transpose(paragraphSelfAllign, perm=[0, 2, 1]), context_hiddens_new, "paragraphGate2", "paragraphMatch2", context_hiddens)  # (batch, paragraph, hidden)
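        # Note (added): W1 mixes paragraph positions, paragraphTempRep3 is a per-batch
        # (hidden x hidden) similarity map, and its softmax re-weights the transposed
        # hidden states into a self-aligned (batch, hidden, paragraph) tensor that is
        # transposed back and fused with the original encodings.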


        print(paragraphContextual.get_shape().as_list())

        #paragraphContextual = tf.Print(paragraphContextual,[tf.shape(paragraphContextual)])


        '''
        batch_size2,matrix_size2,hidden_size2 = matrix_dimensions2[0],matrix_dimensions2[1],matrix_dimensions2[2]
        matrix_dimensions2 = tf.shape(context_hiddens_new)
        questionNewReshape = tf.reshape(question_hiddens_new,[batch_size2*matrix_size2,hidden_size2])
        questionTempRep = tf.matmul(tf.matmul(questionNewReshape,W1))
        questionTempRep2 = tf.reshape(questionTempRep,[batch_size2,matrix_size2,hidden_size2])
        questionTempRep3 = tf.matmul(questionTempRep2,tf.transpose(question_hiddens_new,dim=[0,2,1]))
        questionTempSoftmax = tf.nn.softmax(questionTempRep3)

        questionSelfAllign = tf.matmul(questionTempSoftmax,question_hiddens_new)
        '''

        encoder3 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)

        #paragraphContextualMask = tf.placeholder(tf.int32, shape=[None, self.FLAGS.context_len])
        paragraphContextual.set_shape([batch_size_answer, matrix_size_answer, hidden_size_answer])

        print(batch_size_answer, matrix_size_answer, hidden_size_answer)
        print(self.context_mask.get_shape().as_list())

        #paragraphContextual = encoder3.build_graph(paragraphContextual, self.context_mask, "rnnencoder3")  # (batch, paragraph, hidden*2)


        # Code to represent the question
        matrix_dimensions2 = tf.shape(question_hiddens)
        batch_size2, matrix_size2, hidden_size2 = matrix_dimensions2[0], matrix_dimensions2[1], matrix_dimensions2[2]

        #encoder4 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        #questionSelfAllignMask = tf.placeholder(tf.int32, shape=[None, self.FLAGS.question_len])
        questionSelfAllign = question_hiddens_new
        #questionSelfAllign = encoder4.build_graph(question_hiddens_new, self.qn_mask, "rnnencoder4")  # (batch, question, hidden*2)


        Wq = tf.get_variable("Wq", shape=[1, question_hiddens.get_shape().as_list()[2]], trainable=True)   # (1, h)

        questionSelfAllignTranspose = tf.transpose(questionSelfAllign, perm=[2, 0, 1])
        questionSelfAllignReshape = tf.reshape(questionSelfAllignTranspose, [hidden_size2, matrix_size2 * batch_size2])

        GammaTemp = tf.matmul(Wq, questionSelfAllignReshape)
        GammaTemp2 = tf.reshape(GammaTemp, [batch_size2, 1, matrix_size2])
        Gamma = tf.nn.softmax(GammaTemp2)                           # (batch, 1, question)

        questionContextual = tf.matmul(Gamma, questionSelfAllign)   # (batch, 1, hidden)

        print(questionContextual.get_shape().as_list())

        # For the start point of the answer
        WeightSoftmaxStart = tf.get_variable("WeightSoftmaxStart", [question_hiddens.get_shape().as_list()[2], question_hiddens.get_shape().as_list()[2]], trainable=True)
        questionTranspose = tf.transpose(questionContextual, perm=[0, 2, 1])
        questionContextualReshape = tf.reshape(questionTranspose, [batch_size, hidden_size])
        tempMatrixMult1 = tf.matmul(questionContextualReshape, WeightSoftmaxStart)
        tempMatrixMult1Reshape = tf.reshape(tempMatrixMult1, [batch_size, 1, hidden_size])
        probStartMatrix = tf.matmul(tempMatrixMult1Reshape, tf.transpose(paragraphContextual, perm=[0, 2, 1]))   # (b, 1, n)
        '''
        paragraphContextualTranspose = tf.reshape(paragraphContextual,[batch_size*matrix_size,hidden_size])

        tempMatrixMult1 = tf.matmul(paragraphContextualTranspose,WeightSoftmaxStart)
        tempMatrixMult1Reshape = tf.reshape(tempMatrixMult1,[batch_size,matrix_size,1])

        probStartMatrix = tf.matmul(tempMatrixMult1Reshape,questionContextual) #(batch,pargraph,context)
        '''

        # For the end point of the answer
        WeightSoftmaxEnd = tf.get_variable("WeightSoftmaxEnd", [question_hiddens.get_shape().as_list()[2], question_hiddens.get_shape().as_list()[2]], trainable=True)
        tempMatrixMult2 = tf.matmul(questionContextualReshape, WeightSoftmaxEnd)
        tempMatrixMult1Reshape2 = tf.reshape(tempMatrixMult2, [batch_size, 1, hidden_size])
        probEndMatrix = tf.matmul(tempMatrixMult1Reshape2, tf.transpose(paragraphContextual, perm=[0, 2, 1]))    # (b, 1, n)


        print(probStartMatrix.get_shape().as_list())
        print(probEndMatrix.get_shape().as_list())

        probStartMatrix = tf.reshape(probStartMatrix, [batch_size, matrix_size])
        probEndMatrix = tf.reshape(probEndMatrix, [batch_size, matrix_size])

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size)

        
        return probStartMatrix, probEndMatrix, blended_reps_final
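
    # Note (added, sketch): probStartMatrix / probEndMatrix are raw, unmasked scores
    # over paragraph positions; a caller would still need to mask them, e.g.
    #   logits_start, probdist_start = masked_softmax(probStartMatrix, self.context_mask, 1)
    # using the same masked_softmax helper the dynamic-decoder branches call below.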
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.
        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        if self.FLAGS.model == "baseline" :
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        elif self.FLAGS.model == "bidaf" or self.FLAGS.model == "bidaf_dynamic" or self.FLAGS.model=="bidaf_self_attn" or self.FLAGS.model=="bidaf_dynamic_self_attn":
            print("INSIDE the BIDAF model")
            encoder = RNNEncoder_LSTM(self.FLAGS.hidden_size, self.keep_prob)
        elif self.FLAGS.model == "coatt" or self.FLAGS.model == "coatt_dynamic" or self.FLAGS.model=="coatt_dynamic_self_attn":
            encoder = LSTMEncoder(self.FLAGS.hidden_size, self.keep_prob)

        if self.FLAGS.model != "coatt" and self.FLAGS.model != "coatt_dynamic" and self.FLAGS.model!="coatt_dynamic_self_attn":
            context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Attention model
        # Use context hidden states to attend to question hidden states
        if self.FLAGS.model == "baseline" :
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
            _,attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)
            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default
            blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final,self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final,self.context_mask)

        # Attention model
        # Use context hidden states to attend to question hidden states
        if self.FLAGS.model == "coatt" :
            #context_hiddens = encoder.build_graph(self.context_embs, self.context_mask, "context") # (batch_size, context_len, hidden_size*2)
            #question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask, "question") # (batch_size, question_len, hidden_size*2)
            context_hiddens, question_hiddens = encoder.build_graph1(self.context_embs, self.qn_embs, self.context_mask, self.qn_mask)

            attn_layer = CoAttention(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
            blended_reps_final = attn_output
            #blended_reps = tf.concat([context_hiddens, attn_output], axis=2)
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default
            #blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final,self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                contextLen = tf.reduce_sum(self.context_mask, axis=1)
                cell = tf.contrib.rnn.LSTMBlockCell(2 * self.FLAGS.hidden_size)
                (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(cell, cell, attn_output, contextLen, dtype = tf.float32)
                U_1 = tf.concat([fw_out, bw_out], axis=2)
                out = tf.nn.dropout(U_1, self.keep_prob)
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(out,self.context_mask)


        elif self.FLAGS.model =="bidaf"  or self.FLAGS.model=="bidaf_self_attn":
            attn_layer = BiDafAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            attn_output_tmp = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8)
            # Set of vectors which produces a set of query aware feature vectors for each word in the context
            #blended_reps = attn_output  #(batch_size, num_keys, 4*value_vec_size)

            if self.FLAGS.model == "bidaf_self_attn":
                self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
                _, self_attn_output = self_attn_layer.build_graph(attn_output_tmp, self.context_mask)  # (batch_size, context_len, 8*hidden_size)
                attn_output = tf.concat([attn_output_tmp, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size)
            else:
                attn_output = attn_output_tmp


            # In BiDAF the attention output is fed to a modeling layer.
            # The modeling layer is a two-layer LSTM.
            mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob)
            mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask)  # (batch_size, context_len, hidden_size*2)
            blended_reps_start = tf.concat([attn_output,mod_layer_out], axis=2)  # (batch_size, context_len, hidden_size*10)


            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_start, self.context_mask)



            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)


            with vs.variable_scope("EndDist"):
                # Concatenate the start logits with the modelling layer output to get the input to the
                # end word lstm
                #self.logits_start has a shape of #(batch_size, context_len)
                logits_start_expand = tf.expand_dims(self.logits_start, axis=2) #(batch_size, context_len, 1)
                end_lstm_input = tf.concat([logits_start_expand, mod_layer_out], axis=2) #(batch_size, context_len, 1 + hidden_size*2)

                # LSTM
                end_layer = END_WORD_LAYER(self.FLAGS.hidden_size, self.keep_prob)
                blended_reps_end = end_layer.build_graph(end_lstm_input, self.context_mask)

                blended_reps_end_final = tf.concat([attn_output, blended_reps_end], axis=2)
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_end_final, self.context_mask)

        elif self.FLAGS.model =="bidaf_dynamic" or self.FLAGS.model =="bidaf_dynamic_self_attn":
            attn_layer = BiDafAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            attn_output_tmp = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8)

            if self.FLAGS.model == "bidaf_dynamic_self_attn":
                self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
                _, self_attn_output = self_attn_layer.build_graph(attn_output_tmp, self.context_mask)  # (batch_size, context_len, 8*hidden_size)
                attn_output = tf.concat([attn_output_tmp, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size)
            else:
                attn_output = attn_output_tmp

            # Set of vectors which produces a set of query aware feature vectors for each word in the context
            #blended_reps = attn_output  #(batch_size, num_keys, 4*value_vec_size)

            # In BiDAF the attention output is fed to a modeling layer.
            # The modeling layer is a two-layer LSTM.
            mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob)
            mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask)  # (batch_size, context_len, hidden_size*2)
            blended_reps_start = tf.concat([attn_output,mod_layer_out], axis=2)  # (batch_size, context_len, hidden_size*10)

            # We now feed this to the dynamic decoder module coded in ANSWER_DECODER.
            # The outputs of the decoder are start, end, alpha_logits and beta_logits.
            # start and end have shape (batch_size, num_iterations);
            # alpha_logits and beta_logits have shape (batch_size, num_iterations, input_dim).
            decoder = ANSWER_DECODER(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations, self.FLAGS.max_pool, self.FLAGS.batch_size)

            u_s_init = mod_layer_out[:,0,:]
            u_e_init = mod_layer_out[:,0,:]
            start_location, end_location, alpha_logits, beta_logits = decoder.build_graph(mod_layer_out, self.context_mask, u_s_init, u_e_init)


            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                #softmax_layer_start = SimpleSoftmaxLayer()
                logits_start_tmp = [masked_softmax(logits, self.context_mask, 1) for logits in alpha_logits]
                self.alpha_logits, alpha_logits_probs = zip(*logits_start_tmp)
                self.logits_start, self.probdist_start = self.alpha_logits[self.FLAGS.num_iterations - 1], alpha_logits_probs[self.FLAGS.num_iterations - 1]

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)


            with vs.variable_scope("EndDist"):
                logits_end_tmp = [masked_softmax(logits, self.context_mask, 1) for logits in beta_logits]
                self.beta_logits, beta_logits_probs = zip(*logits_end_tmp)
                self.logits_end, self.probdist_end = self.beta_logits[self.FLAGS.num_iterations - 1], beta_logits_probs[self.FLAGS.num_iterations - 1]

        elif self.FLAGS.model =="coatt_dynamic" or self.FLAGS.model == "coatt_dynamic_self_attn":
            context_hiddens, question_hiddens = encoder.build_graph1(self.context_embs, self.qn_embs, self.context_mask, self.qn_mask)

            attn_layer = CoAttention(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)

            if self.FLAGS.model == "coatt_dynamic_self_attn":
                CoATT = attn_layer.build_graph1(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
                self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
                _, self_attn_output = self_attn_layer.build_graph(CoATT, self.context_mask)  # (batch_size, context_len, 8*hidden_size)
                attn_output = tf.concat([CoATT, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size)
            else:
                U = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
                attn_output = U
            #blended_reps = tf.concat([context_hiddens, attn_output], axis=2)
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default
            decoder = ANSWER_DECODER(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations, self.FLAGS.max_pool, self.FLAGS.batch_size)

            u_s_init = attn_output[:,0,:]
            u_e_init = attn_output[:,0,:]
            start_location, end_location, alpha_logits, beta_logits = decoder.build_graph(attn_output, self.context_mask, u_s_init, u_e_init)


            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                #softmax_layer_start = SimpleSoftmaxLayer()
                logits_start_tmp = [masked_softmax(logits, self.context_mask, 1) for logits in alpha_logits]
                self.alpha_logits, alpha_logits_probs = zip(*logits_start_tmp)
                self.logits_start, self.probdist_start = self.alpha_logits[self.FLAGS.num_iterations - 1], alpha_logits_probs[self.FLAGS.num_iterations - 1]

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)

            with vs.variable_scope("EndDist"):
                logits_end_tmp = [masked_softmax(logits, self.context_mask, 1) for logits in beta_logits]
                self.beta_logits, beta_logits_probs = zip(*logits_end_tmp)
                self.logits_end, self.probdist_end = self.beta_logits[self.FLAGS.num_iterations - 1], beta_logits_probs[self.FLAGS.num_iterations - 1]
Example #24
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """
        if self.FLAGS.use_char_cnn:
            with vs.variable_scope('char_encoding'):
                self.context_char_encodings, self.cnn_filters1 = char_encoder2(
                    self.context_char_embs, self.FLAGS.context_len,
                    self.FLAGS.word_len, self.FLAGS.cnn_filter_width,
                    self.FLAGS.char_embedding_size, self.FLAGS.n_cnn_filters)
                #self.context_char_encodings = tf.nn.dropout(self.context_char_encodings, self.keep_prob)
                tf.get_variable_scope().reuse_variables()
                self.qn_char_encodings, self.cnn_filters2 = char_encoder2(
                    self.qn_char_embs, self.FLAGS.question_len,
                    self.FLAGS.word_len, self.FLAGS.cnn_filter_width,
                    self.FLAGS.char_embedding_size, self.FLAGS.n_cnn_filters)
                #self.qn_char_encodings = tf.nn.dropout(self.qn_char_encodings, self.keep_prob)

            joined_context_embs = tf.concat(
                [self.context_embs, self.context_char_encodings], axis=2)
            joined_qn_embs = tf.concat([self.qn_embs, self.qn_char_encodings],
                                       axis=2)
            assert joined_context_embs.shape[
                2] == self.FLAGS.embedding_size + self.FLAGS.n_cnn_filters
            assert joined_qn_embs.shape[
                2] == self.FLAGS.embedding_size + self.FLAGS.n_cnn_filters
        else:
            joined_context_embs = self.context_embs
            joined_qn_embs = self.qn_embs

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        with vs.variable_scope('embedding_layer'):
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens = encoder.build_graph(
                joined_context_embs,
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(
                joined_qn_embs,
                self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        attn_layer = BDAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                            self.FLAGS.hidden_size * 2)
        attn_output = attn_layer.build_graph(
            question_hiddens,
            self.qn_mask,
            context_hiddens,
            self.context_mask,
            q2c=self.FLAGS.use_q2c_attention
        )  # attn_output is shape (batch_size, context_len, hidden_size*6)
        blended_reps = tf.concat(
            [context_hiddens, attn_output],
            axis=2)  # (batch_size, context_len, hidden_size*8)
        # Use context hidden states to attend to question hidden states
        #attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        #_, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)
        with vs.variable_scope('layer1'):
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            layer1_reps = encoder.build_graph(blended_reps, self.context_mask)
        with vs.variable_scope('layer2'):
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            layer2_reps = encoder.build_graph(layer1_reps, self.context_mask)
        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        final_reps = tf.contrib.layers.fully_connected(
            layer2_reps, num_outputs=self.FLAGS.hidden_size
        )  # final_reps is shape (batch_size, context_len, hidden_size)
        #final_reps = layer2_reps

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_start.build_graph(
                final_reps, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            #encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            #end_hiddens = encoder.build_graph(final_reps, self.context_mask) # (batch_size, context_len, hidden_size*2)
            softmax_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_end.build_graph(
                final_reps, self.context_mask)
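
Example #24's char_encoder2 (and the hand-rolled convolution stack in Example #26 below) build word-level features out of character embeddings. A sketch of the usual pattern, convolution over the character axis followed by max-pooling; the signature mirrors the call sites above, but this is an assumption: the real char_encoder2 is not shown here (and it also returns its filters):

    import tensorflow as tf

    def char_encoder_sketch(char_embs, seq_len, word_len, filter_width,
                            char_emb_size, n_filters):
        # char_embs: (batch, seq_len, word_len, char_emb_size)
        flat = tf.reshape(char_embs, [-1, word_len, char_emb_size])
        conv = tf.layers.conv1d(flat, filters=n_filters, kernel_size=filter_width,
                                padding='same', activation=tf.nn.relu)
        pooled = tf.reduce_max(conv, axis=1)                   # max-pool over characters
        return tf.reshape(pooled, [-1, seq_len, n_filters])    # (batch, seq_len, n_filters)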
Example #25
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # R-NET: gated attention layer
        #gated_attn_layer = GatedAttn(self.keep_prob)
        #context_hiddens = gated_attn_layer.build_graph(question_hiddens, context_hiddens, self.qn_mask)

        # Use context hidden states to attend to question hidden states
        #attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        #_, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        #blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4)

        # Use bidirectional attention flow
        biDAF_layer = BidirectionalAttn(self.keep_prob,
                                        self.FLAGS.hidden_size * 2,
                                        self.FLAGS.hidden_size * 2)
        C2Q, Q2C = biDAF_layer.build_graph(question_hiddens, self.qn_mask,
                                           context_hiddens, self.context_mask)
        # Concat output_l, output_r to context_hiddens to get blended_reps
        #blended_reps = tf.concat([context_hiddens, C2Q, Q2C], axis=2) # (batch_size, context_len, hidden_size*6)
        blended_reps = tf.concat(
            [
                context_hiddens, C2Q, context_hiddens * C2Q,
                context_hiddens * Q2C
            ],
            axis=2)  # (batch_size, context_len, hidden_size*8)

        # Modeling layer for BiDAF
        biDAF_modeling_layer = ModelingForBiDAF(self.FLAGS.hidden_size,
                                                self.keep_prob)
        modeling_output = biDAF_modeling_layer.build_graph(
            blended_reps,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)

        #encoder = RNNEncoderV2(self.FLAGS.hidden_size, self.keep_prob)
        #context_fw, context_bw = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
        #question_fw, question_bw = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)
        #mpm_layer = BiMPM(self.FLAGS.hidden_size, self.keep_prob)
        #mpm_reps, _ = mpm_layer.build_graph(context_fw, context_bw, question_fw, question_bw, self.context_mask, self.qn_mask)
        #self.matching_scores = mpm_reps # (batch_size, context_len, 8*MP_dim)

        #mpm_layer = BiMPM(self.FLAGS.hidden_size*2, self.keep_prob)
        #mpm_reps, _ = mpm_layer.build_graph(context_hiddens, question_hiddens, self.context_mask, self.qn_mask)
        #self.matching_scores = mpm_reps # (batch_size, context_len, 8*MP_dim)

        # Modeling layer for mpm
        #mpm_modeling_layer = ModelingForBiDAF(self.FLAGS.hidden_size, self.keep_prob)
        #modeling_output = mpm_modeling_layer.build_graph(mpm_reps, self.context_mask) # (batch_size, context_len, hidden_size*2)

        # Coattention:
        #attn_layer = CoAttn(self.FLAGS.hidden_size*2, self.keep_prob)
        #coattn_output = attn_layer.build_graph(question_hiddens, context_hiddens, self.qn_mask, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8)

        # R-NET: self attention layer
        #self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size*4)
        #self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size*8)
        #context_hiddens = self_attn_layer.build_graph(blended_reps, self.context_mask)
        #context_hiddens = self_attn_layer.build_graph(coattn_output, self.context_mask)

        # R-NET: pointer network output layer
        #ptr_net = PointerNetwork()
        #self.logits_start, self.probdist_start, self.logits_end, self.probdist_end = \
        #ptr_net.build_graph(question_hiddens, context_hiddens, self.qn_mask, self.context_mask)
        #self.logits_start, self.probdist_start, self.logits_end, self.probdist_end = \
        #ptr_net.build_graph(question_hiddens, coattn_output, self.qn_mask, self.context_mask)

        # Dynamic Pointing Decoder output layer
        dp_decoder = DynamicPointingDecoder(self.keep_prob)
        #self.logits_start, self.probdist_start, self.logits_end, self.probdist_end = dp_decoder.build_graph(coattn_output, self.context_mask)
        #self.logits_start, self.probdist_start, self.logits_end, self.probdist_end = dp_decoder.build_graph(blended_reps, self.context_mask)
        self.logits_start, self.probdist_start, self.logits_end, self.probdist_end = dp_decoder.build_graph(
            modeling_output, self.context_mask)
        #self.logits_start, self.probdist_start, self.logits_end, self.probdist_end = dp_decoder.build_graph(mpm_reps, self.context_mask)
        """
Example #26
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """
        with tf.variable_scope('context_conv1') as scope:
            context_conv1_filter = truncated_normal_var(
                name='context_conv1_filter',
                shape=[1, 3, 50, self.FLAGS.CONV_SHAPE],
                dtype=tf.float32)
            strides = [1, 1, 1, 1]
            context_conv1 = tf.nn.conv2d(self.context_character_embs,
                                         context_conv1_filter,
                                         strides,
                                         padding='SAME')
            context_conv1_bias = zero_var(name='context_conv1_bias',
                                          shape=[self.FLAGS.CONV_SHAPE],
                                          dtype=tf.float32)
            context_conv1_add_bias = tf.nn.bias_add(context_conv1,
                                                    context_conv1_bias)
            context_relu_conv1 = tf.nn.relu(context_conv1_add_bias)
        pool_size = [1, 1, 2, 1]
        context_pool1 = tf.nn.max_pool(context_relu_conv1,
                                       ksize=pool_size,
                                       strides=pool_size,
                                       padding='SAME',
                                       name='context_pool_layer1')

        with tf.variable_scope('context_conv2') as scope:
            context_conv2_filter = truncated_normal_var(
                name='context_conv2_filter',
                shape=[1, 3, self.FLAGS.CONV_SHAPE, self.FLAGS.CONV_SHAPE],
                dtype=tf.float32)
            strides = [1, 1, 1, 1]
            context_conv2 = tf.nn.conv2d(context_pool1,
                                         context_conv2_filter,
                                         strides,
                                         padding='SAME')
            context_conv2_bias = zero_var(name='context_conv2_bias',
                                          shape=[self.FLAGS.CONV_SHAPE],
                                          dtype=tf.float32)
            context_conv2_add_bias = tf.nn.bias_add(context_conv2,
                                                    context_conv2_bias)
            context_relu_conv2 = tf.nn.relu(context_conv2_add_bias)

        pool_size = [1, 1, 3, 1]
        context_pool2 = tf.nn.max_pool(context_relu_conv2,
                                       ksize=pool_size,
                                       strides=pool_size,
                                       padding='SAME',
                                       name='context_pool_layer2')

        with tf.variable_scope('context_conv3') as scope:
            context_conv3_filter = truncated_normal_var(
                name='context_conv3_filter',
                shape=[1, 3, self.FLAGS.CONV_SHAPE, self.FLAGS.CONV_SHAPE],
                dtype=tf.float32)
            strides = [1, 1, 1, 1]
            context_conv3 = tf.nn.conv2d(context_pool2,
                                         context_conv3_filter,
                                         strides,
                                         padding='SAME')
            context_conv3_bias = zero_var(name='context_conv3_bias',
                                          shape=[self.FLAGS.CONV_SHAPE],
                                          dtype=tf.float32)
            context_conv3_add_bias = tf.nn.bias_add(context_conv3,
                                                    context_conv3_bias)
            context_relu_conv3 = tf.nn.relu(context_conv3_add_bias)
        pool_size = [1, 1, 4, 1]
        context_pool3 = tf.nn.max_pool(context_relu_conv3,
                                       ksize=pool_size,
                                       strides=pool_size,
                                       padding='SAME',
                                       name='context_pool_layer3')
        context_flattened_layer = tf.reshape(
            context_pool3,
            [-1, self.FLAGS.context_len, 2 * self.FLAGS.CONV_SHAPE
             ])  # (batch, context_len, 2*CONV_SHAPE)
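        # Note (added): this reshape assumes the three SAME-padded max-pools
        # (factors 2, 3, 4 along the word_len axis) reduce word_len to exactly 2,
        # leaving 2 * CONV_SHAPE features per context position.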
        context_final = tf.concat([self.context_embs, context_flattened_layer],
                                  axis=2)

        with tf.variable_scope('qn_conv1') as scope:
            qn_conv1_filter = truncated_normal_var(
                name='qn_conv1_filter',
                shape=[1, 3, 50, self.FLAGS.CONV_SHAPE],
                dtype=tf.float32)
            strides = [1, 1, 1, 1]
            qn_conv1 = tf.nn.conv2d(self.qn_character_embs,
                                    qn_conv1_filter,
                                    strides,
                                    padding='SAME')
            qn_conv1_bias = zero_var(name='qn_conv1_bias',
                                     shape=[self.FLAGS.CONV_SHAPE],
                                     dtype=tf.float32)
            qn_conv1_add_bias = tf.nn.bias_add(qn_conv1, qn_conv1_bias)
            qn_relu_conv1 = tf.nn.relu(qn_conv1_add_bias)
        pool_size = [1, 1, 2, 1]
        qn_pool1 = tf.nn.max_pool(qn_relu_conv1,
                                  ksize=pool_size,
                                  strides=pool_size,
                                  padding='SAME',
                                  name='qn_pool_layer1')

        with tf.variable_scope('qn_conv2') as scope:
            qn_conv2_filter = truncated_normal_var(
                name='qn_conv2_filter',
                shape=[1, 3, self.FLAGS.CONV_SHAPE, self.FLAGS.CONV_SHAPE],
                dtype=tf.float32)
            strides = [1, 1, 1, 1]
            qn_conv2 = tf.nn.conv2d(qn_pool1,
                                    qn_conv2_filter,
                                    strides,
                                    padding='SAME')
            qn_conv2_bias = zero_var(name='qn_conv2_bias',
                                     shape=[self.FLAGS.CONV_SHAPE],
                                     dtype=tf.float32)
            qn_conv2_add_bias = tf.nn.bias_add(qn_conv2, qn_conv2_bias)
            qn_relu_conv2 = tf.nn.relu(qn_conv2_add_bias)

        pool_size = [1, 1, 3, 1]
        qn_pool2 = tf.nn.max_pool(qn_relu_conv2,
                                  ksize=pool_size,
                                  strides=pool_size,
                                  padding='SAME',
                                  name='qn_pool_layer2')

        with tf.variable_scope('qn_conv3') as scope:
            qn_conv3_filter = truncated_normal_var(
                name='qn_conv3_filter',
                shape=[1, 3, self.FLAGS.CONV_SHAPE, self.FLAGS.CONV_SHAPE],
                dtype=tf.float32)
            strides = [1, 1, 1, 1]
            qn_conv3 = tf.nn.conv2d(qn_pool2,
                                    qn_conv3_filter,
                                    strides,
                                    padding='SAME')
            qn_conv3_bias = zero_var(name='qn_conv3_bias',
                                     shape=[self.FLAGS.CONV_SHAPE],
                                     dtype=tf.float32)
            qn_conv3_add_bias = tf.nn.bias_add(qn_conv3, qn_conv3_bias)
            qn_relu_conv3 = tf.nn.relu(qn_conv3_add_bias)
        pool_size = [1, 1, 3, 1]
        qn_pool3 = tf.nn.max_pool(qn_relu_conv3,
                                  ksize=pool_size,
                                  strides=pool_size,
                                  padding='SAME',
                                  name='qn_pool_layer3')
        qn_flattened_layer = tf.reshape(
            qn_pool3, [-1, self.FLAGS.question_len, 2 * self.FLAGS.CONV_SHAPE
                       ])  # (batch, question_len, 2*CONV_SHAPE)
        qn_final = tf.concat([self.qn_embs, qn_flattened_layer], axis=2)

        # Use a shared RNNEncoder (i.e. the same weights) for both the context
        # and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)

        print("context_final final shape %s" % (context_final.get_shape()))
        print("context_mask final shape %s" % (self.context_mask.get_shape()))
        print("qn_final final shape %s" % (qn_final.get_shape()))
        print("qn_mask final shape %s" % (self.qn_mask.get_shape()))
        context_hiddens = encoder.build_graph(
            context_final,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            qn_final,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = BiDAFAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                               self.FLAGS.hidden_size * 2)

        attn_output = attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens, self.context_mask
        )  # attn_output is shape (batch_size, context_len, hidden_size*8)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat(
            [context_hiddens, attn_output],
            axis=2)  # (batch_size, context_len, hidden_size*10)
        model_encoder1 = RNNModelEncoder(self.FLAGS.hidden_size,
                                         self.keep_prob,
                                         model_name="RNNModelEncoder1")
        blended_reps_thro_model_layer1 = model_encoder1.build_graph(
            blended_reps,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)

        model_encoder2 = RNNModelEncoder(self.FLAGS.hidden_size,
                                         self.keep_prob,
                                         model_name="RNNModelEncoder2")
        blended_reps_thro_model_layer2 = model_encoder2.build_graph(
            blended_reps_thro_model_layer1,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)

        model_encoder3 = RNNModelEncoder(self.FLAGS.hidden_size,
                                         self.keep_prob,
                                         model_name="RNNModelEncoder3")
        blended_reps_thro_model_layer3 = model_encoder3.build_graph(
            blended_reps_thro_model_layer2,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(
            blended_reps_thro_model_layer3, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
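Note: every SimpleSoftmaxLayer call in these examples relies on the same masking convention described in the docstrings: pad positions get a large negative logit so the softmax assigns them near-zero probability. A minimal sketch of that convention (an assumption about the helper, not the repo's actual SimpleSoftmaxLayer):

import tensorflow as tf

def masked_softmax(logits, mask, axis=-1):
    # mask is 1 at real tokens, 0 at padding; same shape as logits.
    exp_mask = (1.0 - tf.cast(mask, tf.float32)) * -1e30  # -large at pad positions
    masked_logits = logits + exp_mask                     # stays -large where padded
    prob_dist = tf.nn.softmax(masked_logits, axis=axis)
    return masked_logits, prob_dist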
Example #27
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        if self.FLAGS.self_attention:
            encoder = RNNEncoder(self.FLAGS.hidden_size_encoder,
                                 self.keep_prob)
        else:
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)

        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        if self.FLAGS.simple_attention:
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                                   self.FLAGS.hidden_size * 2)
            _, attn_output = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens
            )  # attn_output is shape (batch_size, context_len, hidden_size*2)

            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat(
                [context_hiddens, attn_output],
                axis=2)  # (batch_size, context_len, hidden_size*4)

            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            blended_reps_final = tf.contrib.layers.fully_connected(
                blended_reps, num_outputs=self.FLAGS.hidden_size
            )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        if self.FLAGS.co_attention:
            # Send the question hidden states through a fully-connected layer to allow for variation between the question and document embedding spaces
            question_hiddens_t = tf.transpose(
                question_hiddens,
                perm=[0, 2, 1])  #(batch_size,hidden_size*2,question_len)
            trans_question_hiddens_t = tf.contrib.layers.fully_connected(
                question_hiddens_t,
                num_outputs=self.FLAGS.question_len,
                activation_fn=tf.nn.tanh
            )  #(batch_size,hidden_size*2,question_len)
            trans_question_hiddens = tf.transpose(
                trans_question_hiddens_t,
                perm=[0, 2, 1])  #(batch_size,question_len,hidden_size*2)

            #Computing the coattention context
            co_attn_layer = CoAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                                   self.FLAGS.hidden_size * 2)
            co_attn_output = co_attn_layer.build_graph(
                trans_question_hiddens, self.qn_mask, self.context_mask,
                context_hiddens)  #(batch_size,context_len,6*hidden_size)

            # Fuse temporal information into the coattention context via a bidirectional LSTM
            with tf.variable_scope("co-attn-encoder"):
                co_attn_encoder = LSTMEncoder(self.FLAGS.hidden_size,
                                              self.keep_prob)
                blended_reps_final = co_attn_encoder.build_graph(
                    co_attn_output, self.context_mask)

        if self.FLAGS.self_attention:
            # Implementation of self-attention from the R-Net paper

            self_attention_encoder = SelfAttn(self.FLAGS.hidden_size_encoder,
                                              self.FLAGS.hidden_size_qp,
                                              self.FLAGS.hidden_size_pp,
                                              self.keep_prob)
            v_p = self_attention_encoder.build_graph_qp(
                context_hiddens, question_hiddens, self.context_mask,
                self.qn_mask, self.FLAGS.context_len, self.FLAGS.question_len)
            h_p = self_attention_encoder.build_graph_pp(
                context_hiddens, question_hiddens, self.context_mask,
                self.qn_mask, v_p, self.FLAGS.context_len,
                self.FLAGS.question_len)
            blended_reps_final = tf.concat(
                [context_hiddens, v_p, h_p], axis=2
            )  # (batch_size, context_len, 2*hidden_size_encoder + hidden_size_qp + 2*hidden_size_pp)

        if self.FLAGS.answer_pointer:
            # Implementation of the answer pointer as used in the R-Net paper
            if self.FLAGS.co_attention:
                hidden_size_attn = self.FLAGS.hidden_size * 2
            elif self.FLAGS.self_attention:
                hidden_size_attn = 2 * self.FLAGS.hidden_size_encoder + self.FLAGS.hidden_size_qp + 2 * self.FLAGS.hidden_size_pp
            else:
                hidden_size_attn = self.FLAGS.hidden_size

            answer_decoder = AnswerPointer(self.FLAGS.hidden_size_encoder,
                                           hidden_size_attn,
                                           self.FLAGS.question_len,
                                           self.keep_prob)
            p, logits = answer_decoder.build_graph_answer_pointer(
                question_hiddens, context_hiddens, blended_reps_final,
                self.FLAGS.question_len, self.FLAGS.context_len, self.qn_mask,
                self.context_mask)

            self.logits_start = logits[0]
            self.probdist_start = p[0]

            self.logits_end = logits[1]
            self.probdist_end = p[1]

        if self.FLAGS.simple_softmax:
            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    blended_reps_final, self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    blended_reps_final, self.context_mask)
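Example #27 above selects among its architectures with mutually exclusive boolean flags. A hypothetical flag setup (assumed for illustration; the project's actual config file is not shown):

import tensorflow as tf

# Hypothetical definitions: exactly one attention branch plus one output head
# is expected to be enabled at a time.
tf.app.flags.DEFINE_boolean("simple_attention", False, "BasicAttn + FC blend")
tf.app.flags.DEFINE_boolean("co_attention", False, "CoAttn + LSTM fusion")
tf.app.flags.DEFINE_boolean("self_attention", True, "R-Net style self-attention")
tf.app.flags.DEFINE_boolean("answer_pointer", True, "R-Net answer pointer head")
tf.app.flags.DEFINE_boolean("simple_softmax", False, "SimpleSoftmaxLayer head")
FLAGS = tf.app.flags.FLAGS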
Example #28
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """
        # character-level CNN to get hybrid word embeddings
        charCnn = CharCNN(self.FLAGS.word_len, self.FLAGS.char_embedding_size,
                          self.FLAGS.num_filters, self.FLAGS.kernel_size)
        # (batch_size, context_len, num_filters)
        char_context_hiddens = charCnn.build_graph(self.char_context_embs,
                                                   self.char_context_mask,
                                                   self.FLAGS.context_len)
        # (batch_size, question_len, num_filters)
        char_qn_hiddens = charCnn.build_graph(self.char_qn_embs,
                                              self.char_qn_mask,
                                              self.FLAGS.question_len)

        # hybrid word embeddings
        hybrid_context_embs = tf.concat(
            [self.context_embs, char_context_hiddens],
            axis=-1)  # (batch_size, context_len, emb_size+char_emb_size)
        hybrid_qn_embs = tf.concat(
            [self.qn_embs, char_qn_hiddens],
            axis=-1)  # (batch_size, question_len, emb_size+char_emb_size)

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, "GRU")
        context_hiddens = encoder.build_graph(
            hybrid_context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            hybrid_qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # coattention has been the best attention model I've found
        attn_layer = CoAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                            self.FLAGS.hidden_size * 2)
        u = attn_layer.build_graph(
            question_hiddens, self.qn_mask,
            context_hiddens)  # shape (batch_size, context_len, 8*hidden_size)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(
            u, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
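The CharCNN above is only called, never defined, in this excerpt. A minimal sketch of such a layer (names and shapes inferred from the call sites; not the project's actual CharCNN), mapping (batch, seq_len, word_len, char_emb_size) to (batch, seq_len, num_filters):

import tensorflow as tf

def char_cnn(char_embs, num_filters, kernel_size, seq_len):
    # Assumes word_len and char_emb_size are statically known.
    word_len = char_embs.get_shape().as_list()[2]
    char_dim = char_embs.get_shape().as_list()[3]
    batch = tf.shape(char_embs)[0]
    # Fold the sequence dimension into the batch so conv1d sees one word at a time.
    flat = tf.reshape(char_embs, [-1, word_len, char_dim])
    conv = tf.layers.conv1d(flat, num_filters, kernel_size,
                            padding="same", activation=tf.nn.relu)
    pooled = tf.reduce_max(conv, axis=1)  # max-over-time pooling per word
    return tf.reshape(pooled, [batch, seq_len, num_filters])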
Example #29
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)

        context_embs = self.context_embs
        qn_embs = self.qn_embs
        if self.FLAGS.enable_cnn:
            context_embs = tf.concat(
                [self.context_embs, self.context_char_embs], axis=2)
            qn_embs = tf.concat([self.qn_embs, self.qn_char_embs], axis=2)

        context_hiddens = encoder.build_graph(
            context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Encode query-aware representations of the context words
        bidaf_attn_layer = BidafAttn(self.keep_prob, self.FLAGS.context_len,
                                     self.FLAGS.hidden_size * 2)
        bidaf_out = bidaf_attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens,
            self.context_mask)  # (batch_size, context_len, hidden_size*8)

        # Condense the information: hidden_size*8 --> hidden_size*2
        bidaf_out = tf.contrib.layers.fully_connected(
            bidaf_out,
            num_outputs=self.FLAGS.hidden_size * 2,
            normalizer_fn=tf.contrib.layers.batch_norm
        )  # (batch_size, context_len, hidden_size*2)

        # Co-attention
        co_attn_layer = CoAttnLite(self.keep_prob, self.FLAGS.hidden_size,
                                   self.FLAGS.hidden_size * 2)
        co_out = co_attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)

        bico_out = tf.concat([bidaf_out, co_out],
                             2)  # (batch_size, context_len, hidden_size*4)

        # Capture interactions among context words conditioned on the query.
        gru_layer1 = RNNEncoder(
            self.FLAGS.hidden_size, self.keep_prob
        )  # params: (hidden_size*4 + hidden_size) * hidden_size * 2 * 3
        model_reps1 = gru_layer1.build_graph(
            bico_out, self.context_mask, variable_scope='ModelGRU1'
        )  # (batch_size, context_len, hidden_size*2)

        gru_layer2 = RNNEncoder(
            self.FLAGS.hidden_size, self.keep_prob
        )  # params: (2*hidden_size + hidden_size) * hidden_size * 2 * 3
        model_reps2 = gru_layer2.build_graph(
            model_reps1, self.context_mask, variable_scope='ModelGRU2'
        )  # (batch_size, context_len, hidden_size*2)

        # Self Attention & GRU layer parallel to GRU layer2.
        with tf.variable_scope('SelfAttnGRU'):
            self_attn_layer = MulAttn(self.keep_prob,
                                      self.FLAGS.hidden_size * 2,
                                      self.FLAGS.hidden_size * 2)
            se_attn = self_attn_layer.build_graph(
                model_reps1, self.context_mask, model_reps1,
                self.context_mask)  # (batch_size, context_len, hidden_size*2)
            se_gru_layer = RNNEncoder(
                self.FLAGS.hidden_size, self.keep_prob
            )  # params: (2*hidden_size + hidden_size) * hidden_size * 2 * 3
            se_out = se_gru_layer.build_graph(
                se_attn, self.context_mask, variable_scope='SelfGRU'
            )  # (batch_size, context_len, hidden_size*2)

        model_reps = tf.concat([model_reps2, se_out],
                               2)  # (batch_size, context_len, hidden_size*4)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            start_reps = tf.concat(
                [bico_out, model_reps],
                2)  # (batch_size, context_len, hidden_size*8)
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                start_reps, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            gru_end_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            model_end_reps = gru_end_layer.build_graph(
                model_reps, self.context_mask, variable_scope='EndGRU'
            )  # (batch_size, context_len, hidden_size*2)
            end_reps = tf.concat(
                [bico_out, model_end_reps],
                2)  # (batch_size, context_len, hidden_size*6)

            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                end_reps, self.context_mask)

        for variable in tf.trainable_variables():
            tf.summary.histogram(variable.name.replace(':', '/'), variable)
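MulAttn above is used as self-attention over the first modeling layer. A minimal sketch of multiplicative (bilinear) self-attention under that assumption (not the project's actual MulAttn):

import tensorflow as tf

def mul_self_attn(reps, mask, hidden_size):
    # reps: (batch, L, hidden_size); mask: (batch, L), 1 at real tokens.
    W = tf.get_variable("W_mul", shape=[hidden_size, hidden_size])
    proj = tf.tensordot(reps, W, axes=[[2], [0]])     # (batch, L, hidden_size)
    scores = tf.matmul(proj, reps, transpose_b=True)  # (batch, L, L)
    pad = (1.0 - tf.cast(tf.expand_dims(mask, 1), tf.float32)) * -1e30
    weights = tf.nn.softmax(scores + pad, axis=-1)    # attend over key positions
    return tf.matmul(weights, reps)                   # (batch, L, hidden_size)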
Example #30
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.

        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        question_variation = tf.layers.dense(question_hiddens,
                                             question_hiddens.get_shape()[2],
                                             activation=tf.tanh)

        unmasked_affinity = tf.einsum(
            'ndh,nqh->ndq', context_hiddens,
            question_variation)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
        affinity = maybe_mask_affinity(unmasked_affinity, self.document_length)
        attention_p = tf.nn.softmax(affinity, axis=1)
        unmasked_affinity_t = tf.transpose(
            unmasked_affinity,
            [0, 2, 1])  # [N, Q, D] or [N, 1+Q, 1+D] if sentinel
        affinity_t = maybe_mask_affinity(unmasked_affinity_t,
                                         self.question_length)
        attention_q = tf.nn.softmax(affinity_t, axis=1)
        summary_q = tf.einsum(
            'ndh,ndq->nqh', context_hiddens,
            attention_p)  # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel
        summary_d = tf.einsum(
            'nqh,nqd->ndh', question_variation,
            attention_q)  # [N, D, 2H] or [N, 1+D, 2H] if sentinel
        coattention_d = tf.einsum('nqh,nqd->ndh', summary_q, attention_q)

        encoder1 = RNNEncoder1(self.FLAGS.hidden_size, self.keep_prob)
        context2 = encoder1.build_graph(
            summary_d,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question2 = encoder1.build_graph(
            summary_q,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        unmasked_affinity1 = tf.einsum(
            'ndh,nqh->ndq', context2,
            question2)  # [N, D, Q] or [N, 1+D, 1+Q] if sentinel
        affinity1 = maybe_mask_affinity(unmasked_affinity1,
                                        self.document_length)
        attention_p1 = tf.nn.softmax(affinity1, axis=1)
        unmasked_affinity_t1 = tf.transpose(
            unmasked_affinity1,
            [0, 2, 1])  # [N, Q, D] or [N, 1+Q, 1+D] if sentinel
        affinity_t1 = maybe_mask_affinity(unmasked_affinity_t1,
                                          self.question_length)
        attention_q1 = tf.nn.softmax(affinity_t1, axis=1)
        summary_q1 = tf.einsum(
            'ndh,ndq->nqh', context2,
            attention_p1)  # [N, Q, 2H] or [N, 1+Q, 2H] if sentinel
        summary_d1 = tf.einsum(
            'nqh,nqd->ndh', question2,
            attention_q1)  # [N, D, 2H] or [N, 1+D, 2H] if sentinel
        coattention_d1 = tf.einsum('nqh,nqd->ndh', summary_q1, attention_q1)

        document_representations = [
            context_hiddens,  # E^D_1
            context2,  # E^D_2
            summary_d,  # S^D_1
            summary_d1,  # S^D_2
            coattention_d,  # C^D_1
            coattention_d1,  # C^D_2
        ]

        document_representation = tf.concat(document_representations, 2)
        encoder2 = RNNEncoder2(self.FLAGS.hidden_size, self.keep_prob)
        U = encoder2.build_graph(document_representation, self.context_mask)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat(
            [context_hiddens, U],
            axis=2)  # (batch_size, context_len, hidden_size*4)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(
            blended_reps, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
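maybe_mask_affinity is called throughout Example #30 but never defined here. A minimal sketch inferred from its call sites (an assumption, not the actual helper): positions beyond each sequence's true length get a large negative score so the following softmax ignores them.

import tensorflow as tf

def maybe_mask_affinity(affinity, sequence_length, mask_value=-1e30):
    # affinity: (batch, rows, cols); sequence_length: (batch,) true row lengths.
    if sequence_length is None:
        return affinity
    row_mask = tf.sequence_mask(sequence_length, maxlen=tf.shape(affinity)[1])
    row_mask = tf.tile(tf.expand_dims(row_mask, 2),
                       [1, 1, tf.shape(affinity)[2]])
    return tf.where(row_mask, affinity, mask_value * tf.ones_like(affinity))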
Example #31
    def build_graph(self):
        """
        Builds the main part of the graph for the model
        
         Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # NOTE CHANGE: concatenate the GloVe and ELMo embeddings.
        # How to handle the mismatch between the ELMo and GloVe context_len?
        # Just build the context_ids with no max context_len.
        context_embs_concat = tf.concat(
            [self.elmo_context_input, self.context_embs],
            2)  #(batch_size, context_len, 1024+self.FLAGS.embedding_size)
        qn_embs_concat = tf.concat(
            [self.elmo_question_input, self.qn_embs],
            2)  #(batch_size, qn_len, 1024+self.FLAGS.embedding_size)

        # Set static shapes so the tensors can be passed to the dynamic LSTM
        context_embs_concat.set_shape(
            (None, None, 1024 + self.FLAGS.embedding_size))
        qn_embs_concat.set_shape(
            (None, None, 1024 + self.FLAGS.embedding_size))
        self.qn_mask.set_shape((None, None))
        self.context_mask.set_shape((None, None))

        with tf.variable_scope("biLSTM"):
            Encoder = RNNEncoder(self.FLAGS.hidden_size,
                                 keep_prob=self.keep_prob,
                                 cell_type="lstm",
                                 input_size=1024 + self.FLAGS.embedding_size)
            #shared weights (same scope)
            context_hiddens = Encoder.build_graph(
                context_embs_concat,
                self.context_mask,
                scope="context_question_encoder"
            )  #(batch_size, context_len, hidden_size*2)
            question_hiddens = Encoder.build_graph(
                qn_embs_concat, self.qn_mask, scope="context_question_encoder"
            )  #(batch_size, question_len, hidden_size*2)

        with tf.variable_scope("bidaf"):
            bidaf_object = Bidaf(self.FLAGS.hidden_size * 2, self.keep_prob)
            b = bidaf_object.build_graph(
                context_hiddens, question_hiddens, self.context_mask,
                self.qn_mask)  #(batch_size, context_len, hidden_size*8)

        with tf.variable_scope("self_attn_layer"):
            SelfAttn_object = SelfAttn(self.FLAGS.hidden_size,
                                       self.FLAGS.hidden_size * 2,
                                       self.keep_prob,
                                       input_size=self.FLAGS.hidden_size * 2)
            M = SelfAttn_object.build_graph(
                b, self.context_mask,
                cell_type="lstm")  #(batch_size, context_len, hidden_size*2)

        #Make prediction
        with tf.variable_scope('prediction_layer'):
            #Encode the self-attended context first
            with tf.variable_scope("final_lstm_layer"):
                final_lstm_object = RNNEncoder(
                    self.FLAGS.hidden_size,
                    keep_prob=self.keep_prob,
                    cell_type="lstm",
                    input_size=self.FLAGS.hidden_size * 2)
                M_prime = final_lstm_object.build_graph(
                    M, self.context_mask,
                    scope="final_lstm")  #(batch_size, context_len, h*2)

            #Get start distribution
            with tf.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    M_prime,
                    self.context_mask)  #both are (batch_size, context_len)

            with tf.variable_scope("EndDist"):
                logit_start_expand = tf.expand_dims(
                    self.logits_start, axis=2)  #(batch_size, context_len, 1)
                blended_end_rnn_input = tf.concat(
                    [logit_start_expand, M_prime],
                    axis=2)  #(batch_size, context_len, hidden_size*2+1)
                end_dist_rnn = RNNEncoder(self.FLAGS.hidden_size,
                                          keep_prob=self.keep_prob,
                                          direction="unidirectional")
                end_rnn_output = end_dist_rnn.build_graph(
                    blended_end_rnn_input,
                    self.context_mask,
                    scope="end_dist_rnn")

                # Get the end dist
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    end_rnn_output, self.context_mask)
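None of these examples show span decoding. A hedged sketch of the usual final step (max_answer_len is an assumed parameter, not from the source): pick the start <= end pair that maximizes probdist_start[i] * probdist_end[j] within a maximum answer length.

def best_span(p_start, p_end, max_answer_len=15):
    # p_start, p_end: per-position probabilities for one example.
    best_ij, best_score = (0, 0), 0.0
    for i in range(len(p_start)):
        for j in range(i, min(i + max_answer_len, len(p_end))):
            score = p_start[i] * p_end[j]
            if score > best_score:
                best_ij, best_score = (i, j), score
    return best_ij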