def build_graph(self):
    """Assemble the model graph from the input embeddings to the span distributions.

    Sets:
      self.logits_start, self.logits_end: logits returned by the two
        SimpleSoftmaxLayer heads. The original notes describe them as shape
        (batch_size, context_len) with -large values at pad positions, which
        is what the cross-entropy loss expects.
      self.probdist_start, self.probdist_end: the distributions returned
        together with those logits (described as rows summing to 1).
    """
    hidden = self.FLAGS.hidden_size

    # A single encoder object processes both sequences (context first), so
    # the context and the question go through the same RNNEncoder instance.
    shared_encoder = RNNEncoder(hidden, self.keep_prob)
    ctx_states = shared_encoder.build_graph(self.context_embs, self.context_mask)
    qn_states = shared_encoder.build_graph(self.qn_embs, self.qn_mask)

    # Context states attend over the question states; the first return
    # value of the attention layer is not needed here.
    attention = BasicAttn(self.keep_prob, hidden * 2, hidden * 2)
    _, attended = attention.build_graph(qn_states, self.qn_mask, ctx_states)

    # Join every context state with its attention output on the feature axis.
    merged = tf.concat([ctx_states, attended], axis=2)

    # Position-wise dense projection down to hidden_size features;
    # tf.contrib.layers.fully_connected applies a ReLU by default.
    projected = tf.contrib.layers.fully_connected(merged, num_outputs=hidden)

    # Start-of-span head.
    with vs.variable_scope("StartDist"):
        self.logits_start, self.probdist_start = SimpleSoftmaxLayer().build_graph(
            projected, self.context_mask)

    # End-of-span head: same input, separate variable scope.
    with vs.variable_scope("EndDist"):
        self.logits_end, self.probdist_end = SimpleSoftmaxLayer().build_graph(
            projected, self.context_mask)
def build_graph(self):
    """Build the graph from the input embeddings to the answer-span distributions.

    Sets self.logits_start / self.probdist_start and
    self.logits_end / self.probdist_end from two SimpleSoftmaxLayer heads.
    """
    hidden = self.FLAGS.hidden_size

    # In this variant the encoder returns a pair; only its second element
    # (the hidden states) is used. Context is encoded before the question.
    rnn = RNNEncoder(hidden, self.keep_prob)
    _, ctx_states = rnn.build_graph(self.context_embs, self.context_mask)
    _, qn_states = rnn.build_graph(self.qn_embs, self.qn_mask)

    # Context states attend over the question states.
    attention = BasicAttn(self.keep_prob, hidden * 2, hidden * 2)
    _, attended = attention.build_graph(qn_states, self.qn_mask, ctx_states)

    # Concatenate along the feature axis, then project to hidden_size.
    merged = tf.concat([ctx_states, attended], axis=2)
    projected = tf.contrib.layers.fully_connected(merged, num_outputs=hidden)

    # Both heads pass False as the third positional argument of the softmax
    # layer (its meaning is defined by SimpleSoftmaxLayer, not shown here).
    with vs.variable_scope("StartDist"):
        self.logits_start, self.probdist_start = SimpleSoftmaxLayer().build_graph(
            projected, self.context_mask, False)

    with vs.variable_scope("EndDist"):
        self.logits_end, self.probdist_end = SimpleSoftmaxLayer().build_graph(
            projected, self.context_mask, False)
def build_graph(self):
    """Build the graph: shared encoder, basic attention, R-Net attention, heads.

    Sets self.logits_start / self.probdist_start and
    self.logits_end / self.probdist_end.
    """
    hidden = self.FLAGS.hidden_size

    # One encoder instance handles both the context and the question.
    shared_encoder = RNNEncoder(hidden, self.keep_prob)
    ctx_states = shared_encoder.build_graph(self.context_embs, self.context_mask)
    qn_states = shared_encoder.build_graph(self.qn_embs, self.qn_mask)

    # First attention stage: context states attend over the question states.
    basic_attention = BasicAttn(self.keep_prob, hidden * 2)
    _, attended = basic_attention.build_graph(qn_states, self.qn_mask, ctx_states)

    # Second attention stage consumes the first stage's output together
    # with the context mask.
    rnet_attention = R_Net_Attn(self.keep_prob, hidden * 2, self.FLAGS)
    rnet_output = rnet_attention.build_graph(attended, self.context_mask)

    # Concatenate both attention outputs on the last axis and project down
    # to hidden_size (fully_connected applies a ReLU by default).
    stacked = tf.concat([attended, rnet_output], 2)
    projected = tf.contrib.layers.fully_connected(stacked, num_outputs=hidden)

    with vs.variable_scope("StartDist"):
        self.logits_start, self.probdist_start = SimpleSoftmaxLayer().build_graph(
            projected, self.context_mask)

    with vs.variable_scope("EndDist"):
        self.logits_end, self.probdist_end = SimpleSoftmaxLayer().build_graph(
            projected, self.context_mask)
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the
    input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape
        (batch_size, context_len). These are the logits (i.e. values that
        are fed into the softmax function) for the start and end
        distribution. Important: these are -large in the pad locations.
        Necessary for when we feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape
        (batch_size, context_len). Each row sums to 1. These are the result
        of taking (masked) softmax of logits_start and logits_end.
    """
    # Use a RNN to get hidden states for the context and the question.
    # The same RNNEncoder object is used for both sequences.
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)

    # Debug output of the embedding shape. Written in call form so the
    # module parses on Python 3 as well; with a single argument the output
    # is the same as the old print statement on Python 2.
    print(self.context_embs.shape)

    context_hiddens = encoder.build_graph(
        self.context_embs, self.context_mask)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)

    # Attention-flow layer between the context and question hidden states.
    biattn_layer = AttentionFlowLayer(self.keep_prob, self.FLAGS.l2_lambda)
    biattn_output = biattn_layer.build_graph(
        context_hiddens, self.context_mask,
        question_hiddens, self.qn_mask,
        scope="AttnFlow")

    # Modelling layer applied on top of the attention output.
    model_layer = Model_Layer(self.FLAGS.hidden_size, self.keep_prob)
    model_output = model_layer.build_graph(biattn_output, self.context_mask)

    # Apply a fully connected layer to each position.
    # Note: tf.contrib.layers.fully_connected applies a ReLU by default.
    blended_reps_final = tf.contrib.layers.fully_connected(
        model_output, num_outputs=self.FLAGS.hidden_size)

    # Start-position head. The context mask is handed to the softmax layer;
    # per the docstring above, the logits are -large at pad locations.
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
            blended_reps_final, self.context_mask)

    # End-position head: same input, separate variable scope.
    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
            blended_reps_final, self.context_mask)
def build_graph(self):
    """Build the bidirectional-attention graph up to the span distributions.

    Sets:
      self.logits_start, self.logits_end: logits from the two softmax heads
        (described in the original notes as shape (batch_size, context_len),
        -large at pad positions).
      self.probdist_start, self.probdist_end: the matching distributions.
      self.c2q_attn_dist: first value returned by the attention layer
        (described as the context-to-question attention distribution).
      self.q2c_attn_dist: second value returned by the attention layer
        (described as the question-to-context attention distribution).
    """
    print("Building BIDAF")

    flags = self.FLAGS

    # One encoder object, configured from the flags, encodes the context
    # first and then the question.
    shared_encoder = RNNEncoder(
        flags.hidden_size,
        self.keep_prob,
        num_layers=flags.num_layers,
        mode=flags.rnn_cell)
    ctx_states = shared_encoder.build_graph(self.context_embs, self.context_mask)
    qn_states = shared_encoder.build_graph(self.qn_embs, self.qn_mask)

    # Bidirectional attention; both attention distributions are stored on
    # the model in addition to the attention output.
    attention = BidirectionAttn(self.keep_prob, flags.hidden_size)
    attention_result = attention.build_graph(
        qn_states, self.qn_mask, ctx_states, self.context_mask)
    self.c2q_attn_dist, self.q2c_attn_dist, attended = attention_result

    # Join the context states with the attention output on the feature axis.
    merged = tf.concat([ctx_states, attended], axis=2)

    # Position-wise projection to hidden_size (ReLU by default).
    projected = tf.contrib.layers.fully_connected(
        merged, num_outputs=flags.hidden_size)

    with vs.variable_scope("StartDist"):
        self.logits_start, self.probdist_start = SimpleSoftmaxLayer().build_graph(
            projected, self.context_mask)

    with vs.variable_scope("EndDist"):
        self.logits_end, self.probdist_end = SimpleSoftmaxLayer().build_graph(
            projected, self.context_mask)
def build_graph(self):
    """Build the co-attention graph with three stacked RNN modelling layers.

    Sets self.logits_start / self.probdist_start and
    self.logits_end / self.probdist_end. The original notes describe all
    four as shape (batch_size, context_len), with the logits -large at pad
    positions.
    """
    hidden = self.FLAGS.hidden_size
    keep = self.keep_prob
    ctx_mask = self.context_mask

    with vs.variable_scope("Encoder"):
        # The same encoder object processes the context, then the question.
        shared_encoder = RNNEncoder(hidden, keep)
        ctx_states = shared_encoder.build_graph(self.context_embs, ctx_mask)
        qn_states = shared_encoder.build_graph(self.qn_embs, self.qn_mask)

    # Co-attention between question and context; only the third return
    # value is used downstream.
    coattention = CoAttn(keep, hidden * 2, hidden * 2)
    _, _, attended = coattention.build_graph(
        qn_states, self.qn_mask, ctx_states, ctx_mask)

    # Fuse context states, attention output and their element-wise product.
    fused = tf.concat([ctx_states, attended, ctx_states * attended], axis=2)

    # Three RNN encoders stacked on the fused representation, each in its
    # own variable scope.
    with vs.variable_scope("M1_init"):
        stage_one = RNNEncoder(hidden, keep).build_graph(fused, ctx_mask)

    with vs.variable_scope("M1"):
        stage_two = RNNEncoder(hidden, keep).build_graph(stage_one, ctx_mask)

    with vs.variable_scope("M2"):
        stage_three = RNNEncoder(hidden, keep).build_graph(stage_two, ctx_mask)

    # The start head sees the fused representation plus the second stage.
    with vs.variable_scope("StartDist"):
        self.logits_start, self.probdist_start = SimpleSoftmaxLayer().build_graph(
            tf.concat([fused, stage_two], axis=2), ctx_mask)

    # The end head sees the fused representation plus the third stage.
    with vs.variable_scope("EndDist"):
        self.logits_end, self.probdist_end = SimpleSoftmaxLayer().build_graph(
            tf.concat([fused, stage_three], axis=2), ctx_mask)
def build_graph(self):
    """Feed concatenated ELMo and word embeddings directly to the softmax heads.

    Sets self.logits_start / self.probdist_start and
    self.logits_end / self.probdist_end.
    """
    # Width of the concatenated features: 1024 (the ELMo part, per the
    # constant used here) plus the word-embedding size.
    feature_width = 1024 + self.FLAGS.embedding_size

    # Concatenate on the last axis, then pin the static shape.
    features = tf.concat([self.elmo_context_input, self.context_embs], 2)
    features.set_shape((None, None, feature_width))

    # Declare both masks as rank-2 with unknown dimensions.
    self.qn_mask.set_shape((None, None))
    self.context_mask.set_shape((None, None))

    with tf.variable_scope("start"):
        self.logits_start, self.probdist_start = SimpleSoftmaxLayer().build_graph(
            features, self.context_mask)

    with tf.variable_scope("end"):
        self.logits_end, self.probdist_end = SimpleSoftmaxLayer().build_graph(
            features, self.context_mask)
def build_graph(self):
    """Build the graph from the input embeddings to the span distributions.

    Sets:
      self.logits_start, self.logits_end: logits from the two softmax heads
        (described in the original notes as shape (batch_size, context_len),
        -large at pad positions).
      self.probdist_start, self.probdist_end: the matching distributions.
    """
    hidden = self.FLAGS.hidden_size

    # The context and the question each get their own encoder object, and
    # each build_graph call is given a distinct name string.
    context_encoder = RNNEncoder(hidden, self.keep_prob)
    question_encoder = RNNEncoder(hidden, self.keep_prob)
    ctx_states = context_encoder.build_graph(
        self.context_embs, self.context_mask, "rnnencoder1")
    qn_states = question_encoder.build_graph(
        self.qn_embs, self.qn_mask, "rnnencoderQ")

    # This attention layer returns three values; the last two feed the
    # middle section of the graph.
    attention = BasicAttn(self.keep_prob, hidden * 2, hidden * 2)
    _, attended, extra_attn = attention.build_graph(
        qn_states, self.qn_mask, ctx_states, 2 * hidden)

    # build_graph_middle is a module-level helper that receives the model
    # explicitly; only its third return value is used.
    _, _, projected = build_graph_middle(
        self, extra_attn, attended, ctx_states, qn_states)

    with vs.variable_scope("StartDist"):
        self.logits_start, self.probdist_start = SimpleSoftmaxLayer().build_graph(
            projected, self.context_mask)

    with vs.variable_scope("EndDist"):
        self.logits_end, self.probdist_end = SimpleSoftmaxLayer().build_graph(
            projected, self.context_mask)
def build_graph_coattention(self):
    """Build the co-attention variant of the graph up to the span distributions.

    Sets self.logits_start / self.probdist_start and
    self.logits_end / self.probdist_end.
    """
    flags = self.FLAGS

    # The encoder returns a pair; only the second element is used.
    rnn = RNNEncoder(flags.hidden_size, self.keep_prob)
    _, ctx_states = rnn.build_graph(self.context_embs, self.context_mask)
    _, qn_states = rnn.build_graph(self.qn_embs, self.qn_mask)

    # Two-sided attention. The layer is given the static batch, question
    # and context sizes from the flags, plus the state width.
    coattention = Coattention()
    co_att = coattention.build_graph(
        flags.batch_size,
        qn_states,
        ctx_states,
        flags.question_len,
        flags.context_len,
        2 * flags.hidden_size,
        self.keep_prob)

    # Project to hidden_size (fully_connected applies a ReLU by default).
    projected = tf.contrib.layers.fully_connected(
        co_att, num_outputs=flags.hidden_size)

    # Both heads pass True as the third positional argument of the softmax
    # layer (its meaning is defined by SimpleSoftmaxLayer, not shown here).
    with vs.variable_scope("StartDist") as start_scope:
        self.logits_start, self.probdist_start = SimpleSoftmaxLayer().build_graph(
            projected, self.context_mask, True)
        # Mark the start scope as reusing its variables from here on.
        start_scope.reuse_variables()

    with vs.variable_scope("EndDist"):
        self.logits_end, self.probdist_end = SimpleSoftmaxLayer().build_graph(
            projected, self.context_mask, True)
def build_graph(self):
    """Build the BiDAF-attention graph, optionally with character features.

    When FLAGS.max_word_len is truthy, the character-level hidden states are
    concatenated to the word embeddings before encoding. Sets
    self.logits_start / self.probdist_start and
    self.logits_end / self.probdist_end.
    """
    flags = self.FLAGS
    shared_encoder = RNNEncoder(flags.hidden_size, self.keep_prob)

    # Pick each encoder input, then encode; context is handled first.
    if flags.max_word_len:
        ctx_input = tf.concat([self.context_embs, self.context_char_hidden], 2)
    else:
        ctx_input = self.context_embs
    ctx_states = shared_encoder.build_graph(ctx_input, self.context_mask)

    if flags.max_word_len:
        qn_input = tf.concat([self.qn_embs, self.qn_char_hidden], 2)
    else:
        qn_input = self.qn_embs
    qn_states = shared_encoder.build_graph(qn_input, self.qn_mask)

    # The attention layer is told the static batch/context/question sizes.
    static_dims = [flags.batch_size, flags.context_len, flags.question_len]
    attention = BiDAF_Attn(self.keep_prob, flags.hidden_size * 2, static_dims)
    attended = attention.build_graph(
        qn_states, self.qn_mask, ctx_states, self.context_mask)

    # Project to hidden_size (fully_connected applies a ReLU by default).
    projected = tf.contrib.layers.fully_connected(
        attended, num_outputs=flags.hidden_size)

    with vs.variable_scope("StartDist"):
        self.logits_start, self.probdist_start = SimpleSoftmaxLayer().build_graph(
            projected, self.context_mask)

    with vs.variable_scope("EndDist"):
        self.logits_end, self.probdist_end = SimpleSoftmaxLayer().build_graph(
            projected, self.context_mask)
def build_graph(self):
    """Build the graph: attention on raw embeddings, then an RNN encoder.

    Unlike the encoder-first variants, the attention layer here works
    directly on the question and context embeddings. Sets
    self.logits_start / self.probdist_start and
    self.logits_end / self.probdist_end.
    """
    flags = self.FLAGS

    # Attention over the unencoded embeddings.
    attention = DynamicAttention_Attn(self.keep_prob, flags)
    attended = attention.build_graph(
        self.qn_embs, self.qn_mask, self.context_embs, self.context_mask)

    # The encoder is sized from the embedding size rather than hidden_size.
    post_encoder = RNNEncoder(flags.embedding_size * 2, self.keep_prob)
    ctx_states = post_encoder.build_graph(attended, self.context_mask)

    # Project to hidden_size (fully_connected applies a ReLU by default).
    projected = tf.contrib.layers.fully_connected(
        ctx_states, num_outputs=flags.hidden_size)

    with vs.variable_scope("StartDist"):
        self.logits_start, self.probdist_start = SimpleSoftmaxLayer().build_graph(
            projected, self.context_mask)

    with vs.variable_scope("EndDist"):
        self.logits_end, self.probdist_end = SimpleSoftmaxLayer().build_graph(
            projected, self.context_mask)
def build_graph(self):
    """Build the graph with separate start and end representation branches.

    Sets:
      self.logits_start, self.logits_end: logits from the two softmax heads
        (described in the original notes as shape (batch_size, context_len),
        -large at pad positions).
      self.probdist_start, self.probdist_end: the matching distributions.
    """
    hidden = self.FLAGS.hidden_size
    keep = self.keep_prob
    ctx_mask = self.context_mask

    # The context and the question are encoded by separate encoders, each
    # inside its own variable scope.
    with vs.variable_scope("e1c"):
        ctx_states = RNNEncoder(hidden, keep).build_graph(
            self.context_embs, ctx_mask)

    with vs.variable_scope("e1q"):
        qn_states = RNNEncoder(hidden, keep).build_graph(
            self.qn_embs, self.qn_mask)

    # Bidirectional attention; only the second return value is used.
    with vs.variable_scope("a1"):
        attention = BidirectionalAttnNew(keep, hidden * 2, hidden * 2)
        _, attended = attention.build_graph(
            qn_states, self.qn_mask, ctx_states, ctx_mask)

    # Start branch: one encoder pass over the attention output.
    with vs.variable_scope("e2_1c"):
        start_states = RNNEncoder(hidden, keep).build_graph(attended, ctx_mask)

    start_features = tf.concat([start_states, attended], axis=2)

    # End branch: a further encoder pass on top of the start-branch states.
    with vs.variable_scope("e3c"):
        end_states = RNNEncoder(hidden, keep).build_graph(start_states, ctx_mask)

    end_features = tf.concat([end_states, attended], axis=2)

    # Separate dense projections per branch (ReLU by default); the start
    # projection is created before the end projection.
    start_projected = tf.contrib.layers.fully_connected(
        start_features, num_outputs=hidden)
    end_projected = tf.contrib.layers.fully_connected(
        end_features, num_outputs=hidden)

    with vs.variable_scope("StartDist"):
        self.logits_start, self.probdist_start = SimpleSoftmaxLayer().build_graph(
            start_projected, ctx_mask)

    with vs.variable_scope("EndDist"):
        self.logits_end, self.probdist_end = SimpleSoftmaxLayer().build_graph(
            end_projected, ctx_mask)
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the
    input embeddings to the final distributions for the answer span.

    The architecture is selected by flags:
      share_encoder: encode the question with the context encoder instead
        of a separate 'question_encoder'.
      reuse_question_states: seed the context encoder with the question
        encoder's final forward/backward states.
      use_bidaf: use the BiDAF attention layer instead of BasicAttn.
      modeling_layer_uses_rnn: model the blended representation with an RNN
        encoder instead of a fully connected layer.
      use_rnn_for_ends: run another RNN encoder, fed with the start
        distribution, before the end softmax.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape
        (batch_size, context_len). These are the logits (i.e. values that
        are fed into the softmax function) for the start and end
        distribution. Important: these are -large in the pad locations.
        Necessary for when we feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape
        (batch_size, context_len). Each row sums to 1. These are the result
        of taking (masked) softmax of logits_start and logits_end.
    """
    # Encoder for the context (and for the question when share_encoder is set).
    encoder = RNNEncoder(self.FLAGS.h_hidden_size,
                         self.keep_prob,
                         num_layers=self.FLAGS.h_num_layers,
                         combiner=self.FLAGS.h_combiner,
                         cell_type=self.FLAGS.h_cell_type)

    if self.FLAGS.share_encoder:
        question_hiddens, question_states_fw, question_states_bw = encoder.build_graph(
            self.qn_embs, self.qn_mask)
    else:
        question_encoder = RNNEncoder(self.FLAGS.h_hidden_size,
                                      self.keep_prob,
                                      num_layers=self.FLAGS.h_num_layers,
                                      combiner=self.FLAGS.h_combiner,
                                      cell_type=self.FLAGS.h_cell_type,
                                      scope='question_encoder')
        question_hiddens, question_states_fw, question_states_bw = question_encoder.build_graph(
            self.qn_embs, self.qn_mask)

    # Without state reuse the context encoder starts with no initial states.
    if not self.FLAGS.reuse_question_states:
        question_states_fw, question_states_bw = None, None

    context_hiddens, _, _ = encoder.build_graph(
        self.context_embs,
        self.context_mask,
        initial_states_fw=question_states_fw,
        initial_states_bw=question_states_bw)

    if self.FLAGS.use_bidaf:
        attn_layer = BiDAF(self.keep_prob)
        context_att, question_att = attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
        blended_reps = tf.concat([
            context_hiddens,
            context_att,
            context_hiddens * context_att,
            context_hiddens * question_att,
        ], axis=2)
    else:
        # Use context hidden states to attend to question hidden states.
        attn_layer = BasicAttn(self.keep_prob)
        _, attn_output = attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens)

        # Concat the context states, the attention output and their
        # element-wise product to get blended_reps.
        blended_reps = tf.concat(
            [context_hiddens, attn_output, context_hiddens * attn_output],
            axis=2)

    # Final states of the modelling RNN. They only exist when the modelling
    # layer is an RNN, so default to None (no initial state). Otherwise the
    # end encoder below would hit an unbound local when use_rnn_for_ends is
    # set without modeling_layer_uses_rnn.
    model_states_fw, model_states_bw = None, None

    if self.FLAGS.modeling_layer_uses_rnn:
        modelling_encoder = RNNEncoder(self.FLAGS.h_model_size,
                                       self.keep_prob,
                                       num_layers=self.FLAGS.h_model_layers,
                                       combiner=self.FLAGS.h_combiner,
                                       cell_type=self.FLAGS.h_cell_type,
                                       scope='blended_reps_scope')
        blended_reps_final, model_states_fw, model_states_bw = modelling_encoder.build_graph(
            blended_reps, self.context_mask)
    else:
        # Apply fully connected layer to each blended representation.
        # Note: tf.contrib.layers.fully_connected applies a ReLU by default.
        blended_reps_final = tf.contrib.layers.fully_connected(
            blended_reps, num_outputs=self.FLAGS.h_hidden_size)

    # Use softmax layer to compute probability distribution for start location.
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
            blended_reps_final, self.context_mask)

    # Use softmax layer to compute probability distribution for end location.
    with vs.variable_scope("EndDist"):
        if self.FLAGS.use_rnn_for_ends:
            end_encoder = RNNEncoder(self.FLAGS.h_model_size,
                                     self.keep_prob,
                                     num_layers=self.FLAGS.h_model_layers,
                                     combiner=self.FLAGS.h_combiner,
                                     cell_type=self.FLAGS.h_cell_type,
                                     scope='blended_reps_final')
            # Condition the end prediction on the start distribution by
            # appending it as one extra feature per context position.
            blended_reps_combined = tf.concat(
                [blended_reps_final, tf.expand_dims(self.probdist_start, 2)], 2)
            blended_reps_final, _, _ = end_encoder.build_graph(
                blended_reps_combined,
                self.context_mask,
                initial_states_fw=model_states_fw,
                initial_states_bw=model_states_bw)

        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
            blended_reps_final, self.context_mask)
def build_graph(self):
    """Wire up the graph from input embeddings to the answer-span distributions.

    The context and question are encoded with one shared RNN encoder, fused
    through BiDAF attention and a lite co-attention layer, refined by two
    stacked modeling GRUs plus a parallel self-attention GRU, and finally
    passed to separate softmax layers for the start and end positions.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape
        (batch_size, context_len). These are the values fed into the
        softmax for the start and end distributions. They are -large in
        the pad locations, which the cross-entropy loss relies on.
      self.probdist_start, self.probdist_end: Both shape
        (batch_size, context_len). Each row sums to 1. These are the
        (masked) softmax of logits_start and logits_end.

    Also registers a histogram summary for every trainable variable.
    """
    hidden = self.FLAGS.hidden_size
    keep_prob = self.keep_prob
    ctx_mask = self.context_mask
    qn_mask = self.qn_mask

    # One encoder object serves both inputs, so its weights are shared
    # between the context and the question.
    shared_encoder = RNNEncoder(hidden, keep_prob)

    # Optionally append the character-level embeddings to the word embeddings.
    if self.FLAGS.enable_cnn:
        ctx_inputs = tf.concat(
            [self.context_embs, self.context_char_embs], axis=2)
        qn_inputs = tf.concat([self.qn_embs, self.qn_char_embs], axis=2)
    else:
        ctx_inputs = self.context_embs
        qn_inputs = self.qn_embs

    # (batch_size, context_len, hidden_size*2)
    ctx_hiddens = shared_encoder.build_graph(ctx_inputs, ctx_mask)
    # (batch_size, question_len, hidden_size*2)
    qn_hiddens = shared_encoder.build_graph(qn_inputs, qn_mask)

    # Query-aware context representations from bidirectional attention flow.
    bidaf = BidafAttn(keep_prob, self.FLAGS.context_len, hidden * 2)
    bidaf_raw = bidaf.build_graph(qn_hiddens, qn_mask, ctx_hiddens, ctx_mask)
    # Condense to hidden_size*2 features with a batch-normalised dense layer.
    bidaf_condensed = tf.contrib.layers.fully_connected(
        bidaf_raw,
        num_outputs=hidden * 2,
        normalizer_fn=tf.contrib.layers.batch_norm)

    # Co-attention over the same encoder outputs.
    co_attn = CoAttnLite(keep_prob, hidden, hidden * 2)
    co_attended = co_attn.build_graph(
        qn_hiddens, qn_mask, ctx_hiddens, ctx_mask)

    # Fuse both attention views along the feature axis.
    fused_attn = tf.concat([bidaf_condensed, co_attended], 2)

    # Two stacked GRU modeling layers capture interactions among context
    # words conditioned on the query.
    first_gru = RNNEncoder(hidden, keep_prob)
    modeled_1 = first_gru.build_graph(
        fused_attn, ctx_mask, variable_scope='ModelGRU1')
    second_gru = RNNEncoder(hidden, keep_prob)
    modeled_2 = second_gru.build_graph(
        modeled_1, ctx_mask, variable_scope='ModelGRU2')

    # Self-attention followed by a GRU, running in parallel to the second
    # modeling GRU (both consume the first modeling layer's output).
    with tf.variable_scope('SelfAttnGRU'):
        self_attn = MulAttn(keep_prob, hidden * 2, hidden * 2)
        self_attended = self_attn.build_graph(
            modeled_1, ctx_mask, modeled_1, ctx_mask)
        self_gru = RNNEncoder(hidden, keep_prob)
        self_modeled = self_gru.build_graph(
            self_attended, ctx_mask, variable_scope='SelfGRU')

    modeled = tf.concat([modeled_2, self_modeled], 2)

    # Start position: softmax over the fused attention plus modeling output.
    # NOTE(review): the original comment gave the feature size as
    # hidden_size*10; the shapes noted upstream suggest hidden_size*8 - confirm.
    with vs.variable_scope("StartDist"):
        start_features = tf.concat([fused_attn, modeled], 2)
        start_softmax = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = start_softmax.build_graph(
            start_features, ctx_mask)

    # End position: one more GRU over the modeling output before the softmax.
    with vs.variable_scope("EndDist"):
        end_gru = RNNEncoder(hidden, keep_prob)
        end_modeled = end_gru.build_graph(
            modeled, ctx_mask, variable_scope='EndGRU')
        end_features = tf.concat([fused_attn, end_modeled], 2)
        end_softmax = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = end_softmax.build_graph(
            end_features, ctx_mask)

    # ':' is replaced in the summary tag built from each variable name.
    for trainable in tf.trainable_variables():
        tf.summary.histogram(trainable.name.replace(':', '/'), trainable)
def build_graph(self):
    """Builds the main part of the graph, from input embeddings to span distributions.

    The architecture is selected by self.FLAGS.model:
      "baseline": RNN encoder, basic attention, dense layer, softmax heads.
      "bidaf", "bidaf_self_attn": BiDAF attention (optionally with
        self-attention), modeling layer, softmax start head and an
        end-word layer conditioned on the start logits.
      "bidaf_dynamic", "bidaf_dynamic_self_attn": BiDAF attention and
        modeling layer followed by the iterative answer decoder.
      "coatt": co-attention, softmax start head, bidirectional LSTM end head.
      "coatt_dynamic", "coatt_dynamic_self_attn": co-attention (optionally
        with self-attention) followed by the iterative answer decoder.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape
        (batch_size, context_len). These are the logits (i.e. values that
        are fed into the softmax function) for the start and end
        distribution. Important: these are -large in the pad locations.
        Necessary for when we feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape
        (batch_size, context_len). Each row sums to 1. These are the
        result of taking (masked) softmax of logits_start and logits_end.
      self.alpha_logits, self.beta_logits: only for the "*_dynamic*"
        models; the per-iteration masked start/end logits.

    Raises:
      ValueError: if self.FLAGS.model is not one of the names listed above.
    """
    model = self.FLAGS.model
    hidden_size = self.FLAGS.hidden_size
    bidaf_models = ("bidaf", "bidaf_dynamic", "bidaf_self_attn",
                    "bidaf_dynamic_self_attn")
    coatt_models = ("coatt", "coatt_dynamic", "coatt_dynamic_self_attn")

    # Pick the encoder. Fail fast on an unknown model name instead of
    # hitting an unbound `encoder` further down.
    if model == "baseline":
        encoder = RNNEncoder(hidden_size, self.keep_prob)
    elif model in bidaf_models:
        print("INSIDE the BIDAF model")
        encoder = RNNEncoder_LSTM(hidden_size, self.keep_prob)
    elif model in coatt_models:
        encoder = LSTMEncoder(hidden_size, self.keep_prob)
    else:
        raise ValueError("Unsupported model: %r" % (model,))

    # Encode context and question. The co-attention models encode both in
    # one call; every other model runs the same encoder object twice, so the
    # weights are shared between the context and the question.
    if model in coatt_models:
        context_hiddens, question_hiddens = encoder.build_graph1(
            self.context_embs, self.qn_embs, self.context_mask, self.qn_mask)
    else:
        # (batch_size, context_len, hidden_size*2)
        context_hiddens = encoder.build_graph(
            self.context_embs, self.context_mask)
        # (batch_size, question_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)

    if model == "baseline":
        # Use context hidden states to attend to question hidden states.
        attn_layer = BasicAttn(
            self.keep_prob, hidden_size * 2, hidden_size * 2)
        _, attn_output = attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens)

        # Concat attn_output to context_hiddens to get blended_reps,
        # (batch_size, context_len, hidden_size*4).
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)

        # Dense layer per position; fully_connected applies a ReLU by
        # default. Result is (batch_size, context_len, hidden_size).
        blended_reps_final = tf.contrib.layers.fully_connected(
            blended_reps, num_outputs=hidden_size)

        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = (
                softmax_layer_start.build_graph(
                    blended_reps_final, self.context_mask))

        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = (
                softmax_layer_end.build_graph(
                    blended_reps_final, self.context_mask))

    elif model == "coatt":
        attn_layer = CoAttention(
            self.keep_prob, hidden_size * 2, hidden_size * 2)
        attn_output = attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens,
            self.context_mask)

        # The co-attention output feeds the start softmax directly.
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = (
                softmax_layer_start.build_graph(
                    attn_output, self.context_mask))

        # The end distribution gets an extra bidirectional LSTM pass.
        with vs.variable_scope("EndDist"):
            context_lengths = tf.reduce_sum(self.context_mask, axis=1)
            # NOTE(review): the same cell object is passed as both the
            # forward and the backward cell; confirm that sharing it
            # between directions is intended.
            cell = tf.contrib.rnn.LSTMBlockCell(2 * hidden_size)
            (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
                cell, cell, attn_output, context_lengths, dtype=tf.float32)
            end_reps = tf.nn.dropout(
                tf.concat([fw_out, bw_out], axis=2), self.keep_prob)
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = (
                softmax_layer_end.build_graph(end_reps, self.context_mask))

    elif model in ("bidaf", "bidaf_self_attn"):
        attn_output, mod_layer_out = self._build_bidaf_modeling(
            context_hiddens, question_hiddens,
            use_self_attn=(model == "bidaf_self_attn"))

        # Attention output concatenated with the modeling layer output.
        blended_reps_start = tf.concat([attn_output, mod_layer_out], axis=2)

        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = (
                softmax_layer_start.build_graph(
                    blended_reps_start, self.context_mask))

        with vs.variable_scope("EndDist"):
            # Condition the end-word layer on the start logits:
            # (batch_size, context_len) -> (batch_size, context_len, 1).
            logits_start_expand = tf.expand_dims(self.logits_start, axis=2)
            end_lstm_input = tf.concat(
                [logits_start_expand, mod_layer_out], axis=2)
            end_layer = END_WORD_LAYER(hidden_size, self.keep_prob)
            blended_reps_end = end_layer.build_graph(
                end_lstm_input, self.context_mask)
            blended_reps_end_final = tf.concat(
                [attn_output, blended_reps_end], axis=2)
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = (
                softmax_layer_end.build_graph(
                    blended_reps_end_final, self.context_mask))

    elif model in ("bidaf_dynamic", "bidaf_dynamic_self_attn"):
        _, mod_layer_out = self._build_bidaf_modeling(
            context_hiddens, question_hiddens,
            use_self_attn=(model == "bidaf_dynamic_self_attn"))
        # The iterative decoder consumes the modeling layer output.
        self._build_dynamic_decoder_dists(mod_layer_out)

    else:  # "coatt_dynamic" or "coatt_dynamic_self_attn"
        attn_layer = CoAttention(
            self.keep_prob, hidden_size * 2, hidden_size * 2)
        if model == "coatt_dynamic_self_attn":
            coatt_reps = attn_layer.build_graph1(
                question_hiddens, self.qn_mask, context_hiddens,
                self.context_mask)
            attn_output = self._concat_self_attention(coatt_reps)
        else:
            attn_output = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens,
                self.context_mask)
        self._build_dynamic_decoder_dists(attn_output)


def _concat_self_attention(self, reps):
    """Return `reps` concatenated with its self-attention output on axis 2."""
    self_attn_layer = SelfAttn(
        self.keep_prob,
        self.FLAGS.hidden_size * 8,
        self.FLAGS.hidden_size * 8)
    _, self_attn_output = self_attn_layer.build_graph(
        reps, self.context_mask)
    return tf.concat([reps, self_attn_output], axis=2)


def _build_bidaf_modeling(self, context_hiddens, question_hiddens,
                          use_self_attn):
    """Run BiDAF attention (optionally plus self-attention) and the modeling layer.

    Returns:
      (attn_output, mod_layer_out): the attention representation that was
      fed to the modeling layer, and the modeling layer's output.
    """
    attn_layer = BiDafAttn(
        self.keep_prob,
        self.FLAGS.hidden_size * 2,
        self.FLAGS.hidden_size * 2)
    # Per the original author's notes this is
    # (batch_size, context_len, hidden_size*8), doubled to hidden_size*16
    # once self-attention is concatenated.
    attn_output = attn_layer.build_graph(
        question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
    if use_self_attn:
        attn_output = self._concat_self_attention(attn_output)

    # In BiDAF the attention output is fed to a modeling layer.
    mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob)
    mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask)
    return attn_output, mod_layer_out


def _build_dynamic_decoder_dists(self, decoder_input):
    """Run the iterative answer decoder and set the start/end distributions.

    The decoder yields one set of start (alpha) and end (beta) logits per
    iteration; each is passed through masked_softmax and the entry at index
    num_iterations - 1 becomes the model's output.
    """
    decoder = ANSWER_DECODER(
        self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations,
        self.FLAGS.max_pool, self.FLAGS.batch_size)
    # Both initial estimates are the representation at context position 0.
    u_s_init = decoder_input[:, 0, :]
    u_e_init = decoder_input[:, 0, :]
    _, _, alpha_logits, beta_logits = decoder.build_graph(
        decoder_input, self.context_mask, u_s_init, u_e_init)

    last = self.FLAGS.num_iterations - 1

    with vs.variable_scope("StartDist"):
        start_pairs = [masked_softmax(logits, self.context_mask, 1)
                       for logits in alpha_logits]
        self.alpha_logits, alpha_probs = zip(*start_pairs)
        self.logits_start = self.alpha_logits[last]
        self.probdist_start = alpha_probs[last]

    with vs.variable_scope("EndDist"):
        end_pairs = [masked_softmax(logits, self.context_mask, 1)
                     for logits in beta_logits]
        self.beta_logits, beta_probs = zip(*end_pairs)
        self.logits_end = self.beta_logits[last]
        self.probdist_end = beta_probs[last]
def build_graph(self):
    """Builds the main part of the graph, from input embeddings to span distributions.

    self.FLAGS.attention selects the attention mechanism: "BiDAF" uses
    bidirectional attention followed by two modeling layers; any other
    value falls back to basic attention followed by a dense layer.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape
        (batch_size, context_len). These are the logits (i.e. values that
        are fed into the softmax function) for the start and end
        distribution. Important: these are -large in the pad locations.
        Necessary for when we feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape
        (batch_size, context_len). Each row sums to 1. These are the
        result of taking (masked) softmax of logits_start and logits_end.
    """
    # Parenthesised single-argument form: valid on Python 2 and Python 3,
    # and prints the same text on both.
    print("Running Attention Model with... %s" % self.FLAGS.attention)

    hidden_size = self.FLAGS.hidden_size

    # Both attention variants start from the same encoder; the one encoder
    # object is applied to the context and the question, sharing weights.
    encoder = RNNEncoder(hidden_size, self.keep_prob)
    # (batch_size, context_len, hidden_size*2)
    context_hiddens = encoder.build_graph(
        self.context_embs, self.context_mask)
    # (batch_size, question_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)

    if self.FLAGS.attention == "BiDAF":
        bidaf_attn_layer = BiDirectionalAttn(
            self.keep_prob, hidden_size * 2, hidden_size * 2,
            self.FLAGS.question_len, self.FLAGS.context_len)
        _, context_to_question, _, question_to_context = (
            bidaf_attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens,
                self.context_mask))

        # Combine the attention vectors with the context hidden states
        # via element-wise products.
        context_c2q = tf.multiply(context_hiddens, context_to_question)
        context_q2c = tf.multiply(context_hiddens, question_to_context)
        # (batch_size, context_len, hidden_size*8)
        blended_reps = tf.concat(
            [context_hiddens, context_to_question, context_c2q, context_q2c],
            axis=2)

        # Two modeling layers encode the query-aware context
        # representations; each yields (batch_size, context_len,
        # hidden_size*2) per the original author's notes.
        modeling_layer = BiRNN(hidden_size, self.keep_prob)
        blended_reps_1 = modeling_layer.build_graph(
            blended_reps, self.context_mask)
        modeling_layer_2 = BiRNN2(hidden_size, self.keep_prob)
        blended_reps_final = modeling_layer_2.build_graph(
            blended_reps_1, self.context_mask)
    else:
        # Default: self.FLAGS.attention == "BasicAttn".
        # Use context hidden states to attend to question hidden states.
        attn_layer = BasicAttn(
            self.keep_prob, hidden_size * 2, hidden_size * 2)
        # attn_output is (batch_size, context_len, hidden_size*2)
        _, attn_output = attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens)

        # (batch_size, context_len, hidden_size*4)
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)

        # Dense layer per position; fully_connected applies a ReLU by
        # default. Result is (batch_size, context_len, hidden_size).
        blended_reps_final = tf.contrib.layers.fully_connected(
            blended_reps, num_outputs=hidden_size)

    # Softmax layer for the start location.
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = (
            softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask))

    # Softmax layer for the end location.
    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = (
            softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask))
def build_graph(self):
    """Construct the graph from input embeddings to the answer-span distributions.

    Word embeddings are extended with character-CNN features, encoded by a
    shared RNN encoder, combined through co-attention, projected by a dense
    layer and turned into start/end distributions by two softmax layers.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape
        (batch_size, context_len). These are the values fed into the
        softmax for the start and end distributions. They are -large in
        the pad locations, which the cross-entropy loss relies on.
      self.probdist_start, self.probdist_end: Both shape
        (batch_size, context_len). Each row sums to 1. These are the
        (masked) softmax of logits_start and logits_end.
    """
    flags = self.FLAGS

    # Character-level CNN; the same layer object handles both inputs.
    char_cnn = CharCNN(flags.word_len, flags.char_embedding_size,
                       flags.num_filters, flags.kernel_size)
    # (batch_size, context_len, num_filters)
    ctx_char_feats = char_cnn.build_graph(
        self.char_context_embs, self.char_context_mask, flags.context_len)
    # (batch_size, question_len, num_filters)
    qn_char_feats = char_cnn.build_graph(
        self.char_qn_embs, self.char_qn_mask, flags.question_len)

    # Hybrid embeddings: word embedding followed by the char-CNN features
    # along the last axis.
    ctx_hybrid = tf.concat([self.context_embs, ctx_char_feats], axis=-1)
    qn_hybrid = tf.concat([self.qn_embs, qn_char_feats], axis=-1)

    # A single encoder object is applied to the context and the question,
    # so its weights are shared.
    rnn_encoder = RNNEncoder(flags.hidden_size, self.keep_prob, "GRU")
    # (batch_size, context_len, hidden_size*2)
    ctx_hiddens = rnn_encoder.build_graph(ctx_hybrid, self.context_mask)
    # (batch_size, question_len, hidden_size*2)
    qn_hiddens = rnn_encoder.build_graph(qn_hybrid, self.qn_mask)

    # Co-attention; per the original author's note the result is
    # (batch_size, context_len, 8*hidden_size).
    coattention = CoAttn(
        self.keep_prob, flags.hidden_size * 2, flags.hidden_size * 2)
    coattended = coattention.build_graph(
        qn_hiddens, self.qn_mask, ctx_hiddens)

    # Dense projection per position (fully_connected applies a ReLU by
    # default): (batch_size, context_len, hidden_size).
    projected = tf.contrib.layers.fully_connected(
        coattended, num_outputs=flags.hidden_size)

    # Independent softmax layers for the start and end of the span, both
    # fed with the same projected representation.
    with vs.variable_scope("StartDist"):
        start_layer = SimpleSoftmaxLayer()
        start_logits, start_probs = start_layer.build_graph(
            projected, self.context_mask)
        self.logits_start, self.probdist_start = start_logits, start_probs

    with vs.variable_scope("EndDist"):
        end_layer = SimpleSoftmaxLayer()
        end_logits, end_probs = end_layer.build_graph(
            projected, self.context_mask)
        self.logits_end, self.probdist_end = end_logits, end_probs
def build_graph(self):
    """Builds the main part of the graph, from input embeddings to span distributions.

    Three flags select the architecture:
      self.FLAGS.cell_type: 'rnn_gru' / 'rnn_lstm' (RNN encoder) or
        'qanet' (QAEncoder).
      self.FLAGS.attention: 'basic' or 'bidaf'.
      self.FLAGS.modeling_layer: 'basic' (dense layer), 'rnn' (two stacked
        RNN encoders), 'qanet' (one QAEncoder applied three times) or
        'qanet2' (one QAEncoder applied twice).

    Defines:
      self.logits_start, self.logits_end: Both tensors shape
        (batch_size, context_len). These are the logits (i.e. values that
        are fed into the softmax function) for the start and end
        distribution. Important: these are -large in the pad locations.
        Necessary for when we feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape
        (batch_size, context_len). Each row sums to 1. These are the
        result of taking (masked) softmax of logits_start and logits_end.

    Raises:
      ValueError: if cell_type, attention or modeling_layer holds a value
        that none of the branches below handles.
    """
    flags = self.FLAGS
    hidden_size = flags.hidden_size

    # ---- Encoder: one encoder object is applied to both the context and
    # the question, so the weights are shared. ----
    if flags.cell_type in ['rnn_gru', 'rnn_lstm']:
        encoder = RNNEncoder(hidden_size, self.keep_prob,
                             cell_type=flags.cell_type)
    elif flags.cell_type == 'qanet':
        encoder = QAEncoder(num_blocks=flags.emb_num_blocks,
                            num_layers=flags.emb_num_layers,
                            num_heads=flags.emb_num_heads,
                            filters=hidden_size,
                            kernel_size=flags.emb_kernel_size,
                            keep_prob=self.keep_prob,
                            input_mapping=True)
    else:
        raise ValueError("Unsupported cell_type: %r" % (flags.cell_type,))
    context_hiddens = encoder.build_graph(
        self.context_embs, self.context_mask)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)

    # ---- Attention ----
    if flags.attention == 'basic':
        # Use context hidden states to attend to question hidden states.
        attn_layer = BasicAttn(
            self.keep_prob, hidden_size * 2, hidden_size * 2)
        _, attn_output = attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens)
        # Concat attn_output to context_hiddens to get blended_reps.
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)
    elif flags.attention == 'bidaf':
        attn_layer = BiDAFAttn(self.keep_prob)
        blended_reps = attn_layer.build_graph(
            context_hiddens, self.context_mask,
            question_hiddens, self.qn_mask)
    else:
        raise ValueError("Unsupported attention: %r" % (flags.attention,))

    # ---- Modeling layer: each variant produces the representation fed to
    # the start softmax and the one fed to the end softmax. ----
    if flags.modeling_layer == 'basic':
        # Dense layer per position (fully_connected applies a ReLU by
        # default); both heads share its output.
        blended_reps_final = tf.contrib.layers.fully_connected(
            blended_reps,
            num_outputs=hidden_size,
            weights_initializer=initializer_relu())
        start_reps = blended_reps_final
        end_reps = blended_reps_final
    elif flags.modeling_layer == 'rnn':
        encoder_start = RNNEncoder(hidden_size, self.keep_prob,
                                   cell_type=flags.cell_type, name='m1')
        m1 = encoder_start.build_graph(blended_reps, self.context_mask)
        encoder_end = RNNEncoder(hidden_size, self.keep_prob,
                                 cell_type=flags.cell_type, name='m2')
        m2 = encoder_end.build_graph(m1, self.context_mask)
        start_reps = tf.concat([blended_reps, m1], -1)
        end_reps = tf.concat([blended_reps, m2], -1)
    elif flags.modeling_layer == 'qanet':
        modeling_encoder = QAEncoder(num_blocks=flags.model_num_blocks,
                                     num_layers=flags.model_num_layers,
                                     num_heads=flags.model_num_heads,
                                     filters=hidden_size,
                                     kernel_size=flags.model_kernel_size,
                                     keep_prob=self.keep_prob,
                                     input_mapping=False,
                                     name='modeling_encoder')
        # Kernel-size-1 convolution maps the attention output to
        # hidden_size filters before the encoder is applied.
        m0 = tf.layers.conv1d(blended_reps, filters=hidden_size,
                              kernel_size=1, padding='SAME',
                              name='attn_mapping')
        # The same encoder object is applied three times in sequence.
        m1 = modeling_encoder.build_graph(m0, self.context_mask)
        m2 = modeling_encoder.build_graph(m1, self.context_mask)
        m3 = modeling_encoder.build_graph(m2, self.context_mask)
        start_reps = tf.concat([m1, m2], -1)
        end_reps = tf.concat([m1, m3], -1)
    elif flags.modeling_layer == 'qanet2':
        modeling_encoder1 = QAEncoder(num_blocks=flags.model_num_blocks,
                                      num_layers=flags.model_num_layers,
                                      num_heads=flags.model_num_heads,
                                      filters=hidden_size,
                                      kernel_size=flags.model_kernel_size,
                                      keep_prob=self.keep_prob,
                                      input_mapping=False,
                                      name='modeling_encoder1')
        m0 = tf.layers.conv1d(blended_reps, filters=hidden_size,
                              kernel_size=1, padding='SAME',
                              name='attn_mapping')
        # The same encoder object is applied twice in sequence.
        m1 = modeling_encoder1.build_graph(m0, self.context_mask)
        m2 = modeling_encoder1.build_graph(m1, self.context_mask)
        start_reps = tf.concat([m0, m1], -1)
        end_reps = tf.concat([m0, m2], -1)
    else:
        raise ValueError(
            "Unsupported modeling_layer: %r" % (flags.modeling_layer,))

    # ---- Output: softmax layers for the start and end locations; both
    # results have shape (batch_size, context_len). ----
    with tf.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = (
            softmax_layer_start.build_graph(start_reps, self.context_mask))

    with tf.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = (
            softmax_layer_end.build_graph(end_reps, self.context_mask))
def build_graph(self):
    """Build the model graph from the input embeddings to the span distributions.

    Pipeline: ELMo and word embeddings are concatenated, encoded by a shared
    LSTM encoder, fused by a BiDAF layer, passed through a self-attention
    layer, re-encoded by a final LSTM encoder and turned into start/end
    distributions.

    Defines:
      self.logits_start, self.logits_end: logits of the start and end
        distributions. Expected shape (batch_size, context_len) with large
        negative values in the pad locations, which is what the
        cross-entropy loss needs (produced by SimpleSoftmaxLayer).
      self.probdist_start, self.probdist_end: masked softmax of the
        corresponding logits; each row is expected to sum to 1.
    """
    hidden_size = self.FLAGS.hidden_size
    # Width of one concatenated token vector: 1024 (the ELMo input width
    # assumed by this model) plus the word-embedding size.
    embed_dim = 1024 + self.FLAGS.embedding_size

    # Concatenate the ELMo and word embeddings along the feature axis.
    # Open question kept from the original author: the ELMo and word-embedding
    # sequence lengths can differ; the chosen workaround is to not cap the
    # context ids at a maximum context length.
    context_inputs = tf.concat(
        [self.elmo_context_input, self.context_embs], 2)
    question_inputs = tf.concat(
        [self.elmo_question_input, self.qn_embs], 2)

    # Pin the static shapes so the tensors can be fed to the dynamic LSTM.
    context_inputs.set_shape((None, None, embed_dim))
    question_inputs.set_shape((None, None, embed_dim))
    self.qn_mask.set_shape((None, None))
    self.context_mask.set_shape((None, None))

    with tf.variable_scope("biLSTM"):
        # The same encoder object and the same scope name are used for both
        # inputs so that context and question share weights.
        shared_encoder = RNNEncoder(hidden_size,
                                    keep_prob=self.keep_prob,
                                    cell_type="lstm",
                                    input_size=embed_dim)
        context_hiddens = shared_encoder.build_graph(
            context_inputs,
            self.context_mask,
            scope="context_question_encoder")
        question_hiddens = shared_encoder.build_graph(
            question_inputs,
            self.qn_mask,
            scope="context_question_encoder")

    with tf.variable_scope("bidaf"):
        bidaf_layer = Bidaf(hidden_size * 2, self.keep_prob)
        bidaf_output = bidaf_layer.build_graph(context_hiddens,
                                               question_hiddens,
                                               self.context_mask,
                                               self.qn_mask)

    with tf.variable_scope("self_attn_layer"):
        self_attn_layer = SelfAttn(hidden_size,
                                   hidden_size * 2,
                                   self.keep_prob,
                                   input_size=hidden_size * 2)
        self_attended = self_attn_layer.build_graph(bidaf_output,
                                                    self.context_mask,
                                                    cell_type="lstm")

    with tf.variable_scope('prediction_layer'):
        # Re-encode the self-attended context before predicting.
        with tf.variable_scope("final_lstm_layer"):
            final_encoder = RNNEncoder(hidden_size,
                                       keep_prob=self.keep_prob,
                                       cell_type="lstm",
                                       input_size=hidden_size * 2)
            final_encoding = final_encoder.build_graph(self_attended,
                                                       self.context_mask,
                                                       scope="final_lstm")

        # Start distribution straight from the final encoding.
        with tf.variable_scope("StartDist"):
            start_softmax = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = start_softmax.build_graph(
                final_encoding, self.context_mask)

        # End distribution is conditioned on the start logits: they are
        # expanded to (batch_size, context_len, 1) and prepended to the
        # final encoding along the feature axis before a unidirectional RNN.
        with tf.variable_scope("EndDist"):
            start_logits_column = tf.expand_dims(self.logits_start, axis=2)
            end_rnn_input = tf.concat([start_logits_column, final_encoding],
                                      axis=2)
            end_encoder = RNNEncoder(hidden_size,
                                     keep_prob=self.keep_prob,
                                     direction="unidirectional")
            end_encoding = end_encoder.build_graph(end_rnn_input,
                                                   self.context_mask,
                                                   scope="end_dist_rnn")

            end_softmax = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = end_softmax.build_graph(
                end_encoding, self.context_mask)
def build_graph(self):
    """Build the BiDAF-style graph from the input embeddings to the span distributions.

    Layers, in order: highway network, shared contextual encoder,
    bidirectional attention flow, modeling encoder, output layer.

    Side effect: self.context_embs and self.qn_embs are rebound to the
    highway-network outputs.

    Defines:
      self.logits_start, self.logits_end: logits of the start and end
        distributions. Expected shape (batch_size, context_len) with large
        negative values in the pad locations, which is what the
        cross-entropy loss needs.
      self.probdist_start, self.probdist_end: masked softmax of the
        corresponding logits; each row is expected to sum to 1.
    """
    hidden_size = self.FLAGS.hidden_size

    # --- Highway network -------------------------------------------------
    # One HighWayNetwork instance processes both the context and the question.
    highway = HighWayNetwork()
    self.context_embs = highway.build_graph(self.context_embs)
    self.qn_embs = highway.build_graph(self.qn_embs)

    # --- Contextual embedding layer --------------------------------------
    # A single encoder object is applied to both sequences so that the
    # weights are shared between context and question.
    contextual_encoder = biLSTMEncoder(hidden_size, self.keep_prob)
    context_hiddens = contextual_encoder.build_graph(
        self.context_embs, self.context_mask)
    question_hiddens = contextual_encoder.build_graph(
        self.qn_embs, self.qn_mask)

    # --- Attention flow layer --------------------------------------------
    # Produces query-aware feature vectors for each context word. Per the
    # original notes, the context-to-question output is
    # (batch_size, context_len, 2h) and the question-to-context output is
    # (batch_size, 1, 2h).
    bidaf_attn = BiDAFAttn(self.keep_prob, hidden_size * 2, hidden_size * 2)
    _, c2q_attn_output, _, q2c_attn_output = bidaf_attn.build_graph(
        question_hiddens, self.qn_mask, context_hiddens, self.context_mask)

    # Blend the context states with both attention outputs and their
    # element-wise products with the context states.
    context_times_c2q = tf.multiply(context_hiddens, c2q_attn_output)
    context_times_q2c = tf.multiply(context_hiddens, q2c_attn_output)
    blended_reps = tf.concat(
        [context_hiddens, c2q_attn_output, context_times_c2q,
         context_times_q2c],
        axis=2,
        name="blended_reps")

    # --- Modeling layer ---------------------------------------------------
    modeling_encoder = Modeling_layer_biLSTM_Encoder(hidden_size,
                                                     self.keep_prob)
    modeled_reps = modeling_encoder.build_graph(blended_reps)

    # --- Output layer -----------------------------------------------------
    # Start location: softmax over [blended_reps ; modeling output].
    with vs.variable_scope("StartDist"):
        start_softmax = SimpleSoftmaxLayer()
        start_features = tf.concat([blended_reps, modeled_reps], 2)
        self.logits_start, self.probdist_start = start_softmax.build_graph(
            start_features, self.context_mask)

    # End location: the modeling output goes through one more encoder
    # before being concatenated with blended_reps.
    output_encoder = Output_layer_biLSTM_Encoder(hidden_size, self.keep_prob)
    output_hiddens = output_encoder.build_graph(modeled_reps)
    with vs.variable_scope("EndDist"):
        end_softmax = SimpleSoftmaxLayer()
        end_features = tf.concat([blended_reps, output_hiddens], 2)
        self.logits_end, self.probdist_end = end_softmax.build_graph(
            end_features, self.context_mask)
def build_graph(self):
    """Build the graph from the input embeddings to the span distributions.

    Optionally augments the word embeddings with character-CNN encodings
    (controlled by FLAGS.use_char_cnn), encodes context and question with a
    shared RNN encoder, applies BDAttn, stacks two more RNN encoders and a
    fully connected layer, and finishes with two softmax layers.

    Side effects when FLAGS.use_char_cnn is set: defines
    self.context_char_encodings, self.cnn_filters1, self.qn_char_encodings
    and self.cnn_filters2.

    Defines:
      self.logits_start, self.logits_end: logits of the start and end
        distributions. Expected shape (batch_size, context_len) with large
        negative values in the pad locations, which is what the
        cross-entropy loss needs.
      self.probdist_start, self.probdist_end: masked softmax of the
        corresponding logits; each row is expected to sum to 1.
    """
    flags = self.FLAGS
    hidden_size = flags.hidden_size

    if not flags.use_char_cnn:
        joined_context_embs = self.context_embs
        joined_qn_embs = self.qn_embs
    else:
        with vs.variable_scope('char_encoding'):
            self.context_char_encodings, self.cnn_filters1 = char_encoder2(
                self.context_char_embs, flags.context_len, flags.word_len,
                flags.cnn_filter_width, flags.char_embedding_size,
                flags.n_cnn_filters)
            # Switch the scope to reuse mode so the question encoder below
            # picks up the variables just created for the context.
            tf.get_variable_scope().reuse_variables()
            self.qn_char_encodings, self.cnn_filters2 = char_encoder2(
                self.qn_char_embs, flags.question_len, flags.word_len,
                flags.cnn_filter_width, flags.char_embedding_size,
                flags.n_cnn_filters)

        # Append the character encodings to the word embeddings.
        joined_context_embs = tf.concat(
            [self.context_embs, self.context_char_encodings], axis=2)
        joined_qn_embs = tf.concat(
            [self.qn_embs, self.qn_char_encodings], axis=2)
        assert joined_context_embs.shape[
            2] == flags.embedding_size + flags.n_cnn_filters
        assert joined_qn_embs.shape[
            2] == flags.embedding_size + flags.n_cnn_filters

    # One encoder object handles both sequences, so context and question
    # share weights.
    with vs.variable_scope('embedding_layer'):
        embedding_encoder = RNNEncoder(hidden_size, self.keep_prob)
        context_hiddens = embedding_encoder.build_graph(
            joined_context_embs, self.context_mask)
        question_hiddens = embedding_encoder.build_graph(
            joined_qn_embs, self.qn_mask)

    # Attention between context and question; FLAGS.use_q2c_attention is
    # forwarded as the q2c argument. Per the original notes the output is
    # (batch_size, context_len, hidden_size*6), giving hidden_size*8 once
    # concatenated with the context states.
    bd_attn = BDAttn(self.keep_prob, hidden_size * 2, hidden_size * 2)
    attn_output = bd_attn.build_graph(question_hiddens,
                                      self.qn_mask,
                                      context_hiddens,
                                      self.context_mask,
                                      q2c=flags.use_q2c_attention)
    blended_reps = tf.concat([context_hiddens, attn_output], axis=2)

    # Two further RNN encoders, each under its own variable scope.
    with vs.variable_scope('layer1'):
        layer1_encoder = RNNEncoder(hidden_size, self.keep_prob)
        layer1_reps = layer1_encoder.build_graph(blended_reps,
                                                 self.context_mask)

    with vs.variable_scope('layer2'):
        layer2_encoder = RNNEncoder(hidden_size, self.keep_prob)
        layer2_reps = layer2_encoder.build_graph(layer1_reps,
                                                 self.context_mask)

    # tf.contrib.layers.fully_connected applies a ReLU non-linearity by
    # default; the output has hidden_size features per position.
    final_reps = tf.contrib.layers.fully_connected(layer2_reps,
                                                   num_outputs=hidden_size)

    # Both distributions are computed from the same final representation.
    with vs.variable_scope("StartDist"):
        start_softmax = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = start_softmax.build_graph(
            final_reps, self.context_mask)

    with vs.variable_scope("EndDist"):
        end_softmax = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = end_softmax.build_graph(
            final_reps, self.context_mask)
def build_graph(self):
    """Build the graph from the input embeddings to the span distributions.

    Character-level word vectors are produced by a bidirectional GRU over
    each word's characters and appended to the word embeddings. The rest is
    a shared RNN encoder, basic attention, a fully connected layer and two
    softmax layers.

    Side effect: self.context_embs and self.qn_embs are rebound to the
    word embeddings concatenated with the character-level vectors.

    Defines:
      self.logits_start, self.logits_end: logits of the start and end
        distributions. Expected shape (batch_size, context_len) with large
        negative values in the pad locations, which is what the
        cross-entropy loss needs.
      self.probdist_start, self.probdist_end: masked softmax of the
        corresponding logits; each row is expected to sum to 1.
    """
    hidden_size = self.FLAGS.hidden_size

    # Per-word character counts: non-zero ids are counted along axis 2 and
    # the result is flattened to one length per word.
    # NOTE(review): presumably id 0 is the padding character -- confirm.
    context_has_char = tf.cast(tf.cast(self.context_char_ids, tf.bool),
                               tf.int32)
    context_word_lens = tf.reshape(tf.reduce_sum(context_has_char, axis=2),
                                   [-1])
    qn_has_char = tf.cast(tf.cast(self.qn_char_ids, tf.bool), tf.int32)
    qn_word_lens = tf.reshape(tf.reduce_sum(qn_has_char, axis=2), [-1])

    # The same pair of GRU cells is used for context and question words.
    char_cell_fw = rnn_cell.GRUCell(hidden_size)
    char_cell_bw = rnn_cell.GRUCell(hidden_size)

    # Context: final forward/backward states form the per-word vector,
    # which is then folded back to one row per context position.
    _, (context_fw_state, context_bw_state) = tf.nn.bidirectional_dynamic_rnn(
        char_cell_fw,
        char_cell_bw,
        self.context_char_embs,
        context_word_lens,
        dtype=tf.float32)
    context_char_states = tf.concat([context_fw_state, context_bw_state],
                                    axis=1)
    context_char_vecs = tf.reshape(
        context_char_states, [-1, self.FLAGS.context_len, 2 * hidden_size])
    self.context_embs = tf.concat([self.context_embs, context_char_vecs],
                                  axis=2)

    # Question: same procedure.
    _, (qn_fw_state, qn_bw_state) = tf.nn.bidirectional_dynamic_rnn(
        char_cell_fw,
        char_cell_bw,
        self.qn_char_embs,
        qn_word_lens,
        dtype=tf.float32)
    qn_char_states = tf.concat([qn_fw_state, qn_bw_state], axis=1)
    qn_char_vecs = tf.reshape(
        qn_char_states, [-1, self.FLAGS.question_len, 2 * hidden_size])
    self.qn_embs = tf.concat([self.qn_embs, qn_char_vecs], axis=2)

    # TODO: deep encoder.
    # One encoder object handles both sequences, so context and question
    # share weights.
    shared_encoder = RNNEncoder(hidden_size, self.keep_prob)
    context_hiddens = shared_encoder.build_graph(self.context_embs,
                                                 self.context_mask)
    question_hiddens = shared_encoder.build_graph(self.qn_embs, self.qn_mask)

    # Context hidden states attend to question hidden states.
    basic_attn = BasicAttn(self.keep_prob, hidden_size * 2, hidden_size * 2)
    _, attn_output = basic_attn.build_graph(question_hiddens, self.qn_mask,
                                            context_hiddens)

    # Blend the attention output with the context hidden states.
    blended_reps = tf.concat([context_hiddens, attn_output], axis=2)

    # tf.contrib.layers.fully_connected applies a ReLU non-linearity by
    # default; the output has hidden_size features per position.
    blended_reps_final = tf.contrib.layers.fully_connected(
        blended_reps, num_outputs=hidden_size)

    # Both distributions are computed from the same blended representation.
    with vs.variable_scope("StartDist"):
        start_softmax = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = start_softmax.build_graph(
            blended_reps_final, self.context_mask)

    with vs.variable_scope("EndDist"):
        end_softmax = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = end_softmax.build_graph(
            blended_reps_final, self.context_mask)
def build_graph(self):
    """Build the graph from the input embeddings to the span distributions.

    Context and question are encoded by a shared RNN encoder. Two attention
    models (bidirectional attention flow and coattention) are each projected
    to hidden_size features, concatenated and gated by learned weights. The
    gated attentions then go through a fully connected layer, a 3-position
    sliding window, a second fully connected layer, two 1-D convolutions, an
    LSTM and a small dense stack before the two softmax layers.

    Defines:
      self.logits_start, self.logits_end: logits of the start and end
        distributions. Expected shape (batch_size, context_len) with large
        negative values in the pad locations, which is what the
        cross-entropy loss needs.
      self.probdist_start, self.probdist_end: masked softmax of the
        corresponding logits; each row is expected to sum to 1.
    """
    flags = self.FLAGS
    hidden_size = flags.hidden_size

    # One encoder object handles both sequences, so context and question
    # share weights.
    encoder = RNNEncoder(hidden_size, self.keep_prob)
    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)

    # --- Bidirectional attention flow, projected to hidden_size ----------
    bidir_attn_layer = bidirectionalAttn(self.keep_prob, flags.context_len,
                                         hidden_size * 2, flags.question_len,
                                         hidden_size * 2)
    bidir_attn_output = bidir_attn_layer.build_graph(
        question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
    bidir_output = tf.contrib.layers.fully_connected(
        bidir_attn_output, num_outputs=hidden_size)

    # --- Coattention, projected to hidden_size ----------------------------
    coattn_layer = coattention(self.keep_prob, flags.context_len,
                               hidden_size * 2, flags.question_len,
                               hidden_size * 2)
    coattn_output = coattn_layer.build_graph(
        question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
    coattn_output = tf.contrib.layers.fully_connected(
        coattn_output, num_outputs=hidden_size)

    # --- Combine the attention models -------------------------------------
    # Concatenate both attentions and scale them element-wise by weights
    # computed from the question states.
    attentions = tf.concat([bidir_output, coattn_output], axis=2)
    attn_weight_calc = get_attn_weights(2, flags.question_len,
                                        flags.context_len, hidden_size,
                                        self.keep_prob)
    attn_weights = attn_weight_calc.build_graph(question_hiddens,
                                                self.qn_mask, attentions)
    gated_attns = tf.multiply(attentions, attn_weights)
    print("Gattn", gated_attns.shape.as_list())

    # --- Sliding 3-position window over the gated attentions --------------
    attn_size = gated_attns.shape.as_list()[2]
    fc1 = tf.contrib.layers.fully_connected(gated_attns, attn_size,
                                            scope="FC1")
    fc1_do = tf.nn.dropout(fc1, self.keep_prob)

    # Each position is represented by [previous, current, next]. The first
    # and last positions have no neighbour on one side, so they repeat
    # themselves there.
    word_start = tf.concat(
        [fc1_do[:, 0, :], fc1_do[:, 0, :], fc1_do[:, 1, :]],
        axis=1, name="wStart")
    word_middle = tf.concat(
        [fc1_do[:, :-2, :], fc1_do[:, 1:-1, :], fc1_do[:, 2:, :]],
        axis=2, name="wMidd")
    word_end = tf.concat(
        [fc1_do[:, -2, :], fc1_do[:, -1, :], fc1_do[:, -1, :]],
        axis=1, name="wEnd")
    # Boundary windows are rank 2 after integer indexing; restore the
    # sequence axis before stitching them around the middle windows.
    word_windows = tf.concat(
        [tf.expand_dims(word_start, 1), word_middle,
         tf.expand_dims(word_end, 1)],
        axis=1, name="wCC")
    print("wordCC", word_windows.shape.as_list())

    fc2 = tf.contrib.layers.fully_connected(word_windows, 3 * attn_size,
                                            scope="FC2")
    fc2_do = tf.nn.dropout(fc2, self.keep_prob)

    # --- Two stacked 1-D convolutions -------------------------------------
    conv1 = tf.layers.conv1d(fc2_do, hidden_size, kernel_size=5,
                             padding='same')
    conv1_do = tf.nn.dropout(conv1, self.keep_prob)
    conv2 = tf.layers.conv1d(conv1_do, hidden_size, kernel_size=5,
                             padding='same')
    conv2_do = tf.nn.dropout(conv2, self.keep_prob)
    # Report the tensor the label names (the original printed conv1 here).
    print("conv2", conv2.shape.as_list())

    # --- Output LSTM over both conv outputs and the raw attentions --------
    lstm_input = tf.concat([conv1_do, conv2_do, attentions], axis=2)
    with vs.variable_scope("outputLSTM"):
        lstm_cell = rnn_cell.LSTMCell(hidden_size * 2)
        lstm_cell_do = DropoutWrapper(lstm_cell,
                                      input_keep_prob=self.keep_prob)
        # NOTE(review): no sequence_length is passed, so the LSTM also runs
        # over padded positions; padding is only masked in the softmax layers.
        lstm_outputs, _ = tf.nn.dynamic_rnn(lstm_cell_do, lstm_input,
                                            dtype=tf.float32)

    # fully_connected applies ReLU by default; the last layer is a plain
    # linear projection (tf.layers.dense without activation).
    lstm_out_layer1 = tf.contrib.layers.fully_connected(
        lstm_outputs, num_outputs=hidden_size * 2)
    lstm_out_layer1_do = tf.nn.dropout(lstm_out_layer1, self.keep_prob)
    lstm_out_layer2 = tf.contrib.layers.fully_connected(
        lstm_out_layer1_do, num_outputs=hidden_size)
    lstm_out_layer2_do = tf.nn.dropout(lstm_out_layer2, self.keep_prob)
    lstm_out_final = tf.layers.dense(lstm_out_layer2_do, hidden_size)

    # Both distributions are computed from the same final representation.
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
            lstm_out_final, self.context_mask)

    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
            lstm_out_final, self.context_mask)
def build_graph(self):
    """Build the graph from the input embeddings to the span distributions.

    Layers, in order: shared RNN encoder, bidirectional attention,
    self-attention, an attention encoder, two modeling encoders and the
    start/end softmax layers.

    Defines:
      self.logits_start, self.logits_end: logits of the start and end
        distributions. Expected shape (batch_size, context_len) with large
        negative values in the pad locations, which is what the
        cross-entropy loss needs.
      self.probdist_start, self.probdist_end: masked softmax of the
        corresponding logits; each row is expected to sum to 1.
    """
    print("Building Pointer Model")

    flags = self.FLAGS
    hidden_size = flags.hidden_size
    num_layers = flags.num_layers

    # One encoder object handles both sequences, so context and question
    # share weights.
    input_encoder = RNNEncoder(hidden_size,
                               self.keep_prob,
                               num_layers=num_layers,
                               mode=flags.rnn_cell)
    context_hiddens = input_encoder.build_graph(self.context_embs,
                                                self.context_mask)
    question_hiddens = input_encoder.build_graph(self.qn_embs, self.qn_mask)

    # BiDAF layer. Per the original notes its output has hidden_size*6
    # features, giving hidden_size*8 once the context states are prepended.
    bidaf_layer = BidirectionAttn(self.keep_prob, hidden_size)
    _, _, bidaf_attn = bidaf_layer.build_graph(question_hiddens,
                                               self.qn_mask,
                                               context_hiddens,
                                               self.context_mask)
    bidaf_output = tf.concat([context_hiddens, bidaf_attn], axis=2)

    # Self-attention layer over the BiDAF output.
    self_attn_layer = SelfAttn(self.keep_prob, 8 * hidden_size,
                               flags.selfattn_size)
    _, self_attn_output = self_attn_layer.build_graph(bidaf_output,
                                                      self.context_mask)

    # Concatenate the BiDAF and self-attention outputs, then compress them
    # with a dedicated encoder.
    attn_features = tf.concat([bidaf_output, self_attn_output], axis=2)
    attention_encoder = RNNEncoder(hidden_size,
                                   self.keep_prob,
                                   num_layers=num_layers,
                                   name="AttentionEncoder")
    blended_reps = attention_encoder.build_graph(attn_features,
                                                 self.context_mask)

    # Modeling layer: two stacked encoders. The first feeds the start
    # prediction, the second the end prediction.
    first_modeling_encoder = RNNEncoder(hidden_size,
                                        self.keep_prob,
                                        num_layers=num_layers,
                                        name="ModelingEncoder")
    modeling_output = first_modeling_encoder.build_graph(blended_reps,
                                                         self.context_mask)
    second_modeling_encoder = RNNEncoder(hidden_size,
                                         self.keep_prob,
                                         num_layers=num_layers,
                                         name="ModelingEncoder2")
    modeling_output_two = second_modeling_encoder.build_graph(
        modeling_output, self.context_mask)

    total_reps_start = tf.concat([blended_reps, modeling_output], axis=2)
    total_reps_end = tf.concat([blended_reps, modeling_output_two], axis=2)

    # Output layer.
    with vs.variable_scope("StartDist"):
        start_softmax = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = start_softmax.build_graph(
            total_reps_start, self.context_mask)

    with vs.variable_scope("EndDist"):
        end_softmax = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = end_softmax.build_graph(
            total_reps_end, self.context_mask)
def build_graph(self):
    """Build the graph from the input embeddings to the span distributions.

    Character embeddings of context and question each go through a stack of
    three convolution/ReLU/max-pool stages; the result is appended to the
    word embeddings. A shared RNN encoder, BiDAF attention, three modeling
    encoders and a fully connected layer follow, ending in the start/end
    softmax layers.

    Defines:
      self.logits_start, self.logits_end: logits of the start and end
        distributions. Expected shape (batch_size, context_len) with large
        negative values in the pad locations, which is what the
        cross-entropy loss needs.
      self.probdist_start, self.probdist_end: masked softmax of the
        corresponding logits; each row is expected to sum to 1.
    """
    hidden_size = self.FLAGS.hidden_size
    conv_channels = self.FLAGS.CONV_SHAPE

    def conv_relu_pool(inputs, prefix, index, in_channels, pool_width):
        """Run one conv + bias + ReLU + max-pool stage in scope '<prefix>_conv<index>'."""
        stage = '%s_conv%d' % (prefix, index)
        with tf.variable_scope(stage):
            # 1x3 kernel: convolves along the third input axis only.
            conv_filter = truncated_normal_var(
                name=stage + '_filter',
                shape=[1, 3, in_channels, conv_channels],
                dtype=tf.float32)
            conv = tf.nn.conv2d(inputs, conv_filter, [1, 1, 1, 1],
                                padding='SAME')
            conv_bias = zero_var(name=stage + '_bias',
                                 shape=[conv_channels],
                                 dtype=tf.float32)
            activated = tf.nn.relu(tf.nn.bias_add(conv, conv_bias))
            # Non-overlapping pooling along the same axis as the convolution.
            pool_size = [1, 1, pool_width, 1]
            return tf.nn.max_pool(activated,
                                  ksize=pool_size,
                                  strides=pool_size,
                                  padding='SAME',
                                  name='%s_pool_layer%d' % (prefix, index))

    def char_cnn_features(char_embs, prefix, pool_widths, seq_len):
        """Stack one conv stage per pool width and flatten to per-token features."""
        features = char_embs
        # The first stage's input depth is hard-coded to 50.
        # NOTE(review): presumably the character-embedding size -- confirm.
        in_channels = 50
        for index, pool_width in enumerate(pool_widths, start=1):
            features = conv_relu_pool(features, prefix, index, in_channels,
                                      pool_width)
            in_channels = conv_channels
        # The reshape assumes exactly 2 * CONV_SHAPE values remain per token
        # after pooling; with -1 as the leading dimension a mismatch would be
        # absorbed silently into the batch axis.
        return tf.reshape(features, [-1, seq_len, 2 * conv_channels])

    # NOTE(review): the last pooling stage is 4 wide for the context but 3
    # wide for the question. Kept exactly as in the original; confirm that
    # the asymmetry is intended.
    context_char_features = char_cnn_features(self.context_character_embs,
                                              'context', (2, 3, 4),
                                              self.FLAGS.context_len)
    context_final = tf.concat([self.context_embs, context_char_features],
                              axis=2)

    qn_char_features = char_cnn_features(self.qn_character_embs, 'qn',
                                         (2, 3, 3), self.FLAGS.question_len)
    qn_final = tf.concat([self.qn_embs, qn_char_features], axis=2)

    # One encoder object handles both sequences, so context and question
    # share weights.
    encoder = RNNEncoder(hidden_size, self.keep_prob)
    print("context_final final shape %s" % (context_final.get_shape()))
    print("context_mask final shape %s" % (self.context_mask.get_shape()))
    print("qn_final final shape %s" % (qn_final.get_shape()))
    print("qn_mask final shape %s" % (self.qn_mask.get_shape()))
    context_hiddens = encoder.build_graph(context_final, self.context_mask)
    question_hiddens = encoder.build_graph(qn_final, self.qn_mask)

    # Bidirectional attention between context and question, blended with
    # the context hidden states.
    attn_layer = BiDAFAttn(self.keep_prob, hidden_size * 2, hidden_size * 2)
    attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask,
                                         context_hiddens, self.context_mask)
    blended_reps = tf.concat([context_hiddens, attn_output], axis=2)

    # Three stacked modeling encoders named RNNModelEncoder1..3.
    modeled_reps = blended_reps
    for layer_index in (1, 2, 3):
        model_encoder = RNNModelEncoder(
            hidden_size,
            self.keep_prob,
            model_name="RNNModelEncoder%d" % layer_index)
        modeled_reps = model_encoder.build_graph(modeled_reps,
                                                 self.context_mask)

    # tf.contrib.layers.fully_connected applies a ReLU non-linearity by
    # default; the output has hidden_size features per position.
    blended_reps_final = tf.contrib.layers.fully_connected(
        modeled_reps, num_outputs=hidden_size)

    # Both distributions are computed from the same final representation.
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
            blended_reps_final, self.context_mask)

    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
            blended_reps_final, self.context_mask)
def build_graph(self):
    """Build the graph from the input embeddings to the answer-span distributions.

    Stages, in the order they are created:
      1. "StackedEmbeddingEncoder": one EncoderBlock applied to the context
         and then, with reuse=True, to the question.
      2. "ContextQueryAttention": BiDAFAttn over the two encodings.
      3. "ModelEncoder": one EncoderBlock applied three times in sequence.
      4. "Output": two SimpleSoftmaxLayer heads (start and end).

    Defines:
      self.logits_start, self.logits_end:
        Both tensors shape (batch_size, context_len). The logits fed into the
        softmax for the start and end distributions. They are -large in the
        pad locations, which the cross entropy function relies on.
      self.pdist_start, self.pdist_end:
        Both shape (batch_size, context_len). Each row sums to 1. The result
        of taking the (masked) softmax of logits_start and logits_end.
    """
    flags = self.flags

    # Embedding encoder: the same block (same weights) encodes the context
    # first and is then reused for the question.
    with tf.variable_scope("StackedEmbeddingEncoder"):
        embedding_block = EncoderBlock(
            flags.num_blocks_enc,
            self.keep_prob,
            flags.kernel_size_enc,
            flags.d_model,
            flags.num_conv_enc,
            flags.num_heads,
            flags.d_ff,
            l2_lambda=flags.l2_lambda)
        context_encoding = embedding_block.build_graph(
            self.c_embs, self.c_longest, self.c_mask,
            reduce_input_dim=True, reuse=None)
        question_encoding = embedding_block.build_graph(
            self.q_embs, self.q_longest, self.q_mask,
            reduce_input_dim=True, reuse=True)

    # Context-query attention.
    with tf.variable_scope("ContextQueryAttention"):
        attention = BiDAFAttn(self.keep_prob, l2_lambda=flags.l2_lambda)
        # Expected shape: [batch_size, context_len, vec_size*8].
        attended = attention.build_graph(
            context_encoding, self.c_mask, self.c_longest,
            question_encoding, self.q_mask, self.q_longest)

    # Modeling layer: a single EncoderBlock run three times; passes two and
    # three are created with reuse=True.
    with tf.variable_scope("ModelEncoder"):
        modeling_block = EncoderBlock(
            flags.num_blocks_mod,
            self.keep_prob,
            flags.kernel_size_mod,
            flags.d_model,
            flags.num_conv_mod,
            flags.num_heads,
            flags.d_ff,
            l2_lambda=flags.l2_lambda)
        first_pass = modeling_block.build_graph(
            attended, self.c_longest, self.c_mask, reduce_input_dim=True)
        second_pass = modeling_block.build_graph(
            first_pass, self.c_longest, self.c_mask, reuse=True)
        third_pass = modeling_block.build_graph(
            second_pass, self.c_longest, self.c_mask, reuse=True)

    # Output heads: start reads passes 1+2, end reads passes 1+3.
    with tf.variable_scope("Output"):
        with tf.variable_scope("StartDistribution"):
            start_features = tf.concat([first_pass, second_pass], axis=-1)
            start_head = SimpleSoftmaxLayer(l2_lambda=flags.l2_lambda)
            self.logits_start, self.pdist_start = start_head.build_graph(
                start_features, self.c_mask)
        with tf.variable_scope("EndDistribution"):
            end_features = tf.concat([first_pass, third_pass], axis=-1)
            end_head = SimpleSoftmaxLayer(l2_lambda=flags.l2_lambda)
            self.logits_end, self.pdist_end = end_head.build_graph(
                end_features, self.c_mask)
def build_graph(self):
    """Build the graph from the input embeddings to the answer-span distributions.

    Which layers are created is driven by self.FLAGS: use_stacked_encoder,
    num_encoding_layers, bidaf, self_attend, modeling_layer,
    num_model_rnn_layers and pointer_network.

    Defines:
      self.logits_start, self.logits_end:
        Both tensors shape (batch_size, context_len). The logits fed into the
        softmax for the start and end distributions. They are -large in the
        pad locations, which the cross entropy function relies on.
      self.probdist_start, self.probdist_end:
        Both shape (batch_size, context_len). Each row sums to 1. The result
        of taking the (masked) softmax of logits_start and logits_end.
    """
    flags = self.FLAGS

    # First encoding layer. The same encoder object processes the context
    # and then the question.
    with vs.variable_scope("encoder_layer1", reuse=tf.AUTO_REUSE):
        encoder = (
            StackedRNNEncoder(flags.hidden_size, flags.num_encoding_layers,
                              self.keep_prob)
            if flags.use_stacked_encoder
            else RNNEncoder(flags.hidden_size, self.keep_prob))
        # Expected (batch_size, context_len, hidden_size*2)
        context_states = encoder.build_graph(self.context_embs,
                                             self.context_mask)
        # Expected (batch_size, question_len, hidden_size*2)
        question_states = encoder.build_graph(self.qn_embs, self.qn_mask)

    # Optional second encoding layer stacked on the first one's outputs.
    if flags.num_encoding_layers > 1:
        with vs.variable_scope("encoder_layer2", reuse=tf.AUTO_REUSE):
            second_encoder = RNNEncoder(flags.hidden_size, self.keep_prob)
            context_states = second_encoder.build_graph(context_states,
                                                        self.context_mask)
            question_states = second_encoder.build_graph(question_states,
                                                         self.qn_mask)

    # Attention from the context states over the question states.
    if flags.bidaf:
        attention = BiDirAttnFlow(self.keep_prob, flags.hidden_size * 2)
        # Expected (batch_size, context_len, hidden_size*8)
        blended = attention.build_graph(question_states, self.qn_mask,
                                        context_states, self.context_mask)
    else:
        attention = BasicAttn(self.keep_prob, flags.hidden_size * 2,
                              flags.hidden_size * 2)
        # attended is expected to be (batch_size, context_len, hidden_size*2)
        _, attended = attention.build_graph(question_states, self.qn_mask,
                                            context_states)
        # Expected (batch_size, context_len, hidden_size*4)
        blended = tf.concat([context_states, attended], axis=2)

    # Optional self-attention over the blended representation.
    if flags.self_attend:
        # NOTE(review): the last dimension is passed as-is (no `.value`),
        # unlike the PointerNetwork call below -- confirm SelfAttn accepts it.
        self_attention = SelfAttn(self.keep_prob, blended.shape[-1],
                                  flags.self_attend_hidden_sz)
        # Expected (batch_size, context_len, 2*self_attend_hidden_sz)
        blended = self_attention.build_graph(blended, self.context_mask)

    # Fully connected layer on every blended representation (b' in the
    # handout). tf.contrib.layers.fully_connected applies a ReLU
    # non-linearity by default.
    # Expected (batch_size, context_len, hidden_size)
    final_reps = tf.contrib.layers.fully_connected(
        blended, num_outputs=flags.hidden_size)

    # Optional modeling layer(s): one RNN, plus a second when
    # num_model_rnn_layers > 1.
    if flags.modeling_layer:
        with vs.variable_scope("Model_Layer", reuse=tf.AUTO_REUSE):
            modeling_rnn = RNNEncoder(flags.hidden_size, self.keep_prob)
            final_reps = modeling_rnn.build_graph(final_reps,
                                                  self.context_mask)
        if flags.num_model_rnn_layers > 1:
            with vs.variable_scope("Model_layer2", reuse=tf.AUTO_REUSE):
                second_modeling_rnn = RNNEncoder(flags.hidden_size,
                                                 self.keep_prob)
                final_reps = second_modeling_rnn.build_graph(
                    final_reps, self.context_mask)

    # Output layer, variant 1: a pointer network produces both
    # distributions at once.
    if flags.pointer_network:  # TODO: define flag
        with vs.variable_scope("OutputLayer", reuse=tf.AUTO_REUSE):
            pointer = PointerNetwork(self.keep_prob,
                                     final_reps.shape[-1].value,
                                     flags.hidden_size)
            outputs = pointer.build_graph(final_reps, self.context_mask)
            (self.logits_start, self.probdist_start, _,
             self.logits_end, self.probdist_end, _) = outputs
        return

    # Output layer, variant 2: independent softmax layers for the start and
    # the end location, each yielding (batch_size, context_len) tensors.
    with vs.variable_scope("StartDist"):
        start_softmax = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = start_softmax.build_graph(
            final_reps, self.context_mask)

    with vs.variable_scope("EndDist"):
        end_softmax = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = end_softmax.build_graph(
            final_reps, self.context_mask)
def build_graph(self):
    """Build the graph from the input embeddings to the answer-span distributions.

    This variant runs two levels of co-attention between the context
    ("document") and the question, concatenates the per-level document
    representations, encodes them once more and feeds the result to two
    softmax output layers.

    Defines:
      self.logits_start, self.logits_end:
        Both tensors shape (batch_size, context_len). The logits fed into the
        softmax for the start and end distributions. They are -large in the
        pad locations, which the cross entropy function relies on.
      self.probdist_start, self.probdist_end:
        Both shape (batch_size, context_len). Each row sums to 1. The result
        of taking the (masked) softmax of logits_start and logits_end.
    """

    def coattend(doc_states, qn_states):
        """Run one co-attention level; returns (summary_q, summary_d, coattention_d)."""
        # [N, D, Q] (or [N, 1+D, 1+Q] if a sentinel is used)
        scores = tf.einsum('ndh,nqh->ndq', doc_states, qn_states)
        masked_scores = maybe_mask_affinity(scores, self.document_length)
        weights_p = tf.nn.softmax(masked_scores, dim=1)
        # [N, Q, D] (or [N, 1+Q, 1+D] if a sentinel is used)
        scores_t = tf.transpose(scores, [0, 2, 1])
        masked_scores_t = maybe_mask_affinity(scores_t, self.question_length)
        weights_q = tf.nn.softmax(masked_scores_t, dim=1)
        # [N, Q, 2H] (or [N, 1+Q, 2H] if a sentinel is used)
        qn_summary = tf.einsum('ndh,ndq->nqh', doc_states, weights_p)
        # [N, D, 2H] (or [N, 1+D, 2H] if a sentinel is used)
        doc_summary = tf.einsum('nqh,nqd->ndh', qn_states, weights_q)
        doc_coattention = tf.einsum('nqh,nqd->ndh', qn_summary, weights_q)
        return qn_summary, doc_summary, doc_coattention

    # Shared RNN encoder: the same object encodes the context and then the
    # question.
    base_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    # Expected (batch_size, context_len, hidden_size*2)
    context_hiddens = base_encoder.build_graph(self.context_embs,
                                               self.context_mask)
    # Expected (batch_size, question_len, hidden_size*2)
    question_hiddens = base_encoder.build_graph(self.qn_embs, self.qn_mask)

    # Dense tanh projection of the question states, keeping their width.
    question_projected = tf.layers.dense(question_hiddens,
                                         question_hiddens.get_shape()[2],
                                         activation=tf.tanh)

    # First co-attention level.
    summary_q_1, summary_d_1, coattention_d_1 = coattend(context_hiddens,
                                                         question_projected)

    # Re-encode the level-one summaries with a second shared encoder.
    level_two_encoder = RNNEncoder1(self.FLAGS.hidden_size, self.keep_prob)
    # Expected (batch_size, context_len, hidden_size*2)
    context_level_two = level_two_encoder.build_graph(summary_d_1,
                                                      self.context_mask)
    # Expected (batch_size, question_len, hidden_size*2)
    question_level_two = level_two_encoder.build_graph(summary_q_1,
                                                       self.qn_mask)

    # Second co-attention level; its question summary is not used further.
    _, summary_d_2, coattention_d_2 = coattend(context_level_two,
                                               question_level_two)

    # Stack every document-side representation along the feature axis.
    document_stack = tf.concat(
        [
            context_hiddens,    # E^D_1
            context_level_two,  # E^D_2
            summary_d_1,        # S^D_1
            summary_d_2,        # S^D_2
            coattention_d_1,    # C^D_1
            coattention_d_2,    # C^D_2
        ],
        2)

    fusion_encoder = RNNEncoder2(self.FLAGS.hidden_size, self.keep_prob)
    fused = fusion_encoder.build_graph(document_stack, self.context_mask)

    # Concatenate the fused encoding onto the context states.
    # Expected (batch_size, context_len, hidden_size*4) -- TODO confirm.
    blended_reps = tf.concat([context_hiddens, fused], axis=2)

    # Fully connected layer on every blended representation (b' in the
    # handout). tf.contrib.layers.fully_connected applies a ReLU
    # non-linearity by default.
    # Expected (batch_size, context_len, hidden_size)
    blended_reps_final = tf.contrib.layers.fully_connected(
        blended_reps, num_outputs=self.FLAGS.hidden_size)

    # Softmax layer for the start location; both outputs are expected to be
    # (batch_size, context_len).
    with vs.variable_scope("StartDist"):
        start_softmax = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = start_softmax.build_graph(
            blended_reps_final, self.context_mask)

    # Softmax layer for the end location; same shapes as above.
    with vs.variable_scope("EndDist"):
        end_softmax = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = end_softmax.build_graph(
            blended_reps_final, self.context_mask)
def build_graph(self):
    """Build the graph from the input embeddings to the answer-span distributions.

    This variant encodes context and question with a shared RNN, applies
    BiDAFAttn, then runs two separate modeling RNNs (one for the start
    distribution, one for the end distribution) before the softmax layers.

    Defines:
      self.logits_start, self.logits_end:
        Both tensors shape (batch_size, context_len). The logits fed into the
        softmax for the start and end distributions. They are -large in the
        pad locations, which the cross entropy function relies on.
      self.probdist_start, self.probdist_end:
        Both shape (batch_size, context_len). Each row sums to 1. The result
        of taking the (masked) softmax of logits_start and logits_end.
    """
    hidden_size = self.FLAGS.hidden_size

    # Shared RNN encoder: the same object encodes the context and then the
    # question.
    with vs.variable_scope("EmbedLayer"):
        embed_encoder = RNNEncoder(hidden_size, self.keep_prob)
        # Expected (batch_size, context_len, hidden_size*2)
        context_hiddens = embed_encoder.build_graph(self.context_embs,
                                                    self.context_mask)
        # Expected (batch_size, question_len, hidden_size*2)
        question_hiddens = embed_encoder.build_graph(self.qn_embs,
                                                     self.qn_mask)

    # Attention between context and question states. The layer returns three
    # values; the first one is not used here.
    # NOTE(review): attn_a / attn_c are presumably the context-to-question
    # and question-to-context outputs of BiDAF -- confirm against BiDAFAttn.
    bidaf_layer = BiDAFAttn(self.keep_prob, hidden_size * 2, hidden_size * 2)
    _, attn_a, attn_c = bidaf_layer.build_graph(
        question_hiddens, self.qn_mask, context_hiddens, self.context_mask)

    # Insert an axis at position 1 so attn_c can be multiplied with
    # context_hiddens below.
    attn_c = tf.expand_dims(attn_c, 1)

    # Four parts concatenated along the feature axis: the context states,
    # attn_a, and the element-wise products of the context states with
    # attn_a and with attn_c.
    # Presumably (batch_size, context_len, hidden_size*8) -- TODO confirm.
    blended_reps = tf.concat(
        [
            context_hiddens,
            attn_a,
            context_hiddens * attn_a,
            context_hiddens * attn_c,
        ],
        axis=2)

    # Separate modeling RNNs for the start and the end predictions, both
    # reading the same blended representation.
    with vs.variable_scope("startModelLayer"):
        start_modeler = RNNEncoder(hidden_size, self.keep_prob)
        start_model_out = start_modeler.build_graph(blended_reps,
                                                    self.context_mask)

    with vs.variable_scope("endModelLayer"):
        end_modeler = RNNEncoder(hidden_size, self.keep_prob)
        end_model_out = end_modeler.build_graph(blended_reps,
                                                self.context_mask)

    # Softmax layer for the start location; both outputs are expected to be
    # (batch_size, context_len).
    with vs.variable_scope("StartDist"):
        start_features = tf.concat([start_model_out, blended_reps], axis=2)
        start_softmax = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = start_softmax.build_graph(
            start_features, self.context_mask)

    # Softmax layer for the end location; same shapes as above.
    with vs.variable_scope("EndDist"):
        end_features = tf.concat([end_model_out, blended_reps], axis=2)
        end_softmax = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = end_softmax.build_graph(
            end_features, self.context_mask)
def build_graph(self):
    """Build the graph from the input embeddings to the answer-span distributions.

    Layers, in creation order: RNN encoder(s), basic attention, an optional
    RNN over the basic-attention output, dot attention, optional gating, an
    RNN over the dot-attention output, and finally either an answer-pointer
    output or two simple softmax layers. The optional parts are selected by
    self.FLAGS.more_single_dir_rnn, gated_reps and use_answer_pointer.

    Defines:
      self.logits_start, self.logits_end:
        Both tensors shape (batch_size, context_len). The logits fed into the
        softmax for the start and end distributions. They are -large in the
        pad locations, which the cross entropy function relies on.
      self.probdist_start, self.probdist_end:
        Both shape (batch_size, context_len). Each row sums to 1. The result
        of taking the (masked) softmax of logits_start and logits_end.
    """
    flags = self.FLAGS
    hidden = flags.hidden_size

    # Encoder inputs default to the raw embeddings; with more_single_dir_rnn
    # an extra shared encoder (RNNEncoder0) runs first.
    context_input, question_input = self.context_embs, self.qn_embs
    if flags.more_single_dir_rnn:
        pre_encoder = RNNEncoder0(hidden, self.keep_prob)
        # Expected (batch_size, context_len, hidden_size)
        context_input = pre_encoder.build_graph(self.context_embs,
                                                self.context_mask)
        # Expected (batch_size, question_len, hidden_size)
        question_input = pre_encoder.build_graph(self.qn_embs, self.qn_mask)

    # Main encoder, shared between the context and the question.
    main_encoder = RNNEncoder1(hidden, self.keep_prob)
    # Expected (batch_size, context_len, hidden_size*2)
    context_states = main_encoder.build_graph(context_input,
                                              self.context_mask)
    # Expected (batch_size, question_len, hidden_size*2)
    question_states = main_encoder.build_graph(question_input, self.qn_mask)

    # Context states attend to the question states.
    basic_attention = BasicAttn(self.keep_prob, hidden * 2, hidden * 2,
                                flags.advanced_basic_attn)
    # attended is expected to be (batch_size, context_len, hidden_size*2)
    _, attended = basic_attention.build_graph(question_states, self.qn_mask,
                                              context_states)

    # Expected (batch_size, context_len, hidden_size*4)
    basic_blend = tf.concat([context_states, attended], axis=2)

    # Optionally run an RNN over the blended representation.
    if flags.more_single_dir_rnn:
        basic_rnn = RNNBasicAttn(hidden * 4, self.keep_prob)
        # Expected (batch_size, context_len, hidden_size*4)
        basic_reps = basic_rnn.build_graph(basic_blend, self.context_mask)
    else:
        basic_reps = basic_blend

    # Dot attention ("Attention Is All You Need") over basic_reps.
    dot_attention = DotAttn(self.keep_prob, hidden * 4,
                            flags.advanced_dot_attn)
    # Expected (batch_size, context_len, hidden_size*4)
    _, dot_attended = dot_attention.build_graph(basic_reps,
                                                self.context_mask)

    # Expected (batch_size, context_len, hidden_size*8)
    dot_blend = tf.concat([basic_reps, dot_attended], axis=2)

    # Optional gated representation (R-NET).
    if flags.gated_reps:
        gate = GatedReps(hidden * 8)
        gated_blend = gate.build_graph(dot_blend)
    else:
        gated_blend = dot_blend

    dot_rnn = RNNDotAttn(hidden * 8, self.keep_prob)
    # Expected (batch_size, context_len, hidden_size*16)
    dot_reps = dot_rnn.build_graph(gated_blend, self.context_mask)

    if flags.use_answer_pointer:
        # Answer pointer: the start layer also returns a value (rQ) that the
        # end layer consumes together with the start distribution.
        start_pointer = AnswerPointerLayerStart(self.keep_prob, hidden,
                                                hidden * 16)
        start_outputs = start_pointer.build_graph(
            question_states, self.qn_mask, dot_reps, self.context_mask)
        question_summary, self.logits_start, self.probdist_start = start_outputs

        end_pointer = AnswerPointerLayerEnd(self.keep_prob, hidden * 16)
        self.logits_end, self.probdist_end = end_pointer.build_graph(
            self.probdist_start, question_summary, dot_reps,
            self.context_mask)
    else:
        # Fully connected layer on every representation (b' in the handout).
        # tf.contrib.layers.fully_connected applies a ReLU non-linearity by
        # default. Expected (batch_size, context_len, hidden_size).
        final_reps = tf.contrib.layers.fully_connected(dot_reps,
                                                       num_outputs=hidden)

        # Softmax layer for the start location; both outputs are expected to
        # be (batch_size, context_len).
        with vs.variable_scope("StartDist"):
            start_softmax = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = start_softmax.build_graph(
                final_reps, self.context_mask)

        # Softmax layer for the end location; same shapes as above.
        with vs.variable_scope("EndDist"):
            end_softmax = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = end_softmax.build_graph(
                final_reps, self.context_mask)
def build_graph(self):
    """Build the graph from the input embeddings to the answer-span distributions.

    The attention stage and the output stage are each selected by
    independent boolean flags on self.FLAGS:

      * attention: simple_attention, co_attention, self_attention. Every
        enabled one is built, in that order, and the last one built supplies
        blended_reps_final.
      * output: answer_pointer, simple_softmax. Every enabled one is built,
        in that order, and the last one built supplies the logits and
        distributions.

    Defines:
      self.logits_start, self.logits_end:
        Both tensors shape (batch_size, context_len). The logits fed into the
        softmax for the start and end distributions. They are -large in the
        pad locations, which the cross entropy function relies on.
      self.probdist_start, self.probdist_end:
        Both shape (batch_size, context_len). Each row sums to 1. The result
        of taking the (masked) softmax of logits_start and logits_end.

    Raises:
      ValueError: if no attention flag or no output flag is enabled. Without
        this check a missing attention flag surfaced as a NameError on
        blended_reps_final, and a missing output flag left the attributes
        listed above undefined.
    """
    flags = self.FLAGS

    # Validate the flag combination before any op is created, so that a
    # misconfiguration fails with a clear message instead of half-building
    # the graph.
    if not (flags.simple_attention or flags.co_attention
            or flags.self_attention):
        raise ValueError(
            "build_graph needs at least one attention flag enabled: "
            "simple_attention, co_attention or self_attention.")
    if not (flags.answer_pointer or flags.simple_softmax):
        raise ValueError(
            "build_graph needs at least one output flag enabled: "
            "answer_pointer or simple_softmax.")

    # Shared RNN encoder: the same object encodes the context and then the
    # question. Self-attention uses its own encoder width.
    if flags.self_attention:
        encoder = RNNEncoder(flags.hidden_size_encoder, self.keep_prob)
    else:
        encoder = RNNEncoder(flags.hidden_size, self.keep_prob)
    # Expected (batch_size, context_len, hidden_size*2)
    context_hiddens = encoder.build_graph(self.context_embs,
                                          self.context_mask)
    # Expected (batch_size, question_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)

    if flags.simple_attention:
        # Context states attend to the question states.
        attn_layer = BasicAttn(self.keep_prob, flags.hidden_size * 2,
                               flags.hidden_size * 2)
        # attn_output is expected to be (batch_size, context_len, hidden_size*2)
        _, attn_output = attn_layer.build_graph(question_hiddens,
                                                self.qn_mask,
                                                context_hiddens)

        # Expected (batch_size, context_len, hidden_size*4)
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)

        # Fully connected layer on every blended representation (b' in the
        # handout). tf.contrib.layers.fully_connected applies a ReLU
        # non-linearity by default.
        # Expected (batch_size, context_len, hidden_size)
        blended_reps_final = tf.contrib.layers.fully_connected(
            blended_reps, num_outputs=flags.hidden_size)

    if flags.co_attention:
        # Send the question states through a fully connected tanh layer to
        # allow for variation between the question and document spaces. The
        # layer acts on the last axis, hence the transposes around it.
        # Expected (batch_size, hidden_size*2, question_len)
        question_hiddens_t = tf.transpose(question_hiddens, perm=[0, 2, 1])
        # Expected (batch_size, hidden_size*2, question_len)
        trans_question_hiddens_t = tf.contrib.layers.fully_connected(
            question_hiddens_t,
            num_outputs=flags.question_len,
            activation_fn=tf.nn.tanh)
        # Expected (batch_size, question_len, hidden_size*2)
        trans_question_hiddens = tf.transpose(trans_question_hiddens_t,
                                              perm=[0, 2, 1])

        # Compute the co-attention context.
        co_attn_layer = CoAttn(self.keep_prob, flags.hidden_size * 2,
                               flags.hidden_size * 2)
        # Expected (batch_size, context_len, 6*hidden_size)
        co_attn_output = co_attn_layer.build_graph(trans_question_hiddens,
                                                   self.qn_mask,
                                                   self.context_mask,
                                                   context_hiddens)

        # Fuse temporal information into the co-attention context with a
        # recurrent encoder (the code uses LSTMEncoder).
        with tf.variable_scope("co-attn-encoder"):
            co_attn_encoder = LSTMEncoder(flags.hidden_size, self.keep_prob)
            blended_reps_final = co_attn_encoder.build_graph(
                co_attn_output, self.context_mask)

    if flags.self_attention:
        # Self-attention as in the R-Net paper: question-passage matching
        # (v_p) followed by passage self-matching (h_p).
        self_attention_encoder = SelfAttn(flags.hidden_size_encoder,
                                          flags.hidden_size_qp,
                                          flags.hidden_size_pp,
                                          self.keep_prob)
        v_p = self_attention_encoder.build_graph_qp(
            context_hiddens, question_hiddens, self.context_mask,
            self.qn_mask, flags.context_len, flags.question_len)
        h_p = self_attention_encoder.build_graph_pp(
            context_hiddens, question_hiddens, self.context_mask,
            self.qn_mask, v_p, flags.context_len, flags.question_len)
        # Expected (batch_size, context_len, 5*hidden_size)
        blended_reps_final = tf.concat([context_hiddens, v_p, h_p], axis=2)

    if flags.answer_pointer:
        # Answer pointer as used in the R-Net paper. The width handed to the
        # decoder depends on which attention produced blended_reps_final.
        if flags.co_attention:
            hidden_size_attn = flags.hidden_size * 2
        elif flags.self_attention:
            hidden_size_attn = (2 * flags.hidden_size_encoder
                                + flags.hidden_size_qp
                                + 2 * flags.hidden_size_pp)
        else:
            hidden_size_attn = flags.hidden_size

        answer_decoder = AnswerPointer(flags.hidden_size_encoder,
                                       hidden_size_attn,
                                       flags.question_len,
                                       self.keep_prob)
        p, logits = answer_decoder.build_graph_answer_pointer(
            question_hiddens, context_hiddens, blended_reps_final,
            flags.question_len, flags.context_len, self.qn_mask,
            self.context_mask)

        # Index 0 holds the start outputs, index 1 the end outputs.
        self.logits_start = logits[0]
        self.probdist_start = p[0]
        self.logits_end = logits[1]
        self.probdist_end = p[1]

    # Deliberately `if`, not `elif`: when both output flags are set the
    # softmax outputs keep overriding the pointer outputs, as before.
    if flags.simple_softmax:
        # Softmax layer for the start location; both outputs are expected to
        # be (batch_size, context_len).
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = (
                softmax_layer_start.build_graph(blended_reps_final,
                                                self.context_mask))

        # Softmax layer for the end location; same shapes as above.
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = (
                softmax_layer_end.build_graph(blended_reps_final,
                                              self.context_mask))