def build_graph(self, values, values_mask, keys, keys_mask):
    """Coattention between keys (context) and values (question), with learned sentinel vectors appended to each side (DCN-style)."""
    sentinel_padding = tf.constant(1, shape=[1, 1])
    batch_size = self.FLAGS.batch_size
    with vs.variable_scope("Attention"):
        # Calculate attention distribution
        dense_layer = partial(
            tf.layers.dense,
            activation=tf.nn.tanh,
            kernel_regularizer=tf.contrib.layers.l1_regularizer(0.001))
        projected_values_t = dense_layer(values, self.FLAGS.embedding_size)

        # Augmented value (question) vectors: (batch_size, num_values + 1, embedding_size)
        values_t = tf.concat([
            projected_values_t,
            tf.broadcast_to(self.values_sentinel,
                            [batch_size, 1, self.FLAGS.embedding_size])
        ], 1)
        # Augmented key (context) vectors: (batch_size, num_keys + 1, embedding_size)
        keys_t = tf.concat([
            keys,
            tf.broadcast_to(self.keys_sentinel,
                            [batch_size, 1, self.FLAGS.embedding_size])
        ], 1)

        # Affinity matrix: (batch_size, num_keys + 1, num_values + 1)
        affinity_scores = tf.matmul(keys_t, tf.transpose(values_t, perm=[0, 2, 1]))

        # Mask for the augmented values: (batch_size, 1, num_values + 1); the sentinel is never masked out.
        values_mask_1 = tf.expand_dims(
            tf.concat([values_mask,
                       tf.broadcast_to(sentinel_padding, [batch_size, 1])], 1), 1)
        # Key-to-value attention: softmax over values.
        _, C2Q_softmax = masked_softmax(affinity_scores, values_mask_1, 2)
        attn_output_1 = tf.matmul(C2Q_softmax, values_t)  # (batch_size, num_keys + 1, embedding_size)

        # Mask for the augmented keys: (batch_size, num_keys + 1, 1)
        keys_mask_1 = tf.expand_dims(
            tf.concat([keys_mask,
                       tf.broadcast_to(sentinel_padding, [batch_size, 1])], 1), 2)
        # Value-to-key attention: softmax over keys.
        _, Q2C_softmax = masked_softmax(affinity_scores, keys_mask_1, 1)
        Q2C_output = tf.matmul(tf.transpose(Q2C_softmax, perm=[0, 2, 1]), keys_t)

        # Second-level attention: attend over the value-to-key summaries with the key-to-value distribution.
        attn_output_2 = tf.matmul(C2Q_softmax, Q2C_output)

        key_hidden = tf.concat([attn_output_2, attn_output_1], 2)
        # Drop the sentinel row so the output is aligned with the original context length.
        key_hidden = key_hidden[:, :self.FLAGS.context_len, :]

        # Apply dropout
        output = tf.nn.dropout(key_hidden, self.keep_prob)

        return output
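# The attention layers in this file call a masked_softmax(logits, mask, dim) helper that is not
# included in this excerpt. The sketch below is a minimal, assumed implementation consistent with
# how it is called here: it returns (masked_logits, prob_dist) and pushes padded positions toward
# -infinity before the softmax. The project's actual helper may differ in detail.
def masked_softmax(logits, mask, dim):
    """Apply a large negative offset to masked positions, then softmax over `dim`."""
    exp_mask = (1.0 - tf.cast(mask, 'float')) * (-1e30)  # 0 where real input, -1e30 where padding
    masked_logits = tf.add(logits, exp_mask)             # padded cells get ~-infinite logits
    prob_dist = tf.nn.softmax(masked_logits, dim)
    return masked_logits, prob_dist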
def build_graph(self, values, values_mask, keys): """ Keys attend to values. For each key, return an attention distribution and an attention output vector. Inputs: values: Tensor shape (batch_size, num_values, value_vec_size). values_mask: Tensor shape (batch_size, num_values). 1s where there's real input, 0s where there's padding keys: Tensor shape (batch_size, num_keys, key_vec_size) Outputs: attn_dist: Tensor shape (batch_size, num_keys, num_values). For each key, the distribution should sum to 1, and should be 0 in the value locations that correspond to padding. output: Tensor shape (batch_size, num_keys, hidden_size). This is the attention output; the weighted sum of the values (using the attention distribution as weights). """ with vs.variable_scope("MultiplicativeAttn"): keys_shape = keys.get_shape().as_list( ) # (batch_size, num_keys, key_vec_size) values_shape = values.get_shape().as_list( ) # (batch_size, num_values, value_vec_size) # Calculate attention distribution W = tf.get_variable( 'W_mul_attn', shape=(self.key_vec_size, self.value_vec_size), initializer=tf.contrib.layers.xavier_initializer()) keys_r = tf.reshape( keys, [-1, keys_shape[2]]) # (batch_size * num_keys, key_vec_size) attn_logits = tf.matmul( keys_r, W) # (batch_size * num_keys, value_vec_size) attn_logits = tf.reshape( attn_logits, [-1, keys_shape[1], values_shape[2] ]) # (batch_size, num_keys, value_vec_size) values_t = tf.transpose( values, perm=[0, 2, 1]) # (batch_size, value_vec_size, num_values) attn_logits = tf.matmul( attn_logits, values_t) # (batch_size, num_keys, num_values) attn_logits_mask = tf.expand_dims( values_mask, 1) # shape (batch_size, 1, num_values) attn_masked_logits, attn_prob_dist = masked_softmax( attn_logits, attn_logits_mask, 2 ) # shape (batch_size, num_keys, num_values). take softmax over values # Use attention distribution to take weighted sum of values output = tf.matmul( attn_prob_dist, values) # shape (batch_size, num_keys, value_vec_size) # Apply dropout output = tf.nn.dropout(output, self.keep_prob) return attn_masked_logits, attn_prob_dist, output
def build_graph(self, values, values_mask, keys): """ Keys attend to values. For each key, return an attention distribution and an attention output vector. Inputs: values: Tensor shape (batch_size, num_values, value_vec_size). values_mask: Tensor shape (batch_size, num_values). 1s where there's real input, 0s where there's padding keys: Tensor shape (batch_size, num_keys, value_vec_size) Outputs: attn_dist: Tensor shape (batch_size, num_keys, num_values). For each key, the distribution should sum to 1, and should be 0 in the value locations that correspond to padding. output: Tensor shape (batch_size, num_keys, hidden_size). This is the attention output; the weighted sum of the values (using the attention distribution as weights). """ with vs.variable_scope("GatedDotAttn"): # Calculate attention distribution values_t = tf.transpose( values, perm=[0, 2, 1]) # (batch_size, value_vec_size, num_values) attn_logits = tf.matmul( keys, values_t) # shape (batch_size, num_keys, num_values) attn_logits_mask = tf.expand_dims( values_mask, 1) # shape (batch_size, 1, num_values) _, attn_dist = masked_softmax( attn_logits, attn_logits_mask, 2 ) # shape (batch_size, num_keys, num_values). take softmax over values # Use attention distribution to take weighted sum of values output = tf.matmul( attn_dist, values) # shape (batch_size, num_keys, value_vec_size) # Blend output = tf.concat([keys, output], axis=2) # Apply dropout output = tf.nn.dropout(output, self.keep_prob) # Compute gate with tf.variable_scope('c2qgate'): shape = tf.shape(output) dim = output.get_shape().as_list()[-1] flatten = tf.reshape(output, (-1, dim)) W = tf.get_variable('Wc2gate', (dim, dim)) gate = tf.matmul(flatten, W) gate = tf.reshape(gate, shape) gate = tf.nn.sigmoid(gate) output = gate * output return attn_dist, output
def build_graph(self, keys, keys_mask):
    with vs.variable_scope("Attention"):
        dense_layer_1 = partial(
            tf.layers.dense,
            activation=None,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l1_regularizer(0.001))
        dense_layer_2 = partial(
            tf.layers.dense,
            activation=None,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l1_regularizer(0.001))
        projected_keys_1 = dense_layer_1(keys, self.hidden_vec_size)  # (batch_size, num_keys, hidden_vec_size)
        projected_keys_2 = dense_layer_2(keys, self.hidden_vec_size)  # (batch_size, num_keys, hidden_vec_size)

        # Additive (Bahdanau-style) self-attention scores: score[b, i, j] = v^T tanh(W1 k_i + W2 k_j)
        keys_t = tf.expand_dims(projected_keys_1, 2) + tf.expand_dims(projected_keys_2, 1)
        keys_t.set_shape([
            self.FLAGS.batch_size, self.FLAGS.context_len,
            self.FLAGS.context_len, self.hidden_vec_size
        ])
        keys_t = tf.nn.tanh(keys_t)
        V = partial(
            tf.layers.dense,
            activation=None,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l1_regularizer(0.001))
        self_attn_keys = tf.squeeze(V(keys_t, 1), axis=3)  # (batch_size, num_keys, num_keys)

        # Softmax over the last dimension (the attended-to keys), masking out padded positions.
        _, self_attn_softmax = masked_softmax(self_attn_keys,
                                              tf.expand_dims(keys_mask, 1), 2)
        # (batch_size, num_keys, key_vec_size); the score matrix is square, so no transpose is needed here.
        output = tf.matmul(self_attn_softmax, keys)

        # Apply dropout
        output = tf.nn.dropout(output, self.keep_prob)

        return output
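# The additive self-attention above materializes a (batch_size, context_len, context_len,
# hidden_vec_size) tensor before the tanh, which dominates this layer's memory use. A quick
# back-of-the-envelope helper; the default values below are illustrative only, the real FLAGS
# values are not shown in this excerpt:
def _selfattn_score_tensor_bytes(batch_size=32, context_len=300, hidden_vec_size=200):
    # float32 -> 4 bytes per element
    return batch_size * context_len * context_len * hidden_vec_size * 4

# e.g. _selfattn_score_tensor_bytes() == 2,304,000,000 bytes (~2.3 GB) per copy of the score
# tensor, which is why keys_t.set_shape(...) alone does not reduce memory pressure here.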
def build_mult_graph(self, values, values_mask, keys, FLAGS):
    values = tf.nn.dropout(values, self.keep_prob)
    keys = tf.nn.dropout(keys, self.keep_prob)
    with vs.variable_scope("GatedDotAttn"):
        # Calculate attention distribution (scaled multiplicative attention on projected keys/values)
        values_ = tf.nn.relu(dense(values, FLAGS.hidden_size, 'values'))
        values_t = tf.transpose(values_, perm=[0, 2, 1])  # (batch_size, hidden_size, num_values)
        keys_ = tf.nn.relu(dense(keys, FLAGS.hidden_size, 'keys'))
        attn_logits = tf.matmul(keys_, values_t) / (FLAGS.hidden_size ** 0.5)  # (batch_size, num_keys, num_values)
        attn_logits_mask = tf.expand_dims(values_mask, 1)  # (batch_size, 1, num_values)
        _, attn_dist = masked_softmax(
            attn_logits, attn_logits_mask, 2)  # (batch_size, num_keys, num_values). take softmax over values

        # Use attention distribution to take weighted sum of values
        output = tf.matmul(attn_dist, values)  # (batch_size, num_keys, value_vec_size)

        # Blend
        output = tf.concat([keys, output], axis=2)

        # Apply dropout
        output = tf.nn.dropout(output, self.keep_prob)

        # Compute gate
        with tf.variable_scope('c2qgate'):
            shape = tf.shape(output)
            dim = output.get_shape().as_list()[-1]
            flatten = tf.reshape(output, (-1, dim))
            W = tf.get_variable('Wc2gate', (dim, dim))
            gate = tf.matmul(flatten, W)
            gate = tf.reshape(gate, shape)
            gate = tf.nn.sigmoid(gate)
            output = gate * output

        return attn_dist, output
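# build_mult_graph above calls a dense(inputs, hidden, scope) helper that is not part of this
# excerpt. The sketch below only mirrors that assumed call signature (a scoped wrapper around
# tf.layers.dense); the project's actual helper may use a bias, activation, or different
# initialization.
def dense_sketch(inputs, hidden, scope_name):
    # Project the last dimension of `inputs` to `hidden` units inside its own variable scope.
    with tf.variable_scope(scope_name):
        return tf.layers.dense(inputs, hidden, use_bias=False)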
def build_graph(self):
    # ENCODING
    unstack_context = self.e_context_embs
    unstack_qn = self.e_qn_embs
    with tf.variable_scope('encoding') as scope:
        # Encode the context and the question with a shared bidirectional GRU.
        # (Dropout on these cells and a configurable encoding size could be added later.)
        emb_fwd_cell = tf.contrib.rnn.GRUCell(self.FLAGS.hidden_size)
        emb_back_cell = tf.contrib.rnn.GRUCell(self.FLAGS.hidden_size)
        (c_fwd, c_back), _ = tf.nn.bidirectional_dynamic_rnn(
            emb_fwd_cell, emb_back_cell, unstack_context,
            tf.reduce_sum(self.context_mask, reduction_indices=1),
            dtype='float32')
        tf.get_variable_scope().reuse_variables()
        (qn_fwd, qn_back), _ = tf.nn.bidirectional_dynamic_rnn(
            emb_fwd_cell, emb_back_cell, unstack_qn,
            tf.reduce_sum(self.qn_mask, reduction_indices=1),
            dtype='float32')
        # The bidirectional RNN returns forward and backward outputs separately, so concatenate them.
        u_Q = tf.concat([qn_fwd, qn_back], 2)  # [batch, q_len, 2 * hidden_size]
        u_P = tf.concat([c_fwd, c_back], 2)    # [batch, c_len, 2 * hidden_size]
        u_Q = tf.nn.dropout(u_Q, self.keep_prob)
        u_P = tf.nn.dropout(u_P, self.keep_prob)

    # GATED ATTENTION
    v_P = []  # Attention states across time; each element is the state for one time step, shape [batch_size, hidden_size].
    print("Gated Attention")
    with tf.variable_scope('Attention_gated') as scope:
        W_uQ = tf.get_variable(
            'W_uQ',
            shape=(2 * self.FLAGS.hidden_size, self.FLAGS.hidden_size),
            initializer=tf.contrib.layers.xavier_initializer())
        W_uP = tf.get_variable(
            'W_uP',
            shape=(2 * self.FLAGS.hidden_size, self.FLAGS.hidden_size),
            initializer=tf.contrib.layers.xavier_initializer())
        W_vP = tf.get_variable(
            'W_vP',
            shape=(self.FLAGS.hidden_size, self.FLAGS.hidden_size),
            initializer=tf.contrib.layers.xavier_initializer())
        v_QP = tf.get_variable(
            'v_QP',
            shape=(self.FLAGS.hidden_size,),
            initializer=tf.contrib.layers.xavier_initializer())
        W_g_QP = tf.get_variable(
            'W_g_QP',
            shape=(4 * self.FLAGS.hidden_size, 4 * self.FLAGS.hidden_size))
        # TODO: add the dropout probability to FLAGS.
        QP_cell = tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.GRUCell(self.FLAGS.hidden_size), self.keep_prob)
        zeros_dim = tf.stack([tf.shape(u_Q)[0], self.FLAGS.hidden_size])
        QP_cell_hidden = tf.fill(zeros_dim, 0.0)

        for t in range(0, self.FLAGS.context_len):
            # TODO: hoist loop-invariant computation (e.g. WuQ_uQ) out of the loop.
            WuQ_uQ = tf.tensordot(u_Q, W_uQ, axes=[[2], [0]])  # [batch, q_len, hidden_size]
            # Slice a single context word: [batch_size, 1, 2 * hidden_size]
            u_P_t = tf.reshape(u_P[:, t, :], (-1, 1, 2 * self.FLAGS.hidden_size))
            WuP_uP = tf.tensordot(u_P_t, W_uP, axes=[[2], [0]])  # [batch, 1, hidden_size]
            if t == 0:
                s_t = tf.tensordot(tf.tanh(WuQ_uQ + WuP_uP), v_QP, axes=[[2], [0]])  # [batch, q_len]
            else:
                v_P_t = tf.reshape(v_P[t - 1], (-1, 1, self.FLAGS.hidden_size))  # [batch_size, 1, hidden_size]
                WvP_vP = tf.tensordot(v_P_t, W_vP, axes=[[2], [0]])  # [batch_size, 1, hidden_size]
                s_t = tf.tensordot(tf.tanh(WuQ_uQ + WuP_uP + WvP_vP), v_QP, axes=[[2], [0]])  # [batch, q_len]
            #a_t = tf.nn.softmax(s_t, 1)
            _, a_t = masked_softmax(s_t, self.qn_mask, 1)  # [batch, q_len]
            # [batch, q_len], [batch, q_len, 2*hidden_size] -> [batch, 2*hidden_size]
            c_t = tf.einsum('ij,ijk->ik', a_t, u_Q)
            uPt_ct = tf.concat([tf.squeeze(u_P_t), c_t], 1)  # [batch, 4 * hidden_size]
            g_t = tf.nn.sigmoid(tf.matmul(uPt_ct, W_g_QP))   # [batch, 4 * hidden_size]
            uPt_ct_star = tf.einsum('ij,ij->ij', g_t, uPt_ct)  # elementwise gate
            if t > 0:
                tf.get_variable_scope().reuse_variables()
            # Both the output and the hidden state are [batch_size, hidden_size].
            QP_output, QP_cell_hidden = QP_cell(uPt_ct_star, QP_cell_hidden)
            v_P.append(QP_output)

    v_P = tf.stack(v_P, 1)  # [batch, context_len, hidden_size]
    v_P = tf.nn.dropout(v_P, self.keep_prob)

    # SELF-MATCHING ATTENTION
    print("self attention")
    with tf.variable_scope("self_matching_attn") as scope:
        SM_input = []
        W_v_P = tf.get_variable(
            'W_v_P',
            shape=(self.FLAGS.hidden_size, self.FLAGS.hidden_size),
            initializer=tf.contrib.layers.xavier_initializer())
        W_v_P_tot = tf.get_variable(
            'W_v_P_tot',
            shape=(self.FLAGS.hidden_size, self.FLAGS.hidden_size),
            initializer=tf.contrib.layers.xavier_initializer())
        v_SM = tf.get_variable('v_SM', shape=(self.FLAGS.hidden_size,))

        for t in range(0, self.FLAGS.context_len):
            # Slice one v_P at time t: [batch_size, 1, hidden_size]
            v_j_P = tf.reshape(v_P[:, t, :], (-1, 1, self.FLAGS.hidden_size))
            WvP_vj = tf.tensordot(v_j_P, W_v_P, axes=[[2], [0]])       # [batch, 1, hidden_size]
            WvPtot_vP = tf.tensordot(v_P, W_v_P_tot, axes=[[2], [0]])  # [batch, context_len, hidden_size]
            s_t = tf.tensordot(tf.tanh(WvP_vj + WvPtot_vP), v_SM, axes=[[2], [0]])  # [batch, context_len]
            #a_t = tf.nn.softmax(s_t, 1)
            _, a_t = masked_softmax(s_t, self.context_mask, 1)
            c_t = tf.einsum('ij,ijk->ik', a_t, v_P)  # [batch, hidden_size]
            # Add the gate (parameter-free here: no weight matrix is applied before the sigmoid).
            vPt_ct = tf.concat([tf.squeeze(v_j_P), c_t], 1)  # [batch, 2 * hidden_size]
            g_t = tf.nn.sigmoid(vPt_ct)
            vPt_ct_star = tf.einsum('ij,ij->ij', g_t, vPt_ct)  # [batch, 2 * hidden_size]
            SM_input.append(vPt_ct_star)

        # (Some implementations stack and then unstack here; SM_input is used directly instead.)
        SM_input = tf.stack(SM_input, 1)  # [batch, context_len, 2 * hidden_size]
        SM_fwd_cell = tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.GRUCell(self.FLAGS.hidden_size), self.keep_prob)
        SM_back_cell = tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.GRUCell(self.FLAGS.hidden_size), self.keep_prob)
        (h_P_fwd, h_P_back), SM_final = tf.nn.bidirectional_dynamic_rnn(
            SM_fwd_cell, SM_back_cell, SM_input,
            tf.reduce_sum(self.context_mask, reduction_indices=1),
            dtype=tf.float32)
        h_P = tf.concat([h_P_fwd, h_P_back], 2)
        h_P = tf.nn.dropout(h_P, self.keep_prob)  # [batch, context_len, 2 * hidden_size]

    # OUTPUT
    print("output")
    with tf.variable_scope("Output") as scope:
        W_ruQ = tf.get_variable('W_ruQ', shape=(2 * self.FLAGS.hidden_size, 2 * self.FLAGS.hidden_size))
        V_rQ = tf.get_variable('V_rQ', shape=(self.FLAGS.question_len, 2 * self.FLAGS.hidden_size))
        W_vQ = tf.get_variable('W_vQ', shape=(2 * self.FLAGS.hidden_size, 2 * self.FLAGS.hidden_size))
        v_rQ = tf.get_variable('v_rQ', shape=(2 * self.FLAGS.hidden_size,))

        WuQ_ujQ = tf.tensordot(u_Q, W_ruQ, [[2], [0]])  # [batch, q_len, 2 * hidden_size]
        WvQ_VrQ = tf.tensordot(V_rQ, W_vQ, [[1], [0]])  # [q_len, 2 * hidden_size]
        # The addition broadcasts over the batch dimension; final shape [batch, q_len].
        s_t = tf.tensordot(tf.tanh(WuQ_ujQ + WvQ_VrQ), v_rQ, axes=[[2], [0]])
        _, a_t = masked_softmax(s_t, self.qn_mask, 1)
        rQ = tf.einsum('ij,ijk->ik', a_t, u_Q)
        rQ = tf.nn.dropout(rQ, self.keep_prob)  # [batch, 2 * hidden_size]
        h_a = rQ  # initial answer-pointer state

        p_t = [None] * 2
        W_hP = tf.get_variable('W_hP', shape=(2 * self.FLAGS.hidden_size, self.FLAGS.hidden_size))
        W_ha = tf.get_variable('W_ha', shape=(2 * self.FLAGS.hidden_size, self.FLAGS.hidden_size))
        v_ap = tf.get_variable('v_ap', shape=(self.FLAGS.hidden_size,))  # answer-pointer scoring vector
        ans_cell = tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.GRUCell(2 * self.FLAGS.hidden_size), self.keep_prob)

        for t in range(0, 2):  # run the RNN twice: once for the start pointer, once for the end pointer
            WhP_hP = tf.tensordot(h_P, W_hP, [[2], [0]])  # [batch, context_len, hidden_size]
            Wha_ha = tf.reshape(
                tf.tensordot(h_a, W_ha, [[1], [0]]),
                (-1, 1, self.FLAGS.hidden_size))  # [batch, 1, hidden_size]
            s_t = tf.tensordot(tf.tanh(WhP_hP + Wha_ha), v_ap, axes=[[2], [0]])  # [batch, context_len]
            #a_t = tf.nn.softmax(s_t, 1)
            _, a_t = masked_softmax(s_t, self.context_mask, 1)
            if t == 0:
                self.logits_start = a_t  # [batch, context_len]
            else:
                self.logits_end = a_t
            c_t = tf.einsum('ij,ijk->ik', a_t, h_P)  # [batch, 2 * hidden_size]
            if t == 0:
                h_a, _ = ans_cell(c_t, h_a)  # h_a: [batch, 2 * hidden_size]
    print("complete")
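# The pattern tf.einsum('ij,ijk->ik', a_t, u_Q) used repeatedly above is a batched weighted sum:
# for each batch element it pools the time dimension of u_Q with the attention weights a_t.
# The function below is only an illustrative equivalent (names are placeholders, not model code):
def attention_pool_sketch(a_t, u_Q):
    # a_t: [batch, seq_len] attention weights; u_Q: [batch, seq_len, dim].
    # Returns [batch, dim] = sum_j a_t[b, j] * u_Q[b, j, :], same as tf.einsum('ij,ijk->ik', a_t, u_Q).
    return tf.reduce_sum(tf.expand_dims(a_t, 2) * u_Q, axis=1)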
def build_graph(self): """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span. Defines: self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len). These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution. Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function. self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1. These are the result of taking (masked) softmax of logits_start and logits_end. """ # Use a RNN to get hidden states for the context and the question # Note: here the RNNEncoder is shared (i.e. the weights are the same) # between the context and the question. if self.FLAGS.model == "baseline" : encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob) elif self.FLAGS.model == "bidaf" or self.FLAGS.model == "bidaf_dynamic" or self.FLAGS.model=="bidaf_self_attn" or self.FLAGS.model=="bidaf_dynamic_self_attn": print("INSIDE the BIDAF model") encoder = RNNEncoder_LSTM(self.FLAGS.hidden_size, self.keep_prob) elif self.FLAGS.model == "coatt" or self.FLAGS.model == "coatt_dynamic" or self.FLAGS.model=="coatt_dynamic_self_attn": encoder = LSTMEncoder(self.FLAGS.hidden_size, self.keep_prob) if self.FLAGS.model != "coatt" and self.FLAGS.model != "coatt_dynamic" and self.FLAGS.model!="coatt_dynamic_self_attn": context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2) question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2) # Attention model # Use context hidden states to attend to question hidden states if self.FLAGS.model == "baseline" : attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2) _,attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2) # Concat attn_output to context_hiddens to get blended_reps blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final,self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final,self.context_mask) # Attention model # Use context hidden states to attend to question hidden states if self.FLAGS.model == "coatt" : #context_hiddens = encoder.build_graph(self.context_embs, 
self.context_mask, "context") # (batch_size, context_len, hidden_size*2) #question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask, "question") # (batch_size, question_len, hidden_size*2) context_hiddens, question_hiddens = encoder.build_graph1(self.context_embs, self.qn_embs, self.context_mask, self.qn_mask) attn_layer = CoAttention(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2) attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) blended_reps_final = attn_output #blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default #blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final,self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): contextLen = tf.reduce_sum(self.context_mask, axis=1) cell = tf.contrib.rnn.LSTMBlockCell(2 * self.FLAGS.hidden_size) (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(cell, cell, attn_output, contextLen, dtype = tf.float32) U_1 = tf.concat([fw_out, bw_out], axis=2) out = tf.nn.dropout(U_1, self.keep_prob) softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph(out,self.context_mask) elif self.FLAGS.model =="bidaf" or self.FLAGS.model=="bidaf_self_attn": attn_layer = BiDafAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2) attn_output_tmp = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8) # Set of vectors which produces a set of query aware feature vectors for each word in the context #blended_reps = attn_output #(batch_size, num_keys, 4*value_vec_size) if self.FLAGS.model == "bidaf_self_attn": self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8) _,self_attn_output = self_attn_layer.build_graph(attn_output_tmp, self.context_mask) #(batch_size, conetx_len, 8*hidden_size) attn_output = tf.concat([attn_output_tmp, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size) else: attn_output = attn_output_tmp # In BIDAF the attention output is feed to a modeling layer # The Modeling layer is a 2 layer lstm mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob) mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask) # (batch_size, context_len, hidden_size*2) blended_reps_start = tf.concat([attn_output,mod_layer_out], axis=2) # (batch_size, context_len, hidden_size*10) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with 
vs.variable_scope("StartDist"): softmax_layer_start = SimpleSoftmaxLayer() self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_start, self.context_mask) # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): # Concatenate the start logits with the modelling layer output to get the input to the # end word lstm #self.logits_start has a shape of #(batch_size, context_len) logits_start_expand = tf.expand_dims(self.logits_start, axis=2) #(batch_size, context_len, 1) end_lstm_input = tf.concat([logits_start_expand, mod_layer_out], axis=2) #(batch_size, context_len, 1 + hidden_size*2) # LSTM end_layer = END_WORD_LAYER(self.FLAGS.hidden_size, self.keep_prob) blended_reps_end = end_layer.build_graph(end_lstm_input, self.context_mask) blended_reps_end_final = tf.concat([attn_output, blended_reps_end], axis=2) softmax_layer_end = SimpleSoftmaxLayer() self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_end_final, self.context_mask) elif self.FLAGS.model =="bidaf_dynamic" or self.FLAGS.model =="bidaf_dynamic_self_attn": attn_layer = BiDafAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2) attn_output_tmp = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8) if self.FLAGS.model == "bidaf_dynamic_self_attn": self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8) _,self_attn_output = self_attn_layer.build_graph(attn_output_tmp,self.context_mask) # (batch_size, conetx_len, 8*hidden_size) attn_output = tf.concat([attn_output_tmp, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size) else: attn_output = attn_output_tmp # Set of vectors which produces a set of query aware feature vectors for each word in the context #blended_reps = attn_output #(batch_size, num_keys, 4*value_vec_size) # In BIDAF the attention output is feed to a modeling layer # The Modeling layer is a 2 layer lstm mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob) mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask) # (batch_size, context_len, hidden_size*2) blended_reps_start = tf.concat([attn_output,mod_layer_out], axis=2) # (batch_size, context_len, hidden_size*10) # We now feed this to dynamic decoder module coded in Answer decoder # the output of the decoder are start, end, alpha_logits and beta_logits # start and end have a shape of (batch_size, num_iterations) #alpha_logits and beta_logits have a shape of (batch_size, num_iterations, inpit_dim) decoder = ANSWER_DECODER(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations, self.FLAGS.max_pool, self.FLAGS.batch_size) u_s_init = mod_layer_out[:,0,:] u_e_init = mod_layer_out[:,0,:] start_location, end_location, alpha_logits, beta_logits = decoder.build_graph(mod_layer_out, self.context_mask, u_s_init, u_e_init) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): #softmax_layer_start = SimpleSoftmaxLayer() logits_start_tmp = [masked_softmax(logits, self.context_mask,1) for logits in alpha_logits] self.alpha_logits , alpha_logits_probs = zip(*logits_start_tmp) 
self.logits_start, self.probdist_start = self.alpha_logits[self.FLAGS.num_iterations -1], alpha_logits_probs[self.FLAGS.num_iterations -1] # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): logits_end_tmp = [masked_softmax(logits, self.context_mask,1) for logits in beta_logits] self.beta_logits , beta_logits_probs = zip(*logits_end_tmp) self.logits_end, self.probdist_end = self.beta_logits[self.FLAGS.num_iterations -1], beta_logits_probs[self.FLAGS.num_iterations -1] elif self.FLAGS.model =="coatt_dynamic" or self.FLAGS.model == "coatt_dynamic_self_attn": context_hiddens, question_hiddens = encoder.build_graph1(self.context_embs, self.qn_embs, self.context_mask, self.qn_mask) attn_layer = CoAttention(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2) if self.FLAGS.model == "coatt_dynamic_self_attn": CoATT = attn_layer.build_graph1(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8) _, self_attn_output = self_attn_layer.build_graph(CoATT, self.context_mask) # (batch_size, conetx_len, 8*hidden_size) attn_output = tf.concat([CoATT, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size) else: U = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) attn_output = U #blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # Apply fully connected layer to each blended representation # Note, blended_reps_final corresponds to b' in the handout # Note, tf.contrib.layers.fully_connected applies a ReLU non-linarity here by default decoder = ANSWER_DECODER(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations, self.FLAGS.max_pool, self.FLAGS.batch_size) u_s_init = attn_output[:,0,:] u_e_init = attn_output[:,0,:] start_location, end_location, alpha_logits, beta_logits = decoder.build_graph(attn_output, self.context_mask, u_s_init, u_e_init) # Use softmax layer to compute probability distribution for start location # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len) with vs.variable_scope("StartDist"): #softmax_layer_start = SimpleSoftmaxLayer() logits_start_tmp = [masked_softmax(logits, self.context_mask,1) for logits in alpha_logits] self.alpha_logits , alpha_logits_probs = zip(*logits_start_tmp) self.logits_start, self.probdist_start = self.alpha_logits[self.FLAGS.num_iterations -1], alpha_logits_probs[self.FLAGS.num_iterations -1] # Use softmax layer to compute probability distribution for end location # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len) with vs.variable_scope("EndDist"): logits_end_tmp = [masked_softmax(logits, self.context_mask,1) for logits in beta_logits] self.beta_logits , beta_logits_probs = zip(*logits_end_tmp) self.logits_end, self.probdist_end = self.beta_logits[self.FLAGS.num_iterations -1], beta_logits_probs[self.FLAGS.num_iterations -1]
def build_graph(self, values, values_mask, keys, keys_mask):
    with vs.variable_scope("Attention"):
        dense_layer1 = partial(
            tf.layers.dense,
            activation=None,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l1_regularizer(0.001))
        dense_layer2 = partial(
            tf.layers.dense,
            activation=None,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l1_regularizer(0.001))
        score1 = dense_layer1(keys, 1)    # (batch_size, num_keys, 1)
        score2 = dense_layer2(values, 1)  # (batch_size, num_values, 1)

        # Version 1: too much memory. Or do (batch, k_len, 1, ndim) * (batch, 1, v_len, ndim).
        #k = tf.expand_dims(tf.transpose(keys, perm=[0, 2, 1]), 3)   # (batch_size, hidden_size, num_keys, 1)
        #v = tf.expand_dims(tf.transpose(values, perm=[0, 2, 1]), 2)
        #matrix = tf.transpose(tf.matmul(k, v), perm=[0, 2, 3, 1])

        # Version 2: seems infeasible (py_func over Python loops).
        #def matrix_func(keys, values, weight):
        #    mat = np.zeros(self.shape)
        #    for k in xrange(self.shape[0]):
        #        for i in xrange(self.shape[1]):
        #            for j in xrange(self.shape[2]):
        #                for m in xrange(self.vec_size):
        #                    mat[k, i, j] += weight[m] * keys[k, i, m] * values[k, j, m]
        #    return mat
        #weight = tf.Variable(tf.random_normal([self.vec_size]), dtype=tf.float32, name="similarity_weight_3")
        #similarity_scores = tf.cast(tf.py_func(matrix_func, [keys, values, weight], tf.double), tf.float32)
        #similarity_scores.set_shape(self.shape[0:])

        # Version 3: memory efficient. Fold the per-channel weight into the keys first,
        # then multiply the result with the values.
        weight = tf.Variable(tf.random_normal([1, 1, self.hidden_vec_size]),
                             dtype=tf.float32, name="similarity_weight_3")
        weighted_keys = weight * keys
        similarity_scores = tf.matmul(weighted_keys, tf.transpose(values, perm=[0, 2, 1]))
        similarity_scores = score1 + tf.transpose(score2, perm=[0, 2, 1]) + similarity_scores  # (batch_size, num_keys, num_values)

        attn_logits_mask = tf.expand_dims(values_mask, 1)  # (batch_size, 1, num_values)
        _, C2Q_softmax = masked_softmax(
            similarity_scores, attn_logits_mask, 2)  # (batch_size, num_keys, num_values). take softmax over values
        C2Q_output = tf.matmul(C2Q_softmax, values)  # (batch_size, num_keys, value_vec_size)

        max_i = tf.reduce_max(similarity_scores, 2)           # (batch_size, num_keys)
        _, Q2C_softmax = masked_softmax(max_i, keys_mask, 1)  # (batch_size, num_keys)
        Q2C_softmax = tf.expand_dims(Q2C_softmax, -1)
        Q2C_output = tf.reduce_sum(Q2C_softmax * keys, 1, keepdims=True)  # (batch_size, 1, key_vec_size)
        # or: Q2C_output = tf.matmul(tf.transpose(keys, (0, 2, 1)), tf.expand_dims(Q2C_softmax, -1))

        output = tf.concat([keys, C2Q_output,
                            tf.broadcast_to(Q2C_output, tf.shape(keys))], 2)

        # Apply dropout
        output = tf.nn.dropout(output, self.keep_prob)

        return output
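# The "version 3" trick above relies on folding the per-channel weight into the keys:
# sum_m w[m] * K[b, i, m] * V[b, j, m] equals ((w * K) @ V^T)[b, i, j], so a single batched matmul
# suffices. The standalone sketch below checks that identity on random data; it is illustrative
# only and not part of the model graph.
def _similarity_identity_check():
    with tf.Graph().as_default():
        k = tf.random_normal([2, 5, 4])   # (batch, num_keys, hidden)
        v = tf.random_normal([2, 7, 4])   # (batch, num_values, hidden)
        w = tf.random_normal([1, 1, 4])   # per-channel similarity weight
        folded = tf.matmul(w * k, tf.transpose(v, perm=[0, 2, 1]))  # memory-efficient form used above
        # Direct definition: sum_m w[m] * k[b, i, m] * v[b, j, m]
        explicit = tf.reduce_sum(
            tf.expand_dims(k, 2) * tf.expand_dims(v, 1) * tf.reshape(w, [1, 1, 1, 4]), axis=3)
        with tf.Session() as sess:
            a, b = sess.run([folded, explicit])
            print(abs(a - b).max())  # ~0 up to float error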