def add_global_voting_op(self):
    with tf.variable_scope("global_voting"):
        self.final_scores_before_global = -(1 - self.loss_mask) * 50 + self.final_scores
        gmask = tf.to_float(((self.final_scores_before_global - self.args.global_thr) >= 0))  # [b, s, 30]
        masked_entity_emb = self.pure_entity_embeddings * tf.expand_dims(gmask, axis=3)  # [b,s,30,300] * [b,s,30,1]
        batch_size = tf.shape(masked_entity_emb)[0]
        all_voters_emb = tf.reduce_sum(tf.reshape(masked_entity_emb, [batch_size, -1, 300]),
                                       axis=1, keep_dims=True)  # [b, 1, 300]
        span_voters_emb = tf.reduce_sum(masked_entity_emb, axis=2)  # [batch, num_of_spans, 300]
        valid_voters_emb = all_voters_emb - span_voters_emb
        # [b, 1, 300] - [batch, spans, 300] = [batch, spans, 300] (broadcasting)
        valid_voters_emb = tf.nn.l2_normalize(valid_voters_emb, dim=2)

        self.global_voting_scores = tf.squeeze(
            tf.matmul(self.pure_entity_embeddings, tf.expand_dims(valid_voters_emb, axis=3)), axis=3)
        # [b,s,30,300] matmul [b,s,300,1] --> [b,s,30,1] --> [b,s,30]

        scalar_predictors = tf.stack([self.final_scores_before_global, self.global_voting_scores], 3)
        #print("scalar_predictors = ", scalar_predictors)  # [b, s, 30, 2]
        with tf.variable_scope("psi_and_global_ffnn"):
            if self.args.global_score_ffnn[0] == 0:
                self.final_scores = util.projection(scalar_predictors, 1)
            else:
                hidden_layers, hidden_size = self.args.global_score_ffnn[0], self.args.global_score_ffnn[1]
                self.final_scores = util.ffnn(scalar_predictors, hidden_layers, hidden_size, 1,
                                              self.dropout if self.args.ffnn_dropout else None)
            # [batch, num_mentions, 30, 1] squeeze to [batch, num_mentions, 30]
            self.final_scores = tf.squeeze(self.final_scores, axis=3)
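# --- illustration (not part of the model) ---
# A minimal numpy sketch of the "-(1 - loss_mask) * 50" trick used above, with toy
# values (global_thr assumed 0.0): padded candidate slots (mask == 0) are pushed
# ~50 below any real score, so they can never pass the threshold gate that builds gmask.
import numpy as np

final_scores = np.array([[2.1, 0.3, -0.5]])   # hypothetical local scores: one span, 3 candidates
loss_mask = np.array([[1.0, 1.0, 0.0]])       # third candidate is padding
scores_before_global = -(1 - loss_mask) * 50 + final_scores
gmask = (scores_before_global - 0.0 >= 0).astype(np.float32)
print(scores_before_global)  # [[  2.1   0.3 -50.5]]
print(gmask)                 # [[1. 1. 0.]]  -> padding never votes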
def add_cand_ent_scores_op(self):
    self.log_cand_entities_scores = tf.log(
        tf.minimum(1.0, tf.maximum(self.args.zero, self.cand_entities_scores)))
    stack_values = []
    if self.args.nn_components.find("lstm") != -1:
        stack_values.append(self.similarity_scores)
    if self.args.nn_components.find("pem") != -1:
        stack_values.append(self.log_cand_entities_scores)
    if self.args.nn_components.find("attention") != -1:
        stack_values.append(self.attention_scores)
    scalar_predictors = tf.stack(stack_values, 3)
    #print("scalar_predictors = ", scalar_predictors)  # [batch, num_mentions, 30, 3]

    with tf.variable_scope("similarity_and_prior_ffnn"):
        if self.args.final_score_ffnn[0] == 0:
            self.final_scores = util.projection(scalar_predictors, 1)  # [batch, num_mentions, 30, 1]
        else:
            hidden_layers, hidden_size = self.args.final_score_ffnn[0], self.args.final_score_ffnn[1]
            self.final_scores = util.ffnn(scalar_predictors, hidden_layers, hidden_size, 1,
                                          self.dropout if self.args.ffnn_dropout else None)
        self.final_scores = tf.squeeze(self.final_scores, axis=3)  # squeeze to [batch, num_mentions, 30]
def add_cand_ent_scores_op(self):
    # combine the candidate entity scores (and possibly some extra features) and pass them through a simple ffnn
    stack_values = []
    if self.args.nn_components.find("lstm") != -1:
        stack_values.append(self.similarity_scores)
    if self.args.nn_components.find("pem") != -1:
        # TODO rename to pem_scores
        self.log_cand_entities_scores = self.custom_pem(
            self.args.pem_without_log, self.args.pem_buckets_boundaries)
        stack_values.append(self.log_cand_entities_scores)
    if self.args.nn_components.find("attention") != -1:
        stack_values.append(self.attention_scores)
    if len(stack_values) == 1:
        # only one scalar predictor, so omit the final ffnn
        self.final_scores = stack_values[0]
        return
    scalar_predictors = tf.stack(stack_values, 3)
    #print("scalar_predictors = ", scalar_predictors)  # [batch, num_mentions, 30, 2]
    with tf.variable_scope("similarity_and_prior_ffnn"):
        if self.args.final_score_ffnn[0] == 0:
            self.final_scores = util.projection(scalar_predictors, 1, model=self)
        else:
            hidden_layers, hidden_size = self.args.final_score_ffnn[0], self.args.final_score_ffnn[1]
            self.final_scores = util.ffnn(scalar_predictors, hidden_layers, hidden_size, 1,
                                          self.dropout if self.args.ffnn_dropout else None, model=self)
        self.final_scores = tf.squeeze(self.final_scores, axis=3)
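# --- illustration (not part of the model) ---
# A numpy sketch of how tf.stack(stack_values, 3) fuses the per-component scores,
# with toy shapes: each component contributes a [batch, num_mentions, 30] score
# tensor; stacking on axis 3 yields [batch, num_mentions, 30, n_components], which
# the final projection/ffnn maps back to a single score per candidate entity.
import numpy as np

batch, mentions, cands = 2, 4, 30
similarity = np.random.rand(batch, mentions, cands)
log_pem = np.random.rand(batch, mentions, cands)
attention = np.random.rand(batch, mentions, cands)
stacked = np.stack([similarity, log_pem, attention], axis=3)
print(stacked.shape)  # (2, 4, 30, 3) -> one 3-dim feature vector per candidate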
def add_lstm_score_op(self): #print("cand_entities = ", self.cand_entities) with tf.variable_scope("span_emb_ffnn"): # [batch, num_mentions, 300] # the span embedding can have different size depending on the chosen hyperparameters. We project it to 300 # dims to match the entity embeddings (formula 4) if self.args.span_emb_ffnn[0] == 0: span_emb_projected = util.projection(self.span_emb, 300, model=self) else: hidden_layers, hidden_size = self.args.span_emb_ffnn[ 0], self.args.span_emb_ffnn[1] span_emb_projected = util.ffnn( self.span_emb, hidden_layers, hidden_size, 300, self.dropout if self.args.ffnn_dropout else None, model=self) #print("span_emb_projected = ", span_emb_projected) # formula (6) <x^m, y_j> computation. this is the lstm score scores = tf.matmul(tf.expand_dims(span_emb_projected, 2), self.entity_embeddings, transpose_b=True) #print("scores = ", scores) self.similarity_scores = tf.squeeze( scores, axis=2) # [batch, num_mentions, 1, 30]
def add_lstm_score_op(self): #print("cand_entities = ", self.cand_entities) with tf.variable_scope("span_emb_ffnn"): # [batch, num_mentions, 300] if self.args.span_emb_ffnn[0] == 0: span_emb_projected = util.projection(self.span_emb, 300, model=self) else: hidden_layers, hidden_size = self.args.span_emb_ffnn[ 0], self.args.span_emb_ffnn[1] span_emb_projected = util.ffnn( self.span_emb, hidden_layers, hidden_size, 300, self.dropout if self.args.ffnn_dropout else None, model=self) #print("span_emb_projected = ", span_emb_projected) scores = tf.matmul(tf.expand_dims(span_emb_projected, 2), self.entity_embeddings, transpose_b=True) #print("scores = ", scores) self.similarity_scores = tf.squeeze( scores, axis=2) # [batch, num_mentions, 1, 30]
def add_lstm_score_op(self):
    with tf.variable_scope("span_emb_ffnn"):
        # [batch, num_mentions, 300]
        # the span embedding can have a different size depending on the chosen hyperparameters.
        # We project it to 300 dims to match the entity embeddings (formula 4)
        if self.args.span_emb_ffnn[0] == 0:
            span_emb_projected = util.projection(self.span_emb, 300)
        else:
            hidden_layers, hidden_size = self.args.span_emb_ffnn[0], self.args.span_emb_ffnn[1]
            span_emb_projected = util.ffnn(self.span_emb, hidden_layers, hidden_size, 300,
                                           self.dropout if self.args.ffnn_dropout else None)
        #print("span_emb_projected = ", span_emb_projected)
    # formula (6) <x^m, y_j> computation. this is the lstm score
    coeffs = tf.nn.softmax(tf.matmul(span_emb_projected[:, :, None, None, :],
                                     self.entity_embeddings, transpose_b=True))
    coeffs = tf.transpose(coeffs, [0, 1, 2, 4, 3])
    ent_emb = tf.reduce_sum(coeffs * self.entity_embeddings, -2)
    scores = tf.matmul(tf.expand_dims(span_emb_projected, 2), ent_emb, transpose_b=True)
    #print("scores = ", scores)
    self.similarity_scores = tf.squeeze(scores, axis=2)  # [batch, num_mentions, 1, 30] -> [batch, num_mentions, 30]
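# --- illustration (not part of the model) ---
# A numpy sketch of the <x^m, y_j> similarity computed above, with toy dimensions:
# the span embedding is projected to 300-d and dotted against each of the 30
# candidate entity vectors, giving one scalar per (mention, candidate) pair.
import numpy as np

batch, mentions, cands, dim = 2, 4, 30, 300
span_emb_projected = np.random.rand(batch, mentions, dim)
entity_embeddings = np.random.rand(batch, mentions, cands, dim)
similarity = np.einsum('bmd,bmcd->bmc', span_emb_projected, entity_embeddings)
print(similarity.shape)  # (2, 4, 30) -- same result as expand_dims + matmul + squeeze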
def add_local_attention_op(self):
    attention_entity_emb = self.pure_entity_embeddings \
        if self.args.attention_ent_vecs_no_regularization else self.entity_embeddings
    with tf.variable_scope("attention"):
        K = self.args.attention_K
        left_mask = self._sequence_mask_v13(self.begin_span, K)  # number of words on the left (left window)
        right_mask = self._sequence_mask_v13(tf.expand_dims(self.words_len, 1) - self.end_span, K)
        # number of words on the right; we never take more than K even if more words exist.
        ctxt_mask = tf.concat([left_mask, right_mask], 2)  # [batch, num_of_spans, 2*K]
        ctxt_mask = tf.log(tf.minimum(1.0, tf.maximum(self.args.zero, ctxt_mask)))
        # T, T, T, F, F | T, T, F, F, F

        # -1, -2, -3, -4, -5    +0, +1, +2, +3, +4
        leftctxt_indices = tf.maximum(0, tf.range(-1, -K - 1, -1) +
                                      tf.expand_dims(self.begin_span, 2))  # [batch, num_mentions, K]
        rightctxt_indices = tf.minimum(tf.shape(self.pure_word_embeddings)[1] - 1,
                                       tf.range(K) + tf.expand_dims(self.end_span, 2))  # [batch, num_mentions, K]
        ctxt_indices = tf.concat([leftctxt_indices, rightctxt_indices], 2)  # [batch, num_mentions, 2*K]
        batch_index = tf.tile(
            tf.expand_dims(tf.expand_dims(tf.range(tf.shape(ctxt_indices)[0]), 1), 2),
            [1, tf.shape(ctxt_indices)[1], tf.shape(ctxt_indices)[2]])
        ctxt_indices = tf.stack([batch_index, ctxt_indices], 3)
        # [batch, num_of_spans, 2*K, 2] the last dimension is row,col for gather_nd
        # [batch, num_of_spans, 2*K, [row,col]]

        att_x_w = self.pure_word_embeddings  # [batch, max_sent_len, 300]
        if self.args.attention_on_lstm and self.args.nn_components.find("lstm") != -1:
            # ablation: here the attention is computed on the output of the lstm layer x_k instead of
            # using the pure word2vec vectors (word2vec is used in the paper).
            att_x_w = util.projection(self.context_emb, 300)
            # if tf.shape(self.context_emb)[-1] != 300 else self.context_emb

        ctxt_word_emb = tf.gather_nd(att_x_w, ctxt_indices)
        # [batch, num_of_spans, 2K, emb_size]  emb_size = 300, only the pure word emb is used (word2vec)
        # and not the version after char emb and dropout are added

        # in this implementation we don't use the diagonal A and B arrays that are mentioned in
        # Ganea and Hofmann 2017 (only used in the ablations)
        temp = attention_entity_emb
        if self.args.attention_use_AB:
            att_A = tf.get_variable("att_A", [300])
            temp = att_A * attention_entity_emb
        scores = tf.matmul(ctxt_word_emb, temp, transpose_b=True)
        scores = tf.reduce_max(scores, reduction_indices=[-1])
        # max score of each word for each span acquired from any cand entity
        scores = scores + ctxt_mask  # words outside the window are not valid, so we assign them a very low score
        top_values, _ = tf.nn.top_k(scores, self.args.attention_R)  # [batch, num_of_spans, R]
        R_value = top_values[:, :, -1]  # [batch, num_of_spans]
        R_value = tf.maximum(self.args.zero, R_value)
        # so as to avoid keeping words whose max score with any of the entities is <= 0 (score = 0 can
        # also come from padding candidate entities)
        threshold = tf.tile(tf.expand_dims(R_value, 2), [1, 1, 2 * K])  # [batch, num_of_spans, 2K]
        scores = scores - tf.to_float(((scores - threshold) < 0)) * 50
        # subtract 50 where score < thr, 0 where score >= thr
        scores = tf.nn.softmax(scores, dim=2)  # [batch, num_of_spans, 2K]
        scores = tf.expand_dims(scores, 3)  # [batch, num_of_spans, 2K, 1]
        # [batch, num_of_spans, 2K, 1] * [batch, num_of_spans, 2K, emb_size]
        # = [batch, num_of_spans, 2K, emb_size]
        x_c = tf.reduce_sum(scores * ctxt_word_emb, 2)  # [batch, num_of_spans, emb_size]
        if self.args.attention_use_AB:
            att_B = tf.get_variable("att_B", [300])
            x_c = att_B * x_c
        x_c = tf.expand_dims(x_c, 3)  # [batch, num_of_spans, emb_size, 1]
        # [batch, num_of_spans, 30, emb_size=300] mul with [batch, num_of_spans, emb_size, 1]
        x_e__x_c = tf.matmul(attention_entity_emb, x_c)  # [batch, num_of_spans, 30, 1]
        x_e__x_c = tf.squeeze(x_e__x_c, axis=3)  # [batch, num_of_spans, 30]
        self.attention_scores = x_e__x_c
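# --- illustration (not part of the model) ---
# A numpy sketch of the additive log-mask used above, assuming args.zero ~ 1e-6:
# tf.log(tf.minimum(1.0, tf.maximum(zero, mask))) maps a {0,1} validity mask to
# {log(1e-6) ~ -13.8, 0}, so adding it to the scores before softmax drives invalid
# positions to (near) zero probability without an explicit tf.where().
import numpy as np

zero = 1e-6
mask = np.array([1.0, 1.0, 0.0, 0.0])  # two valid context words out of 2K = 4
log_mask = np.log(np.minimum(1.0, np.maximum(zero, mask)))
scores = np.array([2.0, 1.0, 3.0, 3.0]) + log_mask
probs = np.exp(scores) / np.exp(scores).sum()
print(probs.round(6))  # masked positions get ~0 attention weight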
def add_span_emb_op(self):
    mention_emb_list = []
    # span embedding based on boundaries (start, end) and head mechanism, computed either on top of the
    # contextual bilstm output or on top of the original word+char embeddings; this flag determines which.
    # The paper reports results using the contextual lstm emb as it achieves a better score.
    # Used for ablation studies.
    boundaries_input_vecs = self.word_embeddings if self.args.span_boundaries_from_wordemb else self.context_emb
    # the span embedding is modeled by g^m = [x_q; x_r; \hat{x}^m] (formula (2) of the paper)
    # "boundaries" means use x_q and x_r. "head" means also use the head mechanism \hat{x}^m (formula (3))
    if self.args.span_emb.find("boundaries") != -1:
        # shape (batch, num_of_cand_spans, emb)
        mention_start_emb = tf.gather_nd(boundaries_input_vecs, tf.stack(
            [tf.tile(tf.expand_dims(tf.range(tf.shape(self.begin_span)[0]), 1),
                     [1, tf.shape(self.begin_span)[1]]), self.begin_span], 2))
        # extracts the x_q embedding for each candidate span.
        # the tile command creates a 2d tensor with the batch information: the first row contains only
        # zeros, the second row ones, etc. The begin_span tensor holds, for each sentence, which word
        # is the beginning of the candidate span.
        mention_emb_list.append(mention_start_emb)

        mention_end_emb = tf.gather_nd(boundaries_input_vecs, tf.stack(
            [tf.tile(tf.expand_dims(tf.range(tf.shape(self.begin_span)[0]), 1),
                     [1, tf.shape(self.begin_span)[1]]), tf.nn.relu(self.end_span - 1)], 2))
        # -1 because the end of the span is exclusive [start, end);
        # relu so that 0 doesn't become -1 (of course no valid candidate span has end index zero, since [0,0) is empty)
        mention_emb_list.append(mention_end_emb)
        #print("mention_start_emb = ", mention_start_emb)
        #print("mention_end_emb = ", mention_end_emb)

    mention_width = self.end_span - self.begin_span  # [batch, num_mentions] the width of each candidate span

    if self.args.span_emb.find("head") != -1:  # here the attention is computed
        # here \hat{x}^m is computed (formulas (2) and (3))
        self.max_mention_width = tf.minimum(self.args.max_mention_width,
                                            tf.reduce_max(self.end_span - self.begin_span))
        mention_indices = tf.range(self.max_mention_width) + \
                          tf.expand_dims(self.begin_span, 2)  # [batch, num_mentions, max_mention_width]
        mention_indices = tf.minimum(tf.shape(self.word_embeddings)[1] - 1,
                                     mention_indices)  # [batch, num_mentions, max_mention_width]
        #print("mention_indices = ", mention_indices)
        batch_index = tf.tile(
            tf.expand_dims(tf.expand_dims(tf.range(tf.shape(mention_indices)[0]), 1), 2),
            [1, tf.shape(mention_indices)[1], tf.shape(mention_indices)[2]])
        mention_indices = tf.stack([batch_index, mention_indices], 3)
        # [batch, num_mentions, max_mention_width, [row,col]] 4d tensor

        # for the boundaries we had the option to take them either from x_k (output of the bilstm) or from v_k.
        # the head is derived either from the same option as the boundaries or from v_k.
        head_input_vecs = boundaries_input_vecs if self.args.model_heads_from_bilstm else self.word_embeddings
        mention_text_emb = tf.gather_nd(head_input_vecs, mention_indices)
        # [batch, num_mentions, max_mention_width, 500] 4d tensor
        #print("mention_text_emb = ", mention_text_emb)

        with tf.variable_scope("head_scores"):
            # from [batch, max_sent_len, 300] to [batch, max_sent_len, 1]
            self.head_scores = util.projection(boundaries_input_vecs, 1)
        # [batch, num_mentions, max_mention_width, 1]
        mention_head_scores = tf.gather_nd(self.head_scores, mention_indices)
        #print("mention_head_scores = ", mention_head_scores)

        # depending on the tensorflow version we do the same with different operations (since the candidate
        # spans are not all of the same length we mask out the invalid indices created above (mention_indices)).
        temp_mask = self._sequence_mask_v13(mention_width, self.max_mention_width)
        # masking invalid indices for the head computation
        mention_mask = tf.expand_dims(temp_mask, 3)  # [batch, num_mentions, max_mention_width, 1]
        mention_mask = tf.minimum(1.0, tf.maximum(self.args.zero, mention_mask))  # 1e-3
        # formula (3) computation
        mention_attention = tf.nn.softmax(mention_head_scores + tf.log(mention_mask),
                                          dim=2)  # [batch, num_mentions, max_mention_width, 1]
        mention_head_emb = tf.reduce_sum(mention_attention * mention_text_emb, 2)  # [batch, num_mentions, emb]
        #print("mention_head_emb = ", mention_head_emb)
        mention_emb_list.append(mention_head_emb)

    self.span_emb = tf.concat(mention_emb_list, 2)
    # [batch, num_mentions, emb i.e. 1700] formula (2) concatenation
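# --- illustration (not part of the model) ---
# A numpy sketch of the batch_index + tf.stack pattern used above for tf.gather_nd,
# with toy sizes: every word index is paired with its batch row so that gather_nd
# can pull one embedding per (sentence, word) coordinate.
import numpy as np

batch, sent_len, dim = 2, 5, 3
word_emb = np.arange(batch * sent_len * dim).reshape(batch, sent_len, dim)
begin_span = np.array([[0, 2], [1, 4]])                 # [batch, num_mentions]
batch_index = np.tile(np.arange(batch)[:, None], [1, begin_span.shape[1]])
indices = np.stack([batch_index, begin_span], axis=2)   # [batch, num_mentions, (row, col)]
start_emb = word_emb[indices[..., 0], indices[..., 1]]  # numpy analogue of tf.gather_nd
print(start_emb.shape)  # (2, 2, 3)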
def add_global_voting_op(self):
    with tf.variable_scope("global_voting"):
        self.final_scores_before_global = -(1 - self.loss_mask) * 50 + self.final_scores
        if self.args.global_topkfromallspans:
            batch_num = tf.shape(self.final_scores)[0]
            spans_num = tf.shape(self.final_scores)[1]  # num of spans
            cand_ent_num = tf.shape(self.final_scores)[2]  # 30
            new_size = spans_num * cand_ent_num
            temp = tf.diag(tf.ones([spans_num]))
            temp = tf.tile(tf.expand_dims(temp, axis=2), [1, 1, cand_ent_num])
            temp = tf.reshape(temp, [spans_num, new_size])
            mask = tf.reshape(tf.tile(tf.expand_dims(temp, axis=1), [1, cand_ent_num, 1]),
                              [new_size, new_size])
            mask = 1 - mask
            all_entities = tf.reshape(self.pure_entity_embeddings, [batch_num, new_size, 300])
            all_scores = tf.matmul(all_entities, all_entities, transpose_b=True)  # [batch, new_size, new_size]
            filtered_scores = all_scores * mask
            top_values, _ = tf.nn.top_k(filtered_scores, self.args.global_topkfromallspans)
            # [batch, new_size, K]
            if self.args.global_topkfromallspans_onlypositive:
                top_values = tf.maximum(top_values, self.args.zero)
                # so as to avoid keeping cand entities whose score is below this value even if they are
                # the top for this span
            self.global_voting_scores = tf.reduce_mean(top_values, axis=2)  # [batch, new_size]
            self.global_voting_scores = tf.reshape(self.global_voting_scores,
                                                   [batch_num, spans_num, cand_ent_num])
        else:
            if self.args.global_gmask_unambigious:
                gmask = self._sequence_mask_v13(tf.equal(self.cand_entities_len, 1),
                                                tf.shape(self.final_scores)[2])
            elif not self.args.global_topk:
                gmask = tf.to_float(((self.final_scores_before_global - self.args.global_thr) >= 0))  # [b,s,30]
            else:
                top_values, _ = tf.nn.top_k(self.final_scores_before_global,
                                            self.args.global_topk)  # [batch, num_of_spans, K]
                K_value = top_values[:, :, -1]  # [batch, num_of_spans]
                #if hasattr(self.args, 'global_topkthr'):
                if self.args.global_topkthr:
                    K_value = tf.maximum(self.args.global_topkthr, K_value)
                    # so as to avoid keeping cand entities whose score is below this value even if they
                    # are the top for this span. 30
                threshold = tf.tile(tf.expand_dims(K_value, 2),
                                    [1, 1, tf.shape(self.final_scores)[-1]])  # [batch, num_of_spans, 30]
                gmask = tf.to_float(((self.final_scores_before_global - threshold) >= 0))
            gmask = gmask * self.loss_mask
            if self.args.global_mask_scale_each_mention_voters_to_one:
                temp = tf.reduce_sum(gmask, axis=2, keep_dims=True)  # [batch, num_of_spans, 1]
                temp = tf.where(tf.less(temp, 1e-4), temp, 1. / (temp + 1e-4))
                gmask = gmask * temp
            elif self.args.global_gmask_based_on_localscore:
                gmask = gmask * tf.nn.softmax(self.final_scores_before_global)
            self.gmask = gmask

            masked_entity_emb = self.pure_entity_embeddings * tf.expand_dims(gmask, axis=3)
            # [b,s,30,300] * [b,s,30,1]
            batch_size = tf.shape(masked_entity_emb)[0]
            all_voters_emb = tf.reduce_sum(tf.reshape(masked_entity_emb, [batch_size, -1, 300]),
                                           axis=1, keep_dims=True)  # [b, 1, 300]
            span_voters_emb = tf.reduce_sum(masked_entity_emb, axis=2)  # [batch, num_of_spans, 300]
            valid_voters_emb = all_voters_emb - span_voters_emb
            # [b, 1, 300] - [batch, spans, 300] = [batch, spans, 300] (broadcasting)
            if self.args.global_norm_or_mean == "norm":
                valid_voters_emb = tf.nn.l2_normalize(valid_voters_emb, dim=2)
            else:
                all_voters_num = tf.reduce_sum(gmask)  # scalar
                span_voters_num = tf.reduce_sum(gmask, axis=2)  # [batch, spans]
                valid_voters_emb = valid_voters_emb / tf.expand_dims(all_voters_num - span_voters_num, axis=2)
            self.global_voting_scores = tf.squeeze(
                tf.matmul(self.pure_entity_embeddings, tf.expand_dims(valid_voters_emb, axis=3)), axis=3)
            # [b,s,30,300] matmul [b,s,300,1] --> [b,s,30,1] --> [b,s,30]

        stack_values = []
        if self.args.stage2_nn_components.find("pem") != -1:
            # TODO rename to pem_scores
            self.gpem_scores = self.custom_pem(self.args.gpem_without_log, self.args.gpem_buckets_boundaries)
            stack_values.append(self.gpem_scores)
        if self.args.stage2_nn_components.find("local") != -1:
            stack_values.append(self.final_scores_before_global)
        stack_values.append(self.global_voting_scores)
        scalar_predictors = tf.stack(stack_values, 3)
        #print("scalar_predictors = ", scalar_predictors)  # [b, s, 30, 2]
        with tf.variable_scope("psi_and_global_ffnn"):
            if self.args.global_score_ffnn[0] == 0:
                self.final_scores = util.projection(scalar_predictors, 1, model=self)
            else:
                hidden_layers, hidden_size = self.args.global_score_ffnn[0], self.args.global_score_ffnn[1]
                self.final_scores = util.ffnn(scalar_predictors, hidden_layers, hidden_size, 1,
                                              self.dropout if self.args.ffnn_dropout else None, model=self)
            # [batch, num_mentions, 30, 1] squeeze to [batch, num_mentions, 30]
            self.final_scores = tf.squeeze(self.final_scores, axis=3)
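# --- illustration (not part of the model) ---
# A numpy sketch of the "all voters minus own span" trick used above, with toy shapes:
# summing every masked entity vector once and subtracting each span's own contribution
# gives, per span, the sum over all *other* spans' voters without recomputing the sum
# for every span separately.
import numpy as np

batch, spans, cands, dim = 1, 3, 2, 4
masked_entity_emb = np.random.rand(batch, spans, cands, dim)  # already multiplied by gmask
all_voters = masked_entity_emb.reshape(batch, -1, dim).sum(axis=1, keepdims=True)  # [b, 1, dim]
span_voters = masked_entity_emb.sum(axis=2)                                        # [b, spans, dim]
valid_voters = all_voters - span_voters                                            # [b, spans, dim]
# check against the naive computation for span 0 (sum over spans 1..end)
naive = masked_entity_emb[:, 1:].reshape(batch, -1, dim).sum(axis=1)
print(np.allclose(valid_voters[:, 0], naive))  # True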
def add_local_attention_op(self):
    # shape=(b, num_of_spans, 30, 300)
    attention_entity_emb = self.pure_entity_embeddings \
        if self.args.attention_ent_vecs_no_regularization else self.entity_embeddings
    with tf.variable_scope("attention"):
        K = self.args.attention_K
        left_mask = self._sequence_mask_v13(self.begin_span, K)
        #left_mask = tf.sequence_mask(self.begin_span, K, dtype=tf.float32)
        right_mask = self._sequence_mask_v13(tf.expand_dims(self.words_len, 1) - self.end_span, K)
        #right_mask = tf.sequence_mask(tf.expand_dims(self.words_len, 1) - self.end_span,  # number of words on the right
        #                              K, dtype=tf.float32)  # but we take at most K
        ctxt_mask = tf.concat([left_mask, right_mask], 2)  # [batch, num_of_spans, 2*K]
        ctxt_mask = tf.log(tf.minimum(1.0, tf.maximum(self.args.zero, ctxt_mask)))
        # T, T, T, F, F | T, T, F, F, F

        # -1, -2, -3, -4, -5    +0, +1, +2, +3, +4
        leftctxt_indices = tf.maximum(0, tf.range(-1, -K - 1, -1) +
                                      tf.expand_dims(self.begin_span, 2))  # [batch, num_mentions, K]
        rightctxt_indices = tf.minimum(tf.shape(self.pure_word_embeddings)[1] - 1,
                                       tf.range(K) + tf.expand_dims(self.end_span, 2))  # [batch, num_mentions, K]
        ctxt_indices = tf.concat([leftctxt_indices, rightctxt_indices], 2)  # [batch, num_mentions, 2*K]
        batch_index = tf.tile(
            tf.expand_dims(tf.expand_dims(tf.range(tf.shape(ctxt_indices)[0]), 1), 2),
            [1, tf.shape(ctxt_indices)[1], tf.shape(ctxt_indices)[2]])
        ctxt_indices = tf.stack([batch_index, ctxt_indices], 3)
        # [batch, num_of_spans, 2*K, 2] the last dimension is row,col for gather_nd
        # [batch, num_of_spans, 2*K, [row,col]]

        att_x_w = self.pure_word_embeddings  # [batch, max_sent_len, 300]
        if self.args.attention_on_lstm and self.args.nn_components.find("lstm") != -1:
            # [batch, max_sent_len, 600] hidden_size_of_lstm*2 so project it to 300
            # TODO maybe omit the projection if already in 300 dimensions? but the projection allows a transformation...
            att_x_w = util.projection(self.context_emb, 300, model=self)
            # if tf.shape(self.context_emb)[-1] != 300 else self.context_emb

        ctxt_word_emb = tf.gather_nd(att_x_w, ctxt_indices)
        # [batch, num_of_spans, 2K, emb_size]  emb_size = 300, only the pure word emb is used
        # and not the version after char emb and dropout are added

        x_c_voters = attention_entity_emb
        # restrict the number of entities that participate in forming the x_c context vector
        if self.args.attention_retricted_num_of_entities:
            x_c_voters = tf.slice(attention_entity_emb, [0, 0, 0, 0],
                                  [-1, -1, self.args.attention_retricted_num_of_entities, -1])
        if self.args.attention_use_AB:
            att_A = tf.get_variable("att_A", [300])
            x_c_voters = att_A * x_c_voters
        # [b, num_of_spans, 2*K, 300] mul [b, num_of_spans, 30, 300]
        # (instead of 30 it can be the restricted number of entities)
        scores = tf.matmul(ctxt_word_emb, x_c_voters, transpose_b=True)  # [b, spans, 2K, 30]
        scores = tf.reduce_max(scores, reduction_indices=[-1])
        # max score of each word for each span acquired from any cand entity
        scores = scores + ctxt_mask  # words outside the window are not valid,
        # so we assign them a very low score
        top_values, _ = tf.nn.top_k(scores, self.args.attention_R)  # [batch, num_of_spans, R]
        #R_value = tf.reduce_min(top_values, axis=-1)
        R_value = top_values[:, :, -1]  # [batch, num_of_spans]  same as the command above but probably faster
        R_value = tf.maximum(self.args.zero, R_value)
        # so as to avoid keeping words whose max score with any of the entities is <= 0 (score = 0 can
        # also come from padding candidate entities)
        threshold = tf.tile(tf.expand_dims(R_value, 2), [1, 1, 2 * K])  # [batch, num_of_spans, 2K]
        scores = scores - tf.to_float(((scores - threshold) < 0)) * 50
        # subtract 50 where score < thr, 0 where score >= thr
        scores = tf.nn.softmax(scores, dim=2)  # [batch, num_of_spans, 2K]
        scores = tf.expand_dims(scores, 3)  # [batch, num_of_spans, 2K, 1]
        # [batch, num_of_spans, 2K, 1] * [batch, num_of_spans, 2K, emb_size]
        # = [batch, num_of_spans, 2K, emb_size]
        x_c = tf.reduce_sum(scores * ctxt_word_emb, 2)  # [batch, num_of_spans, emb_size]
        if self.args.attention_use_AB:
            att_B = tf.get_variable("att_B", [300])
            x_c = att_B * x_c
        x_c = tf.expand_dims(x_c, 3)  # [batch, num_of_spans, emb_size, 1]
        # [batch, num_of_spans, 30, emb_size=300] mul with [batch, num_of_spans, emb_size, 1]
        x_e__x_c = tf.matmul(attention_entity_emb, x_c)  # [batch, num_of_spans, 30, 1]
        x_e__x_c = tf.squeeze(x_e__x_c, axis=3)  # [batch, num_of_spans, 30]
        self.attention_scores = x_e__x_c
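# --- illustration (not part of the model) ---
# A numpy sketch of the hard top-R pruning used above, with toy numbers: the R-th
# largest word score becomes a threshold, everything below it is pushed down by 50,
# and the softmax then concentrates almost all mass on the top-R context words.
import numpy as np

R = 2
scores = np.array([3.0, 0.5, 2.0, 1.0])  # per-word max entity score, one span, 2K = 4
R_value = np.sort(scores)[-R]            # analogue of tf.nn.top_k(...)[0][:, :, -1]
R_value = max(1e-6, R_value)             # args.zero analogue: drop non-positive voters
pruned = scores - ((scores - R_value) < 0) * 50.0
probs = np.exp(pruned) / np.exp(pruned).sum()
print(probs.round(4))  # only the words scoring >= 2.0 keep non-negligible weight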
def add_span_emb_op(self):
    mention_emb_list = []
    # span embedding based on boundaries (start, end) and head mechanism, computed either on top of the
    # contextual bilstm output or on top of the original word+char embeddings; this flag determines which.
    # Of course by default the head is on top of the word+char emb instead of the bilstm output.
    boundaries_input_vecs = self.word_embeddings if self.args.span_boundaries_from_wordemb else self.context_emb

    if self.args.span_emb.find("boundaries") != -1:
        mention_start_emb = tf.gather_nd(boundaries_input_vecs, tf.stack(
            [tf.tile(tf.expand_dims(tf.range(tf.shape(self.begin_span)[0]), 1),
                     [1, tf.shape(self.begin_span)[1]]), self.begin_span], 2))
        #mention_start_emb = tf.gather(text_outputs, mention_starts)  # [num_mentions, emb]
        mention_emb_list.append(mention_start_emb)

        mention_end_emb = tf.gather_nd(boundaries_input_vecs, tf.stack(
            [tf.tile(tf.expand_dims(tf.range(tf.shape(self.begin_span)[0]), 1),
                     [1, tf.shape(self.begin_span)[1]]), tf.nn.relu(self.end_span - 1)], 2))
        # -1 because the end of the span is exclusive [start, end);
        # relu so that 0 doesn't become -1
        #mention_end_emb = tf.gather(text_outputs, mention_ends)  # [num_mentions, emb]
        mention_emb_list.append(mention_end_emb)
        #print("mention_start_emb = ", mention_start_emb)
        #print("mention_end_emb = ", mention_end_emb)

    mention_width = self.end_span - self.begin_span  # [batch, num_mentions]
    # TODO remove the commented-out code below
    """
    if self.args.use_features:
        mention_width_index = mention_width - 1  # [num_mentions]
        mention_width_emb = tf.gather(tf.get_variable("mention_width_embeddings",
                                                      [self.args["max_mention_width"],
                                                       self.args["feature_size"]]),
                                      mention_width_index)  # [batch, num_mentions, emb]
        mention_width_emb = tf.nn.dropout(mention_width_emb, self.dropout)
        #print("mention_width_emb = ", mention_width_emb)
        mention_emb_list.append(mention_width_emb)
    """

    if self.args.span_emb.find("head") != -1:  # here the attention is computed
        self.max_mention_width = tf.minimum(self.args.max_mention_width,
                                            tf.reduce_max(self.end_span - self.begin_span))
        mention_indices = tf.range(self.max_mention_width) + \
                          tf.expand_dims(self.begin_span, 2)  # [batch, num_mentions, max_mention_width]
        mention_indices = tf.minimum(tf.shape(self.word_embeddings)[1] - 1,
                                     mention_indices)  # [batch, num_mentions, max_mention_width]
        #print("mention_indices = ", mention_indices)
        batch_index = tf.tile(
            tf.expand_dims(tf.expand_dims(tf.range(tf.shape(mention_indices)[0]), 1), 2),
            [1, tf.shape(mention_indices)[1], tf.shape(mention_indices)[2]])
        mention_indices = tf.stack([batch_index, mention_indices], 3)
        # [batch, num_mentions, max_mention_width, [row,col]] 4d tensor

        # this means the head will either use the same input as the boundaries, or boundaries from the
        # bilstm and head from the word emb
        head_input_vecs = boundaries_input_vecs if self.args.model_heads_from_bilstm else self.word_embeddings
        mention_text_emb = tf.gather_nd(head_input_vecs, mention_indices)
        # [batch, num_mentions, max_mention_width, 500] 4d tensor
        #print("mention_text_emb = ", mention_text_emb)

        with tf.variable_scope("head_scores"):
            # from [batch, max_sent_len, 600] to [batch, max_sent_len, 1]
            self.head_scores = util.projection(boundaries_input_vecs, 1, model=self)
        # [batch, num_mentions, max_mention_width, 1]
        mention_head_scores = tf.gather_nd(self.head_scores, mention_indices)
        #print("mention_head_scores = ", mention_head_scores)

        if not tf.__version__.startswith("1.4"):
            temp_shape = tf.shape(mention_width)
            temp = tf.sequence_mask(tf.reshape(mention_width, [-1]), self.max_mention_width,
                                    dtype=tf.float32)
            temp_mask = tf.reshape(temp, [temp_shape[0], temp_shape[1], tf.shape(temp)[-1]])
        else:
            temp_mask = tf.sequence_mask(mention_width, self.max_mention_width, dtype=tf.float32)
        mention_mask = tf.expand_dims(temp_mask, 3)  # [batch, num_mentions, max_mention_width, 1]
        mention_mask = tf.minimum(1.0, tf.maximum(self.args.zero, mention_mask))  # 1e-3
        mention_attention = tf.nn.softmax(mention_head_scores + tf.log(mention_mask),
                                          dim=2)  # [batch, num_mentions, max_mention_width, 1]
        mention_head_emb = tf.reduce_sum(mention_attention * mention_text_emb, 2)  # [batch, num_mentions, emb]
        #print("mention_head_emb = ", mention_head_emb)
        mention_emb_list.append(mention_head_emb)

    self.span_emb = tf.concat(mention_emb_list, 2)  # [batch, num_mentions, emb i.e. 1700]
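# --- illustration (not part of the model) ---
# A numpy sketch of the version-compatibility branch above: when tf.sequence_mask only
# accepts 1-D lengths, flattening the [batch, num_mentions] width tensor, masking, and
# reshaping back is equivalent to masking the 2-D tensor directly.
import numpy as np

mention_width = np.array([[2, 1], [3, 2]])  # [batch, num_mentions]
max_width = 3
flat = mention_width.reshape(-1)                       # [batch * num_mentions]
temp = np.arange(max_width)[None, :] < flat[:, None]   # 1-D sequence_mask analogue
temp_mask = temp.reshape(mention_width.shape + (max_width,)).astype(np.float32)
print(temp_mask[0, 0])  # [1. 1. 0.]  first mention of first sentence has width 2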