import math

import numpy as np
import tensorflow as tf
import torch

# NOTE: the helper ops used below (`linear`, `exp_mask`, `normal_mask`,
# `softsel`, `normal_softsel`, `get_logits`, `dropout`, `mask_for_high_rank`,
# `add_reg_without_bias`, `zero_mask`) are defined elsewhere in this codebase.


def double_linear_logits(args, size, bias, bias_start=0.0, scope=None, mask=None,
                         wd=0.0, input_keep_prob=1.0, is_train=None):
    with tf.variable_scope(scope or "Double_Linear_Logits"):
        # Two-layer scoring: a tanh projection to `size` units, then a linear
        # projection down to a single (squeezed) logit per position.
        first = tf.tanh(linear(args, size, bias, bias_start=bias_start, scope='first',
                               wd=wd, input_keep_prob=input_keep_prob, is_train=is_train))
        second = linear(first, 1, bias, bias_start=bias_start, squeeze=True, scope='second',
                        wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
        if mask is not None:
            second = exp_mask(second, mask)
        return second
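# Nearly every function here applies `exp_mask` before a softmax. As a rough
# reference only, a minimal sketch of what such an additive mask presumably
# does (the name `_exp_mask_sketch` and the constant are illustrative, not the
# actual implementation used by this codebase):
_VERY_NEGATIVE_NUMBER = -1e30


def _exp_mask_sketch(val, mask):
    # Push masked-out positions toward -inf so a subsequent softmax assigns
    # them (numerically) zero probability; `mask` is boolean, True == keep.
    return val + (1.0 - tf.cast(mask, tf.float32)) * _VERY_NEGATIVE_NUMBER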
def reverse_softsel(target, logits, mask=None, scope=None):
    with tf.name_scope(scope or "reverse_softsel"):
        logits_rank = len(logits.get_shape().as_list())
        if mask is not None:
            logits = exp_mask(logits, mask)
        # Unlike standard softsel, normalize over the second-to-last axis of
        # the logits, then pool the target over its second-to-last axis.
        a = tf.nn.softmax(logits, logits_rank - 2)
        target_rank = len(target.get_shape().as_list())
        out = tf.reduce_sum(tf.expand_dims(a, -1) * target, target_rank - 2)
        return out
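# A minimal numeric sketch of the axis convention above: standard softsel
# normalizes logits over the last axis, while reverse_softsel normalizes over
# the second-to-last axis. All names below are local and illustrative.
def _reverse_softsel_axes_demo():
    logits = np.array([[[1.0, 2.0],
                        [3.0, 4.0]]])                  # bs=1, sl=2, ql=2
    exp = np.exp(logits)
    a_last = exp / exp.sum(axis=-1, keepdims=True)     # softmax over ql (standard)
    a_rev = exp / exp.sum(axis=-2, keepdims=True)      # softmax over sl (reverse)
    assert np.allclose(a_last.sum(-1), 1.0)
    assert np.allclose(a_rev.sum(-2), 1.0)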
def sum_logits(args, mask=None, name=None):
    with tf.name_scope(name or "sum_logits"):
        if args is None or (isinstance(args, (tuple, list)) and not args):
            raise ValueError("`args` must be specified")
        if not isinstance(args, (tuple, list)):
            args = [args]
        rank = len(args[0].get_shape())
        logits = sum(tf.reduce_sum(arg, rank - 1) for arg in args)
        if mask is not None:
            logits = exp_mask(logits, mask)
        return logits
def forward(self, hidden_states, rep_mask, attn_mask, **kwargs):
    bs, sl, hn = hidden_states.size()
    # Self-attention scores scaled by sqrt of the sequence length: bs,sl,sl
    attn_scores = torch.bmm(
        hidden_states, torch.transpose(hidden_states, 1, 2)) / (sl ** 0.5)
    graph_mask = attn_mask
    # NOTE: in this torch variant, exp_mask/zero_mask take (mask, value).
    attn_prob = self._attn_softmax(exp_mask(graph_mask, attn_scores))  # bs,sl,sl
    attn_res = torch.bmm(attn_prob, hidden_states)  # [bs,sl,sl]x[bs,sl,hn] ==> [bs,sl,hn]
    final_res = zero_mask(rep_mask, attn_res, high_rank=True)
    return final_res
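# A self-contained sketch of the same masking pattern with plain torch
# primitives, assuming exp_mask adds a large negative number at masked
# positions and zero_mask zeroes out padded rows (all names below are local):
import torch.nn.functional as F


def _self_attention_sketch(hidden_states, rep_mask, attn_mask):
    bs, sl, hn = hidden_states.size()
    scores = torch.bmm(hidden_states, hidden_states.transpose(1, 2)) / (sl ** 0.5)
    scores = scores.masked_fill(~attn_mask, -1e30)   # additive/exp-style mask
    prob = F.softmax(scores, dim=-1)                 # bs,sl,sl
    res = torch.bmm(prob, hidden_states)             # bs,sl,hn
    return res * rep_mask.unsqueeze(-1).float()      # zero out padded rows

# e.g.: h = torch.randn(2, 5, 8); m = torch.ones(2, 5, dtype=torch.bool)
#       out = _self_attention_sketch(h, m, m.unsqueeze(1) & m.unsqueeze(2))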
def pooling_with_mask(rep_tensor, rep_mask, dim=-1, pooling_method='max', scope=None):
    # rep_tensor and rep_mask must have the same shape
    with tf.name_scope(scope or '%s_pooling_with_mask' % pooling_method):
        if pooling_method == 'max':
            pooling_out = tf.reduce_max(exp_mask(rep_tensor, rep_mask), dim)  # bs,sl,ql -> bs,xl
        elif pooling_method == 'mean':
            sum_out = tf.reduce_sum(normal_mask(rep_tensor, rep_mask), dim)  # bs,sl,ql -> bs,xl
            # Count valid (unmasked) positions; clamp to 1 to avoid dividing by zero.
            num = tf.reduce_sum(tf.cast(rep_mask, tf.int32), dim)  # bs,xl
            num = tf.where(tf.equal(num, tf.zeros_like(num, tf.int32)),
                           tf.ones_like(num, tf.int32), num)
            pooling_out = sum_out / tf.cast(num, tf.float32)  # bs,xl
        else:
            raise AttributeError('No pooling method \'%s\'' % pooling_method)
        return pooling_out
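# `normal_mask` above is assumed to be the multiplicative counterpart of
# `exp_mask`: it zeroes masked positions instead of pushing them toward -inf,
# which is what a masked sum/mean needs. A minimal sketch (illustrative name):
def _normal_mask_sketch(val, mask):
    # Keep values where mask is True; set the rest to exactly zero.
    return val * tf.cast(mask, tf.float32)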
def linear_logits(args, bias, bias_start=0.0, scope=None, mask=None, wd=0.0,
                  input_keep_prob=1.0, is_train=None):
    with tf.variable_scope(scope or "Linear_Logits"):
        logits = linear(args, 1, bias, bias_start=bias_start, squeeze=True, scope='first',
                        wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
        if mask is not None:
            logits = exp_mask(logits, mask)
        return logits
def softmax(logits, mask=None, scope=None):
    with tf.name_scope(scope or "Softmax"):
        if mask is not None:
            logits = exp_mask(logits, mask)
        out = tf.nn.softmax(logits, -1)
        return out
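# An illustrative TF1-style check of the masked softmax above (assumes the
# module's `exp_mask` helper is available): the masked position should get
# (numerically) zero probability, with the rest renormalized. Values and the
# function name are hypothetical.
def _masked_softmax_demo():
    logits_demo = tf.constant([[1.0, 2.0, 3.0]])
    mask_demo = tf.constant([[True, True, False]])
    probs = softmax(logits_demo, mask_demo)
    with tf.Session() as sess:
        return sess.run(probs)  # approx [[0.27, 0.73, 0.0]]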
def normal_attention(tensor_base, tensor_to_attend,
                     mask_for_tensor_base, mask_for_tensor_to_attend,
                     similarity_method='inner', hn=100,
                     use_pooling=False, pooling_method='max',
                     reverse=False, scope=None):
    """
    normal_attention for attention strategy 2
    :param tensor_base: rank 3 [bs,sl,vec]
    :param tensor_to_attend: rank 3 [bs,ql,vec]
    :param mask_for_tensor_base: [bs,sl]
    :param mask_for_tensor_to_attend: [bs,ql]
    :param similarity_method: 'inner', 'tri_linear' or 'map_linear'
    :param hn: hidden size, required by some similarity methods
    :param use_pooling: True or False
    :param pooling_method: 'max' or 'mean'
    :param reverse: set True to use attention strategy 3
    :param scope:
    :return: [bs,vec] if use_pooling else [bs,sl,vec]
    """
    with tf.variable_scope(scope or 'normal_attention'):
        # -------- parameters --------
        t_main = tensor_base                    # [bs,sl,vec]
        t_sec = tensor_to_attend                # [bs,ql,vec]
        mask_main = mask_for_tensor_base        # [bs,sl]
        mask_sec = mask_for_tensor_to_attend    # [bs,ql]
        bs, sl, vec = tf.shape(t_main)[0], tf.shape(t_main)[1], tf.shape(t_main)[2]
        ql = tf.shape(t_sec)[1]
        # -------- similarity_mat --------
        mask_main_etd = tf.expand_dims(mask_main, 2)  # bs,sl,1
        mask_sec_etd = tf.expand_dims(mask_sec, 1)    # bs,1,ql
        mask_similarity_mat = tf.logical_and(mask_main_etd, mask_sec_etd)  # bs,sl,ql
        if similarity_method == 'inner':
            t_main_etd = tf.expand_dims(t_main, 2)  # bs,sl,1,vec
            t_sec_etd = tf.expand_dims(t_sec, 1)    # bs,1,ql,vec
            similarity_mat = tf.reduce_sum(t_main_etd * t_sec_etd, -1)  # bs,sl,ql
        elif similarity_method == 'tri_linear':
            t_main_tiled = tf.tile(tf.expand_dims(t_main, 2), [1, 1, ql, 1])  # bs,sl,ql,vec
            t_sec_tiled = tf.tile(tf.expand_dims(t_sec, 1), [1, sl, 1, 1])    # bs,sl,ql,vec
            similarity_mat = get_logits([t_main_tiled, t_sec_tiled], None, False,
                                        scope='tri_linear_tri_linear', func='tri_linear')
        elif similarity_method == 'map_linear':
            t_main_map = tf.nn.relu(linear([t_main], hn, True, scope='linear_map_main'))
            t_sec_map = tf.nn.relu(linear([t_sec], hn, True, scope='linear_map_sec'))
            t_main_map_etd = tf.expand_dims(t_main_map, 2)  # bs,sl,1,hn
            t_sec_map_etd = tf.expand_dims(t_sec_map, 1)    # bs,1,ql,hn
            similarity_mat = tf.reduce_sum(t_main_map_etd * t_sec_map_etd, -1)  # bs,sl,ql
        else:
            raise AttributeError('No similarity matrix calculation method \'%s\'' % similarity_method)

        if use_pooling:
            # pool the similarity matrix along the sl axis (-2)
            if pooling_method == 'max':
                pooling_out = tf.reduce_max(
                    exp_mask(similarity_mat, mask_similarity_mat), -2)  # bs,sl,ql -> bs,ql
            elif pooling_method == 'mean':
                sum_out = tf.reduce_sum(
                    normal_mask(similarity_mat, mask_similarity_mat), -2)  # bs,sl,ql -> bs,ql
                num = tf.reduce_sum(tf.cast(mask_similarity_mat, tf.int32), -2)  # bs,ql
                num = tf.where(tf.equal(num, tf.zeros_like(num, tf.int32)),
                               tf.ones_like(num, tf.int32), num)
                pooling_out = sum_out / tf.cast(num, tf.float32)  # bs,ql
            else:
                raise AttributeError('No pooling method \'%s\'' % pooling_method)
            return softsel(t_sec, pooling_out, mask_sec)  # [bs,ql,vec] weighted by [bs,ql] -> bs,vec
        else:
            t_sec_tiled = tf.tile(tf.expand_dims(t_sec, 1), [1, sl, 1, 1])  # bs,sl,ql,vec
            # target: t_sec_tiled [bs,sl,ql,vec]; logits: [bs,sl,ql]
            if not reverse:
                out = normal_softsel(t_sec_tiled, similarity_mat, mask_similarity_mat)
            else:
                out = reverse_softsel(t_sec_tiled, similarity_mat, mask_similarity_mat)
            return out  # bs,sl,vec
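# The 'inner' similarity above (broadcast elementwise product + reduce_sum) is
# the same computation as a batched matmul; a self-contained sketch of that
# identity (local, illustrative name):
def _inner_similarity_sketch(t_main, t_sec):
    # t_main: [bs,sl,vec], t_sec: [bs,ql,vec] -> [bs,sl,ql]
    return tf.matmul(t_main, t_sec, transpose_b=True)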
def multi_head_attention(rep_tensor, rep_mask, head_num=8, hidden_units_num=64,
                         scope=None, is_train=None, keep_prob=1., wd=0.):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    with tf.variable_scope(scope or 'multi_head_attention'):
        with tf.variable_scope('positional_encoding'):
            seq_idxs = tf.tile(tf.expand_dims(tf.range(sl), 1), [1, ivec])      # sl, ivec
            feature_idxs = tf.tile(tf.expand_dims(tf.range(ivec), 0), [sl, 1])  # sl, ivec
            # Sinusoidal positional encoding: sine on even feature indices,
            # cosine on odd ones.
            pos_enc = tf.where(
                tf.equal(tf.mod(feature_idxs, 2), 0),
                tf.sin(tf.cast(seq_idxs, tf.float32) /
                       tf.pow(10000., 2.0 * tf.cast(feature_idxs, tf.float32) / (1.0 * ivec))),
                tf.cos(tf.cast(seq_idxs, tf.float32) /
                       tf.pow(10000., 2.0 * tf.cast(feature_idxs - 1, tf.float32) / (1.0 * ivec))),
            )
            rep_tensor_pos = mask_for_high_rank(rep_tensor + pos_enc, rep_mask)  # bs,sl,ivec

        with tf.variable_scope('multi_head_attention'):
            W = tf.get_variable('W', [3, head_num, ivec, hidden_units_num], tf.float32)
            rep_tile = tf.tile(
                tf.expand_dims(tf.expand_dims(rep_tensor_pos, 0), 0),
                [3, head_num, 1, 1, 1])  # 3,head_num,bs,sl,ivec
            rep_tile_reshape = tf.reshape(
                rep_tile, [3, head_num, bs * sl, ivec])  # 3,head_num,bs*sl,ivec

            maps = tf.reshape(  # 3,head_num,bs*sl,hn -> 3,head_num,bs,sl,hn
                tf.matmul(dropout(rep_tile_reshape, keep_prob, is_train), W),
                [3, head_num, bs, sl, hidden_units_num])
            Q_map, K_map, V_map = tf.split(maps, 3, 0)
            Q_map = tf.squeeze(Q_map, [0])  # head_num,bs,sl,hn
            K_map = tf.squeeze(K_map, [0])  # head_num,bs,sl,hn
            V_map = tf.squeeze(V_map, [0])  # head_num,bs,sl,hn

            # Scaled dot-product attention scores: head_num,bs,sl,sl
            similarity_mat = tf.matmul(
                Q_map, tf.transpose(K_map, [0, 1, 3, 2])) / math.sqrt(1. * hidden_units_num)

            # mask: bs,sl -> head_num,bs,sl,sl
            multi_mask = tf.tile(tf.expand_dims(rep_mask, 0), [head_num, 1, 1])  # head_num,bs,sl
            multi_mask_tile_1 = tf.expand_dims(multi_mask, 2)  # head_num,bs,1,sl
            multi_mask_tile_2 = tf.expand_dims(multi_mask, 3)  # head_num,bs,sl,1
            multi_mask_tile = tf.logical_and(
                multi_mask_tile_1, multi_mask_tile_2)  # head_num,bs,sl,sl

            similarity_mat_masked = exp_mask(similarity_mat, multi_mask_tile)  # head_num,bs,sl,sl
            prob_dist = tf.nn.softmax(similarity_mat_masked)  # head_num,bs,sl,sl
            prob_dist_dp = dropout(prob_dist, keep_prob, is_train)

            attn_res = tf.matmul(prob_dist_dp, V_map)  # head_num,bs,sl,hn
            attn_res_tran = tf.transpose(attn_res, [1, 2, 0, 3])  # bs,sl,head_num,hn
            output = tf.reshape(attn_res_tran, [bs, sl, head_num * hidden_units_num])

            if wd > 0.:
                add_reg_without_bias()
            return output
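# A self-contained numpy sketch of the sinusoidal positional encoding built
# above (even feature indices get sin, odd ones cos), handy for inspecting
# values outside a TF graph; the name is local and illustrative:
def _pos_enc_sketch(sl, ivec):
    pos = np.arange(sl, dtype=np.float64)[:, None]             # sl,1
    feat = np.arange(ivec)[None, :]                            # 1,ivec
    sin_part = np.sin(pos / np.power(10000., 2.0 * feat / ivec))
    cos_part = np.cos(pos / np.power(10000., 2.0 * (feat - 1) / ivec))
    return np.where(feat % 2 == 0, sin_part, cos_part)         # sl,ivec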