def traditional_attention(rep_tensor, rep_mask, scope=None,
                          keep_prob=1., is_train=None, wd=0., activation='elu',
                          tensor_dict=None, name=None):
    """
    Score every step of a sequence and pool the steps with ``softsel``.

    :param rep_tensor: input tensor whose axis 2 has a static size; the shape
        notes below assume [bs, sl, vec]
    :param rep_mask: mask handed to ``get_logits`` and ``softsel``
        (presumably a [bs, sl] boolean mask -- confirm against callers)
    :param scope: variable scope name, defaults to 'traditional_attention'
    :param keep_prob: forwarded to ``bn_dense_layer`` and, as
        ``input_keep_prob``, to ``get_logits``
    :param is_train: forwarded to ``bn_dense_layer`` and ``get_logits``
    :param wd: forwarded to ``bn_dense_layer``
    :param activation: activation name forwarded to ``bn_dense_layer``
    :param tensor_dict: optional dict; when given together with ``name`` the
        softmax of the logits is stored in it
    :param name: key used when writing into ``tensor_dict``
    :return: the tensor produced by ``softsel`` (annotated ``bs,vec``)
    """
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'traditional_attention'):
        rep_tensor_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                        False, wd, keep_prob, is_train)

        rep_tensor_logits = get_logits([rep_tensor_map], None, False, scope='self_attn_logits',
                                       mask=rep_mask, input_keep_prob=keep_prob,
                                       is_train=is_train)  # bs,sl
        attn_res = softsel(rep_tensor, rep_tensor_logits, rep_mask)  # bs,vec

        # Expose the attention distribution to the caller when a sink is supplied.
        if tensor_dict is not None and name is not None:
            tensor_dict[name] = tf.nn.softmax(rep_tensor_logits)

        return attn_res
def visit_multi_dimensional_attention(rep_tensor, keep_prob=1., is_train=None, wd=0.,
                                      activation='relu'):
    """
    Collapse axis 1 of ``rep_tensor`` with a per-feature softmax weighting.

    Two stacked ``bn_dense_layer`` calls produce one score per feature and
    position; the scores are normalised with a softmax over axis 1 and used to
    take a weighted sum of ``rep_tensor`` over that same axis.

    :param rep_tensor: input tensor whose axis 2 has a static size
        (annotated ``bs,sl,vec`` in this file)
    :param keep_prob: forwarded to both ``bn_dense_layer`` calls
    :param is_train: forwarded to both ``bn_dense_layer`` calls
    :param wd: forwarded to both ``bn_dense_layer`` calls
    :param activation: activation name of the first dense layer; the second
        one is always 'linear'
    :return: ``rep_tensor`` summed over axis 1 with the softmax weights
        (annotated ``bs, vec``)
    """
    feature_dim = rep_tensor.get_shape()[2]
    with tf.variable_scope('multi_dimensional_attention'):
        hidden = bn_dense_layer(rep_tensor, feature_dim, True, 0., 'bn_dense_map1', activation,
                                False, wd, keep_prob, is_train)
        scores = bn_dense_layer(hidden, feature_dim, True, 0., 'bn_dense_map2', 'linear',
                                False, wd, keep_prob, is_train)
        weights = tf.nn.softmax(scores, 1)  # bs,sl,vec
        weighted = weights * rep_tensor
        return tf.reduce_sum(weighted, 1)  # bs, vec
def bi_sru_recurrent_network(
        rep_tensor, rep_mask, is_train=None, keep_prob=1., wd=0., scope=None, hn=None, reuse=None):
    """
    Run a forward and a backward SRU pass over a sequence and concatenate both outputs.

    :param rep_tensor: [Tensor/tf.float32] rank is 3 with shape [batch_size/bs, max_sent_len/sl, vec]
    :param rep_mask: [Tensor/tf.bool] rank is 2 with shape [bs,sl]; its per-row sum is
        used as the sequence length of both recurrent passes
    :param is_train: [Scalar Tensor/tf.bool] scalar tensor to indicate whether the mode is training or not
    :param keep_prob: [float] dropout keep probability in the range of (0,1)
    :param wd: [float] for L2 regularization, if !=0, add tensors to tf collection "reg_vars"
    :param scope: [str] variable scope name, defaults to 'bi_sru_recurrent_network'
    :param hn: [int] number of units per direction; when falsy (None or 0) the static
        size of the last axis of rep_tensor is used instead
    :param reuse: passed through unchanged as the third argument of ``SRUCell``
    :return: [Tensor/tf.float32] forward and backward outputs concatenated on the last
        axis (annotated [bs, sl, 2vec])
    """
    input_vec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or input_vec
    with tf.variable_scope(scope or 'bi_sru_recurrent_network'):
        # Both directions use the same lengths, so derive them from the mask once.
        seq_len = tf.reduce_sum(tf.cast(rep_mask, tf.int32), -1)  # bs

        with tf.variable_scope('forward'):
            U_d_fw = bn_dense_layer([rep_tensor], 3 * ivec, False, 0., 'get_frc_fw', 'linear',
                                    False, wd, keep_prob, is_train)  # bs, sl, 3 * ivec
            # The cell input is the raw representation followed by its projection.
            U_fw = tf.concat([rep_tensor, U_d_fw], -1)
            fw_SRUCell = SwitchableDropoutWrapper(
                SRUCell(ivec, tf.nn.tanh, reuse), is_train, keep_prob)
            fw_output, _ = dynamic_rnn(
                fw_SRUCell, U_fw, seq_len,
                dtype=tf.float32, scope='forward_sru')  # bs, sl, vec

        with tf.variable_scope('backward'):
            U_d_bw = bn_dense_layer([rep_tensor], 3 * ivec, False, 0., 'get_frc_bw', 'linear',
                                    False, wd, keep_prob, is_train)  # bs, sl, 3 * ivec
            U_bw = tf.concat([rep_tensor, U_d_bw], -1)
            bw_SRUCell = SwitchableDropoutWrapper(
                SRUCell(ivec, tf.nn.tanh, reuse), is_train, keep_prob)
            bw_output, _ = bw_dynamic_rnn(
                bw_SRUCell, U_bw, seq_len,
                dtype=tf.float32, scope='backward_sru')  # bs, sl, vec

        all_output = tf.concat([fw_output, bw_output], -1)  # bs, sl, 2vec
        return all_output
def first_level_sa(rep_tensor, rep_mask, keep_prob=1., is_train=None, wd=0., activation='relu'):
    """
    Collapse axis 2 of a rank-4 input with a masked, per-feature softmax weighting.

    Two stacked ``bn_dense_layer`` calls produce the scores, which are passed
    through ``exp_mask_for_high_rank`` together with ``rep_mask`` before the
    softmax over axis 2. The weights are then used to sum ``rep_tensor`` over
    that axis.

    :param rep_tensor: input tensor whose axis 3 has a static size
        (annotated ``bs,sk,code_len,vec`` in this file)
    :param rep_mask: mask handed to ``exp_mask_for_high_rank``
        (presumably [bs, sk, code_len] booleans -- confirm against callers)
    :param keep_prob: forwarded to both ``bn_dense_layer`` calls
    :param is_train: forwarded to both ``bn_dense_layer`` calls
    :param wd: forwarded to both ``bn_dense_layer`` calls
    :param activation: activation name of the first dense layer; the second
        one is always 'linear'
    :return: ``rep_tensor`` summed over axis 2 with the softmax weights
        (annotated ``bs, sk, vec``)
    """
    feature_dim = rep_tensor.get_shape()[3]
    with tf.variable_scope('first_level_sa'):
        hidden = bn_dense_layer(rep_tensor, feature_dim, True, 0., 'bn_dense_map1', activation,
                                False, wd, keep_prob, is_train)
        scores = bn_dense_layer(hidden, feature_dim, True, 0., 'bn_dense_map2', 'linear',
                                False, wd, keep_prob, is_train)
        masked_scores = exp_mask_for_high_rank(scores, rep_mask)
        weights = tf.nn.softmax(masked_scores, 2)  # bs,sk,code_len,vec
        weighted = weights * rep_tensor
        return tf.reduce_sum(weighted, 2)  # bs, sk, vec
def normal_attention(rep_tensor, rep_mask, scope=None,
                     keep_prob=1., is_train=None, wd=0., activation='elu',
                     tensor_dict=None, name=None):
    """
    Pool a sequence with ``softsel`` and fuse the pooled vector back into every step.

    The mapped input and the pooled attention result are mixed through a
    sigmoid gate: ``gate * mapped + (1 - gate) * pooled``.

    :param rep_tensor: input tensor whose axis 2 has a static size; the shape
        notes below assume [bs, sl, vec]
    :param rep_mask: mask handed to ``get_logits``, ``softsel`` and
        ``mask_for_high_rank`` (presumably [bs, sl] booleans -- confirm against callers)
    :param scope: variable scope name, defaults to 'normal_attention'
    :param keep_prob: forwarded to ``bn_dense_layer``, ``get_logits`` and ``linear``
    :param is_train: forwarded to ``bn_dense_layer``, ``get_logits`` and ``linear``
    :param wd: forwarded to ``bn_dense_layer`` and ``linear``
    :param activation: activation name forwarded to ``bn_dense_layer``
    :param tensor_dict: optional dict; when given together with ``name`` the
        softmax of the logits is stored in it
    :param name: key used when writing into ``tensor_dict``
    :return: the gated mix after ``mask_for_high_rank`` (annotated ``bs,sl,vec``)
    """
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'normal_attention'):
        rep_tensor_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                        False, wd, keep_prob, is_train)

        rep_tensor_logits = get_logits([rep_tensor_map], None, False, scope='self_attn_logits',
                                       mask=rep_mask, input_keep_prob=keep_prob,
                                       is_train=is_train)  # bs,sl
        attn_result = softsel(rep_tensor, rep_tensor_logits, rep_mask)  # bs,vec

        # Expose the attention distribution to the caller when a sink is supplied.
        if tensor_dict is not None and name is not None:
            tensor_dict[name] = tf.nn.softmax(rep_tensor_logits)

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            gate_from_map = linear(rep_tensor_map, ivec, True, 0., 'linear_fusion_i',
                                   False, wd, keep_prob, is_train)
            gate_from_attn = linear(attn_result, ivec, True, 0., 'linear_fusion_a',
                                    False, wd, keep_prob, is_train)

            # A rank-2 pooled result (bs,vec) cannot be broadcast against the
            # per-step tensors (bs,sl,vec): trailing-axis alignment would pair
            # bs with sl. Restore a length-1 sequence axis so the pooled vector
            # is shared by every step of its own example. When the pooled
            # result is not statically rank 2 nothing is changed.
            if attn_result.get_shape().ndims == 2:
                gate_from_attn = tf.expand_dims(gate_from_attn, 1)  # bs,1,vec
                attn_result = tf.expand_dims(attn_result, 1)  # bs,1,vec

            # input gate
            fusion_gate = tf.nn.sigmoid(gate_from_map + gate_from_attn + o_bias)
            output = fusion_gate * rep_tensor_map + (1 - fusion_gate) * attn_result
            output = mask_for_high_rank(output, rep_mask)  # bs,sl,vec
        return output
def __call__(self, inputs, state, scope=None):
    """
    Compute one recurrent step of the cell.

    :param inputs: [bs, vec] input of the current step
    :param state: previous cell state, mixed with the new candidate through
        the forget gate
    :param scope: variable scope name, defaults to "SRU_cell"
    :return: tuple ``(h_t, c_t)`` with the step output and the new cell state
    """
    units = self._num_units
    with tf.variable_scope(scope or "SRU_cell"):
        forget_bias = tf.get_variable('b_f', [units], dtype=tf.float32,
                                      initializer=tf.constant_initializer(0))
        reset_bias = tf.get_variable('b_r', [units], dtype=tf.float32,
                                     initializer=tf.constant_initializer(0))

        # One projection yields the candidate and both gate pre-activations.
        projected = bn_dense_layer(inputs, 3 * units, False, 0., 'get_frc', 'linear')  # bs, 3vec
        skip_input = tf.identity(inputs, 'x_t')
        candidate, forget_pre, reset_pre = tf.split(projected, 3, 1)

        forget_gate = tf.nn.sigmoid(forget_pre + forget_bias)
        reset_gate = tf.nn.sigmoid(reset_pre + reset_bias)

        new_state = forget_gate * state + (1 - forget_gate) * candidate
        # The output blends the activated state with the untouched input.
        new_output = reset_gate * self._activation(new_state) + (1 - reset_gate) * skip_input
        return new_output, new_state
def visit_sa_with_dense(rep_tensor, keep_prob=1., is_train=None, wd=0., activation='relu',
                        hn=None, is_scale=True, is_plus_sa=True):
    """
    Per-feature self-attention over axis 1 of ``rep_tensor`` followed by a fusion gate.

    Every position attends to all other positions (the diagonal is masked
    out); the attended result is mixed with the mapped input through a
    sigmoid gate: ``gate * mapped + (1 - gate) * attended``.

    :param rep_tensor: input tensor whose axis 2 has a static size; the shape
        notes below assume [bs, sl, vec]
    :param keep_prob: forwarded to ``bn_dense_layer``, ``dropout`` and the gate's ``linear`` calls
    :param is_train: forwarded to ``bn_dense_layer``, ``dropout`` and the gate's ``linear`` calls
    :param wd: forwarded to ``bn_dense_layer`` and the gate's ``linear`` calls
    :param activation: activation name forwarded to ``bn_dense_layer``
    :param hn: output width; when falsy (None or 0) the static size of axis 2
        of ``rep_tensor`` is used
    :param is_scale: if True the logits are ``scaled_tanh(..., 5.0)``, otherwise
        a ``linear`` layer applied to the tanh of the attention features
    :param is_plus_sa: if False the attention part is skipped and the mapped
        input is returned directly
    :return: the gated mix (annotated ``bs,sl,vec``), or the mapped input when
        ``is_plus_sa`` is False
    """
    sw_len = tf.shape(rep_tensor)[1]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec
    with tf.variable_scope('temporal_attention'):
        # mask generation: diag(-1) + 1 is 0 on the diagonal and 1 elsewhere,
        # so the mask is False only where a position is paired with itself
        attn_mask = tf.cast(
            tf.diag(-tf.ones([sw_len], tf.int32)) + 1, tf.bool)  # sl,sl

        # non-linear for context
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1), [1, sw_len, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            dependent = linear(rep_map_dp, ivec, False, scope='linear_dependent')  # bs,sl,vec
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,vec
            head = linear(rep_map_dp, ivec, False, scope='linear_head')  # bs,sl,vec
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,vec

            # NOTE: the early exit deliberately stays after the variables above
            # are created, so the set of graph variables does not depend on
            # is_plus_sa (same as before).
            if not is_plus_sa:
                return rep_map

            attention_fact = dependent_etd + head_etd + f_bias  # bs,sl,sl,vec
            if is_scale:
                logits = scaled_tanh(attention_fact, 5.0)  # bs,sl,sl,vec
            else:
                logits = linear(tf.nn.tanh(attention_fact), ivec, True,
                                scope='linear_attn_fact')

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i', False, wd, keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False, wd, keep_prob,
                       is_train) +
                o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result
        return output
def directional_attention_with_dense(rep_tensor, rep_mask, direction=None, scope=None,
                                     keep_prob=1., is_train=None, wd=0., activation='elu',
                                     tensor_dict=None, name=None, hn=None):
    """
    Per-feature directional self-attention over axis 1 followed by a fusion gate.

    :param rep_tensor: input tensor whose axis 2 has a static size; the shape
        notes below assume [bs, sl, vec]
    :param rep_mask: boolean mask combined with the direction mask via
        ``tf.logical_and`` (annotated [bs, sl])
    :param direction: None masks only the diagonal; 'forward' keeps pairs with
        row index > column index; any other value keeps pairs with column
        index > row index
    :param scope: variable scope name; defaults to
        'directional_attention_%s' % direction
    :param keep_prob: forwarded to ``bn_dense_layer``, ``dropout`` and the gate's ``linear`` calls
    :param is_train: forwarded to ``bn_dense_layer``, ``dropout`` and the gate's ``linear`` calls
    :param wd: forwarded to ``bn_dense_layer`` and the gate's ``linear`` calls
    :param activation: activation name forwarded to ``bn_dense_layer``
    :param tensor_dict: optional dict; when given together with ``name`` the
        dependent/head projections, the attention scores and the gate are
        stored in it
    :param name: key prefix used when writing into ``tensor_dict``
    :param hn: output width; when falsy (None or 0) the static size of axis 2
        of ``rep_tensor`` is used
    :return: the gated mix after ``mask_for_high_rank`` (annotated ``bs,sl,vec``)
    """
    bs, sl = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec

    # NOTE(review): the previous expression
    #   scope or 'directional_attention_%s' % direction or 'diag'
    # never reached 'diag' because % binds tighter than `or`; direction=None
    # therefore produced 'directional_attention_None'. That effective name is
    # kept on purpose so variable names (and existing checkpoints) do not change.
    default_scope = 'directional_attention_%s' % direction
    with tf.variable_scope(scope or default_scope):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            # diag(-1) + 1 is 0 on the diagonal and 1 elsewhere
            direct_mask = tf.cast(tf.diag(-tf.ones([sl], tf.int32)) + 1, tf.bool)
        elif direction == 'forward':
            direct_mask = tf.greater(sl_row, sl_col)
        else:
            direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0), [bs, 1, 1])  # bs,sl,sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1), [1, sl, 1])  # bs,sl,sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs,sl,sl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1), [1, sl, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            dependent = linear(rep_map_dp, ivec, False, scope='linear_dependent')  # bs,sl,vec
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,vec
            head = linear(rep_map_dp, ivec, False, scope='linear_head')  # bs,sl,vec
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,vec

            logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,sl,sl,vec

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i', False, wd, keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False, wd, keep_prob,
                       is_train) +
                o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result
            output = mask_for_high_rank(output, rep_mask)

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score
            tensor_dict[name + '_gate'] = fusion_gate
        return output