def multi_dimensional_attention(rep_tensor, rep_mask, scope=None,
                                keep_prob=1., is_train=None, wd=0.,
                                activation='elu', tensor_dict=None, name=None):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'multi_dimensional_attention'):
        map1 = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map1', activation,
                              False, wd, keep_prob, is_train)
        map2 = bn_dense_layer(map1, ivec, True, 0., 'bn_dense_map2', 'linear',
                              False, wd, keep_prob, is_train)
        map2_masked = exp_mask_for_high_rank(map2, rep_mask)

        soft = tf.nn.softmax(map2_masked, 1)  # bs,sl,vec
        attn_output = tf.reduce_sum(soft * rep_tensor, 1)  # bs, vec

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name] = soft

        return attn_output
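# Usage sketch (illustrative only, not part of the original file): pooling a batch of
# token embeddings into sentence vectors with the source2token attention above.
# `token_emb` ([bs, sl, d], tf.float32), `token_mask` ([bs, sl], tf.bool) and
# `is_train` (scalar tf.bool) are assumed placeholders, not names defined here.
sent_vec = multi_dimensional_attention(
    token_emb, token_mask, scope='s2t_pooling',
    keep_prob=0.8, is_train=is_train, wd=1e-4, activation='elu')  # -> [bs, d]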
def bi_sru_recurrent_network(
        rep_tensor, rep_mask, is_train=None, keep_prob=1., wd=0.,
        scope=None, hn=None, reuse=None):
    """
    :param rep_tensor: [Tensor/tf.float32] rank is 3 with shape [batch_size/bs, max_sent_len/sl, vec]
    :param rep_mask: [Tensor/tf.bool] rank is 2 with shape [bs,sl]
    :param is_train: [Scalar Tensor/tf.bool] scalar tensor to indicate whether the mode is training or not
    :param keep_prob: [float] dropout keep probability in the range of (0,1)
    :param wd: [float] for L2 regularization, if !=0, add tensors to tf collection "reg_vars"
    :param scope: [str] variable scope name
    :param hn: [int] hidden unit number; defaults to the input feature size
    :param reuse: [bool] whether to reuse the variables in this scope
    :return: [Tensor/tf.float32] with shape [bs, sl, 2vec] for forward and backward
    """
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec
    with tf.variable_scope(scope or 'bi_sru_recurrent_network'):
        # U_d = bn_dense_layer([rep_tensor], 6 * ivec, False, 0., 'get_frc', 'linear',
        #                      False, wd, keep_prob, is_train)  # bs, sl, 6vec
        # U_d_fw, U_d_bw = tf.split(U_d, 2, 2)
        with tf.variable_scope('forward'):
            U_d_fw = bn_dense_layer([rep_tensor], 3 * ivec, False, 0., 'get_frc_fw', 'linear',
                                    False, wd, keep_prob, is_train)  # bs, sl, 3vec
            U_fw = tf.concat([rep_tensor, U_d_fw], -1)
            fw_SRUCell = SwitchableDropoutWrapper(
                SRUCell(ivec, tf.nn.tanh, reuse), is_train, keep_prob)
            fw_output, _ = dynamic_rnn(
                fw_SRUCell, U_fw, tf.reduce_sum(tf.cast(rep_mask, tf.int32), -1),
                dtype=tf.float32, scope='forward_sru')  # bs, sl, vec

        with tf.variable_scope('backward'):
            U_d_bw = bn_dense_layer([rep_tensor], 3 * ivec, False, 0., 'get_frc_bw', 'linear',
                                    False, wd, keep_prob, is_train)  # bs, sl, 3vec
            U_bw = tf.concat([rep_tensor, U_d_bw], -1)
            bw_SRUCell = SwitchableDropoutWrapper(
                SRUCell(ivec, tf.nn.tanh, reuse), is_train, keep_prob)
            bw_output, _ = bw_dynamic_rnn(
                bw_SRUCell, U_bw, tf.reduce_sum(tf.cast(rep_mask, tf.int32), -1),
                dtype=tf.float32, scope='backward_sru')  # bs, sl, vec

        all_output = tf.concat([fw_output, bw_output], -1)  # bs, sl, 2vec
        return all_output
def traditional_attention(rep_tensor, rep_mask, scope=None,
                          keep_prob=1., is_train=None, wd=0., activation='elu',
                          tensor_dict=None, name=None):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'traditional_attention'):
        rep_tensor_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                        False, wd, keep_prob, is_train)

        rep_tensor_logits = get_logits([rep_tensor_map], None, False, scope='self_attn_logits',
                                       mask=rep_mask, input_keep_prob=keep_prob,
                                       is_train=is_train)  # bs,sl
        attn_res = softsel(rep_tensor, rep_tensor_logits, rep_mask)  # bs,vec

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name] = tf.nn.softmax(rep_tensor_logits)

        return attn_res
def visit_multi_dimensional_attention(rep_tensor, keep_prob=1., is_train=None, wd=0.,
                                      activation='relu'):
    # bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope('multi_dimensional_attention'):
        map1 = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map1', activation,
                              False, wd, keep_prob, is_train)
        map2 = bn_dense_layer(map1, ivec, True, 0., 'bn_dense_map2', 'linear',
                              False, wd, keep_prob, is_train)

        soft = tf.nn.softmax(map2, 1)  # bs,sl,vec
        attn_output = tf.reduce_sum(soft * rep_tensor, 1)  # bs, vec

        return attn_output
def first_level_sa(rep_tensor, rep_mask, keep_prob=1., is_train=None, wd=0., activation='relu'):
    # bs, sw, cl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], \
    #     tf.shape(rep_tensor)[2], tf.shape(rep_tensor)[3]
    ivec = rep_tensor.get_shape()[3]
    with tf.variable_scope('first_level_sa'):
        print('original: ', rep_tensor.get_shape())
        map1 = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map1', activation,
                              False, wd, keep_prob, is_train)
        print('map1: ', map1.get_shape())
        map2 = bn_dense_layer(map1, ivec, True, 0., 'bn_dense_map2', 'linear',
                              False, wd, keep_prob, is_train)
        print('map2: ', map2.get_shape())
        map2_masked = exp_mask_for_high_rank(map2, rep_mask)

        soft = tf.nn.softmax(map2_masked, 2)  # bs,sk,code_len,vec
        attn_output = tf.reduce_sum(soft * rep_tensor, 2)  # bs, sk, vec

        return attn_output
def normal_attention(rep_tensor, rep_mask, scope=None,
                     keep_prob=1., is_train=None, wd=0., activation='elu',
                     tensor_dict=None, name=None):
    batch_size, code_len, vec_size = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], \
        tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'normal_attention'):
        rep_tensor_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                        False, wd, keep_prob, is_train)

        rep_tensor_logits = get_logits([rep_tensor_map], None, False, scope='self_attn_logits',
                                       mask=rep_mask, input_keep_prob=keep_prob,
                                       is_train=is_train)  # bs,sl
        attn_result = softsel(rep_tensor, rep_tensor_logits, rep_mask)  # bs,vec

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name] = tf.nn.softmax(rep_tensor_logits)

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_tensor_map, ivec, True, 0., 'linear_fusion_i',
                       False, wd, keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a',
                       False, wd, keep_prob, is_train) +
                o_bias)
            output = fusion_gate * rep_tensor_map + (1 - fusion_gate) * attn_result
            output = mask_for_high_rank(output, rep_mask)  # bs,sl,vec

        return output
def __call__(self, inputs, state, scope=None):
    """
    :param inputs: [bs, vec] input at the current time step
    :param state: [bs, num_units] previous internal state c_{t-1}
    :param scope: variable scope name
    :return: (h_t, c_t) output and new internal state
    """
    with tf.variable_scope(scope or "SRU_cell"):
        b_f = tf.get_variable('b_f', [self._num_units], dtype=tf.float32,
                              initializer=tf.constant_initializer(0))
        b_r = tf.get_variable('b_r', [self._num_units], dtype=tf.float32,
                              initializer=tf.constant_initializer(0))
        U_d = bn_dense_layer(inputs, 3 * self._num_units, False, 0., 'get_frc', 'linear')  # bs, 3vec
        x_t = tf.identity(inputs, 'x_t')
        x_dt, f_t, r_t = tf.split(U_d, 3, 1)
        f_t = tf.nn.sigmoid(f_t + b_f)
        r_t = tf.nn.sigmoid(r_t + b_r)
        c_t = f_t * state + (1 - f_t) * x_dt
        h_t = r_t * self._activation(c_t) + (1 - r_t) * x_t
        return h_t, c_t
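# The recurrence implemented by __call__ above, restated for reference
# (a summary of the code, not an addition to it):
#   x~_t, f_t, r_t = split(W x_t)            # 'get_frc' projection
#   f_t = sigmoid(f_t + b_f)                 # forget gate
#   r_t = sigmoid(r_t + b_r)                 # reset/highway gate
#   c_t = f_t * c_{t-1} + (1 - f_t) * x~_t   # internal state
#   h_t = r_t * act(c_t) + (1 - r_t) * x_t   # output
# Usage sketch with assumed names (`x_step`: [bs, vec], `c0`: [bs, hn]):
# cell = SRUCell(hn, tf.nn.tanh)
# h1, c1 = cell(x_step, c0, scope='SRU_cell')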
def sentence_encoding_models(
        rep_tensor, rep_mask, method, activation_function,
        scope=None, wd=0., is_train=None, keep_prob=1., **kwargs):
    method_name_list = [
        'cnn_kim',
        'no_ct',
        'lstm', 'gru', 'sru', 'sru_normal',  # rnn
        'multi_cnn', 'hrchy_cnn',
        'multi_head', 'multi_head_git', 'disa',
        'block'
    ]

    if 'hn' in kwargs.keys():
        hn = kwargs['hn']
    else:
        hn = None
    ivec = hn or rep_tensor.get_shape().as_list()[2]

    with tf.variable_scope(scope or 'sentence_encoding_models'):
        if method == 'cnn_kim':
            assert 2 * ivec % 3 == 0
            sub_hn = 2 * ivec // 3
            sent_encoding = cnn_for_sentence_encoding(
                rep_tensor, rep_mask, (3, 4, 5), sub_hn,
                'sent_encoding_cnn_kim', is_train, keep_prob, wd)
        else:
            ct_rep = None
            if method == 'no_ct':
                ct_rep = bn_dense_layer(
                    rep_tensor, 2 * ivec, True, 0., 'no_ct',
                    activation_function, False, wd, keep_prob, is_train)
            else:
                ct_rep = context_fusion_layers(
                    rep_tensor, rep_mask, method, activation_function,
                    None, wd, is_train, keep_prob, **kwargs)

            sent_encoding = multi_dimensional_attention(
                ct_rep, rep_mask, 'multi_dim_attn_for_%s' % method,
                keep_prob, is_train, wd, activation_function)

    return sent_encoding
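# Usage sketch (illustrative): encoding a batch of sentences with one of the
# methods in `method_name_list`. `word_emb` ([bs, sl, d]), `word_mask` ([bs, sl])
# and `is_train` are assumed placeholders; 'block' additionally expects a
# `block_len` kwarg, and most context-fusion methods accept `hn`.
sent_code = sentence_encoding_models(
    word_emb, word_mask, 'disa', 'elu', scope='sent_enc',
    wd=1e-4, is_train=is_train, keep_prob=0.75, hn=300)  # -> roughly [bs, 2*hn]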
def bi_sru_recurrent_network(rep_tensor, rep_mask, is_train=None, keep_prob=1., wd=0., scope=None):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    with tf.variable_scope(scope or 'bi_sru_recurrent_network'):
        U_d = bn_dense_layer([rep_tensor], 6 * ivec, True, 0., 'get_frc', 'linear',
                             False, wd, keep_prob, is_train)  # bs, sl, 6vec
        U_d_fw, U_d_bw = tf.split(U_d, 2, 2)
        with tf.variable_scope('forward'):
            U_fw = tf.concat([rep_tensor, U_d_fw], -1)
            fw_SRUCell = SwitchableDropoutWrapper(SRUCell(ivec, tf.nn.tanh), is_train, keep_prob)
            fw_output, _ = dynamic_rnn(
                fw_SRUCell, U_fw, tf.reduce_sum(tf.cast(rep_mask, tf.int32), -1),
                dtype=tf.float32, scope='forward_sru')  # bs, sl, vec

        with tf.variable_scope('backward'):
            U_bw = tf.concat([rep_tensor, U_d_bw], -1)
            bw_SRUCell = SwitchableDropoutWrapper(SRUCell(ivec, tf.nn.tanh), is_train, keep_prob)
            bw_output, _ = bw_dynamic_rnn(
                bw_SRUCell, U_bw, tf.reduce_sum(tf.cast(rep_mask, tf.int32), -1),
                dtype=tf.float32, scope='backward_sru')  # bs, sl, vec

        all_output = tf.concat([fw_output, bw_output], -1)  # bs, sl, 2vec
        return all_output
def build_loss_optimizer(self):
    with tf.name_scope('loss_optimization'):
        logits = bn_dense_layer(self.output, 1, True, 0., 'bn_dense_map', 'sigmoid',
                                False, wd=0., keep_prob=1., is_train=True)
        losses = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=tf.cast(self.labels, tf.float32))
        tf.add_to_collection('losses', tf.reduce_mean(losses, name='loss_mean'))
        loss = tf.add_n(tf.get_collection('losses', self.scope), name='loss')
        tf.summary.scalar(loss.op.name, loss)
        tf.add_to_collection('ema/scalar', loss)
        optimizer = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(loss)
    with tf.name_scope('correct_prediction'):
        correct_prediction = tf.equal(tf.round(logits), tf.cast(self.labels, tf.float32))
    with tf.name_scope('accuracy'):
        # Mean accuracy over all labels:
        # http://stackoverflow.com/questions/37746670/tensorflow-multi-label-accuracy-calculation
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return loss, optimizer, accuracy, logits, tf.round(logits)
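# Usage sketch (assumed driver code, not part of the model class): running one
# training step with the ops returned above. `model`, `feed` and `sess` are
# hypothetical names standing in for whatever the surrounding training script uses.
# loss_op, train_op, acc_op, prob_op, pred_op = model.build_loss_optimizer()
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     _, loss_val, acc_val = sess.run([train_op, loss_op, acc_op], feed_dict=feed)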
def build_network(self):
    tds, tel, hn = self.tds, self.tel, self.hn
    bs, sn, sl, ql = self.bs, self.sn, self.sl, self.ql

    with tf.variable_scope('emb'):
        token_emb_mat = generate_embedding_mat(
            tds, tel, init_mat=self.token_emb_mat,
            extra_mat=self.glove_emb_mat, scope='gene_token_emb_mat')
        c_emb = tf.nn.embedding_lookup(token_emb_mat, self.context_token)  # bs,sn,sl,tel
        q_emb = tf.nn.embedding_lookup(token_emb_mat, self.question_token)  # bs,ql,tel

    with tf.variable_scope('prepro'):
        q_rep = multi_dimensional_attention(
            q_emb, self.question_token_mask, 'q2coding',
            cfg.dropout, self.is_train, cfg.wd, 'relu')  # bs, hn
        q_rep_map = bn_dense_layer(q_rep, hn, True, 0., 'q_rep_map', 'relu',
                                   False, cfg.wd, cfg.dropout, self.is_train)  # bs, hn

    with tf.variable_scope('sent_emb'):
        c_emb_rshp = tf.reshape(c_emb, [bs * sn, sl, tel], 'c_emb_rshp')  # bs*sn,sl,tel
        c_mask_rshp = tf.reshape(self.context_token_mask, [bs * sn, sl], 'c_mask_rshp')  # bs*sn,sl
        sent_enc_rshp = sentence_encoding_models(
            c_emb_rshp, c_mask_rshp, cfg.context_fusion_method, 'relu',
            'sent2enc', cfg.wd, self.is_train, cfg.dropout,
            hn, block_len=cfg.block_len)  # bs*sn, 2*hn
        sent_enc = tf.reshape(sent_enc_rshp, [bs, sn, 2 * hn])  # bs,sn,2*hn
        sent_enc_map = bn_dense_layer(sent_enc, hn, True, 0., 'sent_enc_map', 'relu',
                                      False, cfg.wd, cfg.dropout, self.is_train)

    with tf.variable_scope('fusion'):
        q_rep_map_ex = tf.tile(tf.expand_dims(q_rep_map, 1), [1, sn, 1])  # bs,sn,hn
        fusion_rep = tf.concat(
            [sent_enc_map, q_rep_map_ex,
             sent_enc_map - q_rep_map_ex,
             sent_enc_map * q_rep_map_ex], -1)  # bs,sn,4hn

    with tf.variable_scope('output'):
        out_cf = context_fusion_layers(
            fusion_rep, self.context_sent_mask, cfg.context_fusion_method, 'relu',
            'out_cf', cfg.wd, self.is_train, cfg.dropout, hn, block_len=4)
        pre_output = bn_dense_layer(out_cf, hn, True, 0., 'pre_output', 'relu',
                                    False, cfg.wd, cfg.dropout, self.is_train)
        logits = get_logits(  # exp masked
            pre_output, None, True, 0., 'logits', self.context_sent_mask,
            cfg.wd, cfg.dropout, self.is_train, 'linear')
    return logits
def directional_attention_with_dense(rep_tensor, rep_mask, direction=None, scope=None,
                                     keep_prob=1., is_train=None, wd=0., activation='elu',
                                     tensor_dict=None, name=None, hn=None):
    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = hn or rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'directional_attention_%s' % direction or 'diag'):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            direct_mask = tf.cast(tf.diag(- tf.ones([sl], tf.int32)) + 1, tf.bool)
        else:
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)
            else:
                direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0), [bs, 1, 1])  # bs,sl,sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1), [1, sl, 1])  # bs,sl,sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs,sl,sl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1), [1, sl, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            dependent = linear(rep_map_dp, ivec, False, scope='linear_dependent')  # bs,sl,vec
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,vec
            head = linear(rep_map_dp, ivec, False, scope='linear_head')  # bs,sl,vec
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,vec

            logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,sl,sl,vec

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i', False, wd, keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False, wd, keep_prob, is_train) +
                o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result
            output = mask_for_high_rank(output, rep_mask)

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score
            tensor_dict[name + '_gate'] = fusion_gate
        return output
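# Usage sketch (illustrative): the usual DiSA pattern is one forward-masked and
# one backward-masked pass over the same sequence, concatenated on the feature
# axis. `seq_rep` ([bs, sl, d]) and `seq_mask` ([bs, sl]) are assumed inputs.
fw = directional_attention_with_dense(
    seq_rep, seq_mask, 'forward', 'dir_attn_fw', 0.75, is_train, 1e-4, 'elu')
bw = directional_attention_with_dense(
    seq_rep, seq_mask, 'backward', 'dir_attn_bw', 0.75, is_train, 1e-4, 'elu')
context_rep = tf.concat([fw, bw], -1)  # [bs, sl, 2d]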
def masked_positional_self_attention(sigma, rep_tensor, rep_mask, direction=None, scope=None,
                                     keep_prob=1., is_train=None, wd=0., activation='elu',
                                     tensor_dict=None, name=None):
    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'directional_attention_%s' % direction or 'diag'):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            direct_mask0 = tf.greater(sl_row + sigma, sl_col)
            direct_mask1 = tf.greater(sl_col + sigma, sl_row)
            direct_mask2 = tf.cast(1 - tf.diag(tf.ones([sl], tf.int32)), tf.bool)
            direct_mask = tf.logical_and(tf.logical_and(direct_mask0, direct_mask1), direct_mask2)
        else:
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)
            else:
                direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0), [bs, 1, 1])  # bs,sl,sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1), [1, sl, 1])  # bs,sl,sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs,sl,sl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1), [1, sl, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,1
            f_bias = tf.get_variable('f_bias', [1], tf.float32, tf.constant_initializer(0.))
            dependent = linear(rep_map_dp, 1, False, scope='linear_dependent')  # bs,sl,1
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,1
            head = linear(rep_map_dp, 1, False, scope='linear_head')  # bs,sl,1
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,1

            logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,sl,sl,1

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            if direction is not None:
                dis_mask = -tf.log(
                    tf.cast(tf.abs(sl_col - sl_row) + tf.diag(tf.ones([sl], tf.int32)), tf.float32))
                logits_masked = dis_mask_for_high_rank(logits_masked, dis_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,1
            attn_score = mask_for_high_rank(attn_score, attn_mask)
            attn_score = tf.tile(
                tf.expand_dims(tf.reshape(attn_score, [bs, sl, sl]), 3), [1, 1, 1, ivec])

            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,sl,vec

        with tf.variable_scope('output'):
            output = attn_result

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score
        return output
def simple_block_attention(rep_tensor, rep_mask, block_len=5, scope=None,
                           direction=None, keep_prob=1., is_train=None, wd=0.,
                           activation='elu', hn=None):
    assert direction is not None

    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    org_ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or org_ivec
    with tf.variable_scope(scope or 'block_simple'):
        # @1. split sequence
        with tf.variable_scope('split_seq'):
            block_num = tf.cast(
                tf.ceil(tf.divide(tf.cast(sl, tf.float32), tf.cast(block_len, tf.float32))),
                tf.int32)
            comp_len = block_num * block_len - sl

            rep_tensor_comp = tf.concat(
                [rep_tensor, tf.zeros([bs, comp_len, org_ivec], tf.float32)], 1)
            rep_mask_comp = tf.concat(
                [rep_mask, tf.cast(tf.zeros([bs, comp_len], tf.int32), tf.bool)], 1)

            rep_tensor_split = tf.reshape(
                rep_tensor_comp, [bs, block_num, block_len, org_ivec])  # bs,bn,bl,d
            rep_mask_split = tf.reshape(rep_mask_comp, [bs, block_num, block_len])  # bs,bn,bl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor_split, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)  # bs,bn,bl,vec
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 2), [1, 1, block_len, 1, 1])  # bs,bn,bl,bl,vec
        # rep_map_dp = dropout(rep_map, keep_prob, is_train)
        bn = block_num
        bl = block_len

        with tf.variable_scope('self_attention'):
            # @2. self-attention in block
            # mask generation
            sl_indices = tf.range(block_len, dtype=tf.int32)
            sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)  # bl,bl
            else:
                direct_mask = tf.greater(sl_col, sl_row)  # bl,bl
            direct_mask_tile = tf.tile(
                tf.expand_dims(tf.expand_dims(direct_mask, 0), 0), [bs, bn, 1, 1])  # bs,bn,bl,bl
            rep_mask_tile_1 = tf.tile(tf.expand_dims(rep_mask_split, 2), [1, 1, bl, 1])  # bs,bn,bl,bl
            rep_mask_tile_2 = tf.tile(tf.expand_dims(rep_mask_split, 3), [1, 1, 1, bl])  # bs,bn,bl,bl
            rep_mask_tile = tf.logical_and(rep_mask_tile_1, rep_mask_tile_2)
            attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile, name='attn_mask')  # bs,bn,bl,bl

            # attention
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            dependent_head = linear(
                rep_map, 2 * ivec, False, 0., 'linear_dependent_head', False,
                wd, keep_prob, is_train)  # bs,bn,bl,2vec
            dependent, head = tf.split(dependent_head, 2, 3)
            dependent_etd = tf.expand_dims(dependent, 2)  # bs,bn,1,bl,vec
            head_etd = tf.expand_dims(head, 3)  # bs,bn,bl,1,vec
            logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,bn,bl,bl,vec
            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 3)  # bs,bn,bl,bl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)  # bs,bn,bl,bl,vec
            self_attn_result = tf.reduce_sum(attn_score * rep_map_tile, 3)  # bs,bn,bl,vec

        with tf.variable_scope('source2token_self_attn'):
            inter_block_logits = bn_dense_layer(self_attn_result, ivec, True, 0., 'bn_dense_map',
                                                'linear', False, wd, keep_prob, is_train)  # bs,bn,bl,vec
            inter_block_logits_masked = exp_mask_for_high_rank(
                inter_block_logits, rep_mask_split)  # bs,bn,bl,vec
            inter_block_soft = tf.nn.softmax(inter_block_logits_masked, 2)  # bs,bn,bl,vec
            inter_block_attn_output = tf.reduce_sum(
                self_attn_result * inter_block_soft, 2)  # bs,bn,vec

        with tf.variable_scope('self_attn_inter_block'):
            inter_block_attn_output_mask = tf.cast(tf.ones([bs, bn], tf.int32), tf.bool)
            block_ct_res = directional_attention_with_dense(
                inter_block_attn_output, inter_block_attn_output_mask, direction, 'disa',
                keep_prob, is_train, wd, activation)  # [bs,bn,vec]
            block_ct_res_tile = tf.tile(tf.expand_dims(block_ct_res, 2),
                                        [1, 1, bl, 1])  # [bs,bn,vec] -> [bs,bn,bl,vec]

        with tf.variable_scope('combination'):
            # input: 1. rep_map [bs,bn,bl,vec]; 2. self_attn_result [bs,bn,bl,vec];
            #        3. block_ct_res_tile [bs,bn,bl,vec]
            rep_tensor_with_ct = tf.concat(
                [rep_map, self_attn_result, block_ct_res_tile], -1)  # [bs,bn,bl,3vec]
            new_context_and_gate = linear(
                rep_tensor_with_ct, 2 * ivec, True, 0., 'linear_new_context_and_gate',
                False, wd, keep_prob, is_train)  # [bs,bn,bl,2vec]
            new_context, gate = tf.split(new_context_and_gate, 2, 3)  # bs,bn,bl,vec
            if activation == "relu":
                new_context_act = tf.nn.relu(new_context)
            elif activation == "elu":
                new_context_act = tf.nn.elu(new_context)
            elif activation == "linear":
                new_context_act = tf.identity(new_context)
            else:
                raise RuntimeError
            gate_sig = tf.nn.sigmoid(gate)
            combination_res = gate_sig * new_context_act + (1 - gate_sig) * rep_map  # bs,bn,bl,vec

        with tf.variable_scope('restore_original_length'):
            combination_res_reshape = tf.reshape(
                combination_res, [bs, bn * bl, ivec])  # bs,bn*bl,vec
            output = combination_res_reshape[:, :sl, :]
            return output
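# Usage sketch (illustrative): block attention trades the O(sl^2) memory of full
# token-level self-attention for per-block attention plus an inter-block pass;
# `block_len` controls that trade-off. `seq_rep`, `seq_mask`, `is_train` are
# assumed placeholders.
blocked_ctx = simple_block_attention(
    seq_rep, seq_mask, block_len=8, scope='bi_blosa_fw',
    direction='forward', keep_prob=0.75, is_train=is_train,
    wd=1e-4, activation='elu', hn=300)  # [bs, sl, hn]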
def gated_self_attention(rep_tensor, rep_mask, scope=None, keep_prob=1., is_train=None,
                         wd=0., activation='elu', hn=None, position_mask_type=None):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec
    with tf.variable_scope(scope or 'gated_self_attention_%s' % (position_mask_type or 'None')):
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)

        # mask generation
        rep_mask_epd1 = tf.expand_dims(rep_mask, 1)  # bs,1,sl
        rep_mask_epd2 = tf.expand_dims(rep_mask, 2)  # bs,sl,1
        rep_mask_mat = tf.logical_and(rep_mask_epd1, rep_mask_epd2)  # bs,sl,sl

        if position_mask_type in ['forward', 'backward']:
            sl_indices = tf.range(sl, dtype=tf.int32)
            sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
            if position_mask_type == 'forward':
                position_mask = tf.greater(sl_row, sl_col)
            else:
                position_mask = tf.greater(sl_col, sl_row)
            position_mask = tf.tile(tf.expand_dims(position_mask, 0), [bs, 1, 1])
            position_mask = tf.logical_and(rep_mask_mat, position_mask)
        else:
            position_mask = rep_mask_mat
        position_mask_ft = tf.cast(position_mask, tf.float32)

        # attention
        with tf.variable_scope('intra_sent_attn'):  # bs,sl,hn
            # rep_tensor_mean = pooling_with_mask(rep_tensor, rep_mask, 'mean')  # bs, hn
            rep_tensor_for_attn = rep_map

            pre_align_score = bn_dense_layer(  # bs,sl,hn
                rep_tensor_for_attn, ivec, True, 0., 'intra_sent_map1',
                activation, False, wd, keep_prob, is_train)
            align_score = bn_dense_layer(  # bs,sl,hn
                pre_align_score, ivec, True, 0., 'intra_sent_map2',
                'linear', False, wd, keep_prob, is_train)
            align_score_w_mask = exp_mask_for_high_rank(align_score, rep_mask)  # bs,sl,hn
            exp_align_score = tf.exp(align_score_w_mask)  # bs,sl,hn

            accum_z_deno = tf.matmul(position_mask_ft, exp_align_score)
            accum_z_deno = tf.where(
                tf.greater(accum_z_deno, tf.zeros_like(accum_z_deno)),
                accum_z_deno,
                tf.ones_like(accum_z_deno))

            rep_mul_score = rep_map * exp_align_score
            accum_rep_mul_score = tf.matmul(position_mask_ft, rep_mul_score)

            attn_res = accum_rep_mul_score / accum_z_deno

        with tf.variable_scope('context_fusion_gate'):
            fusion_gate = tf.nn.sigmoid(
                bn_dense_layer([rep_map, attn_res], hn, True, 0., 'linear_fusion_gate',
                               activation, False, wd, keep_prob, is_train))
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_res
        output = mask_for_high_rank(output, rep_mask)
    return output
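# Usage sketch (illustrative): with `position_mask_type=None` every token attends
# over the whole (masked) sentence; 'forward'/'backward' restrict each token to
# its left/right context. `seq_rep`, `seq_mask`, `is_train` are assumed placeholders.
gsa_out = gated_self_attention(
    seq_rep, seq_mask, scope='gsa', keep_prob=0.75, is_train=is_train,
    wd=1e-4, activation='elu', hn=300, position_mask_type=None)  # [bs, sl, hn]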
def build_network(self):
    _logger.add()
    _logger.add('building %s neural network structure...' % cfg.network_type)
    tds, cds = self.tds, self.cds
    tl = self.tl
    tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
    hn = self.hn
    bs, sl1, sl2 = self.bs, self.sl1, self.sl2

    with tf.variable_scope('emb'):
        token_emb_mat = generate_embedding_mat(tds, tel, init_mat=self.token_emb_mat,
                                               extra_mat=self.glove_emb_mat,
                                               extra_trainable=self.finetune_emb,
                                               scope='gene_token_emb_mat')
        s1_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent1_token)  # bs,sl1,tel
        s2_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent2_token)  # bs,sl2,tel
        self.tensor_dict['s1_emb'] = s1_emb
        self.tensor_dict['s2_emb'] = s2_emb

    with tf.variable_scope('hard_network'):
        # s1_act, s1_logpa, s2_act, s2_logpa, choose_percentage
        s1_act = self.sent1_token_mask
        s1_logpa = tf.cast(s1_act, tf.float32)
        s2_act = self.sent2_token_mask
        s2_logpa = tf.cast(s2_act, tf.float32)
        s1_percentage = tf.ones([bs], tf.float32)
        s2_percentage = tf.ones([bs], tf.float32)

    with tf.variable_scope('ct_attn'):
        s1_fw = directional_attention_with_dense(
            s1_emb, self.sent1_token_mask, 'forward', 'dir_attn_fw',
            cfg.dropout, self.is_train, cfg.wd,
            tensor_dict=self.tensor_dict, name='s1_fw_attn')
        s1_bw = directional_attention_with_dense(
            s1_emb, self.sent1_token_mask, 'backward', 'dir_attn_bw',
            cfg.dropout, self.is_train, cfg.wd,
            tensor_dict=self.tensor_dict, name='s1_bw_attn')
        s1_seq_rep = tf.concat([s1_fw, s1_bw], -1)

        tf.get_variable_scope().reuse_variables()

        s2_fw = directional_attention_with_dense(
            s2_emb, self.sent2_token_mask, 'forward', 'dir_attn_fw',
            cfg.dropout, self.is_train, cfg.wd,
            tensor_dict=self.tensor_dict, name='s2_fw_attn')
        s2_bw = directional_attention_with_dense(
            s2_emb, self.sent2_token_mask, 'backward', 'dir_attn_bw',
            cfg.dropout, self.is_train, cfg.wd,
            tensor_dict=self.tensor_dict, name='s2_bw_attn')
        s2_seq_rep = tf.concat([s2_fw, s2_bw], -1)

    with tf.variable_scope('sentence_enc'):
        s1_rep = multi_dimensional_attention(
            s1_seq_rep, self.sent1_token_mask, 'multi_dimensional_attention',
            cfg.dropout, self.is_train, cfg.wd,
            tensor_dict=self.tensor_dict, name='s1_attn')
        tf.get_variable_scope().reuse_variables()
        s2_rep = multi_dimensional_attention(
            s2_seq_rep, self.sent2_token_mask, 'multi_dimensional_attention',
            cfg.dropout, self.is_train, cfg.wd,
            tensor_dict=self.tensor_dict, name='s2_attn')

    with tf.variable_scope('output'):
        out_rep = tf.concat([s1_rep, s2_rep, s1_rep - s2_rep, s1_rep * s2_rep], -1)
        out_rep_map = bn_dense_layer(
            out_rep, hn, True, 0., 'out_rep_map', 'elu',
            False, cfg.wd, cfg.dropout, self.is_train)
        pre_output1 = highway_network(
            out_rep_map, hn, True, 0., 'pre_output1', 'elu',
            False, cfg.wd, cfg.dropout, self.is_train)
        logits = linear([pre_output1], self.output_class, True, 0., scope='logits',
                        squeeze=False, wd=cfg.wd, input_keep_prob=cfg.dropout,
                        is_train=self.is_train)
    return logits, (s1_act, s1_logpa), (s2_act, s2_logpa), (s1_percentage, s2_percentage)  # logits
def directional_attention_with_selections(
        rep_tensor, rep_mask, dep_selection, head_selection, direction=None, hn=None,
        keep_unselected=True, scope=None, keep_prob=1., is_train=None, wd=0., activation='elu'):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    org_ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or org_ivec
    with tf.variable_scope(scope or 'directional_attention_%s' % direction or 'diag'):
        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)

        # ensure the selection is right
        dep_selection = tf.logical_and(rep_mask, dep_selection)
        head_selection = tf.logical_and(rep_mask, head_selection)

        rep_dep_tensor, rep_dep_mask, dep_org_idx = reduce_data_rep_max_len(rep_map, dep_selection)
        rep_head_tensor, rep_head_mask, head_org_idx = reduce_data_rep_max_len(rep_map, head_selection)
        sl_dep, sl_head = tf.shape(rep_dep_tensor)[1], tf.shape(rep_head_tensor)[1]

        if keep_unselected:
            unhead_selection = tf.logical_and(rep_mask, tf.logical_not(head_selection))
            rep_unhead_tensor, rep_unhead_mask, unhead_org_idx = reduce_data_rep_max_len(
                rep_map, unhead_selection)
            sl_unhead = tf.shape(rep_unhead_tensor)[1]

        attn_result = tf.cond(
            tf.equal(sl_head, 0),
            lambda: tf.zeros([bs, 0, hn], tf.float32),
            lambda: self_attention_for_selected_head(
                head_selection, head_org_idx, sl_head, rep_head_mask,
                dep_selection, dep_org_idx, sl_dep, rep_dep_mask,
                rep_map, rep_dep_tensor, keep_prob, is_train, direction, ivec
            )
        )

        if keep_unselected:
            input_idx = tf.tile(tf.expand_dims(tf.range(sl), 0), [bs, 1])
            pooling_result = tf.cond(
                tf.equal(sl_unhead, 0),
                lambda: tf.zeros([bs, 0, hn], tf.float32),
                lambda: mean_pooling_for_unselected_head(
                    unhead_org_idx, sl_unhead, rep_unhead_mask,
                    input_idx, sl, rep_mask, rep_map, None)  # todo: point !
            )

        with tf.variable_scope('output'):
            if keep_unselected:
                range_head = tf.tile(tf.expand_dims(tf.range(bs), -1), [1, sl_head])
                scatter_attn = tf.cond(
                    tf.equal(sl_head, 0),
                    lambda: tf.zeros([bs, sl + 1, hn], tf.float32),
                    lambda: tf.scatter_nd(
                        tf.stack([range_head, head_org_idx], -1), attn_result, [bs, sl + 1, hn])
                )

                range_unhead = tf.tile(tf.expand_dims(tf.range(bs), -1), [1, sl_unhead])
                scatter_pooling = tf.cond(
                    tf.equal(sl_unhead, 0),
                    lambda: tf.zeros([bs, sl + 1, hn], tf.float32),
                    lambda: tf.scatter_nd(
                        tf.stack([range_unhead, unhead_org_idx], -1), pooling_result, [bs, sl + 1, hn])
                )

                self_attn_input = rep_map
                context_features = tf.add(scatter_attn[:, :-1], scatter_pooling[:, :-1],
                                          'context_features')
                output_mask = rep_mask
            else:
                self_attn_input = rep_head_tensor
                context_features = attn_result
                output_mask = rep_head_mask

            # context fusion gate
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            fusion_gate = tf.nn.sigmoid(
                linear(self_attn_input, ivec, True, 0., 'linear_fusion_i',
                       False, wd, keep_prob, is_train) +
                linear(context_features, ivec, True, 0., 'linear_fusion_a',
                       False, wd, keep_prob, is_train) +
                o_bias)
            output = fusion_gate * self_attn_input + (1 - fusion_gate) * context_features

        return output, output_mask
def generate_mask_with_rl_real(rep_tensor, rep_mask, is_mat=False, scope=None,
                               keep_prob=1., is_train=None, wd=0., activation='elu', hn=None):
    """
    :param rep_tensor: 3d tensor
    :param rep_mask: 2d tensor
    :param is_mat: [True|False]
    :param scope: [str] variable scope name
    :param keep_prob: [float] dropout keep probability
    :param is_train: [Scalar Tensor/tf.bool] training-mode indicator
    :param wd: [float] L2 regularization factor
    :param activation: [str] activation function name
    :param hn: [int] hidden unit number
    :return: (logpa, actions, percentage)
    """
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = hn or rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'generate_mask_with_rl_real'):
        if is_mat:
            rep_row = tf.tile(tf.expand_dims(rep_tensor, 1), [1, sl, 1, 1])
            rep_col = tf.tile(tf.expand_dims(rep_tensor, 2), [1, 1, sl, 1])
            rep_h0 = tf.concat([rep_row, rep_col], -1)
        else:
            rep_h0 = rep_tensor

        rep_h1 = bn_dense_layer([rep_h0], ivec, True, 0., 'dense_rep_mat_h1', activation,
                                False, wd, keep_prob, is_train)
        rep_h2 = bn_dense_layer([rep_h1], 1, True, 0., 'dense_rep_mat_h2', 'linear',
                                False, wd, keep_prob, is_train)
        rep_h2 = tf.squeeze(rep_h2, 3 if is_mat else 2)  # bs,sl,sl / bs,sl
        rep_prob = tf.nn.sigmoid(rep_h2)

        # sampling
        # Here, need a dynamic policy to add the random  # todo: text
        # mode_is_train = tf.constant(mode == 'train', tf.bool, [], 'mode_is_train')
        # random_values = tf.cond(
        #     tf.logical_and(mode_is_train, is_train),
        #     lambda: tf.random_uniform([bs, sl, sl] if is_mat else [bs, sl]),
        #     lambda: tf.ones([bs, sl, sl] if is_mat else [bs, sl], tf.float32) * 0.5
        # )
        random_values = tf.random_uniform([bs, sl, sl] if is_mat else [bs, sl])

        # if global_step is not None:
        #     policy_rep_prob = tf.cond(
        #         tf.logical_and(mode_is_train,
        #                        tf.less(global_step, tf.constant(int(x2), tf.int32))),
        #         lambda: rep_prob + prob_gain,
        #         lambda: rep_prob)
        # else:
        #     policy_rep_prob = rep_prob
        policy_rep_prob = rep_prob

        actions = tf.less_equal(random_values, policy_rep_prob)
        actions = tf.stop_gradient(actions)

        if is_mat:
            rep_mask_new = tf.logical_and(
                tf.expand_dims(rep_mask, 1),
                tf.expand_dims(rep_mask, 2)
            )
        else:
            rep_mask_new = rep_mask
        actions = tf.logical_and(actions, rep_mask_new)

        # log p(a)
        logpa = - binary_entropy(rep_prob, actions) * tf.cast(rep_mask_new, tf.float32)
        if is_mat:
            logpa = - tf.reshape(logpa, [bs, sl * sl])

        # percentage
        actions_flat = tf.reshape(actions, [bs, -1])
        rep_mask_new_flat = tf.reshape(rep_mask_new, [bs, -1])
        percentage = tf.reduce_sum(tf.cast(actions_flat, tf.float32), -1) / \
            tf.reduce_sum(tf.cast(rep_mask_new_flat, tf.float32), -1)

        return logpa, actions, percentage
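# Usage sketch (illustrative): sampling token-level selection actions for the
# hard-attention / RL branch. `sent_emb` ([bs, sl, d]) and `sent_mask` ([bs, sl])
# are assumed placeholders; `logpa` would feed a REINFORCE-style loss, `actions`
# is the sampled boolean mask, and `percentage` is the selected fraction per sample.
logpa, actions, percentage = generate_mask_with_rl_real(
    sent_emb, sent_mask, is_mat=False, scope='rl_mask',
    keep_prob=0.8, is_train=is_train, wd=1e-4, activation='elu', hn=300)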
def build_network(self):
    _logger.add()
    _logger.add('building %s neural network structure...' % cfg.network_type)
    tds, cds = self.tds, self.cds
    tl = self.tl
    tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
    hn = self.hn
    bs, sl1, sl2 = self.bs, self.sl1, self.sl2

    with tf.variable_scope('emb'):
        token_emb_mat = generate_embedding_mat(
            tds, tel, init_mat=self.token_emb_mat, extra_mat=self.glove_emb_mat,
            extra_trainable=self.finetune_emb, scope='gene_token_emb_mat')
        s1_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent1_token)  # bs,sl1,tel
        s2_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent2_token)  # bs,sl2,tel
        self.tensor_dict['s1_emb'] = s1_emb
        self.tensor_dict['s2_emb'] = s2_emb

    with tf.variable_scope('hard_network'):
        # for sentence 1
        s1_emb_new = sequence_conditional_feature(s1_emb, self.sent1_token_mask)
        s1_logpa_dep, s1_act_dep, s1_percentage_dep = generate_mask_with_rl(
            s1_emb_new, self.sent1_token_mask, False, 'generate_mask_with_rl_dep',
            cfg.dropout, self.is_train, cfg.wd, 'relu', self.disable_rl,
            self.global_step, cfg.mode, cfg.start_only_rl, hn)  # [bs, sl] & [bs, sl]
        s1_logpa_head, s1_act_head, s1_percentage_head = generate_mask_with_rl(
            s1_emb_new, self.sent1_token_mask, False, 'generate_mask_with_rl_head',
            cfg.dropout, self.is_train, cfg.wd, 'relu', self.disable_rl,
            self.global_step, cfg.mode, cfg.start_only_rl, hn)  # [bs, sl] & [bs, sl]
        s1_logpa = tf.concat([s1_logpa_dep, s1_logpa_head], -1)
        s1_act = tf.logical_and(tf.expand_dims(s1_act_dep, 1), tf.expand_dims(s1_act_head, 2))
        s1_percentage = s1_percentage_dep * s1_percentage_head

        tf.get_variable_scope().reuse_variables()

        # for sentence 2
        s2_emb_new = sequence_conditional_feature(s2_emb, self.sent2_token_mask)
        s2_logpa_dep, s2_act_dep, s2_percentage_dep = generate_mask_with_rl(
            s2_emb_new, self.sent2_token_mask, False, 'generate_mask_with_rl_dep',
            cfg.dropout, self.is_train, cfg.wd, 'relu', self.disable_rl,
            self.global_step, cfg.mode, cfg.start_only_rl, hn)  # [bs, sl] & [bs, sl]
        s2_logpa_head, s2_act_head, s2_percentage_head = generate_mask_with_rl(
            s2_emb_new, self.sent2_token_mask, False, 'generate_mask_with_rl_head',
            cfg.dropout, self.is_train, cfg.wd, 'relu', self.disable_rl,
            self.global_step, cfg.mode, cfg.start_only_rl, hn)  # [bs, sl] & [bs, sl]
        s2_logpa = tf.concat([s2_logpa_dep, s2_logpa_head], -1)
        s2_act = tf.logical_and(tf.expand_dims(s2_act_dep, 1), tf.expand_dims(s2_act_head, 2))
        s2_percentage = s2_percentage_dep * s2_percentage_head

    keep_unselected = True
    with tf.variable_scope('ct_attn'):
        s1_fw, s1_token_mask_new = directional_attention_with_selections(
            s1_emb, self.sent1_token_mask, s1_act_dep, s1_act_head,
            'forward', hn, keep_unselected,
            'dir_attn_fw', cfg.dropout, self.is_train, cfg.wd, 'relu')
        s1_bw, _ = directional_attention_with_selections(
            s1_emb, self.sent1_token_mask, s1_act_dep, s1_act_head,
            'backward', hn, keep_unselected,
            'dir_attn_bw', cfg.dropout, self.is_train, cfg.wd, 'relu')
        s1_seq_rep = tf.concat([s1_fw, s1_bw], -1)

        tf.get_variable_scope().reuse_variables()

        s2_fw, s2_token_mask_new = directional_attention_with_selections(
            s2_emb, self.sent2_token_mask, s2_act_dep, s2_act_head,
            'forward', hn, keep_unselected,
            'dir_attn_fw', cfg.dropout, self.is_train, cfg.wd, 'relu')
        s2_bw, _ = directional_attention_with_selections(
            s2_emb, self.sent2_token_mask, s2_act_dep, s2_act_head,
            'backward', hn, keep_unselected,
            'dir_attn_bw', cfg.dropout, self.is_train, cfg.wd, 'relu')
        s2_seq_rep = tf.concat([s2_fw, s2_bw], -1)

    with tf.variable_scope('sentence_enc'):
        s1_rep = multi_dimensional_attention(
            s1_seq_rep, s1_token_mask_new, 'multi_dimensional_attention',
            cfg.dropout, self.is_train, cfg.wd, 'relu',
            tensor_dict=self.tensor_dict, name='s1_attn')
        tf.get_variable_scope().reuse_variables()
        s2_rep = multi_dimensional_attention(
            s2_seq_rep, s2_token_mask_new, 'multi_dimensional_attention',
            cfg.dropout, self.is_train, cfg.wd, 'relu',
            tensor_dict=self.tensor_dict, name='s2_attn')

    with tf.variable_scope('output'):
        out_rep = tf.concat([s1_rep * s2_rep, tf.abs(s1_rep - s2_rep)], -1)
        out_rep_map = bn_dense_layer(out_rep, hn, True, 0., 'out_rep_map', 'relu',
                                     False, cfg.wd, cfg.dropout, self.is_train)
        if cfg.use_mse and cfg.mse_logits:
            logits = tf.nn.sigmoid(
                linear(out_rep_map, 1, True, 0., scope='logits',
                       squeeze=True, wd=cfg.wd, input_keep_prob=cfg.dropout,
                       is_train=self.is_train)) * 2. + 3.
        else:
            logits = linear([out_rep_map], self.output_class, True, 0., scope='logits',
                            squeeze=False, wd=cfg.wd, input_keep_prob=cfg.dropout,
                            is_train=self.is_train)
    return logits, (s1_act, s1_logpa), (s2_act, s2_logpa), (s1_percentage, s2_percentage)  # logits
def visit_sa_with_dense(rep_tensor, keep_prob=1., is_train=None, wd=0.,
                        activation='relu', hn=None, is_scale=True, is_plus_sa=True):
    batch_size, sw_len, vec_size = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], \
        tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec
    with tf.variable_scope('temporal_attention'):
        # mask generation
        attn_mask = tf.cast(tf.diag(- tf.ones([sw_len], tf.int32)) + 1,
                            tf.bool)  # batch_size, code_len, code_len

        # non-linear for context
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1), [1, sw_len, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            dependent = linear(rep_map_dp, ivec, False,
                               scope='linear_dependent')  # batch_size, code_len, vec_size
            dependent_etd = tf.expand_dims(dependent, 1)  # batch_size, code_len, code_len, vec_size
            head = linear(rep_map_dp, ivec, False,
                          scope='linear_head')  # batch_size, code_len, vec_size
            head_etd = tf.expand_dims(head, 2)  # batch_size, code_len, code_len, vec_size

            if is_plus_sa:
                attention_fact = dependent_etd + head_etd + f_bias
            else:
                return rep_map

            if is_scale:
                logits = scaled_tanh(attention_fact, 5.0)  # bs,sl,sl,vec
            else:
                logits = linear(tf.nn.tanh(attention_fact), ivec, True, scope='linear_attn_fact')

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i', False, wd, keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False, wd, keep_prob, is_train) +
                o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result

        return output
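# Usage sketch (illustrative): self-attention over the visit sequence of one
# patient record; `visit_rep` ([bs, n_visits, d]) is an assumed placeholder.
# Note this variant takes no mask argument, so padded visits would have to be
# handled by the caller (e.g. masked afterwards).
visit_ctx = visit_sa_with_dense(
    visit_rep, keep_prob=0.8, is_train=is_train, wd=0.,
    activation='relu', hn=None, is_scale=True, is_plus_sa=True)  # [bs, n_visits, d]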
def build_network(self):
    with tf.name_scope('code_embeddings'):
        if self.model_type == 'raw':
            # init_code_embed = tf.random_uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0)
            # code_embeddings = tf.Variable(init_code_embed)
            init_code_embed = tf.one_hot(self.inputs, self.vocabulary_size,
                                         on_value=1.0, off_value=0.0, axis=-1)
            inputs_embed = bn_dense_layer(init_code_embed, self.embedding_size, True, 0.,
                                          'bn_dense_map_linear', 'linear', False,
                                          wd=0., keep_prob=1., is_train=True)
        elif self.model_type in ('tesa', 'delta', 'sa', 'normal', 'cbow', 'sg'):
            init_code_embed = tesan_trans(self.model_type)
            code_embeddings = tf.Variable(init_code_embed, dtype=tf.float32)
            inputs_embed = tf.nn.embedding_lookup(code_embeddings, self.inputs)
        elif self.model_type == 'mce':
            init_code_embed = mce_trans()
            code_embeddings = tf.Variable(init_code_embed, dtype=tf.float32)
            inputs_embed = tf.nn.embedding_lookup(code_embeddings, self.inputs)
        elif self.model_type == 'glove':
            init_code_embed = glove_trans()
            code_embeddings = tf.Variable(init_code_embed, dtype=tf.float32)
            inputs_embed = tf.nn.embedding_lookup(code_embeddings, self.inputs)
        else:
            init_code_embed = med2vec_trans()
            # code_embeddings = tf.constant(init_code_embed, dtype=tf.float32)
            code_embeddings = tf.Variable(init_code_embed, dtype=tf.float32)
            inputs_embed = tf.nn.embedding_lookup(code_embeddings, self.inputs)

    with tf.name_scope('visit_embedding'):
        # bs, max_visits, max_len_visit, embed_size
        inputs_masked = mask_for_high_rank(inputs_embed, self.inputs_mask)
        inputs_reduced = tf.reduce_mean(inputs_masked, 2)  # batch_size, max_visits, embed_size

    with tf.name_scope('visit_masking'):
        visit_mask = tf.reduce_sum(tf.cast(self.inputs_mask, tf.int32), -1)  # [bs, max_visits]
        visit_mask = tf.cast(visit_mask, tf.bool)
        tensor_len = tf.reduce_sum(tf.cast(visit_mask, tf.int32), -1)  # [bs]

    with tf.name_scope('RNN_computaion'):
        reuse = None if not tf.get_variable_scope().reuse else True
        if cfg.cell_type == 'gru':
            cell = tf.contrib.rnn.GRUCell(cfg.hn, reuse=reuse)
        elif cfg.cell_type == 'lstm':
            cell = tf.contrib.rnn.LSTMCell(cfg.hn, reuse=reuse)
        elif cfg.cell_type == 'basic_lstm':
            cell = tf.contrib.rnn.BasicLSTMCell(cfg.hn, reuse=reuse)
        elif cfg.cell_type == 'basic_rnn':
            cell = tf.contrib.rnn.BasicRNNCell(cfg.hn, reuse=reuse)
        outputs, final_state = dynamic_rnn(cell, inputs_reduced, tensor_len, dtype=tf.float32)

    return outputs, final_state, tensor_len