Exemplo n.º 1
0
def multi_dimensional_attention(rep_tensor,
                                rep_mask,
                                scope=None,
                                keep_prob=1.,
                                is_train=None,
                                wd=0.,
                                activation='elu',
                                tensor_dict=None,
                                name=None):
    """Pool ``rep_tensor`` over axis 1 using per-feature softmax weights.

    Two stacked ``bn_dense_layer`` projections produce one score per element
    of ``rep_tensor``; the scores are masked with ``rep_mask``, normalised
    with a softmax along axis 1 and used to take a weighted sum of the
    original ``rep_tensor`` along that axis.

    :param rep_tensor: input tensor; rank 3 (bs, sl, vec) per the shape
        comments below
    :param rep_mask: mask handed to ``exp_mask_for_high_rank``
    :param scope: variable scope name; defaults to
        'multi_dimensional_attention'
    :param keep_prob: forwarded to ``bn_dense_layer``
    :param is_train: forwarded to ``bn_dense_layer``
    :param wd: forwarded to ``bn_dense_layer``
    :param activation: activation name for the first projection
    :param tensor_dict: optional dict that receives the softmax weights
    :param name: key used in ``tensor_dict``; weights are stored only when
        both ``tensor_dict`` and ``name`` are given
    :return: the weighted sum over axis 1
    """
    # Dynamic batch / length / width are looked up exactly as before even
    # though nothing below consumes them, so the built graph is unchanged.
    _dyn_batch = tf.shape(rep_tensor)[0]
    _dyn_len = tf.shape(rep_tensor)[1]
    _dyn_width = tf.shape(rep_tensor)[2]
    feature_dim = rep_tensor.get_shape()[2]
    store_weights = not (tensor_dict is None or name is None)

    with tf.variable_scope(scope or 'multi_dimensional_attention'):
        hidden = bn_dense_layer(
            rep_tensor, feature_dim, True, 0., 'bn_dense_map1', activation,
            False, wd, keep_prob, is_train)
        scores = bn_dense_layer(
            hidden, feature_dim, True, 0., 'bn_dense_map2', 'linear',
            False, wd, keep_prob, is_train)
        masked_scores = exp_mask_for_high_rank(scores, rep_mask)

        weights = tf.nn.softmax(masked_scores, 1)  # bs,sl,vec
        pooled = tf.reduce_sum(weights * rep_tensor, 1)  # bs,vec

        # Expose the attention weights to the caller on request.
        if store_weights:
            tensor_dict[name] = weights

        return pooled
Exemplo n.º 2
0
def bi_sru_recurrent_network(rep_tensor,
                             rep_mask,
                             is_train=None,
                             keep_prob=1.,
                             wd=0.,
                             scope=None,
                             hn=None,
                             reuse=None):
    """Run a forward and a backward SRU pass and concatenate their outputs.

    Each direction gets its own linear projection of ``rep_tensor`` to
    ``3 * units`` features, which is concatenated to ``rep_tensor`` and fed
    to an ``SRUCell`` wrapped in ``SwitchableDropoutWrapper``.

    :param rep_tensor: input tensor; rank 3 (bs, sl, vec) per the original
        documentation
    :param rep_mask: mask of shape (bs, sl); its int32 sum over the last axis
        is passed to the RNN runners as the sequence length
    :param is_train: forwarded to ``bn_dense_layer`` and the dropout wrapper
    :param keep_prob: forwarded to ``bn_dense_layer`` and the dropout wrapper
    :param wd: forwarded to ``bn_dense_layer``
    :param scope: variable scope name; defaults to 'bi_sru_recurrent_network'
    :param hn: number of SRU units; falls back to the static size of the
        last axis of ``rep_tensor`` when falsy
    :param reuse: forwarded to ``SRUCell``
    :return: forward and backward outputs concatenated on the last axis
    """
    # Unused dynamic-shape lookups, kept so the graph matches the original.
    _dyn_batch = tf.shape(rep_tensor)[0]
    _dyn_len = tf.shape(rep_tensor)[1]
    _dyn_width = tf.shape(rep_tensor)[2]
    static_width = rep_tensor.get_shape().as_list()[2]
    units = hn or static_width

    def run_direction(direction_scope, dense_name, rnn_runner, rnn_scope):
        """Build one directional pass and return its output sequence."""
        with tf.variable_scope(direction_scope):
            projected = bn_dense_layer(
                [rep_tensor], 3 * units, False, 0., dense_name, 'linear',
                False, wd, keep_prob, is_train)  # bs, sl, 3*units
            cell_input = tf.concat([rep_tensor, projected], -1)
            cell = SwitchableDropoutWrapper(
                SRUCell(units, tf.nn.tanh, reuse), is_train, keep_prob)
            seq_lengths = tf.reduce_sum(tf.cast(rep_mask, tf.int32), -1)
            outputs, _ = rnn_runner(cell,
                                    cell_input,
                                    seq_lengths,
                                    dtype=tf.float32,
                                    scope=rnn_scope)
        return outputs

    with tf.variable_scope(scope or 'bi_sru_recurrent_network'):
        forward_out = run_direction(
            'forward', 'get_frc_fw', dynamic_rnn, 'forward_sru')
        backward_out = run_direction(
            'backward', 'get_frc_bw', bw_dynamic_rnn, 'backward_sru')
        return tf.concat([forward_out, backward_out], -1)
Exemplo n.º 3
0
def traditional_attention(rep_tensor,
                          rep_mask,
                          scope=None,
                          keep_prob=1.,
                          is_train=None,
                          wd=0.,
                          activation='elu',
                          tensor_dict=None,
                          name=None):
    """Summarise ``rep_tensor`` with one attention distribution (``softsel``).

    ``rep_tensor`` is projected by ``bn_dense_layer``, ``get_logits`` turns
    the projection into scores, and ``softsel`` applies those scores to the
    unprojected ``rep_tensor``.

    :param rep_tensor: input tensor; rank 3 (bs, sl, vec) per the shape
        comments below
    :param rep_mask: mask passed to ``get_logits`` and ``softsel``
    :param scope: variable scope name; defaults to 'traditional_attention'
    :param keep_prob: forwarded to the helper layers
    :param is_train: forwarded to the helper layers
    :param wd: forwarded to ``bn_dense_layer``
    :param activation: activation name for the projection
    :param tensor_dict: optional dict receiving ``softmax(logits)``
    :param name: key used in ``tensor_dict``; nothing is stored unless both
        ``tensor_dict`` and ``name`` are given
    :return: the result of ``softsel``
    """
    # Unused dynamic-shape lookups, kept so the graph matches the original.
    _dyn_batch = tf.shape(rep_tensor)[0]
    _dyn_len = tf.shape(rep_tensor)[1]
    _dyn_width = tf.shape(rep_tensor)[2]
    feature_dim = rep_tensor.get_shape()[2]
    record_attention = tensor_dict is not None and name is not None

    with tf.variable_scope(scope or 'traditional_attention'):
        projected = bn_dense_layer(
            rep_tensor, feature_dim, True, 0., 'bn_dense_map', activation,
            False, wd, keep_prob, is_train)

        scores = get_logits(
            [projected], None, False,
            scope='self_attn_logits',
            mask=rep_mask,
            input_keep_prob=keep_prob,
            is_train=is_train)  # bs,sl
        pooled = softsel(rep_tensor, scores, rep_mask)  # bs,vec

        # Expose the attention distribution to the caller on request.
        if record_attention:
            tensor_dict[name] = tf.nn.softmax(scores)

        return pooled
Exemplo n.º 4
0
def visit_multi_dimensional_attention(rep_tensor,
                                      keep_prob=1.,
                                      is_train=None,
                                      wd=0.,
                                      activation='relu'):
    """Pool ``rep_tensor`` over axis 1 with per-feature softmax weights.

    Two stacked ``bn_dense_layer`` projections score every element; the
    scores are normalised with a softmax along axis 1 and used for a weighted
    sum of ``rep_tensor`` over that axis. No mask is applied here, so every
    position along axis 1 takes part in the softmax.

    :param rep_tensor: input tensor; rank 3 (bs, sl, vec) per the shape
        comments below
    :param keep_prob: forwarded to ``bn_dense_layer``
    :param is_train: forwarded to ``bn_dense_layer``
    :param wd: forwarded to ``bn_dense_layer``
    :param activation: activation name for the first projection
    :return: the weighted sum over axis 1
    """
    feature_dim = rep_tensor.get_shape()[2]

    with tf.variable_scope('multi_dimensional_attention'):
        hidden = bn_dense_layer(
            rep_tensor, feature_dim, True, 0., 'bn_dense_map1', activation,
            False, wd, keep_prob, is_train)
        scores = bn_dense_layer(
            hidden, feature_dim, True, 0., 'bn_dense_map2', 'linear',
            False, wd, keep_prob, is_train)

        weights = tf.nn.softmax(scores, 1)  # bs,sl,vec
        return tf.reduce_sum(weights * rep_tensor, 1)  # bs,vec
Exemplo n.º 5
0
def first_level_sa(rep_tensor,
                   rep_mask,
                   keep_prob=1.,
                   is_train=None,
                   wd=0.,
                   activation='relu'):
    """Pool a rank-4 ``rep_tensor`` over axis 2 with per-feature attention.

    Two stacked ``bn_dense_layer`` projections score every element; the
    scores are masked with ``rep_mask``, normalised with a softmax along
    axis 2 and used for a weighted sum of ``rep_tensor`` over that axis.
    The static shapes of the input and of both projections are printed while
    the graph is being built.

    :param rep_tensor: input tensor; the feature size is read from axis 3
    :param rep_mask: mask handed to ``exp_mask_for_high_rank``
    :param keep_prob: forwarded to ``bn_dense_layer``
    :param is_train: forwarded to ``bn_dense_layer``
    :param wd: forwarded to ``bn_dense_layer``
    :param activation: activation name for the first projection
    :return: the weighted sum over axis 2
    """

    def report_shape(label, tensor):
        """Print the static shape of ``tensor`` behind ``label``."""
        print(label, tensor.get_shape())

    feature_dim = rep_tensor.get_shape()[3]

    with tf.variable_scope('first_level_sa'):
        report_shape('original: ', rep_tensor)
        hidden = bn_dense_layer(
            rep_tensor, feature_dim, True, 0., 'bn_dense_map1', activation,
            False, wd, keep_prob, is_train)
        report_shape('map1: ', hidden)
        scores = bn_dense_layer(
            hidden, feature_dim, True, 0., 'bn_dense_map2', 'linear',
            False, wd, keep_prob, is_train)
        report_shape('map2: ', scores)
        masked_scores = exp_mask_for_high_rank(scores, rep_mask)

        weights = tf.nn.softmax(masked_scores, 2)  # bs,sk,code_len,vec
        return tf.reduce_sum(weights * rep_tensor, 2)  # bs,sk,vec
Exemplo n.º 6
0
def normal_attention(rep_tensor,
                     rep_mask,
                     scope=None,
                     keep_prob=1.,
                     is_train=None,
                     wd=0.,
                     activation='elu',
                     tensor_dict=None,
                     name=None):
    """Attention pooling followed by a sigmoid fusion gate.

    ``rep_tensor`` is projected by ``bn_dense_layer``; ``get_logits`` and
    ``softsel`` turn it into an attention result, and a sigmoid gate then
    blends the projection with that attention result before the output is
    masked with ``rep_mask``.

    NOTE(review): the inline shape comments put the attention result at
    (bs, vec) while the projection is (bs, sl, vec); if that is accurate the
    gate arithmetic relies on broadcasting between those two ranks - confirm
    against ``softsel`` / ``linear``.

    :param rep_tensor: input tensor; rank 3 per the shape comments below
    :param rep_mask: mask passed to ``get_logits``, ``softsel`` and
        ``mask_for_high_rank``
    :param scope: variable scope name; defaults to 'normal_attention'
    :param keep_prob: forwarded to the helper layers
    :param is_train: forwarded to the helper layers
    :param wd: forwarded to the helper layers
    :param activation: activation name for the projection
    :param tensor_dict: optional dict receiving ``softmax(logits)``
    :param name: key used in ``tensor_dict``; nothing is stored unless both
        ``tensor_dict`` and ``name`` are given
    :return: the gated, masked output
    """
    # Unused dynamic-shape lookups, kept so the graph matches the original.
    _dyn_batch = tf.shape(rep_tensor)[0]
    _dyn_len = tf.shape(rep_tensor)[1]
    _dyn_width = tf.shape(rep_tensor)[2]
    feature_dim = rep_tensor.get_shape()[2]
    record_attention = tensor_dict is not None and name is not None

    with tf.variable_scope(scope or 'normal_attention'):
        projected = bn_dense_layer(
            rep_tensor, feature_dim, True, 0., 'bn_dense_map', activation,
            False, wd, keep_prob, is_train)

        scores = get_logits(
            [projected], None, False,
            scope='self_attn_logits',
            mask=rep_mask,
            input_keep_prob=keep_prob,
            is_train=is_train)  # bs,sl
        attended = softsel(rep_tensor, scores, rep_mask)  # bs,vec

        # Expose the attention distribution to the caller on request.
        if record_attention:
            tensor_dict[name] = tf.nn.softmax(scores)

        with tf.variable_scope('output'):
            gate_bias = tf.get_variable(
                'o_bias', [feature_dim], tf.float32,
                tf.constant_initializer(0.))
            # The gate is driven by both the projection and the attention.
            gate_from_input = linear(
                projected, feature_dim, True, 0., 'linear_fusion_i',
                False, wd, keep_prob, is_train)
            gate_from_attention = linear(
                attended, feature_dim, True, 0., 'linear_fusion_a',
                False, wd, keep_prob, is_train)
            gate = tf.nn.sigmoid(
                gate_from_input + gate_from_attention + gate_bias)
            blended = gate * projected + (1 - gate) * attended
            blended = mask_for_high_rank(blended, rep_mask)  # bs,sl,vec
        return blended
Exemplo n.º 7
0
    def __call__(self, inputs, state, scope=None):
        """Advance the SRU cell by one step.

        ``inputs`` is projected to three equal slices (candidate, forget
        gate, reset gate). The new cell state mixes ``state`` with the
        candidate through the forget gate; the output mixes the activated
        cell state with the raw ``inputs`` through the reset gate.

        :param inputs: step input, [bs, vec]
        :param state: previous cell state
        :param scope: variable scope name; defaults to "SRU_cell"
        :return: tuple ``(h_t, c_t)`` of step output and new cell state
        """
        units = self._num_units
        zero_init = tf.constant_initializer

        with tf.variable_scope(scope or "SRU_cell"):
            forget_bias = tf.get_variable(
                'b_f', [units], dtype=tf.float32, initializer=zero_init(0))
            reset_bias = tf.get_variable(
                'b_r', [units], dtype=tf.float32, initializer=zero_init(0))
            projected = bn_dense_layer(
                inputs, 3 * units, False, 0., 'get_frc', 'linear')  # bs, 3vec
            skip_input = tf.identity(inputs, 'x_t')

            candidate, forget_pre, reset_pre = tf.split(projected, 3, 1)
            forget_gate = tf.nn.sigmoid(forget_pre + forget_bias)
            reset_gate = tf.nn.sigmoid(reset_pre + reset_bias)

            new_state = forget_gate * state + (1 - forget_gate) * candidate
            output = (reset_gate * self._activation(new_state)
                      + (1 - reset_gate) * skip_input)
            return output, new_state
Exemplo n.º 8
0
def sentence_encoding_models(
        rep_tensor, rep_mask, method, activation_function,
        scope=None, wd=0., is_train=None, keep_prob=1., **kwargs):
    """Encode a token sequence into one vector using the chosen ``method``.

    * ``'cnn_kim'`` delegates entirely to ``cnn_for_sentence_encoding``.
    * ``'no_ct'`` applies one ``bn_dense_layer`` to ``2 * ivec`` features and
      pools it with ``multi_dimensional_attention``.
    * any other value is forwarded to ``context_fusion_layers`` and the
      result is pooled with ``multi_dimensional_attention``.

    :param rep_tensor: input tensor; the feature size is read from axis 2
    :param rep_mask: mask forwarded to the helper layers
    :param method: name of the encoding method (see above)
    :param activation_function: activation name forwarded to the helpers
    :param scope: variable scope name; defaults to 'sentence_encoding_models'
    :param wd: forwarded to the helper layers
    :param is_train: forwarded to the helper layers
    :param keep_prob: forwarded to the helper layers
    :param kwargs: extra options; ``hn`` (if present and truthy) overrides
        the feature size, and all of them are forwarded to
        ``context_fusion_layers``
    :return: the sentence encoding tensor
    :raises AssertionError: for ``'cnn_kim'`` when ``2 * ivec`` is not
        divisible by 3
    """
    # Informational only: this list is not used to validate ``method``.
    method_name_list = [
        'cnn_kim',
        'no_ct',
        'lstm', 'gru', 'sru', 'sru_normal',  # rnn
        'multi_cnn', 'hrchy_cnn',
        'multi_head', 'multi_head_git', 'disa',
        'block'
    ]

    hn = kwargs.get('hn')
    ivec = hn or rep_tensor.get_shape().as_list()[2]

    with tf.variable_scope(scope or 'sentence_encoding_models'):
        if method == 'cnn_kim':
            assert 2 * ivec % 3 == 0
            filters_per_width = 2 * ivec // 3
            return cnn_for_sentence_encoding(
                rep_tensor, rep_mask, (3, 4, 5), filters_per_width,
                'sent_encoding_cnn_kim', is_train, keep_prob, wd)

        if method == 'no_ct':
            context_rep = bn_dense_layer(
                rep_tensor, 2 * ivec, True, 0., 'no_ct',
                activation_function, False, wd, keep_prob, is_train)
        else:
            context_rep = context_fusion_layers(
                rep_tensor, rep_mask, method, activation_function,
                None, wd, is_train, keep_prob, **kwargs)

        return multi_dimensional_attention(
            context_rep, rep_mask, 'multi_dim_attn_for_%s' % method,
            keep_prob, is_train, wd, activation_function)
Exemplo n.º 9
0
def bi_sru_recurrent_network(rep_tensor,
                             rep_mask,
                             is_train=None,
                             keep_prob=1.,
                             wd=0.,
                             scope=None):
    """Run a forward and a backward SRU pass and concatenate their outputs.

    A single linear projection of ``rep_tensor`` to ``6 * ivec`` features is
    split in two halves along axis 2, one per direction. Each half is
    concatenated to ``rep_tensor`` and fed to an ``SRUCell`` wrapped in
    ``SwitchableDropoutWrapper``.

    :param rep_tensor: input tensor; the feature size is read from axis 2
    :param rep_mask: mask whose int32 sum over the last axis is passed to the
        RNN runners as the sequence length
    :param is_train: forwarded to ``bn_dense_layer`` and the dropout wrapper
    :param keep_prob: forwarded to ``bn_dense_layer`` and the dropout wrapper
    :param wd: forwarded to ``bn_dense_layer``
    :param scope: variable scope name; defaults to 'bi_sru_recurrent_network'
    :return: forward and backward outputs concatenated on the last axis
    """
    # Unused dynamic-shape lookups, kept so the graph matches the original.
    _dyn_batch = tf.shape(rep_tensor)[0]
    _dyn_len = tf.shape(rep_tensor)[1]
    _dyn_width = tf.shape(rep_tensor)[2]
    units = rep_tensor.get_shape().as_list()[2]

    with tf.variable_scope(scope or 'bi_sru_recurrent_network'):
        projected = bn_dense_layer(
            [rep_tensor], 6 * units, True, 0., 'get_frc', 'linear',
            False, wd, keep_prob, is_train)  # bs, sl, 6vec
        forward_half, backward_half = tf.split(projected, 2, 2)

        with tf.variable_scope('forward'):
            forward_input = tf.concat([rep_tensor, forward_half], -1)
            forward_cell = SwitchableDropoutWrapper(
                SRUCell(units, tf.nn.tanh), is_train, keep_prob)
            forward_lengths = tf.reduce_sum(tf.cast(rep_mask, tf.int32), -1)
            forward_out, _ = dynamic_rnn(
                forward_cell, forward_input, forward_lengths,
                dtype=tf.float32, scope='forward_sru')  # bs, sl, vec

        with tf.variable_scope('backward'):
            backward_input = tf.concat([rep_tensor, backward_half], -1)
            backward_cell = SwitchableDropoutWrapper(
                SRUCell(units, tf.nn.tanh), is_train, keep_prob)
            backward_lengths = tf.reduce_sum(tf.cast(rep_mask, tf.int32), -1)
            backward_out, _ = bw_dynamic_rnn(
                backward_cell, backward_input, backward_lengths,
                dtype=tf.float32, scope='backward_sru')  # bs, sl, vec

        return tf.concat([forward_out, backward_out], -1)  # bs, sl, 2vec
Exemplo n.º 10
0
    def build_loss_optimizer(self):
        """Build the loss, the Adam training op and the accuracy metric.

        The final dense layer now produces *linear* logits.
        ``tf.nn.sigmoid_cross_entropy_with_logits`` applies the sigmoid
        itself, so feeding it already-squashed values (as the previous
        ``'sigmoid'`` activation did) applied the sigmoid twice and kept the
        loss from ever approaching zero. Probabilities are derived separately
        for the predictions and the returned tensors.

        :return: tuple ``(loss, optimizer, accuracy, probabilities,
            predictions)``. ``probabilities`` are the sigmoid outputs (the
            same quantity callers received in this slot before) and
            ``predictions`` are those probabilities rounded to 0/1.
        """
        with tf.name_scope('loss_optimization'):
            labels = tf.cast(self.labels, tf.float32)

            # Unscaled scores: required by the cross-entropy op below.
            raw_logits = bn_dense_layer(self.output, 1, True, 0.,
                                        'bn_dense_map', 'linear',
                                        False, wd=0., keep_prob=1.,
                                        is_train=True)
            probabilities = tf.nn.sigmoid(raw_logits)

            losses = tf.nn.sigmoid_cross_entropy_with_logits(
                logits=raw_logits, labels=labels)
            tf.add_to_collection(
                'losses', tf.reduce_mean(losses, name='loss_mean'))
            loss = tf.add_n(
                tf.get_collection('losses', self.scope), name='loss')
            tf.summary.scalar(loss.op.name, loss)
            tf.add_to_collection('ema/scalar', loss)

            optimizer = tf.train.AdamOptimizer(
                learning_rate=0.0001).minimize(loss)

            predictions = tf.round(probabilities)
            with tf.name_scope('correct_prediction'):
                correct_prediction = tf.equal(predictions, labels)
            with tf.name_scope('accuracy'):
                # Mean accuracy over all labels:
                # http://stackoverflow.com/questions/37746670/tensorflow-multi-label-accuracy-calculation
                accuracy = tf.reduce_mean(
                    tf.cast(correct_prediction, tf.float32))

            return loss, optimizer, accuracy, probabilities, predictions
Exemplo n.º 11
0
    def build_network(self):
        """Assemble the question/context network and return sentence logits.

        Stages, each in its own variable scope:

        * ``emb``: build the token embedding matrix and look up context and
          question tokens.
        * ``prepro``: pool the question with ``multi_dimensional_attention``
          and project it to ``hn`` features.
        * ``sent_emb``: flatten the context to one sentence per row, encode
          each sentence with ``sentence_encoding_models`` and project it.
        * ``fusion``: tile the question vector over the sentences and combine
          it with the sentence encodings (concat, difference, product).
        * ``output``: run ``context_fusion_layers`` over the sentences and
          project; ``get_logits`` then scores every sentence.

        :return: the logits tensor produced by ``get_logits``
        """
        vocab_size, emb_dim, hidden = self.tds, self.tel, self.hn
        batch, sent_num, sent_len, _question_len = (
            self.bs, self.sn, self.sl, self.ql)

        with tf.variable_scope('emb'):
            embedding_matrix = generate_embedding_mat(
                vocab_size,
                emb_dim,
                init_mat=self.token_emb_mat,
                extra_mat=self.glove_emb_mat,
                scope='gene_token_emb_mat')
            context_emb = tf.nn.embedding_lookup(
                embedding_matrix, self.context_token)  # bs,sn,sl,tel
            question_emb = tf.nn.embedding_lookup(
                embedding_matrix, self.question_token)  # bs,ql,tel

        with tf.variable_scope('prepro'):
            question_vec = multi_dimensional_attention(
                question_emb, self.question_token_mask, 'q2coding',
                cfg.dropout, self.is_train, cfg.wd, 'relu')
            question_proj = bn_dense_layer(
                question_vec, hidden, True, 0., 'q_rep_map', 'relu', False,
                cfg.wd, cfg.dropout, self.is_train)  # bs, hn

        with tf.variable_scope('sent_emb'):
            flat_context = tf.reshape(
                context_emb, [batch * sent_num, sent_len, emb_dim],
                'c_emb_rshp')  # bs*sn,sl,tel
            flat_mask = tf.reshape(
                self.context_token_mask, [batch * sent_num, sent_len],
                'c_mask_rshp')  # bs*sn,sl
            flat_sentence_enc = sentence_encoding_models(
                flat_context,
                flat_mask,
                cfg.context_fusion_method,
                'relu',
                'sent2enc',
                cfg.wd,
                self.is_train,
                cfg.dropout,
                hidden,
                block_len=cfg.block_len)  # bs*sn, 2*hn
            sentence_enc = tf.reshape(
                flat_sentence_enc,
                [batch, sent_num, 2 * hidden])  # bs,sn,2*hn
            sentence_proj = bn_dense_layer(
                sentence_enc, hidden, True, 0., 'sent_enc_map', 'relu',
                False, cfg.wd, cfg.dropout, self.is_train)

        with tf.variable_scope('fusion'):
            question_tiled = tf.tile(
                tf.expand_dims(question_proj, 1),
                [1, sent_num, 1])  # bs, sn, hn
            fused = tf.concat([
                sentence_proj,
                question_tiled,
                sentence_proj - question_tiled,
                sentence_proj * question_tiled,
            ], -1)  # bs,sn,4hn

        with tf.variable_scope('output'):
            fused_context = context_fusion_layers(
                fused,
                self.context_sent_mask,
                cfg.context_fusion_method,
                'relu',
                'out_cf',
                cfg.wd,
                self.is_train,
                cfg.dropout,
                hidden,
                block_len=4)
            pre_output = bn_dense_layer(
                fused_context, hidden, True, 0., 'pre_output', 'relu',
                False, cfg.wd, cfg.dropout, self.is_train)

        # Scores one logit per sentence; the sentence mask is forwarded.
        return get_logits(
            pre_output, None, True, 0., 'logits', self.context_sent_mask,
            cfg.wd, cfg.dropout, self.is_train, 'linear')
Exemplo n.º 12
0
def directional_attention_with_dense(rep_tensor,
                                     rep_mask,
                                     direction=None,
                                     scope=None,
                                     keep_prob=1.,
                                     is_train=None,
                                     wd=0.,
                                     activation='elu',
                                     tensor_dict=None,
                                     name=None,
                                     hn=None):
    """Token-to-token attention under a direction mask, plus a fusion gate.

    The input is projected by ``bn_dense_layer``; pairwise logits are built
    from a "dependent" and a "head" linear map of the (dropped-out)
    projection, squashed by a scaled tanh, masked, and normalised with a
    softmax over axis 2. The attended result is blended with the projection
    through a sigmoid gate and finally masked with ``rep_mask``.

    Note on the scope name: ``%`` binds tighter than ``or``, so the
    ``'diag'`` fallback is never reached; with ``direction=None`` and no
    ``scope`` the variable scope is ``'directional_attention_None'``.

    :param rep_tensor: input tensor; rank 3 (bs, sl, vec) per the shape
        comments below
    :param rep_mask: boolean mask, tiled to (bs, sl, sl) and combined with
        the direction mask
    :param direction: ``None`` masks only the diagonal; ``'forward'`` keeps
        entries where ``sl_row > sl_col``; any other value keeps entries
        where ``sl_col > sl_row``
    :param scope: variable scope name (see note above for the default)
    :param keep_prob: forwarded to the helper layers and ``dropout``
    :param is_train: forwarded to the helper layers and ``dropout``
    :param wd: forwarded to the helper layers
    :param activation: activation name for the projection
    :param tensor_dict: optional dict receiving intermediate tensors
    :param name: key prefix used in ``tensor_dict``; nothing is stored unless
        both ``tensor_dict`` and ``name`` are given
    :param hn: output feature size; falls back to the static size of axis 2
        of ``rep_tensor`` when falsy
    :return: the gated, masked output
    """
    def scaled_tanh(x, scale=5.):
        # Bounds the logits to (-scale, scale).
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    ivec = hn or rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'directional_attention_%s' % direction
                           or 'diag'):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            # ones everywhere except the diagonal: a token never attends to
            # itself
            direct_mask = tf.cast(
                tf.diag(-tf.ones([sl], tf.int32)) + 1, tf.bool)
        else:
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)
            else:
                direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0),
                                   [bs, 1, 1])  # bs,sl,sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1),
                                [1, sl, 1])  # bs,sl,sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs,sl,sl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1),
                               [1, sl, 1, 1])  # bs,sl,sl,vec
        # only the logits use the dropped-out projection; the values that get
        # summed (rep_map_tile) come from the un-dropped rep_map
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent = linear(rep_map_dp,
                               ivec,
                               False,
                               scope='linear_dependent')  # bs,sl,vec
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,vec
            head = linear(rep_map_dp, ivec, False,
                          scope='linear_head')  # bs,sl,vec
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,vec

            # broadcasting the two expanded maps yields one logit per
            # (head position, dependent position, feature)
            logits = scaled_tanh(dependent_etd + head_etd + f_bias,
                                 5.0)  # bs,sl,sl,vec

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            # masked again after the softmax
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile,
                                        2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i', False, wd,
                       keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False,
                       wd, keep_prob, is_train) + o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result
            output = mask_for_high_rank(output, rep_mask)

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score
            tensor_dict[name + '_gate'] = fusion_gate
        return output
Exemplo n.º 13
0
def masked_positional_self_attention(sigma, rep_tensor, rep_mask, direction=None, scope=None,
                                     keep_prob=1., is_train=None, wd=0., activation='elu',
                                     tensor_dict=None, name=None):
    """Scalar-score self-attention with a positional (window / direction) mask.

    The input is projected by ``bn_dense_layer``; a single logit per token
    pair is built from a "dependent" and a "head" linear map, squashed by a
    scaled tanh and masked. When ``direction`` is given, a negative
    log-distance term is additionally applied through
    ``dis_mask_for_high_rank``. The softmax over axis 2 is tiled across the
    feature axis and used to sum the projected tokens.

    Note on the scope name: ``%`` binds tighter than ``or``, so the
    ``'diag'`` fallback is never reached; the default scope is
    ``'directional_attention_<direction>'``.

    :param sigma: window half-width used only when ``direction`` is None:
        pairs are kept when ``|row - col| < sigma`` and ``row != col``
    :param rep_tensor: input tensor; rank 3 (bs, sl, vec) per the shape
        comments below
    :param rep_mask: boolean mask, tiled to (bs, sl, sl)
    :param direction: ``None`` for the symmetric window; ``'forward'`` keeps
        entries where ``sl_row > sl_col``; any other value keeps entries
        where ``sl_col > sl_row``
    :param scope: variable scope name (see note above for the default)
    :param keep_prob: forwarded to ``bn_dense_layer`` and ``dropout``
    :param is_train: forwarded to ``bn_dense_layer`` and ``dropout``
    :param wd: forwarded to ``bn_dense_layer``
    :param activation: activation name for the projection
    :param tensor_dict: optional dict receiving intermediate tensors
    :param name: key prefix used in ``tensor_dict``; nothing is stored unless
        both ``tensor_dict`` and ``name`` are given
    :return: the attended tensor (no gate is applied)
    """
    def scaled_tanh(x, scale=5.):
        # Bounds the logits to (-scale, scale).
        return scale * tf.nn.tanh(1./scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'directional_attention_%s' % direction or 'diag'):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            # symmetric band: row - col < sigma and col - row < sigma ...
            direct_mask0 = tf.greater(sl_row + sigma, sl_col)
            direct_mask1 = tf.greater(sl_col + sigma, sl_row)
            # ... with the diagonal removed
            direct_mask2 = tf.cast(1 - tf.diag(tf.ones([sl], tf.int32)), tf.bool)
            direct_mask = tf.logical_and(tf.logical_and(direct_mask0, direct_mask1), direct_mask2)
        else:
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)
            else:
                direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0), [bs, 1, 1])  # bs,sl,sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1), [1, sl, 1])  # bs,sl,sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs,sl,sl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1), [1, sl, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,1
            f_bias = tf.get_variable('f_bias', [1], tf.float32, tf.constant_initializer(0.))
            dependent = linear(rep_map_dp, 1, False, scope='linear_dependent')  # bs,sl,1
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,1
            head = linear(rep_map_dp, 1, False, scope='linear_head') # bs,sl,1
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,1

            logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,sl,sl,1

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            if direction is not None:
                # -log(|col - row|), with the diagonal bumped to 1 so that
                # log(1) = 0 there instead of log(0)
                dis_mask = -tf.log(tf.cast(tf.abs(sl_col - sl_row) + 
                                           tf.diag(tf.ones([sl], tf.int32)), tf.float32))
                logits_masked = dis_mask_for_high_rank(logits_masked, dis_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,1 (one score per pair)
            attn_score = mask_for_high_rank(attn_score, attn_mask)
            # drop the singleton axis and repeat the score across all features
            attn_score = tf.tile(tf.expand_dims(tf.reshape(attn_score, [bs, sl, sl]), 3), [1, 1, 1, ivec])

            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,sl,vec

        with tf.variable_scope('output'):
            # no fusion gate here: the attention result is returned as-is
            output = attn_result

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score
        return output
Exemplo n.º 14
0
def simple_block_attention(rep_tensor,
                           rep_mask,
                           block_len=5,
                           scope=None,
                           direction=None,
                           keep_prob=1.,
                           is_train=None,
                           wd=0.,
                           activation='elu',
                           hn=None):
    """Block-wise directional self-attention over a padded sequence.

    Steps, each in its own variable scope:

    1. ``split_seq``: zero-pad the sequence to a multiple of ``block_len``,
       reshape it into blocks and project it with ``bn_dense_layer``.
    2. ``self_attention``: directional token-to-token attention inside every
       block.
    3. ``source2token_self_attn``: pool every block to one vector with a
       masked softmax over the block positions.
    4. ``self_attn_inter_block``: run ``directional_attention_with_dense``
       across the block vectors and tile the result back over the block
       positions.
    5. ``combination``: gate a new context (built from the projection, the
       intra-block result and the inter-block result) against the projection.
    6. ``restore_original_length``: flatten the blocks and cut the padding.

    :param rep_tensor: input tensor; rank 3 (bs, sl, vec) per the shape
        comments below
    :param rep_mask: boolean mask of shape (bs, sl); padded with False
    :param block_len: number of positions per block
    :param scope: variable scope name; defaults to 'block_simple'
    :param direction: required; ``'forward'`` keeps pairs where
        ``sl_row > sl_col``, any other value keeps ``sl_col > sl_row``
    :param keep_prob: forwarded to the helper layers
    :param is_train: forwarded to the helper layers
    :param wd: forwarded to the helper layers
    :param activation: 'relu', 'elu' or 'linear'
    :param hn: output feature size; falls back to the static size of axis 2
        of ``rep_tensor`` when falsy
    :return: tensor sliced back to the original sequence length
    :raises AssertionError: if ``direction`` is None
    :raises RuntimeError: if ``activation`` is not one of the three names
        handled in the combination step
    """
    assert direction is not None

    def scaled_tanh(x, scale=5.):
        # Bounds the logits to (-scale, scale).
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    org_ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or org_ivec
    with tf.variable_scope(scope or 'block_simple'):
        # @1. split sequence
        with tf.variable_scope('split_seq'):
            # number of blocks = ceil(sl / block_len)
            block_num = tf.cast(
                tf.ceil(
                    tf.divide(tf.cast(sl, tf.float32),
                              tf.cast(block_len, tf.float32))), tf.int32)
            # how many padding positions are needed to fill the last block
            comp_len = block_num * block_len - sl

            rep_tensor_comp = tf.concat(
                [rep_tensor,
                 tf.zeros([bs, comp_len, org_ivec], tf.float32)], 1)
            # padding positions are marked False in the mask
            rep_mask_comp = tf.concat([
                rep_mask,
                tf.cast(tf.zeros([bs, comp_len], tf.int32), tf.bool)
            ], 1)

            rep_tensor_split = tf.reshape(
                rep_tensor_comp,
                [bs, block_num, block_len, org_ivec])  # bs,bn,bl,d
            rep_mask_split = tf.reshape(rep_mask_comp,
                                        [bs, block_num, block_len])  # bs,bn,bl

            # non-linear
            rep_map = bn_dense_layer(rep_tensor_split, ivec, True, 0.,
                                     'bn_dense_map', activation, False, wd,
                                     keep_prob, is_train)  # bs,bn,bl,vec
            rep_map_tile = tf.tile(tf.expand_dims(rep_map, 2),
                                   [1, 1, block_len, 1, 1])  # bs,bn,bl,bl,vec
            # rep_map_dp = dropout(rep_map, keep_prob, is_train)
            bn = block_num
            bl = block_len

        with tf.variable_scope('self_attention'):
            # @2.self-attention in block
            # mask generation
            sl_indices = tf.range(block_len, dtype=tf.int32)
            sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)  # bl,bl
            else:
                direct_mask = tf.greater(sl_col, sl_row)  # bl,bl
            direct_mask_tile = tf.tile(
                tf.expand_dims(tf.expand_dims(direct_mask, 0), 0),
                [bs, bn, 1, 1])  # bs,bn,bl,bl
            # a pair is valid only when both of its positions are unmasked
            rep_mask_tile_1 = tf.tile(tf.expand_dims(rep_mask_split, 2),
                                      [1, 1, bl, 1])  # bs,bn,bl,bl
            rep_mask_tile_2 = tf.tile(tf.expand_dims(rep_mask_split, 3),
                                      [1, 1, 1, bl])  # bs,bn,bl,bl
            rep_mask_tile = tf.logical_and(rep_mask_tile_1, rep_mask_tile_2)
            attn_mask = tf.logical_and(direct_mask_tile,
                                       rep_mask_tile,
                                       name='attn_mask')  # bs,bn,bl,bl

            # attention
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # one linear map produces both halves, split on the last axis
            dependent_head = linear(rep_map, 2 * ivec, False, 0.,
                                    'linear_dependent_head', False, wd,
                                    keep_prob, is_train)  # bs,bn,bl,2vec
            dependent, head = tf.split(dependent_head, 2, 3)
            dependent_etd = tf.expand_dims(dependent, 2)  # bs,bn,1,bl,vec
            head_etd = tf.expand_dims(head, 3)  # bs,bn,bl,1,vec
            logits = scaled_tanh(dependent_etd + head_etd + f_bias,
                                 5.0)  # bs,bn,bl,bl,vec
            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 3)  # bs,bn,bl,bl,vec
            attn_score = mask_for_high_rank(attn_score,
                                            attn_mask)  # bs,bn,bl,bl,vec
            self_attn_result = tf.reduce_sum(attn_score * rep_map_tile,
                                             3)  # bs,bn,bl,vec

        with tf.variable_scope('source2token_self_attn'):
            inter_block_logits = bn_dense_layer(self_attn_result, ivec, True,
                                                0., 'bn_dense_map', 'linear',
                                                False, wd, keep_prob,
                                                is_train)  # bs,bn,bl,vec
            inter_block_logits_masked = exp_mask_for_high_rank(
                inter_block_logits, rep_mask_split)  # bs,bn,bl,vec
            # softmax over the positions inside each block
            inter_block_soft = tf.nn.softmax(inter_block_logits_masked,
                                             2)  # bs,bn,bl,vec
            inter_block_attn_output = tf.reduce_sum(
                self_attn_result * inter_block_soft, 2)  # bs,bn,vec

        with tf.variable_scope('self_attn_inter_block'):
            # every block is treated as valid at the block level
            inter_block_attn_output_mask = tf.cast(tf.ones([bs, bn], tf.int32),
                                                   tf.bool)
            block_ct_res = directional_attention_with_dense(
                inter_block_attn_output, inter_block_attn_output_mask,
                direction, 'disa', keep_prob, is_train, wd,
                activation)  # [bs,bn,vec]

            block_ct_res_tile = tf.tile(tf.expand_dims(
                block_ct_res, 2), [1, 1, bl, 1])  #[bs,bn,vec]->[bs,bn,bl,vec]

        with tf.variable_scope('combination'):
            # input:1.rep_map[bs,bn,bl,vec]; 2.self_attn_result[bs,bn,bl,vec]; 3.rnn_res_tile[bs,bn,bl,vec]
            rep_tensor_with_ct = tf.concat(
                [rep_map, self_attn_result, block_ct_res_tile],
                -1)  # [bs,bn,bl,3vec]
            new_context_and_gate = linear(rep_tensor_with_ct, 2 * ivec, True,
                                          0., 'linear_new_context_and_gate',
                                          False, wd, keep_prob,
                                          is_train)  # [bs,bn,bl,2vec]
            new_context, gate = tf.split(new_context_and_gate, 2,
                                         3)  # bs,bn,bl,vec
            if activation == "relu":
                new_context_act = tf.nn.relu(new_context)
            elif activation == "elu":
                new_context_act = tf.nn.elu(new_context)
            elif activation == "linear":
                new_context_act = tf.identity(new_context)
            else:
                raise RuntimeError
            gate_sig = tf.nn.sigmoid(gate)
            combination_res = gate_sig * new_context_act + (
                1 - gate_sig) * rep_map  # bs,bn,bl,vec

        with tf.variable_scope('restore_original_length'):
            combination_res_reshape = tf.reshape(
                combination_res, [bs, bn * bl, ivec])  # bs,bn*bl,vec
            # drop the padding positions appended in the split step
            output = combination_res_reshape[:, :sl, :]
            return output
Exemplo n.º 15
0
def gated_self_attention(rep_tensor,
                         rep_mask,
                         scope=None,
                         keep_prob=1.,
                         is_train=None,
                         wd=0.,
                         activation='elu',
                         hn=None,
                         position_mask_type=None):
    """Self-attention over a sequence, fused with the input through a sigmoid gate.

    Each position accumulates a feature-wise weighted average of the mapped
    representations of the positions it is allowed to see; a gate then mixes
    that context with the position's own mapped representation.

    :param rep_tensor: input tensor, indexed as rank 3 here
        (bs, sl, vec according to the inline shape comments)
    :param rep_mask: boolean mask of valid positions
        (bs, sl according to the inline shape comments)
    :param scope: variable scope name; defaults to
        'gated_self_attention_<position_mask_type or None>'
    :param keep_prob: forwarded to bn_dense_layer
    :param is_train: forwarded to bn_dense_layer
    :param wd: forwarded to bn_dense_layer
    :param activation: 'activation' argument forwarded to bn_dense_layer
    :param hn: feature size of the layer; when falsy, the static last
        dimension of rep_tensor is used instead
    :param position_mask_type: 'forward' lets a position see only strictly
        earlier positions, 'backward' only strictly later ones; any other
        value lets it see every valid position
    :return: the gated output after mask_for_high_rank(output, rep_mask)
    """
    bs, sl = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1]
    ivec = hn or rep_tensor.get_shape().as_list()[2]
    with tf.variable_scope(scope or 'gated_self_attention_%s' %
                           (position_mask_type or 'None')):
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)

        # mask generation: pairs (i, j) where both positions are valid
        rep_mask_epd1 = tf.expand_dims(rep_mask, 1)  # bs,1,sl
        rep_mask_epd2 = tf.expand_dims(rep_mask, 2)  # bs,sl,1
        rep_mask_mat = tf.logical_and(rep_mask_epd1, rep_mask_epd2)  # bs,sl,sl

        if position_mask_type in ['forward', 'backward']:
            sl_indices = tf.range(sl, dtype=tf.int32)
            # default 'xy' indexing: sl_col[i, j] == j and sl_row[i, j] == i
            sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
            if position_mask_type == 'forward':
                position_mask = tf.greater(sl_row, sl_col)
            else:
                position_mask = tf.greater(sl_col, sl_row)
            position_mask = tf.tile(tf.expand_dims(position_mask, 0),
                                    [bs, 1, 1])
            position_mask = tf.logical_and(rep_mask_mat, position_mask)

        else:
            position_mask = rep_mask_mat

        position_mask_ft = tf.cast(position_mask, tf.float32)

        # attention
        with tf.variable_scope('intra_sent_attn'):  # bs,sl,hn
            rep_tensor_for_attn = rep_map

            pre_align_score = bn_dense_layer(  # bs,sl,hn
                rep_tensor_for_attn, ivec, True, 0., 'intra_sent_map1',
                activation, False, wd, keep_prob, is_train)
            align_score = bn_dense_layer(  # bs,sl,hn
                pre_align_score, ivec, True, 0., 'intra_sent_map2', 'linear',
                False, wd, keep_prob, is_train)
            align_score_w_mask = exp_mask_for_high_rank(align_score,
                                                        rep_mask)  # bs,sl,hn
            exp_align_score = tf.exp(align_score_w_mask)  # bs,sl,hn

            # The matmul with the 0/1 position mask sums, for every target
            # position, the exponentiated scores of the positions it may see.
            accum_z_deno = tf.matmul(position_mask_ft, exp_align_score)
            # A position that sees nothing has a zero denominator; use 1 there
            # so the division below yields 0 instead of NaN.
            accum_z_deno = tf.where(
                tf.greater(accum_z_deno, tf.zeros_like(accum_z_deno)),
                accum_z_deno, tf.ones_like(accum_z_deno))

            rep_mul_score = rep_map * exp_align_score
            accum_rep_mul_score = tf.matmul(position_mask_ft, rep_mul_score)

            attn_res = accum_rep_mul_score / accum_z_deno

        with tf.variable_scope('context_fusion_gate'):
            # Sized with ivec (hn including its fallback) rather than hn:
            # hn defaults to None, which is not a usable layer width.
            fusion_gate = tf.nn.sigmoid(
                bn_dense_layer([rep_map, attn_res], ivec, True, 0.,
                               'linear_fusion_gate', activation, False, wd,
                               keep_prob, is_train))
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_res

        output = mask_for_high_rank(output, rep_mask)
        return output
Exemplo n.º 16
0
    def build_network(self):
        """Build the sentence-pair classification graph.

        Both sentences are embedded with a shared matrix, encoded with
        forward/backward directional attention, pooled with
        multi-dimensional attention and combined into class logits.
        The second sentence reuses the variables created for the first.

        Returns:
            ``logits, (s1_act, s1_logpa), (s2_act, s2_logpa),
            (s1_percentage, s2_percentage)``. In this variant the "actions"
            are the token masks themselves, the "log-probabilities" are their
            float casts and the percentages are all ones.
        """
        _logger.add()
        _logger.add('building %s neural network structure...' % cfg.network_type)

        # Same attribute reads as before; only tds, tel, hn and bs are used below.
        tds, cds = self.tds, self.cds
        tl = self.tl
        tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
        hn = self.hn
        bs, sl1, sl2 = self.bs, self.sl1, self.sl2

        def bidirectional_context(token_emb, token_mask, fw_name, bw_name):
            # Forward then backward directional attention, concatenated on the
            # feature axis.
            forward_rep = directional_attention_with_dense(
                token_emb, token_mask, 'forward', 'dir_attn_fw',
                cfg.dropout, self.is_train, cfg.wd,
                tensor_dict=self.tensor_dict, name=fw_name)
            backward_rep = directional_attention_with_dense(
                token_emb, token_mask, 'backward', 'dir_attn_bw',
                cfg.dropout, self.is_train, cfg.wd,
                tensor_dict=self.tensor_dict, name=bw_name)
            return tf.concat([forward_rep, backward_rep], -1)

        def sentence_vector(seq_rep, token_mask, attn_name):
            # Pool a token sequence into one vector per sentence.
            return multi_dimensional_attention(
                seq_rep, token_mask, 'multi_dimensional_attention',
                cfg.dropout, self.is_train, cfg.wd,
                tensor_dict=self.tensor_dict, name=attn_name)

        with tf.variable_scope('emb'):
            embedding_table = generate_embedding_mat(
                tds, tel,
                init_mat=self.token_emb_mat,
                extra_mat=self.glove_emb_mat,
                extra_trainable=self.finetune_emb,
                scope='gene_token_emb_mat')
            sent1_embedded = tf.nn.embedding_lookup(
                embedding_table, self.sent1_token)  # bs,sl1,tel
            sent2_embedded = tf.nn.embedding_lookup(
                embedding_table, self.sent2_token)  # bs,sl2,tel
            self.tensor_dict['s1_emb'] = sent1_embedded
            self.tensor_dict['s2_emb'] = sent2_embedded

        with tf.variable_scope('hard_network'):
            # Nothing is sampled here: the token masks stand in for the
            # selection outputs so the return signature stays the same.
            s1_act = self.sent1_token_mask
            s1_logpa = tf.cast(s1_act, tf.float32)

            s2_act = self.sent2_token_mask
            s2_logpa = tf.cast(s2_act, tf.float32)

            s1_percentage = tf.ones([bs], tf.float32)
            s2_percentage = tf.ones([bs], tf.float32)

        with tf.variable_scope('ct_attn'):
            sent1_context = bidirectional_context(
                sent1_embedded, self.sent1_token_mask,
                's1_fw_attn', 's1_bw_attn')

            # Sentence 2 shares the attention weights of sentence 1.
            tf.get_variable_scope().reuse_variables()

            sent2_context = bidirectional_context(
                sent2_embedded, self.sent2_token_mask,
                's2_fw_attn', 's2_bw_attn')

        with tf.variable_scope('sentence_enc'):
            sent1_vec = sentence_vector(
                sent1_context, self.sent1_token_mask, 's1_attn')
            tf.get_variable_scope().reuse_variables()
            sent2_vec = sentence_vector(
                sent2_context, self.sent2_token_mask, 's2_attn')

        with tf.variable_scope('output'):
            difference = sent1_vec - sent2_vec
            product = sent1_vec * sent2_vec
            pair_features = tf.concat(
                [sent1_vec, sent2_vec, difference, product], -1)
            pair_hidden = bn_dense_layer(
                pair_features, hn, True, 0., 'out_rep_map', 'elu', False,
                cfg.wd, cfg.dropout, self.is_train)
            pair_highway = highway_network(
                pair_hidden, hn, True, 0., 'pre_output1', 'elu', False,
                cfg.wd, cfg.dropout, self.is_train)
            logits = linear(
                [pair_highway], self.output_class, True, 0.,
                scope='logits', squeeze=False, wd=cfg.wd,
                input_keep_prob=cfg.dropout, is_train=self.is_train)

        selection_s1 = (s1_act, s1_logpa)
        selection_s2 = (s2_act, s2_logpa)
        percentages = (s1_percentage, s2_percentage)
        return logits, selection_s1, selection_s2, percentages
Exemplo n.º 17
0
def directional_attention_with_selections(
        rep_tensor, rep_mask, dep_selection, head_selection, direction=None, hn=None, keep_unselected=True,
        scope=None, keep_prob=1., is_train=None, wd=0., activation='elu'):
    """Directional self-attention restricted to selected head/dependent tokens.

    The input is mapped by a dense layer, attention is computed only for the
    positions picked by ``head_selection`` (attending over the positions picked
    by ``dep_selection``), and the result is merged with the mapped input
    through a sigmoid fusion gate.

    :param rep_tensor: input tensor, indexed as rank 3 here
    :param rep_mask: boolean mask of valid positions; it is AND-ed with both
        selections so that invalid positions are never selected
    :param dep_selection: boolean selection of dependent positions
    :param head_selection: boolean selection of head positions
    :param direction: forwarded to self_attention_for_selected_head and used
        in the default scope name
    :param hn: feature size of the layer; when falsy, the static last
        dimension of rep_tensor is used instead
    :param keep_unselected: if True, unselected valid positions receive the
        result of mean_pooling_for_unselected_head and the output keeps the
        original sequence length; if False, only the selected head positions
        are returned
    :param scope: variable scope name
    :param keep_prob: forwarded to the dense/linear helpers
    :param is_train: forwarded to the dense/linear helpers
    :param wd: forwarded to the dense/linear helpers
    :param activation: 'activation' argument forwarded to bn_dense_layer
    :return: ``(output, output_mask)``; the mask is ``rep_mask`` when
        ``keep_unselected`` is True, otherwise the mask of the reduced head
        sequence
    """
    bs, sl = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1]
    org_ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or org_ivec

    # NOTE: `%` binds tighter than `or`, so the trailing `or 'diag'` never takes
    # effect and a missing direction yields 'directional_attention_None'.
    # Left unchanged on purpose: altering it would rename the variables.
    with tf.variable_scope(scope or 'directional_attention_%s' % direction or 'diag'):
        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)
        # ensure the selection is right (never select a masked-out position)
        dep_selection = tf.logical_and(rep_mask, dep_selection)
        head_selection = tf.logical_and(rep_mask, head_selection)
        rep_dep_tensor, rep_dep_mask, dep_org_idx = reduce_data_rep_max_len(rep_map, dep_selection)
        rep_head_tensor, rep_head_mask, head_org_idx = reduce_data_rep_max_len(rep_map, head_selection)
        sl_dep, sl_head = tf.shape(rep_dep_tensor)[1], tf.shape(rep_head_tensor)[1]

        if keep_unselected:
            unhead_selection = tf.logical_and(rep_mask, tf.logical_not(head_selection))
            rep_unhead_tensor, rep_unhead_mask, unhead_org_idx = reduce_data_rep_max_len(rep_map, unhead_selection)
            sl_unhead = tf.shape(rep_unhead_tensor)[1]

        # The placeholder shapes below use ivec, not hn: hn defaults to None,
        # which is not a valid dimension, while ivec always carries the real
        # feature size of rep_map.
        attn_result = tf.cond(
            tf.equal(sl_head, 0),
            lambda: tf.zeros([bs, 0, ivec], tf.float32),
            lambda: self_attention_for_selected_head(
                head_selection, head_org_idx, sl_head, rep_head_mask,
                dep_selection, dep_org_idx, sl_dep, rep_dep_mask,
                rep_map, rep_dep_tensor, keep_prob, is_train, direction, ivec
            )
        )

        if keep_unselected:
            input_idx = tf.tile(tf.expand_dims(tf.range(sl), 0), [bs, 1])
            pooling_result = tf.cond(
                tf.equal(sl_unhead, 0),
                lambda: tf.zeros([bs, 0, ivec], tf.float32),
                lambda: mean_pooling_for_unselected_head(
                    unhead_org_idx, sl_unhead, rep_unhead_mask,
                    input_idx, sl, rep_mask, rep_map, None)  # todo: point !
            )

        with tf.variable_scope('output'):
            if keep_unselected:
                # Scatter both partial results back to their original
                # positions. The target has sl+1 slots and the last one is
                # dropped afterwards -- presumably padded entries of the
                # *_org_idx tensors point at that extra slot; confirm against
                # reduce_data_rep_max_len.
                range_head = tf.tile(tf.expand_dims(tf.range(bs), -1), [1, sl_head])
                scatter_attn = tf.cond(
                    tf.equal(sl_head, 0),
                    lambda: tf.zeros([bs, sl+1, ivec], tf.float32),
                    lambda: tf.scatter_nd(
                        tf.stack([range_head, head_org_idx], -1), attn_result, [bs, sl+1, ivec])
                )

                range_unhead = tf.tile(tf.expand_dims(tf.range(bs), -1), [1, sl_unhead])
                scatter_pooling = tf.cond(
                    tf.equal(sl_unhead, 0),
                    lambda: tf.zeros([bs, sl+1, ivec], tf.float32),
                    lambda: tf.scatter_nd(
                        tf.stack([range_unhead, unhead_org_idx], -1), pooling_result, [bs, sl+1, ivec])
                )

                self_attn_input = rep_map
                context_features = tf.add(scatter_attn[:, :-1], scatter_pooling[:, :-1], 'context_features')
                output_mask = rep_mask
            else:
                self_attn_input = rep_head_tensor
                context_features = attn_result
                output_mask = rep_head_mask

            # context fusion gate
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            fusion_gate = tf.nn.sigmoid(
                linear(self_attn_input, ivec, True, 0., 'linear_fusion_i', False, wd, keep_prob, is_train) +
                linear(context_features, ivec, True, 0., 'linear_fusion_a', False, wd, keep_prob, is_train) +
                o_bias)
            output = fusion_gate * self_attn_input + (1 - fusion_gate) * context_features

        return output, output_mask
Exemplo n.º 18
0
def generate_mask_with_rl_real(rep_tensor, rep_mask, is_mat=False, scope=None,
                               keep_prob=1., is_train=None, wd=0., activation='elu', hn=None):
    """Sample a binary selection over tokens (or token pairs) from a learned policy.

    A two-layer network produces one sigmoid probability per token (or per
    token pair when ``is_mat`` is True). Each entry is selected when a uniform
    random draw is less than or equal to its probability; masked-out entries
    are never selected.

    :param rep_tensor: 3d tensor
    :param rep_mask: 2d boolean tensor of valid positions
    :param is_mat: if True, decide per pair of positions (bs, sl, sl) instead
        of per position (bs, sl)
    :param scope: variable scope name
    :param keep_prob: forwarded to bn_dense_layer
    :param is_train: forwarded to bn_dense_layer
    :param wd: forwarded to bn_dense_layer
    :param activation: 'activation' argument of the first dense layer
    :param hn: hidden size of the first dense layer; when falsy, the static
        last dimension of rep_tensor is used instead
    :return: ``(logpa, actions, percentage)`` -- the masked log-probability
        term of the sampled actions (flattened to (bs, sl*sl) when ``is_mat``),
        the boolean actions, and per example the fraction of valid entries
        that were selected
    """
    bs, sl = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1]
    # .as_list() gives a plain int, matching the other layers in this file.
    ivec = hn or rep_tensor.get_shape().as_list()[2]

    with tf.variable_scope(scope or 'generate_mask_with_rl_real'):
        if is_mat:
            # Pair every position with every other one by concatenating features.
            rep_row = tf.tile(tf.expand_dims(rep_tensor, 1), [1, sl, 1, 1])
            rep_col = tf.tile(tf.expand_dims(rep_tensor, 2), [1, 1, sl, 1])
            rep_h0 = tf.concat([rep_row, rep_col], -1)
        else:
            rep_h0 = rep_tensor

        rep_h1 = bn_dense_layer([rep_h0], ivec, True, 0., 'dense_rep_mat_h1',
                                activation, False, wd, keep_prob, is_train)
        rep_h2 = bn_dense_layer([rep_h1], 1, True, 0., 'dense_rep_mat_h2',
                                'linear', False, wd, keep_prob, is_train)
        rep_h2 = tf.squeeze(rep_h2, 3 if is_mat else 2)  # bs,sl,sl / bs,sl
        rep_prob = tf.nn.sigmoid(rep_h2)

        # sampling
        # todo: a dynamic policy for the randomness (e.g. deterministic draws
        # outside training, or a probability bonus during early steps) was
        # sketched here but is not implemented; draws are always random.
        random_values = tf.random_uniform([bs, sl, sl] if is_mat else [bs, sl])

        actions = tf.less_equal(random_values, rep_prob)
        actions = tf.stop_gradient(actions)

        if is_mat:
            rep_mask_new = tf.logical_and(
                tf.expand_dims(rep_mask, 1),
                tf.expand_dims(rep_mask, 2)
            )
        else:
            rep_mask_new = rep_mask

        actions = tf.logical_and(actions, rep_mask_new)

        # log p(a)
        logpa = - binary_entropy(rep_prob, actions) * tf.cast(rep_mask_new, tf.float32)
        if is_mat:
            # NOTE(review): this negates the value a second time, so the sign
            # in the matrix case is opposite to the non-matrix case. Kept
            # as-is; confirm against the loss that consumes logpa.
            logpa = - tf.reshape(logpa, [bs, sl * sl])

        # percentage
        actions_flat = tf.reshape(actions, [bs, -1])
        rep_mask_new_flat = tf.reshape(rep_mask_new, [bs, -1])

        selected_count = tf.reduce_sum(tf.cast(actions_flat, tf.float32), -1)
        valid_count = tf.reduce_sum(tf.cast(rep_mask_new_flat, tf.float32), -1)
        # A row without any valid entry used to give 0/0 = NaN. The counts are
        # whole numbers, so clamping the denominator to 1 only affects that
        # case and yields 0 there.
        percentage = selected_count / tf.maximum(valid_count, 1.)

        return logpa, actions, percentage
Exemplo n.º 19
0
    def build_network(self):
        """Build the sentence-pair graph with sampled token selections.

        For each sentence a dependent and a head selection are sampled with
        ``generate_mask_with_rl``; directional attention is then computed with
        those selections, pooled to a sentence vector, and the two vectors are
        combined into the output. The second sentence reuses the variables
        created for the first.

        Returns:
            ``logits, (s1_act, s1_logpa), (s2_act, s2_logpa),
            (s1_percentage, s2_percentage)``.
        """
        _logger.add()
        _logger.add('building %s neural network structure...' %
                    cfg.network_type)

        # Same attribute reads as before; only tds, tel and hn are used below.
        tds, cds = self.tds, self.cds
        tl = self.tl
        tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
        hn = self.hn
        bs, sl1, sl2 = self.bs, self.sl1, self.sl2

        keep_unselected = True

        def sample_selection(features, token_mask, scope_name):
            # One policy head: returns (logpa, actions, percentage).
            return generate_mask_with_rl(
                features, token_mask, False, scope_name, cfg.dropout,
                self.is_train, cfg.wd, 'relu', self.disable_rl,
                self.global_step, cfg.mode, cfg.start_only_rl,
                hn)  # [bs, sl] & [bs, sl]

        def select_tokens(token_emb, token_mask):
            # Sample dependent and head selections for one sentence and derive
            # the combined action matrix, log-probabilities and percentage.
            conditioned = sequence_conditional_feature(token_emb, token_mask)
            logpa_dep, act_dep, pct_dep = sample_selection(
                conditioned, token_mask, 'generate_mask_with_rl_dep')
            logpa_head, act_head, pct_head = sample_selection(
                conditioned, token_mask, 'generate_mask_with_rl_head')
            logpa = tf.concat([logpa_dep, logpa_head], -1)
            dep_axis = tf.expand_dims(act_dep, 1)
            head_axis = tf.expand_dims(act_head, 2)
            act = tf.logical_and(dep_axis, head_axis)
            pct = pct_dep * pct_head
            return act_dep, act_head, act, logpa, pct

        def attend(token_emb, token_mask, act_dep, act_head):
            # Forward then backward directional attention on the selected
            # tokens; the returned mask comes from the forward pass.
            forward_rep, mask_new = directional_attention_with_selections(
                token_emb, token_mask, act_dep, act_head, 'forward', hn,
                keep_unselected, 'dir_attn_fw', cfg.dropout, self.is_train,
                cfg.wd, 'relu')
            backward_rep, _ = directional_attention_with_selections(
                token_emb, token_mask, act_dep, act_head, 'backward', hn,
                keep_unselected, 'dir_attn_bw', cfg.dropout, self.is_train,
                cfg.wd, 'relu')
            return tf.concat([forward_rep, backward_rep], -1), mask_new

        def pool(seq_rep, token_mask, attn_name):
            # Reduce a token sequence to one vector per sentence.
            return multi_dimensional_attention(
                seq_rep, token_mask, 'multi_dimensional_attention',
                cfg.dropout, self.is_train, cfg.wd, 'relu',
                tensor_dict=self.tensor_dict, name=attn_name)

        with tf.variable_scope('emb'):
            embedding_table = generate_embedding_mat(
                tds, tel,
                init_mat=self.token_emb_mat,
                extra_mat=self.glove_emb_mat,
                extra_trainable=self.finetune_emb,
                scope='gene_token_emb_mat')
            s1_emb = tf.nn.embedding_lookup(
                embedding_table, self.sent1_token)  # bs,sl1,tel
            s2_emb = tf.nn.embedding_lookup(
                embedding_table, self.sent2_token)  # bs,sl2,tel
            self.tensor_dict['s1_emb'] = s1_emb
            self.tensor_dict['s2_emb'] = s2_emb

        with tf.variable_scope('hard_network'):
            # for sentence 1
            (s1_act_dep, s1_act_head,
             s1_act, s1_logpa, s1_percentage) = select_tokens(
                 s1_emb, self.sent1_token_mask)

            tf.get_variable_scope().reuse_variables()
            # for sentence 2
            (s2_act_dep, s2_act_head,
             s2_act, s2_logpa, s2_percentage) = select_tokens(
                 s2_emb, self.sent2_token_mask)

        with tf.variable_scope('ct_attn'):
            s1_seq_rep, s1_token_mask_new = attend(
                s1_emb, self.sent1_token_mask, s1_act_dep, s1_act_head)

            tf.get_variable_scope().reuse_variables()

            s2_seq_rep, s2_token_mask_new = attend(
                s2_emb, self.sent2_token_mask, s2_act_dep, s2_act_head)

        with tf.variable_scope('sentence_enc'):
            s1_rep = pool(s1_seq_rep, s1_token_mask_new, 's1_attn')
            tf.get_variable_scope().reuse_variables()
            s2_rep = pool(s2_seq_rep, s2_token_mask_new, 's2_attn')

        with tf.variable_scope('output'):
            product = s1_rep * s2_rep
            distance = tf.abs(s1_rep - s2_rep)
            pair_features = tf.concat([product, distance], -1)
            pair_hidden = bn_dense_layer(
                pair_features, hn, True, 0., 'out_rep_map', 'relu', False,
                cfg.wd, cfg.dropout, self.is_train)
            if cfg.use_mse and cfg.mse_logits:
                # Single squashed output: sigmoid(...) * 2 + 3.
                raw_score = linear(
                    pair_hidden, 1, True, 0.,
                    scope='logits', squeeze=True, wd=cfg.wd,
                    input_keep_prob=cfg.dropout, is_train=self.is_train)
                logits = tf.nn.sigmoid(raw_score) * 2. + 3.
            else:
                logits = linear(
                    [pair_hidden], self.output_class, True, 0.,
                    scope='logits', squeeze=False, wd=cfg.wd,
                    input_keep_prob=cfg.dropout, is_train=self.is_train)

        selection_s1 = (s1_act, s1_logpa)
        selection_s2 = (s2_act, s2_logpa)
        percentages = (s1_percentage, s2_percentage)
        return logits, selection_s1, selection_s2, percentages
Exemplo n.º 20
0
def visit_sa_with_dense(rep_tensor,
                        keep_prob=1.,
                        is_train=None,
                        wd=0.,
                        activation='relu',
                        hn=None,
                        is_scale=True,
                        is_plus_sa=True):
    """Self-attention over a sequence where a position never attends to itself.

    The input is mapped by a dense layer; pairwise, feature-wise attention
    scores are computed from two linear projections, the diagonal is masked
    out, and the attended context is merged with the mapped input through a
    sigmoid fusion gate.

    :param rep_tensor: input tensor, indexed as rank 3 here; axis 1 is the
        sequence axis the attention runs over
    :param keep_prob: forwarded to bn_dense_layer, dropout and the gate layers
    :param is_train: forwarded to bn_dense_layer, dropout and the gate layers
    :param wd: forwarded to bn_dense_layer and the gate layers
    :param activation: 'activation' argument forwarded to bn_dense_layer
    :param hn: feature size of the layer; when falsy, the static last
        dimension of rep_tensor is used instead
    :param is_scale: if True the scores are ``scaled_tanh(fact, 5.0)``,
        otherwise a linear layer applied to ``tanh(fact)``
    :param is_plus_sa: if False, attention is skipped and the dense-mapped
        input is returned directly
    :return: the gated output, or the dense-mapped input when ``is_plus_sa``
        is False
    """
    sw_len = tf.shape(rep_tensor)[1]
    ivec = hn or rep_tensor.get_shape().as_list()[2]
    with tf.variable_scope('temporal_attention'):
        # mask generation: True everywhere except on the diagonal
        attn_mask = tf.cast(
            tf.diag(-tf.ones([sw_len], tf.int32)) + 1,
            tf.bool)  # sw_len, sw_len (no batch axis)

        # non-linear for context
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)

        if not is_plus_sa:
            # Return before any attention tensor or variable is created, so
            # this path adds no unused trainable parameters to the graph.
            return rep_map

        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1),
                               [1, sw_len, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec

            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # The shape comments below assume linear() keeps the leading axes
            # of its input -- TODO confirm.
            dependent = linear(
                rep_map_dp, ivec, False,
                scope='linear_dependent')  # bs,sl,vec
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,vec
            head = linear(
                rep_map_dp, ivec, False,
                scope='linear_head')  # bs,sl,vec
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,vec

            # Broadcasting the two projections gives one score vector per
            # (head, dependent) pair.
            attention_fact = dependent_etd + head_etd + f_bias

            if is_scale:
                logits = scaled_tanh(attention_fact, 5.0)  # bs,sl,sl,vec
            else:
                logits = linear(tf.nn.tanh(attention_fact),
                                ivec,
                                True,
                                scope='linear_attn_fact')

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile,
                                        2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i', False, wd,
                       keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False,
                       wd, keep_prob, is_train) + o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result

        return output
Exemplo n.º 21
0
    def build_network(self):
        """Build the visit-level RNN graph on top of code embeddings.

        Codes are embedded (one-hot plus dense layer for the 'raw' model, a
        pre-computed embedding matrix otherwise), averaged per visit, and the
        sequence of visit vectors is fed to a recurrent cell chosen by
        ``cfg.cell_type``.

        Returns:
            ``(outputs, final_state, tensor_len)`` where the first two come
            from ``dynamic_rnn`` and ``tensor_len`` is, per example, the
            number of visits whose mask has at least one True entry.

        Raises:
            ValueError: if ``cfg.cell_type`` is not one of 'gru', 'lstm',
                'basic_lstm' or 'basic_rnn'.
        """
        with tf.name_scope('code_embeddings'):
            if self.model_type == 'raw':
                init_code_embed = tf.one_hot(self.inputs, self.vocabulary_size,
                                             on_value=1.0, off_value=0.0, axis=-1)
                inputs_embed = bn_dense_layer(init_code_embed, self.embedding_size, True, 0.,
                                              'bn_dense_map_linear', 'linear',
                                              False, wd=0., keep_prob=1.,
                                              is_train=True)
            else:
                # Every other model type differs only in where the initial
                # embedding matrix comes from.
                if self.model_type in ('tesa', 'delta', 'sa', 'normal', 'cbow', 'sg'):
                    init_code_embed = tesan_trans(self.model_type)
                elif self.model_type == 'mce':
                    init_code_embed = mce_trans()
                elif self.model_type == 'glove':
                    init_code_embed = glove_trans()
                else:
                    init_code_embed = med2vec_trans()
                code_embeddings = tf.Variable(init_code_embed, dtype=tf.float32)
                inputs_embed = tf.nn.embedding_lookup(code_embeddings, self.inputs)

        with tf.name_scope('visit_embedding'):
            # bs, max_visits, max_len_visit, embed_size
            inputs_masked = mask_for_high_rank(inputs_embed, self.inputs_mask)
            inputs_reduced = tf.reduce_mean(inputs_masked, 2)  # batch_size, max_visits, embed_size

        with tf.name_scope('visit_masking'):
            # A visit counts as present when at least one of its codes is unmasked.
            visit_mask = tf.reduce_sum(tf.cast(self.inputs_mask, tf.int32), -1)  # [bs,max_visits]
            visit_mask = tf.cast(visit_mask, tf.bool)
            tensor_len = tf.reduce_sum(tf.cast(visit_mask, tf.int32), -1)  # [bs]

        with tf.name_scope('RNN_computaion'):
            reuse = None if not tf.get_variable_scope().reuse else True
            cell_classes = {
                'gru': tf.contrib.rnn.GRUCell,
                'lstm': tf.contrib.rnn.LSTMCell,
                'basic_lstm': tf.contrib.rnn.BasicLSTMCell,
                'basic_rnn': tf.contrib.rnn.BasicRNNCell,
            }
            cell_class = cell_classes.get(cfg.cell_type)
            if cell_class is None:
                # Previously an unknown type left `cell` unbound and failed
                # later with an unhelpful UnboundLocalError.
                raise ValueError(
                    'unsupported cfg.cell_type %r; expected one of %s' %
                    (cfg.cell_type, sorted(cell_classes)))
            cell = cell_class(cfg.hn, reuse=reuse)

            outputs, final_state = dynamic_rnn(cell, inputs_reduced, tensor_len, dtype=tf.float32)
        return outputs, final_state, tensor_len