Example #1
def match_layer_selfatt(x5_e_ts,
                        x5_enc_cur_list,
                        x5_mask_cur_list,
                        x_e_ts,
                        x_enc_mb2_ts,
                        x_mask_mb2_ts,
                        emb_dim,
                        initializer_opt,
                        turn_num5,
                        matchin_include_x=False):
    # print(x5_enc_cur_list.shape)  # bs*turn5 sent_len emb_dim
    # print(x5_mask_cur_list.shape) # bs*turn5 sent_len
    # print(x_enc_mb2_ts.shape) # bs*turn5 turn1*sent_len emb_dim
    # print(x_mask_mb2_ts.shape) # bs*turn5 turn1*sent_len

    if matchin_include_x:
        a_list = [x5_e_ts, x5_enc_cur_list]
        b_list = [x_e_ts, x_enc_mb2_ts]
    else:
        a_list = [x5_enc_cur_list]
        b_list = [x_enc_mb2_ts]

    with tf.variable_scope("atb", reuse=tf.AUTO_REUSE):
        atb = layers.block(x5_enc_cur_list,
                           x_enc_mb2_ts,
                           x_enc_mb2_ts,
                           Q_lengths=x5_mask_cur_list,
                           K_lengths=x_mask_mb2_ts,
                           use_len=True)
    with tf.variable_scope("bta", reuse=tf.AUTO_REUSE):
        bta = layers.block(x_enc_mb2_ts,
                           x5_enc_cur_list,
                           x5_enc_cur_list,
                           Q_lengths=x_mask_mb2_ts,
                           K_lengths=x5_mask_cur_list,
                           use_len=True)
    a_list.append(atb)
    b_list.append(bta)

    a_list = tf.stack(a_list, axis=-1)
    b_list = tf.stack(b_list, axis=-1)
    sim_ori = tf.einsum('biks,bjks->bijs', a_list, b_list) / tf.sqrt(200.0)
    sim = layers.CNN_FZX(sim_ori)
    if turn_num5 is not None:
        sim = tf.reshape(sim, [-1, turn_num5, sim.shape[-1]])
    return sim, sim_ori
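A minimal call sketch for match_layer_selfatt with illustrative shapes; it assumes TF 1.x and that the project's `layers` module (providing block() and CNN_FZX()) is importable, and the size names below are made up for the example.

import tensorflow as tf

bs_turn5, sent_len, ctx_len, emb_dim = 64, 50, 150, 200
x5_enc = tf.placeholder(tf.float32, [bs_turn5, sent_len, emb_dim])
x5_mask = tf.placeholder(tf.float32, [bs_turn5, sent_len])
x_enc = tf.placeholder(tf.float32, [bs_turn5, ctx_len, emb_dim])
x_mask = tf.placeholder(tf.float32, [bs_turn5, ctx_len])

# x5_e_ts / x_e_ts are only read when matchin_include_x=True, so they can be
# left as None for the default call; turn_num5=None skips the final reshape.
sim, sim_ori = match_layer_selfatt(
    x5_e_ts=None, x5_enc_cur_list=x5_enc, x5_mask_cur_list=x5_mask,
    x_e_ts=None, x_enc_mb2_ts=x_enc, x_mask_mb2_ts=x_mask,
    emb_dim=emb_dim, initializer_opt=None, turn_num5=None)
# sim_ori: [bs*turn5, sent_len, ctx_len, 2]; sim: output of layers.CNN_FZX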
Example #2
    def build_graph(self):
        with self._graph.as_default():
            rand_seed = self._conf['rand_seed']
            tf.set_random_seed(rand_seed)

            #word embedding
            if self._word_embedding_init is not None:
                word_embedding_initializer = tf.constant_initializer(
                    self._word_embedding_init)
            else:
                word_embedding_initializer = tf.random_normal_initializer(
                    stddev=0.1)

            self._word_embedding = tf.get_variable(
                name='word_embedding',
                shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']],
                dtype=tf.float32,
                initializer=word_embedding_initializer)

            #define placeholders
            self.turns = tf.placeholder(tf.int32,
                                        shape=[
                                            self._conf["batch_size"],
                                            self._conf["max_turn_num"],
                                            self._conf["max_turn_len"]
                                        ])

            self.tt_turns_len = tf.placeholder(
                tf.int32, shape=[self._conf["batch_size"]])

            self.every_turn_len = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_num"]])

            self.response = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_len"]])

            self.response_len = tf.placeholder(
                tf.int32, shape=[self._conf["batch_size"]])

            self.label = tf.placeholder(tf.float32,
                                        shape=[self._conf["batch_size"]])

            #define operations
            #response part
            Hr = tf.nn.embedding_lookup(self._word_embedding, self.response)

            if self._conf['is_positional'] and self._conf['stack_num'] > 0:
                with tf.variable_scope('positional'):
                    Hr = op.positional_encoding_vector(Hr, max_timescale=10)

            for index in range(self._conf['stack_num']):
                with tf.variable_scope('self_stack_' + str(index)):
                    Hr = layers.block(Hr,
                                      Hr,
                                      Hr,
                                      Q_lengths=self.response_len,
                                      K_lengths=self.response_len)

            #context part
            #a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len]
            list_turn_t = tf.unstack(self.turns, axis=1)
            list_turn_length = tf.unstack(self.every_turn_len, axis=1)

            sim_turns = []
            #for every turn_t calculate matching vector
            for turn_t, t_turn_length in zip(list_turn_t, list_turn_length):
                Hu = tf.nn.embedding_lookup(
                    self._word_embedding,
                    turn_t)  #[batch, max_turn_len, emb_size]

                if self._conf['is_positional'] and self._conf['stack_num'] > 0:
                    with tf.variable_scope('positional', reuse=True):
                        Hu = op.positional_encoding_vector(Hu,
                                                           max_timescale=10)

                for index in range(self._conf['stack_num']):

                    with tf.variable_scope('self_stack_' + str(index),
                                           reuse=True):
                        Hu = layers.block(Hu,
                                          Hu,
                                          Hu,
                                          Q_lengths=t_turn_length,
                                          K_lengths=t_turn_length)

                with tf.variable_scope('u_attentd_r_' + str(index)):
                    try:
                        u_a_r = layers.block(Hu,
                                             Hr,
                                             Hr,
                                             Q_lengths=t_turn_length,
                                             K_lengths=self.response_len)
                    except ValueError:
                        tf.get_variable_scope().reuse_variables()
                        u_a_r = layers.block(Hu,
                                             Hr,
                                             Hr,
                                             Q_lengths=t_turn_length,
                                             K_lengths=self.response_len)

                with tf.variable_scope('r_attend_u_' + str(index)):
                    try:
                        r_a_u = layers.block(Hr,
                                             Hu,
                                             Hu,
                                             Q_lengths=self.response_len,
                                             K_lengths=t_turn_length)
                    except ValueError:
                        tf.get_variable_scope().reuse_variables()
                        r_a_u = layers.block(Hr,
                                             Hu,
                                             Hu,
                                             Q_lengths=self.response_len,
                                             K_lengths=t_turn_length)

                u_a_r = tf.stack([u_a_r, Hu], axis=-1)
                r_a_u = tf.stack([r_a_u, Hr], axis=-1)

                #calculate similarity matrix
                with tf.variable_scope('similarity'):
                    # sim shape [batch, max_turn_len, max_turn_len, 2]
                    # divide sqrt(200) to prevent gradient explosion
                    sim = tf.einsum('biks,bjks->bijs', r_a_u,
                                    u_a_r) / tf.sqrt(200.0)

                sim_turns.append(sim)

            #cnn and aggregation
            sim = tf.stack(sim_turns, axis=1)
            print('sim shape: %s' % sim.shape)
            with tf.variable_scope('cnn_aggregation'):
                final_info = layers.CNN_3d(sim, 32, 16)
                #for douban
                #final_info = layers.CNN_3d(sim, 16, 16)

            #loss and train
            with tf.variable_scope('loss'):
                self.loss, self.logits = layers.loss(final_info, self.label)

                self.global_step = tf.Variable(0, trainable=False)
                initial_learning_rate = self._conf['learning_rate']
                self.learning_rate = tf.train.exponential_decay(
                    initial_learning_rate,
                    global_step=self.global_step,
                    decay_steps=400,
                    decay_rate=0.9,
                    staircase=True)

                Optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.optimizer = Optimizer.minimize(self.loss)

                self.init = tf.global_variables_initializer()
                self.saver = tf.train.Saver(
                    max_to_keep=self._conf["max_to_keep"])
                self.all_variables = tf.global_variables()
                self.all_operations = self._graph.get_operations()
                self.grads_and_vars = Optimizer.compute_gradients(self.loss)

                for grad, var in self.grads_and_vars:
                    if grad is None:
                        print(var)

                self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var)
                                   for grad, var in self.grads_and_vars
                                   if grad is not None]
                self.g_updates = Optimizer.apply_gradients(
                    self.capped_gvs, global_step=self.global_step)

        return self._graph
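A hedged training-step sketch for the graph built above; `model` stands for an instance of the enclosing class, and the zero-filled feeds are stand-ins for real batches.

import numpy as np
import tensorflow as tf

graph = model.build_graph()
conf = model._conf
bs, tn, tl = conf['batch_size'], conf['max_turn_num'], conf['max_turn_len']
with tf.Session(graph=graph) as sess:
    sess.run(model.init)
    feed = {
        model.turns: np.zeros([bs, tn, tl], dtype=np.int32),
        model.tt_turns_len: np.full([bs], tn, dtype=np.int32),
        model.every_turn_len: np.full([bs, tn], tl, dtype=np.int32),
        model.response: np.zeros([bs, tl], dtype=np.int32),
        model.response_len: np.full([bs], tl, dtype=np.int32),
        model.label: np.zeros([bs], dtype=np.float32),
    }
    _, loss = sess.run([model.g_updates, model.loss], feed_dict=feed)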
Example #3
def msn_model(input_x,
              input_x_mask,
              input_y,
              input_y_mask,
              word_emb,
              keep_rate,
              conf,
              x_len=None,
              y_len=None):

    turn_num = input_x.shape[1]
    sent_len = conf["max_turn_len"]
    emb_dim = conf["emb_size"]
    is_mask = False
    is_layer_norm = False
    # init = None
    init = tf.contrib.layers.xavier_initializer()

    # init1 = tf.contrib.layers.xavier_initializer()
    init1 = tf.random_uniform_initializer(0.0, 1.0)

    Hr = tf.nn.embedding_lookup(word_emb, input_y)  # bs len emb
    Hu = tf.nn.embedding_lookup(word_emb, input_x)  # bs turn len emb

    x_len = tf.reshape(x_len, [-1])
    y_len = tf.tile(tf.expand_dims(y_len, axis=1), [1, turn_num])
    y_len = tf.reshape(y_len, [-1])

    with tf.variable_scope('enc', reuse=tf.AUTO_REUSE):

        # context selector
        context_ = tf.reshape(Hu, [-1, sent_len, emb_dim])

        context_ = layers.block(context_,
                                context_,
                                context_,
                                Q_lengths=x_len,
                                K_lengths=x_len,
                                is_mask=is_mask,
                                is_layer_norm=is_layer_norm,
                                init=init)
        context_ = tf.reshape(context_, [-1, turn_num, sent_len, emb_dim])

        W_word = tf.get_variable(
            name='w_word',
            shape=[emb_dim, emb_dim, turn_num],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())  # 200 200 10
        v = tf.get_variable(
            name='v',
            shape=[turn_num, 1],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())  # 10 1

        ss = []
        for hop_index in [1, 2, 3]:

            kk = Hu[:, turn_num - hop_index:, :, :]

            kk = tf.reduce_mean(kk, axis=1)
            kk = layers.block(kk,
                              kk,
                              kk,
                              is_mask=False,
                              is_layer_norm=is_layer_norm,
                              init=init)

            # kk context_

            A = tf.einsum("blrm,mdh,bud->blruh", context_, W_word,
                          kk) / tf.sqrt(200.0)
            A = tf.einsum("blruh,hp->blrup", A,
                          v)  # bs turn_num sent_len sent_len 1
            A = tf.squeeze(A, [4])  # bs turn_num sent_len sent_len

            A1 = tf.reduce_max(A, axis=2)  # bs turn_num sent_len
            A2 = tf.reduce_max(A, axis=3)  # bs turn_num sent_len
            a = tf.concat([A1, A2], axis=-1)  # bs turn_num sent_len*2
            a = tf.layers.dense(a, 1,
                                kernel_initializer=init1)  # bs turn_num 1

            a = tf.squeeze(a, [2])  # bs turn_num
            s1 = tf.nn.softmax(a, axis=1)

            # kk context_
            kk1 = tf.reduce_mean(kk, axis=1)  # bs emb
            context1 = tf.reduce_mean(context_, axis=2)  # bs turn emb
            norm1 = tf.norm(context1, axis=-1)
            norm2 = tf.norm(kk1, axis=-1, keepdims=True)
            # print(context1.shape) # bs 10 200
            # print(kk1.shape) # bs 200
            # print(norm1.shape) # bs 10
            # print(norm2.shape) # bs 1
            # exit()
            s2 = tf.einsum("bud,bd->bu", context1, kk1) / (1e-6 + norm1 * norm2)  # bs turn
            # print(s1.shape, s2.shape)
            # exit()
            s = 0.5 * s1 + 0.5 * s2
            ss.append(s)

        #s = tf.expand_dims(s, axis=-1)
        s = tf.stack(ss, axis=-1)

        s = tf.layers.dense(s, 1, kernel_initializer=init1)  # bs turn_num 1
        s = tf.squeeze(s, [2])  # bs turn_num

        if "douban" in conf["data_path"]:
            grmmar_score = 0.3
        else:
            grmmar_score = 0.5

        s_mask1 = tf.nn.sigmoid(s)
        s_mask = tf.math.greater(s_mask1, grmmar_score)
        s_mask = tf.cast(s_mask, tf.float32)
        final_score = [s, s_mask1]
        s = s * s_mask

        Hu = Hu * tf.expand_dims(tf.expand_dims(s, axis=-1), axis=-1)

        Hu = tf.reshape(Hu, [-1, sent_len, emb_dim])

        Hr = tf.tile(tf.expand_dims(Hr, axis=1), [1, turn_num, 1, 1])
        Hr = tf.reshape(Hr, [-1, sent_len, emb_dim])

        # UR Matching Hu Hr

        def distance(A, B, C, epsilon=1e-6):
            Ma = tf.einsum("bum,md,brd->bur", A, B, C)
            A_norm = tf.norm(A, axis=-1)
            C_norm = tf.norm(C, axis=-1)
            norm_score = tf.einsum("bu,br->bur", A_norm, C_norm) + epsilon
            # norm_score = tf.math.maximum(norm_score, 1.0)
            Mb = tf.einsum("bud,brd->bur", A, C) / norm_score
            return Ma, Mb, norm_score
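        # distance() returns two turn-by-response similarity maps:
        #   Ma -- bilinear similarity A * B * C^T ("bum,md,brd->bur")
        #   Mb -- cosine-style similarity of A and C, normalized by the outer
        #         product of their L2 norms (epsilon avoids division by zero)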

        v1 = tf.get_variable(
            name='v1',
            shape=[emb_dim, emb_dim],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        M1, M2, norm_score1 = distance(Hu, v1, Hr)

        with tf.variable_scope('enc11', reuse=tf.AUTO_REUSE):
            Hu1 = layers.block(Hu,
                               Hu,
                               Hu,
                               Q_lengths=x_len,
                               K_lengths=x_len,
                               is_mask=is_mask,
                               is_layer_norm=is_layer_norm,
                               init=init)
        with tf.variable_scope('enc12', reuse=tf.AUTO_REUSE):
            Hr1 = layers.block(Hr,
                               Hr,
                               Hr,
                               Q_lengths=y_len,
                               K_lengths=y_len,
                               is_mask=is_mask,
                               is_layer_norm=is_layer_norm,
                               init=init)
        v2 = tf.get_variable(
            name='v2',
            shape=[emb_dim, emb_dim],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        M3, M4, norm_score2 = distance(Hu1, v2, Hr1)

        with tf.variable_scope('enc21', reuse=tf.AUTO_REUSE):
            Hu1 = layers.block(Hu,
                               Hr,
                               Hr,
                               Q_lengths=x_len,
                               K_lengths=y_len,
                               is_mask=is_mask,
                               is_layer_norm=is_layer_norm,
                               init=init)
        with tf.variable_scope('enc22', reuse=tf.AUTO_REUSE):
            Hr1 = layers.block(Hr,
                               Hu,
                               Hu,
                               Q_lengths=y_len,
                               K_lengths=x_len,
                               is_mask=is_mask,
                               is_layer_norm=is_layer_norm,
                               init=init)
        v3 = tf.get_variable(
            name='v3',
            shape=[emb_dim, emb_dim],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        M5, M6, norm_score3 = distance(Hu1, v3, Hr1)

        # final_score = [norm_score1, norm_score2, norm_score3]
        final_score = [M2, M4, M6]

        M = tf.stack([M1, M2, M3, M4, M5, M6],
                     axis=1)  # bs*turn 6 sent_len sent_len

        M = layers.CNN_MSN(M, init=init)  # bs*turn 128
        M = tf.layers.dense(
            M,
            300,
            activation=tf.nn.tanh,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name="dense1")  # bs turn_num 1

        M = tf.reshape(M, [-1, turn_num, 300])

        gru = tf.contrib.rnn.GRUCell(300)
        M = tf.nn.dynamic_rnn(gru, M, dtype=tf.float32)
        final_info = M[0][:, -1, :]
        final_info = tf.layers.dropout(final_info, rate=1.0 - keep_rate)

    return final_info, final_score
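A minimal call sketch for msn_model with illustrative sizes; only the conf keys this function actually reads are shown, and the project's `layers` helpers (block, CNN_MSN) are assumed to be importable.

import tensorflow as tf

bs, turn_num, sent_len, emb_dim, vocab = 32, 10, 50, 200, 1000
conf = {"max_turn_len": sent_len, "emb_size": emb_dim, "data_path": "ubuntu"}
input_x = tf.placeholder(tf.int32, [bs, turn_num, sent_len])
input_x_mask = tf.placeholder(tf.float32, [bs, turn_num, sent_len])
input_y = tf.placeholder(tf.int32, [bs, sent_len])
input_y_mask = tf.placeholder(tf.float32, [bs, sent_len])
x_len = tf.placeholder(tf.int32, [bs, turn_num])
y_len = tf.placeholder(tf.int32, [bs])
word_emb = tf.get_variable("word_emb_demo", [vocab, emb_dim], tf.float32)

final_info, final_score = msn_model(input_x, input_x_mask, input_y, input_y_mask,
                                    word_emb, keep_rate=0.8, conf=conf,
                                    x_len=x_len, y_len=y_len)
# final_info: [bs, 300] (last GRU output after dropout)
# final_score: [M2, M4, M6], the three cosine-style similarity maps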
Example #4
    def build_graph(self):
        with self._graph.as_default():
            if self._conf['rand_seed'] is not None:
                rand_seed = self._conf['rand_seed']
                tf.set_random_seed(rand_seed)
                print('set tf random seed: %s' % self._conf['rand_seed'])

            #word embedding
            if self._word_embedding_init is not None:
                word_embedding_initializer = tf.constant_initializer(
                    self._word_embedding_init)
            else:
                word_embedding_initializer = tf.random_normal_initializer(
                    stddev=0.1)

            self._word_embedding = tf.get_variable(
                name='word_embedding',
                shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']],
                dtype=tf.float32,
                initializer=word_embedding_initializer)

            #define placeholders
            #config max_turn_history_num
            self.turns_history = tf.placeholder(
                tf.int32,
                shape=[
                    self._conf["batch_size"],
                    self._conf["max_turn_history_num"],
                    self._conf["max_turn_len"]
                ])

            self.turns = tf.placeholder(tf.int32,
                                        shape=[
                                            self._conf["batch_size"],
                                            self._conf["max_turn_num"],
                                            self._conf["max_turn_len"]
                                        ])

            self.tt_turns_len = tf.placeholder(
                tf.int32, shape=[self._conf["batch_size"]])

            self.every_turn_len = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_num"]])

            self.response = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_len"]])

            self.response_len = tf.placeholder(
                tf.int32, shape=[self._conf["batch_size"]])

            self.label = tf.placeholder(tf.float32,
                                        shape=[self._conf["batch_size"]])

            #define operations
            #response part
            Hr = tf.nn.embedding_lookup(self._word_embedding, self.response)
            turns_history_embedding = tf.nn.embedding_lookup(
                self._word_embedding, self.turns_history)

            if self._conf['is_positional'] and self._conf['stack_num'] > 0:
                with tf.variable_scope('positional'):
                    Hr = op.positional_encoding_vector(Hr, max_timescale=10)
            Hr_stack = [Hr]

            _batch_size, _turn_nums, _turn_words, _emb_size = (
                turns_history_embedding.get_shape().as_list())
            turns_history_embedding = tf.reshape(turns_history_embedding,
                                                 [-1, _turn_words, _emb_size])

            for index in range(self._conf['stack_num']):
                turns_history_embedding, _ = self._multihead(
                    turns_history_embedding, turns_history_embedding,
                    turns_history_embedding)

            turns_history_embedding = tf.reshape(
                turns_history_embedding,
                [_batch_size, _turn_nums, _turn_words, _emb_size])

            for index in range(self._conf['stack_num']):
                with tf.variable_scope('self_stack_' + str(index)):
                    Hr = layers.block(Hr,
                                      Hr,
                                      Hr,
                                      Q_lengths=self.response_len,
                                      K_lengths=self.response_len)
                    Hr_stack.append(Hr)

            with tf.variable_scope('respone_extraction_history'):
                turn_important_inf = []
                # need to add a fully-connected layer here
                for _t in tf.split(turns_history_embedding,
                                   self._conf['max_turn_history_num'], 1):
                    _t = tf.squeeze(_t)
                    #_match_result = layers.attention(Hr_stack[-1], _t,  _t, self.response_len, self.response_len)
                    _match_result = layers.attention(
                        self._dense1(Hr_stack[-1]), _t, _t, self.response_len,
                        self.response_len)
                    turn_important_inf.append(tf.expand_dims(_match_result, 1))

            best_turn_match = tf.concat(turn_important_inf, 1)
            with tf.variable_scope('response_extraciton_best_information'):
                #best_information,_ = self._multihead(Hr_stack[-1], best_turn_match, best_turn_match)
                best_information, _ = self._multihead(
                    self._dense2(Hr_stack[-1]), best_turn_match,
                    best_turn_match)
                best_information = layers.FFN(best_information)
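            # best_information: response representation enriched, via attention,
            # with the most relevant content from the history turns
            # (best_turn_match), then passed through an FFN; it is added to
            # every Hu_stack element in the cross-attention blocks below.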

            #context part
            #a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len]
            list_turn_t = tf.unstack(self.turns, axis=1)
            list_turn_length = tf.unstack(self.every_turn_len, axis=1)

            sim_turns = []
            #for every turn_t calculate matching vector
            for turn_t, t_turn_length in zip(list_turn_t, list_turn_length):
                Hu = tf.nn.embedding_lookup(
                    self._word_embedding,
                    turn_t)  #[batch, max_turn_len, emb_size]

                if self._conf['is_positional'] and self._conf['stack_num'] > 0:
                    with tf.variable_scope('positional', reuse=True):
                        Hu = op.positional_encoding_vector(Hu,
                                                           max_timescale=10)
                Hu_stack = [Hu]

                for index in range(self._conf['stack_num']):

                    with tf.variable_scope('self_stack_' + str(index),
                                           reuse=True):
                        Hu = layers.block(Hu,
                                          Hu,
                                          Hu,
                                          Q_lengths=t_turn_length,
                                          K_lengths=t_turn_length)

                        Hu_stack.append(Hu)

                r_a_t_stack = []
                t_a_r_stack = []
                for index in range(self._conf['stack_num'] + 1):

                    with tf.variable_scope('t_attend_r_' + str(index)):
                        try:
                            t_a_r = layers.block(tf.add(
                                Hu_stack[index], best_information),
                                                 Hr_stack[index],
                                                 Hr_stack[index],
                                                 Q_lengths=t_turn_length,
                                                 K_lengths=self.response_len)
                        except ValueError:
                            tf.get_variable_scope().reuse_variables()
                            t_a_r = layers.block(tf.add(
                                Hu_stack[index], best_information),
                                                 Hr_stack[index],
                                                 Hr_stack[index],
                                                 Q_lengths=t_turn_length,
                                                 K_lengths=self.response_len)

                    with tf.variable_scope('r_attend_t_' + str(index)):
                        try:
                            r_a_t = layers.block(
                                Hr_stack[index],
                                tf.add(Hu_stack[index], best_information),
                                tf.add(Hu_stack[index], best_information),
                                Q_lengths=self.response_len,
                                K_lengths=t_turn_length)
                        except ValueError:
                            tf.get_variable_scope().reuse_variables()
                            r_a_t = layers.block(
                                Hr_stack[index],
                                tf.add(Hu_stack[index], best_information),
                                tf.add(Hu_stack[index], best_information),
                                Q_lengths=self.response_len,
                                K_lengths=t_turn_length)

                    t_a_r_stack.append(t_a_r)
                    r_a_t_stack.append(r_a_t)

                t_a_r_stack.extend(Hu_stack)
                r_a_t_stack.extend(Hr_stack)

                t_a_r = tf.stack(t_a_r_stack, axis=-1)
                r_a_t = tf.stack(r_a_t_stack, axis=-1)

                #calculate similarity matrix
                with tf.variable_scope('similarity'):
                    # sim shape [batch, max_turn_len, max_turn_len, 2*(stack_num+1)]
                    # divide sqrt(200) to prevent gradient explosion
                    sim = tf.einsum('biks,bjks->bijs', t_a_r,
                                    r_a_t) / tf.sqrt(200.0)

                sim_turns.append(sim)

            #cnn and aggregation
            sim = tf.stack(sim_turns, axis=1)
            print('sim shape: %s' % sim.shape)
            with tf.variable_scope('cnn_aggregation'):
                final_info = layers.CNN_3d(sim, 32, 16)
                #final_info_dim = final_info.get_shape().as_list()[-1]
                #for douban
                #final_info = layers.CNN_3d(sim, 16, 16)
                #                 _x = self._conv1d(best_information)
                #                 _x = self._pool1d(_x)
                #final_info = tf.concat([final_info,best_information],-1)

            #loss and train
            with tf.variable_scope('loss'):
                self.loss, self.logits = layers.loss(final_info, self.label)

                self.global_step = tf.Variable(0, trainable=False)
                initial_learning_rate = self._conf['learning_rate']
                self.learning_rate = tf.train.exponential_decay(
                    initial_learning_rate,
                    global_step=self.global_step,
                    decay_steps=400,
                    decay_rate=0.9,
                    staircase=True)

                Optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.optimizer = Optimizer.minimize(
                    self.loss, global_step=self.global_step)

                self.init = tf.global_variables_initializer()
                self.saver = tf.train.Saver(
                    max_to_keep=self._conf["max_to_keep"])
                self.all_variables = tf.global_variables()
                self.all_operations = self._graph.get_operations()
                self.grads_and_vars = Optimizer.compute_gradients(self.loss)

                for grad, var in self.grads_and_vars:
                    if grad is None:
                        print(var)

                self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var)
                                   for grad, var in self.grads_and_vars
                                   if grad is not None]
                self.g_updates = Optimizer.apply_gradients(
                    self.capped_gvs, global_step=self.global_step)

        return self._graph
Example #5
    def build_graph(self):
        with self._graph.as_default():
            if self._conf['rand_seed'] is not None:
                rand_seed = self._conf['rand_seed']
                tf.set_random_seed(rand_seed)
                print('set tf random seed: %s' % self._conf['rand_seed'])

            # word embedding
            if self._word_embedding_init is not None:
                word_embedding_initializer = tf.constant_initializer(
                    self._word_embedding_init)
            else:
                word_embedding_initializer = tf.random_normal_initializer(
                    stddev=0.1)

            self._word_embedding = tf.get_variable(
                name='word_embedding',
                shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']],
                dtype=tf.float32,
                initializer=word_embedding_initializer)

            # define placeholders
            self.turns = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_num"],
                       self._conf["max_turn_len"]])

            self.tt_turns_len = tf.placeholder(  # turn_num
                tf.int32,
                shape=[self._conf["batch_size"]])

            self.every_turn_len = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_num"]])

            self.turns_intent = tf.placeholder(
                tf.float32,
                shape=[self._conf["batch_size"], self._conf["max_turn_num"],
                       self._conf["intent_size"]])

            self.response = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_len"]])

            self.response_len = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"]])

            self.response_intent = tf.placeholder(
                tf.float32,
                shape=[self._conf["batch_size"], self._conf["intent_size"]])

            self.label = tf.placeholder(
                tf.float32,
                shape=[self._conf["batch_size"]])

            # define operations
            # response part
            Hr = tf.nn.embedding_lookup(self._word_embedding, self.response)
            # [batch_size, max_turn_len, embed_size]

            # print('[after embedding_lookup] Hr shape: %s' % Hr.shape)

            if self._conf['is_positional'] and self._conf['stack_num'] > 0:
                with tf.variable_scope('positional'):
                    Hr = op.positional_encoding_vector(Hr, max_timescale=10)
            Hr_stack = [Hr]  # 1st element of Hr_stack is the original embedding
            # lyang comments: self attention
            for index in range(self._conf['stack_num']):
                # print('[self attention for response] stack index: %d ' % index)
                with tf.variable_scope('self_stack_' + str(index)):
                    # [batch, max_turn_len, emb_size]
                    Hr = layers.block(  # attentive module
                        Hr, Hr, Hr,
                        Q_lengths=self.response_len,
                        K_lengths=self.response_len)
                    # print('[after layers.block] Hr shape: %s' % Hr.shape)
                    # Hr is still [batch_size, max_turn_len, embed_size]
                    Hr_stack.append(Hr)

            # print('[after self attention of response] len(Hr_stack)',
            #       len(Hr_stack))  # 1+stack_num
            # context part
            # a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len]
            list_turn_t = tf.unstack(self.turns, axis=1)
            list_turn_length = tf.unstack(self.every_turn_len, axis=1)
            list_turn_intent = tf.unstack(self.turns_intent, axis=1)

            sim_turns = []
            attention_turns = [] # intent based attention on each turn
            # for every turn_t calculate matching vector
            turn_index = 0
            for turn_t, t_turn_length, t_intent in zip(list_turn_t, list_turn_length, list_turn_intent):
                print('current turn_index : ', turn_index)
                turn_index += 1
                Hu = tf.nn.embedding_lookup(self._word_embedding,
                                            turn_t)  # [batch, max_turn_len, emb_size]
                # print('[after embedding_lookup] Hu shape: %s' % Hu.shape)

                if self._conf['is_positional'] and self._conf['stack_num'] > 0:
                    with tf.variable_scope('positional', reuse=True):
                        Hu = op.positional_encoding_vector(Hu,
                                                           max_timescale=10)
                Hu_stack = [Hu]  # 1st element of Hu_stack is the original embedding

                # lyang comments: self attention
                for index in range(self._conf['stack_num']):
                    # print('[self attention for context turn] stack index: %d ' % index)
                    with tf.variable_scope('self_stack_' + str(index),
                                           reuse=True):
                        # [batch, max_turn_len, emb_size]
                        Hu = layers.block(  # attentive module
                            Hu, Hu, Hu,
                            Q_lengths=t_turn_length, K_lengths=t_turn_length)
                        # print('[after layers.block] Hu shape: %s' % Hu.shape)
                        Hu_stack.append(Hu)
                # print('[after self attention of context turn] len(Hu_stack)',
                #       len(Hu_stack))  # 1+stack_num

                # lyang comments: cross attention
                # print('[cross attention ...]')
                r_a_t_stack = []
                t_a_r_stack = []
                # cross attention
                for index in range(self._conf['stack_num'] + 1):
                    # print('[cross attention] stack index = ', index)
                    with tf.variable_scope('t_attend_r_' + str(index)):
                        try:
                            # [batch, max_turn_len, emb_size]
                            t_a_r = layers.block(  # attentive module
                                Hu_stack[index], Hr_stack[index],
                                Hr_stack[index],
                                Q_lengths=t_turn_length,
                                K_lengths=self.response_len)
                        except ValueError:
                            tf.get_variable_scope().reuse_variables()
                            t_a_r = layers.block(
                                # [batch, max_turn_len, emb_size]
                                Hu_stack[index], Hr_stack[index],
                                Hr_stack[index],
                                Q_lengths=t_turn_length,
                                K_lengths=self.response_len)
                        # print('[cross attention t_attend_r_] stack index: %d, t_a_r.shape: %s' % (
                        #         index, t_a_r.shape))

                    with tf.variable_scope('r_attend_t_' + str(index)):
                        try:
                            # [batch, max_turn_len, emb_size]
                            r_a_t = layers.block(  # attentive module
                                Hr_stack[index], Hu_stack[index],
                                Hu_stack[index],
                                Q_lengths=self.response_len,
                                K_lengths=t_turn_length)
                        except ValueError:
                            tf.get_variable_scope().reuse_variables()
                            r_a_t = layers.block(
                                Hr_stack[index], Hu_stack[index],
                                Hu_stack[index],
                                Q_lengths=self.response_len,
                                K_lengths=t_turn_length)
                        # print('[cross attention r_a_t_] stack index: %d, r_a_t.shape: %s' % (
                        #         index, r_a_t.shape))

                    t_a_r_stack.append(t_a_r)
                    r_a_t_stack.append(r_a_t)
                    # print('[cross attention] len(t_a_r_stack):', len(t_a_r_stack))
                    # print('[cross attention] len(r_a_t_stack):', len(r_a_t_stack))

                # print('[before extend] len(t_a_r_stack):', len(t_a_r_stack))
                # print('[before extend] len(r_a_t_stack):', len(r_a_t_stack))
                # lyang comments: 3D aggregation
                t_a_r_stack.extend(
                    Hu_stack)  # half from self-attention; half from cross-attention
                r_a_t_stack.extend(
                    Hr_stack)  # half from self-attention; half from cross-attention
                # after extend, len(t_a_r_stack)) = 2*(stack_num+1)

                # print('[after extend] len(t_a_r_stack):', len(t_a_r_stack))
                # print('[after extend] len(r_a_t_stack):', len(r_a_t_stack))

                t_a_r = tf.stack(t_a_r_stack, axis=-1)
                r_a_t = tf.stack(r_a_t_stack, axis=-1)

                # print('after stack along the last dimension: ')
                # print('t_a_r shape: %s' % t_a_r.shape)
                # print('r_a_t shape: %s' % r_a_t.shape)
                # after stack, t_a_r and r_a_t are (batch, max_turn_len, embed_size, 2*(stack_num+1))

                with tf.variable_scope('intent_based_attention',
                                       reuse=tf.AUTO_REUSE): # share parameter across different turns
                    # there are 3 different ways to implement intent-based attention;
                    # implement all three variations and compare their
                    # effectiveness as a model ablation analysis
                    # let I_u_t and I_r_k be intent vectors of shape [12,1]
                    # 1. dot: w * [I_u_t, I_r_k], where w is [24,1]
                    # 2. bilinear: I_u_t' * w * I_r_k, where w is [12,12]
                    # 3. outprod: I_u_t * I_r_k' -> [12,12] outer product ->
                    #             flatten to [144,1] outprod -> w*outprod
                    #             where w is [1,144]
                    attention_logits = layers.attention_intent(t_intent,
                                        self.response_intent,
                                        self._conf['intent_attention_type'])
                    # print('[intent_based_attention] attention_logits.shape: %s' % attention_logits.shape)
                    attention_turns.append(attention_logits)

                # calculate similarity matrix
                with tf.variable_scope('similarity'):
                    # sim shape [batch, max_turn_len, max_turn_len, 2*(stack_num+1)]
                    # divide sqrt(200) to prevent gradient explosion
                    # A_biks * B_bjks -> C_bijs
                    sim = tf.einsum('biks,bjks->bijs', t_a_r, r_a_t) / tf.sqrt(
                        200.0)
                    # (batch, max_turn_len, embed_size, 2*(stack_num+1)) *
                    # (batch, max_turn_len, embed_size, 2*(stack_num+1)) ->
                    # [batch, max_turn_len, max_turn_len, 2*(stack_num+1)]
                    # where k is corresponding to the dimension of embed_size,
                    # which can be eliminated by dot product with einsum
                    # print('[similarity] after einsum dot prod sim shape: %s' % sim.shape)
                    # [batch, max_turn_len, max_turn_len, 2*(stack_num+1)]
                    # ! sim is later multiplied by the intent-based attention
                    # weights (after all turns are stacked below) to produce
                    # the weighted stack fed into the 3D CNN

                sim_turns.append(sim)
                # print('[similarity] after append, len(sim_turns):', len(sim_turns))

            attention_logits = tf.stack(attention_turns, axis=1) # [batch, max_turn_num]
            print('[attention_logits] after stack attention_logits.shape: %s' % attention_logits.shape)
            # add mask in attention following the way in BERT
            # real turn_num is in self.tt_turns_len [batch]
            # return a mask tensor with shape [batch,  conf['max_turn_num']]
            attention_mask = tf.sequence_mask(self.tt_turns_len, self._conf['max_turn_num'],
                                              dtype=tf.float32)
            print('[attention_mask] attention_mask.shape: %s' % attention_mask.shape)
            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            adder = (1.0 - attention_mask) * -10000.0

            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_logits += adder
            attention = tf.nn.softmax(attention_logits) # by default softmax along dim=-1 [batch, max_turn_num]
            print('[attention] attention.shape: %s' % attention.shape)
            self.attention = attention # will print it for visualization

            # cnn and aggregation
            # lyang comments aggregation by 3D CNN layer
            # [3d cnn aggregation] sim shape: (32, 9, 180, 180, 10)
            # conv_0 shape: (32, 9, 180, 180, 16)
            # pooling_0 shape: (32, 3, 60, 60, 16)
            # conv_1 shape: (32, 3, 60, 60, 16)
            # pooling_1 shape: (32, 1, 20, 20, 16)
            # [3d cnn aggregation] final_info: (32, 6400) # [batch * feature_size]
            # [batch, max_turn_num, max_turn_len, max_turn_len, 2*(stack_num+1)]
            # (32, 9, 180, 180, 10)
            sim = tf.stack(sim_turns, axis=1)
            # multiply sim by attention score
            sim = tf.einsum('bijks,bi->bijks', sim, attention)
            print('[3d cnn aggregation] sim shape: %s' % sim.shape)
            with tf.variable_scope('cnn_aggregation'):
                final_info = layers.CNN_3d(sim, self._conf['cnn_3d_oc0'],
                                           self._conf['cnn_3d_oc1'])
                # for udc
                # final_info = layers.CNN_3d(sim, 32, 16)
                # for douban
                # final_info = layers.CNN_3d(sim, 16, 16)

            print('[3d cnn aggregation] final_info: %s' % final_info.shape)
            # loss and train
            with tf.variable_scope('loss'):
                self.loss, self.logits = layers.loss(final_info, self.label)

                self.global_step = tf.Variable(0, trainable=False)
                initial_learning_rate = self._conf['learning_rate']
                self.learning_rate = tf.train.exponential_decay(
                    initial_learning_rate,
                    global_step=self.global_step,
                    decay_steps=400,
                    decay_rate=0.9,
                    staircase=True)

                Optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.optimizer = Optimizer.minimize(
                    self.loss,
                    global_step=self.global_step)

                self.init = tf.global_variables_initializer()
                self.saver = tf.train.Saver(
                    max_to_keep=self._conf["max_to_keep"])
                self.all_variables = tf.global_variables()
                self.all_operations = self._graph.get_operations()
                self.grads_and_vars = Optimizer.compute_gradients(self.loss)

                for grad, var in self.grads_and_vars:
                    if grad is None:
                        print(var)

                self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var)
                                   for grad, var in self.grads_and_vars
                                   if grad is not None]
                self.g_updates = Optimizer.apply_gradients(
                    self.capped_gvs,
                    global_step=self.global_step)

        return self._graph
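The additive attention mask used above (sequence_mask, then adding (1 - mask) * -10000.0 to the logits before softmax) can be checked in isolation; a small self-contained TF 1.x sketch with made-up lengths:

import tensorflow as tf

tf.reset_default_graph()
logits = tf.constant([[0.5, 0.2, 0.9, 0.1]])   # [batch=1, max_turn_num=4]
turns_len = tf.constant([2])                   # only the first 2 turns are real
mask = tf.sequence_mask(turns_len, 4, dtype=tf.float32)   # [[1., 1., 0., 0.]]
masked_logits = logits + (1.0 - mask) * -10000.0
attention = tf.nn.softmax(masked_logits)       # padded turns get ~0 weight
with tf.Session() as sess:
    print(sess.run(attention))                 # approx. [[0.574, 0.426, 0., 0.]]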
Example #6
def self_cross_attention_block(config, Hu, every_turn_len, Hr, response_len):
    """

    :param config:
    :param Hu: shape = (batch_size, max_turn_num, sentence_len, emb_size)
    :param every_turn_len: shape = (batch_size, max_turn_num )
    :param Hr: shape = (batch_size, sentence_len, emb_size)
    :param response_len: shape = (batch_size)
    :return:
    """

    if config['is_positional'] and config['stack_num'] > 0:
        with tf.variable_scope('positional', reuse=tf.AUTO_REUSE):
            Hr = op.positional_encoding_vector(Hr, max_timescale=10)
    Hr_stack = [Hr]

    for index in range(config['stack_num']):
        with tf.variable_scope('self_stack_' + str(index),
                               reuse=tf.AUTO_REUSE):
            # Hr.shape = (batch_size, max_turn_len, emb_size)
            Hr = layers.block(Hr,
                              Hr,
                              Hr,
                              Q_lengths=response_len,
                              K_lengths=response_len)
            Hr_stack.append(Hr)

    # context part
    # a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len, emb_size]
    list_turn_t = tf.unstack(Hu, axis=1)
    list_turn_length = tf.unstack(every_turn_len, axis=1)

    sim_turns = []
    # for every Hu calculate matching vector
    for Hu, t_turn_length in zip(list_turn_t, list_turn_length):
        if config['is_positional'] and config['stack_num'] > 0:
            with tf.variable_scope('positional', reuse=tf.AUTO_REUSE):
                Hu = op.positional_encoding_vector(Hu, max_timescale=10)
        Hu_stack = [Hu]

        for index in range(config['stack_num']):
            with tf.variable_scope('self_stack_' + str(index),
                                   reuse=tf.AUTO_REUSE):
                Hu = layers.block(Hu,
                                  Hu,
                                  Hu,
                                  Q_lengths=t_turn_length,
                                  K_lengths=t_turn_length)

                Hu_stack.append(Hu)

        r_a_t_stack = []
        t_a_r_stack = []
        for index in range(config['stack_num'] + 1):

            with tf.variable_scope('t_attend_r_' + str(index),
                                   reuse=tf.AUTO_REUSE):
                try:
                    t_a_r = layers.block(Hu_stack[index],
                                         Hr_stack[index],
                                         Hr_stack[index],
                                         Q_lengths=t_turn_length,
                                         K_lengths=response_len)
                except ValueError:
                    tf.get_variable_scope().reuse_variables()
                    t_a_r = layers.block(Hu_stack[index],
                                         Hr_stack[index],
                                         Hr_stack[index],
                                         Q_lengths=t_turn_length,
                                         K_lengths=response_len)

            with tf.variable_scope('r_attend_t_' + str(index),
                                   reuse=tf.AUTO_REUSE):
                try:
                    r_a_t = layers.block(Hr_stack[index],
                                         Hu_stack[index],
                                         Hu_stack[index],
                                         Q_lengths=response_len,
                                         K_lengths=t_turn_length)
                except ValueError:
                    tf.get_variable_scope().reuse_variables()
                    r_a_t = layers.block(Hr_stack[index],
                                         Hu_stack[index],
                                         Hu_stack[index],
                                         Q_lengths=response_len,
                                         K_lengths=t_turn_length)

            t_a_r_stack.append(t_a_r)
            r_a_t_stack.append(r_a_t)

        t_a_r_stack.extend(Hu_stack)
        r_a_t_stack.extend(Hr_stack)

        t_a_r = tf.stack(t_a_r_stack, axis=-1)
        r_a_t = tf.stack(r_a_t_stack, axis=-1)

        # calculate similarity matrix
        with tf.variable_scope('similarity', reuse=tf.AUTO_REUSE):
            # sim shape [batch, max_turn_len, max_turn_len, 2*(stack_num+1)]
            # divide sqrt(200) to prevent gradient explosion
            sim = tf.einsum('biks,bjks->bijs', t_a_r, r_a_t) / tf.sqrt(200.0)

        sim_turns.append(sim)

    # cnn and aggregation
    sim = tf.stack(sim_turns, axis=1)
    print('sim shape: %s' % sim.shape)
    with tf.variable_scope('cnn_aggregation', reuse=tf.AUTO_REUSE):
        final_info = layers.CNN_3d(sim, 32, 16)

    with tf.variable_scope('linear', reuse=tf.AUTO_REUSE):
        W = tf.get_variable(name='weights',
                            shape=[final_info.shape[-1], 1],
                            initializer=tf.orthogonal_initializer())
        bias = tf.get_variable(name='bias',
                               shape=[1],
                               initializer=tf.zeros_initializer())

    logits = tf.reshape(tf.matmul(final_info, W) + bias, [-1])

    return logits
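A hedged call sketch for self_cross_attention_block with illustrative shapes; the `layers` and `op` helpers come from the surrounding project and are assumed to be importable.

import tensorflow as tf

config = {'is_positional': False, 'stack_num': 2}
bs, turn_num, sent_len, emb = 16, 9, 50, 200
Hu = tf.placeholder(tf.float32, [bs, turn_num, sent_len, emb])
every_turn_len = tf.placeholder(tf.int32, [bs, turn_num])
Hr = tf.placeholder(tf.float32, [bs, sent_len, emb])
response_len = tf.placeholder(tf.int32, [bs])

logits = self_cross_attention_block(config, Hu, every_turn_len, Hr, response_len)
# logits: [bs], one matching score per context/response pair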
Example #7
    def create_network(self):
        mask_cache = dict() if self.use_mask_cache else None

        response_emb = fluid.layers.embedding(
            input=self.response,
            size=[self._vocab_size + 1, self._emb_size],
            is_sparse=self.use_sparse_embedding,
            param_attr=fluid.ParamAttr(
                name=self.word_emb_name,
                initializer=fluid.initializer.Normal(scale=0.1)))

        # response part
        Hr = response_emb
        Hr_stack = [Hr]

        for index in six.moves.xrange(self._stack_num):
            Hr = layers.block(
                name="response_self_stack" + str(index),
                query=Hr,
                key=Hr,
                value=Hr,
                d_key=self._emb_size,
                q_mask=self.response_mask,
                k_mask=self.response_mask,
                mask_cache=mask_cache)
            Hr_stack.append(Hr)

        # context part
        sim_turns = []
        for t in six.moves.xrange(self._max_turn_num):
            Hu = fluid.layers.embedding(
                input=self.turns_data[t],
                size=[self._vocab_size + 1, self._emb_size],
                is_sparse=self.use_sparse_embedding,
                param_attr=fluid.ParamAttr(
                    name=self.word_emb_name,
                    initializer=fluid.initializer.Normal(scale=0.1)))
            Hu_stack = [Hu]

            for index in six.moves.xrange(self._stack_num):
                # share parameters
                Hu = layers.block(
                    name="turn_self_stack" + str(index),
                    query=Hu,
                    key=Hu,
                    value=Hu,
                    d_key=self._emb_size,
                    q_mask=self.turns_mask[t],
                    k_mask=self.turns_mask[t],
                    mask_cache=mask_cache)
                Hu_stack.append(Hu)

            # cross attention
            r_a_t_stack = []
            t_a_r_stack = []
            for index in six.moves.xrange(self._stack_num + 1):
                t_a_r = layers.block(
                    name="t_attend_r_" + str(index),
                    query=Hu_stack[index],
                    key=Hr_stack[index],
                    value=Hr_stack[index],
                    d_key=self._emb_size,
                    q_mask=self.turns_mask[t],
                    k_mask=self.response_mask,
                    mask_cache=mask_cache)
                r_a_t = layers.block(
                    name="r_attend_t_" + str(index),
                    query=Hr_stack[index],
                    key=Hu_stack[index],
                    value=Hu_stack[index],
                    d_key=self._emb_size,
                    q_mask=self.response_mask,
                    k_mask=self.turns_mask[t],
                    mask_cache=mask_cache)

                t_a_r_stack.append(t_a_r)
                r_a_t_stack.append(r_a_t)

            t_a_r_stack.extend(Hu_stack)
            r_a_t_stack.extend(Hr_stack)

            if self.use_stack_op:
                t_a_r = fluid.layers.stack(t_a_r_stack, axis=1)
                r_a_t = fluid.layers.stack(r_a_t_stack, axis=1)
            else:
                for index in six.moves.xrange(len(t_a_r_stack)):
                    t_a_r_stack[index] = fluid.layers.unsqueeze(
                        input=t_a_r_stack[index], axes=[1])
                    r_a_t_stack[index] = fluid.layers.unsqueeze(
                        input=r_a_t_stack[index], axes=[1])

                t_a_r = fluid.layers.concat(input=t_a_r_stack, axis=1)
                r_a_t = fluid.layers.concat(input=r_a_t_stack, axis=1)

            # sim shape: [batch_size, 2*(stack_num+1), max_turn_len, max_turn_len]
            sim = fluid.layers.matmul(
                x=t_a_r, y=r_a_t, transpose_y=True, alpha=1 / np.sqrt(200.0))
            sim_turns.append(sim)

        if self.use_stack_op:
            sim = fluid.layers.stack(sim_turns, axis=2)
        else:
            for index in six.moves.xrange(len(sim_turns)):
                sim_turns[index] = fluid.layers.unsqueeze(
                    input=sim_turns[index], axes=[2])
            # sim shape: [batch_size, 2*(stack_num+1), max_turn_num, max_turn_len, max_turn_len]
            sim = fluid.layers.concat(input=sim_turns, axis=2)

        final_info = layers.cnn_3d(sim, self._channel1_num, self._channel2_num)
        loss, logits = layers.loss(final_info, self.label)
        return loss, logits
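Across the TF examples the similarity step is einsum('biks,bjks->bijs', ...) / sqrt(200), and the Fluid version computes the same scaled dot product with matmul(..., transpose_y=True, alpha=1/sqrt(200)); only the position of the stack axis differs. A small NumPy check of that reading:

import numpy as np

batch, length, emb, stacks = 2, 5, 200, 4
t_a_r = np.random.rand(batch, length, emb, stacks).astype(np.float32)
r_a_t = np.random.rand(batch, length, emb, stacks).astype(np.float32)

sim_einsum = np.einsum('biks,bjks->bijs', t_a_r, r_a_t) / np.sqrt(200.0)

# the same thing written as an explicit per-stack matmul
sim_matmul = np.stack(
    [t_a_r[..., s] @ r_a_t[..., s].transpose(0, 2, 1) for s in range(stacks)],
    axis=-1) / np.sqrt(200.0)

print(np.allclose(sim_einsum, sim_matmul, rtol=1e-4))  # True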
Example #8
def cc_model(input_x, input_x_mask, input_x_len, input_x2, input_x_mask2, input_x_len2, word_emb, conf, con_c):


    #a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len]
    list_turn_t1 = tf.unstack(input_x, axis=1) 
    list_turn_length1 = tf.unstack(input_x_len, axis=1)
    list_turn_length1 = [tf.sequence_mask(i, conf["max_turn_len"]) for i in list_turn_length1]
    list_turn_length1 = [tf.cast(i, tf.float32) for i in list_turn_length1]

    list_turn_t2 = tf.unstack(input_x2, axis=1) 
    list_turn_length2 = tf.unstack(input_x_len2, axis=1)
    list_turn_length2 = [tf.sequence_mask(i, conf["max_turn_len"]) for i in list_turn_length2]
    list_turn_length2 = [tf.cast(i, tf.float32) for i in list_turn_length2]

    if con_c:
        list_turn_t1 = tf.reshape(input_x, [conf["batch_size"], conf["max_turn_num"]*conf["max_turn_len"]])
        list_turn_t1 = [list_turn_t1]
        list_turn_t2 = tf.reshape(input_x2, [conf["batch_size"], conf["max_turn_num"]*conf["max_turn_len"]])
        list_turn_t2 = [list_turn_t2]
        list_turn_length1 = tf.cast(tf.sequence_mask(input_x_len, conf["max_turn_len"]), tf.float32)
        list_turn_length1 = tf.reshape(list_turn_length1, [conf["batch_size"], conf["max_turn_num"]*conf["max_turn_len"]])
        list_turn_length1 = [list_turn_length1]
        list_turn_length2 = tf.cast(tf.sequence_mask(input_x_len2, conf["max_turn_len"]), tf.float32)
        list_turn_length2 = tf.reshape(list_turn_length2, [conf["batch_size"], conf["max_turn_num"]*conf["max_turn_len"]])
        list_turn_length2 = [list_turn_length2]



    #for every turn_t calculate matching vector
    trans_u1, trans_u2 = [], []
    for turn_t, t_turn_length in zip(list_turn_t1, list_turn_length1):
        Hu = tf.nn.embedding_lookup(word_emb, turn_t) #[batch, max_turn_len, emb_size]
        #Hu = turn_t
        if conf['is_positional'] and conf['stack_num'] > 0:
            with tf.variable_scope('positional_', reuse=tf.AUTO_REUSE):
                Hu = op.positional_encoding_vector(Hu, max_timescale=10)
        for index in range(conf['stack_num']):
            with tf.variable_scope('self_stack_cc' + str(index), reuse=tf.AUTO_REUSE):
                Hu = layers.block(
                    Hu, Hu, Hu,
                    Q_lengths=t_turn_length, K_lengths=t_turn_length, input_mask=True)
        trans_u1.append(Hu)

    for turn_r, r_turn_length in zip(list_turn_t2, list_turn_length2):
        Hu = tf.nn.embedding_lookup(word_emb, turn_r) #[batch, max_turn_len, emb_size]
        #Hu = turn_r
        if conf['is_positional'] and conf['stack_num'] > 0:
            with tf.variable_scope('positional_', reuse=tf.AUTO_REUSE):
                Hu = op.positional_encoding_vector(Hu, max_timescale=10)
        for index in range(conf['stack_num']):
            with tf.variable_scope('self_stack_cc' + str(index), reuse=tf.AUTO_REUSE):
                Hu = layers.block(
                    Hu, Hu, Hu,
                    Q_lengths=r_turn_length, K_lengths=r_turn_length, input_mask=True)
        trans_u2.append(Hu)

    final_info_all = []
    sim_turns_all = []
    for t_index, (turn_t, t_turn_length, Hu) in enumerate(zip(list_turn_t1, list_turn_length1, trans_u1)):
        sim_turns = []
        for r_index, (turn_r, r_turn_length, Hr) in enumerate(zip(list_turn_t2, list_turn_length2, trans_u2)):

            # fixed scope name (the original appended a loop variable `index` leaked from the
            # self-attention stacks above); the try/except handles reuse across turn pairs
            with tf.variable_scope('u_attend_r'):
                try:
                    u_a_r = layers.block(
                        Hu, Hr, Hr,
                        Q_lengths=t_turn_length, K_lengths=r_turn_length, input_mask=True)
                except ValueError:
                    tf.get_variable_scope().reuse_variables()
                    u_a_r = layers.block(
                        Hu, Hr, Hr,
                        Q_lengths=t_turn_length, K_lengths=r_turn_length, input_mask=True)
                    

            with tf.variable_scope('r_attend_u'):
                try:
                    r_a_u = layers.block(
                        Hr, Hu, Hu,
                        Q_lengths=r_turn_length, K_lengths=t_turn_length, input_mask=True)
                except ValueError:
                    tf.get_variable_scope().reuse_variables()
                    r_a_u = layers.block(
                        Hr, Hu, Hu,
                        Q_lengths=r_turn_length, K_lengths=t_turn_length, input_mask=True)
        
            # u_a_r, r_a_u: [batch, max_turn_len, emb]; stack with the original representations
            u_a_r = tf.stack([u_a_r, Hu], axis=-1)
            r_a_u = tf.stack([r_a_u, Hr], axis=-1)
        
            #calculate similarity matrix
            with tf.variable_scope('similarity', reuse=tf.AUTO_REUSE):
                #sim shape [batch, max_turn_len, max_turn_len, 2]
                sim = tf.einsum('biks,bjks->bijs', r_a_u, u_a_r) / tf.sqrt(200.0)
                sim = layers.CNN_FZX(sim)
            final_info_all.append(sim)

    att_weight_print = None
    if not con_c:
        # stack the turn_num1*turn_num2 pair vectors: [batch, num_pairs, feat_dim]
        final_info_all = tf.stack(final_info_all, axis=1)
        max_nei = 5
        gcn_size = conf["max_turn_num"]*conf["max_turn_num"]
        turn_size = conf["max_turn_num"]
        m1 = [ [] for i in range(gcn_size)]
        m_pos = [ [] for i in range(gcn_size)]
        m1_len = [ 0 for i in range(gcn_size)]
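        # m1[k]     : neighbour indices of node k on the turn_size x turn_size grid (self + up/down/left/right)
        # m_pos[k]  : flattened (i, j) grid coordinates of those neighbours
        # m1_len[k] : number of real neighbours before padding, used below to mask the padded slots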
        for i in range(turn_size):
            for j in range(turn_size):
                cur_index = i*turn_size+j
                # self
                m1[cur_index].append(cur_index)
                m_pos[cur_index].extend([i, j])
                # left neighbour (i, j-1)
                if cur_index % turn_size != 0:
                    m1[cur_index].append(cur_index-1)
                    m_pos[cur_index].extend([i, j-1])
                # right neighbour (i, j+1)
                if cur_index % turn_size != turn_size-1:
                    m1[cur_index].append(cur_index+1)
                    m_pos[cur_index].extend([i, j+1])
                # upper neighbour (i-1, j)
                if i != 0:
                    m1[cur_index].append(cur_index-turn_size)
                    m_pos[cur_index].extend([i-1, j])
                # lower neighbour (i+1, j)
                if i != turn_size-1:
                    m1[cur_index].append(cur_index+turn_size)
                    m_pos[cur_index].extend([i+1, j])
                m1_len[cur_index] = len(m1[cur_index])
                # pad neighbour lists (and their positions) up to max_nei with the node itself
                if m1_len[cur_index] < max_nei:
                    pad = max_nei - m1_len[cur_index]
                    m1[cur_index].extend([cur_index] * pad)
                    m_pos[cur_index].extend([i, j] * pad)
        # m1 25 5
        # m1_len 25

        m1 = tf.constant(m1, dtype=tf.int32) # 25 5
        m1_len = tf.constant(m1_len, dtype=tf.int32)
        m_pos = tf.constant(m_pos, dtype=tf.int32)

        def gan(input_m, adjm, adjm_len, adjm_pos, gcn_size, turn_size, max_nei):
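            # A GAT-style aggregation over the turn_size x turn_size grid of pair vectors:
            # each of the 4 layers lets every node attend over its (padded, masked) neighbourhood,
            # sums the projected neighbour features by those attention weights, adds a residual
            # projection of the node itself, and feeds the result to the next layer; the outputs
            # of all 4 layers are concatenated. The sigmoid gate att_out is not fed back into the
            # representation and is only returned (as print_weight) for inspection.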
            #return input_m
            batch_size_gnn = tf.shape(input_m)[0]
            mask_value = tf.cast(tf.sequence_mask(adjm_len, max_nei), tf.float32) # 25 5
            res_all = []
            for gan_index in range(4):
                with tf.variable_scope('gan_layer'+str(gan_index), reuse=tf.AUTO_REUSE):
                    role_emb1 = tf.get_variable(name="gnn_role_emb1", shape=[turn_size, conf["role_dim"]], dtype=tf.float32, initializer=tf.random_normal_initializer(mean=0, stddev=1))
                    role_emb2 = tf.get_variable(name="gnn_role_emb2", shape=[turn_size, conf["role_dim"]], dtype=tf.float32, initializer=tf.random_normal_initializer(mean=0, stddev=1))

                    input_m_exp = tf.expand_dims(input_m, axis=2) # bs 25 1 144
                    input_m_exp = tf.tile(input_m_exp, [1, 1, max_nei, 1]) # bs 25 5 144

                    nei_rep = tf.gather(input_m, adjm, axis=1) # bs 25*5 144
                    nei_rep = tf.reshape(nei_rep, [tf.shape(input_m)[0], gcn_size, max_nei, -1]) # bs 25 5 144

                    att1 = tf.layers.dense(nei_rep, 128, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="gcn") # bs 25 5 128
                    att2 = tf.layers.dense(input_m_exp, 128, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="gcn") # bs 25 5 128


                    pos_index11 = tf.gather(adjm_pos, [0,], axis=1)
                    pos_index12 = tf.gather(adjm_pos, [1,], axis=1)
                    pos_index11 = tf.tile(pos_index11, [1, max_nei])
                    pos_index12 = tf.tile(pos_index12, [1, max_nei])

                    pos_index21 = tf.gather(adjm_pos, [0,2,4,6,8], axis=1)
                    pos_index22 = tf.gather(adjm_pos, [1,3,5,7,9], axis=1)

                    pos_index11 = tf.gather(role_emb1, pos_index11) # 25 5 30
                    pos_index12 = tf.gather(role_emb2, pos_index12) # 25 5 30
                    pos_index21 = tf.gather(role_emb1, pos_index21) # 25 5 30
                    pos_index22 = tf.gather(role_emb2, pos_index22) # 25 5 30

                    pos_index11 = tf.tile(tf.expand_dims(pos_index11, axis=0), [batch_size_gnn,1,1,1])
                    pos_index12 = tf.tile(tf.expand_dims(pos_index12, axis=0), [batch_size_gnn,1,1,1])
                    pos_index21 = tf.tile(tf.expand_dims(pos_index21, axis=0), [batch_size_gnn,1,1,1])
                    pos_index22 = tf.tile(tf.expand_dims(pos_index22, axis=0), [batch_size_gnn,1,1,1])


                    att = tf.concat([att1, att2], axis=-1)
                    #att = tf.concat([att1, att2, pos_index11, pos_index12, pos_index21, pos_index22], axis=-1)
                    att = tf.layers.dense(att, 1, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="gcna") # bs 25 5 128
                    att = tf.reshape(att, [-1, gcn_size, max_nei])
                    att = tf.nn.leaky_relu(att) # bs 25 5

                    att = att * tf.expand_dims(mask_value, axis=0)
                    att = tf.nn.softmax(att, axis=2) # bs 25 5
                    att = att * tf.expand_dims(mask_value, axis=0)

                    nei_rep2 = tf.layers.dense(nei_rep, 128, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="gcnl") # bs 25 5 128
                    nei_rep11 = tf.layers.dense(input_m, 128, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="gcnl") # bs 25 5 128
                    nei_rep2 = nei_rep2 * tf.expand_dims(tf.expand_dims(mask_value, axis=0), axis=-1)

                    res = tf.einsum('bdik,bdi->bdk', nei_rep2, att) # bs 25 128

                    att_input = res+nei_rep11
                    att_out = tf.layers.dense(att_input, 1, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="att" + str(gan_index))
                    att_out = tf.nn.sigmoid(att_out)
                    print_weight = att_out
                    # att_out not used

                    res = res + nei_rep11
                    input_m = res
                    res_all.append(res)
            res_all = tf.concat(res_all, axis=-1)

            return res_all, print_weight

        gan_res, att_weight_print = gan(final_info_all, m1, m1_len, m_pos, gcn_size, turn_size, max_nei)


        final_info_all = gan_res
        final_info_role = []

        role_emb1 = tf.get_variable(name="role_emb1", shape=[len(list_turn_t1), conf["role_dim"]], dtype=tf.float32, initializer=tf.random_normal_initializer(mean=0, stddev=1))
        role_emb2 = tf.get_variable(name="role_emb2", shape=[len(list_turn_t2), conf["role_dim"]], dtype=tf.float32, initializer=tf.random_normal_initializer(mean=0, stddev=1))
        for i, ii in enumerate(list_turn_t1):
            for j, jj in enumerate(list_turn_t2):
                role_con = tf.concat([role_emb1[i], role_emb2[j]], axis=0)
                final_info_role.append(role_con)
        final_info_role = tf.stack(final_info_role, axis=0) # 9 50
        final_info_role = tf.expand_dims(final_info_role, 0) # 1 9 50
        final_info_role = tf.tile(final_info_role, [tf.shape(final_info_all)[0], 1, 1], name="role_con")
        final_info_all_att = tf.concat([final_info_role, final_info_all], axis=2)

        final_info_all_att = tf.reshape(final_info_all_att, [-1, final_info_all_att.get_shape()[-1]]) # bs*9 144
        final_info_all_att = tf.layers.dense(final_info_all_att, 1, kernel_initializer=tf.contrib.layers.xavier_initializer())
        final_info_all_att = tf.squeeze(final_info_all_att, [1])
        final_info_all_att = tf.reshape(final_info_all_att, [-1, final_info_all.get_shape()[1]]) # 100 9
        final_info_all_att = tf.nn.softmax(final_info_all_att, axis=1)

        final_info_all_att = tf.expand_dims(final_info_all_att, -1)
        final_info_all_max = tf.reduce_max(final_info_all, axis=1)
        final_info_all_mean = tf.reduce_mean(final_info_all, axis=1)
        final_info_all =  final_info_all * final_info_all_att
        final_info_all = tf.reduce_sum(final_info_all, axis=1)

        final_info_all = tf.concat([final_info_all_mean, final_info_all_max, final_info_all], axis=1)

    else:
        final_info_all = final_info_all[0]
    return final_info_all, att_weight_print
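
# A minimal NumPy sketch (illustrative only) of the readout at the end of cc_model:
# the turn-pair matching vectors are pooled three ways -- mean, max, and a softmax
# attention over pairs -- and the pooled vectors are concatenated. The attention scores
# here are random stand-ins for the dense-layer scores computed in the model.
import numpy as np

batch, num_pairs, feat = 3, 9, 144                   # e.g. 3x3 turn pairs, 144-d matching features
pair_vecs = np.random.rand(batch, num_pairs, feat).astype(np.float32)
scores = np.random.rand(batch, num_pairs).astype(np.float32)     # stand-in for the learned scorer

att = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)  # softmax over the pair axis
pooled_att = (pair_vecs * att[:, :, None]).sum(axis=1)
pooled_max = pair_vecs.max(axis=1)
pooled_mean = pair_vecs.mean(axis=1)

readout = np.concatenate([pooled_mean, pooled_max, pooled_att], axis=1)
print(readout.shape)                                 # (3, 432)
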
Example #9
0
def cs_model(input_x,
             input_x_mask,
             input_x_len,
             input_x2,
             input_x_mask2,
             input_x_len2,
             input_x3,
             input_x_mask3,
             input_x_len3,
             word_emb,
             conf,
             initializer_opt=None):
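    # cs_model matches the current context (input_x) against auxiliary "history" (input_x2)
    # and/or "future" (input_x3) segments, selected via conf["cs_type"], optionally through
    # static and dynamic memory layers, and concatenates the per-branch matching vectors.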

    turn_num1 = input_x.shape[1]   # conf["max_turn_num"] (or tf.shape(input_x)[1])
    turn_num2 = input_x2.shape[1]  # conf["max_turn_num"] (or tf.shape(input_x2)[1])
    turn_num3 = input_x3.shape[1]  # conf["max_turn_num"] (or tf.shape(input_x3)[1])
    sent_len = conf["max_turn_len"]
    emb_dim = conf["emb_size"]
    matchin_include_x = conf["matchin_include_x"]

    data_type = []
    if "history" in conf["cs_type"]: data_type.append("history")
    if "future" in conf["cs_type"]: data_type.append("future")

    merge_hf = False

    # EMB
    x_e = tf.nn.embedding_lookup(word_emb, input_x)
    x2_e = tf.nn.embedding_lookup(word_emb, input_x2)
    x3_e = tf.nn.embedding_lookup(word_emb, input_x3)

    x_e_mb = tf.reshape(x_e, [-1, sent_len, emb_dim])
    x2_e_mb = tf.reshape(x2_e, [-1, sent_len, emb_dim])
    x3_e_mb = tf.reshape(x3_e, [-1, sent_len, emb_dim])

    x_len_mb = tf.reshape(input_x_len, [-1])
    x_len2_mb = tf.reshape(input_x_len2, [-1])
    x_len3_mb = tf.reshape(input_x_len3, [-1])

    x_mask = tf.to_float(input_x_mask)  # bs turn_num1 sent_len
    x2_mask = tf.to_float(input_x_mask2)  # bs turn_num2 sent_len
    x3_mask = tf.to_float(input_x_mask3)  # bs turn_num3 sent_len

    # ==================================== Encoder Layer =============================
    with tf.variable_scope("Encode", reuse=tf.AUTO_REUSE):
        with tf.variable_scope('enc_self_att', reuse=tf.AUTO_REUSE):
            x_enc_mb = layers.block(
                x_e_mb, x_e_mb, x_e_mb, Q_lengths=x_len_mb,
                K_lengths=x_len_mb)  # bs*turn_num1 sent_len emb
            x2_enc_mb = layers.block(
                x2_e_mb,
                x2_e_mb,
                x2_e_mb,
                Q_lengths=x_len2_mb,
                K_lengths=x_len2_mb)  # bs*turn_num2 sent_len emb
            x3_enc_mb = layers.block(
                x3_e_mb,
                x3_e_mb,
                x3_e_mb,
                Q_lengths=x_len3_mb,
                K_lengths=x_len3_mb)  # bs*turn_num2 sent_len emb

    x_enc = tf.reshape(
        x_enc_mb,
        [-1, turn_num1, sent_len, emb_dim])  # bs turn_num1 sent_len emb
    x_mask_flat = tf.reshape(
        x_mask, [-1, turn_num1 * sent_len])  # bs turn_num1*sent_len
    x_enc_ts = tf.reshape(x_enc, [-1, turn_num1 * sent_len, emb_dim])
    x_mask_ts = tf.reshape(x_mask_flat, [-1, turn_num1 * sent_len])

    iter_rep = []
    input_all_dict = {"history": [], "future": []}
    input_all_dict["history"] = [
        x2_e_mb, x2_enc_mb, x_len2_mb, turn_num2, x2_mask
    ]
    input_all_dict["future"] = [
        x3_e_mb, x3_enc_mb, x_len3_mb, turn_num3, x3_mask
    ]
    save_dynamic_dict = {}
    all_mem_weight_dict = {}
    sim_ori_all = []
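    # iter_rep collects the matching vectors produced by each branch and stage;
    # save_dynamic_dict caches per-branch tensors for the global and aggregation stages below;
    # all_mem_weight_dict and sim_ori_all keep the memory-attention weights and raw similarity
    # maps so the caller can inspect them.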

    for match_type in data_type:
        x5_e_mb, x5_enc_mb, x_len5_mb, turn_num5, x5_mask = input_all_dict[
            match_type]
        scope_name = "Model"
        with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE):

            x5_enc = tf.reshape(x5_enc_mb,
                                [-1, turn_num5, sent_len, emb_dim])  # bs turn_num5 sent_len emb

            x5_enc_ts = tf.reshape(x5_enc_mb,
                                   [-1, turn_num5 * sent_len, emb_dim])
            x5_mask_ts = tf.reshape(x5_mask, [-1, turn_num5 * sent_len])

            # ==================================== Static Memory Layer =============================
            if conf["use_static_memory"]:
                with tf.variable_scope("static_memory", reuse=tf.AUTO_REUSE):

                    if merge_hf:
                        x_enc_ts = tf.reshape(
                            x_enc, [-1, turn_num1 * sent_len, emb_dim])
                        x_mask_ts = tf.reshape(x_mask_flat,
                                               [-1, turn_num1 * sent_len])
                        iter_rep_per1, iter_rep_per5 = match_layer(
                            x5_enc_ts,
                            x5_mask_ts,
                            x_enc_ts,
                            x_mask_ts,
                            emb_dim,
                            initializer_opt,
                            turn_num5=None)
                        iter_rep_per_out = tf.concat(
                            [iter_rep_per1, iter_rep_per5], axis=1)
                        print(iter_rep_per_out.shape)
                        iter_rep.append(iter_rep_per_out)
                    else:
                        # x5_enc_mb already has shape [bs*turn_num5, sent_len, emb_dim]
                        x5_mask_mb = tf.reshape(x5_mask, [-1, sent_len])
                        x_enc_mb2_ts = tf.reshape(
                            tf.tile(tf.expand_dims(x_enc, axis=1),
                                    [1, turn_num5, 1, 1, 1]),
                            [-1, turn_num1 * sent_len, emb_dim
                             ])  # bs*turn_num2 sent_len*turn_num1 emb_dim
                        x_mask_mb2_ts = tf.reshape(
                            tf.tile(tf.expand_dims(x_mask_flat, axis=1),
                                    [1, turn_num5, 1]),
                            [-1, turn_num1 * sent_len])
                        x5_enc_ts = tf.reshape(
                            x5_enc_mb, [-1, turn_num5 * sent_len, emb_dim])
                        x5_e_ts = tf.reshape(
                            x5_e_mb, [-1, turn_num5 * sent_len, emb_dim])
                        x5_mask_ts = tf.reshape(x5_mask,
                                                [-1, turn_num5 * sent_len])
                        x_e_ts = tf.reshape(
                            x_e, [-1, turn_num1 * sent_len, emb_dim])
                        iter_rep_per_out, sim_ori = match_layer_selfatt(
                            x5_e_ts,
                            x5_enc_ts,
                            x5_mask_ts,
                            x_e_ts,
                            x_enc_ts,
                            x_mask_ts,
                            emb_dim,
                            initializer_opt,
                            turn_num5=None,
                            matchin_include_x=matchin_include_x)
                        sim_ori_all.append(sim_ori)
                        iter_rep.append(iter_rep_per_out)

            # ==================================== Dynamic Memory Layer Local =============================
            if conf["use_dynamic_memory"]:
                with tf.variable_scope("dynamic_memory", reuse=tf.AUTO_REUSE):
                    x5_enc_list = tf.unstack(
                        x5_enc, axis=1)  # bs [turn_num2] sent_len emb_dim
                    x5_mask_list = tf.unstack(
                        x5_mask, axis=1)  # bs [turn_num2] sent_len

                    x_enc_list = tf.unstack(x_enc, axis=1)
                    x_mask_list = tf.unstack(x_mask, axis=1)
                    x_enc_cur_list, x_mask_cur_list, _, _, _, _, all_mem_weight = mem_all_update(
                        x_enc_list,
                        x_mask_list,
                        initializer_opt,
                        need_reverse=True)
                    all_mem_weight = tf.stack(all_mem_weight, axis=1)
                    all_mem_weight_dict[match_type + "_query"] = all_mem_weight

                    x_enc_mb2_ts = tf.reshape(
                        x_enc_cur_list, [-1, turn_num1 * sent_len, emb_dim])
                    x_mask_mb2_ts = tf.reshape(x_mask_cur_list,
                                               [-1, turn_num1 * sent_len])

                    if match_type == "history": need_reverse = True
                    else: need_reverse = False

                    x5_enc_cur_list, x5_mask_cur_list, x5_enc_list, x5_mask_list, x5_enc_cur_last, x5_mask_cur_last, all_mem_weight = mem_all_update(
                        x5_enc_list,
                        x5_mask_list,
                        initializer_opt,
                        need_reverse=need_reverse)
                    all_mem_weight = tf.stack(all_mem_weight, axis=1)
                    all_mem_weight_dict[match_type] = all_mem_weight

                    turn_xishu = turn_num5
                    # else: turn_xishu=1
                    x5_enc_cur_list = tf.reshape(
                        x5_enc_cur_list, [-1, turn_xishu * sent_len, emb_dim])
                    x5_mask_cur_list = tf.reshape(x5_mask_cur_list,
                                                  [-1, turn_xishu * sent_len])

                    save_dynamic_dict[match_type] = [
                        x5_e_mb, x5_enc_cur_list, x5_mask_cur_list,
                        x_enc_mb2_ts, x_mask_mb2_ts, x5_enc_list, x5_mask_list,
                        turn_num5, x5_enc_ts, x5_mask_ts, x5_enc_cur_last,
                        x5_mask_cur_last
                    ]

    # ==================================== Dynamic Memory Layer Global =============================
    data_type_dm2 = copy.deepcopy(data_type)
    # data_type_dm2 = []
    if conf["use_dynamic_memory"] and conf["dynamic_memory_global"]:
        for match_type in data_type:
            scope_name = "Model"
            with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE):
                with tf.variable_scope("dynamic_memory_part2",
                                       reuse=tf.AUTO_REUSE):
                    x5_e_mb, x5_enc_cur_list, x5_mask_cur_list, x_enc_mb2_ts, x_mask_mb2_ts, x5_enc_list, x5_mask_list, turn_num5, x5_enc_ts, x5_mask_ts, x5_enc_cur_last, x5_mask_cur_last = save_dynamic_dict[
                        match_type]

                    h1 = save_dynamic_dict["history"][-4]
                    h2 = save_dynamic_dict["history"][-3]
                    hq1 = x_enc_ts
                    hq2 = x_mask_ts
                    f1 = save_dynamic_dict["future"][-4]
                    f2 = save_dynamic_dict["future"][-3]
                    g_last_his = [h1, h2]
                    g_last_fut = [f1, f2]

                    # the per-branch context above is immediately replaced: the global memory
                    # update attends over the concatenation of the history, future and query
                    # encodings (and their masks) instead of the separate history/future pairs
                    g_last_fut = None
                    g1 = tf.concat([h1, f1, hq1], axis=1)
                    g2 = tf.concat([h2, f2, hq2], axis=1)
                    g_last_his = [g1, g2]
                    x5_mask_cur_last, x5_enc_cur_last = None, None

                    x_enc_cur_list, x_mask_cur_list, _, _, _, _, all_mem_weight = mem_all_update(
                        x_enc_list,
                        x_mask_list,
                        initializer_opt,
                        need_reverse=True,
                        g_last_his=g_last_his,
                        g_last_fut=g_last_fut)

                    x5_enc_cur_list, x5_mask_cur_list, x5_enc_list, x5_mask_list, x5_enc_cur_last, x5_mask_cur_last, all_mem_weight = mem_all_update(
                        x5_enc_list,
                        x5_mask_list,
                        initializer_opt,
                        x_enc_cur_last=x5_enc_cur_last,
                        x_mask_cur_last=x5_mask_cur_last,
                        g_last_his=g_last_his,
                        g_last_fut=g_last_fut)

                    x_enc_mb2_ts = tf.reshape(
                        x_enc_cur_list, [-1, turn_num1 * sent_len, emb_dim])
                    x_mask_mb2_ts = tf.reshape(x_mask_cur_list,
                                               [-1, turn_num1 * sent_len])

                    turn_xishu = turn_num5
                    x5_enc_cur_list = tf.reshape(
                        x5_enc_cur_list, [-1, turn_xishu * sent_len, emb_dim])
                    x5_mask_cur_list = tf.reshape(x5_mask_cur_list,
                                                  [-1, turn_xishu * sent_len])

                    save_dynamic_dict[match_type + "_global"] = [
                        x5_e_mb, x5_enc_cur_list, x5_mask_cur_list,
                        x_enc_mb2_ts, x_mask_mb2_ts, x5_enc_list, x5_mask_list,
                        turn_num5, x5_enc_ts, x5_mask_ts, x5_enc_cur_last,
                        x5_mask_cur_last
                    ]
                    data_type_dm2.append(match_type + "_global")

    # ==================================== Dynamic Memory Layer AGG =============================
    for match_type in data_type_dm2:
        scope_name = "Model"
        # if conf["sepqrate_cs"]: scope_name = "Model"+match_type
        with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE):
            if conf["use_dynamic_memory"]:
                scope_name1 = "dynamic_memory_part3" if "global" not in match_type else "dynamic_memory_part3_global"
                with tf.variable_scope(scope_name1, reuse=tf.AUTO_REUSE):
                    x5_e_mb, x5_enc_cur_list, x5_mask_cur_list, x_enc_mb2_ts, x_mask_mb2_ts, _, _, turn_num5, _, _, _, _ = save_dynamic_dict[
                        match_type]

                    x_e_ts = tf.reshape(x_e,
                                        [-1, turn_num1 * sent_len, emb_dim])
                    x5_e_ts = tf.reshape(x5_e_mb,
                                         [-1, turn_num5 * sent_len, emb_dim])
                    iter_rep_per_out, sim_ori = match_layer_selfatt(
                        x5_e_ts,
                        x5_enc_cur_list,
                        x5_mask_cur_list,
                        x_e_ts,
                        x_enc_mb2_ts,
                        x_mask_mb2_ts,
                        emb_dim,
                        initializer_opt,
                        turn_num5=None,
                        matchin_include_x=matchin_include_x)
                    iter_rep.append(iter_rep_per_out)

    iter_rep_con = tf.concat(iter_rep, axis=1)
    return iter_rep_con, iter_rep, all_mem_weight_dict, save_dynamic_dict, sim_ori_all
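
# Illustrative NumPy sketch (not from the original code) of the reshape/tile bookkeeping that
# cs_model relies on: a [batch, turn, len, emb] tensor is viewed either per utterance
# ([batch*turn, len, emb]) for self-attention or as one flat token sequence
# ([batch, turn*len, emb]) for cross-segment matching, and the query context is tiled once per
# history/future turn before being flattened. All sizes below are arbitrary examples.
import numpy as np

bs, turn_num1, turn_num5, sent_len, emb_dim = 2, 3, 4, 6, 8
x_enc = np.random.rand(bs, turn_num1, sent_len, emb_dim)

x_enc_mb = x_enc.reshape(-1, sent_len, emb_dim)                  # [bs*turn_num1, sent_len, emb]
x_enc_ts = x_enc.reshape(bs, turn_num1 * sent_len, emb_dim)      # [bs, turn_num1*sent_len, emb]

# tile the query context once per history/future turn, then flatten each copy:
x_enc_tiled = np.tile(x_enc[:, None], (1, turn_num5, 1, 1, 1))          # [bs, turn5, turn1, len, emb]
x_enc_mb2_ts = x_enc_tiled.reshape(-1, turn_num1 * sent_len, emb_dim)   # [bs*turn5, turn1*len, emb]
print(x_enc_mb.shape, x_enc_ts.shape, x_enc_mb2_ts.shape)
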
def dam_model(input_x,
              input_x_mask,
              input_y,
              input_y_mask,
              word_emb,
              keep_rate,
              conf,
              x_len=None,
              y_len=None):
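    # DAM-style matching: self-attention stacks over the response (Hr) and every context turn
    # (Hu), bidirectional cross-attention at each depth, and a stacked similarity cube
    # aggregated by a 3-D CNN into final_info.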

    Hr = tf.nn.embedding_lookup(word_emb, input_y)

    if conf['is_positional'] and conf['stack_num'] > 0:
        with tf.variable_scope('positional'):
            Hr = op.positional_encoding_vector(Hr, max_timescale=10)
    Hr_stack = [Hr]

    for index in range(conf['stack_num']):
        with tf.variable_scope('self_stack_cr_' + str(index)):
            Hr = layers.block(Hr, Hr, Hr, Q_lengths=y_len, K_lengths=y_len)
            Hr_stack.append(Hr)

    #context part
    #a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len]
    list_turn_t = tf.unstack(input_x, axis=1)
    list_turn_length = tf.unstack(x_len, axis=1)

    sim_turns = []
    #for every turn_t calculate matching vector
    for turn_t, t_turn_length in zip(list_turn_t, list_turn_length):
        Hu = tf.nn.embedding_lookup(word_emb,
                                    turn_t)  #[batch, max_turn_len, emb_size]

        if conf['is_positional'] and conf['stack_num'] > 0:
            with tf.variable_scope('positional', reuse=True):
                Hu = op.positional_encoding_vector(Hu, max_timescale=10)
        Hu_stack = [Hu]

        for index in range(conf['stack_num']):

            with tf.variable_scope('self_stack_cr_' + str(index), reuse=True):
                Hu = layers.block(Hu,
                                  Hu,
                                  Hu,
                                  Q_lengths=t_turn_length,
                                  K_lengths=t_turn_length)

                Hu_stack.append(Hu)

        r_a_t_stack = []
        t_a_r_stack = []
        for index in range(conf['stack_num'] + 1):

            with tf.variable_scope('t_attend_r_cr_' + str(index)):
                try:
                    t_a_r = layers.block(Hu_stack[index],
                                         Hr_stack[index],
                                         Hr_stack[index],
                                         Q_lengths=t_turn_length,
                                         K_lengths=y_len)
                except ValueError:
                    tf.get_variable_scope().reuse_variables()
                    t_a_r = layers.block(Hu_stack[index],
                                         Hr_stack[index],
                                         Hr_stack[index],
                                         Q_lengths=t_turn_length,
                                         K_lengths=y_len)

            with tf.variable_scope('r_attend_t_cr_' + str(index)):
                try:
                    r_a_t = layers.block(Hr_stack[index],
                                         Hu_stack[index],
                                         Hu_stack[index],
                                         Q_lengths=y_len,
                                         K_lengths=t_turn_length)
                except ValueError:
                    tf.get_variable_scope().reuse_variables()
                    r_a_t = layers.block(Hr_stack[index],
                                         Hu_stack[index],
                                         Hu_stack[index],
                                         Q_lengths=y_len,
                                         K_lengths=t_turn_length)

            t_a_r_stack.append(t_a_r)
            r_a_t_stack.append(r_a_t)

        t_a_r_stack.extend(Hu_stack)
        r_a_t_stack.extend(Hr_stack)

        t_a_r = tf.stack(t_a_r_stack, axis=-1)
        r_a_t = tf.stack(r_a_t_stack, axis=-1)

        #calculate similarity matrix
        with tf.variable_scope('similarity'):
            # sim shape [batch, max_turn_len, max_turn_len, 2*(stack_num+1)]
            # divide sqrt(200) to prevent gradient explosion
            sim = tf.einsum('biks,bjks->bijs', t_a_r, r_a_t) / tf.sqrt(200.0)

        sim_turns.append(sim)

    #cnn and aggregation
    sim = tf.stack(sim_turns, axis=1)
    print('sim shape: %s' % sim.shape)
    with tf.variable_scope('cnn_aggregation'):
        final_info = layers.CNN_3d(sim, 32, 16)
        #for douban
        #final_info = layers.CNN_3d(sim, 16, 16)

    return final_info
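
# A minimal NumPy sketch (illustrative only) of the similarity cube built in dam_model:
# for each turn, the stacked representations are matched token by token with
# einsum('biks,bjks->bijs') and scaled by 1/sqrt(200), and the per-turn maps are stacked
# into the 5-D cube that the 3-D CNN aggregates. Sizes are arbitrary examples; with
# stack_num = 3 the channel count is 2*(stack_num+1) = 8.
import numpy as np

batch, max_turn_num, max_turn_len, emb, channels = 2, 3, 5, 200, 8
sim_turns = []
for _ in range(max_turn_num):
    t_a_r = np.random.rand(batch, max_turn_len, emb, channels)
    r_a_t = np.random.rand(batch, max_turn_len, emb, channels)
    sim = np.einsum('biks,bjks->bijs', t_a_r, r_a_t) / np.sqrt(200.0)
    sim_turns.append(sim)              # [batch, max_turn_len, max_turn_len, channels]

cube = np.stack(sim_turns, axis=1)     # [batch, max_turn_num, max_turn_len, max_turn_len, channels]
print(cube.shape)                      # (2, 3, 5, 5, 8)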