Example #1
    def build_graph(self):
        with self._graph.as_default():
            rand_seed = self._conf['rand_seed']
            tf.set_random_seed(rand_seed)

            #word embedding
            if self._word_embedding_init is not None:
                word_embedding_initializer = tf.constant_initializer(
                    self._word_embedding_init)
            else:
                word_embedding_initializer = tf.random_normal_initializer(
                    stddev=0.1)

            self._word_embedding = tf.get_variable(
                name='word_embedding',
                shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']],
                dtype=tf.float32,
                initializer=word_embedding_initializer)

            #define placeholders
            self.turns = tf.placeholder(tf.int32,
                                        shape=[
                                            self._conf["batch_size"],
                                            self._conf["max_turn_num"],
                                            self._conf["max_turn_len"]
                                        ])

            self.tt_turns_len = tf.placeholder(
                tf.int32, shape=[self._conf["batch_size"]])

            self.every_turn_len = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_num"]])

            self.response = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_len"]])

            self.response_len = tf.placeholder(
                tf.int32, shape=[self._conf["batch_size"]])

            self.label = tf.placeholder(tf.float32,
                                        shape=[self._conf["batch_size"]])

            #define operations
            #response part
            Hr = tf.nn.embedding_lookup(self._word_embedding, self.response)

            if self._conf['is_positional'] and self._conf['stack_num'] > 0:
                with tf.variable_scope('positional'):
                    Hr = op.positional_encoding_vector(Hr, max_timescale=10)

            for index in range(self._conf['stack_num']):
                with tf.variable_scope('self_stack_' + str(index)):
                    Hr = layers.block(Hr,
                                      Hr,
                                      Hr,
                                      Q_lengths=self.response_len,
                                      K_lengths=self.response_len)

            #context part
            #a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len]
            list_turn_t = tf.unstack(self.turns, axis=1)
            list_turn_length = tf.unstack(self.every_turn_len, axis=1)

            sim_turns = []
            #for every turn_t calculate matching vector
            for turn_t, t_turn_length in zip(list_turn_t, list_turn_length):
                Hu = tf.nn.embedding_lookup(
                    self._word_embedding,
                    turn_t)  #[batch, max_turn_len, emb_size]

                if self._conf['is_positional'] and self._conf['stack_num'] > 0:
                    with tf.variable_scope('positional', reuse=True):
                        Hu = op.positional_encoding_vector(Hu,
                                                           max_timescale=10)

                for index in range(self._conf['stack_num']):

                    with tf.variable_scope('self_stack_' + str(index),
                                           reuse=True):
                        Hu = layers.block(Hu,
                                          Hu,
                                          Hu,
                                          Q_lengths=t_turn_length,
                                          K_lengths=t_turn_length)

                with tf.variable_scope('u_attentd_r_' + str(index)):
                    try:
                        u_a_r = layers.block(Hu,
                                             Hr,
                                             Hr,
                                             Q_lengths=t_turn_length,
                                             K_lengths=self.response_len)
                    except ValueError:
                        tf.get_variable_scope().reuse_variables()
                        u_a_r = layers.block(Hu,
                                             Hr,
                                             Hr,
                                             Q_lengths=t_turn_length,
                                             K_lengths=self.response_len)

                with tf.variable_scope('r_attend_u_' + str(index)):
                    try:
                        r_a_u = layers.block(Hr,
                                             Hu,
                                             Hu,
                                             Q_lengths=self.response_len,
                                             K_lengths=t_turn_length)
                    except ValueError:
                        tf.get_variable_scope().reuse_variables()
                        r_a_u = layers.block(Hr,
                                             Hu,
                                             Hu,
                                             Q_lengths=self.response_len,
                                             K_lengths=t_turn_length)

                u_a_r = tf.stack([u_a_r, Hu], axis=-1)
                r_a_u = tf.stack([r_a_u, Hr], axis=-1)

                #calculate similarity matrix
                with tf.variable_scope('similarity'):
                    # sim shape [batch, max_turn_len, max_turn_len, 2]
                    # divide sqrt(200) to prevent gradient explosion
                    sim = tf.einsum('biks,bjks->bijs', r_a_u,
                                    u_a_r) / tf.sqrt(200.0)

                sim_turns.append(sim)

            #cnn and aggregation
            sim = tf.stack(sim_turns, axis=1)
            print('sim shape: %s' % sim.shape)
            with tf.variable_scope('cnn_aggregation'):
                final_info = layers.CNN_3d(sim, 32, 16)
                #for douban
                #final_info = layers.CNN_3d(sim, 16, 16)

            #loss and train
            with tf.variable_scope('loss'):
                self.loss, self.logits = layers.loss(final_info, self.label)

                self.global_step = tf.Variable(0, trainable=False)
                initial_learning_rate = self._conf['learning_rate']
                self.learning_rate = tf.train.exponential_decay(
                    initial_learning_rate,
                    global_step=self.global_step,
                    decay_steps=400,
                    decay_rate=0.9,
                    staircase=True)

                Optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.optimizer = Optimizer.minimize(self.loss)

                self.init = tf.global_variables_initializer()
                self.saver = tf.train.Saver(
                    max_to_keep=self._conf["max_to_keep"])
                self.all_variables = tf.global_variables()
                self.all_operations = self._graph.get_operations()
                self.grads_and_vars = Optimizer.compute_gradients(self.loss)

                for grad, var in self.grads_and_vars:
                    if grad is None:
                        print(var)

                self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var)
                                   for grad, var in self.grads_and_vars
                                   if grad is not None]
                self.g_updates = Optimizer.apply_gradients(
                    self.capped_gvs, global_step=self.global_step)

        return self._graph
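
The 'similarity' scope above reduces the two stacked representations to a word-by-word interaction cube: a dot product over the embedding axis, scaled by sqrt(200) (the embedding size assumed here). The following minimal NumPy sketch, with toy dimensions chosen purely for illustration, reproduces just that einsum to make the output shape explicit:

import numpy as np

# Toy sizes (assumptions, not taken from the config above).
batch, turn_len, emb_size, stacks = 2, 5, 200, 2
r_a_u = np.random.randn(batch, turn_len, emb_size, stacks)
u_a_r = np.random.randn(batch, turn_len, emb_size, stacks)

# 'biks,bjks->bijs': contract over the embedding axis k, keep the batch axis b
# and the stacked-representation axis s, then rescale as in the graph.
sim = np.einsum('biks,bjks->bijs', r_a_u, u_a_r) / np.sqrt(200.0)
print(sim.shape)  # (2, 5, 5, 2)
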
    def build_graph(self):
        with self._graph.as_default():
            if self._conf['rand_seed'] is not None:
                rand_seed = self._conf['rand_seed']
                tf.set_random_seed(rand_seed)
                print('set tf random seed: %s' % self._conf['rand_seed'])

            #word embedding
            if self._word_embedding_init is not None:
                word_embedding_initializer = tf.constant_initializer(
                    self._word_embedding_init)
            else:
                word_embedding_initializer = tf.random_normal_initializer(
                    stddev=0.1)

            self._word_embedding = tf.get_variable(
                name='word_embedding',
                shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']],
                dtype=tf.float32,
                initializer=word_embedding_initializer)

            #define placeholders
            #config max_turn_history_num
            self.turns_history = tf.placeholder(
                tf.int32,
                shape=[
                    self._conf["batch_size"],
                    self._conf["max_turn_history_num"],
                    self._conf["max_turn_len"]
                ])

            self.turns = tf.placeholder(tf.int32,
                                        shape=[
                                            self._conf["batch_size"],
                                            self._conf["max_turn_num"],
                                            self._conf["max_turn_len"]
                                        ])

            self.tt_turns_len = tf.placeholder(
                tf.int32, shape=[self._conf["batch_size"]])

            self.every_turn_len = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_num"]])

            self.response = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_len"]])

            self.response_len = tf.placeholder(
                tf.int32, shape=[self._conf["batch_size"]])

            self.label = tf.placeholder(tf.float32,
                                        shape=[self._conf["batch_size"]])

            #define operations
            #response part
            Hr = tf.nn.embedding_lookup(self._word_embedding, self.response)
            turns_history_embedding = tf.nn.embedding_lookup(
                self._word_embedding, self.turns_history)

            if self._conf['is_positional'] and self._conf['stack_num'] > 0:
                with tf.variable_scope('positional'):
                    Hr = op.positional_encoding_vector(Hr, max_timescale=10)
            Hr_stack = [Hr]

            _batch_size, _turn_nums, _turn_words, _emb_size = turns_history_embedding.get_shape(
            ).as_list()
            turns_history_embedding = tf.reshape(turns_history_embedding,
                                                 [-1, _turn_words, _emb_size])

            for index in range(self._conf['stack_num']):
                turns_history_embedding, _ = self._multihead(
                    turns_history_embedding, turns_history_embedding,
                    turns_history_embedding)

            turns_history_embedding = tf.reshape(
                turns_history_embedding,
                [_batch_size, _turn_nums, _turn_words, _emb_size])

            for index in range(self._conf['stack_num']):
                with tf.variable_scope('self_stack_' + str(index)):
                    Hr = layers.block(Hr,
                                      Hr,
                                      Hr,
                                      Q_lengths=self.response_len,
                                      K_lengths=self.response_len)
                    Hr_stack.append(Hr)

            with tf.variable_scope('respone_extraction_history'):
                turn_important_inf = []
                # need to add a fully-connected layer here
                for _t in tf.split(turns_history_embedding,
                                   self._conf['max_turn_history_num'], 1):
                    _t = tf.squeeze(_t)
                    #_match_result = layers.attention(Hr_stack[-1], _t,  _t, self.response_len, self.response_len)
                    _match_result = layers.attention(
                        self._dense1(Hr_stack[-1]), _t, _t, self.response_len,
                        self.response_len)
                    turn_important_inf.append(tf.expand_dims(_match_result, 1))

            best_turn_match = tf.concat(turn_important_inf, 1)
            with tf.variable_scope('response_extraciton_best_information'):
                #best_information,_ = self._multihead(Hr_stack[-1], best_turn_match, best_turn_match)
                best_information, _ = self._multihead(
                    self._dense2(Hr_stack[-1]), best_turn_match,
                    best_turn_match)
                best_information = layers.FFN(best_information)

            #context part
            #a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len]
            list_turn_t = tf.unstack(self.turns, axis=1)
            list_turn_length = tf.unstack(self.every_turn_len, axis=1)

            sim_turns = []
            #for every turn_t calculate matching vector
            for turn_t, t_turn_length in zip(list_turn_t, list_turn_length):
                Hu = tf.nn.embedding_lookup(
                    self._word_embedding,
                    turn_t)  #[batch, max_turn_len, emb_size]

                if self._conf['is_positional'] and self._conf['stack_num'] > 0:
                    with tf.variable_scope('positional', reuse=True):
                        Hu = op.positional_encoding_vector(Hu,
                                                           max_timescale=10)
                Hu_stack = [Hu]

                for index in range(self._conf['stack_num']):

                    with tf.variable_scope('self_stack_' + str(index),
                                           reuse=True):
                        Hu = layers.block(Hu,
                                          Hu,
                                          Hu,
                                          Q_lengths=t_turn_length,
                                          K_lengths=t_turn_length)

                        Hu_stack.append(Hu)

                r_a_t_stack = []
                t_a_r_stack = []
                for index in range(self._conf['stack_num'] + 1):

                    with tf.variable_scope('t_attend_r_' + str(index)):
                        try:
                            t_a_r = layers.block(tf.add(
                                Hu_stack[index], best_information),
                                                 Hr_stack[index],
                                                 Hr_stack[index],
                                                 Q_lengths=t_turn_length,
                                                 K_lengths=self.response_len)
                        except ValueError:
                            tf.get_variable_scope().reuse_variables()
                            t_a_r = layers.block(tf.add(
                                Hu_stack[index], best_information),
                                                 Hr_stack[index],
                                                 Hr_stack[index],
                                                 Q_lengths=t_turn_length,
                                                 K_lengths=self.response_len)

                    with tf.variable_scope('r_attend_t_' + str(index)):
                        try:
                            r_a_t = layers.block(
                                Hr_stack[index],
                                tf.add(Hu_stack[index], best_information),
                                tf.add(Hu_stack[index], best_information),
                                Q_lengths=self.response_len,
                                K_lengths=t_turn_length)
                        except ValueError:
                            tf.get_variable_scope().reuse_variables()
                            r_a_t = layers.block(
                                Hr_stack[index],
                                tf.add(Hu_stack[index], best_information),
                                tf.add(Hu_stack[index], best_information),
                                Q_lengths=self.response_len,
                                K_lengths=t_turn_length)

                    t_a_r_stack.append(t_a_r)
                    r_a_t_stack.append(r_a_t)

                t_a_r_stack.extend(Hu_stack)
                r_a_t_stack.extend(Hr_stack)

                t_a_r = tf.stack(t_a_r_stack, axis=-1)
                r_a_t = tf.stack(r_a_t_stack, axis=-1)

                #calculate similarity matrix
                with tf.variable_scope('similarity'):
                    # sim shape [batch, max_turn_len, max_turn_len, 2*(stack_num+1)]
                    # divide sqrt(200) to prevent gradient explosion
                    sim = tf.einsum('biks,bjks->bijs', t_a_r,
                                    r_a_t) / tf.sqrt(200.0)

                sim_turns.append(sim)

            #cnn and aggregation
            sim = tf.stack(sim_turns, axis=1)
            print('sim shape: %s' % sim.shape)
            with tf.variable_scope('cnn_aggregation'):
                final_info = layers.CNN_3d(sim, 32, 16)
                #final_info_dim = final_info.get_shape().as_list()[-1]
                #for douban
                #final_info = layers.CNN_3d(sim, 16, 16)
                #                 _x = self._conv1d(best_information)
                #                 _x = self._pool1d(_x)
                #final_info = tf.concat([final_info,best_information],-1)

            #loss and train
            with tf.variable_scope('loss'):
                self.loss, self.logits = layers.loss(final_info, self.label)

                self.global_step = tf.Variable(0, trainable=False)
                initial_learning_rate = self._conf['learning_rate']
                self.learning_rate = tf.train.exponential_decay(
                    initial_learning_rate,
                    global_step=self.global_step,
                    decay_steps=400,
                    decay_rate=0.9,
                    staircase=True)

                Optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.optimizer = Optimizer.minimize(
                    self.loss, global_step=self.global_step)

                self.init = tf.global_variables_initializer()
                self.saver = tf.train.Saver(
                    max_to_keep=self._conf["max_to_keep"])
                self.all_variables = tf.global_variables()
                self.all_operations = self._graph.get_operations()
                self.grads_and_vars = Optimizer.compute_gradients(self.loss)

                for grad, var in self.grads_and_vars:
                    if grad is None:
                        print(var)

                self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var)
                                   for grad, var in self.grads_and_vars
                                   if grad is not None]
                self.g_updates = Optimizer.apply_gradients(
                    self.capped_gvs, global_step=self.global_step)

        return self._graph
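
All of these graphs schedule the learning rate with tf.train.exponential_decay(..., staircase=True). As a quick self-contained illustration (plain Python, with the decay_steps=400 and decay_rate=0.9 values used above), the staircase schedule is just an integer-division exponent:

def staircase_decay(initial_lr, global_step, decay_steps=400, decay_rate=0.9):
    # With staircase=True the exponent is floor(global_step / decay_steps),
    # so the rate drops in discrete steps every decay_steps updates.
    return initial_lr * decay_rate ** (global_step // decay_steps)

for step in (0, 399, 400, 800, 1200):
    print(step, staircase_decay(1e-3, step))
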
Example #3
    def build_graph(self):
        with self._graph.as_default():
            rand_seed = self._conf['rand_seed']
            tf.set_random_seed(rand_seed)

            #word embedding
            if self._word_embedding_init is not None:
                word_embedding_initializer = tf.constant_initializer(
                    self._word_embedding_init)
            else:
                word_embedding_initializer = tf.random_normal_initializer(
                    stddev=0.1)

            self._word_embedding = tf.get_variable(
                name='word_embedding',
                shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']],
                dtype=tf.float32,
                initializer=word_embedding_initializer,
                trainable=True)

            batch_size = None
            #define placeholders
            self.turns1 = tf.placeholder(tf.int32,
                                         shape=[
                                             batch_size,
                                             self._conf["max_turn_num"],
                                             self._conf["max_turn_len"]
                                         ],
                                         name="turns1")
            self.tt_turns_len1 = tf.placeholder(tf.int32,
                                                shape=[
                                                    batch_size,
                                                ],
                                                name="tt_turns_len1")
            self.every_turn_len1 = tf.placeholder(
                tf.int32,
                shape=[batch_size, self._conf["max_turn_num"]],
                name="every_turn_len1")
            self.response = tf.placeholder(
                tf.int32,
                shape=[batch_size, self._conf["max_turn_len"]],
                name="response")
            self.response_len = tf.placeholder(tf.int32,
                                               shape=[
                                                   batch_size,
                                               ],
                                               name="response_len")
            self.keep_rate = tf.placeholder(tf.float32, [], name="keep_rate")
            self.label = tf.placeholder(tf.float32, shape=[
                batch_size,
            ])

            # ==================================== Building Model =============================
            print(
                time.strftime('%Y-%m-%d %H:%M:%S',
                              time.localtime(time.time())),
                "Starting build Model")

            if self.cr_model == "SMN":
                input_x = self.turns1
                input_y = self.response

                with tf.variable_scope('model_cr_smn'):
                    final_info_cr = smn_model(input_x,
                                              None,
                                              input_y,
                                              None,
                                              self._word_embedding,
                                              self.keep_rate,
                                              self._conf,
                                              x_len=self.every_turn_len1,
                                              y_len=self.response_len)
                    final_info_cr = tf.layers.dense(
                        final_info_cr,
                        50,
                        kernel_initializer=tf.contrib.layers.
                        xavier_initializer())

            # DAM
            elif self.cr_model == "DAM":
                input_x = self.turns1
                input_y = self.response

                with tf.variable_scope('model_cr_dam'):
                    final_info_cr = dam_model(input_x,
                                              None,
                                              input_y,
                                              None,
                                              self._word_embedding,
                                              self.keep_rate,
                                              self._conf,
                                              x_len=self.every_turn_len1,
                                              y_len=self.response_len)
                    final_info_cr = tf.layers.dense(
                        final_info_cr,
                        50,
                        kernel_initializer=tf.contrib.layers.
                        xavier_initializer())

            # MSN
            elif self.cr_model == "MSN":
                input_x = self.turns1
                input_y = self.response

                with tf.variable_scope('model_cr_msn'):
                    final_info_cr, self.final_score = msn_model(
                        input_x,
                        None,
                        input_y,
                        None,
                        self._word_embedding,
                        self.keep_rate,
                        self._conf,
                        x_len=self.every_turn_len1,
                        y_len=self.response_len)
                    final_info_cr = tf.layers.dense(
                        final_info_cr,
                        50,
                        kernel_initializer=tf.contrib.layers.
                        xavier_initializer())

            # ESIM
            elif self.cr_model == "ESIM":
                input_x = tf.reshape(self.turns1, [
                    -1, self._conf["max_turn_num"] * self._conf["max_turn_len"]
                ])
                input_x_mask = tf.sequence_mask(self.every_turn_len1,
                                                self._conf["max_turn_len"])
                input_x_mask = tf.reshape(input_x_mask, [
                    -1, self._conf["max_turn_num"] * self._conf["max_turn_len"]
                ])
                input_y = self.response
                input_y_mask = tf.sequence_mask(self.response_len,
                                                self._conf["max_turn_len"])

                with tf.variable_scope('model_cr_esim'):
                    final_info_cr = esim_model(input_x, input_x_mask, input_y,
                                               input_y_mask,
                                               self._word_embedding,
                                               self.keep_rate)
                    final_info_cr = tf.layers.dense(
                        final_info_cr,
                        50,
                        kernel_initializer=tf.contrib.layers.
                        xavier_initializer())

            elif self.cr_model == "IOI":
                input_x = tf.reshape(self.turns1, [
                    -1, self._conf["max_turn_num"] * self._conf["max_turn_len"]
                ])
                input_x_mask = tf.sequence_mask(self.every_turn_len1,
                                                self._conf["max_turn_len"])
                input_x_mask = tf.reshape(input_x_mask, [
                    -1, self._conf["max_turn_num"] * self._conf["max_turn_len"]
                ])

                input_y = self.response
                input_y_mask = tf.sequence_mask(self.response_len,
                                                self._conf["max_turn_len"])

                with tf.variable_scope('model_cr_ioi'):
                    final_info_cr, final_info_cr_ioi = ioi_model(
                        input_x, input_x_mask, input_y, input_y_mask,
                        self._word_embedding, self.keep_rate, self._conf)
                    final_info_cr = tf.layers.dense(
                        final_info_cr,
                        50,
                        kernel_initializer=tf.contrib.layers.
                        xavier_initializer())

            # ==================================== Calculating Model =============================
            self.trainops = {
                "cr": dict(),
            }
            loss_input = final_info_cr

            for loss_type in [
                    "cr",
            ]:
                with tf.variable_scope('loss_' + loss_type):
                    if self.cr_model == "IOI":
                        loss_list = []
                        logits_list = []
                        for i, j in enumerate(final_info_cr_ioi):
                            with tf.variable_scope("loss" + str(i)):
                                loss_per, logits_per = layers.loss(
                                    j, self.label)
                            loss_list.append(loss_per)
                            logits_list.append(logits_per)
                        self.trainops[loss_type]["loss"] = sum([
                            ((idx + 1) / self._conf["ioi_layer_num"]) * item
                            for idx, item in enumerate(loss_list)
                        ])
                        self.trainops[loss_type]["logits"] = sum(logits_list)
                    else:
                        self.trainops[loss_type]["loss"], self.trainops[
                            loss_type]["logits"] = layers.loss(
                                final_info_cr, self.label)

                    self.trainops[loss_type]["global_step"] = tf.Variable(
                        0, trainable=False)
                    initial_learning_rate = self._conf['learning_rate']
                    self.trainops[loss_type][
                        "learning_rate"] = tf.train.exponential_decay(
                            initial_learning_rate,
                            global_step=self.trainops[loss_type]
                            ["global_step"],
                            decay_steps=self._conf["decay_step"],
                            decay_rate=self._conf["decay_rate"],
                            staircase=True)

                    Optimizer = tf.train.AdamOptimizer(
                        self.trainops[loss_type]["learning_rate"])
                    self.trainops[loss_type]["optimizer"] = Optimizer.minimize(
                        self.trainops[loss_type]["loss"])

                    self.trainops[loss_type][
                        "grads_and_vars"] = Optimizer.compute_gradients(
                            self.trainops[loss_type]["loss"])

                    self.trainops[loss_type]["capped_gvs"] = [
                        (tf.clip_by_value(grad, -5, 5), var) for grad, var in
                        self.trainops[loss_type]["grads_and_vars"]
                        if grad is not None
                    ]
                    self.trainops[loss_type][
                        "g_updates"] = Optimizer.apply_gradients(
                            self.trainops[loss_type]["capped_gvs"],
                            global_step=self.trainops[loss_type]
                            ["global_step"])

            self.all_variables = tf.global_variables()
            self.init = tf.global_variables_initializer()
            self.saver = tf.train.Saver(max_to_keep=self._conf["max_to_keep"])

            self.all_operations = self._graph.get_operations()

        return self._graph
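
In the IOI branch above, the per-layer losses are not averaged uniformly: layer i is weighted by (i + 1) / ioi_layer_num, so deeper interaction blocks contribute more to the total loss. A tiny illustrative sketch with hypothetical numbers (the layer count and loss values are made up, not taken from any run):

ioi_layer_num = 7.0                                   # assumed conf["ioi_layer_num"]
loss_list = [0.9, 0.8, 0.7, 0.6, 0.5, 0.45, 0.4]      # hypothetical per-layer losses

total_loss = sum(((idx + 1) / ioi_layer_num) * item
                 for idx, item in enumerate(loss_list))
print(total_loss)
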
    def build_graph(self):
        with self._graph.as_default():
            rand_seed = self._conf['rand_seed']
            tf.set_random_seed(rand_seed)

            #word embedding
            if self._word_embedding_init is not None:
                word_embedding_initializer = tf.constant_initializer(
                    self._word_embedding_init)
            else:
                word_embedding_initializer = tf.random_normal_initializer(
                    stddev=0.1)

            self._word_embedding = tf.get_variable(
                name='word_embedding',
                shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']],
                dtype=tf.float32,
                initializer=word_embedding_initializer,
                trainable=True)

            batch_size = None
            # NOTE: this variance-scaling initializer is immediately overridden
            # by the truncated-normal initializer on the next line.
            initializer_opt = tf.contrib.layers.variance_scaling_initializer(
                factor=1.0, mode='FAN_AVG', uniform=True, dtype=tf.float32)
            initializer_opt = tf.truncated_normal_initializer(stddev=0.02)

            self.turns_sess_num = self._conf["max_turn_num_hf"] * 2 + 1
            self.turns_q_num = self._conf["max_turn_num"]

            #define placeholders
            self.turns1 = tf.placeholder(tf.int32,
                                         shape=[
                                             batch_size,
                                             self._conf["max_turn_num"],
                                             self._conf["max_turn_len"]
                                         ],
                                         name="turns1")
            self.tt_turns_len1 = tf.placeholder(tf.int32,
                                                shape=[
                                                    batch_size,
                                                ],
                                                name="tt_turns_len1")
            self.every_turn_len1 = tf.placeholder(
                tf.int32,
                shape=[batch_size, self._conf["max_turn_num"]],
                name="every_turn_len1")
            self.turns2 = tf.placeholder(tf.int32,
                                         shape=[
                                             batch_size,
                                             self._conf["max_turn_num_hf"],
                                             self._conf["max_turn_len"]
                                         ],
                                         name="turns2")
            self.tt_turns_len2 = tf.placeholder(tf.int32,
                                                shape=[
                                                    batch_size,
                                                ],
                                                name="tt_turns_len2")
            self.every_turn_len2 = tf.placeholder(
                tf.int32,
                shape=[batch_size, self._conf["max_turn_num_hf"]],
                name="every_turn_len2")
            self.turnsf = tf.placeholder(tf.int32,
                                         shape=[
                                             batch_size,
                                             self._conf["max_turn_num_hf"],
                                             self._conf["max_turn_len"]
                                         ],
                                         name="turnsf")
            self.tt_turns_lenf = tf.placeholder(tf.int32,
                                                shape=[
                                                    batch_size,
                                                ],
                                                name="tt_turns_lenf")
            self.every_turn_lenf = tf.placeholder(
                tf.int32,
                shape=[batch_size, self._conf["max_turn_num_hf"]],
                name="every_turn_lenf")
            self.response = tf.placeholder(
                tf.int32,
                shape=[batch_size, self._conf["max_turn_len"]],
                name="response")
            self.response_len = tf.placeholder(tf.int32,
                                               shape=[
                                                   batch_size,
                                               ],
                                               name="response_len")
            self.turnsa = tf.placeholder(
                tf.int32,
                shape=[
                    batch_size,
                    self._conf["max_turn_len"] * self.turns_sess_num
                ],
                name="turnsa")
            self.turnsa_len = tf.placeholder(tf.int32,
                                             shape=[
                                                 batch_size,
                                             ],
                                             name="turnsa_len")
            self.turnsq = tf.placeholder(
                tf.int32,
                shape=[
                    batch_size, self._conf["max_turn_len"] * self.turns_q_num
                ],
                name="turnsq")
            self.turnsq_len = tf.placeholder(tf.int32,
                                             shape=[
                                                 batch_size,
                                             ],
                                             name="turnsq_len")
            self.keep_rate = tf.placeholder(tf.float32, [], name="keep_rate")

            self.turns_sess = tf.placeholder(
                tf.int32,
                shape=[
                    batch_size, self._conf["max_turn_num_sess"],
                    self._conf["max_turn_len"]
                ],
                name="turns_sess")
            self.tt_turns_len_sess = tf.placeholder(tf.int32,
                                                    shape=[
                                                        batch_size,
                                                    ],
                                                    name="tt_turns_len_sess")
            self.every_turn_len_sess = tf.placeholder(
                tf.int32,
                shape=[batch_size, self._conf["max_turn_num_sess"]],
                name="every_turn_len_sess")

            self.label = tf.placeholder(tf.float32, shape=[
                batch_size,
            ])

            # ==================================== CS Model =============================
            print(
                time.strftime('%Y-%m-%d %H:%M:%S',
                              time.localtime(time.time())),
                "Starting build CS Model")

            input_x = self.turns1
            input_x_len = self.every_turn_len1
            input_x_mask = tf.sequence_mask(input_x_len,
                                            self._conf["max_turn_len"])

            input_xf = self.turnsf
            input_x_lenf = self.every_turn_lenf

            input_xf = tf.concat(
                [tf.expand_dims(self.response, axis=1), input_xf], axis=1)
            input_x_lenf = tf.concat(
                [input_x_lenf,
                 tf.expand_dims(self.response_len, axis=1)],
                axis=1)
            input_x_maskf = tf.sequence_mask(input_x_lenf,
                                             self._conf["max_turn_len"])

            input_x2 = self.turns2
            input_x_len2 = self.every_turn_len2
            input_x2 = tf.concat(
                [input_x2, tf.expand_dims(self.response, axis=1)], axis=1)
            input_x_len2 = tf.concat(
                [input_x_len2,
                 tf.expand_dims(self.response_len, axis=1)],
                axis=1)
            input_x_mask2 = tf.sequence_mask(input_x_len2,
                                             self._conf["max_turn_len"])

            with tf.variable_scope('model_crdms'):
                final_info_cs, final_info_css, self.all_mem_weight_dict, self.save_dynamic_dict, self.sim_ori = cs_model(
                    input_x, input_x_mask, input_x_len, input_x2,
                    input_x_mask2, input_x_len2, input_xf, input_x_maskf,
                    input_x_lenf, self._word_embedding, self._conf)
                final_info_cs = tf.layers.dense(
                    final_info_cs,
                    50,
                    kernel_initializer=tf.contrib.layers.xavier_initializer())

            # ==================================== Calculate Loss =============================
            print(
                time.strftime('%Y-%m-%d %H:%M:%S',
                              time.localtime(time.time())),
                "Starting calculate Loss")
            self.trainops = {"cs": dict()}
            all_loss_inouts = [
                ["cs", final_info_cs],
            ]
            for loss_type, loss_input in all_loss_inouts:
                if loss_type != self._conf["train_type"] and loss_type != "cr":
                    continue
                with tf.variable_scope('loss_' + loss_type):

                    self.trainops[loss_type]["loss"], self.trainops[loss_type][
                        "logits"] = layers.loss(loss_input, self.label)

                    use_loss_weight = True
                    loss_added, logits_added = [], []
                    num_loss = len(final_info_css)
                    for i, j in enumerate(final_info_css):
                        with tf.variable_scope("losscc" + str(i)):
                            loss_per, logits_per = layers.loss(j, self.label)
                        if num_loss == 6 and i >= 2 and use_loss_weight:
                            loss_per = loss_per * 0.5
                            logits_per = logits_per * 0.5
                        loss_added.append(loss_per)
                        logits_added.append(logits_per)
                    if num_loss == 6 and use_loss_weight:
                        num_loss = num_loss - 2
                    self.trainops[loss_type]["loss"] += sum(
                        loss_added) / num_loss
                    self.trainops[loss_type]["logits"] += sum(
                        logits_added) / num_loss

                    self.trainops[loss_type]["global_step"] = tf.Variable(
                        0, trainable=False)
                    initial_learning_rate = self._conf['learning_rate']
                    self.trainops[loss_type][
                        "learning_rate"] = tf.train.exponential_decay(
                            initial_learning_rate,
                            global_step=self.trainops[loss_type]
                            ["global_step"],
                            decay_steps=self._conf["decay_step"],
                            decay_rate=0.9,
                            staircase=True)

                    Optimizer = tf.train.AdamOptimizer(
                        self.trainops[loss_type]["learning_rate"])
                    self.trainops[loss_type]["optimizer"] = Optimizer.minimize(
                        self.trainops[loss_type]["loss"])

                    self.trainops[loss_type][
                        "grads_and_vars"] = Optimizer.compute_gradients(
                            self.trainops[loss_type]["loss"])

                    self.trainops[loss_type]["capped_gvs"] = [
                        (tf.clip_by_value(grad, -1, 1), var) for grad, var in
                        self.trainops[loss_type]["grads_and_vars"]
                        if grad is not None
                    ]
                    self.trainops[loss_type][
                        "g_updates"] = Optimizer.apply_gradients(
                            self.trainops[loss_type]["capped_gvs"],
                            global_step=self.trainops[loss_type]
                            ["global_step"])

            self.all_variables = tf.global_variables()
            self.init = tf.global_variables_initializer()
            self.saver = tf.train.Saver(max_to_keep=self._conf["max_to_keep"])

            self.all_operations = self._graph.get_operations()

        return self._graph
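
The auxiliary-loss averaging inside 'loss_cs' above has one special case: when there are exactly six auxiliary heads, heads 2..5 are halved and the sum is divided by num_loss - 2 instead of num_loss. A short sketch with uniform toy losses (example values of my own) makes the effective weighting visible:

loss_added = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]   # hypothetical per-head losses
num_loss = len(loss_added)
use_loss_weight = True

weighted = [l * 0.5 if (num_loss == 6 and i >= 2 and use_loss_weight) else l
            for i, l in enumerate(loss_added)]
divisor = num_loss - 2 if (num_loss == 6 and use_loss_weight) else num_loss
print(sum(weighted) / divisor)   # 1.0 for these uniform toy losses
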
    def build_graph(self):
        with self._graph.as_default():
            rand_seed = self._conf['rand_seed']
            tf.set_random_seed(rand_seed)

            #word embedding
            if self._word_embedding_init is not None:
                word_embedding_initializer = tf.constant_initializer(
                    self._word_embedding_init)
            else:
                word_embedding_initializer = tf.random_normal_initializer(
                    stddev=0.1)

            self._word_embedding = tf.get_variable(
                name='word_embedding',
                shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']],
                dtype=tf.float32,
                initializer=word_embedding_initializer)

            #define placeholders
            self.turns1 = tf.placeholder(tf.int32,
                                         shape=[
                                             self._conf["batch_size"],
                                             self._conf["max_turn_num"],
                                             self._conf["max_turn_len"]
                                         ],
                                         name="turns1")
            self.tt_turns_len1 = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"]],
                name="tt_turns_len1")
            self.every_turn_len1 = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_num"]],
                name="every_turn_len1")
            self.turns2 = tf.placeholder(tf.int32,
                                         shape=[
                                             self._conf["batch_size"],
                                             self._conf["max_turn_num"],
                                             self._conf["max_turn_len"]
                                         ],
                                         name="turns2")
            self.tt_turns_len2 = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"]],
                name="tt_turns_len2")
            self.every_turn_len2 = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_num"]],
                name="every_turn_len2")
            self.response = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_len"]],
                name="response")
            self.response_len = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"]],
                name="response_len")
            self.keep_rate = tf.placeholder(tf.float32, [], name="keep_rate")

            self.label = tf.placeholder(tf.float32,
                                        shape=[self._conf["batch_size"]])

            self.turns1_e = tf.nn.embedding_lookup(self._word_embedding,
                                                   self.turns1)
            self.turns2_e = tf.nn.embedding_lookup(self._word_embedding,
                                                   self.turns2)
            self.response_e = tf.nn.embedding_lookup(self._word_embedding,
                                                     self.response)

            # SMN
            if self.cr_model == "SMN":
                input_x = self.turns1
                input_y = self.response
                final_info_cr = smn_model(input_x,
                                          None,
                                          input_y,
                                          None,
                                          self._word_embedding,
                                          self.keep_rate,
                                          self._conf,
                                          x_len=self.every_turn_len1,
                                          y_len=self.response_len)
                with tf.variable_scope('final_smn_mlp_cr'):
                    final_info_cr = tf.layers.dense(
                        final_info_cr,
                        50,
                        kernel_initializer=tf.contrib.layers.
                        xavier_initializer())

            # DAM
            elif self.cr_model == "DAM":
                input_x = self.turns1
                input_y = self.response
                final_info_cr = dam_model(input_x,
                                          None,
                                          input_y,
                                          None,
                                          self._word_embedding,
                                          self.keep_rate,
                                          self._conf,
                                          x_len=self.every_turn_len1,
                                          y_len=self.response_len)
                with tf.variable_scope('final_esim_mlp_cr'):
                    final_info_cr = tf.layers.dense(
                        final_info_cr,
                        50,
                        kernel_initializer=tf.contrib.layers.
                        xavier_initializer())

            # IOI
            elif self.cr_model == "IOI":
                input_x = tf.reshape(self.turns1,
                                     [self._conf["batch_size"], -1])
                input_x_mask = tf.sequence_mask(self.every_turn_len1,
                                                self._conf["max_turn_len"])
                input_x_mask = tf.reshape(input_x_mask,
                                          [self._conf["batch_size"], -1])

                input_x2 = tf.reshape(self.turns2,
                                      [self._conf["batch_size"], -1])
                input_x_mask2 = tf.sequence_mask(self.every_turn_len2,
                                                 self._conf["max_turn_len"])
                input_x_mask2 = tf.reshape(input_x_mask2,
                                           [self._conf["batch_size"], -1])

                input_y = self.response
                input_y_mask = tf.sequence_mask(self.response_len,
                                                self._conf["max_turn_len"])

                final_info_cr, final_info_cr_ioi = ioi_model(
                    input_x, input_x_mask, input_y, input_y_mask,
                    self._word_embedding, self.keep_rate, self._conf)
                with tf.variable_scope('final_esim_mlp_cr'):
                    final_info_cr = tf.layers.dense(
                        final_info_cr,
                        50,
                        kernel_initializer=tf.contrib.layers.
                        xavier_initializer())

            if self.cc_model == "cc":
                input_x = self.turns1
                input_x_mask = tf.sequence_mask(self.every_turn_len1,
                                                self._conf["max_turn_len"])
                #input_x_mask = tf.reshape(input_x_mask, [-1, self._conf["max_turn_num"]*self._conf["max_turn_len"]])
                input_x_len = self.every_turn_len1

                input_x2 = self.turns2
                input_x_mask2 = tf.sequence_mask(self.every_turn_len2,
                                                 self._conf["max_turn_len"])
                #input_x_mask2 = tf.reshape(input_x_mask2, [-1, self._conf["max_turn_num_s"]*self._conf["max_turn_len"]])
                input_x_len2 = self.every_turn_len2

                final_info_cc, self.att_weight_print = cc_model(
                    input_x,
                    input_x_mask,
                    input_x_len,
                    input_x2,
                    input_x_mask2,
                    input_x_len2,
                    self._word_embedding,
                    self._conf,
                    con_c=self.con_c)
                with tf.variable_scope('final_mlp_cc'):
                    final_info_cc = tf.layers.dense(
                        final_info_cc,
                        50,
                        kernel_initializer=tf.contrib.layers.
                        xavier_initializer())

            elif self.cc_model == "onesent":
                input_x = self.turns1
                input_x_mask = tf.sequence_mask(self.every_turn_len1,
                                                self._conf["max_turn_len"])
                #input_x_mask = tf.reshape(input_x_mask, [-1, self._conf["max_turn_num"]*self._conf["max_turn_len"]])
                input_x_len = self.every_turn_len1

                input_x2 = self.turns2
                input_x_mask2 = tf.sequence_mask(self.every_turn_len2,
                                                 self._conf["max_turn_len"])
                #input_x_mask2 = tf.reshape(input_x_mask2, [-1, self._conf["max_turn_num_s"]*self._conf["max_turn_len"]])
                input_x_len2 = self.every_turn_len2

                final_info_cc = cc_model(input_x,
                                         input_x_mask,
                                         input_x_len,
                                         input_x2,
                                         input_x_mask2,
                                         input_x_len2,
                                         self._word_embedding,
                                         self._conf,
                                         con_c=True)
                with tf.variable_scope('final_mlp_onesent_cc'):
                    final_info_cc = tf.layers.dense(
                        final_info_cc,
                        50,
                        kernel_initializer=tf.contrib.layers.
                        xavier_initializer())

            elif self.cc_model == "bimpm":
                input_x = tf.reshape(self.turns1,
                                     [self._conf["batch_size"], -1])
                input_x_mask = tf.sequence_mask(self.every_turn_len1,
                                                self._conf["max_turn_len"])
                input_x_mask = tf.reshape(input_x_mask,
                                          [self._conf["batch_size"], -1])
                input_y = tf.reshape(self.turns2,
                                     [self._conf["batch_size"], -1])
                input_y_mask = tf.sequence_mask(self.every_turn_len2,
                                                self._conf["max_turn_len"])
                input_y_mask = tf.reshape(input_y_mask,
                                          [self._conf["batch_size"], -1])

                with tf.variable_scope('final_bimpm_cc_cr'):
                    final_info_cc = bimpm_model(input_x, input_x_mask, input_y,
                                                input_y_mask,
                                                self._word_embedding,
                                                self.keep_rate)
                    final_info_cc = tf.layers.dense(
                        final_info_cc,
                        50,
                        kernel_initializer=tf.contrib.layers.
                        xavier_initializer())

            #loss and train
            with tf.variable_scope('loss_cc'):
                self.loss_cc, self.logits_cc = layers.loss(
                    final_info_cc, self.label)

                self.global_step_cc = tf.Variable(0, trainable=False)
                initial_learning_rate = self._conf['learning_rate']
                self.learning_rate_cc = tf.train.exponential_decay(
                    initial_learning_rate,
                    global_step=self.global_step_cc,
                    decay_steps=5000,
                    decay_rate=0.9,
                    staircase=True)

                Optimizer_cc = tf.train.AdamOptimizer(self.learning_rate_cc)
                self.optimizer_cc = Optimizer_cc.minimize(self.loss_cc)

                #self.all_operations = self._graph.get_operations()
                self.grads_and_vars_cc = Optimizer_cc.compute_gradients(
                    self.loss_cc)

                self.capped_gvs_cc = [(tf.clip_by_value(grad, -1, 1), var)
                                      for grad, var in self.grads_and_vars_cc
                                      if grad is not None]
                self.g_updates_cc = Optimizer_cc.apply_gradients(
                    self.capped_gvs_cc, global_step=self.global_step_cc)

            with tf.variable_scope('loss_cr'):
                if self.cr_model == "IOI":
                    loss_list = []
                    logits_list = []
                    for i, j in enumerate(final_info_cr_ioi):
                        with tf.variable_scope("loss" + str(i)):
                            loss_per, logits_per = layers.loss(j, self.label)
                        loss_list.append(loss_per)
                        logits_list.append(logits_per)
                    self.loss_cr = sum([((idx + 1) / 7.0) * item
                                        for idx, item in enumerate(loss_list)])
                    self.logits_cr = sum(logits_list)
                else:
                    self.loss_cr, self.logits_cr = layers.loss(
                        final_info_cr, self.label)

                self.global_step_cr = tf.Variable(0, trainable=False)
                initial_learning_rate = self._conf['learning_rate']
                self.learning_rate_cr = tf.train.exponential_decay(
                    initial_learning_rate,
                    global_step=self.global_step_cr,
                    decay_steps=10000,
                    decay_rate=0.9,
                    staircase=True)

                Optimizer_cr = tf.train.AdamOptimizer(self.learning_rate_cr)
                self.optimizer_cr = Optimizer_cr.minimize(self.loss_cr)

                self.grads_and_vars_cr = Optimizer_cr.compute_gradients(
                    self.loss_cr)

                self.capped_gvs_cr = [(tf.clip_by_value(grad, -1, 1), var)
                                      for grad, var in self.grads_and_vars_cr
                                      if grad is not None]
                self.g_updates_cr = Optimizer_cr.apply_gradients(
                    self.capped_gvs_cr, global_step=self.global_step_cr)

            with tf.variable_scope('loss_ccr'):

                if self._conf["fusion"] == "fusion":
                    final_att = tf.concat([final_info_cc, final_info_cr],
                                          axis=1)
                    final_att = tf.layers.dense(final_att,
                                                1,
                                                kernel_initializer=tf.contrib.
                                                layers.xavier_initializer(),
                                                name="nosave")
                    final_att = tf.nn.sigmoid(final_att)
                    self.final_att_print = final_att
                    final_info_ccr = final_info_cc * final_att + final_info_cr * (
                        1 - final_att)
                elif self._conf["fusion"] == "con":
                    #print(final_info_cr.shape)
                    final_att = tf.concat([final_info_cc, final_info_cr],
                                          axis=1)
                    final_att = tf.layers.dense(
                        final_att,
                        final_info_cr.shape[-1],
                        kernel_initializer=tf.contrib.layers.xavier_initializer(),
                        name="nosave")
                    final_att = tf.nn.sigmoid(final_att)
                    self.final_att_print = final_att
                    final_info_ccr = tf.concat([final_info_cr, final_info_cc],
                                               axis=1)
                elif self._conf["fusion"] == "none":
                    final_info_ccr = final_info_cc + final_info_cr
                else:
                    raise ValueError('unknown fusion type: %s' %
                                     self._conf["fusion"])

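                # the fused (ccr) head is trained jointly with the cr head: its loss
                # and logits are added to those of the cr branch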
                self.loss_ccr, self.logits_ccr = layers.loss(
                    final_info_ccr, self.label)
                self.loss_ccr += self.loss_cr
                self.logits_ccr += self.logits_cr

                self.global_step_ccr = tf.Variable(0, trainable=False)
                initial_learning_rate = self._conf['learning_rate']
                self.learning_rate_ccr = tf.train.exponential_decay(
                    initial_learning_rate,
                    global_step=self.global_step_ccr,
                    decay_steps=5000,
                    decay_rate=0.9,
                    staircase=True)

                Optimizer_ccr = tf.train.AdamOptimizer(self.learning_rate_ccr)
                self.optimizer_ccr = Optimizer_ccr.minimize(self.loss_ccr)

                self.grads_and_vars_ccr = Optimizer_ccr.compute_gradients(
                    self.loss_ccr)

                self.capped_gvs_ccr = [(tf.clip_by_value(grad, -1, 1), var)
                                       for grad, var in self.grads_and_vars_ccr
                                       if grad is not None]
                self.g_updates_ccr = Optimizer_ccr.apply_gradients(
                    self.capped_gvs_ccr, global_step=self.global_step_ccr)

            self.all_variables = tf.global_variables()
            self.init = tf.global_variables_initializer()
            self.saver_load = tf.train.Saver(
                max_to_keep=self._conf["max_to_keep"])
            self.saver_save = self.saver_load

            self.all_operations = self._graph.get_operations()

        return self._graph
    def build_graph(self):
        with self._graph.as_default():
            if self._conf['rand_seed'] is not None:
                rand_seed = self._conf['rand_seed']
                tf.set_random_seed(rand_seed)
                print('set tf random seed: %s' % self._conf['rand_seed'])

            # word embedding
            if self._word_embedding_init is not None:
                word_embedding_initializer = tf.constant_initializer(
                    self._word_embedding_init)
            else:
                word_embedding_initializer = tf.random_normal_initializer(
                    stddev=0.1)

            self._word_embedding = tf.get_variable(
                name='word_embedding',
                shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']],
                dtype=tf.float32,
                initializer=word_embedding_initializer)

            # define placeholders
            self.turns = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_num"],
                       self._conf["max_turn_len"]])

            self.tt_turns_len = tf.placeholder(  # turn_num
                tf.int32,
                shape=[self._conf["batch_size"]])

            self.every_turn_len = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_num"]])

            self.turns_intent = tf.placeholder(
                tf.float32,
                shape=[self._conf["batch_size"], self._conf["max_turn_num"],
                       self._conf["intent_size"]])

            self.response = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"], self._conf["max_turn_len"]])

            self.response_len = tf.placeholder(
                tf.int32,
                shape=[self._conf["batch_size"]])

            self.response_intent = tf.placeholder(
                tf.float32,
                shape=[self._conf["batch_size"], self._conf["intent_size"]])

            self.label = tf.placeholder(
                tf.float32,
                shape=[self._conf["batch_size"]])

            # define operations
            # response part
            Hr = tf.nn.embedding_lookup(self._word_embedding, self.response)
            # [batch_size, max_turn_len, embed_size]

            # print('[after embedding_lookup] Hr shape: %s' % Hr.shape)

            if self._conf['is_positional'] and self._conf['stack_num'] > 0:
                with tf.variable_scope('positional'):
                    Hr = op.positional_encoding_vector(Hr, max_timescale=10)
            Hr_stack = [Hr]  # 1st element of Hr_stack is the original embedding
            # lyang comments: self attention
            for index in range(self._conf['stack_num']):
                # print('[self attention for response] stack index: %d ' % index)
                with tf.variable_scope('self_stack_' + str(index)):
                    # [batch, max_turn_len, emb_size]
                    Hr = layers.block(  # attentive module
                        Hr, Hr, Hr,
                        Q_lengths=self.response_len,
                        K_lengths=self.response_len)
                    # print('[after layers.block] Hr shape: %s' % Hr.shape)
                    # Hr is still [batch_size, max_turn_len, embed_size]
                    Hr_stack.append(Hr)

            # print('[after self attention of response] len(Hr_stack)',
            #       len(Hr_stack))  # 1+stack_num
            # context part
            # a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len]
            list_turn_t = tf.unstack(self.turns, axis=1)
            list_turn_length = tf.unstack(self.every_turn_len, axis=1)
            list_turn_intent = tf.unstack(self.turns_intent, axis=1)

            sim_turns = []
            attention_turns = [] # intent based attention on each turn
            # for every turn_t calculate matching vector
            turn_index = 0
            for turn_t, t_turn_length, t_intent in zip(list_turn_t, list_turn_length, list_turn_intent):
                print('current turn_index : ', turn_index)
                turn_index += 1
                Hu = tf.nn.embedding_lookup(self._word_embedding,
                                            turn_t)  # [batch, max_turn_len, emb_size]
                # print('[after embedding_lookup] Hu shape: %s' % Hu.shape)

                if self._conf['is_positional'] and self._conf['stack_num'] > 0:
                    with tf.variable_scope('positional', reuse=True):
                        Hu = op.positional_encoding_vector(Hu,
                                                           max_timescale=10)
                Hu_stack = [Hu]  # 1st element of Hu_stack is the original embedding

                # lyang comments: self attention
                for index in range(self._conf['stack_num']):
                    # print('[self attention for context turn] stack index: %d ' % index)
                    with tf.variable_scope('self_stack_' + str(index),
                                           reuse=True):
                        # [batch, max_turn_len, emb_size]
                        Hu = layers.block(  # attentive module
                            Hu, Hu, Hu,
                            Q_lengths=t_turn_length, K_lengths=t_turn_length)
                        # print('[after layers.block] Hu shape: %s' % Hu.shape)
                        Hu_stack.append(Hu)
                # print('[after self attention of context turn] len(Hu_stack)',
                #       len(Hu_stack))  # 1+stack_num

                # lyang comments: cross attention
                # print('[cross attention ...]')
                r_a_t_stack = []
                t_a_r_stack = []
                # cross attention
                for index in range(self._conf['stack_num'] + 1):
                    # print('[cross attention] stack index = ', index)
                    with tf.variable_scope('t_attend_r_' + str(index)):
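                        # the first context turn creates the variables in this scope;
                        # later turns hit a ValueError on re-creation, so the scope is
                        # switched to reuse mode and the block is rebuilt with shared
                        # weights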
                        try:
                            # [batch, max_turn_len, emb_size]
                            t_a_r = layers.block(  # attentive module
                                Hu_stack[index], Hr_stack[index],
                                Hr_stack[index],
                                Q_lengths=t_turn_length,
                                K_lengths=self.response_len)
                        except ValueError:
                            tf.get_variable_scope().reuse_variables()
                            t_a_r = layers.block(
                                # [batch, max_turn_len, emb_size]
                                Hu_stack[index], Hr_stack[index],
                                Hr_stack[index],
                                Q_lengths=t_turn_length,
                                K_lengths=self.response_len)
                        # print('[cross attention t_attend_r_] stack index: %d, t_a_r.shape: %s' % (
                        #         index, t_a_r.shape))

                    with tf.variable_scope('r_attend_t_' + str(index)):
                        try:
                            # [batch, max_turn_len, emb_size]
                            r_a_t = layers.block(  # attentive module
                                Hr_stack[index], Hu_stack[index],
                                Hu_stack[index],
                                Q_lengths=self.response_len,
                                K_lengths=t_turn_length)
                        except ValueError:
                            tf.get_variable_scope().reuse_variables()
                            r_a_t = layers.block(
                                Hr_stack[index], Hu_stack[index],
                                Hu_stack[index],
                                Q_lengths=self.response_len,
                                K_lengths=t_turn_length)
                        # print('[cross attention r_a_t_] stack index: %d, r_a_t.shape: %s' % (
                        #         index, r_a_t.shape))

                    t_a_r_stack.append(t_a_r)
                    r_a_t_stack.append(r_a_t)
                    # print('[cross attention] len(t_a_r_stack):', len(t_a_r_stack))
                    # print('[cross attention] len(r_a_t_stack):', len(r_a_t_stack))

                # print('[before extend] len(t_a_r_stack):', len(t_a_r_stack))
                # print('[before extend] len(r_a_t_stack):', len(r_a_t_stack))
                # lyang comments: 3D aggregation
                t_a_r_stack.extend(
                    Hu_stack)  # half from self-attention; half from cross-attention
                r_a_t_stack.extend(
                    Hr_stack)  # half from self-attention; half from cross-attention
                # after extend, len(t_a_r_stack)) = 2*(stack_num+1)

                # print('[after extend] len(t_a_r_stack):', len(t_a_r_stack))
                # print('[after extend] len(r_a_t_stack):', len(r_a_t_stack))

                t_a_r = tf.stack(t_a_r_stack, axis=-1)
                r_a_t = tf.stack(r_a_t_stack, axis=-1)

                # print('after stack along the last dimension: ')
                # print('t_a_r shape: %s' % t_a_r.shape)
                # print('r_a_t shape: %s' % r_a_t.shape)
                # after stack, t_a_r and r_a_t are (batch, max_turn_len, embed_size, 2*(stack_num+1))

                with tf.variable_scope('intent_based_attention',
                                       reuse=tf.AUTO_REUSE): # share parameter across different turns
                    # there are 3 different ways to implement intent-based attention;
                    # implement all three variations and compare their effectiveness
                    # in the model ablation analysis
                    # let I_u_t and I_r_k be intent vectors of shape [12,1]
                    # 1. dot: w * [I_u_t, I_r_k], where w is [24,1]
                    # 2. bilinear: I_u_t' * w * I_r_k, where w is [12,12]
                    # 3. outprod: I_u_t * I_r_k' -> [12,12] outer product ->
                    #             flatten to [144,1], then w * outprod,
                    #             where w is [1,144]
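                    # a minimal sketch of the bilinear variant (an assumption; the
                    # actual layers.attention_intent implementation may differ):
                    #   w = tf.get_variable('intent_bilinear_w',
                    #                       [intent_size, intent_size])
                    #   score = tf.reduce_sum(
                    #       t_intent * tf.matmul(self.response_intent, w), axis=-1)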
                    attention_logits = layers.attention_intent(t_intent,
                                        self.response_intent,
                                        self._conf['intent_attention_type'])
                    # print('[intent_based_attention] attention_logits.shape: %s' % attention_logits.shape)
                    attention_turns.append(attention_logits)

                # calculate similarity matrix
                with tf.variable_scope('similarity'):
                    # sim shape [batch, max_turn_len, max_turn_len, 2*(stack_num+1)]
                    # divide sqrt(200) to prevent gradient explosion
                    # A_biks * B_bjks -> C_bijs
                    sim = tf.einsum('biks,bjks->bijs', t_a_r, r_a_t) / tf.sqrt(
                        200.0)
                    # (batch, max_turn_len, embed_size, 2*(stack_num+1)) *
                    # (batch, max_turn_len, embed_size, 2*(stack_num+1)) ->
                    # [batch, max_turn_len, max_turn_len, 2*(stack_num+1)]
                    # where k corresponds to the embed_size dimension, which is
                    # summed out by the einsum dot product
                    # print('[similarity] after einsum dot prod sim shape: %s' % sim.shape)
                    # [batch, max_turn_len, max_turn_len, 2*(stack_num+1)]
                    # ! the intent-based attention weights are applied after all turns
                    # are stacked (see the einsum over sim and attention below),
                    # producing the weighted stack

                sim_turns.append(sim)
                # print('[similarity] after append, len(sim_turns):', len(sim_turns))

            attention_logits = tf.stack(attention_turns, axis=1) # [batch, max_turn_num]
            print('[attention_logits] after stack attention_logits.shape: %s' % attention_logits.shape)
            # add mask in attention following the way in BERT
            # real turn_num is in self.tt_turns_len [batch]
            # return a mask tensor with shape [batch,  conf['max_turn_num']]
            attention_mask = tf.sequence_mask(self.tt_turns_len, self._conf['max_turn_num'],
                                              dtype=tf.float32)
            print('[attention_mask] attention_mask.shape: %s' % attention_mask.shape)
            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            adder = (1.0 - attention_mask) * -10000.0

            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_logits += adder
            attention = tf.nn.softmax(attention_logits) # by default softmax along dim=-1 [batch, max_turn_num]
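            # padded turns received a -10000.0 logit above, so after this softmax
            # their attention weight is effectively zero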
            print('[attention] attention.shape: %s' % attention.shape)
            self.attention = attention # will print it for visualization

            # cnn and aggregation
            # lyang comments aggregation by 3D CNN layer
            # [3d cnn aggregation] sim shape: (32, 9, 180, 180, 10)
            # conv_0 shape: (32, 9, 180, 180, 16)
            # pooling_0 shape: (32, 3, 60, 60, 16)
            # conv_1 shape: (32, 3, 60, 60, 16)
            # pooling_1 shape: (32, 1, 20, 20, 16)
            # [3d cnn aggregation] final_info: (32, 6400) # [batch * feature_size]
            # [batch, max_turn_num, max_turn_len, max_turn_len, 2*(stack_num+1)]
            # (32, 9, 180, 180, 10)
            sim = tf.stack(sim_turns, axis=1)
            # multiply sim by the per-turn attention score
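            # the einsum broadcasts the scalar weight attention[b, i] over turn i's
            # [max_turn_len, max_turn_len, 2*(stack_num+1)] similarity cube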
            sim = tf.einsum('bijks,bi->bijks', sim, attention)
            print('[3d cnn aggregation] sim shape: %s' % sim.shape)
            with tf.variable_scope('cnn_aggregation'):
                final_info = layers.CNN_3d(sim, self._conf['cnn_3d_oc0'],
                                           self._conf['cnn_3d_oc1'])
                # for udc
                # final_info = layers.CNN_3d(sim, 32, 16)
                # for douban
                # final_info = layers.CNN_3d(sim, 16, 16)

            print('[3d cnn aggregation] final_info: %s' % final_info.shape)
            # loss and train
            with tf.variable_scope('loss'):
                self.loss, self.logits = layers.loss(final_info, self.label)

                self.global_step = tf.Variable(0, trainable=False)
                initial_learning_rate = self._conf['learning_rate']
                self.learning_rate = tf.train.exponential_decay(
                    initial_learning_rate,
                    global_step=self.global_step,
                    decay_steps=400,
                    decay_rate=0.9,
                    staircase=True)

                Optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.optimizer = Optimizer.minimize(
                    self.loss,
                    global_step=self.global_step)

                self.init = tf.global_variables_initializer()
                self.saver = tf.train.Saver(
                    max_to_keep=self._conf["max_to_keep"])
                self.all_variables = tf.global_variables()
                self.all_operations = self._graph.get_operations()
                self.grads_and_vars = Optimizer.compute_gradients(self.loss)

                for grad, var in self.grads_and_vars:
                    if grad is None:
                        print(var)

                self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var)
                                   for grad, var in self.grads_and_vars
                                   if grad is not None]
                self.g_updates = Optimizer.apply_gradients(
                    self.capped_gvs,
                    global_step=self.global_step)

        return self._graph
Example #7
    def create_network(self):
        mask_cache = dict() if self.use_mask_cache else None

        response_emb = fluid.layers.embedding(
            input=self.response,
            size=[self._vocab_size + 1, self._emb_size],
            is_sparse=self.use_sparse_embedding,
            param_attr=fluid.ParamAttr(
                name=self.word_emb_name,
                initializer=fluid.initializer.Normal(scale=0.1)))

        # response part
        Hr = response_emb
        Hr_stack = [Hr]

        for index in six.moves.xrange(self._stack_num):
            Hr = layers.block(
                name="response_self_stack" + str(index),
                query=Hr,
                key=Hr,
                value=Hr,
                d_key=self._emb_size,
                q_mask=self.response_mask,
                k_mask=self.response_mask,
                mask_cache=mask_cache)
            Hr_stack.append(Hr)

        # context part
        sim_turns = []
        for t in six.moves.xrange(self._max_turn_num):
            Hu = fluid.layers.embedding(
                input=self.turns_data[t],
                size=[self._vocab_size + 1, self._emb_size],
                is_sparse=self.use_sparse_embedding,
                param_attr=fluid.ParamAttr(
                    name=self.word_emb_name,
                    initializer=fluid.initializer.Normal(scale=0.1)))
            Hu_stack = [Hu]

            for index in six.moves.xrange(self._stack_num):
                # share parameters
                Hu = layers.block(
                    name="turn_self_stack" + str(index),
                    query=Hu,
                    key=Hu,
                    value=Hu,
                    d_key=self._emb_size,
                    q_mask=self.turns_mask[t],
                    k_mask=self.turns_mask[t],
                    mask_cache=mask_cache)
                Hu_stack.append(Hu)

            # cross attention
            r_a_t_stack = []
            t_a_r_stack = []
            for index in six.moves.xrange(self._stack_num + 1):
                t_a_r = layers.block(
                    name="t_attend_r_" + str(index),
                    query=Hu_stack[index],
                    key=Hr_stack[index],
                    value=Hr_stack[index],
                    d_key=self._emb_size,
                    q_mask=self.turns_mask[t],
                    k_mask=self.response_mask,
                    mask_cache=mask_cache)
                r_a_t = layers.block(
                    name="r_attend_t_" + str(index),
                    query=Hr_stack[index],
                    key=Hu_stack[index],
                    value=Hu_stack[index],
                    d_key=self._emb_size,
                    q_mask=self.response_mask,
                    k_mask=self.turns_mask[t],
                    mask_cache=mask_cache)

                t_a_r_stack.append(t_a_r)
                r_a_t_stack.append(r_a_t)

            t_a_r_stack.extend(Hu_stack)
            r_a_t_stack.extend(Hr_stack)

            if self.use_stack_op:
                t_a_r = fluid.layers.stack(t_a_r_stack, axis=1)
                r_a_t = fluid.layers.stack(r_a_t_stack, axis=1)
            else:
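                # emulate fluid.layers.stack: insert a new axis on every tensor
                # and concatenate along it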
                for index in six.moves.xrange(len(t_a_r_stack)):
                    t_a_r_stack[index] = fluid.layers.unsqueeze(
                        input=t_a_r_stack[index], axes=[1])
                    r_a_t_stack[index] = fluid.layers.unsqueeze(
                        input=r_a_t_stack[index], axes=[1])

                t_a_r = fluid.layers.concat(input=t_a_r_stack, axis=1)
                r_a_t = fluid.layers.concat(input=r_a_t_stack, axis=1)

            # sim shape: [batch_size, 2*(stack_num+1), max_turn_len, max_turn_len]
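            # alpha folds in the same 1/sqrt(200) scaling used in the TF versions
            # above to damp the dot products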
            sim = fluid.layers.matmul(
                x=t_a_r, y=r_a_t, transpose_y=True, alpha=1 / np.sqrt(200.0))
            sim_turns.append(sim)

        if self.use_stack_op:
            sim = fluid.layers.stack(sim_turns, axis=2)
        else:
            for index in six.moves.xrange(len(sim_turns)):
                sim_turns[index] = fluid.layers.unsqueeze(
                    input=sim_turns[index], axes=[2])
            # sim shape: [batch_size, 2*(stack_num+1), max_turn_num, max_turn_len, max_turn_len]
            sim = fluid.layers.concat(input=sim_turns, axis=2)

        final_info = layers.cnn_3d(sim, self._channel1_num, self._channel2_num)
        loss, logits = layers.loss(final_info, self.label)
        return loss, logits
Example #8
File: net.py Project: kifish/IACMN
    def build_graph(self):
        with self._graph.as_default():
            if self._conf['rand_seed'] is not None:
                rand_seed = self._conf['rand_seed']
                tf.set_random_seed(rand_seed)
                print('set tf random seed: %s' % self._conf['rand_seed'])

            #word embedding
            with tf.device('/cpu:0'), tf.name_scope("embedding"):
                self._word_embedding = tf.get_variable(
                    'word_embedding',
                    shape=(self._conf['vocab_size'], self._conf['emb_size']),
                    dtype=tf.float32,
                    trainable=False)

                self.emb_placeholder = tf.placeholder(
                    tf.float32,
                    shape=[self._conf['vocab_size'], self._conf['emb_size']])

                self.emb_init = self._word_embedding.assign(
                    self.emb_placeholder)

            #define placeholders
            self.turns = tf.placeholder(  # context data
                tf.int32,
                shape=[
                    None, self._conf["max_turn_num"],
                    self._conf["max_turn_len"]
                ])

            self.tt_turns_len = tf.placeholder(  # utterance num of context
                tf.int32, shape=[None])

            self.every_turn_len = tf.placeholder(  # length of each utterance in context
                tf.int32,
                shape=[None, self._conf["max_turn_num"]])

            self.response = tf.placeholder(  # response data
                tf.int32,
                shape=[None, self._conf["max_turn_len"]])

            self.response_len = tf.placeholder(  # response len
                tf.int32, shape=[None])

            self.label = tf.placeholder(  # scale label
                tf.float32, shape=[None])

            self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                    name="dropout_keep_prob")

            #define operations
            #build response embedding
            Hr = tf.nn.embedding_lookup(self._word_embedding, self.response)
            Hr = tf.nn.dropout(Hr, self.dropout_keep_prob)

            if self._conf['is_positional']:
                with tf.variable_scope('positional'):
                    Hr = op.positional_encoding_vector(Hr, max_timescale=10)

            with tf.variable_scope('attention_cnn_block'):
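                # agdr_block presumably builds a stack of dilated-CNN layers
                # (configured by repeat_times, delation_list, dcnn_filter_width,
                # dcnn_channel) and returns one feature map per level for
                # multi-granularity matching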
                hr_conv_list = layers.agdr_block(
                    Hr, self._conf['repeat_times'],
                    self._conf['delation_list'],
                    self._conf['dcnn_filter_width'],
                    self._conf['dcnn_channel'], self.dropout_keep_prob)

            list_turn_t = tf.unstack(self.turns, axis=1)
            list_turn_length = tf.unstack(self.every_turn_len, axis=1)

            reuse = None
            sim_turns = []
            #for every turn_t, build embedding and calculate matching vector
            for turn_t, t_turn_length in zip(list_turn_t, list_turn_length):

                Hu = tf.nn.embedding_lookup(self._word_embedding, turn_t)
                Hu = tf.nn.dropout(Hu, self.dropout_keep_prob)

                if self._conf['is_positional']:
                    with tf.variable_scope('positional', reuse=True):
                        Hu = op.positional_encoding_vector(Hu,
                                                           max_timescale=10)

                # multi-level sim matrix of response and each utterance
                sim_matrix = [layers.Word_Sim(Hr, Hu)]

                with tf.variable_scope('attention_cnn_block', reuse=True):
                    hu_conv_list = layers.agdr_block(
                        Hu, self._conf['repeat_times'],
                        self._conf['delation_list'],
                        self._conf['dcnn_filter_width'],
                        self._conf['dcnn_channel'], self.dropout_keep_prob)

                for index in range(len(hu_conv_list)):
                    with tf.variable_scope('segment_sim'):
                        sim_matrix.append(
                            layers.Word_Sim(hr_conv_list[index],
                                            hu_conv_list[index]))

                sim_matrix = tf.stack(sim_matrix,
                                      axis=-1,
                                      name='one_matrix_stack')

                with tf.variable_scope('cnn_aggregation', reuse=tf.AUTO_REUSE):
                    matching_vector = layers.CNN_2d(sim_matrix, 32, 16,
                                                    self.dropout_keep_prob)
                if not reuse:
                    reuse = True

                sim_turns.append(matching_vector)

            #aggregation with a gru
            sim = tf.stack(sim_turns, axis=1, name='matching_stack')

            with tf.variable_scope("sent_rnn"):
                sent_rnn_outputs, _ = layers.bigru_sequence(
                    sim, 64, None, self.dropout_keep_prob)  # TODO:CHECK

            # attention at sentence level:
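            # sent_rnn_outputs is presumably the (forward, backward) output pair of
            # the bi-GRU; concatenating on axis 2 yields one feature vector per turn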
            sent_atten_inputs = tf.concat(sent_rnn_outputs, 2)

            with tf.variable_scope("sent_atten"):
                rev_outs, alphas_sents = layers.intro_attention(
                    sent_atten_inputs, 50)

            #loss and train
            with tf.variable_scope('loss'):
                self.loss, self.logits = layers.loss(rev_outs,
                                                     self.label,
                                                     is_clip=True)

                self.global_step = tf.Variable(0, trainable=False)
                initial_learning_rate = self._conf['learning_rate']
                self.learning_rate = tf.train.exponential_decay(
                    initial_learning_rate,
                    global_step=self.global_step,
                    decay_steps=5000,
                    decay_rate=0.96,
                    staircase=True)

                Optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.optimizer = Optimizer.minimize(
                    self.loss, global_step=self.global_step)

                self.init = tf.global_variables_initializer()
                self.saver = tf.train.Saver(
                    max_to_keep=self._conf["max_to_keep"])
                self.all_variables = tf.global_variables()
                self.all_operations = self._graph.get_operations()
                self.grads_and_vars = Optimizer.compute_gradients(self.loss)

                for grad, var in self.grads_and_vars:
                    if grad is None:
                        print(var)

                self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var)
                                   for grad, var in self.grads_and_vars
                                   if grad is not None]
                self.g_updates = Optimizer.apply_gradients(
                    self.capped_gvs, global_step=self.global_step)

            # summary
            grad_summaries = []
            for g, v in self.grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "gradient/{}/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "gradient/{}/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)

            grad_summaries_merged = tf.summary.merge(grad_summaries)

            logit_summary = tf.summary.histogram("{}".format(self.logits.name),
                                                 self.logits)

            # Loss Summaries
            loss_summary = tf.summary.scalar("loss", self.loss)
            # Train, Dev Summaries
            self.train_summary_op = tf.summary.merge(
                [loss_summary, logit_summary, grad_summaries_merged])
            self.dev_summary_op = tf.summary.merge([
                loss_summary,
            ])

        return self._graph