def match_layer_selfatt(x5_e_ts, x5_enc_cur_list, x5_mask_cur_list, x_e_ts, x_enc_mb2_ts, x_mask_mb2_ts, emb_dim, initializer_opt, turn_num5, matchin_include_x=False): # print(x5_enc_cur_list.shape) # bs*turn5 sent_len emb_dim # print(x5_mask_cur_list.shape) # bs*turn5 sent_len # print(x_enc_mb2_ts.shape) # bs*turn5 turn1*sent_len emb_dim # print(x_mask_mb2_ts.shape) # bs*turn5 turn1*sent_len if matchin_include_x: a_list = [x5_e_ts, x5_enc_cur_list] b_list = [x_e_ts, x_enc_mb2_ts] else: a_list = [x5_enc_cur_list] b_list = [x_enc_mb2_ts] with tf.variable_scope("atb", reuse=tf.AUTO_REUSE): atb = layers.block(x5_enc_cur_list, x_enc_mb2_ts, x_enc_mb2_ts, Q_lengths=x5_mask_cur_list, K_lengths=x_mask_mb2_ts, use_len=True) with tf.variable_scope("bta", reuse=tf.AUTO_REUSE): bta = layers.block(x_enc_mb2_ts, x5_enc_cur_list, x5_enc_cur_list, Q_lengths=x_mask_mb2_ts, K_lengths=x5_mask_cur_list, use_len=True) a_list.append(atb) b_list.append(bta) a_list = tf.stack(a_list, axis=-1) b_list = tf.stack(b_list, axis=-1) sim_ori = tf.einsum('biks,bjks->bijs', a_list, b_list) / tf.sqrt(200.0) sim = layers.CNN_FZX(sim_ori) if turn_num5 is not None: sim = tf.reshape(sim, [-1, turn_num5, sim.shape[-1]]) return sim, sim_ori
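# Illustrative NumPy sketch (not part of the model code) of the similarity cube built in
# match_layer_selfatt above: two feature stacks of shape [batch, len, emb, channels] are
# contracted over the embedding axis with einsum('biks,bjks->bijs') and scaled by
# 1/sqrt(200). All shapes below are made-up examples.
import numpy as np

batch, len_a, len_b, emb, channels = 2, 5, 7, 200, 2
a_stack = np.random.randn(batch, len_a, emb, channels)   # e.g. [x5_enc, atb] stacked on axis=-1
b_stack = np.random.randn(batch, len_b, emb, channels)   # e.g. [x_enc,  bta] stacked on axis=-1

# contract the embedding axis (k) for every channel (s): result is [batch, len_a, len_b, channels]
sim_ori_demo = np.einsum('biks,bjks->bijs', a_stack, b_stack) / np.sqrt(200.0)
print(sim_ori_demo.shape)  # (2, 5, 7, 2)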
def build_graph(self): with self._graph.as_default(): rand_seed = self._conf['rand_seed'] tf.set_random_seed(rand_seed) #word embedding if self._word_embedding_init is not None: word_embedding_initializer = tf.constant_initializer( self._word_embedding_init) else: word_embedding_initializer = tf.random_normal_initializer( stddev=0.1) self._word_embedding = tf.get_variable( name='word_embedding', shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']], dtype=tf.float32, initializer=word_embedding_initializer) #define placehloders self.turns = tf.placeholder(tf.int32, shape=[ self._conf["batch_size"], self._conf["max_turn_num"], self._conf["max_turn_len"] ]) self.tt_turns_len = tf.placeholder( tf.int32, shape=[self._conf["batch_size"]]) self.every_turn_len = tf.placeholder( tf.int32, shape=[self._conf["batch_size"], self._conf["max_turn_num"]]) self.response = tf.placeholder( tf.int32, shape=[self._conf["batch_size"], self._conf["max_turn_len"]]) self.response_len = tf.placeholder( tf.int32, shape=[self._conf["batch_size"]]) self.label = tf.placeholder(tf.float32, shape=[self._conf["batch_size"]]) #define operations #response part Hr = tf.nn.embedding_lookup(self._word_embedding, self.response) if self._conf['is_positional'] and self._conf['stack_num'] > 0: with tf.variable_scope('positional'): Hr = op.positional_encoding_vector(Hr, max_timescale=10) for index in range(self._conf['stack_num']): with tf.variable_scope('self_stack_' + str(index)): Hr = layers.block(Hr, Hr, Hr, Q_lengths=self.response_len, K_lengths=self.response_len) #context part #a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len] list_turn_t = tf.unstack(self.turns, axis=1) list_turn_length = tf.unstack(self.every_turn_len, axis=1) sim_turns = [] #for every turn_t calculate matching vector for turn_t, t_turn_length in zip(list_turn_t, list_turn_length): Hu = tf.nn.embedding_lookup( self._word_embedding, turn_t) #[batch, max_turn_len, emb_size] if self._conf['is_positional'] and self._conf['stack_num'] > 0: with tf.variable_scope('positional', reuse=True): Hu = op.positional_encoding_vector(Hu, max_timescale=10) for index in range(self._conf['stack_num']): with tf.variable_scope('self_stack_' + str(index), reuse=True): Hu = layers.block(Hu, Hu, Hu, Q_lengths=t_turn_length, K_lengths=t_turn_length) with tf.variable_scope('u_attentd_r_' + str(index)): try: u_a_r = layers.block(Hu, Hr, Hr, Q_lengths=t_turn_length, K_lengths=self.response_len) except ValueError: tf.get_variable_scope().reuse_variables() u_a_r = layers.block(Hu, Hr, Hr, Q_lengths=t_turn_length, K_lengths=self.response_len) with tf.variable_scope('r_attend_u_' + str(index)): try: r_a_u = layers.block(Hr, Hu, Hu, Q_lengths=self.response_len, K_lengths=t_turn_length) except ValueError: tf.get_variable_scope().reuse_variables() r_a_u = layers.block(Hr, Hu, Hu, Q_lengths=self.response_len, K_lengths=t_turn_length) u_a_r = tf.stack([u_a_r, Hu], axis=-1) r_a_u = tf.stack([r_a_u, Hr], axis=-1) #calculate similarity matrix with tf.variable_scope('similarity'): # sim shape [batch, max_turn_len, max_turn_len, 2*stack_num+1] # divide sqrt(200) to prevent gradient explosion sim = tf.einsum('biks,bjks->bijs', r_a_u, u_a_r) / tf.sqrt(200.0) sim_turns.append(sim) #cnn and aggregation sim = tf.stack(sim_turns, axis=1) print('sim shape: %s' % sim.shape) with tf.variable_scope('cnn_aggregation'): final_info = layers.CNN_3d(sim, 32, 16) #for douban #final_info = layers.CNN_3d(sim, 16, 16) #loss and train with tf.variable_scope('loss'): 
                self.loss, self.logits = layers.loss(final_info, self.label)
                self.global_step = tf.Variable(0, trainable=False)
                initial_learning_rate = self._conf['learning_rate']
                self.learning_rate = tf.train.exponential_decay(
                    initial_learning_rate,
                    global_step=self.global_step,
                    decay_steps=400,
                    decay_rate=0.9,
                    staircase=True)

                Optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.optimizer = Optimizer.minimize(self.loss)

                self.init = tf.global_variables_initializer()
                self.saver = tf.train.Saver(
                    max_to_keep=self._conf["max_to_keep"])
                self.all_variables = tf.global_variables()
                self.all_operations = self._graph.get_operations()

                self.grads_and_vars = Optimizer.compute_gradients(self.loss)
                for grad, var in self.grads_and_vars:
                    if grad is None:
                        print(var)

                self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var)
                                   for grad, var in self.grads_and_vars]
                self.g_updates = Optimizer.apply_gradients(
                    self.capped_gvs, global_step=self.global_step)

            return self._graph
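# Plain-Python sketch (outside the graph, values made up) of the staircase exponential
# decay configured above: lr = initial_lr * decay_rate ** floor(step / decay_steps).
def staircase_decay(initial_lr, step, decay_steps=400, decay_rate=0.9):
    return initial_lr * decay_rate ** (step // decay_steps)

for step in (0, 399, 400, 1200):
    print(step, staircase_decay(1e-3, step))
# 0 -> 1e-3, 399 -> 1e-3, 400 -> 9e-4, 1200 -> 7.29e-4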
def msn_model(input_x, input_x_mask, input_y, input_y_mask, word_emb, keep_rate, conf, x_len=None, y_len=None): turn_num = input_x.shape[1] sent_len = conf["max_turn_len"] emb_dim = conf["emb_size"] is_mask = False is_layer_norm = False # init = None init = tf.contrib.layers.xavier_initializer() init1 = tf.contrib.layers.xavier_initializer() init1 = tf.random_uniform_initializer(0.0, 1.0) Hr = tf.nn.embedding_lookup(word_emb, input_y) # bs len emb Hu = tf.nn.embedding_lookup(word_emb, input_x) # bs turn len emb x_len = tf.reshape(x_len, [-1]) y_len = tf.tile(tf.expand_dims(y_len, axis=1), [1, turn_num]) y_len = tf.reshape(y_len, [-1]) with tf.variable_scope('enc', reuse=tf.AUTO_REUSE): # context selector context_ = tf.reshape(Hu, [-1, sent_len, emb_dim]) context_ = layers.block(context_, context_, context_, Q_lengths=x_len, K_lengths=x_len, is_mask=is_mask, is_layer_norm=is_layer_norm, init=init) context_ = tf.reshape(context_, [-1, turn_num, sent_len, emb_dim]) W_word = tf.get_variable( name='w_word', shape=[emb_dim, emb_dim, turn_num], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) # 200 200 10 v = tf.get_variable( name='v', shape=[turn_num, 1], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) # 10 1 ss = [] for hop_index in [1, 2, 3]: kk = Hu[:, turn_num - hop_index:, :, :] kk = tf.reduce_mean(kk, axis=1) kk = layers.block(kk, kk, kk, is_mask=False, is_layer_norm=is_layer_norm, init=init) # kk context_ A = tf.einsum("blrm,mdh,bud->blruh", context_, W_word, kk) / tf.sqrt(200.0) A = tf.einsum("blruh,hp->blrup", A, v) # bs turn_num sent_len sent_len 1 A = tf.squeeze(A, [ 4, ]) # bs turn_num sent_len sent_len A1 = tf.reduce_max(A, axis=2) # bs turn_num sent_len A2 = tf.reduce_max(A, axis=3) # bs turn_num sent_len a = tf.concat([A1, A2], axis=-1) # bs turn_num sent_len*2 a = tf.layers.dense(a, 1, kernel_initializer=init1) # bs turn_num 1 a = tf.squeeze(a, [ 2, ]) # bs turn_num s1 = tf.nn.softmax(a, axis=1) # kk context_ kk1 = tf.reduce_mean(kk, axis=1) # bs emb context1 = tf.reduce_mean(context_, axis=2) # bs turn emb norm1 = tf.norm(context1, axis=-1) norm2 = tf.norm(kk1, axis=-1, keepdims=True) # print(context1.shape) # bs 10 200 # print(kk1.shape) # bs 200 # print(norm1.shape) # bs 10 # print(norm2.shape) # bs 1 # exit() s2 = tf.einsum("bud,bd->bu", context1, kk1) / (1e-6 + norm1 * norm2 ) # bs turn # print(s1.shape, s2.shape) # exit() s = 0.5 * s1 + 0.5 * s2 ss.append(s) #s = tf.expand_dims(s, axis=-1) s = tf.stack(ss, axis=-1) s = tf.layers.dense(s, 1, kernel_initializer=init1) # bs turn_num 1 s = tf.squeeze(s, [ 2, ]) # bs turn_num if "douban" in conf["data_path"]: grmmar_score = 0.3 else: grmmar_score = 0.5 s_mask1 = tf.nn.sigmoid(s) s_mask = tf.math.greater(s_mask1, grmmar_score) s_mask = tf.cast(s_mask, tf.float32) final_score = [s, s_mask1] s = s * s_mask Hu = Hu * tf.expand_dims(tf.expand_dims(s, axis=-1), axis=-1) Hu = tf.reshape(Hu, [-1, sent_len, emb_dim]) Hr = tf.tile(tf.expand_dims(Hr, axis=1), [1, turn_num, 1, 1]) Hr = tf.reshape(Hr, [-1, sent_len, emb_dim]) # UR Matching Hu Hr def distance(A, B, C, epsilon=1e-6): Ma = tf.einsum("bum,md,brd->bur", A, B, C) A_norm = tf.norm(A, axis=-1) C_norm = tf.norm(C, axis=-1) norm_score = tf.einsum("bu,br->bur", A_norm, C_norm) + epsilon # norm_score = tf.math.maximum(norm_score, 1.0) Mb = tf.einsum("bud,brd->bur", A, C) / norm_score return Ma, Mb, norm_score v1 = tf.get_variable( name='v1', shape=[emb_dim, emb_dim], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) M1, M2, 
norm_score1 = distance(Hu, v1, Hr) with tf.variable_scope('enc11', reuse=tf.AUTO_REUSE): Hu1 = layers.block(Hu, Hu, Hu, Q_lengths=x_len, K_lengths=x_len, is_mask=is_mask, is_layer_norm=is_layer_norm, init=init) with tf.variable_scope('enc12', reuse=tf.AUTO_REUSE): Hr1 = layers.block(Hr, Hr, Hr, Q_lengths=y_len, K_lengths=y_len, is_mask=is_mask, is_layer_norm=is_layer_norm, init=init) v2 = tf.get_variable( name='v2', shape=[emb_dim, emb_dim], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) M3, M4, norm_score2 = distance(Hu1, v2, Hr1) with tf.variable_scope('enc21', reuse=tf.AUTO_REUSE): Hu1 = layers.block(Hu, Hr, Hr, Q_lengths=x_len, K_lengths=y_len, is_mask=is_mask, is_layer_norm=is_layer_norm, init=init) with tf.variable_scope('enc22', reuse=tf.AUTO_REUSE): Hr1 = layers.block(Hr, Hu, Hu, Q_lengths=y_len, K_lengths=x_len, is_mask=is_mask, is_layer_norm=is_layer_norm, init=init) v3 = tf.get_variable( name='v3', shape=[emb_dim, emb_dim], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) M5, M6, norm_score3 = distance(Hu1, v3, Hr1) final_score = [norm_score1, norm_score2, norm_score3] final_score = [M2, M4, M6] M = tf.stack([M1, M2, M3, M4, M5, M6], axis=1) # bs*turn 6 sent_len sent_len M = layers.CNN_MSN(M, init=init) # bs*turn 128 M = tf.layers.dense( M, 300, activation=tf.nn.tanh, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="dense1") # bs turn_num 1 M = tf.reshape(M, [-1, turn_num, 300]) gru = tf.contrib.rnn.GRUCell(300) M = tf.nn.dynamic_rnn(gru, M, dtype=tf.float32) final_info = M[0][:, -1, :] final_info = tf.layers.dropout(final_info, rate=1.0 - keep_rate) return final_info, final_score
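# NumPy sketch (illustrative shapes, not model code) of the utterance-level selector
# score s2 in msn_model above: cosine similarity between every pooled turn representation
# and the pooled key representation, matching einsum("bud,bd->bu") / (norm1 * norm2).
import numpy as np

bs, turn_num, emb = 2, 10, 200
context1 = np.random.randn(bs, turn_num, emb)   # mean-pooled context turns
kk1 = np.random.randn(bs, emb)                  # mean-pooled key utterances

norm1 = np.linalg.norm(context1, axis=-1)               # [bs, turn_num]
norm2 = np.linalg.norm(kk1, axis=-1, keepdims=True)     # [bs, 1]
s2 = np.einsum("bud,bd->bu", context1, kk1) / (1e-6 + norm1 * norm2)
print(s2.shape)  # (2, 10), each entry in [-1, 1]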
def build_graph(self):
    with self._graph.as_default():
        if self._conf['rand_seed'] is not None:
            rand_seed = self._conf['rand_seed']
            tf.set_random_seed(rand_seed)
            print('set tf random seed: %s' % self._conf['rand_seed'])

        #word embedding
        if self._word_embedding_init is not None:
            word_embedding_initializer = tf.constant_initializer(
                self._word_embedding_init)
        else:
            word_embedding_initializer = tf.random_normal_initializer(
                stddev=0.1)

        self._word_embedding = tf.get_variable(
            name='word_embedding',
            shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']],
            dtype=tf.float32,
            initializer=word_embedding_initializer)

        #define placeholders
        #config max_turn_history_num
        self.turns_history = tf.placeholder(
            tf.int32,
            shape=[
                self._conf["batch_size"],
                self._conf["max_turn_history_num"],
                self._conf["max_turn_len"]
            ])

        self.turns = tf.placeholder(tf.int32,
                                    shape=[
                                        self._conf["batch_size"],
                                        self._conf["max_turn_num"],
                                        self._conf["max_turn_len"]
                                    ])

        self.tt_turns_len = tf.placeholder(
            tf.int32, shape=[self._conf["batch_size"]])

        self.every_turn_len = tf.placeholder(
            tf.int32,
            shape=[self._conf["batch_size"], self._conf["max_turn_num"]])

        self.response = tf.placeholder(
            tf.int32,
            shape=[self._conf["batch_size"], self._conf["max_turn_len"]])

        self.response_len = tf.placeholder(
            tf.int32, shape=[self._conf["batch_size"]])

        self.label = tf.placeholder(tf.float32,
                                    shape=[self._conf["batch_size"]])

        #define operations
        #response part
        Hr = tf.nn.embedding_lookup(self._word_embedding, self.response)
        turns_history_embedding = tf.nn.embedding_lookup(
            self._word_embedding, self.turns_history)

        if self._conf['is_positional'] and self._conf['stack_num'] > 0:
            with tf.variable_scope('positional'):
                Hr = op.positional_encoding_vector(Hr, max_timescale=10)
        Hr_stack = [Hr]

        _batch_size, _turn_nums, _turn_words, _emb_size = turns_history_embedding.get_shape(
        ).as_list()
        turns_history_embedding = tf.reshape(turns_history_embedding,
                                             [-1, _turn_words, _emb_size])
        for index in range(self._conf['stack_num']):
            turns_history_embedding, _ = self._multihead(
                turns_history_embedding, turns_history_embedding,
                turns_history_embedding)
        turns_history_embedding = tf.reshape(
            turns_history_embedding,
            [_batch_size, _turn_nums, _turn_words, _emb_size])

        for index in range(self._conf['stack_num']):
            with tf.variable_scope('self_stack_' + str(index)):
                Hr = layers.block(Hr,
                                  Hr,
                                  Hr,
                                  Q_lengths=self.response_len,
                                  K_lengths=self.response_len)
            Hr_stack.append(Hr)

        with tf.variable_scope('response_extraction_history'):
            turn_important_inf = []
            # need to add a fully-connected layer here
            for _t in tf.split(turns_history_embedding,
                               self._conf['max_turn_history_num'], 1):
                _t = tf.squeeze(_t)
                #_match_result = layers.attention(Hr_stack[-1], _t, _t, self.response_len, self.response_len)
                _match_result = layers.attention(
                    self._dense1(Hr_stack[-1]), _t, _t, self.response_len,
                    self.response_len)
                turn_important_inf.append(tf.expand_dims(_match_result, 1))
            best_turn_match = tf.concat(turn_important_inf, 1)

        with tf.variable_scope('response_extraction_best_information'):
            #best_information,_ = self._multihead(Hr_stack[-1], best_turn_match, best_turn_match)
            best_information, _ = self._multihead(
                self._dense2(Hr_stack[-1]), best_turn_match, best_turn_match)
            best_information = layers.FFN(best_information)

        #context part
        #a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len]
        list_turn_t = tf.unstack(self.turns, axis=1)
        list_turn_length = tf.unstack(self.every_turn_len, axis=1)

        sim_turns = []
        #for every turn_t calculate matching vector
        for turn_t, t_turn_length in zip(list_turn_t,
list_turn_length): Hu = tf.nn.embedding_lookup( self._word_embedding, turn_t) #[batch, max_turn_len, emb_size] if self._conf['is_positional'] and self._conf['stack_num'] > 0: with tf.variable_scope('positional', reuse=True): Hu = op.positional_encoding_vector(Hu, max_timescale=10) Hu_stack = [Hu] for index in range(self._conf['stack_num']): with tf.variable_scope('self_stack_' + str(index), reuse=True): Hu = layers.block(Hu, Hu, Hu, Q_lengths=t_turn_length, K_lengths=t_turn_length) Hu_stack.append(Hu) r_a_t_stack = [] t_a_r_stack = [] for index in range(self._conf['stack_num'] + 1): with tf.variable_scope('t_attend_r_' + str(index)): try: t_a_r = layers.block(tf.add( Hu_stack[index], best_information), Hr_stack[index], Hr_stack[index], Q_lengths=t_turn_length, K_lengths=self.response_len) except ValueError: tf.get_variable_scope().reuse_variables() t_a_r = layers.block(tf.add( Hu_stack[index], best_information), Hr_stack[index], Hr_stack[index], Q_lengths=t_turn_length, K_lengths=self.response_len) with tf.variable_scope('r_attend_t_' + str(index)): try: r_a_t = layers.block( Hr_stack[index], tf.add(Hu_stack[index], best_information), tf.add(Hu_stack[index], best_information), Q_lengths=self.response_len, K_lengths=t_turn_length) except ValueError: tf.get_variable_scope().reuse_variables() r_a_t = layers.block( Hr_stack[index], tf.add(Hu_stack[index], best_information), tf.add(Hu_stack[index], best_information), Q_lengths=self.response_len, K_lengths=t_turn_length) t_a_r_stack.append(t_a_r) r_a_t_stack.append(r_a_t) t_a_r_stack.extend(Hu_stack) r_a_t_stack.extend(Hr_stack) t_a_r = tf.stack(t_a_r_stack, axis=-1) r_a_t = tf.stack(r_a_t_stack, axis=-1) #calculate similarity matrix with tf.variable_scope('similarity'): # sim shape [batch, max_turn_len, max_turn_len, 2*stack_num+1] # divide sqrt(200) to prevent gradient explosion sim = tf.einsum('biks,bjks->bijs', t_a_r, r_a_t) / tf.sqrt(200.0) sim_turns.append(sim) #cnn and aggregation sim = tf.stack(sim_turns, axis=1) print('sim shape: %s' % sim.shape) with tf.variable_scope('cnn_aggregation'): final_info = layers.CNN_3d(sim, 32, 16) #final_info_dim = final_info.get_shape().as_list()[-1] #for douban #final_info = layers.CNN_3d(sim, 16, 16) # _x = self._conv1d(best_information) # _x = self._pool1d(_x) #final_info = tf.concat([final_info,best_information],-1) #loss and train with tf.variable_scope('loss'): self.loss, self.logits = layers.loss(final_info, self.label) self.global_step = tf.Variable(0, trainable=False) initial_learning_rate = self._conf['learning_rate'] self.learning_rate = tf.train.exponential_decay( initial_learning_rate, global_step=self.global_step, decay_steps=400, decay_rate=0.9, staircase=True) Optimizer = tf.train.AdamOptimizer(self.learning_rate) self.optimizer = Optimizer.minimize( self.loss, global_step=self.global_step) self.init = tf.global_variables_initializer() self.saver = tf.train.Saver( max_to_keep=self._conf["max_to_keep"]) self.all_variables = tf.global_variables() self.all_operations = self._graph.get_operations() self.grads_and_vars = Optimizer.compute_gradients(self.loss) for grad, var in self.grads_and_vars: if grad is None: print(var) self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var) for grad, var in self.grads_and_vars] self.g_updates = Optimizer.apply_gradients( self.capped_gvs, global_step=self.global_step) return self._graph
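# Plain-Python sketch (illustrative numbers, not model code) of the per-variable value
# clipping applied to every gradient above with tf.clip_by_value(grad, -1, 1).
import numpy as np

grads = [np.array([-3.0, 0.2, 5.0]), np.array([0.9])]
capped = [np.clip(g, -1.0, 1.0) for g in grads]
print(capped)  # [array([-1. ,  0.2,  1. ]), array([0.9])]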
def build_graph(self): with self._graph.as_default(): if self._conf['rand_seed'] is not None: rand_seed = self._conf['rand_seed'] tf.set_random_seed(rand_seed) print('set tf random seed: %s' % self._conf['rand_seed']) # word embedding if self._word_embedding_init is not None: word_embedding_initializer = tf.constant_initializer( self._word_embedding_init) else: word_embedding_initializer = tf.random_normal_initializer( stddev=0.1) self._word_embedding = tf.get_variable( name='word_embedding', shape=[self._conf['vocab_size'] + 1, self._conf['emb_size']], dtype=tf.float32, initializer=word_embedding_initializer) # define placehloders self.turns = tf.placeholder( tf.int32, shape=[self._conf["batch_size"], self._conf["max_turn_num"], self._conf["max_turn_len"]]) self.tt_turns_len = tf.placeholder( # turn_num tf.int32, shape=[self._conf["batch_size"]]) self.every_turn_len = tf.placeholder( tf.int32, shape=[self._conf["batch_size"], self._conf["max_turn_num"]]) self.turns_intent = tf.placeholder( tf.float32, shape=[self._conf["batch_size"], self._conf["max_turn_num"], self._conf["intent_size"]]) self.response = tf.placeholder( tf.int32, shape=[self._conf["batch_size"], self._conf["max_turn_len"]]) self.response_len = tf.placeholder( tf.int32, shape=[self._conf["batch_size"]]) self.response_intent = tf.placeholder( tf.float32, shape=[self._conf["batch_size"], self._conf["intent_size"]]) self.label = tf.placeholder( tf.float32, shape=[self._conf["batch_size"]]) # define operations # response part Hr = tf.nn.embedding_lookup(self._word_embedding, self.response) # [batch_size, max_turn_len, embed_size] # print('[after embedding_lookup] Hr shape: %s' % Hr.shape) if self._conf['is_positional'] and self._conf['stack_num'] > 0: with tf.variable_scope('positional'): Hr = op.positional_encoding_vector(Hr, max_timescale=10) Hr_stack = [Hr] # 1st element of Hr_stack is the orginal embedding # lyang comments: self attention for index in range(self._conf['stack_num']): # print('[self attention for response] stack index: %d ' % index) with tf.variable_scope('self_stack_' + str(index)): # [batch, max_turn_len, emb_size] Hr = layers.block( # attentive module Hr, Hr, Hr, Q_lengths=self.response_len, K_lengths=self.response_len) # print('[after layers.block] Hr shape: %s' % Hr.shape) # Hr is still [batch_size, max_turn_len, embed_size] Hr_stack.append(Hr) # print('[after self attention of response] len(Hr_stack)', # len(Hr_stack)) # 1+stack_num # context part # a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len] list_turn_t = tf.unstack(self.turns, axis=1) list_turn_length = tf.unstack(self.every_turn_len, axis=1) list_turn_intent = tf.unstack(self.turns_intent, axis=1) sim_turns = [] attention_turns = [] # intent based attention on each turn # for every turn_t calculate matching vector turn_index = 0 for turn_t, t_turn_length, t_intent in zip(list_turn_t, list_turn_length, list_turn_intent): print('current turn_index : ', turn_index) turn_index += 1 Hu = tf.nn.embedding_lookup(self._word_embedding, turn_t) # [batch, max_turn_len, emb_size] # print('[after embedding_lookup] Hu shape: %s' % Hu.shape) if self._conf['is_positional'] and self._conf['stack_num'] > 0: with tf.variable_scope('positional', reuse=True): Hu = op.positional_encoding_vector(Hu, max_timescale=10) Hu_stack = [Hu] # 1st element of Hu_stack is the orginal embedding # lyang comments: self attention for index in range(self._conf['stack_num']): # print('[self attention for context turn] stack index: %d ' % 
index) with tf.variable_scope('self_stack_' + str(index), reuse=True): # [batch, max_turn_len, emb_size] Hu = layers.block( # attentive module Hu, Hu, Hu, Q_lengths=t_turn_length, K_lengths=t_turn_length) # print('[after layers.block] Hu shape: %s' % Hu.shape) Hu_stack.append(Hu) # print('[after self attention of context turn] len(Hu_stack)', # len(Hu_stack)) # 1+stack_num # lyang comments: cross attention # print('[cross attention ...]') r_a_t_stack = [] t_a_r_stack = [] # cross attention for index in range(self._conf['stack_num'] + 1): # print('[cross attention] stack index = ', index) with tf.variable_scope('t_attend_r_' + str(index)): try: # [batch, max_turn_len, emb_size] t_a_r = layers.block( # attentive module Hu_stack[index], Hr_stack[index], Hr_stack[index], Q_lengths=t_turn_length, K_lengths=self.response_len) except ValueError: tf.get_variable_scope().reuse_variables() t_a_r = layers.block( # [batch, max_turn_len, emb_size] Hu_stack[index], Hr_stack[index], Hr_stack[index], Q_lengths=t_turn_length, K_lengths=self.response_len) # print('[cross attention t_attend_r_] stack index: %d, t_a_r.shape: %s' % ( # index, t_a_r.shape)) with tf.variable_scope('r_attend_t_' + str(index)): try: # [batch, max_turn_len, emb_size] r_a_t = layers.block( # attentive module Hr_stack[index], Hu_stack[index], Hu_stack[index], Q_lengths=self.response_len, K_lengths=t_turn_length) except ValueError: tf.get_variable_scope().reuse_variables() r_a_t = layers.block( Hr_stack[index], Hu_stack[index], Hu_stack[index], Q_lengths=self.response_len, K_lengths=t_turn_length) # print('[cross attention r_a_t_] stack index: %d, r_a_t.shape: %s' % ( # index, r_a_t.shape)) t_a_r_stack.append(t_a_r) r_a_t_stack.append(r_a_t) # print('[cross attention] len(t_a_r_stack):', len(t_a_r_stack)) # print('[cross attention] len(r_a_t_stack):', len(r_a_t_stack)) # print('[before extend] len(t_a_r_stack):', len(t_a_r_stack)) # print('[before extend] len(r_a_t_stack):', len(r_a_t_stack)) # lyang comments: 3D aggregation t_a_r_stack.extend( Hu_stack) # half from self-attention; half from cross-attention r_a_t_stack.extend( Hr_stack) # half from self-attention; half from cross-attention # after extend, len(t_a_r_stack)) = 2*(stack_num+1) # print('[after extend] len(t_a_r_stack):', len(t_a_r_stack)) # print('[after extend] len(r_a_t_stack):', len(r_a_t_stack)) t_a_r = tf.stack(t_a_r_stack, axis=-1) r_a_t = tf.stack(r_a_t_stack, axis=-1) # print('after stack along the last dimension: ') # print('t_a_r shape: %s' % t_a_r.shape) # print('r_a_t shape: %s' % r_a_t.shape) # after stack, t_a_r and r_a_t are (batch, max_turn_len, embed_size, 2*(stack_num+1)) with tf.variable_scope('intent_based_attention', reuse=tf.AUTO_REUSE): # share parameter across different turns # there are 3 different ways to implement intent based attention # implement these three different variations and compare the # effectiveness as model abalation analysis # let I_u_t and I_r_k are intent vector in [12,1] # 1. dot: w * [I_u_t, I_r_k], where w is [24,1] # 2. biliear: I_u_t' * w * I_r_k, where w is [12,12] # 3. 
outprod: I_u_t * I_r_k' -> [12,12] out product -> # flaten to [144,1] outprod -> w*outprod # where w is [1,144] attention_logits = layers.attention_intent(t_intent, self.response_intent, self._conf['intent_attention_type']) # print('[intent_based_attention] attention_logits.shape: %s' % attention_logits.shape) attention_turns.append(attention_logits) # calculate similarity matrix with tf.variable_scope('similarity'): # sim shape [batch, max_turn_len, max_turn_len, 2*(stack_num+1)] # divide sqrt(200) to prevent gradient explosion # A_biks * B_bjks -> C_bijs sim = tf.einsum('biks,bjks->bijs', t_a_r, r_a_t) / tf.sqrt( 200.0) # (batch, max_turn_len, embed_size, 2*(stack_num+1)) * # (batch, max_turn_len, embed_size, 2*(stack_num+1)) -> # [batch, max_turn_len, max_turn_len, 2*(stack_num+1)] # where k is corresponding to the dimension of embed_size, # which can be eliminated by dot product with einsum # print('[similarity] after einsum dot prod sim shape: %s' % sim.shape) # [batch, max_turn_len, max_turn_len, 2*(stack_num+1)] # ! Here we multipy sim by intent based attention weights before # append sim into sim_turns in order to generate the weighted # stack in the next step sim_turns.append(sim) # print('[similarity] after append, len(sim_turns):', len(sim_turns)) attention_logits = tf.stack(attention_turns, axis=1) # [batch, max_turn_num] print('[attention_logits] after stack attention_logits.shape: %s' % attention_logits.shape) # add mask in attention following the way in BERT # real turn_num is in self.tt_turns_len [batch] # return a mask tensor with shape [batch, conf['max_turn_num']] attention_mask = tf.sequence_mask(self.tt_turns_len, self._conf['max_turn_num'], dtype=tf.float32) print('[attention_mask] attention_mask.shape: %s' % attention_mask.shape) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. adder = (1.0 - attention_mask) * -10000.0 # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. 
        attention_logits += adder
        attention = tf.nn.softmax(attention_logits)
        # by default softmax along dim=-1 [batch, max_turn_num]
        print('[attention] attention.shape: %s' % attention.shape)
        self.attention = attention  # will print it for visualization

        # cnn and aggregation
        # lyang comments: aggregation by 3D CNN layer
        # [3d cnn aggregation] sim shape: (32, 9, 180, 180, 10)
        # conv_0 shape: (32, 9, 180, 180, 16)
        # pooling_0 shape: (32, 3, 60, 60, 16)
        # conv_1 shape: (32, 3, 60, 60, 16)
        # pooling_1 shape: (32, 1, 20, 20, 16)
        # [3d cnn aggregation] final_info: (32, 6400)  # [batch * feature_size]
        # [batch, max_turn_num, max_turn_len, max_turn_len, 2*(stack_num+1)]
        # (32, 9, 180, 180, 10)
        sim = tf.stack(sim_turns, axis=1)
        # multiply sim by attention score
        sim = tf.einsum('bijks,bi->bijks', sim, attention)
        print('[3d cnn aggregation] sim shape: %s' % sim.shape)
        with tf.variable_scope('cnn_aggregation'):
            final_info = layers.CNN_3d(sim, self._conf['cnn_3d_oc0'],
                                       self._conf['cnn_3d_oc1'])
            # for udc
            # final_info = layers.CNN_3d(sim, 32, 16)
            # for douban
            # final_info = layers.CNN_3d(sim, 16, 16)
        print('[3d cnn aggregation] final_info: %s' % final_info.shape)

        # loss and train
        with tf.variable_scope('loss'):
            self.loss, self.logits = layers.loss(final_info, self.label)
            self.global_step = tf.Variable(0, trainable=False)
            initial_learning_rate = self._conf['learning_rate']
            self.learning_rate = tf.train.exponential_decay(
                initial_learning_rate,
                global_step=self.global_step,
                decay_steps=400,
                decay_rate=0.9,
                staircase=True)

            Optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.optimizer = Optimizer.minimize(
                self.loss, global_step=self.global_step)

            self.init = tf.global_variables_initializer()
            self.saver = tf.train.Saver(
                max_to_keep=self._conf["max_to_keep"])
            self.all_variables = tf.global_variables()
            self.all_operations = self._graph.get_operations()

            self.grads_and_vars = Optimizer.compute_gradients(self.loss)
            for grad, var in self.grads_and_vars:
                if grad is None:
                    print(var)

            self.capped_gvs = [(tf.clip_by_value(grad, -1, 1), var)
                               for grad, var in self.grads_and_vars]
            self.g_updates = Optimizer.apply_gradients(
                self.capped_gvs, global_step=self.global_step)

        return self._graph
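# NumPy sketch (illustrative lengths, not model code) of the additive turn mask used
# above: padded turns get -10000 added to their logits so the softmax assigns them
# a weight close to zero, following the BERT-style masking described in the comments.
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

max_turn_num = 4
tt_turns_len = np.array([2, 4])                      # real turn counts per example
mask = (np.arange(max_turn_num)[None, :] < tt_turns_len[:, None]).astype(np.float32)
logits = np.random.randn(2, max_turn_num)
attention = softmax(logits + (1.0 - mask) * -10000.0)
print(attention)  # rows sum to 1; masked (padded) turns get ~0 weight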
def self_cross_attention_block(config, Hu, every_turn_len, Hr, response_len): """ :param config: :param Hu: shape = (batch_size, max_turn_num, sentence_len, emb_size) :param every_turn_len: shape = (batch_size, max_turn_num ) :param Hr: shape = (batch_size, sentence_len, emb_size) :param response_len: shape = (batch_size) :return: """ if config['is_positional'] and config['stack_num'] > 0: with tf.variable_scope('positional', reuse=tf.AUTO_REUSE): Hr = op.positional_encoding_vector(Hr, max_timescale=10) Hr_stack = [Hr] for index in range(config['stack_num']): with tf.variable_scope('self_stack_' + str(index), reuse=tf.AUTO_REUSE): # Hr.shape = (batch_size, max_turn_len, emb_size) Hr = layers.block(Hr, Hr, Hr, Q_lengths=response_len, K_lengths=response_len) Hr_stack.append(Hr) # context part # a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len, emb_size] list_turn_t = tf.unstack(Hu, axis=1) list_turn_length = tf.unstack(every_turn_len, axis=1) sim_turns = [] # for every Hu calculate matching vector for Hu, t_turn_length in zip(list_turn_t, list_turn_length): if config['is_positional'] and config['stack_num'] > 0: with tf.variable_scope('positional', reuse=tf.AUTO_REUSE): Hu = op.positional_encoding_vector(Hu, max_timescale=10) Hu_stack = [Hu] for index in range(config['stack_num']): with tf.variable_scope('self_stack_' + str(index), reuse=tf.AUTO_REUSE): Hu = layers.block(Hu, Hu, Hu, Q_lengths=t_turn_length, K_lengths=t_turn_length) Hu_stack.append(Hu) r_a_t_stack = [] t_a_r_stack = [] for index in range(config['stack_num'] + 1): with tf.variable_scope('t_attend_r_' + str(index), reuse=tf.AUTO_REUSE): try: t_a_r = layers.block(Hu_stack[index], Hr_stack[index], Hr_stack[index], Q_lengths=t_turn_length, K_lengths=response_len) except ValueError: tf.get_variable_scope().reuse_variables() t_a_r = layers.block(Hu_stack[index], Hr_stack[index], Hr_stack[index], Q_lengths=t_turn_length, K_lengths=response_len) with tf.variable_scope('r_attend_t_' + str(index), reuse=tf.AUTO_REUSE): try: r_a_t = layers.block(Hr_stack[index], Hu_stack[index], Hu_stack[index], Q_lengths=response_len, K_lengths=t_turn_length) except ValueError: tf.get_variable_scope().reuse_variables() r_a_t = layers.block(Hr_stack[index], Hu_stack[index], Hu_stack[index], Q_lengths=response_len, K_lengths=t_turn_length) t_a_r_stack.append(t_a_r) r_a_t_stack.append(r_a_t) t_a_r_stack.extend(Hu_stack) r_a_t_stack.extend(Hr_stack) t_a_r = tf.stack(t_a_r_stack, axis=-1) r_a_t = tf.stack(r_a_t_stack, axis=-1) # calculate similarity matrix with tf.variable_scope('similarity', reuse=tf.AUTO_REUSE): # sim shape [batch, max_turn_len, max_turn_len, 2*stack_num+1] # divide sqrt(200) to prevent gradient explosion sim = tf.einsum('biks,bjks->bijs', t_a_r, r_a_t) / tf.sqrt(200.0) sim_turns.append(sim) # cnn and aggregation sim = tf.stack(sim_turns, axis=1) print('sim shape: %s' % sim.shape) with tf.variable_scope('cnn_aggregation', reuse=tf.AUTO_REUSE): final_info = layers.CNN_3d(sim, 32, 16) with tf.variable_scope('linear', reuse=tf.AUTO_REUSE): W = tf.get_variable(name='weights', shape=[final_info.shape[-1], 1], initializer=tf.orthogonal_initializer()) bias = tf.get_variable(name='bias', shape=[1], initializer=tf.zeros_initializer()) logits = tf.reshape(tf.matmul(final_info, W) + bias, [-1]) return logits
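# NumPy sketch (dimensions are illustrative) of the final linear scorer at the end of
# self_cross_attention_block above: final_info [batch, feat] times a weight vector
# [feat, 1] plus a bias, reshaped to a flat [batch] vector of logits.
import numpy as np

batch, feat = 4, 6400
final_info_demo = np.random.randn(batch, feat)
W = np.random.randn(feat, 1)
bias = np.zeros(1)
logits_demo = (final_info_demo @ W + bias).reshape(-1)
print(logits_demo.shape)  # (4,)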
def create_network(self): mask_cache = dict() if self.use_mask_cache else None response_emb = fluid.layers.embedding( input=self.response, size=[self._vocab_size + 1, self._emb_size], is_sparse=self.use_sparse_embedding, param_attr=fluid.ParamAttr( name=self.word_emb_name, initializer=fluid.initializer.Normal(scale=0.1))) # response part Hr = response_emb Hr_stack = [Hr] for index in six.moves.xrange(self._stack_num): Hr = layers.block( name="response_self_stack" + str(index), query=Hr, key=Hr, value=Hr, d_key=self._emb_size, q_mask=self.response_mask, k_mask=self.response_mask, mask_cache=mask_cache) Hr_stack.append(Hr) # context part sim_turns = [] for t in six.moves.xrange(self._max_turn_num): Hu = fluid.layers.embedding( input=self.turns_data[t], size=[self._vocab_size + 1, self._emb_size], is_sparse=self.use_sparse_embedding, param_attr=fluid.ParamAttr( name=self.word_emb_name, initializer=fluid.initializer.Normal(scale=0.1))) Hu_stack = [Hu] for index in six.moves.xrange(self._stack_num): # share parameters Hu = layers.block( name="turn_self_stack" + str(index), query=Hu, key=Hu, value=Hu, d_key=self._emb_size, q_mask=self.turns_mask[t], k_mask=self.turns_mask[t], mask_cache=mask_cache) Hu_stack.append(Hu) # cross attention r_a_t_stack = [] t_a_r_stack = [] for index in six.moves.xrange(self._stack_num + 1): t_a_r = layers.block( name="t_attend_r_" + str(index), query=Hu_stack[index], key=Hr_stack[index], value=Hr_stack[index], d_key=self._emb_size, q_mask=self.turns_mask[t], k_mask=self.response_mask, mask_cache=mask_cache) r_a_t = layers.block( name="r_attend_t_" + str(index), query=Hr_stack[index], key=Hu_stack[index], value=Hu_stack[index], d_key=self._emb_size, q_mask=self.response_mask, k_mask=self.turns_mask[t], mask_cache=mask_cache) t_a_r_stack.append(t_a_r) r_a_t_stack.append(r_a_t) t_a_r_stack.extend(Hu_stack) r_a_t_stack.extend(Hr_stack) if self.use_stack_op: t_a_r = fluid.layers.stack(t_a_r_stack, axis=1) r_a_t = fluid.layers.stack(r_a_t_stack, axis=1) else: for index in six.moves.xrange(len(t_a_r_stack)): t_a_r_stack[index] = fluid.layers.unsqueeze( input=t_a_r_stack[index], axes=[1]) r_a_t_stack[index] = fluid.layers.unsqueeze( input=r_a_t_stack[index], axes=[1]) t_a_r = fluid.layers.concat(input=t_a_r_stack, axis=1) r_a_t = fluid.layers.concat(input=r_a_t_stack, axis=1) # sim shape: [batch_size, 2*(stack_num+1), max_turn_len, max_turn_len] sim = fluid.layers.matmul( x=t_a_r, y=r_a_t, transpose_y=True, alpha=1 / np.sqrt(200.0)) sim_turns.append(sim) if self.use_stack_op: sim = fluid.layers.stack(sim_turns, axis=2) else: for index in six.moves.xrange(len(sim_turns)): sim_turns[index] = fluid.layers.unsqueeze( input=sim_turns[index], axes=[2]) # sim shape: [batch_size, 2*(stack_num+1), max_turn_num, max_turn_len, max_turn_len] sim = fluid.layers.concat(input=sim_turns, axis=2) final_info = layers.cnn_3d(sim, self._channel1_num, self._channel2_num) loss, logits = layers.loss(final_info, self.label) return loss, logits
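# NumPy sketch showing that the Paddle similarity above (matmul with transpose_y=True and
# alpha=1/sqrt(200)) is the same contraction as the TF einsum('biks,bjks->bijs') version,
# just with the stack dimension placed at axis 1 instead of the last axis. Shapes are
# illustrative, not taken from the config.
import numpy as np

batch, stacks, length, emb = 2, 4, 5, 200
t_a_r = np.random.randn(batch, stacks, length, emb)
r_a_t = np.random.randn(batch, stacks, length, emb)

sim_paddle = np.matmul(t_a_r, r_a_t.transpose(0, 1, 3, 2)) / np.sqrt(200.0)  # [batch, stacks, len, len]
sim_tf = np.einsum('biks,bjks->bijs',
                   t_a_r.transpose(0, 2, 3, 1),
                   r_a_t.transpose(0, 2, 3, 1)) / np.sqrt(200.0)             # [batch, len, len, stacks]
print(np.allclose(sim_paddle, sim_tf.transpose(0, 3, 1, 2)))  # True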
def cc_model(input_x, input_x_mask, input_x_len, input_x2, input_x_mask2, input_x_len2, word_emb, conf, con_c): #a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len] list_turn_t1 = tf.unstack(input_x, axis=1) list_turn_length1 = tf.unstack(input_x_len, axis=1) list_turn_length1 = [tf.sequence_mask(i, conf["max_turn_len"]) for i in list_turn_length1] list_turn_length1 = [tf.cast(i, tf.float32) for i in list_turn_length1] list_turn_t2 = tf.unstack(input_x2, axis=1) list_turn_length2 = tf.unstack(input_x_len2, axis=1) list_turn_length2 = [tf.sequence_mask(i, conf["max_turn_len"]) for i in list_turn_length2] list_turn_length2 = [tf.cast(i, tf.float32) for i in list_turn_length2] if con_c: list_turn_t1 = tf.reshape(input_x, [conf["batch_size"], conf["max_turn_num"]*conf["max_turn_len"]]) list_turn_t1 = [list_turn_t1] list_turn_t2 = tf.reshape(input_x2, [conf["batch_size"], conf["max_turn_num"]*conf["max_turn_len"]]) list_turn_t2 = [list_turn_t2] list_turn_length1 = tf.cast(tf.sequence_mask(input_x_len, conf["max_turn_len"]), tf.float32) list_turn_length1 = tf.reshape(list_turn_length1, [conf["batch_size"], conf["max_turn_num"]*conf["max_turn_len"]]) list_turn_length1 = [list_turn_length1] list_turn_length2 = tf.cast(tf.sequence_mask(input_x_len2, conf["max_turn_len"]), tf.float32) list_turn_length2 = tf.reshape(list_turn_length2, [conf["batch_size"], conf["max_turn_num"]*conf["max_turn_len"]]) list_turn_length2 = [list_turn_length2] #for every turn_t calculate matching vector trans_u1, trans_u2 = [], [] for turn_t, t_turn_length in zip(list_turn_t1, list_turn_length1): Hu = tf.nn.embedding_lookup(word_emb, turn_t) #[batch, max_turn_len, emb_size] #Hu = turn_t if conf['is_positional'] and conf['stack_num'] > 0: with tf.variable_scope('positional_', reuse=tf.AUTO_REUSE): Hu = op.positional_encoding_vector(Hu, max_timescale=10) for index in range(conf['stack_num']): with tf.variable_scope('self_stack_cc' + str(index), reuse=tf.AUTO_REUSE): Hu = layers.block( Hu, Hu, Hu, Q_lengths=t_turn_length, K_lengths=t_turn_length, input_mask=True) trans_u1.append(Hu) for turn_r, r_turn_length in zip(list_turn_t2, list_turn_length2): Hu = tf.nn.embedding_lookup(word_emb, turn_r) #[batch, max_turn_len, emb_size] #Hu = turn_r if conf['is_positional'] and conf['stack_num'] > 0: with tf.variable_scope('positional_', reuse=tf.AUTO_REUSE): Hu = op.positional_encoding_vector(Hu, max_timescale=10) for index in range(conf['stack_num']): with tf.variable_scope('self_stack_cc' + str(index), reuse=tf.AUTO_REUSE): Hu = layers.block( Hu, Hu, Hu, Q_lengths=r_turn_length, K_lengths=r_turn_length, input_mask=True) trans_u2.append(Hu) final_info_all = [] sim_turns_all = [] for t_inedx, (turn_t, t_turn_length, Hu) in enumerate(zip(list_turn_t1, list_turn_length1, trans_u1)): sim_turns = [] for r_index, (turn_r, r_turn_length, Hr) in enumerate(zip(list_turn_t2, list_turn_length2, trans_u2)): with tf.variable_scope('u_attentd_r_' + str(index)): try: u_a_r = layers.block( Hu, Hr, Hr, Q_lengths=t_turn_length, K_lengths=r_turn_length, input_mask=True) except ValueError: tf.get_variable_scope().reuse_variables() u_a_r = layers.block( Hu, Hr, Hr, Q_lengths=t_turn_length, K_lengths=r_turn_length, input_mask=True) with tf.variable_scope('r_attend_u_' + str(index)): try: r_a_u = layers.block( Hr, Hu, Hu, Q_lengths=r_turn_length, K_lengths=t_turn_length, input_mask=True) except ValueError: tf.get_variable_scope().reuse_variables() r_a_u = layers.block( Hr, Hu, Hu, Q_lengths=r_turn_length, 
K_lengths=t_turn_length, input_mask=True) # u_a_r batch_size turn emb u_a_r = tf.stack([u_a_r, Hu], axis=-1) r_a_u = tf.stack([r_a_u, Hr], axis=-1) #calculate similarity matrix with tf.variable_scope('similarity', reuse=tf.AUTO_REUSE): #sim shape [batch, max_turn_len, max_turn_len, 2*stack_num+1] #sim shape [batch, max_turn_len, max_turn_len, 2] sim = tf.einsum('biks,bjks->bijs', r_a_u, u_a_r) / tf.sqrt(200.0) sim = layers.CNN_FZX(sim) final_info_all.append(sim) att_weight_print = None if not con_c: # final_info_all final_info_all = tf.stack(final_info_all, axis=1) # 100 9 144 max_nei = 5 gcn_size = conf["max_turn_num"]*conf["max_turn_num"] turn_size = conf["max_turn_num"] m1 = [ [] for i in range(gcn_size)] m_pos = [ [] for i in range(gcn_size)] m1_len = [ 0 for i in range(gcn_size)] for i in range(turn_size): for j in range(turn_size): cur_index = i*turn_size+j m1[cur_index].append(cur_index) m_pos[cur_index].extend([i,j]) if cur_index%turn_size!=0: m1[cur_index].append(cur_index-1) m_pos[cur_index].extend([i-1,j]) if cur_index%turn_size!=turn_size-1: m1[cur_index].append(cur_index+1) m_pos[cur_index].extend([i+1,j]) if i!=0: m1[cur_index].append(cur_index-turn_size) m_pos[cur_index].extend([i,j-1]) if i!=turn_size-1: m1[cur_index].append(cur_index+turn_size) m_pos[cur_index].extend([i,j+1]) m1_len[cur_index] = len(m1[cur_index]) if m1_len[cur_index]<max_nei: m1[cur_index].extend([cur_index for k in range(max_nei-m1_len[cur_index])]) for k in range(max_nei-m1_len[cur_index]): m_pos[cur_index].extend([i,j]) # m1 25 5 # m1_len 25 m1 = tf.constant(m1, dtype=tf.int32) # 25 5 m1_len = tf.constant(m1_len, dtype=tf.int32) m_pos = tf.constant(m_pos, dtype=tf.int32) def gan(input_m, adjm, adjm_len, adjm_pos, gcn_size, turn_size, max_nei): #return input_m batch_size_gnn = tf.shape(input_m)[0] mask_value = tf.cast(tf.sequence_mask(adjm_len, max_nei), tf.float32) # 25 5 res_all = [] for gan_index in range(4): with tf.variable_scope('gan_layer'+str(gan_index), reuse=tf.AUTO_REUSE): role_emb1 = tf.get_variable(name="gnn_role_emb1", shape=[turn_size, conf["role_dim"]], dtype=tf.float32, initializer=tf.random_normal_initializer(mean=0, stddev=1)) role_emb2 = tf.get_variable(name="gnn_role_emb2", shape=[turn_size, conf["role_dim"]], dtype=tf.float32, initializer=tf.random_normal_initializer(mean=0, stddev=1)) input_m_exp = tf.expand_dims(input_m, axis=2) # bs 25 1 144 input_m_exp = tf.tile(input_m_exp, [1, 1, max_nei, 1]) # bs 25 5 144 nei_rep = tf.gather(input_m, adjm, axis=1) # bs 25*5 144 nei_rep = tf.reshape(nei_rep, [tf.shape(input_m)[0], gcn_size, max_nei, -1]) # bs 25 5 144 att1 = tf.layers.dense(nei_rep, 128, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="gcn") # bs 25 5 128 att2 = tf.layers.dense(input_m_exp, 128, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="gcn") # bs 25 5 128 pos_index11 = tf.gather(adjm_pos, [0,], axis=1) pos_index12 = tf.gather(adjm_pos, [1,], axis=1) pos_index11 = tf.tile(pos_index11, [1, max_nei]) pos_index12 = tf.tile(pos_index12, [1, max_nei]) pos_index21 = tf.gather(adjm_pos, [0,2,4,6,8], axis=1) pos_index22 = tf.gather(adjm_pos, [1,3,5,7,9], axis=1) pos_index11 = tf.gather(role_emb1, pos_index11) # 25 5 30 pos_index12 = tf.gather(role_emb2, pos_index12) # 25 5 30 pos_index21 = tf.gather(role_emb1, pos_index21) # 25 5 30 pos_index22 = tf.gather(role_emb2, pos_index22) # 25 5 30 pos_index11 = tf.tile(tf.expand_dims(pos_index11, axis=0), [batch_size_gnn,1,1,1]) pos_index12 = tf.tile(tf.expand_dims(pos_index12, axis=0), 
[batch_size_gnn,1,1,1]) pos_index21 = tf.tile(tf.expand_dims(pos_index21, axis=0), [batch_size_gnn,1,1,1]) pos_index22 = tf.tile(tf.expand_dims(pos_index22, axis=0), [batch_size_gnn,1,1,1]) att = tf.concat([att1, att2], axis=-1) #att = tf.concat([att1, att2, pos_index11, pos_index12, pos_index21, pos_index22], axis=-1) att = tf.layers.dense(att, 1, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="gcna") # bs 25 5 128 att = tf.reshape(att, [-1, gcn_size, max_nei]) att = tf.nn.leaky_relu(att) # bs 25 5 att = att * tf.expand_dims(mask_value, axis=0) att = tf.nn.softmax(att, axis=2) # bs 25 5 att = att * tf.expand_dims(mask_value, axis=0) nei_rep2 = tf.layers.dense(nei_rep, 128, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="gcnl") # bs 25 5 128 nei_rep11 = tf.layers.dense(input_m, 128, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="gcnl") # bs 25 5 128 nei_rep2 = nei_rep2 * tf.expand_dims(tf.expand_dims(mask_value, axis=0), axis=-1) res = tf.einsum('bdik,bdi->bdk', nei_rep2, att) # bs 25 128 att_input = res+nei_rep11 att_out = tf.layers.dense(att_input, 1, kernel_initializer=tf.contrib.layers.xavier_initializer(), name="att"+str(i)) att_out = tf.nn.sigmoid(att_out) print_weight = att_out # att_out not used res = res + nei_rep11 input_m = res res_all.append(res) res_all = tf.concat(res_all, axis=-1) return res_all, print_weight gan_res, att_weight_print = gan(final_info_all, m1, m1_len, m_pos, gcn_size, turn_size, max_nei) final_info_all = gan_res final_info_role = [] role_emb1 = tf.get_variable(name="role_emb1", shape=[len(list_turn_t1), conf["role_dim"]], dtype=tf.float32, initializer=tf.random_normal_initializer(mean=0, stddev=1)) role_emb2 = tf.get_variable(name="role_emb2", shape=[len(list_turn_t2), conf["role_dim"]], dtype=tf.float32, initializer=tf.random_normal_initializer(mean=0, stddev=1)) for i, ii in enumerate(list_turn_t1): for j, jj in enumerate(list_turn_t2): role_con = tf.concat([role_emb1[i], role_emb2[j]], axis=0) final_info_role.append(role_con) final_info_role = tf.stack(final_info_role, axis=0) # 9 50 final_info_role = tf.expand_dims(final_info_role, 0) # 1 9 50 final_info_role = tf.tile(final_info_role, [tf.shape(final_info_all)[0], 1, 1], name="role_con") final_info_all_att = tf.concat([final_info_role, final_info_all], axis=2) final_info_all_att = tf.reshape(final_info_all_att, [-1, final_info_all_att.get_shape()[-1]]) # bs*9 144 final_info_all_att = tf.layers.dense(final_info_all_att, 1, kernel_initializer=tf.contrib.layers.xavier_initializer()) final_info_all_att = tf.squeeze(final_info_all_att, [1]) final_info_all_att = tf.reshape(final_info_all_att, [-1, final_info_all.get_shape()[1]]) # 100 9 final_info_all_att = tf.nn.softmax(final_info_all_att, axis=1) final_info_all_att = tf.expand_dims(final_info_all_att, -1) final_info_all_max = tf.reduce_max(final_info_all, axis=1) final_info_all_mean = tf.reduce_mean(final_info_all, axis=1) final_info_all = final_info_all * final_info_all_att final_info_all = tf.reduce_sum(final_info_all, axis=1) final_info_all = tf.concat([final_info_all_mean, final_info_all_max, final_info_all], axis=1) else: final_info_all = final_info_all[0] return final_info_all, att_weight_print
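# NumPy sketch (shapes are illustrative) of the neighbour aggregation inside the gan()
# helper above: every node of the turn-pair grid gathers its max_nei neighbours, scores
# them, and sums them with softmax weights, matching einsum('bdik,bdi->bdk', nei_rep, att).
# A random adjacency stands in for the grid adjacency built in cc_model.
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

bs, n_nodes, max_nei, dim = 2, 9, 5, 16
node_rep = np.random.randn(bs, n_nodes, dim)
adjm = np.random.randint(0, n_nodes, size=(n_nodes, max_nei))    # neighbour indices per node

nei_rep = node_rep[:, adjm, :]                      # [bs, n_nodes, max_nei, dim], like tf.gather(..., axis=1)
att = softmax(np.random.randn(bs, n_nodes, max_nei), axis=2)     # attention weights over neighbours
agg = np.einsum('bdik,bdi->bdk', nei_rep, att)      # [bs, n_nodes, dim]
print(agg.shape)  # (2, 9, 16)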
def cs_model(input_x, input_x_mask, input_x_len, input_x2, input_x_mask2, input_x_len2, input_x3, input_x_mask3, input_x_len3, word_emb, conf, initializer_opt=None): turn_num1 = input_x.shape[1] #conf["max_turn_num"] # tf.shape(input_x)[1] turn_num2 = input_x2.shape[ 1] # conf["max_turn_num"] #tf.shape(input_x2)[1] turn_num3 = input_x3.shape[ 1] # conf["max_turn_num"] #tf.shape(input_x2)[1] sent_len = conf["max_turn_len"] emb_dim = conf["emb_size"] matchin_include_x = conf["matchin_include_x"] data_type = [] if "history" in conf["cs_type"]: data_type.append("history") if "future" in conf["cs_type"]: data_type.append("future") merge_hf = False # EMB x_e = tf.nn.embedding_lookup(word_emb, input_x) x2_e = tf.nn.embedding_lookup(word_emb, input_x2) x3_e = tf.nn.embedding_lookup(word_emb, input_x3) x_e_mb = tf.reshape(x_e, [-1, sent_len, emb_dim]) x2_e_mb = tf.reshape(x2_e, [-1, sent_len, emb_dim]) x3_e_mb = tf.reshape(x3_e, [-1, sent_len, emb_dim]) x_len_mb = tf.reshape(input_x_len, [-1]) x_len2_mb = tf.reshape(input_x_len2, [-1]) x_len3_mb = tf.reshape(input_x_len3, [-1]) x_mask = tf.to_float(input_x_mask) # bs turn_num1 sent_len x2_mask = tf.to_float(input_x_mask2) # bs turn_num2 sent_len x3_mask = tf.to_float(input_x_mask3) # bs turn_num3 sent_len # ==================================== Encoder Layer ============================= with tf.variable_scope("Encode", reuse=tf.AUTO_REUSE): with tf.variable_scope('enc_self_att', reuse=tf.AUTO_REUSE): x_enc_mb = layers.block( x_e_mb, x_e_mb, x_e_mb, Q_lengths=x_len_mb, K_lengths=x_len_mb) # bs*turn_num1 sent_len emb x2_enc_mb = layers.block( x2_e_mb, x2_e_mb, x2_e_mb, Q_lengths=x_len2_mb, K_lengths=x_len2_mb) # bs*turn_num2 sent_len emb x3_enc_mb = layers.block( x3_e_mb, x3_e_mb, x3_e_mb, Q_lengths=x_len3_mb, K_lengths=x_len3_mb) # bs*turn_num2 sent_len emb x_enc = tf.reshape( x_enc_mb, [-1, turn_num1, sent_len, emb_dim]) # bs turn_num1 sent_len emb x_mask_flat = tf.reshape( x_mask, [-1, turn_num1 * sent_len]) # bs turn_num1*sent_len x_enc_ts = tf.reshape(x_enc, [-1, turn_num1 * sent_len, emb_dim]) x_mask_ts = tf.reshape(x_mask_flat, [-1, turn_num1 * sent_len]) iter_rep = [] input_all_dict = {"history": [], "future": []} input_all_dict["history"] = [ x2_e_mb, x2_enc_mb, x_len2_mb, turn_num2, x2_mask ] input_all_dict["future"] = [ x3_e_mb, x3_enc_mb, x_len3_mb, turn_num3, x3_mask ] save_dynamic_dict = {} all_mem_weight_dict = {} sim_ori_all = [] for match_type in data_type: x5_e_mb, x5_enc_mb, x_len5_mb, turn_num5, x5_mask = input_all_dict[ match_type] scope_name = "Model" with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE): x5_enc = tf.reshape(x5_enc_mb, [-1, turn_num5, sent_len, emb_dim ]) # bs turn_num1 sent_len emb x5_enc_ts = tf.reshape(x5_enc_mb, [-1, turn_num5 * sent_len, emb_dim]) x5_mask_ts = tf.reshape(x5_mask, [-1, turn_num5 * sent_len]) # ==================================== Static Memory Layer ============================= if conf["use_static_memory"]: with tf.variable_scope("static_memory", reuse=tf.AUTO_REUSE): if merge_hf: x_enc_ts = tf.reshape( x_enc, [-1, turn_num1 * sent_len, emb_dim]) x_mask_ts = tf.reshape(x_mask_flat, [-1, turn_num1 * sent_len]) iter_rep_per1, iter_rep_per5 = match_layer( x5_enc_ts, x5_mask_ts, x_enc_ts, x_mask_ts, emb_dim, initializer_opt, turn_num5=None) iter_rep_per_out = tf.concat( [iter_rep_per1, iter_rep_per5], axis=1) print(iter_rep_per_out.shape) iter_rep.append(iter_rep_per_out) else: x5_enc_mb = x5_enc_mb # bs*turn_num2 sent_len emb_dim x5_mask_mb = tf.reshape(x5_mask, [-1, sent_len]) x_enc_mb2_ts 
= tf.reshape( tf.tile(tf.expand_dims(x_enc, axis=1), [1, turn_num5, 1, 1, 1]), [-1, turn_num1 * sent_len, emb_dim ]) # bs*turn_num2 sent_len*turn_num1 emb_dim x_mask_mb2_ts = tf.reshape( tf.tile(tf.expand_dims(x_mask_flat, axis=1), [1, turn_num5, 1]), [-1, turn_num1 * sent_len]) x5_enc_ts = tf.reshape( x5_enc_mb, [-1, turn_num5 * sent_len, emb_dim]) x5_e_ts = tf.reshape( x5_e_mb, [-1, turn_num5 * sent_len, emb_dim]) x5_mask_ts = tf.reshape(x5_mask, [-1, turn_num5 * sent_len]) x_e_ts = tf.reshape( x_e, [-1, turn_num1 * sent_len, emb_dim]) iter_rep_per_out, sim_ori = match_layer_selfatt( x5_e_ts, x5_enc_ts, x5_mask_ts, x_e_ts, x_enc_ts, x_mask_ts, emb_dim, initializer_opt, turn_num5=None, matchin_include_x=matchin_include_x) sim_ori_all.append(sim_ori) iter_rep.append(iter_rep_per_out) # ==================================== Dynamic Memory Layer Local ============================= if conf["use_dynamic_memory"]: with tf.variable_scope("dynamic_memory", reuse=tf.AUTO_REUSE): x5_enc_list = tf.unstack( x5_enc, axis=1) # bs [turn_num2] sent_len emb_dim x5_mask_list = tf.unstack( x5_mask, axis=1) # bs [turn_num2] sent_len x_enc_list = tf.unstack(x_enc, axis=1) x_mask_list = tf.unstack(x_mask, axis=1) x_enc_cur_list, x_mask_cur_list, _, _, _, _, all_mem_weight = mem_all_update( x_enc_list, x_mask_list, initializer_opt, need_reverse=True) all_mem_weight = tf.stack(all_mem_weight, axis=1) all_mem_weight_dict[match_type + "_query"] = all_mem_weight x_enc_mb2_ts = tf.reshape( x_enc_cur_list, [-1, turn_num1 * sent_len, emb_dim]) x_mask_mb2_ts = tf.reshape(x_mask_cur_list, [-1, turn_num1 * sent_len]) if match_type == "history": need_reverse = True else: need_reverse = False x5_enc_cur_list, x5_mask_cur_list, x5_enc_list, x5_mask_list, x5_enc_cur_last, x5_mask_cur_last, all_mem_weight = mem_all_update( x5_enc_list, x5_mask_list, initializer_opt, need_reverse=need_reverse) all_mem_weight = tf.stack(all_mem_weight, axis=1) all_mem_weight_dict[match_type] = all_mem_weight turn_xishu = turn_num5 # else: turn_xishu=1 x5_enc_cur_list = tf.reshape( x5_enc_cur_list, [-1, turn_xishu * sent_len, emb_dim]) x5_mask_cur_list = tf.reshape(x5_mask_cur_list, [-1, turn_xishu * sent_len]) save_dynamic_dict[match_type] = [ x5_e_mb, x5_enc_cur_list, x5_mask_cur_list, x_enc_mb2_ts, x_mask_mb2_ts, x5_enc_list, x5_mask_list, turn_num5, x5_enc_ts, x5_mask_ts, x5_enc_cur_last, x5_mask_cur_last ] # ==================================== Dynamic Memory Layer Global ============================= data_type_dm2 = copy.deepcopy(data_type) # data_type_dm2 = [] if conf["use_dynamic_memory"] and conf["dynamic_memory_global"]: for match_type in data_type: scope_name = "Model" with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE): with tf.variable_scope("dynamic_memory_part2", reuse=tf.AUTO_REUSE): x5_e_mb, x5_enc_cur_list, x5_mask_cur_list, x_enc_mb2_ts, x_mask_mb2_ts, x5_enc_list, x5_mask_list, turn_num5, x5_enc_ts, x5_mask_ts, x5_enc_cur_last, x5_mask_cur_last = save_dynamic_dict[ match_type] h1 = save_dynamic_dict["history"][-4] h2 = save_dynamic_dict["history"][-3] hq1 = x_enc_ts hq2 = x_mask_ts f1 = save_dynamic_dict["future"][-4] f2 = save_dynamic_dict["future"][-3] g_last_his = [h1, h2] g_last_fut = [f1, f2] g_last_fut = None g1 = tf.concat([h1, f1, hq1], axis=1) g2 = tf.concat([h2, f2, hq2], axis=1) g_last_his = [g1, g2] x5_mask_cur_last, x5_enc_cur_last = None, None x_enc_cur_list, x_mask_cur_list, _, _, _, _, all_mem_weight = mem_all_update( x_enc_list, x_mask_list, initializer_opt, need_reverse=True, g_last_his=g_last_his, 
g_last_fut=g_last_fut) x5_enc_cur_list, x5_mask_cur_list, x5_enc_list, x5_mask_list, x5_enc_cur_last, x5_mask_cur_last, all_mem_weight = mem_all_update( x5_enc_list, x5_mask_list, initializer_opt, x_enc_cur_last=x5_enc_cur_last, x_mask_cur_last=x5_mask_cur_last, g_last_his=g_last_his, g_last_fut=g_last_fut) x_enc_mb2_ts = tf.reshape( x_enc_cur_list, [-1, turn_num1 * sent_len, emb_dim]) x_mask_mb2_ts = tf.reshape(x_mask_cur_list, [-1, turn_num1 * sent_len]) turn_xishu = turn_num5 x5_enc_cur_list = tf.reshape( x5_enc_cur_list, [-1, turn_xishu * sent_len, emb_dim]) x5_mask_cur_list = tf.reshape(x5_mask_cur_list, [-1, turn_xishu * sent_len]) save_dynamic_dict[match_type + "_global"] = [ x5_e_mb, x5_enc_cur_list, x5_mask_cur_list, x_enc_mb2_ts, x_mask_mb2_ts, x5_enc_list, x5_mask_list, turn_num5, x5_enc_ts, x5_mask_ts, x5_enc_cur_last, x5_mask_cur_last ] data_type_dm2.append(match_type + "_global") # ==================================== Dynamic Memory Layer AGG ============================= for match_type in data_type_dm2: scope_name = "Model" # if conf["sepqrate_cs"]: scope_name = "Model"+match_type with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE): if conf["use_dynamic_memory"]: scope_name1 = "dynamic_memory_part3" if "global" not in match_type else "dynamic_memory_part3_global" with tf.variable_scope(scope_name1, reuse=tf.AUTO_REUSE): x5_e_mb, x5_enc_cur_list, x5_mask_cur_list, x_enc_mb2_ts, x_mask_mb2_ts, _, _, turn_num5, _, _, _, _ = save_dynamic_dict[ match_type] x_e_ts = tf.reshape(x_e, [-1, turn_num1 * sent_len, emb_dim]) x5_e_ts = tf.reshape(x5_e_mb, [-1, turn_num5 * sent_len, emb_dim]) iter_rep_per_out, sim_ori = match_layer_selfatt( x5_e_ts, x5_enc_cur_list, x5_mask_cur_list, x_e_ts, x_enc_mb2_ts, x_mask_mb2_ts, emb_dim, initializer_opt, turn_num5=None, matchin_include_x=matchin_include_x) iter_rep.append(iter_rep_per_out) iter_rep_con = tf.concat(iter_rep, axis=1) return iter_rep_con, iter_rep, all_mem_weight_dict, save_dynamic_dict, sim_ori_all
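# NumPy sketch (illustrative shapes, not model code) of the tile/reshape done in the
# static-memory branch of cs_model above: the encoded query [bs, turn_num1, sent_len, emb]
# is repeated once per key turn (turn_num5) and flattened so every key turn can attend
# over the whole query sequence.
import numpy as np

bs, turn_num1, turn_num5, sent_len, emb = 2, 3, 4, 5, 8
x_enc_demo = np.random.randn(bs, turn_num1, sent_len, emb)

tiled = np.tile(x_enc_demo[:, None], (1, turn_num5, 1, 1, 1))         # [bs, turn_num5, turn_num1, sent_len, emb]
x_enc_mb2_ts_demo = tiled.reshape(-1, turn_num1 * sent_len, emb)      # [bs*turn_num5, turn_num1*sent_len, emb]
print(x_enc_mb2_ts_demo.shape)  # (8, 15, 8)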
def dam_model(input_x, input_x_mask, input_y, input_y_mask, word_emb, keep_rate, conf, x_len=None, y_len=None): Hr = tf.nn.embedding_lookup(word_emb, input_y) if conf['is_positional'] and conf['stack_num'] > 0: with tf.variable_scope('positional'): Hr = op.positional_encoding_vector(Hr, max_timescale=10) Hr_stack = [Hr] for index in range(conf['stack_num']): with tf.variable_scope('self_stack_cr_' + str(index)): Hr = layers.block(Hr, Hr, Hr, Q_lengths=y_len, K_lengths=y_len) Hr_stack.append(Hr) #context part #a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len] list_turn_t = tf.unstack(input_x, axis=1) list_turn_length = tf.unstack(x_len, axis=1) sim_turns = [] #for every turn_t calculate matching vector for turn_t, t_turn_length in zip(list_turn_t, list_turn_length): Hu = tf.nn.embedding_lookup(word_emb, turn_t) #[batch, max_turn_len, emb_size] if conf['is_positional'] and conf['stack_num'] > 0: with tf.variable_scope('positional', reuse=True): Hu = op.positional_encoding_vector(Hu, max_timescale=10) Hu_stack = [Hu] for index in range(conf['stack_num']): with tf.variable_scope('self_stack_cr_' + str(index), reuse=True): Hu = layers.block(Hu, Hu, Hu, Q_lengths=t_turn_length, K_lengths=t_turn_length) Hu_stack.append(Hu) r_a_t_stack = [] t_a_r_stack = [] for index in range(conf['stack_num'] + 1): with tf.variable_scope('t_attend_r_cr_' + str(index)): try: t_a_r = layers.block(Hu_stack[index], Hr_stack[index], Hr_stack[index], Q_lengths=t_turn_length, K_lengths=y_len) except ValueError: tf.get_variable_scope().reuse_variables() t_a_r = layers.block(Hu_stack[index], Hr_stack[index], Hr_stack[index], Q_lengths=t_turn_length, K_lengths=y_len) with tf.variable_scope('r_attend_t_cr_' + str(index)): try: r_a_t = layers.block(Hr_stack[index], Hu_stack[index], Hu_stack[index], Q_lengths=y_len, K_lengths=t_turn_length) except ValueError: tf.get_variable_scope().reuse_variables() r_a_t = layers.block(Hr_stack[index], Hu_stack[index], Hu_stack[index], Q_lengths=y_len, K_lengths=t_turn_length) t_a_r_stack.append(t_a_r) r_a_t_stack.append(r_a_t) t_a_r_stack.extend(Hu_stack) r_a_t_stack.extend(Hr_stack) t_a_r = tf.stack(t_a_r_stack, axis=-1) r_a_t = tf.stack(r_a_t_stack, axis=-1) #calculate similarity matrix with tf.variable_scope('similarity'): # sim shape [batch, max_turn_len, max_turn_len, 2*stack_num+1] # divide sqrt(200) to prevent gradient explosion sim = tf.einsum('biks,bjks->bijs', t_a_r, r_a_t) / tf.sqrt(200.0) sim_turns.append(sim) #cnn and aggregation sim = tf.stack(sim_turns, axis=1) print('sim shape: %s' % sim.shape) with tf.variable_scope('cnn_aggregation'): final_info = layers.CNN_3d(sim, 32, 16) #for douban #final_info = layers.CNN_3d(sim, 16, 16) return final_info
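# NumPy sketch (numbers are illustrative) of the channel count in dam_model above:
# stack_num+1 cross-attended representations plus stack_num+1 self-attended ones are
# stacked on the last axis, so the similarity cube carries 2*(stack_num+1) channels.
import numpy as np

stack_num, batch, max_turn_len, emb = 5, 2, 7, 200
t_a_r_stack = [np.random.randn(batch, max_turn_len, emb) for _ in range(stack_num + 1)]
Hu_stack = [np.random.randn(batch, max_turn_len, emb) for _ in range(stack_num + 1)]

t_a_r_demo = np.stack(t_a_r_stack + Hu_stack, axis=-1)   # [batch, len, emb, 2*(stack_num+1)]
print(t_a_r_demo.shape)  # (2, 7, 200, 12)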