def calc_multi_head_similarity_fn(h, u, h_mask, u_mask, scope=None, num_units=None):
    """Multi-head scaled dot-product attention with queries from `u` and keys/values from `h`.

    `input_keep_prob`, `cfg`, `is_train` and `num_heads` are not parameters;
    they are read from the surrounding scope, which is not visible in this
    block.

    Args:
        h: tensor projected into keys and values; per the original shape
            comments, (N, T_k, C).
        u: tensor projected into queries; per the original shape comments,
            (N, T_q, C).
        h_mask: mask over the key positions. It is combined with `&`, so it
            is presumably a boolean (N, T_k) tensor -- confirm with callers.
        u_mask: mask over the query positions, presumably boolean (N, T_q).
        scope: optional variable-scope name; defaults to
            'multi_head_attention'.
        num_units: width of the Q/K/V projections. Defaults to the static
            last dimension of `h`. It is split into `num_heads` pieces by
            `tf.split`.

    Returns:
        The per-head `softsel` results concatenated back along the feature
        axis; (N, T_q, C) per the original shape comments.
    """
    if num_units is None:
        num_units = h.get_shape().as_list()[-1]

    with tf.variable_scope(scope or 'multi_head_attention'):
        # Linear projections (identity activation).
        Q = F(u, num_units, activation=tf.identity, input_keep_prob=input_keep_prob,
              wd=cfg.wd, is_train=is_train, scope='Q')  # (N, T_q, C)
        K = F(h, num_units, activation=tf.identity, input_keep_prob=input_keep_prob,
              wd=cfg.wd, is_train=is_train, scope='K')  # (N, T_k, C)
        V = F(h, num_units, activation=tf.identity, input_keep_prob=input_keep_prob,
              wd=cfg.wd, is_train=is_train, scope='V')  # (N, T_k, C)

        # Split the feature axis into heads and stack the heads on the batch axis.
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, C/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)

        # Dot-product logits, scaled by sqrt of the per-head key width.
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)
        outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)

        # Repeat the values once per query position so softsel can reduce over T_k.
        aug_V = tf.tile(tf.expand_dims(V_, 1), [1, tf.shape(Q_)[1], 1, 1])  # [N*h, T_q, T_k, C/h]

        # A (query, key) pair is valid only when both positions are unmasked.
        # Tiling the batch axis by num_heads matches the head-major stacking above.
        mask = (
            tf.tile(tf.expand_dims(h_mask, 1), [num_heads, tf.shape(Q_)[1], 1])
            & tf.tile(tf.expand_dims(u_mask, 2), [num_heads, 1, tf.shape(K_)[1]])
        )  # [N*h, T_q, T_k]

        # softsel replaces the manual key/query masking, softmax and weighted
        # sum that earlier revisions of this function spelled out step by step.
        outputs = softsel(aug_V, outputs, mask=mask)

        # Restore shape: move the heads from the batch axis back to the feature axis.
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, C)

        # No residual connection or normalization is applied here.
    return outputs
def __call__(self, inputs, state, scope=None):
    """Run the wrapped cell on inputs augmented with an attended memory read.

    The controller scores the flattened memory, `softsel` turns those scores
    (under the flattened mask) into a single selected memory, and the mapper
    combines it with `inputs` before the wrapped cell is called.

    Note that the wrapped cell is called with the incoming `state`; the state
    returned by the mapper is not used.

    Args:
        inputs: inputs for this step.
        state: state for this step; passed to the controller, the mapper and
            the wrapped cell.
        scope: optional variable-scope name; defaults to "AttentionCell".

    Returns:
        Whatever the wrapped cell returns.
    """
    with tf.variable_scope(scope or "AttentionCell"):
        attention_scores = self._controller(inputs, state, self._flat_memory)
        attended_memory = softsel(
            self._flat_memory,
            attention_scores,
            mask=self._flat_mask,
        )  # [N, m]
        mapped_inputs, _mapped_state = self._mapper(inputs, state, attended_memory)
        cell_result = self._cell(mapped_inputs, state)
    return cell_result
def self_attention(config, is_train, p, p_mask, scope=None):
    """Attend every position of `p` over all positions of the same sequence.

    Args:
        config: object providing `wd` and `att_func`, both forwarded to
            `get_logits`.
        is_train: forwarded to `get_logits`.
        p: sequence tensor; annotated as [N, L, 2d] in the original code.
        p_mask: mask for `p`. It is combined with `&`, so it is presumably a
            boolean [N, L] tensor -- confirm with callers.
        scope: optional variable-scope name; defaults to "self_attention".

    Returns:
        The `softsel` result over the pairwise logits, i.e. one attended
        vector per position of `p` (presumably [N, L, 2d]).
    """
    with tf.variable_scope(scope or "self_attention"):
        PL = tf.shape(p)[1]

        # Build every (i, j) pair of positions: p_aug_1 repeats position i
        # along axis 2, p_aug_2 repeats position j along axis 1.
        p_aug_1 = tf.tile(tf.expand_dims(p, 2), [1, 1, PL, 1])
        p_aug_2 = tf.tile(tf.expand_dims(p, 1), [1, PL, 1, 1])  # [N, PL, PL, 2d]

        # A pair is valid only when both of its positions are unmasked.
        p_mask_aug_1 = tf.tile(tf.expand_dims(p_mask, 2), [1, 1, PL])
        p_mask_aug_2 = tf.tile(tf.expand_dims(p_mask, 1), [1, PL, 1])
        self_mask = p_mask_aug_1 & p_mask_aug_2

        h_logits = get_logits([p_aug_1, p_aug_2], None, True, wd=config.wd, mask=self_mask,
                              is_train=is_train, func=config.att_func, scope='h_logits')  # [N, PL, PL]
        self_att = softsel(p_aug_2, h_logits, mask=self_mask, scope='self_att')
    return self_att
def calc_multi_perspective_similarity_fn(h, u, u_f, h_mask, u_mask, num_perspectives=2, keep_rate=1.0, scope=None):
    """Match every position of `h` against `u` in four ways and concatenate the results.

    The four features, each brought to [N, JX, num_perspectives]:
      * h1: `match_fn` of `h` against `u_f` (expanded on axis 1).
      * h2: `match_fn` of `h` against every position of `u`, max-reduced over
        the `u` axis after masking invalid pairs with `exp_mask`.
      * h3: `match_fn` of `h` against the `softsel`-weighted mean of `u`,
        weighted by the dot-product similarity.
      * h4: `match_fn` of `h` against the single `u` vector with the highest
        (masked) dot-product similarity.

    Args:
        h: tensor whose positions are matched; [N, JX, d] per the original
            shape comments.
        u: tensor matched against; [N, JQ, d] per the original shape comments.
        u_f: summary of `u` used for h1; presumably [N, d] -- confirm with
            callers.
        h_mask: mask for `h`, combined with `tf.logical_and` (boolean).
        u_mask: mask for `u`, combined with `tf.logical_and` (boolean).
        num_perspectives: forwarded to `match_fn`; also the size of the last
            axis of each feature.
        keep_rate: accepted for interface compatibility; not used here.
        scope: optional variable-scope name; defaults to 'multi_perspective'.

    Returns:
        Concatenation of h1..h4 on the last axis, [N, JX, 4 * num_perspectives].
    """
    with tf.variable_scope(scope or 'multi_perspective'):
        N, JX, JQ = tf.shape(h)[0], tf.shape(h)[1], tf.shape(u)[1]
        d = h.get_shape().as_list()[-1]

        # A (h position, u position) pair is valid only if both are unmasked.
        h_u_mask = tf.logical_and(
            tf.tile(tf.expand_dims(h_mask, -1), [1, 1, JQ]),
            tf.tile(tf.expand_dims(u_mask, 1), [1, JX, 1]))  # [N, JX, JQ]

        # h1: match against the summary vector of u.
        h1 = match_fn(h, tf.expand_dims(u_f, 1), num_perspectives=num_perspectives, scope='h1')
        h1 = tf.reshape(h1, [N, JX, num_perspectives])

        # h2: match against every u position, then masked max over u.
        h2 = match_fn(h, u, num_perspectives=num_perspectives, scope='h2')  # [N, JX, JQ, l]
        h2_mask = tf.tile(tf.expand_dims(h_u_mask, 3), [1, 1, 1, num_perspectives])
        h2 = tf.reduce_max(exp_mask(h2, h2_mask), 2)  # [N, JX, l]

        h_u_similarity = calc_similarity_fn(h, u, logit_type='dot', scope='h_u_similarity')  # [N, JX, JQ]
        aug_u = tf.tile(tf.expand_dims(u, 1), [1, JX, 1, 1])  # [N, JX, JQ, d]

        # h3: match against the attention-weighted mean of u.
        u_mean = softsel(aug_u, h_u_similarity, mask=h_u_mask, scope='u_mean')  # [N, JX, d]
        h3 = match_fn(tf.reshape(h, [-1, 1, d]), tf.reshape(u_mean, [-1, 1, d]),
                      num_perspectives=num_perspectives, scope='h3')
        h3 = tf.reshape(h3, [N, JX, num_perspectives])

        # h4: match against the single most similar u vector. The argmax is
        # taken over masked similarities so a padded u position can never be
        # selected (the same masking h2 applies before its max).
        best_u_index = tf.argmax(exp_mask(h_u_similarity, h_u_mask), axis=2)  # [N, JX]
        best_u_one_hot = tf.one_hot(best_u_index, JQ, dtype='float')  # [N, JX, JQ]
        u_max_mean = tf.reduce_sum(
            tf.tile(tf.expand_dims(best_u_one_hot, 3), [1, 1, 1, d]) * aug_u, 2)  # [N, JX, d]
        # Previously this matched against u_mean again, which made h4 a second
        # copy of the h3 computation and left u_max_mean unused.
        h4 = match_fn(tf.reshape(h, [-1, 1, d]), tf.reshape(u_max_mean, [-1, 1, d]),
                      num_perspectives=num_perspectives, scope='h4')
        h4 = tf.reshape(h4, [N, JX, num_perspectives])

        out = tf.concat([h1, h2, h3, h4], 2)  # [N, JX, 4*l]
    return out
def build_network(self):
    """Build the hierarchical document classifier graph and store `self.logits`.

    Pipeline, following the variable scopes below: word embedding lookup ->
    word-level RNN over each sentence -> attention pooling over words ->
    sentence-level RNN over each document -> attention pooling over
    sentences -> tanh hidden layer -> linear logits layer.

    Reads `self.config`, `self.docs`, `self.wordmask`, `self.sentencemask`
    and `self.is_train`. When `config.debug` is set, intermediate tensors are
    also recorded in `self.tensor_dict`.
    """
    cfg = self.config
    if cfg.debug:
        self.tensor_dict = {}

    vocab_size = cfg.n_voc
    emb_dim = cfg.emb_dim
    hidden_dim = cfg.hidden_dim

    # Dynamic sizes of the `docs` input: batch, sentences per doc, words per sentence.
    batch_size = tf.shape(self.docs)[0]
    num_sentences = tf.shape(self.docs)[1]
    num_words = tf.shape(self.docs)[2]

    keep_prob = cfg.input_keep_prob
    train_kwargs = dict(wd=cfg.wd, is_train=self.is_train)

    with tf.variable_scope("emb"):
        # The embedding matrix is pinned to the CPU.
        with tf.variable_scope('emb_var'), tf.device('/cpu:0'):
            word_emb_mat = tf.get_variable(
                'word_emb_mat',
                shape=[vocab_size, emb_dim],
                dtype='float',
                initializer=get_initializer(cfg.embeddings),
                trainable=cfg.tune_embedding,
            )
        docs_emb = tf.nn.embedding_lookup(word_emb_mat, self.docs)  # [N, M, JX, dw]
        if cfg.debug:
            self.tensor_dict['docs_emb'] = docs_emb

    # Fold the sentence axis into the batch axis so each sentence is one RNN row.
    flat_shape = [batch_size * num_sentences, num_words]
    wordmask = tf.reshape(self.wordmask, flat_shape)
    length = tf.reduce_sum(tf.to_int32(wordmask), 1)  # [N*M]
    sentencenum = tf.reduce_sum(tf.to_int32(self.sentencemask), 1)  # [N]
    docs_emb = tf.reshape(docs_emb, [batch_size * num_sentences, num_words, emb_dim])

    with tf.variable_scope("wordlstm"):
        document, _ = rnn(
            cfg.rnn_type, docs_emb, length, hidden_dim,
            scope='document', dropout_keep_prob=keep_prob, **train_kwargs
        )  # [N*M, JX, 2*d]
        if cfg.debug:
            self.tensor_dict['document'] = document

    with tf.variable_scope("pooling_layer"):
        # One scalar score per word, then a masked soft selection over words.
        logits = linear(
            document, 1, True,
            scope='logits', squeeze=True, input_keep_prob=keep_prob, **train_kwargs
        )  # [N*M, JX]
        document = softsel(document, logits, wordmask, scope='document')
        document = tf.reshape(document, [batch_size, num_sentences, 2 * hidden_dim])

    with tf.variable_scope('sentencelstm'):
        document, _ = rnn(
            cfg.rnn_type, document, sentencenum, hidden_dim,
            scope='document', dropout_keep_prob=keep_prob, **train_kwargs
        )  # [N, M, 2d]

    with tf.variable_scope("pooling_layer2"):
        # Same pooling as above, this time over sentences.
        logits = linear(
            document, 1, True,
            scope='logits', squeeze=True, input_keep_prob=keep_prob, **train_kwargs
        )  # [N, M]
        document = softsel(document, logits, self.sentencemask, 'document')

    with tf.variable_scope("hidden_layer"):
        # Keep the width of the pooled document vector.
        pooled_dim = document.get_shape().as_list()[-1]
        document = F(
            document, pooled_dim,
            activation=tf.nn.tanh, scope='document', input_keep_prob=keep_prob, **train_kwargs
        )

    with tf.variable_scope("logits_layer"):
        logits = linear(document, cfg.n_classes, True, wd=cfg.wd, scope='logits')  # [N, n_classes]

    self.logits = logits
def basic_block(config, p, h, p_mask, h_mask, input_keep_prob=1.0, is_train=None, scope=None, attentive=True):
    """One matching block: cross-attention, self-attention, fusion, projection, shared RNN, pooling.

    Args:
        config: object providing `wd`, `att_func`, `input_keep_prob`,
            `rnn_type` and `hidden_size`.
        p: first sequence; [N, JX, d] per the original shape comments.
        h: second sequence; [N, JQ, d] per the original shape comments.
        p_mask: mask for `p`. It is combined with `&`, so it is presumably a
            boolean [N, JX] tensor -- confirm with callers.
        h_mask: mask for `h`, presumably boolean [N, JQ].
        input_keep_prob: accepted for interface compatibility; not used. All
            dropout settings are taken from `config.input_keep_prob`.
        is_train: forwarded to the layer helpers.
        scope: optional variable-scope name; defaults to "basic_block".
        attentive: when True (the previous hard-coded behavior), an
            attention-pooled vector is appended to each max-pooled vector.

    Returns:
        `(p, h, p_max, h_max)`: the masked RNN outputs for both sequences and
        their pooled summaries.

    NOTE(review): `p` and `h` are concatenated along the batch axis for the
    shared RNN, which requires them to have the same time dimension
    (JX == JQ) -- confirm that callers pad both to a common length.
    """
    JX, JQ = tf.shape(p)[1], tf.shape(h)[1]
    d = p.get_shape().as_list()[-1]

    # A (p position, h position) pair is valid only if both are unmasked.
    aug_p_mask = tf.tile(tf.expand_dims(p_mask, 2), [1, 1, JQ])
    aug_h_mask = tf.tile(tf.expand_dims(h_mask, 1), [1, JX, 1])
    mask = aug_p_mask & aug_h_mask  # [N, JX, JQ]

    with tf.variable_scope(scope or "basic_block"):
        with tf.variable_scope("att"):
            aug_p = tf.tile(tf.expand_dims(p, 2), [1, 1, JQ, 1])  # [N, JX, JQ, d]
            aug_h = tf.tile(tf.expand_dims(h, 1), [1, JX, 1, 1])  # [N, JX, JQ, d]
            similarity = get_logits([aug_p, aug_h], None, True, wd=config.wd, mask=mask,
                                    is_train=is_train, func=config.att_func,
                                    scope='similarity')  # [N, JX, JQ]
            # h attended from each p position, and p attended from each h
            # position (same logits, transposed).
            h_att = softsel(aug_h, similarity, mask=mask, scope='h_att')  # [N, JX, d]
            p_att = softsel(tf.transpose(aug_p, [0, 2, 1, 3]),  # [N, JQ, JX, d]
                            tf.transpose(similarity, [0, 2, 1]),
                            mask=tf.transpose(mask, [0, 2, 1]), scope='p_att')  # [N, JQ, d]

        with tf.variable_scope("self_att"):
            p = self_attention_layer(config, is_train, p, p_mask=p_mask,
                                     scope="self_attention")  # [N, JX, d]
            # Share the self-attention variables between p and h.
            tf.get_variable_scope().reuse_variables()
            h = self_attention_layer(config, is_train, h, p_mask=h_mask,
                                     scope="self_attention")  # [N, JQ, d]

        # Fuse each sequence with its cross-attended counterpart.
        p = tf.concat([p * h_att, tf.abs(p - h_att)], -1)  # [N, JX, 2d]
        h = tf.concat([h * p_att, tf.abs(h - p_att)], -1)  # [N, JQ, 2d]

        # The scope name keeps its original spelling because it is part of
        # the variable names.
        with tf.variable_scope("projecion"):
            p = F(p, d, activation=tf.nn.relu, scope='p',
                  input_keep_prob=config.input_keep_prob, is_train=is_train, wd=config.wd)
            # The second call reuses the same variables, so `wd` is not passed again.
            tf.get_variable_scope().reuse_variables()
            h = F(h, d, activation=tf.nn.relu, scope='p',
                  input_keep_prob=config.input_keep_prob, is_train=is_train)

        with tf.variable_scope("rnn"):
            p_len = tf.reduce_sum(tf.cast(p_mask, tf.int32), 1)
            h_len = tf.reduce_sum(tf.cast(h_mask, tf.int32), 1)
            # p and h go through one shared RNN as a single batch of 2N rows,
            # so the lengths must cover both halves. Previously only p_len
            # (N entries) was passed and h_len was never used.
            ph_len = tf.concat([p_len, h_len], 0)
            ph, _ = rnn(config.rnn_type, tf.concat([p, h], 0), ph_len, config.hidden_size,
                        scope='p_rnn', dropout_keep_prob=config.input_keep_prob,
                        is_train=is_train, wd=config.wd)
            p, h = tf.split(ph, 2)

        # Zero out the padded positions.
        p = p * tf.to_float(tf.expand_dims(p_mask, -1))  # [N, JX, d]
        h = h * tf.to_float(tf.expand_dims(h_mask, -1))  # [N, JQ, d]

        p_max = tf.reduce_max(p, 1)
        h_max = tf.reduce_max(h, 1)

        if attentive:
            # Attention pooling with scoring weights shared between p and h.
            p_att_logits = linear(p, 1, True, scope='p_att_logits', squeeze=True,
                                  input_keep_prob=config.input_keep_prob,
                                  is_train=is_train, wd=config.wd)
            p_att = softsel(p, p_att_logits, mask=p_mask, scope='p_att')
            tf.get_variable_scope().reuse_variables()
            h_att_logits = linear(h, 1, True, scope='p_att_logits', squeeze=True,
                                  input_keep_prob=config.input_keep_prob, is_train=is_train)
            h_att = softsel(h, h_att_logits, mask=h_mask, scope='h_att')
            p_max = tf.concat([p_max, p_att], -1)
            h_max = tf.concat([h_max, h_att], -1)

    return p, h, p_max, h_max