import math

import tensorflow as tf

# Note: helper ops such as mask_for_high_rank, exp_mask_for_high_rank, dropout,
# linear, bn_dense_layer, get_logits, softsel, scaled_tanh, reduce_data_rep_max_len
# and add_reg_without_bias are assumed to be imported from the repo's utility modules.


def mean_pooling_for_unselected_head(
        unhead_org_idx, sl_unhead, rep_unhead_mask,
        dep_org_idx, sl_dep, rep_dep_mask,
        rep_dep_tensor, direction):
    with tf.name_scope('pooling_for_un_head'):
        undep_idxs = tf.tile(tf.expand_dims(dep_org_idx, 1), [1, sl_unhead, 1])  # [bs, sluh, sld]
        unhead_idxs = tf.tile(tf.expand_dims(unhead_org_idx, 2), [1, 1, sl_dep])  # [bs, sluh, sld]
        if direction is None:
            direct_mask_un = tf.not_equal(unhead_idxs, undep_idxs)  # [bs, sluh, sld]
        else:
            if direction == 'forward':
                direct_mask_un = tf.greater(unhead_idxs, undep_idxs)  # [bs, sluh, sld]
            else:
                direct_mask_un = tf.less(unhead_idxs, undep_idxs)  # [bs, sluh, sld]
        # [bs, sluh, sld]
        rep_mask_tile_un = tf.logical_and(tf.expand_dims(rep_dep_mask, 1),
                                          tf.expand_dims(rep_unhead_mask, 2))
        pooling_mask = tf.logical_and(direct_mask_un, rep_mask_tile_un)  # [bs, sluh, sld]

        # data for pooling
        pooling_data = tf.tile(tf.expand_dims(rep_dep_tensor, 1), [1, sl_unhead, 1, 1])  # bs,sluh,sld,hn

        # execute mean pooling based on pooling_mask[bs, sluh, sld] and pooling_data[bs,sluh,sld,hn]
        pooling_data = mask_for_high_rank(pooling_data, pooling_mask)  # [bs,sluh,sld,hn]
        pooling_data_sum = tf.reduce_sum(pooling_data, -2)  # [bs,sluh,hn]
        pooling_den = tf.reduce_sum(tf.cast(pooling_mask, tf.int32), -1, keep_dims=True)  # [bs,sluh,1]
        # guard against fully masked rows before dividing
        pooling_den = tf.where(tf.equal(pooling_den, 0), tf.ones_like(pooling_den), pooling_den)

        pooling_result = pooling_data_sum / tf.cast(pooling_den, tf.float32)
        return pooling_result
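# --- Illustration (not part of the original pipeline) -----------------------
# A minimal NumPy sketch of the masked mean pooling above: zero out invalid
# positions, sum over the dependent axis, and guard the denominator so fully
# masked rows do not divide by zero. Toy shapes only; the real helper
# mask_for_high_rank comes from the repo's utilities.
def _demo_masked_mean_pooling():
    import numpy as np
    pooling_data = np.arange(1 * 2 * 3 * 2, dtype=np.float32).reshape(1, 2, 3, 2)  # [bs, sluh, sld, hn]
    pooling_mask = np.array([[[True, True, False],      # head 0: two valid dependents
                              [False, False, False]]])  # head 1: nothing valid
    masked = pooling_data * pooling_mask[..., None]      # zero out invalid positions
    data_sum = masked.sum(axis=-2)                       # [bs, sluh, hn]
    den = pooling_mask.sum(axis=-1, keepdims=True)       # [bs, sluh, 1]
    den = np.where(den == 0, 1, den)                     # guard against empty rows
    return data_sum / den                                # head 0 -> [1., 2.]; head 1 -> [0., 0.]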
def cnn_for_sentence_encoding(  # kim
        rep_tensor, rep_mask, filter_sizes=(3, 4, 5), num_filters=200, scope=None,
        is_train=None, keep_prob=1., wd=0.):
    """Kim-style CNN sentence encoder: parallel convolutions over several
    n-gram widths followed by max-over-time pooling.

    :param rep_tensor: [bs, sl, vec] input token representations
    :param rep_mask: [bs, sl] boolean mask of valid tokens
    :param filter_sizes: n-gram widths of the convolution filters
    :param num_filters: number of filters per filter size
    :param scope: variable scope name
    :param is_train: bool tensor controlling dropout
    :param keep_prob: dropout keep probability
    :param wd: weight decay coefficient
    :return: [bs, num_filters * len(filter_sizes)] sentence encoding
    """
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    with tf.variable_scope(scope or 'cnn_for_sentence_encoding'):
        rep_tensor = mask_for_high_rank(rep_tensor, rep_mask)
        rep_tensor_expand = tf.expand_dims(rep_tensor, 3)
        rep_tensor_expand_dp = dropout(rep_tensor_expand, keep_prob, is_train)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                # Convolution layer
                filter_shape = [filter_size, ivec, 1, num_filters]
                W = tf.get_variable('W', filter_shape, tf.float32)
                b = tf.get_variable('b', [num_filters], tf.float32)
                conv = tf.nn.conv2d(
                    rep_tensor_expand_dp, W, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")  # bs, sl-fs+1, 1, fn
                # Max-pooling over time. tf.nn.max_pool needs a static ksize, and
                # sl is dynamic here, so reduce_max over the time axis is used instead.
                pooled = tf.reduce_max(h, 1, True)  # bs, 1, 1, fn
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

        if wd > 0.:
            add_reg_without_bias()

        return h_pool_flat
def cnn_for_context_fusion(rep_tensor, rep_mask, filter_sizes=(3, 4, 5), num_filters=200,
                           scope=None, is_train=None, keep_prob=1., wd=0.):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    with tf.variable_scope(scope or 'cnn_for_context_fusion'):
        rep_tensor = mask_for_high_rank(rep_tensor, rep_mask)
        rep_tensor_expand = tf.expand_dims(rep_tensor, 3)  # bs, sl, ivec, 1
        rep_tensor_expand_dp = dropout(rep_tensor_expand, keep_prob, is_train)

        # Create a convolution layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                # Convolution layer
                filter_shape = [filter_size, ivec, 1, num_filters]
                W = tf.get_variable('W', filter_shape, tf.float32)
                b = tf.get_variable('b', [num_filters], tf.float32)

                # padding along the sequence axis so the output keeps length sl
                if filter_size % 2 == 1:
                    padding_front = padding_back = (filter_size - 1) // 2
                else:
                    padding_front = (filter_size - 1) // 2
                    padding_back = padding_front + 1
                padding = [[0, 0], [padding_front, padding_back], [0, 0], [0, 0]]
                rep_tensor_expand_dp_pad = tf.pad(rep_tensor_expand_dp, padding)

                conv = tf.nn.conv2d(
                    rep_tensor_expand_dp_pad, W, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")  # bs, sl, 1, fn
                h_squeeze = tf.squeeze(h, [2])  # bs, sl, fn
                pooled_outputs.append(h_squeeze)

        # Combine the per-filter-size features
        result = tf.concat(pooled_outputs, 2)  # bs, sl, 3 * fn

        if wd > 0.:
            add_reg_without_bias()

        return result
def hierarchical_cnn_res_gate(
        rep_tensor, rep_mask, n_gram=5, layer_num=5, hn=None, scope=None,
        is_train=None, keep_prob=1., wd=0.):
    # padding along the sequence axis so every layer keeps length sl
    if n_gram % 2 == 1:
        padding_front = padding_back = (n_gram - 1) // 2
    else:
        padding_front = (n_gram - 1) // 2
        padding_back = padding_front + 1
    padding = [[0, 0], [padding_front, padding_back], [0, 0], [0, 0]]

    # lengths
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    org_ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or org_ivec

    with tf.variable_scope(scope or 'hierarchical_cnn_res_gate'):
        rep_tensor = mask_for_high_rank(rep_tensor, rep_mask)  # bs, sl, hn

        iter_rep = rep_tensor
        layer_res_list = []

        for layer_idx in range(layer_num):
            with tf.variable_scope("conv_maxpool_%s" % layer_idx):
                iter_rep_etd = tf.expand_dims(iter_rep, 3)  # bs,sl,hn,1
                iter_rep_etd_dp = dropout(iter_rep_etd, keep_prob, is_train)

                # Convolution layer
                feature_size = org_ivec if layer_idx == 0 else ivec
                filter_shape = [n_gram, feature_size, 1, 2 * ivec]
                W = tf.get_variable('W', filter_shape, tf.float32)
                b = tf.get_variable('b', [2 * ivec], tf.float32)

                iter_rep_etd_pad = tf.pad(iter_rep_etd_dp, padding)
                conv = tf.nn.conv2d(
                    iter_rep_etd_pad, W, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                map_res = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")  # bs,sl,1,2hn
                map_res = tf.squeeze(map_res, [2])  # bs,sl,2*hn

                # gate (GLU-style): first half is content, second half is the gate
                map_res_a, map_res_b = tf.split(map_res, num_or_size_splits=2, axis=2)
                iter_rep = map_res_a * tf.nn.sigmoid(map_res_b)

                # residual connection to the previous layer
                if len(layer_res_list) > 0:
                    iter_rep = iter_rep + layer_res_list[-1]

                layer_res_list.append(iter_rep)

        if wd > 0.:
            add_reg_without_bias()

        return iter_rep
def time_aware_attention(train_inputs, embed, mask, embedding_size, k):
    with tf.variable_scope('time_aware_attention'):
        attn_weights = tf.Variable(
            tf.truncated_normal([embedding_size, k], stddev=1.0 / math.sqrt(k)))
        attn_biases = tf.Variable(tf.zeros([k]))
        # add the bias to the attention weights
        attn_embed = tf.nn.bias_add(attn_weights, attn_biases)
        # project the embeddings: [bs, sl, embedding_size] x [embedding_size, k]
        attn_scalars = tf.tensordot(embed, attn_embed, axes=[[2], [0]])
        # absolute time distance
        train_delta = tf.abs(train_inputs[:, :, 1])
        # distance function is log(dist + 1)
        dist_fun = tf.log(tf.to_float(train_delta) + 1.0)
        # reshape dist_fun to [bs, sl, 1]
        dist_fun = tf.reshape(dist_fun, [tf.shape(dist_fun)[0], tf.shape(dist_fun)[1], 1])

        # attention logits
        attn_logits = tf.multiply(attn_scalars, dist_fun)
        # sum of the attention logits
        attn_logits_sum = tf.reduce_sum(attn_logits, -1, keepdims=True)
        attn_logits_sum = exp_mask_for_high_rank(attn_logits_sum, mask)

        # attention weights via softmax over the sequence axis
        attn_softmax = tf.nn.softmax(attn_logits_sum, 1)

        # weighted sum of the embeddings
        attn_embed_weighted = tf.multiply(attn_softmax, embed)
        attn_embed_weighted = mask_for_high_rank(attn_embed_weighted, mask)
        reduced_embed = tf.reduce_sum(attn_embed_weighted, 1)

        # two scaling factors
        scalar1 = tf.log(tf.to_float(tf.shape(embed)[1]) + 1.0)
        scalar2 = tf.reduce_sum(tf.pow(attn_softmax, 2), 1)
        # scale the pooled embedding
        reduced_embed = tf.multiply(reduced_embed, scalar1)
        reduced_embed = tf.multiply(reduced_embed, scalar2)

        return reduced_embed, attn_embed_weighted
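# --- Illustration (not part of the original pipeline) -----------------------
# A small sketch of the distance function used above: log(|delta| + 1) grows
# slowly with the time gap and maps a gap of zero to exactly zero.
def _demo_log_distance():
    import numpy as np
    deltas = np.array([0, 1, 7, 30, 365], dtype=np.float32)
    return np.log(deltas + 1.0)  # [0., 0.693, 2.079, 3.434, 5.903]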
def sentence_encoding_models(rep_tensor, rep_mask, method, activation_function,
                             scope=None, wd=0., is_train=None, keep_prob=1., **kwargs):
    method_name_list = [
        'cnn_kim',
        'no_ct',
        'lstm', 'gru', 'sru', 'sru_normal',  # rnn
        'cnn',
        'multi_head', 'multi_head_git', 'disa',
        'mpsa',
        'block'
    ]

    with tf.variable_scope(scope or 'sentence_encoding_models'):
        if method == 'cnn_kim':
            sent_coding = cnn_for_sentence_encoding(
                rep_tensor, rep_mask, (3, 4, 5), 200, 'sent_encoding_cnn_kim',
                is_train, keep_prob, wd)
        elif method == 'none':
            sent_coding = tf.reduce_sum(mask_for_high_rank(rep_tensor, rep_mask), 1)
        else:
            if method == 'no_ct':
                ct_rep = tf.identity(rep_tensor)
            else:
                ct_rep = context_fusion_layers(
                    rep_tensor, rep_mask, method, activation_function,
                    None, wd, is_train, keep_prob, **kwargs)

            sent_coding = multi_dimensional_attention(
                ct_rep, rep_mask, 'multi_dim_attn_for_%s' % method,
                keep_prob, is_train, wd, activation_function)

        return sent_coding
def normal_attention(rep_tensor, rep_mask, scope=None, keep_prob=1., is_train=None, wd=0.,
                     activation='elu', tensor_dict=None, name=None):
    batch_size, code_len, vec_size = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'normal_attention'):
        rep_tensor_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                        activation, False, wd, keep_prob, is_train)

        rep_tensor_logits = get_logits(
            [rep_tensor_map], None, False, scope='self_attn_logits', mask=rep_mask,
            input_keep_prob=keep_prob, is_train=is_train)  # bs,sl
        attn_result = softsel(rep_tensor, rep_tensor_logits, rep_mask)  # bs,vec

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name] = tf.nn.softmax(rep_tensor_logits)

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_tensor_map, ivec, True, 0., 'linear_fusion_i', False, wd, keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False, wd, keep_prob, is_train) +
                o_bias)
            output = fusion_gate * rep_tensor_map + (1 - fusion_gate) * attn_result
            output = mask_for_high_rank(output, rep_mask)  # bs,sl,vec

        return output
def pooling_with_mask(rep_tensor, rep_mask, method='max', scope=None):
    # rep_tensor has one more rank than rep_mask
    with tf.name_scope(scope or '%s_pooling' % method):
        if method == 'max':
            rep_tensor_masked = exp_mask_for_high_rank(rep_tensor, rep_mask)
            output = tf.reduce_max(rep_tensor_masked, -2)
        elif method == 'mean':
            rep_tensor_masked = mask_for_high_rank(rep_tensor, rep_mask)  # [..., sl, hn]
            rep_sum = tf.reduce_sum(rep_tensor_masked, -2)  # [..., hn]
            denominator = tf.reduce_sum(tf.cast(rep_mask, tf.int32), -1, True)  # [..., 1]
            # avoid division by zero for fully masked rows
            denominator = tf.where(
                tf.equal(denominator, tf.zeros_like(denominator, tf.int32)),
                tf.ones_like(denominator, tf.int32),
                denominator)
            output = rep_sum / tf.cast(denominator, tf.float32)
        else:
            raise AttributeError('No pooling method named %s' % method)
        return output
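# --- Illustration (not part of the original pipeline) -----------------------
# The 'max' branch relies on the usual masking trick behind exp_mask_for_high_rank:
# pushing masked positions towards a very large negative value so they can never
# win the max (and would vanish after a softmax). A NumPy sketch with an assumed
# sentinel of -1e30:
def _demo_masked_max_pooling():
    import numpy as np
    very_negative = -1e30
    x = np.array([[1.0, 5.0, 3.0]])
    mask = np.array([[True, False, True]])
    x_masked = np.where(mask, x, very_negative)
    return x_masked.max(axis=-1)  # [3.] -- the masked 5.0 is ignored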
def self_attention_for_selected_head(
        head_selection, head_org_idx, sl_head, rep_head_mask,
        dep_selection, dep_org_idx, sl_dep, rep_dep_mask,
        rep_map, rep_dep_tensor, keep_prob, is_train, direction, ivec):
    # data for self-attention
    rep_map_dp = dropout(rep_map, keep_prob, is_train)
    rep_dep_tensor_dp, _, _ = reduce_data_rep_max_len(rep_map_dp, dep_selection)
    rep_head_tensor_dp, _, _ = reduce_data_rep_max_len(rep_map_dp, head_selection)

    # mask generation
    dep_idxs = tf.tile(tf.expand_dims(dep_org_idx, 1), [1, sl_head, 1])
    head_idxs = tf.tile(tf.expand_dims(head_org_idx, 2), [1, 1, sl_dep])

    if direction is None:
        direct_mask = tf.not_equal(head_idxs, dep_idxs)  # [bs, slh, sld]
    else:
        if direction == 'forward':
            direct_mask = tf.greater(head_idxs, dep_idxs)  # [bs, slh, sld]
        else:
            direct_mask = tf.less(head_idxs, dep_idxs)  # [bs, slh, sld]
    # [bs, slh, sld]
    rep_mask_tile = tf.logical_and(tf.expand_dims(rep_dep_mask, 1), tf.expand_dims(rep_head_mask, 2))
    attn_mask = tf.logical_and(direct_mask, rep_mask_tile)  # [bs, slh, sld]

    # tensor tile
    rep_map_tile = tf.tile(tf.expand_dims(rep_dep_tensor, 1), [1, sl_head, 1, 1])  # bs,slh,sld,vec
    with tf.variable_scope('attention'):  # bs,slh,sld,vec
        f_bias = tf.get_variable('f_bias', [ivec], tf.float32, tf.constant_initializer(0.))
        dependent = linear(rep_dep_tensor_dp, ivec, False, scope='linear_dependent')  # bs,sld,vec
        dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sld,vec
        head = linear(rep_head_tensor_dp, ivec, False, scope='linear_head')  # bs,slh,vec
        head_etd = tf.expand_dims(head, 2)  # bs,slh,1,vec

        logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,slh,sld,vec

        logits_masked = exp_mask_for_high_rank(logits, attn_mask)  # bs,slh,sld,vec
        attn_score = tf.nn.softmax(logits_masked, 2)  # bs,slh,sld,vec
        attn_score = mask_for_high_rank(attn_score, attn_mask)
        attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,slh,vec -> head_org_idx

    return attn_result
def directional_attention_with_dense(rep_tensor, rep_mask, direction=None, scope=None,
                                     keep_prob=1., is_train=None, wd=0., activation='elu',
                                     tensor_dict=None, name=None, hn=None):
    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = hn or rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'directional_attention_%s' % (direction or 'diag')):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            direct_mask = tf.cast(tf.diag(-tf.ones([sl], tf.int32)) + 1, tf.bool)
        else:
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)
            else:
                direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0), [bs, 1, 1])  # bs,sl,sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1), [1, sl, 1])  # bs,sl,sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs,sl,sl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1), [1, sl, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            dependent = linear(rep_map_dp, ivec, False, scope='linear_dependent')  # bs,sl,vec
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,vec
            head = linear(rep_map_dp, ivec, False, scope='linear_head')  # bs,sl,vec
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,vec

            logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,sl,sl,vec

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i', False, wd, keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False, wd, keep_prob, is_train) +
                o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result
            output = mask_for_high_rank(output, rep_mask)

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score
            tensor_dict[name + '_gate'] = fusion_gate
        return output
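# --- Illustration (not part of the original pipeline) -----------------------
# The same meshgrid mask construction as in directional_attention_with_dense,
# written in NumPy for a length-4 sequence: the row index is the attending
# token, the column index is the attended token.
def _demo_directional_masks(sl=4):
    import numpy as np
    sl_col, sl_row = np.meshgrid(np.arange(sl), np.arange(sl))
    forward_mask = sl_row > sl_col            # attend only to earlier positions
    backward_mask = sl_col > sl_row           # attend only to later positions
    diag_free_mask = ~np.eye(sl, dtype=bool)  # direction=None: everything but itself
    return forward_mask, backward_mask, diag_free_mask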
def build_network(self):
    # Look up embeddings for inputs.
    with tf.name_scope('code_embeddings'):
        init_code_embed = tf.random_uniform(
            [self.vocabulary_size, self.embedding_size], -1.0, 1.0)
        code_embeddings = tf.Variable(init_code_embed)
        context_embed = tf.nn.embedding_lookup(code_embeddings, self.context_codes)

    if self.model_type == 'tesa':
        with tf.name_scope(self.model_type):
            # Embedding size is calculated as shape(train_inputs) + shape(embeddings)[1:]
            init_date_embed = tf.random_uniform(
                [self.dates_size, self.embedding_size], -1.0, 1.0)
            date_embeddings = tf.Variable(init_date_embed)
            if self.is_date_encoding:
                date_embed = tf.nn.embedding_lookup(date_embeddings, self.context_dates)
                # self-attention conditioned on absolute dates
                cntxt_embed = temporal_date_sa_with_dense(
                    rep_tensor=context_embed, rep_mask=self.context_mask,
                    date_tensor=date_embed, is_train=True,
                    activation=self.activation, is_scale=self.is_scale)
            else:
                date_embed = tf.nn.embedding_lookup(date_embeddings, self.train_masks)
                # self-attention conditioned on time deltas
                cntxt_embed = temporal_delta_sa_with_dense(
                    rep_tensor=context_embed, rep_mask=self.context_mask,
                    delta_tensor=date_embed, is_train=True,
                    activation=self.activation, is_scale=self.is_scale)
            # attention pooling
            context_fusion = multi_dimensional_attention(
                cntxt_embed, self.context_mask, is_train=True)
    elif self.model_type == 'delta':
        with tf.name_scope(self.model_type):
            # self-attention
            init_date_embed = tf.random_uniform(
                [self.dates_size, self.embedding_size], -1.0, 1.0)
            date_embeddings = tf.Variable(init_date_embed)
            date_embed = tf.nn.embedding_lookup(date_embeddings, self.train_masks)
            cntxt_embed = delta_with_dense(
                rep_tensor=context_embed, rep_mask=self.context_mask,
                delta_tensor=date_embed, is_train=True,
                activation=self.activation, is_scale=self.is_scale)
            # attention pooling
            context_fusion = multi_dimensional_attention(
                cntxt_embed, self.context_mask, is_train=True)
    elif self.model_type == 'sa':
        with tf.name_scope(self.model_type):
            # self-attention
            cntxt_embed = self_attention_with_dense(
                rep_tensor=context_embed, rep_mask=self.context_mask,
                is_train=True, activation=self.activation, is_scale=self.is_scale)
            # attention pooling
            context_fusion = multi_dimensional_attention(
                cntxt_embed, self.context_mask, is_train=True)
    elif self.model_type == 'normal':
        with tf.name_scope(self.model_type):
            # self-attention
            cntxt_embed = normal_attention(
                rep_tensor=context_embed, rep_mask=self.context_mask,
                is_train=True, activation=self.activation)
            # attention pooling
            context_fusion = multi_dimensional_attention(
                cntxt_embed, self.context_mask, is_train=True)
    elif self.model_type == 'cbow':
        with tf.name_scope(self.model_type):
            cntxt_embed = mask_for_high_rank(context_embed, self.context_mask)  # bs,sl,vec
            context_fusion = tf.reduce_mean(cntxt_embed, 1)
    elif self.model_type == 'ta_attn':
        # time_aware_attention returns (pooled_embedding, weighted_embeddings);
        # only the pooled vector is used here
        context_fusion, _ = time_aware_attention(
            self.train_inputs, context_embed, self.context_mask,
            self.embedding_size, k=100)
    elif self.model_type == 'fusion':
        with tf.name_scope(self.model_type):
            # self-attention
            code2code = self_attention_with_dense(
                rep_tensor=context_embed, rep_mask=self.context_mask,
                is_train=True, activation=self.activation)
            # attention pooling
            source2code = multi_dimensional_attention(
                code2code, self.context_mask, is_train=True)
            # time-aware attention (only the pooled vector is needed)
            ta_attn_res, _ = time_aware_attention(
                self.train_inputs, context_embed, self.context_mask,
                self.embedding_size, k=100)
            ivec = ta_attn_res.get_shape()[1]
            concat_context = tf.concat([source2code, ta_attn_res], 1)
            # context_fusion = fusion_gate(source2code, ta_attn_res, wd=0., keep_prob=1., is_train=True)
            context_fusion = bn_dense_layer(
                concat_context, ivec, True, 0., 'bn_dense_map', self.activation,
                False, wd=0., keep_prob=1., is_train=True)

    return context_fusion, code_embeddings
def multi_head_attention(rep_tensor, rep_mask, head_num=8, hidden_units_num=64, scope=None,
                         is_train=None, keep_prob=1., wd=0.):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    with tf.variable_scope(scope or 'multi_head_attention'):

        with tf.variable_scope('positional_encoding'):
            seq_idxs = tf.tile(tf.expand_dims(tf.range(sl), 1), [1, ivec])  # sl, ivec
            feature_idxs = tf.tile(tf.expand_dims(tf.range(ivec), 0), [sl, 1])  # sl, ivec
            pos_enc = tf.where(
                tf.equal(tf.mod(feature_idxs, 2), 0),
                tf.sin(
                    tf.cast(seq_idxs, tf.float32) /
                    tf.pow(10000., 2.0 * tf.cast(feature_idxs, tf.float32) / (1.0 * ivec))),
                tf.cos(
                    tf.cast(seq_idxs, tf.float32) /
                    tf.pow(10000., 2.0 * tf.cast(feature_idxs - 1, tf.float32) / (1.0 * ivec))),
            )
            rep_tensor_pos = mask_for_high_rank(rep_tensor + pos_enc, rep_mask)  # bs, sl, ivec

        with tf.variable_scope('multi_head_attention'):
            W = tf.get_variable('W', [3, head_num, ivec, hidden_units_num], tf.float32)
            rep_tile = tf.tile(
                tf.expand_dims(tf.expand_dims(rep_tensor_pos, 0), 0),
                [3, head_num, 1, 1, 1])  # 3,head_num,bs,sl,ivec
            rep_tile_reshape = tf.reshape(rep_tile, [3, head_num, bs * sl, ivec])  # 3,head_num,bs*sl,ivec

            maps = tf.reshape(  # 3,head_num,bs*sl,hn -> 3,head_num,bs,sl,hn
                tf.matmul(dropout(rep_tile_reshape, keep_prob, is_train), W),
                [3, head_num, bs, sl, hidden_units_num])
            Q_map, K_map, V_map = tf.split(maps, 3, 0)
            Q_map = tf.squeeze(Q_map, [0])  # head_num,bs,sl,hn
            K_map = tf.squeeze(K_map, [0])  # head_num,bs,sl,hn
            V_map = tf.squeeze(V_map, [0])  # head_num,bs,sl,hn

            # head_num,bs,sl,sl
            # similarity_mat = tf.reduce_sum(Q_map_tile * K_map_tile, -1) / math.sqrt(1. * hidden_units_num)
            similarity_mat = tf.matmul(
                Q_map, tf.transpose(K_map, [0, 1, 3, 2])) / math.sqrt(1. * hidden_units_num)

            # mask: bs,sl -> head_num,bs,sl
            multi_mask = tf.tile(tf.expand_dims(rep_mask, 0), [head_num, 1, 1])  # head_num,bs,sl
            multi_mask_tile_1 = tf.expand_dims(multi_mask, 2)  # head_num,bs,1,sl
            multi_mask_tile_2 = tf.expand_dims(multi_mask, 3)  # head_num,bs,sl,1
            multi_mask_tile = tf.logical_and(multi_mask_tile_1, multi_mask_tile_2)  # head_num,bs,sl,sl
            similarity_mat_masked = exp_mask(similarity_mat, multi_mask_tile)  # head_num,bs,sl,sl
            prob_dist = tf.nn.softmax(similarity_mat_masked)  # head_num,bs,sl,sl
            prob_dist_dp = dropout(prob_dist, keep_prob, is_train)

            attn_res = tf.matmul(prob_dist_dp, V_map)  # head_num,bs,sl,hn

            attn_res_tran = tf.transpose(attn_res, [1, 2, 0, 3])
            output = tf.reshape(attn_res_tran, [bs, sl, head_num * hidden_units_num])

            if wd > 0.:
                add_reg_without_bias()

            return output
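# --- Illustration (not part of the original pipeline) -----------------------
# A NumPy sketch of the sinusoidal positional encoding built above: even feature
# indices use sin, odd ones use cos, with the same wavelength schedule.
def _demo_positional_encoding(sl=3, ivec=4):
    import numpy as np
    seq_idxs = np.arange(sl)[:, None]        # sl, 1
    feature_idxs = np.arange(ivec)[None, :]  # 1, ivec
    angle_even = seq_idxs / np.power(10000., 2.0 * feature_idxs / ivec)
    angle_odd = seq_idxs / np.power(10000., 2.0 * (feature_idxs - 1) / ivec)
    return np.where(feature_idxs % 2 == 0, np.sin(angle_even), np.cos(angle_odd))  # sl, ivec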
def context_fusion_layers(rep_tensor, rep_mask, method, activation_function,
                          scope=None, wd=0., is_train=None, keep_prob=1., **kwargs):
    method_name_list = [
        'lstm', 'gru', 'sru', 'sru_normal',  # rnn
        'cnn',
        'multi_head', 'multi_head_git', 'disa',
        'mpsa',
        'block'
    ]

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]

    context_fusion_output = None
    with tf.variable_scope(scope or 'context_fusion_layers'):
        if method in ['lstm', 'gru', 'sru_normal']:
            context_fusion_output = contextual_bi_rnn(
                rep_tensor, rep_mask, ivec, method, False, wd, keep_prob, is_train,
                'ct_bi_%s' % method)
        elif method == 'sru':
            context_fusion_output = bi_sru_recurrent_network(
                rep_tensor, rep_mask, is_train, keep_prob, wd, 'ct_bi_sru')
        elif method == 'cnn':
            context_fusion_output = cnn_for_context_fusion(
                rep_tensor, rep_mask, (3, 4, 5), 200, 'ct_cnn', is_train, keep_prob, wd)
        elif method == 'multi_head':
            context_fusion_output = multi_head_attention(
                rep_tensor, rep_mask, 8, 75, 'ct_multi_head', is_train, keep_prob, wd)
        elif method == 'multi_head_git':
            context_fusion_output = multi_head_attention_git(
                rep_tensor, rep_mask, 8, 600, 'ct_multi_head', is_train, keep_prob, wd)
        elif method == 'disa':
            with tf.variable_scope('ct_disa'):
                disa_fw = directional_attention_with_dense(
                    rep_tensor, rep_mask, 'forward', 'fw_disa',
                    keep_prob, is_train, wd, activation_function)
                disa_bw = directional_attention_with_dense(
                    rep_tensor, rep_mask, 'backward', 'bw_disa',
                    keep_prob, is_train, wd, activation_function)
                context_fusion_output = tf.concat([disa_fw, disa_bw], -1)
        elif method == 'block':
            if 'block_len' in kwargs.keys():
                block_len = kwargs['block_len']
            else:
                block_len = None
            if block_len is None:
                block_len = tf.cast(
                    tf.ceil(tf.pow(tf.cast(2 * sl, tf.float32), 1.0 / 3)), tf.int32)
            context_fusion_output = bi_directional_simple_block_attention(
                rep_tensor, rep_mask, block_len, 'ct_block_attn',
                keep_prob, is_train, wd, activation_function)
        elif method == 'mpsa':
            with tf.variable_scope('ct_mpsa'):
                mpsa_fw = masked_positional_self_attention(
                    0, rep_tensor, rep_mask, 'forward', 'fw_mpsa',
                    keep_prob, is_train, wd, activation_function)
                mpsa_bw = masked_positional_self_attention(
                    0, rep_tensor, rep_mask, 'backward', 'bw_mpsa',
                    keep_prob, is_train, wd, activation_function)
                mpsa_2g = masked_positional_self_attention(
                    2, rep_tensor, rep_mask, None, '2g_mpsa',
                    keep_prob, is_train, wd, activation_function)
                mpsa_3g = masked_positional_self_attention(
                    3, rep_tensor, rep_mask, None, '3g_mpsa',
                    keep_prob, is_train, wd, activation_function)

                sen_tensor = mask_for_high_rank(rep_tensor, rep_mask)
                sen_tensor_t = tf.expand_dims(sen_tensor, 2)
                fw_res = tf.expand_dims(mpsa_fw, 2)
                bw_res = tf.expand_dims(mpsa_bw, 2)
                g2_res = tf.expand_dims(mpsa_2g, 2)
                g3_res = tf.expand_dims(mpsa_3g, 2)
                tmp_res = tf.concat(
                    [sen_tensor_t, fw_res, bw_res, g2_res, g3_res], 2)  # bs,sl,5,ivec

                bs, sl = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1]
                ivec = rep_tensor.get_shape()[2]
                num = tmp_res.get_shape()[2]
                bias = tf.get_variable('bias', [num * ivec], tf.float32, tf.constant_initializer(0.))
                softmax_gate = linear(
                    sen_tensor, num * ivec, True, 0., 'linear_softmax',
                    False, wd, keep_prob, is_train) + bias  # bs,sl,5*ivec
                fusion_gate = tf.nn.softmax(tf.reshape(softmax_gate, [bs, sl, num, ivec]), 2)
                context_fusion_output = tf.reduce_sum(fusion_gate * tmp_res, 2)  # bs,sl,ivec
        else:
            raise RuntimeError('unknown context fusion method: %s' % method)

    return context_fusion_output
def simple_block_attention(rep_tensor, rep_mask, block_len=5, scope=None, direction=None,
                           keep_prob=1., is_train=None, wd=0., activation='elu', hn=None):
    assert direction is not None

    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    org_ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or org_ivec
    with tf.variable_scope(scope or 'block_simple'):
        # @1. split the sequence into blocks
        with tf.variable_scope('split_seq'):
            block_num = tf.cast(
                tf.ceil(tf.divide(tf.cast(sl, tf.float32), tf.cast(block_len, tf.float32))),
                tf.int32)
            comp_len = block_num * block_len - sl

            rep_tensor_comp = tf.concat(
                [rep_tensor, tf.zeros([bs, comp_len, org_ivec], tf.float32)], 1)
            rep_mask_comp = tf.concat(
                [rep_mask, tf.cast(tf.zeros([bs, comp_len], tf.int32), tf.bool)], 1)

            rep_tensor_split = tf.reshape(
                rep_tensor_comp, [bs, block_num, block_len, org_ivec])  # bs,bn,bl,d
            rep_mask_split = tf.reshape(rep_mask_comp, [bs, block_num, block_len])  # bs,bn,bl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor_split, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)  # bs,bn,bl,vec
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 2), [1, 1, block_len, 1, 1])  # bs,bn,bl,bl,vec
        # rep_map_dp = dropout(rep_map, keep_prob, is_train)
        bn = block_num
        bl = block_len

        with tf.variable_scope('self_attention'):
            # @2. self-attention within each block
            # mask generation
            sl_indices = tf.range(block_len, dtype=tf.int32)
            sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)  # bl,bl
            else:
                direct_mask = tf.greater(sl_col, sl_row)  # bl,bl
            direct_mask_tile = tf.tile(
                tf.expand_dims(tf.expand_dims(direct_mask, 0), 0), [bs, bn, 1, 1])  # bs,bn,bl,bl
            rep_mask_tile_1 = tf.tile(tf.expand_dims(rep_mask_split, 2), [1, 1, bl, 1])  # bs,bn,bl,bl
            rep_mask_tile_2 = tf.tile(tf.expand_dims(rep_mask_split, 3), [1, 1, 1, bl])  # bs,bn,bl,bl
            rep_mask_tile = tf.logical_and(rep_mask_tile_1, rep_mask_tile_2)
            attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile, name='attn_mask')  # bs,bn,bl,bl

            # attention
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            dependent_head = linear(
                rep_map, 2 * ivec, False, 0., 'linear_dependent_head', False,
                wd, keep_prob, is_train)  # bs,bn,bl,2vec
            dependent, head = tf.split(dependent_head, 2, 3)
            dependent_etd = tf.expand_dims(dependent, 2)  # bs,bn,1,bl,vec
            head_etd = tf.expand_dims(head, 3)  # bs,bn,bl,1,vec
            logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,bn,bl,bl,vec
            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 3)  # bs,bn,bl,bl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)  # bs,bn,bl,bl,vec
            self_attn_result = tf.reduce_sum(attn_score * rep_map_tile, 3)  # bs,bn,bl,vec

        with tf.variable_scope('source2token_self_attn'):
            inter_block_logits = bn_dense_layer(
                self_attn_result, ivec, True, 0., 'bn_dense_map', 'linear', False,
                wd, keep_prob, is_train)  # bs,bn,bl,vec
            inter_block_logits_masked = exp_mask_for_high_rank(
                inter_block_logits, rep_mask_split)  # bs,bn,bl,vec
            inter_block_soft = tf.nn.softmax(inter_block_logits_masked, 2)  # bs,bn,bl,vec
            inter_block_attn_output = tf.reduce_sum(
                self_attn_result * inter_block_soft, 2)  # bs,bn,vec

        with tf.variable_scope('self_attn_inter_block'):
            inter_block_attn_output_mask = tf.cast(tf.ones([bs, bn], tf.int32), tf.bool)
            block_ct_res = directional_attention_with_dense(
                inter_block_attn_output, inter_block_attn_output_mask, direction, 'disa',
                keep_prob, is_train, wd, activation)  # [bs,bn,vec]
            block_ct_res_tile = tf.tile(
                tf.expand_dims(block_ct_res, 2), [1, 1, bl, 1])  # [bs,bn,vec] -> [bs,bn,bl,vec]

        with tf.variable_scope('combination'):
            # input: 1. rep_map[bs,bn,bl,vec]; 2. self_attn_result[bs,bn,bl,vec]; 3. block_ct_res_tile[bs,bn,bl,vec]
            rep_tensor_with_ct = tf.concat(
                [rep_map, self_attn_result, block_ct_res_tile], -1)  # [bs,bn,bl,3vec]
            new_context_and_gate = linear(
                rep_tensor_with_ct, 2 * ivec, True, 0., 'linear_new_context_and_gate',
                False, wd, keep_prob, is_train)  # [bs,bn,bl,2vec]
            new_context, gate = tf.split(new_context_and_gate, 2, 3)  # bs,bn,bl,vec
            if activation == "relu":
                new_context_act = tf.nn.relu(new_context)
            elif activation == "elu":
                new_context_act = tf.nn.elu(new_context)
            elif activation == "linear":
                new_context_act = tf.identity(new_context)
            else:
                raise RuntimeError('unsupported activation: %s' % activation)
            gate_sig = tf.nn.sigmoid(gate)
            combination_res = gate_sig * new_context_act + (1 - gate_sig) * rep_map  # bs,bn,bl,vec

        with tf.variable_scope('restore_original_length'):
            combination_res_reshape = tf.reshape(
                combination_res, [bs, bn * bl, ivec])  # bs,bn*bl,vec
            output = combination_res_reshape[:, :sl, :]
            return output
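# --- Illustration (not part of the original pipeline) -----------------------
# The split_seq arithmetic above in plain Python: pad the sequence up to a
# multiple of block_len, then view it as [bs, block_num, block_len, d].
def _demo_block_split_lengths(sl=11, block_len=4):
    import math
    block_num = int(math.ceil(sl / float(block_len)))  # 3 blocks
    comp_len = block_num * block_len - sl               # 1 padded position
    return block_num, comp_len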
def gated_self_attention(rep_tensor, rep_mask, scope=None, keep_prob=1., is_train=None, wd=0.,
                         activation='elu', hn=None, position_mask_type=None):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec
    with tf.variable_scope(scope or 'gated_self_attention_%s' % (position_mask_type or 'None')):
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)

        # mask generation
        rep_mask_epd1 = tf.expand_dims(rep_mask, 1)  # bs,1,sl
        rep_mask_epd2 = tf.expand_dims(rep_mask, 2)  # bs,sl,1
        rep_mask_mat = tf.logical_and(rep_mask_epd1, rep_mask_epd2)  # bs,sl,sl

        if position_mask_type in ['forward', 'backward']:
            sl_indices = tf.range(sl, dtype=tf.int32)
            sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
            if position_mask_type == 'forward':
                position_mask = tf.greater(sl_row, sl_col)
            else:
                position_mask = tf.greater(sl_col, sl_row)
            position_mask = tf.tile(tf.expand_dims(position_mask, 0), [bs, 1, 1])
            position_mask = tf.logical_and(rep_mask_mat, position_mask)
        else:
            position_mask = rep_mask_mat
        position_mask_ft = tf.cast(position_mask, tf.float32)

        # attention
        with tf.variable_scope('intra_sent_attn'):  # bs,sl,hn
            # rep_tensor_mean = pooling_with_mask(rep_tensor, rep_mask, 'mean')  # bs, hn
            rep_tensor_for_attn = rep_map

            pre_align_score = bn_dense_layer(  # bs,sl,hn
                rep_tensor_for_attn, ivec, True, 0., 'intra_sent_map1',
                activation, False, wd, keep_prob, is_train)
            align_score = bn_dense_layer(  # bs,sl,hn
                pre_align_score, ivec, True, 0., 'intra_sent_map2',
                'linear', False, wd, keep_prob, is_train)
            align_score_w_mask = exp_mask_for_high_rank(align_score, rep_mask)  # bs,sl,hn
            exp_align_score = tf.exp(align_score_w_mask)  # bs,sl,hn

            accum_z_deno = tf.matmul(position_mask_ft, exp_align_score)  # bs,sl,hn
            # guard against positions that attend to nothing
            accum_z_deno = tf.where(
                tf.greater(accum_z_deno, tf.zeros_like(accum_z_deno)),
                accum_z_deno,
                tf.ones_like(accum_z_deno))

            rep_mul_score = rep_map * exp_align_score
            accum_rep_mul_score = tf.matmul(position_mask_ft, rep_mul_score)
            attn_res = accum_rep_mul_score / accum_z_deno

        with tf.variable_scope('context_fusion_gate'):
            fusion_gate = tf.nn.sigmoid(
                bn_dense_layer([rep_map, attn_res], ivec, True, 0., 'linear_fusion_gate',
                               activation, False, wd, keep_prob, is_train))
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_res
        output = mask_for_high_rank(output, rep_mask)
    return output
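# --- Illustration (not part of the original pipeline) -----------------------
# In gated_self_attention the masked softmax is computed implicitly: multiplying
# the float position mask with exp(scores) accumulates, for every query position,
# the exponentiated scores of exactly the positions it is allowed to attend to.
# A single-example NumPy sketch of that normalizer (the real code is batched):
def _demo_implicit_masked_softmax_denominator(sl=3, hn=2):
    import numpy as np
    position_mask = np.tril(np.ones((sl, sl)), k=-1)  # 'forward': strictly earlier positions
    exp_scores = np.exp(np.ones((sl, hn)))            # stand-in for exp(align_score)
    denom = position_mask @ exp_scores                 # [sl, hn] per-query normalizers
    return np.where(denom > 0, denom, 1.0)             # position 0 attends to nothing -> guard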
def masked_positional_self_attention(sigma, rep_tensor, rep_mask, direction=None, scope=None,
                                     keep_prob=1., is_train=None, wd=0., activation='elu',
                                     tensor_dict=None, name=None):
    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'directional_attention_%s' % (direction or 'diag')):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            # band mask: |i - j| < sigma and i != j
            direct_mask0 = tf.greater(sl_row + sigma, sl_col)
            direct_mask1 = tf.greater(sl_col + sigma, sl_row)
            direct_mask2 = tf.cast(1 - tf.diag(tf.ones([sl], tf.int32)), tf.bool)
            direct_mask = tf.logical_and(tf.logical_and(direct_mask0, direct_mask1), direct_mask2)
        else:
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)
            else:
                direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0), [bs, 1, 1])  # bs,sl,sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1), [1, sl, 1])  # bs,sl,sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs,sl,sl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1), [1, sl, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,1
            f_bias = tf.get_variable('f_bias', [1], tf.float32, tf.constant_initializer(0.))
            dependent = linear(rep_map_dp, 1, False, scope='linear_dependent')  # bs,sl,1
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,1
            head = linear(rep_map_dp, 1, False, scope='linear_head')  # bs,sl,1
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,1

            logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,sl,sl,1

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            if direction is not None:
                # distance penalty: -log(|i - j|), with the diagonal guarded against log(0)
                dis_mask = -tf.log(
                    tf.cast(tf.abs(sl_col - sl_row) + tf.diag(tf.ones([sl], tf.int32)), tf.float32))
                logits_masked = dis_mask_for_high_rank(logits_masked, dis_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,1
            attn_score = mask_for_high_rank(attn_score, attn_mask)
            attn_score = tf.tile(
                tf.expand_dims(tf.reshape(attn_score, [bs, sl, sl]), 3), [1, 1, 1, ivec])

            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,sl,vec

        with tf.variable_scope('output'):
            output = attn_result

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score
        return output
def visit_sa_with_dense(rep_tensor, keep_prob=1., is_train=None, wd=0., activation='relu',
                        hn=None, is_scale=True, is_plus_sa=True):
    batch_size, sw_len, vec_size = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec
    with tf.variable_scope('temporal_attention'):
        # mask generation: every position may attend to every other one (diagonal excluded)
        attn_mask = tf.cast(
            tf.diag(-tf.ones([sw_len], tf.int32)) + 1, tf.bool)  # code_len, code_len (broadcast over batch)

        # non-linear for context
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map', activation,
                                 False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1), [1, sw_len, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            dependent = linear(
                rep_map_dp, ivec, False, scope='linear_dependent')  # batch_size, code_len, vec_size
            dependent_etd = tf.expand_dims(dependent, 1)  # batch_size, 1, code_len, vec_size
            head = linear(
                rep_map_dp, ivec, False, scope='linear_head')  # batch_size, code_len, vec_size
            head_etd = tf.expand_dims(head, 2)  # batch_size, code_len, 1, vec_size

            if is_plus_sa:
                attention_fact = dependent_etd + head_etd + f_bias
            else:
                return rep_map

            if is_scale:
                logits = scaled_tanh(attention_fact, 5.0)  # bs,sl,sl,vec
            else:
                logits = linear(tf.nn.tanh(attention_fact), ivec, True, scope='linear_attn_fact')

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32, tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i', False, wd, keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a', False, wd, keep_prob, is_train) +
                o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result
        return output
def build_network(self):
    with tf.name_scope('code_embeddings'):
        if self.model_type == 'raw':
            # init_code_embed = tf.random_uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0)
            # code_embeddings = tf.Variable(init_code_embed)
            init_code_embed = tf.one_hot(self.inputs, self.vocabulary_size,
                                         on_value=1.0, off_value=0.0, axis=-1)
            inputs_embed = bn_dense_layer(init_code_embed, self.embedding_size, True, 0.,
                                          'bn_dense_map_linear', 'linear', False,
                                          wd=0., keep_prob=1., is_train=True)
        elif self.model_type in ('tesa', 'delta', 'sa', 'normal', 'cbow', 'sg'):
            # pre-trained embeddings produced by the corresponding TeSAN variant
            init_code_embed = tesan_trans(self.model_type)
            code_embeddings = tf.Variable(init_code_embed, dtype=tf.float32)
            inputs_embed = tf.nn.embedding_lookup(code_embeddings, self.inputs)
        elif self.model_type == 'mce':
            init_code_embed = mce_trans()
            code_embeddings = tf.Variable(init_code_embed, dtype=tf.float32)
            inputs_embed = tf.nn.embedding_lookup(code_embeddings, self.inputs)
        elif self.model_type == 'glove':
            init_code_embed = glove_trans()
            code_embeddings = tf.Variable(init_code_embed, dtype=tf.float32)
            inputs_embed = tf.nn.embedding_lookup(code_embeddings, self.inputs)
        else:
            init_code_embed = med2vec_trans()
            code_embeddings = tf.Variable(init_code_embed, dtype=tf.float32)
            inputs_embed = tf.nn.embedding_lookup(code_embeddings, self.inputs)

    with tf.name_scope('visit_embedding'):
        # bs, max_visits, max_len_visit, embed_size
        inputs_masked = mask_for_high_rank(inputs_embed, self.inputs_mask)
        inputs_reduced = tf.reduce_mean(inputs_masked, 2)  # batch_size, max_visits, embed_size

    with tf.name_scope('visit_masking'):
        visit_mask = tf.reduce_sum(tf.cast(self.inputs_mask, tf.int32), -1)  # [bs, max_visits]
        visit_mask = tf.cast(visit_mask, tf.bool)
        tensor_len = tf.reduce_sum(tf.cast(visit_mask, tf.int32), -1)  # [bs]

    with tf.name_scope('RNN_computation'):
        reuse = None if not tf.get_variable_scope().reuse else True
        if cfg.cell_type == 'gru':
            cell = tf.contrib.rnn.GRUCell(cfg.hn, reuse=reuse)
        elif cfg.cell_type == 'lstm':
            cell = tf.contrib.rnn.LSTMCell(cfg.hn, reuse=reuse)
        elif cfg.cell_type == 'basic_lstm':
            cell = tf.contrib.rnn.BasicLSTMCell(cfg.hn, reuse=reuse)
        elif cfg.cell_type == 'basic_rnn':
            cell = tf.contrib.rnn.BasicRNNCell(cfg.hn, reuse=reuse)
        outputs, final_state = dynamic_rnn(cell, inputs_reduced, tensor_len, dtype=tf.float32)

    return outputs, final_state, tensor_len