def encode_question(self, question, question_len, answer, config):
    """
    Encode question with answer-aware attention
    :param question: [B, T, dim]
    :param question_len: [B, ]
    :param answer: [B, dim]
    :param config: parameter dict
    :return: [B, hidden_dim]
    """
    # bi-LSTM
    with tf.name_scope("rnn_encoder"):
        rnn_config = dict()
        key_list = ["cell_class", "num_units", "dropout_input_keep_prob",
                    "dropout_output_keep_prob", "num_layers", "reuse"]
        for key in key_list:
            rnn_config[key] = config[key]
        rnn_encoder = BidirectionalRNNEncoder(rnn_config, config["mode"])
        encoder_output = rnn_encoder.encode(question, question_len)

    # attention mechanism
    with tf.name_scope("attention"):
        att_config = dict()
        key_list = ["num_units"]
        for key in key_list:
            att_config[key] = config[key]
        if config["attention"] == "bah":
            att = AttentionLayerBahdanau(att_config)
            question_hidden = att.build(answer,
                                        encoder_output.attention_values,
                                        encoder_output.attention_values_length)
        elif config["attention"] == "avg":
            att = AttentionLayerAvg()
            question_hidden = att.build(encoder_output.attention_values,
                                        encoder_output.attention_values_length)
    return question_hidden
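For quick reference, a minimal usage sketch of encode_question, assuming a hypothetical config dict (its keys follow the key_list read inside the function) and a hypothetical model object that owns this method:

# Hypothetical config; key names mirror the key_list used inside encode_question.
config = {
    "cell_class": "LSTM",
    "num_units": 128,
    "dropout_input_keep_prob": 1.0,
    "dropout_output_keep_prob": 1.0,
    "num_layers": 1,
    "reuse": None,
    "mode": tf.contrib.learn.ModeKeys.TRAIN,
    "attention": "bah",      # or "avg"
}
question = tf.placeholder(tf.float32, [None, 20, 300])   # (B, T, dim)
question_len = tf.placeholder(tf.int32, [None])          # (B, )
answer = tf.placeholder(tf.float32, [None, 300])         # (B, dim)
question_hidden = model.encode_question(question, question_len, answer, config)  # (B, hidden_dim)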
class ItemBiRNNModule(ItemBaseModule):
    # Below: parameters in BidirectionalRNNEncoder
    # key_list = ["cell_class", "num_units", "dropout_input_keep_prob",
    #             "dropout_output_keep_prob", "num_layers", "reuse"]
    # Without any attention mechanism
    def __init__(self, item_max_len, dim_wd_emb, dim_item_hidden, rnn_config):
        super(ItemBiRNNModule, self).__init__(item_max_len=item_max_len,
                                              dim_wd_emb=dim_wd_emb,
                                              dim_item_hidden=dim_item_hidden)
        rnn_config['num_units'] = dim_item_hidden // 2  # bidirectional: fw + bw concat to dim_item_hidden
        self.rnn_encoder = BidirectionalRNNEncoder(rnn_config, mode=tf.contrib.learn.ModeKeys.TRAIN)

    # Input:
    #   item_wd_embedding: (batch, item_max_len, dim_wd_emb)
    #   item_len: (batch, ) as int32
    # Output:
    #   item_wd_hidden: (batch, dim_item_hidden)
    def forward(self, item_wd_embedding, item_len, reuse=None):
        LogInfo.begin_track('ItemBiRNNModule forward: ')
        with tf.variable_scope('ItemBiRNNModule', reuse=reuse):
            # stamps = item_wd_embedding.get_shape().as_list()[1]
            stamps = self.item_max_len
            show_tensor(item_wd_embedding)
            birnn_inputs = tf.unstack(item_wd_embedding, num=stamps, axis=1,
                                      name='birnn_inputs')
            # birnn_inputs: a list of `stamps` tensors, each (batch, dim_wd_emb)
            encoder_output = self.rnn_encoder.encode(inputs=birnn_inputs,
                                                     sequence_length=item_len,
                                                     reuse=reuse)
            birnn_outputs = tf.stack(encoder_output.outputs, axis=1,
                                     name='birnn_outputs')  # (data_size, item_max_len, dim_item_hidden)
            LogInfo.logs('birnn_output = %s', birnn_outputs.get_shape().as_list())
            sum_wd_hidden = tf.reduce_sum(birnn_outputs, axis=1)  # (data_size, dim_item_hidden)
            item_len_mat = tf.cast(tf.expand_dims(item_len, axis=1),
                                   dtype=tf.float32)  # (data_size, 1) as float
            item_wd_hidden = tf.div(sum_wd_hidden,
                                    tf.maximum(item_len_mat, 1),  # avoid dividing by 0
                                    name='item_wd_hidden')  # (data_size, dim_item_hidden)
            LogInfo.logs('item_wd_hidden = %s', item_wd_hidden.get_shape().as_list())
        LogInfo.end_track()
        return item_wd_hidden
def apply_seq_repr(self, input_emb, input_len, mode):
    assert self.repr_mode in ('raw', 'cnn', 'rnn')
    LogInfo.logs('apply_seq_repr: %s', self.repr_mode)
    if self.repr_mode == 'raw':
        return input_emb
    elif self.repr_mode == 'cnn':
        return tf.layers.conv1d(inputs=input_emb,
                                padding='same',
                                activation=tf.nn.relu,
                                reuse=tf.AUTO_REUSE,
                                **self.cnn_config)  # (ds, x_max_len, num_filters == dim_hidden)
    else:   # 'rnn'
        encoder_args = {'config': self.rnn_config, 'mode': mode}
        rnn_encoder = BidirectionalRNNEncoder(**encoder_args)
        return seq_encoding(emb_input=input_emb, len_input=input_len, encoder=rnn_encoder)
def forward(self, v_emb, v_len, tag_indices, mode): """ :param v_emb: (ds, q_max_len, dim_emb) :param v_len: (ds,) as int :param tag_indices: (ds, q_max_len) as int :param mode: TRAIN / INFER """ LogInfo.begin_track('Build kernel: [segment_kernel]') assert mode in (tf.contrib.learn.ModeKeys.INFER, tf.contrib.learn.ModeKeys.TRAIN) encoder_args = {'config': self.rnn_config, 'mode': mode} seg_encoder = BidirectionalRNNEncoder(**encoder_args) with tf.variable_scope('segment_kernel', reuse=tf.AUTO_REUSE): transition = tf.get_variable( name='transition', dtype=tf.float32, shape=[self.num_classes, self.num_classes ]) # (num_classes, num_classes) as transition matrix v_hidden = seq_encoding( emb_input=v_emb, len_input=v_len, encoder=seg_encoder) # (ds, q_max_len, dim_seg_hidden) v_hidden_flat = tf.reshape( v_hidden, [-1, self.dim_seg_hidden]) # (ds * q_max_len, dim_seg_hidden) seg_logits = tf.reshape( tf.contrib.layers.fully_connected(inputs=v_hidden_flat, num_outputs=self.num_classes, activation_fn=None, scope='fc'), shape=[-1, self.q_max_len, self.num_classes], name='seg_logits') # (ds, q_max_len, num_classes) log_lik, _ = tf.contrib.crf.crf_log_likelihood( inputs=seg_logits, tag_indices=tag_indices, sequence_lengths=v_len, transition_params=transition) best_seg, viterbi_score = tf.contrib.crf.crf_decode( potentials=seg_logits, transition_params=transition, sequence_length=v_len) # output_seq: (ds, q_max_len) as int LogInfo.end_track() return v_hidden, seg_logits, log_lik, best_seg
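The kernel returns the CRF log-likelihood together with the Viterbi decode. A minimal consumption sketch, assuming a hypothetical segment_kernel handle that owns the forward above; the negative mean log-likelihood serves as the training loss, while best_seg is used at inference time:

v_hidden, seg_logits, log_lik, best_seg = segment_kernel.forward(
    v_emb, v_len, tag_indices, mode=tf.contrib.learn.ModeKeys.TRAIN)
seg_loss = tf.reduce_mean(-log_lik)        # training objective for the segmentation task
# best_seg: (ds, q_max_len) int32, the Viterbi-decoded tag sequence used for prediction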
def get_seq_hidden(self, seq_emb, seq_len, mode):
    encoder_args = {'config': self.rnn_config, 'mode': mode}
    rnn_encoder = BidirectionalRNNEncoder(**encoder_args)
    if self.seq_merge_mode == 'fwbw':
        return seq_encoding(emb_input=seq_emb, len_input=seq_len,
                            encoder=rnn_encoder, fwbw=True)
    else:
        seq_hidden = seq_encoding(emb_input=seq_emb, len_input=seq_len,
                                  encoder=rnn_encoder)
        if self.seq_merge_mode == 'max':
            return seq_hidden_max_pooling(seq_hidden_input=seq_hidden, len_input=seq_len)
        else:   # 'avg'
            return seq_hidden_averaging(seq_hidden_input=seq_hidden, len_input=seq_len)
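The 'max' and 'avg' branches delegate to seq_hidden_max_pooling / seq_hidden_averaging. As a rough sketch of what such length-masked pooling helpers typically do (the names and bodies below are assumptions; only their call signatures appear in the snippets above):

def masked_max_pooling(seq_hidden_input, len_input):
    # seq_hidden_input: (ds, max_len, dim_hidden); len_input: (ds, )
    max_len = tf.shape(seq_hidden_input)[1]
    mask = tf.expand_dims(tf.sequence_mask(len_input, maxlen=max_len, dtype=tf.float32), -1)
    neg_inf = (1. - mask) * tf.float32.min          # padded steps never win the max
    return tf.reduce_max(seq_hidden_input * mask + neg_inf, axis=1)   # (ds, dim_hidden)

def masked_averaging(seq_hidden_input, len_input):
    max_len = tf.shape(seq_hidden_input)[1]
    mask = tf.expand_dims(tf.sequence_mask(len_input, maxlen=max_len, dtype=tf.float32), -1)
    summed = tf.reduce_sum(seq_hidden_input * mask, axis=1)           # (ds, dim_hidden)
    denom = tf.maximum(tf.cast(tf.expand_dims(len_input, -1), tf.float32), 1.)
    return summed / denom                                             # avoid dividing by 0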
class QBiRNNModule(QBaseModule):
    # Below: parameters in BidirectionalRNNEncoder
    # key_list = ["cell_class", "num_units", "dropout_input_keep_prob",
    #             "dropout_output_keep_prob", "num_layers", "reuse"]
    # Without any attention mechanism
    def __init__(self, dim_q_hidden, rnn_config, q_max_len=None, dim_wd_emb=None):
        super(QBiRNNModule, self).__init__(q_max_len=q_max_len,
                                           dim_wd_emb=dim_wd_emb,
                                           dim_q_hidden=dim_q_hidden)
        rnn_config['num_units'] = dim_q_hidden // 2  # bidirectional: fw + bw concat to dim_q_hidden
        self.rnn_encoder = BidirectionalRNNEncoder(rnn_config, mode=tf.contrib.learn.ModeKeys.TRAIN)

    def forward(self, q_embedding, q_len, reuse=None):
        LogInfo.begin_track('QBiRNNModule forward: ')
        with tf.variable_scope('QBiRNNModule', reuse=reuse):
            # stamps = q_embedding.get_shape().as_list()[1]
            stamps = self.q_max_len
            birnn_inputs = tf.unstack(q_embedding, num=stamps, axis=1,
                                      name='birnn_inputs')
            # birnn_inputs: a list of `stamps` tensors, each (batch, dim_wd_emb)
            encoder_output = self.rnn_encoder.encode(inputs=birnn_inputs,
                                                     sequence_length=q_len,
                                                     reuse=reuse)
            q_hidden = tf.stack(encoder_output.outputs, axis=1,
                                name='q_hidden')  # (batch, q_max_len, dim_q_hidden)
        LogInfo.end_track()
        return q_hidden
def _build_graph(self): self.context_idx = tf.placeholder( dtype=tf.int32, shape=[None, self.config.get("max_seq_len")]) self.context_seq = tf.placeholder(dtype=tf.int32, shape=[ None, ]) self.pinlei_idx = tf.placeholder(dtype=tf.int32, shape=[ None, ]) with tf.device('/cpu:0'), tf.name_scope("embedding_layer"): # LogInfo.logs("Embedding shape: %s (%d*%d).", self.embedding.shape, # self.config.get("vocab_size"), self.config.get("embedding_dim")) term_embedding = tf.get_variable( name="embedding", shape=[ self.config.get("vocab_size"), self.config.get("embedding_dim") ], dtype=tf.float32, initializer=tf.constant_initializer(self.embedding)) self.context_embedding = tf.nn.embedding_lookup( term_embedding, self.context_idx) self.pinlei_embedding = tf.nn.embedding_lookup( term_embedding, self.pinlei_idx) # shape = [max_seq_len, batch_size, embedding_dim], feed to rnn_encoder self.context_slice = [ tf.squeeze(_input, [1]) for _input in tf.split(self.context_embedding, self.config.get("max_seq_len"), axis=1) ] # bi-LSTM with tf.name_scope("rnn_encoder"): rnn_config = dict() key_list = [ "cell_class", "num_units", "dropout_input_keep_prob", "dropout_output_keep_prob", "num_layers", "reuse" ] for key in key_list: rnn_config[key] = self.config.get(key) rnn_encoder = BidirectionalRNNEncoder(rnn_config, self.mode) self.encoder_output = rnn_encoder.encode(self.context_slice, self.context_seq) # attention mechanism with tf.name_scope("attention"): att_config = dict() key_list = ["num_units"] for key in key_list: att_config[key] = self.config.get(key) if self.config.get("attention") == "bah": att = AttentionLayerBahdanau_old(att_config) self.query_hidden = att.build( self.pinlei_embedding, self.encoder_output.attention_values, self.encoder_output.attention_values_length) elif self.config.get("attention") == "avg": att = AttentionLayerAvg_old() self.query_hidden = att.build( self.encoder_output.attention_values, self.encoder_output.attention_values_length) self.hidden_dim = self.query_hidden.get_shape().as_list()[-1] # training parameters with tf.name_scope("parameters"): self.W_p = tf.get_variable( name="W_p", shape=[self.config.get("embedding_dim"), self.hidden_dim], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer(uniform=True)) self.b_p = tf.get_variable( name="b_p", shape=[self.hidden_dim], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) self.W_f = tf.get_variable( name="W_f", shape=[self.hidden_dim * 2, self.hidden_dim], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer(uniform=True)) self.b_f = tf.get_variable( name="b_f", shape=[self.hidden_dim], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) self.W_o = tf.get_variable( name="W_o", shape=[self.hidden_dim, 1], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer(uniform=True)) self.b_o = tf.get_variable( name="b_o", shape=[1], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) # above bi-LSTM + attention with tf.name_scope("score"): self.pinlei_hidden = self.transfer( tf.add(tf.matmul(self.pinlei_embedding, self.W_p), self.b_p)) self.final = self.transfer( tf.add( tf.matmul( tf.concat([self.query_hidden, self.pinlei_hidden], 1), self.W_f), self.b_f)) # self.score = tf.add(tf.matmul(self.final, self.W_o), self.b_o) # tensorflow 1.0.0 self.score = tf.nn.xw_plus_b(self.final, self.W_o, self.b_o) # hinge loss if self.mode == tf.contrib.learn.ModeKeys.TRAIN: self.loss = hinge_loss( self.score, int(self.config.get("batch_size") / self.config.get("PN")), 
self.config.get("PN"), self.config.get("margin")) self.train_op = get_optimizer(self.config.get("optimizer"), self.config.get("lr")).minimize( self.loss)
def forward(self, el_size, qw_emb, qw_len, pw_sup_emb, pw_sup_len, type_trans, el_sup_mask, el_type_signa, el_indv_feats, el_comb_feats, mode): """ Note: number of paths in a schema == number of entities in the schema local_mem_size: the local number of relevant paths in the current batch. :param el_size: (ds, ) :param qw_emb: (ds, el_max_size, qw_max_len, dim_emb) :param qw_len: (ds, el_max_size) :param pw_sup_emb: (local_mem_size, pw_max_len, dim_emb) :param pw_sup_len: (local_mem_size,) :param type_trans: (local_mem_size, dim_type) :param el_sup_mask: (ds, el_max_size, local_mem_size) :param el_type_signa: (ds, el_max_size, dim_type) :param el_indv_feats: (ds, el_max_size, el_feat_size) :param el_comb_feats: (ds, 1) :param mode: TRAIN / INFER """ """ 180416: Let's assume ds=16*2=32, el_max_size=3, qw_max_len=20, dim_emb=300, local_mem_size=6K Then ds*el_max_size*qw_max_len ~= 2K """ LogInfo.begin_track('Build kernel: [el_kernel]') assert mode in (tf.contrib.learn.ModeKeys.INFER, tf.contrib.learn.ModeKeys.TRAIN) rnn_encoder = None if self.rnn_config is not None: encoder_args = {'config': self.rnn_config, 'mode': mode} rnn_encoder = BidirectionalRNNEncoder(**encoder_args) raw_shape = tf.shape(el_sup_mask) el_max_size = raw_shape[1] local_mem_size = raw_shape[2] dim_type = tf.shape(type_trans)[1] """ Possible reshapes """ qw_emb = tf.reshape(qw_emb, [-1, self.qw_max_len, self.dim_emb]) # (ds * el_max_size, qw_max_len, dim_emb) qw_len = tf.reshape(qw_len, [-1]) # (ds * el_max_size) """ Calculate attention / non-attention question representation """ pw_sup_repr = seq_encoding_with_aggregation(emb_input=pw_sup_emb, len_input=pw_sup_len, rnn_encoder=rnn_encoder, seq_merge_mode=self.seq_merge_mode) # (local_mem_size, dim_hidden) if self.att_config is not None: att_func = self.att_config['att_func'] assert att_func == 'dot' # TODO: Currently only support dot product qw_hidden = seq_encoding(emb_input=qw_emb, len_input=qw_len, encoder=rnn_encoder) # (ds*el_max_size, qw_max_len, dim_hidden) qw_mask = tf.sequence_mask(lengths=qw_len, maxlen=self.qw_max_len, dtype=tf.float32, name='qw_mask') # (ds*el_max_size, qw_max_len) flat_qw_hidden = tf.reshape(qw_hidden, shape=[-1, self.dim_hidden], name='flat_qw_hidden') # (ds*el_max_size*qw_max_len, dim_hidden) """ Step 1: Very simple & fast way to calculate dot attention """ raw_mutual_att_mat = tf.matmul( flat_qw_hidden, tf.transpose(pw_sup_repr), name='raw_mutual_att_mat' ) # (ds*el_max_size*qw_max_len, local_mem_size) mutual_att_mat = tf.reshape( raw_mutual_att_mat, shape=[-1, self.qw_max_len, local_mem_size], name='mutual_att_mat') # (ds*el_max_size, qw_max_len, local_mem_size) """ Step 2: Prepare masked att_mat and normalized distribution """ qw_mask_3dim = tf.expand_dims(qw_mask, axis=-1, name='qw_mask_3dim') # (ds*el_max_size, qw_max_len, 1) masked_att_mat = ( qw_mask_3dim * mutual_att_mat + (1. 
- qw_mask_3dim) * mutual_att_mat * tf.float32.min ) # (ds*el_max_size, qw_max_len, local_mem_size) unnorm_weight = tf.transpose(masked_att_mat, [0, 2, 1], name='masked_att_mat') # (ds*el_max_size, local_mem_size, qw_max_len) norm_weight = tf.nn.softmax(unnorm_weight, name='norm_weight') """ Step 3: Got final qw_repr w.r.t different support paths """ qw_repr = tf.matmul(norm_weight, qw_hidden, name='qw_repr') # batch_matmul: (ds*el_max_size, local_mem_size, qw_max_len) else: # noAtt, very simple raw_qw_repr = seq_encoding_with_aggregation(emb_input=qw_emb, len_input=qw_len, rnn_encoder=rnn_encoder, seq_merge_mode=self.seq_merge_mode) # (ds*el_max_size, dim_hidden) qw_repr = tf.expand_dims(raw_qw_repr, axis=1, name='qw_repr') # (ds*el_max_size, 1, dim_hidden) with tf.variable_scope('el_kernel', reuse=tf.AUTO_REUSE): """ Calculate cosine similarity """ flat_pw_sup_repr = tf.expand_dims(pw_sup_repr, axis=0, name='flat_pw_sup_repr') # (1, local_mem_size, dim_hidden) sim_score = cosine_sim( lf_input=qw_repr, # (ds*el_max_size, [1 or local_mem_size], qw_max_len) rt_input=flat_pw_sup_repr # (1, local_mem_size, dim_hidden) ) # (ds*el_max_size, local_mem_size) """ Turning into type distribution """ flat_el_sup_mask = tf.reshape(el_sup_mask, shape=[-1, local_mem_size], name='flat_el_sup_mask') # (ds*el_max_size, local_mem_size) mask_score = flat_el_sup_mask * sim_score + (1. - flat_el_sup_mask) * tf.float32.min pred_prob = tf.nn.softmax(logits=mask_score, name='pred_prob') # (ds*el_max_size, local_mem_size) raw_type_prob = tf.matmul(pred_prob, type_trans, name='raw_type_prob') # (ds*el_max_size, dim_type) type_prob = tf.reshape(raw_type_prob, shape=[-1, el_max_size, dim_type], name='type_prob') # (ds, el_max_size, dim_type) type_match_score = tf.reduce_sum(el_type_signa*type_prob, axis=-1, keep_dims=True, name='type_match_score') # (ds, el_max_size, 1) """ Feature concat and produce scores """ el_indv_concat = tf.concat([type_match_score, el_indv_feats], axis=-1, name='el_indv_concat') # (ds, el_max_size, 1+el_feat_size) el_mask = tf.sequence_mask(lengths=el_size, maxlen=el_max_size, dtype=tf.float32, name='el_mask') # (ds, el_max_size) sum_indv_feats = tf.reduce_sum( el_indv_concat * tf.expand_dims(el_mask, axis=-1), axis=1, name='sum_indv_feats' ) # (ds, 1+el_feat_size) final_feats = tf.concat([sum_indv_feats, el_comb_feats], axis=-1, name='final_feats') # (ds, 1+el_max_size+1) --> type_match + indv_feats + comb_feat el_score = tf.contrib.layers.fully_connected( inputs=final_feats, num_outputs=1, activation_fn=None, scope='out_fc', reuse=tf.AUTO_REUSE ) # (ds, 1), representing type matching score LogInfo.end_track() return el_score, final_feats
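Both the mutual attention matrix and the support-path distribution above rely on the same masking idiom: padded positions are pushed towards tf.float32.min before the softmax so that they receive essentially zero probability. A self-contained sketch of that idiom:

def masked_softmax(scores, mask):
    # scores, mask: same shape; mask is 1.0 at valid positions, 0.0 at padding
    masked = mask * scores + (1. - mask) * tf.float32.min
    return tf.nn.softmax(masked)   # padded positions get ~0 probability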
from kangqi.util.LogUtil import LogInfo

max_len = 10
dim_emb = 30
n_words = 500
dim_hidden = 16

v_input = tf.placeholder(tf.int32, shape=[None, max_len])
v_len = tf.placeholder(tf.int32, shape=[None])

rnn_config = {'cell_class': 'GRU', 'num_units': dim_hidden, 'reuse': tf.AUTO_REUSE}
encoder_args = {'config': rnn_config, 'mode': tf.contrib.learn.ModeKeys.INFER}
rnn_encoder = BidirectionalRNNEncoder(**encoder_args)

with tf.variable_scope('embedding_lookup', reuse=tf.AUTO_REUSE):
    with tf.device('/cpu:0'):
        w_embedding_init = tf.placeholder(dtype=tf.float32, shape=(n_words, dim_emb),
                                          name='w_embedding_init')
        w_embedding = tf.get_variable(name='w_embedding', initializer=w_embedding_init)
        v_emb = tf.nn.embedding_lookup(params=w_embedding, ids=v_input)

v_hidden = seq_encoding(emb_input=v_emb, len_input=v_len,
                        encoder=rnn_encoder, reuse=tf.AUTO_REUSE)
LogInfo.logs('v_hidden: %s', v_hidden.get_shape().as_list())
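A minimal sketch of driving this snippet in a session, assuming the placeholders defined above and a randomly initialized embedding table:

import numpy as np

with tf.Session() as sess:
    emb_init = np.random.uniform(-0.1, 0.1, size=(n_words, dim_emb)).astype(np.float32)
    # the embedding variable is initialized from the w_embedding_init placeholder
    sess.run(tf.global_variables_initializer(), feed_dict={w_embedding_init: emb_init})
    batch_input = np.random.randint(0, n_words, size=(4, max_len)).astype(np.int32)
    batch_len = np.full((4,), max_len, dtype=np.int32)
    hidden = sess.run(v_hidden, feed_dict={v_input: batch_input, v_len: batch_len})
    LogInfo.logs('hidden shape: %s', hidden.shape)   # (4, max_len, BiRNN hidden size)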
def forward(self, el_size, qw_emb, qw_len, pw_sup_emb, pw_sup_len, sup_size, type_trans, el_type_signa, el_indv_feats, el_comb_feats, mode): """ Note: number of paths in a schema == number of entities in the schema :param el_size: (ds, ) :param qw_emb: (ds, path_max_size, qw_max_len, dim_emb) :param qw_len: (ds, path_max_size) :param pw_sup_emb: (ds, path_max_size, sup_max_size, pw_max_len, dim_emb) :param pw_sup_len: (ds, path_max_size, sup_max_size) :param sup_size: (ds, path_max_size) :param type_trans: (ds, path_max_size, sup_max_size, dim_type) :param el_type_signa: (ds, el_max_size, dim_type) :param el_indv_feats: (ds, el_max_size, el_feat_size) :param el_comb_feats: (ds, 1) :param mode: TRAIN / INFER """ LogInfo.begin_track('Build kernel: [el_kernel]') assert mode in (tf.contrib.learn.ModeKeys.INFER, tf.contrib.learn.ModeKeys.TRAIN) rnn_encoder = None if self.rnn_config is not None: encoder_args = {'config': self.rnn_config, 'mode': mode} rnn_encoder = BidirectionalRNNEncoder(**encoder_args) raw_shape = tf.shape(pw_sup_len) dyn_el_max_size = raw_shape[1] dyn_sup_max_size = raw_shape[2] """ Possible reshapes """ qw_emb = tf.reshape(qw_emb, [-1, self.qw_max_len, self.dim_emb]) # (ds * el_max_size, qw_max_len, dim_emb) qw_len = tf.reshape(qw_len, [-1]) # (ds * el_max_size) pw_sup_emb = tf.reshape(pw_sup_emb, [-1, self.pw_max_len, self.dim_emb]) # (ds * el_max_size * sup_max_size, pw_max_len, dim_emb) pw_sup_len = tf.reshape(pw_sup_len, [-1]) """ Calculate attention / non-attention question representation """ pw_sup_repr = seq_encoding_with_aggregation( emb_input=pw_sup_emb, len_input=pw_sup_len, rnn_encoder=rnn_encoder, seq_merge_mode=self.seq_merge_mode) # (ds*el_max_size*sup_max_size, dim_hidden) if self.att_config is not None: dim_att_len = self.att_config['dim_att_hidden'] att_func = self.att_config['att_func'] qw_hidden = seq_encoding(emb_input=qw_emb, len_input=qw_len, encoder=rnn_encoder) # (ds * el_max_size, qw_max_len, dim_hidden) qw_mask = tf.sequence_mask(lengths=qw_len, maxlen=self.qw_max_len, dtype=tf.float32, name='qw_mask') # (DS, qw_max_len) tile_qw_hidden = tf.tile( tf.expand_dims( qw_hidden, axis=1), # (ds*el_max_size, 1, qw_max_len, dim_hidden) multiples=[1, dyn_sup_max_size, 1, 1], name='tile_qw_hidden' ) # (ds*el_max_size, sup_max_size, qw_max_len, dim_hidden) tile_qw_mask = tf.tile( tf.expand_dims(qw_mask, axis=1), multiples=[1, dyn_sup_max_size, 1], name='tile_qw_mask' ) # (ds*el_max_size, sup_max_size, qw_max_len) expand_qw_mask = tf.reshape(tile_qw_mask, [-1, self.qw_max_len]) expand_qw_hidden = tf.reshape( tile_qw_hidden, [-1, self.qw_max_len, self.dim_hidden]) # (ds*el_max_size*sup_max_size, qw_max_len, dim_hidden) simple_att = SimpleAttention(lf_max_len=self.qw_max_len, dim_att_hidden=dim_att_len, att_func=att_func) qw_att_repr, _, _ = simple_att.forward(lf_input=expand_qw_hidden, lf_mask=expand_qw_mask, fix_rt_input=pw_sup_repr) # (ds*el_max_size*sup_max_size, dim_hidden) final_qw_repr = qw_att_repr else: qw_repr = seq_encoding_with_aggregation( emb_input=qw_emb, len_input=qw_len, rnn_encoder=rnn_encoder, seq_merge_mode=self.seq_merge_mode) # (ds*el_max_size, dim_hidden) tile_qw_repr = tf.tile( tf.expand_dims(qw_repr, axis=1), multiples=[1, dyn_sup_max_size, 1], name='tile_qw_repr' ) # (ds*el_max_size, sup_max_size, dim_hidden) expand_qw_repr = tf.reshape(tile_qw_repr, [-1, self.dim_hidden]) final_qw_repr = expand_qw_repr with tf.variable_scope('el_kernel', reuse=tf.AUTO_REUSE): """ Calculate cosine similarity, and turning into type distribution """ 
sim_score = cosine_sim( lf_input=final_qw_repr, rt_input=pw_sup_repr) # (ds*el_max_size, sup_max_size) sim_score = tf.reshape( sim_score, shape=raw_shape, name='sim_score') # (ds, el_max_size, sup_max_size) sup_mask = tf.sequence_mask( lengths=sup_size, maxlen=dyn_sup_max_size, dtype=tf.float32, name='sup_mask') # (ds, el_max_size, sup_max_size) mask_score = sup_mask * sim_score + (1. - sup_mask) * tf.float32.min pred_prob = tf.nn.softmax( logits=mask_score, name='pred_prob') # (ds, el_max_size, sup_max_size) type_prob = tf.matmul( a=tf.expand_dims(pred_prob, axis=2), # (ds, el_max_size, 1, sup_max_size) b=type_trans # (ds, el_max_size, sup_max_size, dim_type) ) # (ds, el_max_size, 1, dim_type) type_prob = tf.squeeze( input=type_prob, axis=2, name='type_prob') # (ds, el_max_size, dim_type) type_match_score = tf.reduce_sum( el_type_signa * type_prob, axis=-1, keep_dims=True, name='type_match_score') # (ds, el_max_size, 1) """ Feature concat and produce scores """ el_indv_concat = tf.concat( [type_match_score, el_indv_feats], axis=-1, name='el_indv_concat') # (ds, el_max_size, 1+el_feat_size) el_mask = tf.sequence_mask(lengths=el_size, maxlen=dyn_el_max_size, dtype=tf.float32, name='el_mask') # (ds, el_max_size) sum_indv_feats = tf.reduce_sum( el_indv_concat * tf.expand_dims(el_mask, axis=-1), axis=1, name='sum_indv_feats') # (ds, 1+el_feat_size) final_feats = tf.concat([sum_indv_feats, el_comb_feats], axis=-1, name='final_feats') # (ds, 1+el_max_size+1) --> type_match + indv_feats + comb_feat el_score = tf.contrib.layers.fully_connected( inputs=final_feats, num_outputs=1, activation_fn=None, scope='out_fc', reuse=tf.AUTO_REUSE ) # (ds, 1), representing type matching score LogInfo.end_track() return el_score, final_feats
def __init__(self, sess, n_words, n_preds, dim_emb, q_max_len, path_max_len, pword_max_len, dim_hidden, rnn_cell, merge_config, reuse=tf.AUTO_REUSE, verbose=0): LogInfo.begin_track('SimpqEvalModel Building ...') super(SimpqEvalModel, self).__init__(sess=sess, verbose=verbose) # ======== declare sub-modules (the same as optm part) ======== # num_units = dim_hidden / 2 # bidirectional rnn_config = {'num_units': num_units, 'cell_class': rnn_cell} encoder_args = { 'config': rnn_config, 'mode': tf.contrib.learn.ModeKeys.TRAIN } q_encoder = BidirectionalRNNEncoder(**encoder_args) pred_encoder = BidirectionalRNNEncoder(**encoder_args) pword_encoder = BidirectionalRNNEncoder(**encoder_args) merge_func = get_merge_function(merge_config=merge_config, dim_hidden=dim_hidden, reuse=reuse) LogInfo.logs('Sub-modules declared.') # ======== define tensors ======== # q_words_input = tf.placeholder( dtype=tf.int32, shape=[None, q_max_len], name='q_words_input') # (data_size, q_max_len) q_words_len_input = tf.placeholder( dtype=tf.int32, shape=[None], name='q_words_len_input') # (data_size, ) preds_input = tf.placeholder( dtype=tf.int32, shape=[None, path_max_len], name='preds_input') # (data_size, path_max_len) preds_len_input = tf.placeholder( dtype=tf.int32, shape=[None], name='preds_len_input') # (data_size, ) pwords_input = tf.placeholder( dtype=tf.int32, shape=[None, pword_max_len], name='pwords_input') # (data_size, path_max_len) pwords_len_input = tf.placeholder( dtype=tf.int32, shape=[None], name='pwords_len_input') # (data_size, ) self.eval_input_tf_list = [ q_words_input, q_words_len_input, preds_input, preds_len_input, pwords_input, pwords_len_input ] LogInfo.begin_track('Showing %d input tensors:', len(self.eval_input_tf_list)) for tensor in self.eval_input_tf_list: show_tensor(tensor) LogInfo.end_track() # ======== start building model ======== # with tf.variable_scope('Embedding_Lookup', reuse=reuse): with tf.device("/cpu:0"): self.w_embedding_init = tf.placeholder(dtype=tf.float32, shape=(n_words, dim_emb), name='w_embedding_init') self.p_embedding_init = tf.placeholder(dtype=tf.float32, shape=(n_preds, dim_emb), name='p_embedding_init') w_embedding = tf.get_variable( name='w_embedding', initializer=self.w_embedding_init) p_embedding = tf.get_variable( name='p_embedding', initializer=self.p_embedding_init) q_words_embedding = tf.nn.embedding_lookup( params=w_embedding, ids=q_words_input, name='q_embedding') # (batch, q_max_len, dim_emb) preds_embedding = tf.nn.embedding_lookup( params=p_embedding, ids=preds_input, name='preds_embedding') # (batch, path_max_len, dim_emb) pwords_embedding = tf.nn.embedding_lookup( params=w_embedding, ids=pwords_input, name='pwords_embedding') # (batch, pword_max_len, dim_emb) with tf.variable_scope('Question', reuse=reuse): q_words_hidden = seq_encoding( emb_input=q_words_embedding, len_input=q_words_len_input, encoder=q_encoder, reuse=reuse) # (data_size, q_max_len, dim_emb) q_hidden = tf.reduce_max( q_words_hidden, axis=1, name='q_hidden') # (data_size, dim_hidden) with tf.variable_scope('Schema', reuse=reuse): with tf.variable_scope('Path', reuse=reuse): preds_hidden = seq_encoding( emb_input=preds_embedding, len_input=preds_len_input, encoder=pred_encoder, reuse=reuse) # (data_size, path_max_len, dim_emb) with tf.variable_scope('Pword', reuse=reuse): pwords_hidden = seq_encoding( emb_input=pwords_embedding, len_input=pwords_len_input, encoder=pword_encoder, reuse=reuse) # (data_size, pword_max_len, dim_emb) schema_hidden = 
schema_encoding(preds_hidden=preds_hidden, preds_len=preds_len_input, pwords_hidden=pwords_hidden, pwords_len=pwords_len_input) with tf.variable_scope('Merge', reuse=reuse): # self.score = cosine_sim(lf_input=q_hidden, rt_input=schema_hidden) # (data_size, ) self.score = merge_func(q_hidden, schema_hidden) # (data_size, ) # Now final score defined. self.eval_summary = tf.summary.merge_all(key='eval') LogInfo.logs('* final score defined.') LogInfo.end_track()
def __init__(self, sess, n_words, n_preds, dim_emb, q_max_len, path_max_len, pword_max_len, dim_hidden, rnn_cell, merge_config, margin, learning_rate, optm_name, reuse=tf.AUTO_REUSE, verbose=0): LogInfo.begin_track('SimpqOptmModel Building ...') super(SimpqOptmModel, self).__init__(sess=sess, ob_batch_num=100, verbose=verbose) assert optm_name in ('Adam', 'Adadelta', 'Adagrad', 'GradientDescent') optm_name += 'Optimizer' # ======== declare sub-modules ======== # num_units = dim_hidden / 2 # bidirectional rnn_config = {'num_units': num_units, 'cell_class': rnn_cell} encoder_args = {'config': rnn_config, 'mode': tf.contrib.learn.ModeKeys.TRAIN} q_encoder = BidirectionalRNNEncoder(**encoder_args) pred_encoder = BidirectionalRNNEncoder(**encoder_args) pword_encoder = BidirectionalRNNEncoder(**encoder_args) merge_func = get_merge_function(merge_config=merge_config, dim_hidden=dim_hidden, reuse=reuse) LogInfo.logs('Sub-modules declared.') # ======== define tensors ======== # q_words_input = tf.placeholder(dtype=tf.int32, shape=[None, q_max_len], name='q_words_input') # (data_size, q_max_len) q_words_len_input = tf.placeholder(dtype=tf.int32, shape=[None], name='q_words_len_input') # (data_size, ) self.optm_input_tf_list = [q_words_input, q_words_len_input] sc_tensor_groups = [] # [ pos_tensors, neg_tensors ] for cate in ('pos', 'neg'): preds_input = tf.placeholder(dtype=tf.int32, shape=[None, path_max_len], name=cate+'_preds_input') # (data_size, path_max_len) preds_len_input = tf.placeholder(dtype=tf.int32, shape=[None], name=cate+'_preds_len_input') # (data_size, ) pwords_input = tf.placeholder(dtype=tf.int32, shape=[None, pword_max_len], name=cate+'_pwords_input') # (data_size, pword_max_len) pwords_len_input = tf.placeholder(dtype=tf.int32, shape=[None], name=cate+'_pwords_len_input') # (data_size, ) tensor_group = [preds_input, preds_len_input, pwords_input, pwords_len_input] sc_tensor_groups.append(tensor_group) self.optm_input_tf_list += tensor_group LogInfo.begin_track('Showing %d input tensors:', len(self.optm_input_tf_list)) for tensor in self.optm_input_tf_list: show_tensor(tensor) LogInfo.end_track() # ======== start building model ======== # with tf.variable_scope('Embedding_Lookup', reuse=reuse): with tf.device('/cpu:0'): self.w_embedding_init = tf.placeholder(dtype=tf.float32, shape=(n_words, dim_emb), name='w_embedding_init') self.p_embedding_init = tf.placeholder(dtype=tf.float32, shape=(n_preds, dim_emb), name='p_embedding_init') w_embedding = tf.get_variable(name='w_embedding', initializer=self.w_embedding_init) p_embedding = tf.get_variable(name='p_embedding', initializer=self.p_embedding_init) q_words_embedding = tf.nn.embedding_lookup(params=w_embedding, ids=q_words_input, name='q_embedding') # (batch, q_max_len, dim_emb) with tf.variable_scope('Question', reuse=reuse): q_words_hidden = seq_encoding( emb_input=q_words_embedding, len_input=q_words_len_input, encoder=q_encoder, reuse=reuse) # (data_size, q_max_len, dim_emb) # q_hidden = tf.reduce_max(q_words_hidden, # axis=1, name='q_hidden') # (data_size, dim_hidden) q_hidden = seq_hidden_max_pooling(seq_hidden_input=q_words_hidden, len_input=q_words_len_input) # TODO: Currently we just follow yu2017. 
logits_list = [] # store two tensors: positive and negative score for cate, sc_tensor_group in zip(('pos', 'neg'), sc_tensor_groups): LogInfo.logs('Calculate score at %s side ...', cate) preds_input, preds_len_input, pwords_input, pwords_len_input = sc_tensor_group with tf.variable_scope('Embedding_Lookup', reuse=reuse): with tf.device("/cpu:0"): preds_embedding = tf.nn.embedding_lookup( params=p_embedding, ids=preds_input, name='preds_embedding' ) # (batch, path_max_len, dim_emb) pwords_embedding = tf.nn.embedding_lookup( params=w_embedding, ids=pwords_input, name='pwords_embedding' ) # (batch, pword_max_len, dim_emb) with tf.variable_scope('Schema', reuse=reuse): with tf.variable_scope('Path', reuse=reuse): preds_hidden = seq_encoding( emb_input=preds_embedding, len_input=preds_len_input, encoder=pred_encoder, reuse=reuse) # (data_size, path_max_len, dim_hidden) with tf.variable_scope('Pword', reuse=reuse): pwords_hidden = seq_encoding( emb_input=pwords_embedding, len_input=pwords_len_input, encoder=pword_encoder, reuse=reuse) # (data_size, pword_max_len, dim_hidden) schema_hidden = schema_encoding( preds_hidden=preds_hidden, preds_len=preds_len_input, pwords_hidden=pwords_hidden, pwords_len=pwords_len_input) with tf.variable_scope('Merge', reuse=reuse): # logits = cosine_sim(lf_input=q_hidden, rt_input=schema_hidden) # (data_size, ) logits = merge_func(q_hidden, schema_hidden) # (data_size, ) logits_list.append(logits) # ======== define loss and updates ======== # pos_logits, neg_logits = logits_list margin_loss = tf.nn.relu(neg_logits + margin - pos_logits, name='margin_loss') self.avg_loss = tf.reduce_mean(margin_loss, name='avg_loss') tf.summary.scalar('avg_loss', self.avg_loss, collections=['optm']) optimizer = getattr(tf.train, optm_name) self.optm_step = optimizer(learning_rate).minimize(self.avg_loss) self.optm_summary = tf.summary.merge_all(key='optm') LogInfo.logs('* avg_loss and optm_step defined.') LogInfo.end_track()
def forward(self, qw_emb, qw_len, sc_len, p_emb, pw_emb, p_len, pw_len, mode): """ :param qw_emb: (ds, qw_max_len, dim_qw_emb) :param qw_len: (ds, ) :param sc_len: (ds, ) :param p_emb: (ds, sc_max_len, p_max_len, dim_p_emb) :param pw_emb: (ds, sc_max_len, pw_max_len, dim_pw_emb) :param p_len: (ds, sc_max_len) :param pw_len: (ds, sc_max_len) :param mode: tf.contrib.learn.ModeKeys. TRAIN / INFER :return: (ds, ) as the overall relation matching score """ LogInfo.begin_track('Build kernel: [att_rm_kernel]') assert mode in (tf.contrib.learn.ModeKeys.INFER, tf.contrib.learn.ModeKeys.TRAIN) LogInfo.logs('repr_mode = %s, scoring_mode = %s', self.repr_mode, self.scoring_mode) encoder_args = {'config': self.rnn_config, 'mode': mode} rnn_encoder = BidirectionalRNNEncoder(**encoder_args) comb_tensor_list = [] for tensor_input in (p_emb, pw_emb, p_len, pw_len): ori_shape = tensor_input.get_shape().as_list() comb_shape = [-1] + ori_shape[2:] # keep the dimensions after (ds, sc_max_len) comb_tensor_list.append(tf.reshape(tensor_input, shape=comb_shape)) p_emb, pw_emb, p_len, pw_len = comb_tensor_list # p/pw_emb: (ds * sc_max_len, x_max_len, dim_x_emb) # p/pw_len: (ds * sc_max_len,) with tf.variable_scope('att_rm_kernel', reuse=tf.AUTO_REUSE): with tf.variable_scope('qw_repr', reuse=tf.AUTO_REUSE): qw_hidden = self.apply_seq_repr(input_emb=qw_emb, input_len=qw_len, mode=mode) # (ds, qw_max_len, dim_hidden) if self.residual: # RNN hidden + RNN input LogInfo.logs('Applying residual at qw_repr.') assert self.dim_hidden == self.dim_emb qw_hidden = tf.add(qw_hidden, qw_emb, name='qw_hidden_residual') # (ds, qw_max_len, dim_hidden) qw_mask = tf.sequence_mask(lengths=qw_len, maxlen=self.qw_max_len, dtype=tf.float32, name='qw_mask') # (ds, qw_max_len) qw_hidden = tf.reshape( tf.stack([qw_hidden] * self.sc_max_len, axis=1), shape=[-1, self.qw_max_len, self.dim_hidden], name='qw_hidden' ) # (ds * sc_max_len, qw_max_len, dim_hidden) qw_mask = tf.reshape( tf.stack([qw_mask] * self.sc_max_len, axis=1), shape=[-1, self.qw_max_len], name='qw_mask' ) # (ds * sc_max_len, qw_max_len) with tf.variable_scope('pw_repr', reuse=tf.AUTO_REUSE): if self.seq_merge_mode in ('fwbw', 'nfwbw'): pw_rep = seq_encoding(emb_input=pw_emb, len_input=pw_len, encoder=rnn_encoder, fwbw=True) # (ds * sc_max_len, dim_hidden) else: pw_hidden = seq_encoding(emb_input=pw_emb, len_input=pw_len, encoder=rnn_encoder) if self.seq_merge_mode == 'avg': pw_rep = seq_hidden_averaging(seq_hidden_input=pw_hidden, len_input=pw_len) else: pw_rep = seq_hidden_max_pooling(seq_hidden_input=pw_hidden, len_input=pw_len) # (ds * sc_max_len, dim_hidden) # Ready for attention calculation LogInfo.logs('Sequence merge mode: %s', self.seq_merge_mode) if self.seq_merge_mode != 'nfwbw': simple_att = SimpleAttention(lf_max_len=self.qw_max_len, dim_att_hidden=self.dim_att_hidden, att_func=self.att_func) q_att_rep, att_mat, q_weight = simple_att.forward(lf_input=qw_hidden, lf_mask=qw_mask, fix_rt_input=pw_rep) # q_att_rep: (ds * sc_max_len, dim_hidden) # att_mat: (ds * sc_max_len, qw_max_len) # q_weight: (ds * sc_max_len, qw_max_len) att_mat = tf.reshape(att_mat, shape=[-1, self.sc_max_len, self.qw_max_len], name='att_mat') # (ds, sc_max_len, qw_max_len) q_weight = tf.reshape(q_weight, shape=[-1, self.sc_max_len, self.qw_max_len], name='q_weight') # (ds, sc_max_len, qw_max_len) final_ret_dict = self.final_merge( q_rep=q_att_rep, path_rep=pw_rep, sc_len=sc_len, sc_max_len=self.sc_max_len, dim_hidden=self.dim_hidden, scoring_mode=self.scoring_mode ) final_ret_dict['rm_att_mat'] 
= att_mat final_ret_dict['rm_q_weight'] = q_weight # rm_score, rm_path_score (optional), rm_att_mat, rm_q_weight else: """ Working in nfwbw mode, the fw/bw information are separated & calculating attention """ fw_qw_hidden, bw_qw_hidden = tf.split(qw_hidden, num_or_size_splits=2, axis=-1) # both (ds * sc_max_len, qw_max_len, dim_hidden / 2) fw_pw_rep, bw_pw_rep = tf.split(pw_rep, num_or_size_splits=2, axis=-1) # both (ds * sc_max_len, dim_hidden / 2) simple_att = SimpleAttention(lf_max_len=self.qw_max_len, dim_att_hidden=self.dim_att_hidden, att_func=self.att_func) fw_q_att_rep, fw_att_mat, fw_q_weight = simple_att.forward(lf_input=fw_qw_hidden, lf_mask=qw_mask, fix_rt_input=fw_pw_rep) bw_q_att_rep, bw_att_mat, bw_q_weight = simple_att.forward(lf_input=bw_qw_hidden, lf_mask=qw_mask, fix_rt_input=bw_pw_rep) # fw/bw_q_att_rep: (ds * sc_max_len, dim_hidden / 2) # fw/bw_att_mat: (ds * sc_max_len, qw_max_len) # fw/bw_q_weight: (ds * sc_max_len, qw_max_len) fw_att_mat = tf.reshape(fw_att_mat, shape=[-1, self.sc_max_len, self.qw_max_len], name='fw_att_mat') # (ds, sc_max_len, qw_max_len) bw_att_mat = tf.reshape(bw_att_mat, shape=[-1, self.sc_max_len, self.qw_max_len], name='bw_att_mat') # (ds, sc_max_len, qw_max_len) fw_q_weight = tf.reshape(fw_q_weight, shape=[-1, self.sc_max_len, self.qw_max_len], name='fw_q_weight') # (ds, sc_max_len, qw_max_len) bw_q_weight = tf.reshape(bw_q_weight, shape=[-1, self.sc_max_len, self.qw_max_len], name='bw_q_weight') # (ds, sc_max_len, qw_max_len) q_att_rep = tf.concat([fw_q_att_rep, bw_q_att_rep], axis=-1, name='q_att_rep') # (ds * sc_max_len, dim_hidden) final_ret_dict = self.final_merge( q_rep=q_att_rep, path_rep=pw_rep, sc_len=sc_len, sc_max_len=self.sc_max_len, dim_hidden=self.dim_hidden, scoring_mode=self.scoring_mode ) final_ret_dict['rm_fw_att_mat'] = fw_att_mat final_ret_dict['rm_bw_att_mat'] = bw_att_mat final_ret_dict['rm_fw_q_weight'] = fw_q_weight final_ret_dict['rm_bw_q_weight'] = bw_q_weight # rm_score, rm_path_score (optional), rm_fw/bw_att_mat, rm_fw/bw_q_weight LogInfo.end_track() return final_ret_dict
def forward(self, qw_emb, qw_len, sc_len, p_emb, pw_emb, p_len, pw_len, mode): """ :param qw_emb: (ds, qw_max_len, dim_qw_emb) :param qw_len: (ds, ) :param sc_len: (ds, ) :param p_emb: (ds, sc_max_len, p_max_len, dim_p_emb) :param pw_emb: (ds, sc_max_len, pw_max_len, dim_pw_emb) :param p_len: (ds, sc_max_len) :param pw_len: (ds, sc_max_len) :param mode: tf.contrib.learn.ModeKeys. TRAIN / INFER :return: (ds, ) as the overall relation matching score """ LogInfo.begin_track('Build kernel: [noatt_rm_kernel]') assert mode in (tf.contrib.learn.ModeKeys.INFER, tf.contrib.learn.ModeKeys.TRAIN) LogInfo.logs('repr_mode = %s, scoring_mode = %s', self.repr_mode, self.scoring_mode) encoder_args = {'config': self.rnn_config, 'mode': mode} rnn_encoder = BidirectionalRNNEncoder(**encoder_args) comb_tensor_list = [] for tensor_input in (p_emb, pw_emb, p_len, pw_len): ori_shape = tensor_input.get_shape().as_list() comb_shape = [ -1 ] + ori_shape[2:] # keep the dimensions after (ds, sc_max_len) comb_tensor_list.append(tf.reshape(tensor_input, shape=comb_shape)) p_emb, pw_emb, p_len, pw_len = comb_tensor_list # p/pw_emb: (ds * sc_max_len, x_max_len, dim_x_emb) # p/pw_len: (ds * sc_max_len,) with tf.variable_scope('noatt_rm_kernel', reuse=tf.AUTO_REUSE): with tf.variable_scope('qw_repr', reuse=tf.AUTO_REUSE): if self.seq_merge_mode == 'fwbw': q_rep = seq_encoding(emb_input=qw_emb, len_input=qw_len, encoder=rnn_encoder, fwbw=True) # (ds, dim_hidden) else: q_hidden = seq_encoding(emb_input=qw_emb, len_input=qw_len, encoder=rnn_encoder) if self.seq_merge_mode == 'avg': q_rep = seq_hidden_averaging(seq_hidden_input=q_hidden, len_input=qw_len) else: q_rep = seq_hidden_max_pooling( seq_hidden_input=q_hidden, len_input=qw_len) # (ds, dim_hidden) q_rep = tf.reshape(tf.stack([q_rep] * self.sc_max_len, axis=1), shape=[-1, self.dim_hidden], name='q_rep') # (ds * sc_max_len, dim_hidden) with tf.variable_scope('pw_repr', reuse=tf.AUTO_REUSE): if self.seq_merge_mode == 'fwbw': pw_rep = seq_encoding(emb_input=pw_emb, len_input=pw_len, encoder=rnn_encoder, fwbw=True) # (ds, dim_hidden) else: pw_hidden = seq_encoding(emb_input=pw_emb, len_input=pw_len, encoder=rnn_encoder) if self.seq_merge_mode == 'avg': pw_rep = seq_hidden_averaging( seq_hidden_input=pw_hidden, len_input=pw_len) else: pw_rep = seq_hidden_max_pooling( seq_hidden_input=pw_hidden, len_input=pw_len) # (ds * sc_max_len, dim_hidden) final_ret_dict = self.final_merge(q_rep=q_rep, path_rep=pw_rep, sc_len=sc_len, sc_max_len=self.sc_max_len, dim_hidden=self.dim_hidden, scoring_mode=self.scoring_mode) LogInfo.end_track() return final_ret_dict # rm_score, rm_path_score (optional)
def build_graph(self, mode_str): LogInfo.begin_track('Build graph: [MT-%s]', mode_str) mode = tf.contrib.learn.ModeKeys.INFER if mode_str == 'eval' else tf.contrib.learn.ModeKeys.TRAIN training = False if mode_str == 'eval' else True with tf.device('/cpu:0'): qw_emb = tf.nn.embedding_lookup(params=self.w_embedding, ids=self.input_tensor_dict['qw_input'], name='qw_emb') # (ds, path_max_size, qw_max_len, dim_emb) dep_emb = tf.nn.embedding_lookup(params=self.w_embedding, ids=self.input_tensor_dict['dep_input'], name='dep_emb') # (ds, path_max_size, qw_max_len, dim_emb) pw_emb = tf.nn.embedding_lookup(params=self.w_embedding, ids=self.input_tensor_dict['pw_input'], name='pw_emb') # (ds, path_max_size, pw_max_len, dim_emb) pseq_emb = tf.nn.embedding_lookup(params=self.m_embedding, ids=self.input_tensor_dict['pseq_ids'], name='pseq_emb') # (ds, path_max_size, pseq_max_size, dim_emb) path_emb = tf.nn.embedding_lookup(params=self.p_embedding, ids=self.input_tensor_dict['path_ids'], name='path_emb') # (ds, path_max_size, dim_emb) pw_len = self.input_tensor_dict['pw_len'] pseq_len = self.input_tensor_dict['pseq_len'] qw_len = self.input_tensor_dict['qw_len'] dep_len = self.input_tensor_dict['dep_len'] qw_emb = self.dropout_layer(qw_emb, training=training) dep_emb = self.dropout_layer(dep_emb, training=training) pw_emb = self.dropout_layer(pw_emb, training=training) pseq_emb = self.dropout_layer(pseq_emb, training=training) path_emb = self.dropout_layer(path_emb, training=training) LogInfo.logs('Dropout performed.') rnn_encoder = None if self.rnn_config is not None: encoder_args = {'config': self.rnn_config, 'mode': mode} rnn_encoder = BidirectionalRNNEncoder(**encoder_args) """ For RM kernel """ with tf.variable_scope('rm_task', reuse=tf.AUTO_REUSE): path_repr = self.build_path_repr__single(pw_emb=pw_emb, pw_len=pw_len, pseq_emb=pseq_emb, pseq_len=pseq_len, path_emb=path_emb, rnn_encoder=rnn_encoder) """ BiGRU """ qw_repr = self.build_question_seq_repr(seq_emb=qw_emb, seq_len=qw_len, path_repr=path_repr, rnn_encoder=rnn_encoder, scope_name='qw_repr') dep_repr = self.build_question_seq_repr(seq_emb=dep_emb, seq_len=dep_len, path_repr=path_repr, rnn_encoder=rnn_encoder, scope_name='dep_repr') """ Temporal Conv Net """ # qw_repr = self.build_question_seq_repr__tcn(seq_emb=qw_emb, seq_len=qw_len, # training=training, scope_name='qw_repr') # dep_repr = self.build_question_seq_repr__tcn(seq_emb=dep_emb, seq_len=dep_len, # training=training, scope_name='dep_repr') """ Stacking Conv Net (with Attention) """ # qw_repr = self.build_question_seq_repr__scn(seq_emb=qw_emb, seq_len=qw_len, path_repr=path_repr, # training=training, scope_name='qw_repr') # dep_repr = self.build_question_seq_repr__scn(seq_emb=dep_emb, seq_len=dep_len, path_repr=path_repr, # training=training, scope_name='dep_repr') rm_final_feats, rm_score = self.rm_final_merge( path_repr=path_repr, qw_repr=qw_repr, dep_repr=dep_repr, path_cates=self.input_tensor_dict['path_cates'], path_size=self.input_tensor_dict['path_size'] ) """ For EL kernel """ with tf.variable_scope('el_task', reuse=tf.AUTO_REUSE): el_final_feats, el_score = self.el_forward(el_indv_feats=self.input_tensor_dict['el_indv_feats'], el_comb_feats=self.input_tensor_dict['el_comb_feats'], el_mask=self.input_tensor_dict['el_mask']) """ For Full task """ with tf.variable_scope('full_task', reuse=tf.AUTO_REUSE): full_final_feats, full_score = self.full_forward( el_final_feats=el_final_feats, rm_final_feats=rm_final_feats, extra_feats=self.input_tensor_dict['extra_feats'] ) """ Ready 
to return """ tensor_dict = {'rm_score': rm_score, 'el_score': el_score, 'full_score': full_score, 'rm_final_feats': rm_final_feats, 'el_final_feats': el_final_feats, 'full_final_feats': full_final_feats} LogInfo.logs('%d tensors saved and return: %s', len(tensor_dict), tensor_dict.keys()) LogInfo.end_track() return tensor_dict
class SkBiRNNModule(SkBaseModule):
    def __init__(self, path_max_len, dim_item_hidden, dim_kb_emb,
                 dim_sk_hidden, data_source, rnn_config):
        super(SkBiRNNModule, self).__init__(path_max_len=path_max_len,
                                            dim_item_hidden=dim_item_hidden,
                                            dim_kb_emb=dim_kb_emb,
                                            dim_sk_hidden=dim_sk_hidden)
        self.data_source = data_source
        assert self.data_source in ('kb', 'word', 'both')
        rnn_config['num_units'] = dim_sk_hidden // 2  # bidirectional: fw + bw concat to dim_sk_hidden
        self.rnn_encoder = BidirectionalRNNEncoder(rnn_config, mode=tf.contrib.learn.ModeKeys.TRAIN)

    # Input:
    #   path_wd_hidden: (batch, path_max_len, dim_item_hidden)
    #   path_kb_hidden: (batch, path_max_len, dim_kb_emb)
    #   path_len: (batch, ) as int32
    #   focus_wd_hidden: (batch, dim_item_hidden)
    #   focus_kb_hidden: (batch, dim_kb_emb)
    # Output:
    #   sk_hidden: (batch, dim_sk_hidden)
    def forward(self, path_wd_hidden, path_kb_hidden, path_len,
                focus_wd_hidden, focus_kb_hidden, reuse=None):
        LogInfo.begin_track('SkBiRNNModule forward: ')
        with tf.variable_scope('SkBiRNNModule', reuse=reuse):
            if self.data_source == 'kb':
                use_path_hidden = path_kb_hidden
                use_focus_hidden = focus_kb_hidden
            elif self.data_source == 'word':
                use_path_hidden = path_wd_hidden
                use_focus_hidden = focus_wd_hidden
            else:   # 'both'
                use_path_hidden = tf.concat([path_kb_hidden, path_wd_hidden], axis=-1,
                                            name='use_path_hidden')
                # (batch, path_max_len, dim_item_hidden + dim_kb_emb)
                use_focus_hidden = tf.concat([focus_kb_hidden, focus_wd_hidden], axis=-1,
                                             name='use_focus_hidden')
                # (batch, dim_item_hidden + dim_kb_emb)
            use_path_emb_input = tf.concat(
                [tf.expand_dims(use_focus_hidden, axis=1), use_path_hidden],
                axis=1, name='use_path_emb_input')  # (batch, path_max_len + 1, dim_use)
            show_tensor(use_path_emb_input)
            use_path_len = path_len + 1
            stamps = self.path_max_len + 1
            birnn_inputs = tf.unstack(use_path_emb_input, num=stamps, axis=1,
                                      name='birnn_inputs')
            encoder_output = self.rnn_encoder.encode(inputs=birnn_inputs,
                                                     sequence_length=use_path_len,
                                                     reuse=reuse)
            rnn_outputs = tf.stack(encoder_output.outputs, axis=1,
                                   name='rnn_outputs')  # (batch, path_max_len + 1, dim_sk_hidden)
            # Since we are in BiRNN mode, we simply take the average over time.
            sum_sk_hidden = tf.reduce_sum(rnn_outputs, axis=1,
                                          name='sum_sk_hidden')  # (batch, dim_sk_hidden)
            use_path_len_mat = tf.cast(tf.expand_dims(use_path_len, axis=1),
                                       dtype=tf.float32,
                                       name='use_path_len_mat')  # (batch, 1) as float32
            sk_hidden = tf.div(sum_sk_hidden, use_path_len_mat,
                               name='sk_hidden')  # (batch, dim_sk_hidden)
        LogInfo.end_track()
        return sk_hidden
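A minimal usage sketch of SkBiRNNModule with hypothetical dimensions; the module prepends the focus item to the path sequence, runs the BiRNN, and averages the outputs over the (path_len + 1) valid steps:

rnn_config = {'cell_class': 'GRU', 'num_units': 64}   # num_units is overwritten inside __init__
sk_module = SkBiRNNModule(path_max_len=3, dim_item_hidden=128, dim_kb_emb=100,
                          dim_sk_hidden=128, data_source='both', rnn_config=rnn_config)
path_wd_hidden = tf.placeholder(tf.float32, [None, 3, 128])
path_kb_hidden = tf.placeholder(tf.float32, [None, 3, 100])
path_len = tf.placeholder(tf.int32, [None])
focus_wd_hidden = tf.placeholder(tf.float32, [None, 128])
focus_kb_hidden = tf.placeholder(tf.float32, [None, 100])
sk_hidden = sk_module.forward(path_wd_hidden, path_kb_hidden, path_len,
                              focus_wd_hidden, focus_kb_hidden)   # (batch, dim_sk_hidden)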
def get_score(self, mode, qwords_embedding, qwords_len, sc_len, preds_embedding, preds_len, pwords_embedding, pwords_len): """ Produce the final similarity score. This function is the most important part in the optm/eval model. Just use cosine similarity :param mode: tf.contrib.learn.ModeKeys.TRAIN/INFER, which affects the dropout setting :param qwords_embedding: (ds, q_max_len, dim_emb) :param qwords_len: (ds, ) :param sc_len: (ds, ) :param preds_embedding: (ds, sc_max_len, path_max_len, dim_emb) :param preds_len: (ds, sc_max_len) :param pwords_embedding: (ds, sc_max_len, pword_max_len, dim_emb) :param pwords_len: (ds, sc_max_len) :return: (ds, ) as the final similarity score """ assert mode in (tf.contrib.learn.ModeKeys.TRAIN, tf.contrib.learn.ModeKeys.INFER) if self.rnn_config['cell_class'] == 'None': # won't use any recurrent layer, but just using pure embedding as instead self.dim_hidden = self.dim_emb # force set dim_hidden to be dim_emb q_encoder = pred_encoder = pword_encoder = None else: encoder_args = {'config': self.rnn_config, 'mode': mode} q_encoder = BidirectionalRNNEncoder(**encoder_args) pred_encoder = BidirectionalRNNEncoder(**encoder_args) pword_encoder = BidirectionalRNNEncoder(**encoder_args) """ BidirectionalRNNEncoder will set the dropout according to the current mode (TRAIN/INFER) """ with tf.name_scope('RelationMatchingKernel'): with tf.variable_scope('Question', reuse=self.reuse): if q_encoder is None: qwords_hidden = qwords_embedding # (ds, q_max_len, dim_hidden=dim_emb) else: qwords_hidden = seq_encoding( emb_input=qwords_embedding, len_input=qwords_len, encoder=q_encoder, reuse=self.reuse) # (ds, q_max_len, dim_hidden) q_hidden = seq_hidden_max_pooling( seq_hidden_input=qwords_hidden, len_input=qwords_len) # (ds, dim_hidden), will be used in the final cosine similarity calculation # Step 1: split schemas into paths # merge ds and sc_max_len into one dimension qwords_hidden = tf.reshape( tf.stack([qwords_hidden] * self.sc_max_len, axis=1), shape=(-1, self.q_max_len, self.dim_hidden), name='qwords_hidden' ) # (ds * sc_max_len, q_max_len, dim_hidden) qwords_len = tf.reshape(tf.stack([qwords_len] * self.sc_max_len, axis=1), shape=(-1, ), name='qwords_len') # (ds * sc_max_len, ) # Now combine ds and sc_max_len into one dimension comb_tensor_list = [] for tensor_input in (preds_embedding, preds_len, pwords_embedding, pwords_len): ori_shape = tensor_input.get_shape().as_list() comb_shape = [ -1 ] + ori_shape[2:] # keep the dimensions after (ds, sc_max_len) # show_tensor(tensor_input) # LogInfo.logs('ori_shape: %s, comb_shape: %s', ori_shape, comb_shape) comb_tensor_list.append( tf.reshape(tensor_input, shape=comb_shape)) [preds_embedding, preds_len, pwords_embedding, pwords_len] = comb_tensor_list # (ds * sc_max_len, xxxxxxx) # for tensor in comb_tensor_list: # show_tensor(tensor) # Step 2: Compute basic hidden repr. 
# xxx_final_hidden: (ds * sc_max_len, dim_hidden) # (Optional) xxx_att_mat: (ds * sc_max_len, q_max_len, xxx_max_len) with tf.name_scope('Schema'): with tf.variable_scope('preds', reuse=self.reuse): if pred_encoder is None: preds_hidden = preds_embedding # (ds * sc_max_len, path_max_len, dim_hidden=dim_emb) else: preds_hidden = seq_encoding( emb_input=preds_embedding, len_input=preds_len, encoder=pred_encoder, reuse=self.reuse ) # (ds * sc_max_len, path_max_len, dim_hidden) pred_final_hidden, pred_att_mat = self.aggregate_within_path( qwords_hidden=qwords_hidden, qwords_len=qwords_len, pitems_hidden=preds_hidden, pitems_len=preds_len, item_max_len=self.path_max_len, item_agg_mode=self.preds_agg_mode) with tf.variable_scope('pwords', reuse=self.reuse): if pword_encoder is None: pwords_hidden = pwords_embedding # (ds * sc_max_len, pword_max_len, dim_hidden=dim_emb) else: pwords_hidden = seq_encoding( emb_input=pwords_embedding, len_input=pwords_len, encoder=pword_encoder, reuse=self.reuse ) # (ds * sc_max_len, pword_max_len, dim_hidden) pword_final_hidden, pword_att_mat = self.aggregate_within_path( qwords_hidden=qwords_hidden, qwords_len=qwords_len, pitems_hidden=pwords_hidden, pitems_len=pwords_len, item_max_len=self.pword_max_len, item_agg_mode=self.pwords_agg_mode) # Step 3: 1. merge preds and pwords # 2. combine paths into schemas # 3. produce the final score # path_merge_mode: Max: max pooling # Sum: simple summation with tf.name_scope('PathMerge'): assert not (pword_final_hidden is None and pred_final_hidden is None) if pword_final_hidden is None: # information comes from pwords only path_final_hidden = pred_final_hidden elif pred_final_hidden is None: # information comes from preds only path_final_hidden = pword_final_hidden else: # combine the information from both pwords and preds assert self.path_merge_mode in ('Sum', 'Max') if self.path_merge_mode == 'Sum': path_final_hidden = tf.add( pword_final_hidden, pred_final_hidden, name='path_final_hidden' ) # (ds * sc_max_len, dim_hidden) else: path_final_hidden = tf.reduce_max( tf.stack( [pword_final_hidden, pred_final_hidden], axis=0 ), # (2, ds * sc_max_len, dim_hidden) axis=0, name='path_final_hidden' ) # (ds * sc_max_len, dim_hidden) sc_path_hidden = tf.reshape( path_final_hidden, shape=[-1, self.sc_max_len, self.dim_hidden], name='sc_path_hidden') # (ds, sc_max_len, dim_hidden) # max pooling along all paths sc_hidden = seq_hidden_max_pooling( seq_hidden_input=sc_path_hidden, len_input=sc_len) # (ds, dim_hidden) score = cosine_sim(lf_input=q_hidden, rt_input=sc_hidden) # (ds, ) if pred_att_mat is not None: pred_att_mat = tf.reshape( pred_att_mat, [-1, self.sc_max_len, self.q_max_len, self.path_max_len], name='pred_att_mat' ) # (ds, sc_max_len, q_max_len, path_max_len) if pword_att_mat is not None: pword_att_mat = tf.reshape( pword_att_mat, [-1, self.sc_max_len, self.q_max_len, self.pword_max_len], name='pword_att_mat' ) # (ds, sc_max_len, q_max_len, pword_max_len) return pred_att_mat, pword_att_mat, score
def forward(self, path_size, qw_emb, qw_len, pw_emb, pw_len, mode): """ :param path_size: (ds, ) :param qw_emb: (ds, path_max_size, qw_max_len, dim_qw_emb) :param qw_len: (ds, path_max_size) :param pw_emb: (ds, path_max_size, pw_max_len, dim_pw_emb) :param pw_len: (ds, path_max_size) :param mode: tf.contrib.learn.ModeKeys. TRAIN / INFER """ rm_ret_dict = {} # <tensor_name, tensor> LogInfo.begin_track('Build kernel: [rm_kernel]') assert mode in (tf.contrib.learn.ModeKeys.INFER, tf.contrib.learn.ModeKeys.TRAIN) dyn_path_max_size = tf.shape(qw_emb)[1] rnn_encoder = None if self.rnn_config is not None: encoder_args = {'config': self.rnn_config, 'mode': mode} rnn_encoder = BidirectionalRNNEncoder(**encoder_args) """ Merge first & second dimension: ds * path_max_size = DS """ comb_tensor_list = [] for tensor_input in (qw_emb, qw_len, pw_emb, pw_len): ori_shape = tensor_input.get_shape().as_list() comb_shape = [ -1 ] + ori_shape[2:] # keep the dimensions after (ds, path_max_size) comb_tensor_list.append(tf.reshape(tensor_input, shape=comb_shape)) qw_emb, qw_len, pw_emb, pw_len = comb_tensor_list """ pw side representation """ pw_repr = seq_encoding_with_aggregation( emb_input=pw_emb, len_input=pw_len, rnn_encoder=rnn_encoder, seq_merge_mode=self.seq_merge_mode) # (DS, dim_hidden), that is (ds * path_max_size, dim_hidden) """ attention with qw repr """ if self.att_config is not None: dim_att_len = self.att_config['dim_att_hidden'] att_func = self.att_config['att_func'] qw_hidden = seq_encoding(emb_input=qw_emb, len_input=qw_len, encoder=rnn_encoder) # (DS, qw_max_len, dim_hidden) qw_mask = tf.sequence_mask(lengths=qw_len, maxlen=self.qw_max_len, dtype=tf.float32, name='qw_mask') # (DS, qw_max_len) simple_att = SimpleAttention(lf_max_len=self.qw_max_len, dim_att_hidden=dim_att_len, att_func=att_func) q_att_rep, att_mat, q_weight = simple_att.forward( lf_input=qw_hidden, lf_mask=qw_mask, fix_rt_input=pw_repr) # q_att_rep: (DS, dim_hidden) # att_mat: (DS, qw_max_len) # q_weight: (DS, qw_max_len) att_mat = tf.reshape( att_mat, shape=[-1, dyn_path_max_size, self.qw_max_len], name='att_mat') # (ds, path_max_size, qw_max_len) q_weight = tf.reshape( q_weight, shape=[-1, dyn_path_max_size, self.qw_max_len], name='q_weight') # (ds, path_max_size, qw_max_len) rm_ret_dict['rm_att_mat'] = att_mat rm_ret_dict['rm_q_weight'] = q_weight qw_repr = q_att_rep else: # no attention, similar with above qw_repr = seq_encoding_with_aggregation( emb_input=qw_emb, len_input=qw_len, rnn_encoder=rnn_encoder, seq_merge_mode=self.seq_merge_mode) """ Calculating final score """ final_ret_dict = self.final_merge(qw_repr=qw_repr, pw_repr=pw_repr, path_size=path_size, dyn_path_max_size=dyn_path_max_size, dim_hidden=self.dim_hidden, scoring_mode=self.scoring_mode) rm_ret_dict.update(final_ret_dict) LogInfo.end_track() return rm_ret_dict
def _build_graph(self):
    self.query_idx = tf.placeholder(dtype=tf.int32,
                                    shape=[None, self.config.get("max_seq_len")])
    self.query_len = tf.placeholder(dtype=tf.int32, shape=[None, ])
    self.label = tf.placeholder(dtype=tf.int32,
                                shape=[None, self.config.get("max_seq_len")])
    self.intent = tf.placeholder(dtype=tf.int32, shape=[None, ])
    self.link_mask = tf.placeholder(dtype=tf.int32,
                                    shape=[None, self.config.get("max_seq_len")])
    self.entity_idx = tf.placeholder(dtype=tf.int32,
                                     shape=[None, self.config.get("PN")])

    with tf.device('/cpu:0'), tf.name_scope("embedding_layer"):
        term_embedding = tf.get_variable(
            name="embedding",
            shape=[self.config.get("vocab_size"), self.config.get("embedding_dim")],
            dtype=tf.float32,
            initializer=tf.constant_initializer(self.embedding_vocab))
        self.query_embedding = tf.nn.embedding_lookup(term_embedding, self.query_idx)
        self.entity_embedding = tf.nn.embedding_lookup(term_embedding, self.entity_idx)
        # tf.split: one tensor -> list of tensors
        # tf.stack: list of tensors -> one tensor
        self.query_slice = [
            tf.squeeze(_input, [1])
            for _input in tf.split(self.query_embedding,
                                   self.config.get("max_seq_len"), axis=1)
        ]

    # bi-LSTM
    with tf.name_scope("rnn_encoder"):
        rnn_config = dict()
        key_list = ["cell_class", "num_units", "dropout_input_keep_prob",
                    "dropout_output_keep_prob", "num_layers"]
        for key in key_list:
            rnn_config[key] = self.config.get(key)
        rnn_encoder = BidirectionalRNNEncoder(rnn_config, self.mode)
        self.encoder_output = rnn_encoder.encode(self.query_slice, self.query_len)

    # hidden representation for intent detection
    with tf.name_scope("intent_hidden"):
        # average attention (att_config is kept for symmetry; AttentionLayerAvg takes no parameters)
        att_config = dict()
        key_list = ["num_units"]
        for key in key_list:
            att_config[key] = self.config.get(key)
        att = AttentionLayerAvg()
        self.query_hidden_avg = att.build(self.encoder_output.attention_values,
                                          self.encoder_output.attention_values_length)
        self.hidden_dim = self.query_hidden_avg.get_shape().as_list()[-1]

    # training parameters
    with tf.name_scope("parameters"):
        self.W_i = tf.get_variable(name="W_i",
                                   shape=[self.hidden_dim, self.config.get("intent_num")],
                                   dtype=tf.float32,
                                   initializer=tf.contrib.layers.xavier_initializer(uniform=True))
        self.b_i = tf.get_variable(name="b_i",
                                   shape=[self.config.get("intent_num")],
                                   dtype=tf.float32,
                                   initializer=tf.constant_initializer(0.0))
        self.W_l = tf.get_variable(name="W_l",
                                   shape=[self.hidden_dim, self.config.get("label_num")],
                                   dtype=tf.float32,
                                   initializer=tf.contrib.layers.xavier_initializer(uniform=True))
        self.b_l = tf.get_variable(name="b_l",
                                   shape=[self.config.get("label_num")],
                                   dtype=tf.float32,
                                   initializer=tf.constant_initializer(0.0))
        self.W_e = tf.get_variable(name="W_e",
                                   shape=[self.hidden_dim * 2, self.config.get("embedding_dim")],
                                   dtype=tf.float32,
                                   initializer=tf.contrib.layers.xavier_initializer(uniform=True))
        self.b_e = tf.get_variable(name="b_e",
                                   shape=[self.config.get("embedding_dim")],
                                   dtype=tf.float32,
                                   initializer=tf.constant_initializer(0.0))

    # task-specific layers built on top of the bi-LSTM above
    # ---------------------------------- Intent Detection --------------------------- #
    self.intent_layer = tf.nn.xw_plus_b(self.query_hidden_avg, self.W_i, self.b_i)

    # ---------------------------------- Sequence Labeling -------------------------- #
    self.outputs = tf.reshape(tensor=self.encoder_output.outputs,
                              shape=[-1, self.hidden_dim])
    self.label_layer = tf.nn.xw_plus_b(self.outputs, self.W_l, self.b_l)
    # reshape back to [B, T, label_num]
    self.label_layer = tf.reshape(tensor=self.label_layer,
                                  shape=[-1, self.config.get("max_seq_len"),
                                         self.config.get("label_num")])

    # ---------------------------------- Entity Linking ----------------------------- #
    """
    Notice that entity linking in the evaluation step is based on the result of
    sequence labeling, so we do a two-step evaluation.
    """
    # [B, h_dim]
    self.mention = add_mask_then_avg(self.encoder_output.attention_values, self.link_mask)
    # [B, h_dim]
    self.context = add_mask_then_avg(self.encoder_output.attention_values, 1 - self.link_mask)
    # [B, w2v_dim]
    self.left = tf.nn.xw_plus_b(tf.concat([self.mention, self.context], axis=1),
                                self.W_e, self.b_e)
    # [B, 1, w2v_dim]
    self.left = tf.expand_dims(self.left, axis=1)
    # [B, PN, w2v_dim]
    self.left = tf.tile(self.left, multiples=[1, self.config.get("PN"), 1])
    # [B*PN, w2v_dim]
    self.left = tf.reshape(self.left, shape=[-1, self.config.get("embedding_dim")])
    # [B*PN, w2v_dim]
    self.right = tf.reshape(self.entity_embedding, shape=[-1, self.config.get("embedding_dim")])
    # [B*PN, ]
    self.link_score = cosine_sim(self.left, self.right)

    # ===================================== Loss ====================================== #
    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        # loss for intent detection
        self.intent_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.intent_layer, labels=self.intent, name="intent_loss")
        self.intent_loss = tf.reduce_mean(self.intent_loss)
        # loss for sequence labeling
        self.label_loss = softmax_sequence_loss(logits=self.label_layer,
                                                targets=self.label,
                                                sequence_length=self.query_len)
        self.label_loss = tf.reduce_mean(self.label_loss)
        # loss for entity linking
        self.link_loss = hinge_loss(scores=self.link_score,
                                    row=self.config.get("batch_size"),
                                    col=self.config.get("PN"),
                                    margin=self.config.get("margin"))
        # train op; currently the three losses have equal weights
        self.train_op = get_optimizer(self.config.get("optimizer"),
                                      self.config.get("lr")).minimize(
            self.intent_loss + self.label_loss + self.link_loss)
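# add_mask_then_avg, cosine_sim and hinge_loss are helpers defined elsewhere in the repo.
# The sketches below show one plausible reading of their behavior, matching the shape
# comments above; they are assumptions for illustration, not the repo's implementations.
import tensorflow as tf

def add_mask_then_avg(values, mask):
    """Average [B, T, h_dim] hidden states over the positions where mask == 1."""
    mask = tf.cast(mask, tf.float32)                                          # [B, T]
    masked_sum = tf.reduce_sum(values * tf.expand_dims(mask, -1), axis=1)     # [B, h_dim]
    denom = tf.maximum(tf.reduce_sum(mask, axis=1, keep_dims=True), 1.0)      # avoid dividing by 0
    return masked_sum / denom

def cosine_sim(lf_input, rt_input, eps=1e-8):
    """Row-wise cosine similarity between two [N, d] tensors; returns [N, ]."""
    dot = tf.reduce_sum(lf_input * rt_input, axis=-1)
    lf_norm = tf.sqrt(tf.reduce_sum(tf.square(lf_input), axis=-1))
    rt_norm = tf.sqrt(tf.reduce_sum(tf.square(rt_input), axis=-1))
    return dot / tf.maximum(lf_norm * rt_norm, eps)

def hinge_loss(scores, row, col, margin):
    """Pairwise hinge loss, assuming column 0 holds the positive candidate and the rest negatives."""
    scores = tf.reshape(scores, [row, col])
    pos = scores[:, 0:1]                                  # [row, 1]
    neg = scores[:, 1:]                                   # [row, col-1]
    return tf.reduce_mean(tf.nn.relu(margin - pos + neg))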
def get_score(self, mode, qwords_embedding, qwords_len, sc_len,
              preds_embedding, preds_len, pwords_embedding, pwords_len):
    """
    Produce the final similarity score.
    This function is the most important part of the optm/eval model.
    Just use cosine similarity.
    :param mode: tf.contrib.learn.ModeKeys.TRAIN/INFER, which affects the dropout setting
    :param qwords_embedding: (ds, q_max_len, dim_emb)
    :param qwords_len:       (ds, )
    :param sc_len:           (ds, )
    :param preds_embedding:  (ds, sc_max_len, path_max_len, dim_emb)
    :param preds_len:        (ds, sc_max_len)
    :param pwords_embedding: (ds, sc_max_len, pword_max_len, dim_emb)
    :param pwords_len:       (ds, sc_max_len)
    :return: score and attention matrices
             pred_att_mat:  (ds, sc_max_len, q_max_len, path_max_len)
             pword_att_mat: (ds, sc_max_len, q_max_len, pword_max_len)
             score:         (ds, )
    """
    assert mode in (tf.contrib.learn.ModeKeys.TRAIN, tf.contrib.learn.ModeKeys.INFER)

    encoder_args = {'config': self.rnn_config, 'mode': mode}
    # dropout is set according to the current mode (TRAIN/INFER)
    q_encoder = BidirectionalRNNEncoder(**encoder_args)
    pred_encoder = BidirectionalRNNEncoder(**encoder_args)
    pword_encoder = BidirectionalRNNEncoder(**encoder_args)
    cross_att = IndirectCrossAttention(**self.cross_att_config)

    with tf.name_scope('separated_relation_matching_kernel'):

        """ Preprocess: reshaping, merge ds and sc_max_len into one dimension """
        qwords_embedding = tf.reshape(
            tf.stack([qwords_embedding] * self.sc_max_len, axis=1),
            shape=(-1, self.q_max_len, self.dim_emb),
            name='qwords_hidden')       # (ds * sc_max_len, q_max_len, dim_hidden)
        qwords_len = tf.reshape(
            tf.stack([qwords_len] * self.sc_max_len, axis=1),
            shape=(-1,),
            name='qwords_len')          # (ds * sc_max_len, )
        comb_tensor_list = []
        for tensor_input in (preds_embedding, preds_len, pwords_embedding, pwords_len):
            ori_shape = tensor_input.get_shape().as_list()
            comb_shape = [-1] + ori_shape[2:]   # keep the dimensions after (ds, sc_max_len)
            comb_tensor_list.append(tf.reshape(tensor_input, shape=comb_shape))
        [preds_embedding, preds_len, pwords_embedding, pwords_len] = comb_tensor_list
        # each tensor now has leading dimension ds * sc_max_len

        """ Step 1: Intra-attention (Optional) """
        # TODO: Question and pred_words

        """ Step 2: Cross-attention, making sure pwords and preds are treated properly """
        qwords_att_embedding, preds_att_info, pwords_att_info = cross_att.forward(
            q_input=qwords_embedding,
            p_input=preds_embedding,
            pw_input=pwords_embedding,
            q_len=qwords_len,
            p_len=preds_len,
            pw_len=pwords_len)
        preds_att_embedding, preds_att_mat = preds_att_info
        pwords_att_embedding, pwords_att_mat = pwords_att_info
        # x_att_embedding: (ds * sc_max_len, x_max_len, dim_emb)
        # x_att_mat:       (ds * sc_max_len, q_max_len, x_max_len)

        """ Step 3: Perform the RNNs over the attended embeddings.
            To share RNN parameters, put the encoder calls into one variable scope. """
        with tf.variable_scope('qwords', reuse=self.reuse):
            qwords_hidden = seq_encoding(
                emb_input=qwords_att_embedding,
                len_input=qwords_len,
                encoder=q_encoder,
                reuse=self.reuse)       # (ds * sc_max_len, q_max_len, dim_hidden)
            qword_final_hidden = seq_hidden_max_pooling(
                seq_hidden_input=qwords_hidden, len_input=qwords_len)
        with tf.variable_scope('preds', reuse=self.reuse):
            preds_hidden = seq_encoding(
                emb_input=preds_att_embedding,
                len_input=preds_len,
                encoder=pred_encoder,
                reuse=self.reuse)       # (ds * sc_max_len, path_max_len, dim_hidden)
            pred_final_hidden = seq_hidden_max_pooling(
                seq_hidden_input=preds_hidden, len_input=preds_len)
        with tf.variable_scope('pwords', reuse=self.reuse):
            pwords_hidden = seq_encoding(
                emb_input=pwords_att_embedding,
                len_input=pwords_len,
                encoder=pword_encoder,
                reuse=self.reuse)       # (ds * sc_max_len, pword_max_len, dim_hidden)
            pword_final_hidden = seq_hidden_max_pooling(
                seq_hidden_input=pwords_hidden, len_input=pwords_len)
        # x_final_hidden: (ds * sc_max_len, dim_hidden)

        """ Step 4: Path merging, calculate the final score """
        # TODO: use pword/pred or not
        if self.path_merge_mode == 'sum':
            path_final_hidden = tf.add(pword_final_hidden, pred_final_hidden,
                                       name='path_final_hidden')
            # (ds * sc_max_len, dim_hidden)
        else:   # max
            path_final_hidden = tf.reduce_max(
                tf.stack([pword_final_hidden, pred_final_hidden], axis=0),  # (2, ds * sc_max_len, dim_hidden)
                axis=0, name='path_final_hidden')       # (ds * sc_max_len, dim_hidden)
        if self.final_score_mode == 'cos':
            path_score = cosine_sim(lf_input=qword_final_hidden,
                                    rt_input=path_final_hidden)     # (ds * sc_max_len, )
        else:   # dot
            path_score = tf.reduce_sum(qword_final_hidden * path_final_hidden,
                                       axis=-1)                     # (ds * sc_max_len, )
        path_score = tf.reshape(path_score, shape=[-1, self.sc_max_len],
                                name='path_score')      # (ds, sc_max_len)
        sc_mask = tf.sequence_mask(lengths=sc_len,
                                   maxlen=self.sc_max_len,
                                   dtype=tf.float32,
                                   name='sc_mask')      # (ds, sc_max_len)
        score = tf.reduce_sum(path_score * sc_mask, axis=-1, name='score')  # (ds, )

        pred_att_mat = tf.reshape(
            preds_att_mat,
            [-1, self.sc_max_len, self.q_max_len, self.path_max_len],
            name='pred_att_mat')    # (ds, sc_max_len, q_max_len, path_max_len)
        pword_att_mat = tf.reshape(
            pwords_att_mat,
            [-1, self.sc_max_len, self.q_max_len, self.pword_max_len],
            name='pword_att_mat')   # (ds, sc_max_len, q_max_len, pword_max_len)

    return pred_att_mat, pword_att_mat, score
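# seq_hidden_max_pooling is defined elsewhere in the repo; presumably it max-pools the RNN
# hidden states over the valid time steps only. A minimal sketch under that assumption
# (not the repo's actual code):
import tensorflow as tf

def seq_hidden_max_pooling(seq_hidden_input, len_input):
    """Max-pool [N, T, d] hidden states over the valid steps given by len_input [N, ]."""
    max_len = tf.shape(seq_hidden_input)[1]
    mask = tf.sequence_mask(len_input, maxlen=max_len, dtype=tf.float32)   # [N, T]
    # push padded positions to a very negative value so they never win the max
    neg_pad = (1.0 - tf.expand_dims(mask, -1)) * (-1e9)                    # [N, T, 1]
    return tf.reduce_max(seq_hidden_input + neg_pad, axis=1)               # [N, d]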
def _build_graph(self):
    self.query_idx = tf.placeholder(dtype=tf.int32,
                                    shape=[None, self.config.get("max_seq_len")])
    self.query_len = tf.placeholder(dtype=tf.int32, shape=[None, ])
    self.label = tf.placeholder(dtype=tf.int32,
                                shape=[None, self.config.get("max_seq_len")])
    self.batch_size = self.config.get("batch_size")

    with tf.device('/cpu:0'), tf.name_scope("embedding_layer"):
        term_embedding = tf.get_variable(
            name="embedding",
            shape=[self.config.get("vocab_size"), self.config.get("embedding_dim")],
            dtype=tf.float32,
            initializer=tf.constant_initializer(self.embedding_vocab))
        self.query_embedding = tf.nn.embedding_lookup(term_embedding, self.query_idx)
        # tf.split: one tensor -> list of tensors
        # tf.stack: list of tensors -> one tensor
        self.query_slice = [
            tf.squeeze(_input, [1])
            for _input in tf.split(self.query_embedding,
                                   self.config.get("max_seq_len"), axis=1)
        ]
        # better style: use tf.unstack (one tensor -> list of tensors),
        # which is equivalent to the split + squeeze above:
        # self.query_slice = tf.unstack(self.query_embedding, axis=1)

    # bi-LSTM
    with tf.name_scope("rnn_encoder"):
        rnn_config = dict()
        key_list = ["cell_class", "num_units", "dropout_input_keep_prob",
                    "dropout_output_keep_prob", "num_layers"]
        for key in key_list:
            rnn_config[key] = self.config.get(key)
        rnn_encoder = BidirectionalRNNEncoder(rnn_config, self.mode)
        self.biLstm = rnn_encoder.encode(self.query_slice, self.query_len)

    # output dim = 2 * rnn cell dim (fw + bw)
    self.hidden_dim = self.config.get("num_units") * 2
    self.biLstm_clip = tf.clip_by_value(self.biLstm.attention_values,
                                        -self.config.get("grad_clip"),
                                        self.config.get("grad_clip"))

    # training parameters
    with tf.name_scope("parameters"):
        self.W_l = tf.get_variable(name="W_l",
                                   shape=[self.hidden_dim, self.config.get("label_num")],
                                   dtype=tf.float32,
                                   initializer=tf.contrib.layers.xavier_initializer(uniform=True))
        self.b_l = tf.get_variable(name="b_l",
                                   shape=[self.config.get("label_num")],
                                   dtype=tf.float32,
                                   initializer=tf.constant_initializer(0.0))

    # project the bi-LSTM outputs to label space
    self.outputs = tf.reshape(tensor=self.biLstm_clip, shape=[-1, self.hidden_dim])
    self.label_matrix = tf.nn.xw_plus_b(self.outputs, self.W_l, self.b_l)
    # reshape back to [B, T, label_num]
    self.logits = tf.reshape(tensor=self.label_matrix,
                             shape=[-1, self.config.get("max_seq_len"),
                                    self.config.get("label_num")])
    # [label_num+1, label_num+1]: one extra row/column for the start tag
    self.transition_mat = tf.get_variable(
        "transitions",
        shape=[self.config.get("label_num") + 1, self.config.get("label_num") + 1],
        initializer=tf.contrib.layers.xavier_initializer(uniform=True))

    # ===================================== Loss ====================================== #
    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        # # softmax sequence loss for sequence labeling
        # self.loss = softmax_sequence_loss(logits=self.logits,
        #                                   targets=self.label,
        #                                   sequence_length=self.query_len)
        # self.loss = tf.reduce_mean(self.loss)

        # pad logits for the CRF loss: prepend a start tag, so length += 1
        small = -1000.0
        start_logits = tf.concat(
            [small * tf.ones(shape=[self.batch_size, 1, self.config.get("label_num")]),
             tf.zeros(shape=[self.batch_size, 1, 1])], axis=-1)
        LogInfo.logs(start_logits.get_shape().as_list())
        pad_logits = tf.cast(small * tf.ones([self.batch_size,
                                              self.config.get("max_seq_len"), 1]),
                             tf.float32)
        LogInfo.logs(pad_logits.get_shape().as_list())
        self.logits = tf.concat([self.logits, pad_logits], axis=-1)
        self.logits = tf.concat([start_logits, self.logits], axis=1)
        LogInfo.logs(self.logits.get_shape().as_list())
        targets = tf.concat(
            [tf.cast(self.config.get("label_num") * tf.ones([self.batch_size, 1]), tf.int32),
             self.label], axis=-1)
        LogInfo.logs(targets.get_shape().as_list())

        # CRF layer
        self.log_likelihood, self.transition_mat = tf.contrib.crf.crf_log_likelihood(
            inputs=self.logits,
            tag_indices=targets,
            transition_params=self.transition_mat,
            sequence_lengths=self.query_len + 1)
        self.loss = tf.reduce_mean(-self.log_likelihood)

        # train op
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = get_optimizer(self.config.get("optimizer"), self.config.get("lr"))
        grads_and_vars = optimizer.compute_gradients(self.loss)
        self.train_op = optimizer.apply_gradients(grads_and_vars,
                                                  global_step=self.global_step)
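# At inference time, a CRF trained with crf_log_likelihood as above is typically decoded with
# Viterbi. A minimal sketch, assuming the logits are padded with the extra start tag exactly
# as in the training branch; the model.* attribute names are illustrative only.
import tensorflow as tf

def crf_predict(sess, model, feed_dict):
    """Viterbi-decode label sequences for one batch and strip the artificial start tag."""
    logits, trans, seq_len = sess.run(
        [model.logits, model.transition_mat, model.query_len], feed_dict=feed_dict)
    results = []
    for score, length in zip(logits, seq_len):
        # keep the start step plus the real steps, then drop the start tag afterwards
        viterbi_seq, _ = tf.contrib.crf.viterbi_decode(score[:length + 1], trans)
        results.append(viterbi_seq[1:])
    return results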