def _make_input(self, embed):
    """Create the vocabulary tables, input placeholders and embedded inputs.

    Sets on ``self``: the ``symbol2index`` / ``index2symbol`` tables, the
    string and length placeholders for post / ref / response, their id
    tensors, the shared embedding variable (``emb_enc`` is ``emb_dec``), the
    embedded encoder inputs, the GO-prefixed decoder input, and the
    ``keep_prob`` / ``use_prior`` placeholders.

    NOTE(review): the original text of this method was whitespace-collapsed,
    so the nesting of the "embedding" scope inside "input" (and which
    statements sit inside "input") is reconstructed from statement order.
    Verify the resulting variable name of ``emb_share`` against existing
    checkpoints.

    Args:
        embed: Initial value for the shared embedding matrix, or ``None`` to
            create a ``[self.vocab_size, self.embed_size]`` float32 variable
            with the default initializer.
    """
    # token string -> id; unknown strings map to UNK_ID
    self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=UNK_ID,
                                         shared_name="in_table",
                                         name="in_table",
                                         checkpoint=True)
    # id -> token string; unknown ids map to _UNK
    self.index2symbol = MutableHashTable(key_dtype=tf.int64,
                                         value_dtype=tf.string,
                                         default_value=_UNK,
                                         shared_name="out_table",
                                         name="out_table",
                                         checkpoint=True)

    with tf.variable_scope("input"):
        self.post_string = tf.placeholder(tf.string, (None, None),
                                          'post_string')
        self.ref_string = tf.placeholder(tf.string, (None, None),
                                         'ref_string')
        self.response_string = tf.placeholder(tf.string, (None, None),
                                              'response_string')

        self.post = self.symbol2index.lookup(self.post_string)
        self.post_len = tf.placeholder(tf.int32, (None, ), 'post_len')
        self.ref = self.symbol2index.lookup(self.ref_string)
        self.ref_len = tf.placeholder(tf.int32, (None, ), 'ref_len')
        self.response = self.symbol2index.lookup(self.response_string)
        self.response_len = tf.placeholder(tf.int32, (None, ),
                                           'response_len')

        with tf.variable_scope("embedding"):
            if embed is None:
                # initialize the embedding randomly
                self.emb_enc = self.emb_dec = tf.get_variable(
                    "emb_share", [self.vocab_size, self.embed_size],
                    dtype=tf.float32)
            else:
                # initialize the embedding by pre-trained word vectors
                # print(...) with one argument behaves the same on
                # Python 2 and Python 3.
                print("share pre-trained embed")
                self.emb_enc = self.emb_dec = tf.get_variable(
                    'emb_share', dtype=tf.float32, initializer=embed)

        self.enc_post = tf.nn.embedding_lookup(self.emb_enc, self.post)
        self.enc_ref = tf.nn.embedding_lookup(self.emb_enc, self.ref)
        self.enc_response = tf.nn.embedding_lookup(self.emb_enc,
                                                   self.response)

        self.batch_len = tf.shape(self.response)[1]
        self.batch_size = tf.shape(self.response)[0]
        # Decoder input: a GO_ID column followed by the response with its
        # last column dropped (i.e. the response shifted right by one).
        self.response_input = tf.concat([
            tf.ones((self.batch_size, 1), dtype=tf.int64) * GO_ID,
            tf.split(self.response, [self.batch_len - 1, 1], axis=1)[0]
        ], 1)
        self.dec_inp = tf.nn.embedding_lookup(self.emb_dec,
                                              self.response_input)

        # Defaults to 1.0 (no dropout) unless explicitly fed.
        self.keep_prob = tf.placeholder_with_default(1.0, ())
        self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")
def _init_vocabs(self):
    """Create the word and entity lookup tables (both directions).

    Each table is a checkpointed ``MutableHashTable`` whose ``shared_name``
    equals its node ``name``. Missing keys resolve to ``UNK_ID`` / ``'_UNK'``
    for words and ``NONE_ID`` / ``'_NONE'`` for entities.
    """

    def checkpointed_table(key_dtype, value_dtype, fallback, table_name):
        # One place for the options every table has in common.
        return MutableHashTable(key_dtype=key_dtype,
                                value_dtype=value_dtype,
                                default_value=fallback,
                                shared_name=table_name,
                                name=table_name,
                                checkpoint=True)

    # word string <-> word id
    self.symbol2index = checkpointed_table(tf.string, tf.int64, UNK_ID,
                                           "in_table")
    self.index2symbol = checkpointed_table(tf.int64, tf.string, '_UNK',
                                           "out_table")
    # entity string <-> entity id
    self.entity2index = checkpointed_table(tf.string, tf.int64, NONE_ID,
                                           "entity_in_table")
    self.index2entity = checkpointed_table(tf.int64, tf.string, '_NONE',
                                           "entity_out_table")
def __init__(
        self,
        embed,  # initial word-embedding matrix
        vocabulary,
        vocabulary_count,
        num_layers,  # number of encoder / decoder layers
        num_units,  # hidden-state size of encoder and decoder
        learning_rate,
        max_gradient_norm,
        max_len):
    """Build the seq2seq training graph, inference graph and train op.

    Fix relative to the previous version: the encoder/decoder inputs are now
    looked up from the ``self.embed`` variable rather than from the raw
    ``embed`` argument. Before, the variable received no gradient during
    training even though it was declared as a trained embedding, and the
    training path used a different tensor from the inference path (which
    already received ``embed=self.embed``).

    NOTE(review): the original text of this method was whitespace-collapsed.
    Which statements sit inside the "decoder" variable scopes (notably the
    ``loss_fn`` and ``inference_fn`` calls) is reconstructed; verify variable
    names against existing checkpoints.

    Args:
        embed: Initial value of the word-embedding variable.
        vocabulary: Vocabulary strings, stored as a string constant.
        vocabulary_count: Passed to ``get_project_funtion`` (presumably the
            vocabulary size -- confirm against that helper).
        num_layers: Number of stacked LSTM layers in encoder and decoder.
        num_units: Number of units per LSTM layer.
        learning_rate: Learning rate given to the Adam optimizer.
        max_gradient_norm: Global-norm threshold for gradient clipping.
        max_len: Passed to ``dynamic_decoder`` as ``max_len`` for inference.
    """
    # post tokens, batch_size * length
    self.post_string = tf.placeholder(dtype=tf.string,
                                      shape=(None, None),
                                      name="post_string")
    # response tokens, batch_size * length
    self.response_string = tf.placeholder(dtype=tf.string,
                                          shape=(None, None),
                                          name="response_string")
    self.label_string = tf.placeholder(dtype=tf.string,
                                       shape=(None, None),
                                       name="label_string")
    # post lengths
    self.post_len = tf.placeholder(dtype=tf.int32,
                                   shape=(None, ),
                                   name="post_len")
    # response lengths
    # NOTE(review): the op name "reponse_len" looks like a typo, but it is a
    # runtime graph name and is therefore left unchanged.
    self.response_len = tf.placeholder(dtype=tf.int32,
                                       shape=(None, ),
                                       name="reponse_len")
    # word embeddings, trained as a variable
    self.embed = tf.get_variable(dtype=tf.float32,
                                 initializer=embed,
                                 name="embed")
    # vocabulary
    self.vocabulary = tf.constant(vocabulary, dtype=tf.string)

    self.batch_size = tf.shape(self.post_string)[0]
    self.encoder_len = tf.shape(self.post_string)[1]
    self.decoder_len = tf.shape(self.response_string)[1]
    # One-hot at index response_len - 1, reverse-cumsummed along time:
    # 1.0 for every step up to and including that index, 0.0 afterwards.
    self.mask = tf.cumsum(tf.one_hot(self.response_len - 1,
                                     self.decoder_len),
                          axis=1,
                          reverse=True)

    # table mapping token strings to ids
    self.string_to_id = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=1,
                                         shared_name="string_to_id",
                                         name="string_to_id",
                                         checkpoint=True)
    # table mapping ids back to token strings
    self.id_to_string = MutableHashTable(key_dtype=tf.int64,
                                         value_dtype=tf.string,
                                         default_value="_NDW",
                                         shared_name="id_to_string",
                                         name="id_to_string",
                                         checkpoint=True)

    # convert post / response / label to ids, each batch_size * length
    self.post_id = self.string_to_id.lookup(self.post_string)
    self.response_id = self.string_to_id.lookup(self.response_string)
    self.label_id = self.string_to_id.lookup(self.label_string)

    # convert post and response ids to embeddings,
    # batch_size * length * embed_size.
    # Looked up from the variable so the embedding is actually trained.
    self.post_embed = tf.nn.embedding_lookup(self.embed, self.post_id)
    self.response_embed = tf.nn.embedding_lookup(self.embed,
                                                 self.response_id)

    # encoder and decoder stacks
    encoder_cell = MultiRNNCell(
        [LSTMCell(num_units) for _ in range(num_layers)])
    decoder_cell = MultiRNNCell(
        [LSTMCell(num_units) for _ in range(num_layers)])
    projection_fn, loss_fn, inference_fn = get_project_funtion(
        vocabulary_count)

    with tf.variable_scope("encoder"):
        # Per the original author's notes: encoder_output is
        # [batch_size, encoder_len, num_units] and encoder_state is
        # [num_layers, 2, batch_size, num_units].
        self.encoder_output, self.encoder_state = tf.nn.dynamic_rnn(
            encoder_cell,
            self.post_embed,
            self.post_len,
            dtype=tf.float32)

    with tf.variable_scope("decoder"):
        # Training decoder: fed the embedded response.
        self.decoder_output, self.decoder_state, self.loop_state = \
            dynamic_decoder(decoder_cell,
                            encoder_state=self.encoder_state,
                            input=self.response_embed,
                            response_len=self.response_len)
        self.loss, self.avg_loss = loss_fn(self.decoder_output,
                                           self.label_id, self.mask)

    with tf.variable_scope("decoder", reuse=True):
        # Inference decoder: reuses the decoder variables and is given the
        # projection function and the embedding variable instead of inputs.
        (self.inference_output, self.inference_state,
         self.inference_loop_state) = dynamic_decoder(
             decoder_cell,
             encoder_state=self.encoder_state,
             projection_function=projection_fn,
             embed=self.embed,
             max_len=max_len)
        self.inference_maximum_likelihood_id = inference_fn(
            self.inference_output)
        # [batch_size, decoder_len]
        self.inference_string = self.id_to_string.lookup(
            self.inference_maximum_likelihood_id)

    self.global_step = tf.Variable(0, trainable=False, name="global_step")
    # Includes non-trainable variables such as global_step; their gradient
    # is None (assumes clip_by_global_norm / apply_gradients skip None
    # entries, as the original relied on -- confirm).
    self.params = tf.global_variables()

    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    self.saver = tf.train.Saver(max_to_keep=3)
class CVAE(object):
    """Conditional VAE response generator built as a TensorFlow 1.x graph.

    The graph encodes a post, a reference and a response, derives a
    recognition distribution (from post + ref + response) and a prior
    distribution (from post + ref), samples a latent vector from one of them
    depending on the ``use_prior`` placeholder, and decodes a response with
    teacher forcing, greedy search and beam search.

    NOTE(review): the original text of this class was whitespace-collapsed,
    so the nesting of ``tf.variable_scope`` blocks is reconstructed from
    statement order. Verify variable names (including optimizer slots)
    against existing checkpoints.
    """

    def __init__(self, tfFLAGS, embed=None):
        """Read hyper-parameters from ``tfFLAGS`` and build the whole graph.

        Args:
            tfFLAGS: Object exposing the hyper-parameters read below
                (``vocab_size``, ``embed_size``, ``num_units``, ...,
                ``opt``, ``learning_rate``).
            embed: Optional initial value for the shared embedding matrix.
        """
        self.vocab_size = tfFLAGS.vocab_size
        self.embed_size = tfFLAGS.embed_size
        self.num_units = tfFLAGS.num_units
        self.num_layers = tfFLAGS.num_layers
        self.beam_width = tfFLAGS.beam_width
        self.use_lstm = tfFLAGS.use_lstm
        self.attn_mode = tfFLAGS.attn_mode
        self.train_keep_prob = tfFLAGS.keep_prob
        self.max_decode_len = tfFLAGS.max_decode_len
        self.bi_encode = tfFLAGS.bi_encode
        self.recog_hidden_units = tfFLAGS.recog_hidden_units
        self.prior_hidden_units = tfFLAGS.prior_hidden_units
        self.z_dim = tfFLAGS.z_dim
        self.full_kl_step = tfFLAGS.full_kl_step
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.max_gradient_norm = 5.0

        if tfFLAGS.opt == 'SGD':
            self.learning_rate = tf.Variable(float(tfFLAGS.learning_rate),
                                             trainable=False,
                                             dtype=tf.float32)
            self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * tfFLAGS.learning_rate_decay_factor)
            self.opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        elif tfFLAGS.opt == 'Momentum':
            # Record the rate like the other branches do, so that
            # ``self.learning_rate`` exists for every optimizer choice.
            self.learning_rate = tfFLAGS.learning_rate
            self.opt = tf.train.MomentumOptimizer(
                learning_rate=tfFLAGS.learning_rate,
                momentum=tfFLAGS.momentum)
        else:
            self.learning_rate = tfFLAGS.learning_rate
            # NOTE(review): AdamOptimizer() is built without the configured
            # learning rate, so tfFLAGS.learning_rate is only recorded here,
            # not applied. Left unchanged; confirm whether this is intended.
            self.opt = tf.train.AdamOptimizer()

        self._make_input(embed)

        with tf.variable_scope("output_layer"):
            self.output_layer = Dense(
                self.vocab_size,
                kernel_initializer=tf.truncated_normal_initializer(
                    stddev=0.1))

        with tf.variable_scope("encoders",
                               initializer=tf.orthogonal_initializer()):
            self.enc_post_outputs, self.enc_post_state = \
                self._build_encoder(scope='post_encoder',
                                    inputs=self.enc_post,
                                    sequence_length=self.post_len)
            self.enc_ref_outputs, self.enc_ref_state = \
                self._build_encoder(scope='ref_encoder',
                                    inputs=self.enc_ref,
                                    sequence_length=self.ref_len)
            self.enc_response_outputs, self.enc_response_state = \
                self._build_encoder(scope='resp_encoder',
                                    inputs=self.enc_response,
                                    sequence_length=self.response_len)

        self.post_state = self._get_representation_from_enc_state(
            self.enc_post_state)
        self.ref_state = self._get_representation_from_enc_state(
            self.enc_ref_state)
        self.response_state = self._get_representation_from_enc_state(
            self.enc_response_state)
        # Condition = post representation concatenated with ref representation.
        self.cond_embed = tf.concat([self.post_state, self.ref_state],
                                    axis=-1)

        with tf.variable_scope("RecognitionNetwork"):
            # Sees the response in addition to the condition.
            recog_input = tf.concat([self.cond_embed, self.response_state],
                                    axis=-1)
            recog_hidden = tf.layers.dense(inputs=recog_input,
                                           units=self.recog_hidden_units,
                                           activation=tf.nn.tanh)
            recog_mulogvar = tf.layers.dense(inputs=recog_hidden,
                                             units=self.z_dim * 2,
                                             activation=None)
            # First half of the output is mu, second half is log-variance.
            recog_mu, recog_logvar = tf.split(recog_mulogvar, 2, axis=-1)

        with tf.variable_scope("PriorNetwork"):
            prior_input = self.cond_embed
            prior_hidden = tf.layers.dense(inputs=prior_input,
                                           units=self.prior_hidden_units,
                                           activation=tf.nn.tanh)
            prior_mulogvar = tf.layers.dense(inputs=prior_hidden,
                                             units=self.z_dim * 2,
                                             activation=None)
            prior_mu, prior_logvar = tf.split(prior_mulogvar, 2, axis=-1)

        with tf.variable_scope("GenerationNetwork"):
            latent_sample = tf.cond(
                self.use_prior,
                lambda: sample_gaussian(prior_mu, prior_logvar),
                lambda: sample_gaussian(recog_mu, recog_logvar),
                name='latent_sample')

            gen_input = tf.concat([self.cond_embed, latent_sample], axis=-1)
            # One independent dense projection per layer (and per c/h for
            # LSTM). The c-before-h order is kept so that auto-generated
            # layer names stay the same.
            if self.use_lstm:
                self.dec_init_state = tuple([
                    tf.contrib.rnn.LSTMStateTuple(
                        c=tf.layers.dense(inputs=gen_input,
                                          units=self.num_units,
                                          activation=None),
                        h=tf.layers.dense(inputs=gen_input,
                                          units=self.num_units,
                                          activation=None))
                    for _ in range(self.num_layers)
                ])
                print(self.dec_init_state)
            else:
                self.dec_init_state = tuple([
                    tf.layers.dense(inputs=gen_input,
                                    units=self.num_units,
                                    activation=None)
                    for _ in range(self.num_layers)
                ])

        kld = gaussian_kld(recog_mu, recog_logvar, prior_mu, prior_logvar)
        self.avg_kld = tf.reduce_mean(kld)
        # KL weight grows linearly with global_step and is capped at 1.0
        # once global_step reaches full_kl_step.
        self.kl_weights = tf.minimum(
            tf.to_float(self.global_step) / self.full_kl_step, 1.0)
        self.kl_loss = self.kl_weights * self.avg_kld

        self._build_decoder()

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=1,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)
        for var in tf.trainable_variables():
            print(var)

    def _make_input(self, embed):
        """Create the vocabulary tables, placeholders and embedded inputs.

        Args:
            embed: Initial value for the shared embedding matrix, or ``None``
                to create a ``[vocab_size, embed_size]`` float32 variable
                with the default initializer.
        """
        # token string -> id; unknown strings map to UNK_ID
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)
        # id -> token string; unknown ids map to _UNK
        self.index2symbol = MutableHashTable(key_dtype=tf.int64,
                                             value_dtype=tf.string,
                                             default_value=_UNK,
                                             shared_name="out_table",
                                             name="out_table",
                                             checkpoint=True)

        with tf.variable_scope("input"):
            self.post_string = tf.placeholder(tf.string, (None, None),
                                              'post_string')
            self.ref_string = tf.placeholder(tf.string, (None, None),
                                             'ref_string')
            self.response_string = tf.placeholder(tf.string, (None, None),
                                                  'response_string')

            self.post = self.symbol2index.lookup(self.post_string)
            self.post_len = tf.placeholder(tf.int32, (None, ), 'post_len')
            self.ref = self.symbol2index.lookup(self.ref_string)
            self.ref_len = tf.placeholder(tf.int32, (None, ), 'ref_len')
            self.response = self.symbol2index.lookup(self.response_string)
            self.response_len = tf.placeholder(tf.int32, (None, ),
                                               'response_len')

            with tf.variable_scope("embedding"):
                if embed is None:
                    # initialize the embedding randomly
                    self.emb_enc = self.emb_dec = tf.get_variable(
                        "emb_share", [self.vocab_size, self.embed_size],
                        dtype=tf.float32)
                else:
                    # initialize the embedding by pre-trained word vectors
                    print("share pre-trained embed")
                    self.emb_enc = self.emb_dec = tf.get_variable(
                        'emb_share', dtype=tf.float32, initializer=embed)

            self.enc_post = tf.nn.embedding_lookup(self.emb_enc, self.post)
            self.enc_ref = tf.nn.embedding_lookup(self.emb_enc, self.ref)
            self.enc_response = tf.nn.embedding_lookup(self.emb_enc,
                                                       self.response)

            self.batch_len = tf.shape(self.response)[1]
            self.batch_size = tf.shape(self.response)[0]
            # Decoder input: a GO_ID column followed by the response with
            # its last column dropped.
            self.response_input = tf.concat([
                tf.ones((self.batch_size, 1), dtype=tf.int64) * GO_ID,
                tf.split(self.response, [self.batch_len - 1, 1], axis=1)[0]
            ], 1)
            self.dec_inp = tf.nn.embedding_lookup(self.emb_dec,
                                                  self.response_input)

            # Defaults to 1.0 (no dropout) unless explicitly fed.
            self.keep_prob = tf.placeholder_with_default(1.0, ())
            self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")

    def _build_encoder(self, scope, inputs, sequence_length):
        """Build one (optionally bidirectional) RNN encoder under ``scope``.

        Args:
            scope: Variable-scope name for this encoder.
            inputs: Embedded input sequence.
            sequence_length: Per-example sequence lengths.

        Returns:
            ``(enc_outputs, enc_state)``. For the bidirectional case the
            forward and backward outputs, and the per-layer states, are
            concatenated on the last axis.
        """
        with tf.variable_scope(scope):
            if not self.bi_encode:
                enc_cell = self._build_encoder_cell()
                enc_outputs, enc_state = tf.nn.dynamic_rnn(
                    cell=enc_cell,
                    inputs=inputs,
                    sequence_length=sequence_length,
                    dtype=tf.float32)
                return enc_outputs, enc_state

            cell_fw, cell_bw = self._build_biencoder_cell()
            outputs, states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                inputs=inputs,
                sequence_length=sequence_length,
                dtype=tf.float32)
            enc_outputs = tf.concat(outputs, axis=-1)

            # states[0] / states[1] are the forward / backward states;
            # merge them layer by layer.
            enc_state = []
            for i in range(self.num_layers):
                if self.use_lstm:
                    encoder_state_c = tf.concat(
                        [states[0][i].c, states[1][i].c], axis=-1)
                    encoder_state_h = tf.concat(
                        [states[0][i].h, states[1][i].h], axis=-1)
                    enc_state.append(
                        tf.contrib.rnn.LSTMStateTuple(c=encoder_state_c,
                                                      h=encoder_state_h))
                else:
                    enc_state.append(
                        tf.concat([states[0][i], states[1][i]], axis=-1))
            return enc_outputs, tuple(enc_state)

    def _get_representation_from_enc_state(self, enc_state):
        """Concatenate the per-layer states (``h`` only for LSTM) on axis -1."""
        if self.use_lstm:
            return tf.concat([state.h for state in enc_state], axis=-1)
        return tf.concat(enc_state, axis=-1)

    def _build_decoder(self):
        """Build the training, greedy and beam-search decoders and the train op.

        Sets ``sen_loss``, ``ppl_loss``, ``elbo``, ``train_op``,
        ``train_out``, ``inference`` and ``beam_out`` on ``self``.
        """
        with tf.variable_scope("decode",
                               initializer=tf.orthogonal_initializer()):
            dec_cell, init_state = self._build_decoder_cell(
                self.enc_post_outputs, self.post_len, self.dec_init_state)

            train_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=self.dec_inp, sequence_length=self.response_len)
            train_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=dec_cell,
                helper=train_helper,
                initial_state=init_state,
                output_layer=self.output_layer)
            train_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=train_decoder,
                maximum_iterations=self.max_decode_len,
            )
            logits = train_output.rnn_output

            mask = tf.sequence_mask(self.response_len,
                                    self.batch_len,
                                    dtype=tf.float32)
            crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.response, logits=logits)
            crossent = tf.reduce_sum(crossent * mask)
            # Summed cross-entropy averaged over examples.
            self.sen_loss = crossent / tf.to_float(self.batch_size)

            # Summed cross-entropy averaged over unmasked time steps; the
            # original author notes this matches
            # tf.contrib.seq2seq.sequence_loss(rnn_output, response, mask).
            self.ppl_loss = crossent / tf.reduce_sum(mask)

            # add kld:
            self.elbo = self.sen_loss + self.kl_loss

            # Calculate and clip gradients
            params = tf.trainable_variables()
            gradients = tf.gradients(self.elbo, params)
            clipped_gradients, _ = tf.clip_by_global_norm(
                gradients, self.max_gradient_norm)
            self.train_op = self.opt.apply_gradients(
                zip(clipped_gradients, params),
                global_step=self.global_step)

            self.train_out = self.index2symbol.lookup(
                tf.cast(train_output.sample_id, tf.int64), name='train_out')

        with tf.variable_scope("decode", reuse=True):
            # Greedy decoding, reusing the training decoder's variables.
            dec_cell, init_state = self._build_decoder_cell(
                self.enc_post_outputs, self.post_len, self.dec_init_state)

            start_tokens = tf.tile(tf.constant([GO_ID], dtype=tf.int32),
                                   [self.batch_size])
            end_token = EOS_ID
            infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                self.emb_dec, start_tokens, end_token)
            infer_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=dec_cell,
                helper=infer_helper,
                initial_state=init_state,
                output_layer=self.output_layer)
            infer_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=infer_decoder,
                maximum_iterations=self.max_decode_len,
            )
            self.inference = self.index2symbol.lookup(
                tf.cast(infer_output.sample_id, tf.int64), name='inference')

        with tf.variable_scope("decode", reuse=True):
            # Beam search: state, memory and lengths are tiled beam_width
            # times before the cell is built.
            dec_init_state = tf.contrib.seq2seq.tile_batch(
                self.dec_init_state, self.beam_width)
            enc_outputs = tf.contrib.seq2seq.tile_batch(
                self.enc_post_outputs, self.beam_width)
            post_len = tf.contrib.seq2seq.tile_batch(self.post_len,
                                                     self.beam_width)

            dec_cell, init_state = self._build_decoder_cell(
                enc_outputs,
                post_len,
                dec_init_state,
                beam_width=self.beam_width)

            beam_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                cell=dec_cell,
                embedding=self.emb_dec,
                start_tokens=tf.ones_like(self.post_len) * GO_ID,
                end_token=EOS_ID,
                initial_state=init_state,
                beam_width=self.beam_width,
                output_layer=self.output_layer)
            beam_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=beam_decoder,
                maximum_iterations=self.max_decode_len,
            )
            self.beam_out = self.index2symbol.lookup(
                tf.cast(beam_output.predicted_ids, tf.int64),
                name='beam_out')

    def _build_rnn_cell(self, num_units):
        """Return ``num_layers`` stacked LSTM or GRU cells with dropout.

        Each layer is wrapped in a ``DropoutWrapper`` that receives
        ``self.keep_prob`` as its second positional argument, exactly as the
        three cell builders did individually before.
        """
        if self.use_lstm:
            cell_cls = tf.contrib.rnn.LSTMCell
        else:
            cell_cls = tf.contrib.rnn.GRUCell
        return tf.contrib.rnn.MultiRNNCell([
            tf.contrib.rnn.DropoutWrapper(cell_cls(num_units), self.keep_prob)
            for _ in range(self.num_layers)
        ])

    def _build_encoder_cell(self):
        """Return the unidirectional encoder cell (``num_units`` per layer)."""
        return self._build_rnn_cell(self.num_units)

    def _build_biencoder_cell(self):
        """Return ``(cell_fw, cell_bw)``, each with half of ``num_units``.

        Floor division keeps the size an int on Python 3 as well; on
        Python 2 it is identical to the previous ``/`` on ints.
        """
        half_units = self.num_units // 2
        cell_fw = self._build_rnn_cell(half_units)
        cell_bw = self._build_rnn_cell(half_units)
        return cell_fw, cell_bw

    def _build_decoder_cell(self,
                            memory,
                            memory_len,
                            encode_state,
                            beam_width=1):
        """Build the decoder cell, optionally wrapped with attention.

        Args:
            memory: Encoder outputs to attend over.
            memory_len: Lengths of the memory sequences.
            encode_state: State used to initialise the decoder.
            beam_width: Multiplier applied to the batch size when creating
                the attention wrapper's zero state.

        Returns:
            ``(cell, initial_state)``. When ``attn_mode`` is neither
            ``'Luong'`` nor ``'Bahdanau'`` the plain cell and
            ``encode_state`` are returned unchanged.
        """
        cell = self._build_rnn_cell(self.num_units)

        if self.attn_mode == 'Luong':
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                num_units=self.num_units,
                memory=memory,
                memory_sequence_length=memory_len,
                scale=True)
        elif self.attn_mode == 'Bahdanau':
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units=self.num_units,
                memory=memory,
                memory_sequence_length=memory_len,
                scale=True)
        else:
            return cell, encode_state

        attn_cell = tf.contrib.seq2seq.AttentionWrapper(
            cell=cell,
            attention_mechanism=attention_mechanism,
            attention_layer_size=self.num_units,
        )
        # Zero attention state with the cell state replaced by encode_state.
        init_state = attn_cell.zero_state(
            self.batch_size * beam_width,
            tf.float32).clone(cell_state=encode_state)
        return attn_cell, init_state

    def initialize(self, sess, vocab):
        """Initialise all variables and fill both vocabulary tables.

        Args:
            sess: Session used to run the initialisers.
            vocab: Sequence of token strings; position ``i`` becomes id ``i``.
        """
        # list(...) so a real sequence (not a Python 3 range object) is
        # handed to constant_op.constant.
        ids = list(range(len(vocab)))
        op_in = self.symbol2index.insert(
            constant_op.constant(vocab),
            constant_op.constant(ids, dtype=tf.int64))
        op_out = self.index2symbol.insert(
            constant_op.constant(ids, dtype=tf.int64),
            constant_op.constant(vocab))
        sess.run(tf.global_variables_initializer())
        sess.run([op_in, op_out])

    def step(self, sess, data, is_train=False):
        """Run one training or evaluation step.

        Args:
            sess: Session (or any object with ``run(fetches, feed_dict)``).
            data: Mapping with keys ``'post'``, ``'post_len'``, ``'ref'``,
                ``'ref_len'``, ``'response'`` and ``'response_len'``.
            is_train: When true, ``train_op`` is run as well and dropout uses
                ``self.train_keep_prob``.

        Returns:
            The result of ``sess.run`` for
            ``[ppl_loss, elbo, sen_loss, kl_loss, avg_kld, kl_weights]``,
            preceded by ``train_op`` when ``is_train`` is true.
        """
        input_feed = {
            self.post_string: data['post'],
            self.post_len: data['post_len'],
            self.ref_string: data['ref'],
            self.ref_len: data['ref_len'],
            self.response_string: data['response'],
            self.response_len: data['response_len'],
            # NOTE(review): use_prior is fed is_train, so training samples
            # the latent from the prior network and evaluation from the
            # recognition network -- the reverse of the usual CVAE setup.
            # Behaviour is left unchanged; confirm whether it is intended.
            self.use_prior: is_train,
        }
        output_feed = [
            self.ppl_loss,
            self.elbo,
            self.sen_loss,
            self.kl_loss,
            self.avg_kld,
            self.kl_weights,
        ]
        if is_train:
            output_feed.insert(0, self.train_op)
            input_feed[self.keep_prob] = self.train_keep_prob
        return sess.run(output_feed, input_feed)
def __init__( self, num_symbols, # 词汇表size num_embed_units, # 词嵌入size num_units, # RNN 每层单元数 num_layers, # RNN 层数 embed, # 词嵌入 entity_embed=None, # 实体+关系的嵌入 num_entities=0, # 实体+关系的总个数 num_trans_units=100, # 实体嵌入的维度 memory_units=100, learning_rate=0.0001, # 学习率 learning_rate_decay_factor=0.95, # 学习率衰退,并没有采用这种方式 max_gradient_norm=5.0, # num_samples=500, # 样本个数,sampled softmax max_length=60, mem_use=True, output_alignments=True, use_lstm=False): self.posts = tf.placeholder(tf.string, (None, None), 'enc_inps') # [batch_size, encoder_len] self.posts_length = tf.placeholder(tf.int32, (None), 'enc_lens') # [batch_size] self.responses = tf.placeholder( tf.string, (None, None), 'dec_inps') # [batch_size, decoder_len] self.responses_length = tf.placeholder(tf.int32, (None), 'dec_lens') # [batch_size] self.entities = tf.placeholder( tf.string, (None, None, None), 'entities') # [batch_size, triple_num, triple_len] self.entity_masks = tf.placeholder(tf.string, (None, None), 'entity_masks') # 没用到 self.triples = tf.placeholder( tf.string, (None, None, None, 3), 'triples') # [batch_size, triple_num, triple_len, 3] self.posts_triple = tf.placeholder( tf.int32, (None, None, 1), 'enc_triples') # [batch_size, encoder_len, 1] self.responses_triple = tf.placeholder( tf.string, (None, None, 3), 'dec_triples') # [batch_size, decoder_len, 3] self.match_triples = tf.placeholder( tf.int32, (None, None, None), 'match_triples') # [batch_size, decoder_len, triple_num] # 编码器batch_size,编码器encoder_len encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts)) triple_num = tf.shape(self.triples)[1] # 知识图个数 triple_len = tf.shape(self.triples)[2] # 知识三元组个数 # 使用的知识三元组 one_hot_triples = tf.one_hot( self.match_triples, triple_len) # [batch_size, decoder_len, triple_num, triple_len] # 用 1 标注了哪个时间步产生的回复用了知识三元组 use_triples = tf.reduce_sum(one_hot_triples, axis=[2, 3]) # [batch_size, decoder_len] # 词汇映射到index的hash table self.symbol2index = MutableHashTable( key_dtype=tf.string, # key张量的类型 
value_dtype=tf.int64, # value张量的类型 default_value=UNK_ID, # 缺少key的默认值 shared_name= "in_table", # If non-empty, this table will be shared under the given name across multiple sessions name="in_table", # 操作名 checkpoint=True ) # if True, the contents of the table are saved to and restored from checkpoints. If shared_name is empty for a checkpointed table, it is shared using the table node name. # index映射到词汇的hash table self.index2symbol = MutableHashTable(key_dtype=tf.int64, value_dtype=tf.string, default_value='_UNK', shared_name="out_table", name="out_table", checkpoint=True) # 实体映射到index的hash table self.entity2index = MutableHashTable(key_dtype=tf.string, value_dtype=tf.int64, default_value=NONE_ID, shared_name="entity_in_table", name="entity_in_table", checkpoint=True) # index映射到实体的hash table self.index2entity = MutableHashTable(key_dtype=tf.int64, value_dtype=tf.string, default_value='_NONE', shared_name="entity_out_table", name="entity_out_table", checkpoint=True) self.posts_word_id = self.symbol2index.lookup( self.posts) # [batch_size, encoder_len] self.posts_entity_id = self.entity2index.lookup( self.posts) # [batch_size, encoder_len] self.responses_target = self.symbol2index.lookup( self.responses) # [batch_size, decoder_len] # 获得解码器的batch_size,decoder_len batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape( self.responses)[1] # 去掉responses_target的最后一列,给第一列加上GO_ID self.responses_word_id = tf.concat([ tf.ones([batch_size, 1], dtype=tf.int64) * GO_ID, tf.split(self.responses_target, [decoder_len - 1, 1], 1)[0] ], 1) # [batch_size, decoder_len] # 得到response的mask self.decoder_mask = tf.reshape( tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len), reverse=True, axis=1), [-1, decoder_len]) # [batch_size, decoder_len] # 初始化词嵌入和实体嵌入,传入了参数就直接赋值,没有的话就随机初始化 if embed is None: self.embed = tf.get_variable('word_embed', [num_symbols, num_embed_units], tf.float32) else: self.embed = tf.get_variable('word_embed', dtype=tf.float32, initializer=embed) if 
entity_embed is None: # 实体嵌入不随着模型的训练而更新 self.entity_trans = tf.get_variable( 'entity_embed', [num_entities, num_trans_units], tf.float32, trainable=False) else: self.entity_trans = tf.get_variable('entity_embed', dtype=tf.float32, initializer=entity_embed, trainable=False) # 将实体嵌入传入一个全连接层 self.entity_trans_transformed = tf.layers.dense( self.entity_trans, num_trans_units, activation=tf.tanh, name='trans_transformation') # 添加['_NONE', '_PAD_H', '_PAD_R', '_PAD_T', '_NAF_H', '_NAF_R', '_NAF_T']这7个的嵌入 padding_entity = tf.get_variable('entity_padding_embed', [7, num_trans_units], dtype=tf.float32, initializer=tf.zeros_initializer()) self.entity_embed = tf.concat( [padding_entity, self.entity_trans_transformed], axis=0) # triples_embedding: [batch_size, triple_num, triple_len, 3*num_trans_units] 知识图三元组的嵌入 triples_embedding = tf.reshape( tf.nn.embedding_lookup(self.entity_embed, self.entity2index.lookup(self.triples)), [encoder_batch_size, triple_num, -1, 3 * num_trans_units]) # entities_word_embedding: [batch_size, triple_num*triple_len, num_embed_units] 知识图中用到的所有实体的嵌入 entities_word_embedding = tf.reshape( tf.nn.embedding_lookup(self.embed, self.symbol2index.lookup(self.entities)), [encoder_batch_size, -1, num_embed_units]) # 分离知识图三元组的头、关系和尾 [batch_size, triple_num, triple_len, num_trans_units] head, relation, tail = tf.split(triples_embedding, [num_trans_units] * 3, axis=3) # 静态图注意力机制 with tf.variable_scope('graph_attention'): # 将头尾连接起来 [batch_size, triple_num, triple_len, 2*num_trans_units] head_tail = tf.concat([head, tail], axis=3) # 将头尾送入全连接层 [batch_size, triple_num, triple_len, num_trans_units] head_tail_transformed = tf.layers.dense(head_tail, num_trans_units, activation=tf.tanh, name='head_tail_transform') # 将关系送入全连接层 [batch_size, triple_num, triple_len, num_trans_units] relation_transformed = tf.layers.dense(relation, num_trans_units, name='relation_transform') # 求头尾和关系两个向量的内积,获得对三元组的注意力系数 e_weight = tf.reduce_sum( relation_transformed * head_tail_transformed, 
axis=3) # [batch_size, triple_num, triple_len] alpha_weight = tf.nn.softmax( e_weight) # [batch_size, triple_num, triple_len] # tf.expand_dims 使 alpha_weight 维度+1 [batch_size, triple_num, triple_len, 1] # 对第2个维度求和,由此产生静态图的向量表示 graph_embed = tf.reduce_sum( tf.expand_dims(alpha_weight, 3) * head_tail, axis=2) # [batch_size, triple_num, 2*num_trans_units] """graph_embed_input 1、首先一维的range列表[0, 1, 2... encoder_batch_size个]转化成三维的[encoder_batch_size, 1, 1]的矩阵 [[[0]], [[1]], [[2]],...] 2、然后tf.tile将矩阵的第1维复制encoder_len遍,变成[encoder_batch_size, encoder_len, 1] [[[0],[0]...]],...] 3、与posts_triple: [batch_size, encoder_len, 1]在第2维上进行拼接,形成一个indices: [batch_size, encoder_len, 2]矩阵, indices矩阵: [ [[0 0], [0 0], [0 0], [0 0], [0 1], [0 0], [0 2], [0 0],...encoder_len], [[1 0], [1 0], [1 0], [1 0], [1 1], [1 0], [1 2], [1 0],...encoder_len], [[2 0], [2 0], [2 0], [2 0], [2 1], [2 0], [2 2], [2 0],...encoder_len] ,...batch_size ] 4、tf.gather_nd根据索引检索graph_embed: [batch_size, triple_num, 2*num_trans_units]再回填至indices矩阵 indices矩阵最后一个维度是2,例如有[0, 2],表示这个时间步第1个batch用了第2个图, 则找到这个知识图的静态图向量填入到indices矩阵的[0, 2]位置最后得到结果维度 [encoder_batch_size, encoder_len, 2*num_trans_units]表示每个时间步用的静态图向量 """ # graph_embed_input = tf.gather_nd(graph_embed, tf.concat( # [tf.tile(tf.reshape(tf.range(encoder_batch_size, dtype=tf.int32), [-1, 1, 1]), [1, encoder_len, 1]), # self.posts_triple], # axis=2)) # 将responses_triple转化成实体嵌入 [batch_size, decoder_len, 300],标识了response每个时间步用了哪个三元组的嵌入 # triple_embed_input = tf.reshape( # tf.nn.embedding_lookup(self.entity_embed, self.entity2index.lookup(self.responses_triple)), # [batch_size, decoder_len, 3 * num_trans_units]) post_word_input = tf.nn.embedding_lookup( self.embed, self.posts_word_id) # [batch_size, encoder_len, num_embed_units] response_word_input = tf.nn.embedding_lookup( self.embed, self.responses_word_id ) # [batch_size, decoder_len, num_embed_units] # post_word_input和graph_embed_input拼接构成编码器输入 [batch_size, encoder_len, num_embed_units+2*num_trans_units] # 
self.encoder_input = tf.concat([post_word_input, graph_embed_input], axis=2) # response_word_input和triple_embed_input拼接构成解码器输入 [batch_size, decoder_len, num_embed_units+3*num_trans_units] # self.decoder_input = tf.concat([response_word_input, triple_embed_input], axis=2) encoder_cell = MultiRNNCell( [GRUCell(num_units) for _ in range(num_layers)]) decoder_cell = MultiRNNCell( [GRUCell(num_units) for _ in range(num_layers)]) # rnn encoder # encoder_state: [num_layers, 2, batch_size, num_units] 编码器输出状态 LSTM GRU:[num_layers, batch_size, num_units] encoder_output, encoder_state = tf.nn.dynamic_rnn(encoder_cell, post_word_input, self.posts_length, dtype=tf.float32, scope="encoder") # self.encoder_state_shape = tf.shape(encoder_state) ########记忆网络 ### response_encoder_cell = MultiRNNCell( [GRUCell(num_units) for _ in range(num_layers)]) response_encoder_output, response_encoder_state = tf.nn.dynamic_rnn( response_encoder_cell, response_word_input, self.responses_length, dtype=tf.float32, scope="response_encoder") # graph_embed: [batch_size, triple_num, 2*num_trans_units] 静态图向量 # encoder_state: [num_layers, batch_size, num_units] with tf.variable_scope("post_memory_network"): # 将静态知识图转化成输入向量m post_input = tf.layers.dense(graph_embed, memory_units, use_bias=False, name="post_weight_a") post_input = tf.tile( tf.reshape(post_input, (1, encoder_batch_size, triple_num, memory_units)), multiples=( num_layers, 1, 1, 1)) # [num_layers, batch_size, triple_num, memory_units] # 将静态知识库转化成输出向量c post_output = tf.layers.dense(graph_embed, memory_units, use_bias=False, name="post_weight_c") post_output = tf.tile( tf.reshape(post_output, (1, encoder_batch_size, triple_num, memory_units)), multiples=( num_layers, 1, 1, 1)) # [num_layers, batch_size, triple_num, memory_units] # 将question转化成状态向量u encoder_hidden_state = tf.reshape( tf.concat(encoder_state, axis=0), (num_layers, encoder_batch_size, num_units)) post_state = tf.layers.dense(encoder_hidden_state, memory_units, use_bias=False, 
name="post_weight_b") post_state = tf.tile( tf.reshape(post_state, (num_layers, encoder_batch_size, 1, memory_units)), multiples=( 1, 1, triple_num, 1)) # [num_layers, batch_size, triple_num, memory_units] # 概率p post_p = tf.reshape( tf.nn.softmax(tf.reduce_sum(post_state * post_input, axis=3)), (num_layers, encoder_batch_size, triple_num, 1)) # [num_layers, batch_size, triple_num, 1] # 输出o post_o = tf.reduce_sum( post_output * post_p, axis=2) # [num_layers, batch_size, memory_units] post_xstar = tf.concat( [ tf.layers.dense(post_o, memory_units, use_bias=False, name="post_weight_r"), encoder_state ], axis=2) # [num_layers, batch_size, num_units+memory_units] with tf.variable_scope("response_memory_network"): # 将静态知识图转化成输入向量m response_input = tf.layers.dense(graph_embed, memory_units, use_bias=False, name="response_weight_a") response_input = tf.tile( tf.reshape(response_input, (1, batch_size, triple_num, memory_units)), multiples=( num_layers, 1, 1, 1)) # [num_layers, batch_size, triple_num, memory_units] # 将静态知识库转化成输出向量c response_output = tf.layers.dense(graph_embed, memory_units, use_bias=False, name="response_weight_c") response_output = tf.tile( tf.reshape(response_output, (1, batch_size, triple_num, memory_units)), multiples=( num_layers, 1, 1, 1)) # [num_layers, batch_size, triple_num, memory_units] # 将question转化成状态向量u response_hidden_state = tf.reshape( tf.concat(response_encoder_state, axis=0), (num_layers, batch_size, num_units)) response_state = tf.layers.dense(response_hidden_state, memory_units, use_bias=False, name="response_weight_b") response_state = tf.tile( tf.reshape(response_state, (num_layers, batch_size, 1, memory_units)), multiples=( 1, 1, triple_num, 1)) # [num_layers, batch_size, triple_num, memory_units] # 概率p response_p = tf.reshape( tf.nn.softmax( tf.reduce_sum(response_state * response_input, axis=3)), (num_layers, batch_size, triple_num, 1)) # [num_layers, batch_size, triple_num, 1] # 输出o response_o = tf.reduce_sum( response_output * 
response_p, axis=2) # [num_layers, batch_size, memory_units] response_ystar = tf.concat( [ tf.layers.dense(response_o, memory_units, use_bias=False, name="response_weight_r"), response_encoder_state ], axis=2) # [num_layers, batch_size, num_units+memory_units] with tf.variable_scope("memory_network"): memory_hidden_state = tf.layers.dense(tf.concat( [post_xstar, response_ystar], axis=2), num_units, use_bias=False, activation=tf.tanh, name="output_weight") memory_hidden_state = tf.reshape( memory_hidden_state, (num_layers * batch_size, num_units)) # [num_layers, batch_size, num_units] memory_hidden_state = tuple( tf.split(memory_hidden_state, [batch_size] * num_layers, axis=0)) # self.memory_hidden_state_shape = tf.shape(memory_hidden_state) ######## ### output_fn, selector_fn, sequence_loss, sampled_sequence_loss, total_loss =\ output_projection_layer(num_units, num_symbols, num_samples) ########用于训练的decoder ### with tf.variable_scope('decoder'): attention_keys_init, attention_values_init, attention_score_fn_init, attention_construct_fn_init \ = prepare_attention(encoder_output, 'bahdanau', num_units, imem=(graph_embed, triples_embedding), output_alignments=output_alignments and mem_use) # 训练时处理每个时间步输出和下个时间步输入的函数 decoder_fn_train = attention_decoder_fn_train( memory_hidden_state, attention_keys_init, attention_values_init, attention_score_fn_init, attention_construct_fn_init, output_alignments=output_alignments and mem_use, max_length=tf.reduce_max(self.responses_length)) self.decoder_output, _, alignments_ta = dynamic_rnn_decoder( decoder_cell, decoder_fn_train, response_word_input, self.responses_length, scope="decoder_rnn") if output_alignments: self.alignments = tf.transpose(alignments_ta.stack(), perm=[1, 0, 2, 3]) self.decoder_loss, self.ppx_loss, self.sentence_ppx = total_loss( self.decoder_output, self.responses_target, self.decoder_mask, self.alignments, triples_embedding, use_triples, one_hot_triples) self.sentence_ppx = tf.identity(self.sentence_ppx, 
name='ppx_loss') else: self.decoder_loss = sequence_loss(self.decoder_output, self.responses_target, self.decoder_mask) ######## ### ########用于推导的decoder ### with tf.variable_scope('decoder', reuse=True): attention_keys, attention_values, attention_score_fn, attention_construct_fn \ = prepare_attention(encoder_output, 'bahdanau', num_units, reuse=True, imem=(graph_embed, triples_embedding), output_alignments=output_alignments and mem_use) decoder_fn_inference = \ attention_decoder_fn_inference(output_fn, memory_hidden_state, attention_keys, attention_values, attention_score_fn, attention_construct_fn, self.embed, GO_ID, EOS_ID, max_length, num_symbols, imem=(entities_word_embedding, # imem: ([batch_size,triple_num*triple_len,num_embed_units], tf.reshape(triples_embedding, [encoder_batch_size, -1, 3*num_trans_units])), # [encoder_batch_size, triple_num*triple_len, 3*num_trans_units]) 实体词嵌入和三元组嵌入的元组 selector_fn=selector_fn) # decoder_distribution: [batch_size, decoder_len, num_symbols] # output_ids_ta: tensorarray: decoder_len [batch_size] self.decoder_distribution, _, output_ids_ta = dynamic_rnn_decoder( decoder_cell, decoder_fn_inference, scope="decoder_rnn") output_len = tf.shape(self.decoder_distribution)[1] # decoder_len output_ids = tf.transpose( output_ids_ta.gather( tf.range(output_len))) # [batch_size, decoder_len] # 对output的值域行裁剪,因为存在负值表示用了实体词 word_ids = tf.cast(tf.clip_by_value(output_ids, 0, num_symbols), tf.int64) # [batch_size, decoder_len] # 计算的是实体词在entities中的实际位置 [batch_size, decoder_len] # 1、tf.shape(entities_word_embedding)[1] = triple_num*triple_len # 2、tf.range(encoder_batch_size): [batch_size] # 3、tf.reshape(tf.range(encoder_batch_size) * tf.shape(entities_word_embedding)[1], [-1, 1]): [batch_size, 1] 实体词在entities中的基地址 # 4、tf.clip_by_value(-output_ids, 0, num_symbols): [batch_size, decoder_len] 实体词在entities中的偏移量 # 5、entity_ids: [batch_size, decoder_len] 实体词在entities中的实际位置 entity_ids = tf.reshape( tf.clip_by_value(-output_ids, 0, num_symbols) + 
tf.reshape( tf.range(encoder_batch_size) * tf.shape(entities_word_embedding)[1], [-1, 1]), [-1]) # 计算的是所用的实体词 [batch_size, decoder_len] # 1、entities: [batch_size, triple_num, triple_len] # 2、tf.reshape(self.entities, [-1]): [batch_size*triple_num*triple_len] # 3、tf.gather: [batch_size*decoder_len] # 4、entities: [batch_size, decoder_len] entities = tf.reshape( tf.gather(tf.reshape(self.entities, [-1]), entity_ids), [-1, output_len]) words = self.index2symbol.lookup(word_ids) # 将id转化为实际的词 # output_ids>0为bool张量,True的位置用words中该位置的词替换 self.generation = tf.where(output_ids > 0, words, entities) self.generation = tf.identity( self.generation, name='generation') # [batch_size, decoder_len] ######## ### # 初始化训练过程 self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=tf.float32) # 并没有使用衰退的学习率 self.learning_rate_decay_op = self.learning_rate.assign( self.learning_rate * learning_rate_decay_factor) # 更新参数的次数 self.global_step = tf.Variable(0, trainable=False) # 要训练的参数 self.params = tf.global_variables() # 选择优化算法 opt = tf.train.AdamOptimizer(learning_rate=learning_rate) self.lr = opt._lr # 根据 decoder_loss 计算 params 梯度 gradients = tf.gradients(self.decoder_loss, self.params) # 梯度裁剪 clipped_gradients, self.gradient_norm = tf.clip_by_global_norm( gradients, max_gradient_norm) self.update = opt.apply_gradients(zip(clipped_gradients, self.params), global_step=self.global_step) # 记录损失 tf.summary.scalar('decoder_loss', self.decoder_loss) for each in tf.trainable_variables(): tf.summary.histogram(each.name, each) # 记录变量的训练情况 self.merged_summary_op = tf.summary.merge_all() self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2, max_to_keep=3, pad_step_number=True, keep_checkpoint_every_n_hours=1.0) self.saver_epoch = tf.train.Saver(write_version=tf.train.SaverDef.V2, max_to_keep=1000, pad_step_number=True)
def __init__(self, num_lstm_units, embed, neg_num=4, gradient_clip_threshold=5.0):
    """Build the query/document similarity graph and its training ops.

    Args:
        num_lstm_units: size handed to both ``SimpleLSTMCell`` instances.
        embed: initial value of the shared ``embed`` variable.
        neg_num: number of negative documents; the graph holds
            ``neg_num + 1`` document slots, and the loss maximises the
            softmax probability of slot 0.
        gradient_clip_threshold: threshold for global-norm gradient clipping.
    """
    doc_slots = neg_num + 1

    # ---- inputs --------------------------------------------------------
    self.queries = tf.placeholder(dtype=tf.string, shape=[None, None])  # [batch, len]
    self.queries_length = tf.placeholder(dtype=tf.int32, shape=[None])  # [batch]
    self.docs = tf.placeholder(
        dtype=tf.string, shape=[doc_slots, None, None])  # [neg_num + 1, batch, len]
    self.docs_length = tf.placeholder(
        dtype=tf.int32, shape=[doc_slots, None])  # [neg_num + 1, batch]

    # String -> id vocabulary table; unknown words map to UNK_ID.
    self.word2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=UNK_ID,
        shared_name="in_table",
        name="in_table",
        checkpoint=True)

    # ---- bookkeeping variables -----------------------------------------
    # These variables are unnamed, so their checkpoint names depend on
    # creation order; keep this order stable.
    self.learning_rate = tf.Variable(0.001, trainable=False, dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)
    self.momentum = tf.Variable(0.9, trainable=False, dtype=tf.float32)

    # ---- token ids and embeddings --------------------------------------
    self.index_queries = self.word2index.lookup(self.queries)  # [batch, len]
    self.index_docs = []
    for doc_tokens in tf.unstack(self.docs):
        self.index_docs.append(self.word2index.lookup(doc_tokens))

    self.embed = tf.get_variable('embed', dtype=tf.float32, initializer=embed)
    self.embed_queries = tf.nn.embedding_lookup(self.embed, self.index_queries)
    self.embed_docs = []
    for doc_ids in self.index_docs:
        self.embed_docs.append(tf.nn.embedding_lookup(self.embed, doc_ids))

    # ---- encoders ------------------------------------------------------
    with tf.variable_scope('query_lstm'):
        self.cell_q = SimpleLSTMCell(num_lstm_units)
    with tf.variable_scope('doc_lstm'):
        self.cell_d = SimpleLSTMCell(num_lstm_units)

    # dynamic_rnn(...)[1] is the final state; element [1] of that state is
    # what gets compared (presumably the hidden part of a (c, h) pair --
    # confirm against SimpleLSTMCell).
    query_rnn = dynamic_rnn(
        self.cell_q,
        self.embed_queries,
        self.queries_length,
        dtype=tf.float32,
        scope="simple_lstm_cell_query")
    self.states_q = query_rnn[1][1]

    # The same document cell encodes every slot.
    self.states_d = []
    for slot in range(doc_slots):
        doc_rnn = dynamic_rnn(
            self.cell_d,
            self.embed_docs[slot],
            self.docs_length[slot],
            dtype=tf.float32,
            scope="simple_lstm_cell_doc")
        self.states_d.append(doc_rnn[1][1])

    # ---- cosine similarity between the query and each document ---------
    self.queries_norm = tf.sqrt(tf.reduce_sum(tf.square(self.states_q), axis=1))
    self.docs_norm = [
        tf.sqrt(tf.reduce_sum(tf.square(doc_state), axis=1))
        for doc_state in self.states_d
    ]
    self.prods = [
        tf.reduce_sum(tf.multiply(self.states_q, doc_state), axis=1)
        for doc_state in self.states_d
    ]
    cosine_per_slot = [
        (dot / (self.queries_norm * doc_norm))
        for dot, doc_norm in zip(self.prods, self.docs_norm)
    ]
    self.sims = tf.convert_to_tensor(cosine_per_slot)  # [neg_num + 1, batch]

    # Trainable scaling factor applied to the similarities (per the paper).
    self.gamma = tf.Variable(initial_value=1.0, expected_shape=[], dtype=tf.float32)
    self.origin_sims = self.sims
    self.sims = self.sims * self.gamma

    # Softmax across the document slots; slot 0 is the one being maximised.
    self.prob = tf.nn.softmax(self.sims, dim=0)  # [neg_num + 1, batch]
    self.hit_prob = tf.transpose(self.prob[0])
    self.loss = -tf.reduce_mean(tf.log(self.hit_prob))

    # ---- optimisation --------------------------------------------------
    self.params = tf.trainable_variables()
    # Nesterov momentum, according to the paper.
    opt = tf.train.MomentumOptimizer(
        learning_rate=self.learning_rate,
        momentum=self.momentum,
        use_nesterov=True)
    raw_grads = tf.gradients(self.loss, self.params)
    clipped_grads, self.gradient_norm = tf.clip_by_global_norm(
        raw_grads, gradient_clip_threshold)
    self.update = opt.apply_gradients(
        zip(clipped_grads, self.params), global_step=self.global_step)

    self.saver = tf.train.Saver(
        write_version=tf.train.SaverDef.V2,
        max_to_keep=3,
        pad_step_number=True,
        keep_checkpoint_every_n_hours=1.0)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_layers,
             num_labels,
             embed,
             learning_rate=0.005,
             max_gradient_norm=5.0,
             param_da=150,
             param_r=10,
             model_choose='lstm'):
    """Build a bidirectional-RNN text classifier with multi-hop self-attention.

    Args:
        num_symbols: vocabulary size (used only when ``embed`` is None).
        num_embed_units: embedding width (used only when ``embed`` is None).
        num_units: units per RNN layer and direction.
        num_layers: number of stacked RNN layers per direction.
        num_labels: number of output classes.
        embed: pre-trained embedding matrix, or None for random init.
        learning_rate: initial value of the learning-rate variable.
        max_gradient_norm: threshold for global-norm gradient clipping.
        param_da: hidden width of the attention MLP.
        param_r: number of attention hops.
        model_choose: one of 'rnn', 'lstm', 'gru'; anything else falls
            back to 'lstm'.
    """
    self.texts = tf.placeholder(tf.string, (None, None), 'texts')  # [batch, length]
    self.texts_length = tf.placeholder(tf.int32, (None, ), 'texts_length')  # [batch]
    self.labels = tf.placeholder(tf.int64, (None, ), 'labels')  # [batch]

    # Vocabulary table (string -> index); unknown words map to UNK_ID.
    self.symbol2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=UNK_ID,
        shared_name="in_table",
        name="in_table",
        checkpoint=True)

    batch_size = tf.shape(self.texts)[0]

    # Training bookkeeping.
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False,
                                     dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)

    self.index_input = self.symbol2index.lookup(self.texts)  # [batch, length]

    # Embedding table (index -> vector).
    if embed is None:
        # Random initialisation.
        self.embed = tf.get_variable('embed', [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        # Initialise from pre-trained word vectors.
        self.embed = tf.get_variable('embed', dtype=tf.float32, initializer=embed)

    self.embed_input = tf.nn.embedding_lookup(
        self.embed, self.index_input)  # [batch, length, num_embed_units]

    # Multi-layer cells for both directions.
    if model_choose not in ['rnn', 'lstm', 'gru']:
        model_choose = 'lstm'
    cell_type = {'rnn': BasicRNNCell,
                 'lstm': BasicLSTMCell,
                 'gru': GRUCell}[model_choose]
    cell_fw = MultiRNNCell([cell_type(num_units) for x in range(num_layers)])
    cell_bw = MultiRNNCell([cell_type(num_units) for x in range(num_layers)])

    outputs, states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw, cell_bw, self.embed_input, self.texts_length,
        dtype=tf.float32, scope="rnn")
    H = tf.concat(outputs, 2)  # [batch, length, 2*num_units]

    with tf.variable_scope('logits'):
        Ws1 = tf.get_variable("Ws1", [2 * num_units, param_da])
        Ws2 = tf.get_variable("Ws2", [param_da, param_r])
        attention_scores = tf.einsum(
            "ijk,kl->ijl",
            tf.nn.tanh(tf.einsum("ijk,kl->ijl", H, Ws1)),
            Ws2)  # [batch, length, param_r]
        # Normalise over the token axis (axis 1) so each hop's weights sum
        # to 1 across the sentence.  The previous default (last axis)
        # normalised across hops, so M below was not a weighted average
        # of the token states.
        A = tf.nn.softmax(attention_scores, dim=1)  # [batch, length, param_r]
        M = tf.matmul(A, H, transpose_a=True)  # [batch, param_r, 2*num_units]
        flatten_M = tf.reshape(
            M, shape=[batch_size, param_r * 2 * num_units])  # [batch, param_r*2*num_units]
        logits = tf.layers.dense(flatten_M, num_labels, activation=None,
                                 name='projection')  # [batch, num_labels]

    # Redundancy penalty: norm of (A^T A - I) taken over the whole batch
    # tensor, pushing different hops to attend to different tokens.
    identity = tf.reshape(
        tf.tile(tf.diag(tf.ones([param_r])), [batch_size, 1]),
        [batch_size, param_r, param_r])
    self.penalized_term = tf.norm(tf.matmul(A, A, transpose_a=True) - identity)

    self.loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.labels, logits=logits),
        name='loss') + 0.0001 * self.penalized_term
    predict_labels = tf.argmax(logits, 1, 'predict_labels')
    # Despite its name this is the COUNT of correct predictions in the batch.
    self.accuracy = tf.reduce_sum(
        tf.cast(tf.equal(self.labels, predict_labels), tf.int32),
        name='accuracy')

    self.params = tf.trainable_variables()

    # Gradient computation, clipping and update.
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=5, pad_step_number=True)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_layers,
             num_labels,
             embed,
             learning_rate=0.5,
             max_gradient_norm=5.0):
    """Build a GRU text classifier with a 256-unit hidden layer.

    Args:
        num_symbols: vocabulary size (used only when ``embed`` is None).
        num_embed_units: word-embedding width (used only when ``embed`` is None).
        num_units: units per GRU layer.
        num_layers: number of stacked GRU layers; must be >= 1.  With more
            than one layer the final state of the top layer feeds the
            classifier.
        num_labels: number of output classes.
        embed: pre-trained embedding matrix, or None for random init.
        learning_rate: initial value of the learning-rate variable.
        max_gradient_norm: threshold for global-norm gradient clipping.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
    """
    if num_layers < 1:
        raise ValueError("num_layers must be >= 1, got %r" % (num_layers,))

    self.texts = tf.placeholder(dtype=tf.string, shape=[None, None],
                                name='texts')  # [batch, len]
    self.texts_length = tf.placeholder(dtype=tf.int64, shape=[None],
                                       name='texts_length')  # [batch]
    self.labels = tf.placeholder(dtype=tf.int64, shape=[None],
                                 name='labels')  # [batch]

    # Vocabulary table (string -> index); unknown words map to UNK_ID.
    self.symbol2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=UNK_ID,
        shared_name="in_table",
        name="in_table",
        checkpoint=True)

    # Training bookkeeping.
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False,
                                     dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)

    self.index_input = self.symbol2index.lookup(self.texts)  # [batch, len]

    # Embedding table (index -> vector).
    if embed is None:
        # Random initialisation.
        self.embed = tf.get_variable('embed', [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        # Initialise from pre-trained word vectors.
        self.embed = tf.get_variable('embed', dtype=tf.float32, initializer=embed)

    self.embed_input = tf.nn.embedding_lookup(
        self.embed, self.index_input)  # [batch, len, embed_unit]

    # Previously `cell` was bound only for num_layers == 1, so any other
    # value failed with NameError at dynamic_rnn.  The single-layer path is
    # kept as a bare GRUCell so its variable names stay unchanged.
    if num_layers == 1:
        cell = GRUCell(num_units)
    else:
        cell = tf.nn.rnn_cell.MultiRNNCell(
            [GRUCell(num_units) for _ in range(num_layers)])

    # NOTE(review): dropout is baked into the graph with a fixed keep
    # probability, so it is also active whenever the graph is evaluated.
    keep_prob = 0.95
    dropped_input = tf.nn.dropout(self.embed_input, keep_prob=keep_prob)

    outputs, states = dynamic_rnn(cell, dropped_input, self.texts_length,
                                  dtype=tf.float32, scope="rnn")
    if num_layers > 1:
        # A stacked cell returns one state per layer; classify from the top one.
        states = states[-1]

    # Classifier head: dropout -> dense(256, relu) -> dropout -> dense(num_labels).
    l1 = tf.nn.dropout(states, keep_prob=keep_prob)
    inner_layer = tf.layers.dense(inputs=l1, units=256, activation=tf.nn.relu)
    l2 = tf.nn.dropout(inner_layer, keep_prob=keep_prob)
    logits = tf.layers.dense(inputs=l2, units=num_labels)

    # self.loss is the SUM over the batch; gradients use the per-example mean.
    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.labels, logits=logits),
        name='loss')
    mean_loss = self.loss / tf.cast(tf.shape(self.labels)[0], dtype=tf.float32)
    predict_labels = tf.argmax(logits, 1, 'predict_labels')
    # Despite its name this is the COUNT of correct predictions in the batch.
    self.accuracy = tf.reduce_sum(
        tf.cast(tf.equal(self.labels, predict_labels), tf.int32),
        name='accuracy')

    self.params = tf.trainable_variables()

    # Gradient computation, clipping and update.
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(mean_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    # Summaries: batch loss plus a histogram per trainable variable.
    tf.summary.scalar('loss/step', self.loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)

    self.merged_summary_op = tf.summary.merge_all()

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3, pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def __init__(
        self,
        num_symbols,
        num_embed_units,
        num_units,
        num_layers,
        embed,
        entity_embed=None,
        num_entities=0,
        num_trans_units=100,
        learning_rate=0.0001,
        learning_rate_decay_factor=0.95,
        max_gradient_norm=5.0,
        num_samples=500,
        max_length=60,
        mem_use=True,
        output_alignments=True,
        use_lstm=False):
    """Build the knowledge-graph-aware seq2seq graph (train + inference).

    Args:
        num_symbols: vocabulary size.
        num_embed_units: word-embedding width.
        num_units: units per RNN layer.
        num_layers: number of stacked GRU layers in encoder and decoder.
        embed: pre-trained word embeddings, or None for random init.
        entity_embed: pre-trained entity/relation embeddings, or None.
        num_entities: number of entity rows (used only when
            ``entity_embed`` is None).
        num_trans_units: width of one entity/relation embedding.
        learning_rate: Adam learning rate; also the initial value of the
            ``learning_rate`` variable.
        learning_rate_decay_factor: multiplier applied by
            ``learning_rate_decay_op``.
        max_gradient_norm: threshold for global-norm gradient clipping.
        num_samples: forwarded to ``output_projection_layer``
            (sample count for sampled softmax).
        max_length: maximum decoding length at inference time.
        mem_use: combined with ``output_alignments`` to decide whether the
            attention helpers emit alignments.
        output_alignments: if True, train with ``total_loss`` (which uses
            the attention alignments); otherwise with ``sequence_loss``.
        use_lstm: accepted but not read by this constructor.
    """
    # NOTE(review): the `with`/`if` nesting below was reconstructed from a
    # whitespace-collapsed source -- confirm the scope boundaries.

    # ---- placeholders ---------------------------------------------------
    self.posts = tf.placeholder(
        tf.string, (None, None), 'enc_inps')  # [batch, encoder_len]
    # `(None)` is just None (unknown shape); a rank-1 shape needs `(None,)`.
    self.posts_length = tf.placeholder(
        tf.int32, (None,), 'enc_lens')  # [batch]
    self.responses = tf.placeholder(
        tf.string, (None, None), 'dec_inps')  # [batch, decoder_len]
    self.responses_length = tf.placeholder(
        tf.int32, (None,), 'dec_lens')  # [batch]
    self.entities = tf.placeholder(
        tf.string, (None, None, None),
        'entities')  # [batch, triple_num, triple_len]
    self.entity_masks = tf.placeholder(
        tf.string, (None, None), 'entity_masks')  # not read in this constructor
    self.triples = tf.placeholder(
        tf.string, (None, None, None, 3),
        'triples')  # [batch, triple_num, triple_len, 3]
    self.posts_triple = tf.placeholder(
        tf.int32, (None, None, 1), 'enc_triples')  # [batch, encoder_len, 1]
    self.responses_triple = tf.placeholder(
        tf.string, (None, None, 3), 'dec_triples')  # [batch, decoder_len, 3]
    self.match_triples = tf.placeholder(
        tf.int32, (None, None, None),
        'match_triples')  # [batch, decoder_len, triple_num]

    # Dynamic sizes taken from the fed tensors.
    encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts))
    triple_num = tf.shape(self.triples)[1]  # graphs per post (padded)
    triple_len = tf.shape(self.triples)[2]  # triples per graph (padded)

    # One-hot marks of the triples used at each decoding step.
    one_hot_triples = tf.one_hot(
        self.match_triples,
        triple_len)  # [batch, decoder_len, triple_num, triple_len]
    # Per-step sum of the one-hot marks: non-zero where a triple was used.
    use_triples = tf.reduce_sum(
        one_hot_triples, axis=[2, 3])  # [batch, decoder_len]

    # ---- lookup tables (saved to and restored from checkpoints) ---------
    # word -> id
    self.symbol2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=UNK_ID,
        shared_name="in_table",
        name="in_table",
        checkpoint=True)
    # id -> word
    self.index2symbol = MutableHashTable(
        key_dtype=tf.int64,
        value_dtype=tf.string,
        default_value='_UNK',
        shared_name="out_table",
        name="out_table",
        checkpoint=True)
    # entity -> id
    self.entity2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=NONE_ID,
        shared_name="entity_in_table",
        name="entity_in_table",
        checkpoint=True)
    # id -> entity
    self.index2entity = MutableHashTable(
        key_dtype=tf.int64,
        value_dtype=tf.string,
        default_value='_NONE',
        shared_name="entity_out_table",
        name="entity_out_table",
        checkpoint=True)

    # ---- token ids ------------------------------------------------------
    self.posts_word_id = self.symbol2index.lookup(
        self.posts)  # [batch, encoder_len]
    self.posts_entity_id = self.entity2index.lookup(
        self.posts)  # [batch, encoder_len]
    self.responses_target = self.symbol2index.lookup(
        self.responses)  # [batch, decoder_len]

    batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(
        self.responses)[1]

    # Decoder input ids: drop the last target column, prepend GO_ID.
    self.responses_word_id = tf.concat([
        tf.ones([batch_size, 1], dtype=tf.int64) * GO_ID,
        tf.split(self.responses_target, [decoder_len - 1, 1], 1)[0]
    ], 1)  # [batch, decoder_len]

    # Response mask: one-hot at (length - 1), then a reverse cumulative sum
    # along the time axis gives 1 inside the response and 0 after it.
    self.decoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len),
                  reverse=True,
                  axis=1),
        [-1, decoder_len])  # [batch, decoder_len]

    # ---- embeddings -----------------------------------------------------
    if embed is None:
        self.embed = tf.get_variable('word_embed',
                                     [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        self.embed = tf.get_variable('word_embed',
                                     dtype=tf.float32,
                                     initializer=embed)
    # The raw entity embeddings are frozen (trainable=False).
    if entity_embed is None:
        self.entity_trans = tf.get_variable(
            'entity_embed', [num_entities, num_trans_units],
            tf.float32,
            trainable=False)
    else:
        self.entity_trans = tf.get_variable('entity_embed',
                                            dtype=tf.float32,
                                            initializer=entity_embed,
                                            trainable=False)

    # Trainable tanh projection on top of the frozen entity embeddings.
    self.entity_trans_transformed = tf.layers.dense(
        self.entity_trans,
        num_trans_units,
        activation=tf.tanh,
        name='trans_transformation')
    # Seven zero-initialised rows prepended to the projected embeddings
    # (presumably ids 0..6 are reserved/padding entries -- confirm against
    # the entity vocabulary).
    padding_entity = tf.get_variable('entity_padding_embed',
                                     [7, num_trans_units],
                                     dtype=tf.float32,
                                     initializer=tf.zeros_initializer())
    self.entity_embed = tf.concat(
        [padding_entity, self.entity_trans_transformed], axis=0)

    # Embedding lookup adds a trailing axis; the reshape folds the (3,
    # num_trans_units) pair into a single 3*num_trans_units vector.
    triples_embedding = tf.reshape(
        tf.nn.embedding_lookup(self.entity_embed,
                               self.entity2index.lookup(self.triples)),
        [encoder_batch_size, triple_num, -1, 3 * num_trans_units])
    entities_word_embedding = tf.reshape(
        tf.nn.embedding_lookup(self.embed,
                               self.symbol2index.lookup(self.entities)),
        [encoder_batch_size, -1, num_embed_units
         ])  # [batch, triple_num*triple_len, num_embed_units]

    # Split each triple embedding into head / relation / tail.
    head, relation, tail = tf.split(triples_embedding,
                                    [num_trans_units] * 3,
                                    axis=3)

    # ---- static graph attention ----------------------------------------
    with tf.variable_scope('graph_attention'):
        head_tail = tf.concat(
            [head, tail],
            axis=3)  # [batch, triple_num, triple_len, 2*num_trans_units]
        # tanh(W [head; tail])
        head_tail_transformed = tf.layers.dense(
            head_tail,
            num_trans_units,
            activation=tf.tanh,
            name='head_tail_transform'
        )  # [batch, triple_num, triple_len, num_trans_units]
        # W relation
        relation_transformed = tf.layers.dense(
            relation, num_trans_units, name='relation_transform'
        )  # [batch, triple_num, triple_len, num_trans_units]
        # Element-wise product summed over the feature axis = inner product.
        e_weight = tf.reduce_sum(
            relation_transformed * head_tail_transformed,
            axis=3)  # [batch, triple_num, triple_len]
        # Attention weight of every triple inside its graph.
        alpha_weight = tf.nn.softmax(
            e_weight)  # [batch, triple_num, triple_len]
        # Weighted sum over the triples of a graph -> one vector per graph.
        graph_embed = tf.reduce_sum(
            tf.expand_dims(alpha_weight, 3) * head_tail,
            axis=2)  # [batch, triple_num, 2*num_trans_units]

    # Build gather indices [[b, posts_triple[b, t]] ...]: a batch-index
    # column of shape [batch, encoder_len, 1] concatenated with
    # posts_triple, so gather_nd picks the graph vector for each token.
    graph_embed_input = tf.gather_nd(
        graph_embed,
        tf.concat([
            tf.tile(
                tf.reshape(tf.range(encoder_batch_size, dtype=tf.int32),
                           [-1, 1, 1]), [1, encoder_len, 1]),
            self.posts_triple
        ], axis=2))  # [batch, encoder_len, 2*num_trans_units]

    # Entity embeddings of the triple used at each response position.
    triple_embed_input = tf.reshape(
        tf.nn.embedding_lookup(
            self.entity_embed,
            self.entity2index.lookup(self.responses_triple)),
        [batch_size, decoder_len,
         3 * num_trans_units])  # [batch, decoder_len, 3*num_trans_units]

    post_word_input = tf.nn.embedding_lookup(
        self.embed,
        self.posts_word_id)  # [batch, encoder_len, num_embed_units]
    response_word_input = tf.nn.embedding_lookup(
        self.embed,
        self.responses_word_id)  # [batch, decoder_len, num_embed_units]

    # Concatenate along the feature axis.
    self.encoder_input = tf.concat(
        [post_word_input, graph_embed_input],
        axis=2)  # [batch, encoder_len, num_embed_units + 2*num_trans_units]
    self.decoder_input = tf.concat(
        [response_word_input, triple_embed_input],
        axis=2)  # [batch, decoder_len, num_embed_units + 3*num_trans_units]

    # ---- encoder --------------------------------------------------------
    encoder_cell = MultiRNNCell(
        [GRUCell(num_units) for _ in range(num_layers)])
    decoder_cell = MultiRNNCell(
        [GRUCell(num_units) for _ in range(num_layers)])

    encoder_output, encoder_state = dynamic_rnn(encoder_cell,
                                                self.encoder_input,
                                                self.posts_length,
                                                dtype=tf.float32,
                                                scope="encoder")

    # Helpers returned by the projection layer: output function, selector
    # function and three loss functions.
    output_fn, selector_fn, sequence_loss, sampled_sequence_loss, total_loss = output_projection_layer(
        num_units, num_symbols, num_samples)

    # ---- training decoder ----------------------------------------------
    with tf.variable_scope('decoder'):
        # Attention keys/values plus the score and construct functions.
        attention_keys_init, attention_values_init, attention_score_fn_init, attention_construct_fn_init \
            = prepare_attention(encoder_output, 'bahdanau', num_units,
                                imem=(graph_embed, triples_embedding),
                                output_alignments=output_alignments and mem_use)

        # Per-time-step function used by the decoder during training.
        decoder_fn_train = attention_decoder_fn_train(
            encoder_state,
            attention_keys_init,
            attention_values_init,
            attention_score_fn_init,
            attention_construct_fn_init,
            output_alignments=output_alignments and mem_use,
            max_length=tf.reduce_max(self.responses_length))
        # Outputs, final state (ignored) and the alignments TensorArray.
        self.decoder_output, _, alignments_ta = dynamic_rnn_decoder(
            decoder_cell,
            decoder_fn_train,
            self.decoder_input,
            self.responses_length,
            scope="decoder_rnn")
        if output_alignments:
            # self.alignments was read below without ever being assigned,
            # which raised AttributeError under the default
            # output_alignments=True.  Stack the per-step alignments and
            # move the time axis behind the batch axis.
            self.alignments = tf.transpose(alignments_ta.stack(),
                                           perm=[1, 0, 2, 3])
            self.decoder_loss, self.ppx_loss, self.sentence_ppx = total_loss(
                self.decoder_output, self.responses_target,
                self.decoder_mask, self.alignments, triples_embedding,
                use_triples, one_hot_triples)
            # Give the per-sentence perplexity a stable op name.
            self.sentence_ppx = tf.identity(self.sentence_ppx,
                                            name='ppx_loss')
        else:
            self.decoder_loss = sequence_loss(self.decoder_output,
                                              self.responses_target,
                                              self.decoder_mask)

    # ---- inference decoder (shares the training variables) -------------
    with tf.variable_scope('decoder', reuse=True):
        attention_keys, attention_values, attention_score_fn, attention_construct_fn \
            = prepare_attention(encoder_output, 'bahdanau', num_units,
                                reuse=True,
                                imem=(graph_embed, triples_embedding),
                                output_alignments=output_alignments and mem_use)
        # imem: (entity word embeddings
        #        [batch, triple_num*triple_len, num_embed_units],
        #        triple embeddings
        #        [batch, triple_num*triple_len, 3*num_trans_units]).
        decoder_fn_inference = attention_decoder_fn_inference(
            output_fn,
            encoder_state,
            attention_keys,
            attention_values,
            attention_score_fn,
            attention_construct_fn,
            self.embed,
            GO_ID,
            EOS_ID,
            max_length,
            num_symbols,
            imem=(entities_word_embedding,
                  tf.reshape(
                      triples_embedding,
                      [encoder_batch_size, -1, 3 * num_trans_units])),
            selector_fn=selector_fn)

        self.decoder_distribution, _, output_ids_ta = dynamic_rnn_decoder(
            decoder_cell, decoder_fn_inference, scope="decoder_rnn")

        output_len = tf.shape(self.decoder_distribution)[1]  # decoder_len
        output_ids = tf.transpose(
            output_ids_ta.gather(
                tf.range(output_len)))  # [batch, decoder_len]

        # Word positions: clip the ids into [0, num_symbols].
        word_ids = tf.cast(tf.clip_by_value(output_ids, 0, num_symbols),
                           tf.int64)  # [batch, decoder_len]

        # Entity positions: -output_ids clipped to [0, num_symbols] is the
        # offset inside a sample's entity list; adding
        # b * (triple_num*triple_len) turns it into an index into the
        # flattened entity tensor.
        entity_ids = tf.reshape(
            tf.clip_by_value(-output_ids, 0, num_symbols) + tf.reshape(
                tf.range(encoder_batch_size) *
                tf.shape(entities_word_embedding)[1], [-1, 1]),
            [-1])  # [batch * decoder_len]

        # Entity strings selected at every position.
        entities = tf.reshape(
            tf.gather(tf.reshape(self.entities, [-1]), entity_ids),
            [-1, output_len])  # [batch, decoder_len]
        words = self.index2symbol.lookup(word_ids)  # ids -> word strings

        # Take the vocabulary word where output_ids > 0, else the entity.
        self.generation = tf.where(output_ids > 0, words, entities)
        self.generation = tf.identity(self.generation, name='generation')

    # ---- training ops ---------------------------------------------------
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    # Number of parameter updates performed so far.
    self.global_step = tf.Variable(0, trainable=False)

    # All global variables; tf.gradients yields None for the ones the
    # loss does not depend on.
    self.params = tf.global_variables()

    # NOTE(review): Adam receives the Python float `learning_rate`, not the
    # `self.learning_rate` variable, so running learning_rate_decay_op does
    # not change the step size used by the optimizer.
    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
    self.lr = opt._lr

    gradients = tf.gradients(self.decoder_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    # Summaries: loss plus a histogram per trainable variable.
    tf.summary.scalar('decoder_loss', self.decoder_loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)

    self.merged_summary_op = tf.summary.merge_all()

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
    self.saver_epoch = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                      max_to_keep=1000,
                                      pad_step_number=True)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_layers,
             num_labels,
             embed,
             learning_rate=0.5,
             max_gradient_norm=5.0):
    """Build an RNN text classifier.

    ``num_layers == 1`` builds a single forward RNN.  Any other value
    builds two RNNs, one over ``texts`` and one over the extra
    ``reverse_texts`` placeholder, and adds their final states.

    Args:
        num_symbols: vocabulary size (used only when ``embed`` is None).
        num_embed_units: embedding width (used only when ``embed`` is None).
        num_units: units of each RNN cell.
        num_layers: selects the single-RNN (1) or two-RNN (otherwise) graph.
        num_labels: number of output classes.
        embed: pre-trained embedding matrix, or None for random init.
        learning_rate: initial value of the learning-rate variable.
        max_gradient_norm: threshold for global-norm gradient clipping.
    """
    self.texts = tf.placeholder(tf.string, [None, None],
                                name="texts")  # [batch, len]
    self.texts_length = tf.placeholder(tf.int64, [None],
                                       name="texts_length")  # [batch]
    self.labels = tf.placeholder(tf.int64, [None], name="labels")  # [batch]

    # Vocabulary table (string -> index); unknown words map to UNK_ID.
    self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=UNK_ID,
                                         shared_name="in_table",
                                         name="in_table",
                                         checkpoint=True)

    # Training bookkeeping.
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    learning_rate_decay_factor = 0.9
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)

    self.index_input = self.symbol2index.lookup(self.texts)  # [batch, len]

    # Embedding table (index -> vector).
    if embed is None:
        # Random initialisation.
        self.embed = tf.get_variable('embed',
                                     [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        # Initialise from pre-trained word vectors.
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)

    self.embed_input = tf.nn.embedding_lookup(
        self.embed, self.index_input)  # [batch, len, embed_unit]

    # Cell flavour is fixed here; edit to 'rnn' or 'gru' to switch.
    model = 'lstm'
    cell_class = {'rnn': BasicRNNCell,
                  'gru': GRUCell,
                  'lstm': BasicLSTMCell}[model]

    if num_layers == 1:
        cell = cell_class(num_units)
        cell_do = tf.nn.rnn_cell.DropoutWrapper(
            cell, input_keep_prob=1.0, output_keep_prob=FLAGS.keep_prob)
        outputs, states = dynamic_rnn(cell_do,
                                      self.embed_input,
                                      self.texts_length,
                                      dtype=tf.float32,
                                      scope="rnn")
        if model == 'lstm':
            # An LSTM state is a pair; element 0 is used as the features.
            states = states[0]
        # Project to num_labels classes (was hard-coded to 5, silently
        # ignoring the constructor argument).
        logits = tf.layers.dense(inputs=states, units=num_labels,
                                 activation=None)
    else:
        self.reverse_texts = tf.placeholder(
            tf.string, [None, None], name="reverse_texts")  # [batch, len]
        self.index_reverse_input = self.symbol2index.lookup(
            self.reverse_texts)
        self.embed_reverse_input = tf.nn.embedding_lookup(
            self.embed, self.index_reverse_input)  # [batch, len, embed_unit]

        cell1 = cell_class(num_units)
        cell2 = cell_class(num_units)
        cell1_do = tf.nn.rnn_cell.DropoutWrapper(
            cell1, input_keep_prob=1.0, output_keep_prob=FLAGS.keep_prob)
        cell2_do = tf.nn.rnn_cell.DropoutWrapper(
            cell2, input_keep_prob=1.0, output_keep_prob=FLAGS.keep_prob)
        # NOTE(review): both RNNs are built under scope "rnn" with distinct
        # cell objects; depending on the TensorFlow version this may raise
        # a "variable already exists" error -- confirm this branch runs.
        outputs1, states1 = dynamic_rnn(cell1_do,
                                        self.embed_input,
                                        self.texts_length,
                                        dtype=tf.float32,
                                        scope="rnn")
        outputs2, states2 = dynamic_rnn(cell2_do,
                                        self.embed_reverse_input,
                                        self.texts_length,
                                        dtype=tf.float32,
                                        scope="rnn")
        if model == 'lstm':
            states = states1[0] + states2[0]
        else:
            states = states1 + states2
        # Project to num_labels classes (was hard-coded to 5).
        logits = tf.layers.dense(inputs=states, units=num_labels,
                                 activation=None)

    # self.loss is the SUM over the batch; gradients use the per-example mean.
    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                       logits=logits),
        name='loss')
    mean_loss = self.loss / tf.cast(tf.shape(self.labels)[0],
                                    dtype=tf.float32)
    predict_labels = tf.argmax(logits, 1, 'predict_labels')
    # Despite its name this is the COUNT of correct predictions in the batch.
    self.accuracy = tf.reduce_sum(tf.cast(
        tf.equal(self.labels, predict_labels), tf.int32),
                                  name='accuracy')

    self.params = tf.trainable_variables()

    # Gradient computation, clipping and update.
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(mean_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    # Summaries: batch loss plus a histogram per trainable variable.
    tf.summary.scalar('loss/step', self.loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)

    self.merged_summary_op = tf.summary.merge_all()

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
class RNN(object):
    """Stacked-LSTM text classifier built as a TensorFlow 1.x graph.

    String tokens are mapped to ids through a ``MutableHashTable``, embedded,
    and fed to ``num_layers`` stacked ``BasicLSTMCell`` layers. The hidden
    state of the top layer is linearly projected to ``num_labels`` logits.
    """

    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_layers,
                 num_labels,
                 embed,
                 learning_rate=0.005,
                 max_gradient_norm=5.0):
        """Build placeholders, the network, the loss and the training op.

        Args:
            num_symbols: vocabulary size; only used when ``embed`` is None.
            num_embed_units: embedding width; only used when ``embed`` is None.
            num_units: hidden size of every LSTM layer.
            num_layers: number of stacked LSTM layers.
            num_labels: number of output classes.
            embed: initial value for the embedding table (pre-trained word
                vectors), or None to create a ``[num_symbols,
                num_embed_units]`` table with the default initializer.
            learning_rate: initial value of the SGD learning-rate variable.
            max_gradient_norm: threshold for global-norm gradient clipping.
        """
        self.texts = tf.placeholder(tf.string, (None, None), 'texts')  # batch*len
        # The shape must be the 1-tuple ``(None,)``: a bare ``(None)`` is just
        # ``None`` and would leave the placeholder with unknown rank.
        self.texts_length = tf.placeholder(tf.int32, (None,), 'texts_length')  # batch
        self.labels = tf.placeholder(tf.int64, (None,), 'labels')  # batch

        # build the vocab table (string to index)
        self.symbol2index = MutableHashTable(
            key_dtype=tf.string,
            value_dtype=tf.int64,
            default_value=UNK_ID,
            shared_name="in_table",
            name="in_table",
            checkpoint=True)

        # initialize the training process
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)

        self.index_input = self.symbol2index.lookup(self.texts)  # batch*len

        # build the embedding table (index to vector)
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable('embed',
                                         [num_symbols, num_embed_units],
                                         tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable('embed',
                                         dtype=tf.float32,
                                         initializer=embed)

        self.embed_input = tf.nn.embedding_lookup(
            self.embed, self.index_input)  # batch*len*embed_unit

        cell = MultiRNNCell(
            [BasicLSTMCell(num_units) for _ in range(num_layers)])
        outputs, states = dynamic_rnn(cell,
                                      self.embed_input,
                                      self.texts_length,
                                      dtype=tf.float32,
                                      scope="rnn")
        # states holds one (c, h) pair per layer; take h of the top layer.
        vectors = states[-1][-1]

        with tf.variable_scope('logits'):
            weight = tf.get_variable("weights", [num_units, num_labels])
            bias = tf.get_variable("biases", [num_labels])
            logits = tf.matmul(vectors, weight) + bias

        # Cross entropy summed (not averaged) over the batch.
        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.labels, logits=logits),
            name='loss')
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        # Number of correct predictions in the batch (a count, not a ratio).
        self.accuracy = tf.reduce_sum(
            tf.cast(tf.equal(self.labels, predict_labels), tf.int32),
            name='accuracy')

        self.params = tf.trainable_variables()

        # calculate the gradient of parameters
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=5,
                                    pad_step_number=True)

    def print_parameters(self):
        """Print the name and static shape of every trainable variable."""
        for item in self.params:
            print('%s: %s' % (item.name, item.get_shape()))

    def train_step(self, session, data):
        """Run one optimisation step.

        Args:
            session: the ``tf.Session`` to run in.
            data: dict with keys ``'texts'``, ``'texts_length'`` and
                ``'labels'`` holding the values for the three placeholders.

        Returns:
            The evaluated ``[loss, accuracy, gradient_norm, update]`` list.
        """
        input_feed = {
            self.texts: data['texts'],
            self.texts_length: data['texts_length'],
            self.labels: data['labels'],
        }
        output_feed = [
            self.loss, self.accuracy, self.gradient_norm, self.update
        ]
        return session.run(output_feed, input_feed)
def __init__(self, num_lstm_units, num_labels, embed, max_gradient_norm=5.0):
    """Assemble the complete training graph for the two-text matching model.

    The per-time-step recurrence is delegated to ``self._match_step``, which
    is not defined in this block and must be provided by the class.

    Args:
        num_lstm_units: width of every hidden/cell state kept by the model.
        num_labels: number of output classes.
        embed: initial value handed to the ``embed`` variable.
        max_gradient_norm: threshold for global-norm gradient clipping.
    """
    self.num_lstm_units = num_lstm_units

    # ---- inputs: both texts must be padded to one common max_len ---------
    self.texts1 = tf.placeholder(tf.string, [None, None], name='texts1')
    self.texts2 = tf.placeholder(tf.string, [None, None], name='texts2')
    self.texts_length1 = tf.placeholder(tf.int32, [None],
                                        name='texts_length1')
    self.texts_length2 = tf.placeholder(tf.int32, [None],
                                        name='texts_length2')
    self.labels = tf.placeholder(tf.int64, [None], name='labels')

    # ---- vocabulary table and training bookkeeping -----------------------
    self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=UNK_ID,
                                         shared_name="in_table",
                                         name="in_table",
                                         checkpoint=True)
    self.learning_rate = tf.Variable(0.01,
                                     trainable=False,
                                     dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)

    # ---- token ids and embeddings, each batch*max_len(*embed_unit) -------
    self.index_input1 = self.symbol2index.lookup(self.texts1)
    self.index_input2 = self.symbol2index.lookup(self.texts2)
    self.embed = tf.get_variable('embed',
                                 dtype=tf.float32,
                                 initializer=embed)
    self.embed_input1 = tf.nn.embedding_lookup(self.embed,
                                               self.index_input1)
    self.embed_input2 = tf.nn.embedding_lookup(self.embed,
                                               self.index_input2)

    # ---- padding masks ----------------------------------------------------
    # The *_extended variants carry one extra leading zero column so that
    # column 0 lines up with the zero state stored at index 0 below.
    self._batch_size = tf.shape(self.texts_length1)[0]
    self._max_length = tf.shape(self.texts1)[1]
    self.mask1 = tf.sequence_mask(self.texts_length1,
                                  maxlen=self._max_length,
                                  dtype=tf.float32)
    lead_zeros1 = tf.zeros([self._batch_size, 1], tf.float32)
    self.mask1_extended = tf.concat([lead_zeros1, self.mask1], 1)
    self.mask2 = tf.sequence_mask(self.texts_length2,
                                  maxlen=self._max_length,
                                  dtype=tf.float32)
    lead_zeros2 = tf.zeros([self._batch_size, 1], tf.float32)
    self.mask2_extended = tf.concat([lead_zeros2, self.mask2], 1)
    # debug
    print("mask1 size: " + str(self.mask1.shape))

    # Zero the padded positions, then lay the inputs out time-major:
    # batch*len*embed -> embed*batch*len (masked) -> len*batch*embed.
    masked1 = tf.transpose(self.embed_input1, [2, 0, 1]) * self.mask1
    self.embed_input1 = tf.transpose(masked1, [2, 1, 0])
    masked2 = tf.transpose(self.embed_input2, [2, 0, 1]) * self.mask2
    self.embed_input2 = tf.transpose(masked2, [2, 1, 0])

    # ---- state histories: index 0 holds the all-zero initial state -------
    init_state = tf.zeros(shape=[self._batch_size, self.num_lstm_units],
                          dtype=tf.float32)
    ta_options = dict(dtype=tf.float32,
                      size=0,
                      dynamic_size=True,
                      clear_after_read=False)
    hs1_hist = tf.TensorArray(**ta_options)
    cs1_hist = tf.TensorArray(**ta_options)
    hs1_hist = hs1_hist.write(0, init_state)
    cs1_hist = cs1_hist.write(0, init_state)
    hs2_hist = tf.TensorArray(**ta_options)
    cs2_hist = tf.TensorArray(**ta_options)
    hs2_hist = hs2_hist.write(0, init_state)
    cs2_hist = cs2_hist.write(0, init_state)
    hr_hist = tf.TensorArray(**ta_options)
    cr_hist = tf.TensorArray(**ta_options)
    hr_hist = hr_hist.write(0, init_state)
    cr_hist = cr_hist.write(0, init_state)

    self._initializer = tf.truncated_normal_initializer(stddev=0.1)

    # ---- unroll over time; steps are 1-based (TO DO: check this) ---------
    first_step = tf.constant(1, dtype=tf.int32)

    def _keep_going(step, *_histories):
        return tf.less(step, self._max_length + 1)

    def _advance(step, hs1, cs1, hs2, cs2, hr, cr):
        return self._match_step(step, hs1, cs1, hs2, cs2, hr, cr)

    loop_result = tf.while_loop(cond=_keep_going,
                                body=_advance,
                                loop_vars=(first_step, hs1_hist, cs1_hist,
                                           hs2_hist, cs2_hist, hr_hist,
                                           cr_hist))
    (_, self.h_s1, self.c_s1, self.h_s2, self.c_s2, self.h_r,
     self.c_r) = loop_result

    # Stack the history and make it batch-major.
    self.h_r = tf.transpose(self.h_r.stack(), [1, 0, 2])

    # Final state per example. The sequence length is used as the index
    # without subtracting 1 because slot 0 holds the zero state.
    row_ids = tf.range(self._batch_size)
    last_step = tf.maximum(self.texts_length1, self.texts_length2)
    gather_index = tf.stack([row_ids, last_step], axis=1)
    self.final_h_r = tf.gather_nd(self.h_r, gather_index)

    # ---- classifier head ---------------------------------------------------
    with tf.variable_scope('fully_connect'):
        self.w_fc = tf.get_variable(shape=[num_lstm_units, num_labels],
                                    initializer=self._initializer,
                                    name='w_fc')
        self.b_fc = tf.get_variable(shape=[num_labels],
                                    initializer=self._initializer,
                                    name='b_fc')
        self.logits = tf.matmul(self.final_h_r, self.w_fc) + self.b_fc

    # ---- loss (summed), mean loss for gradients, and correct count -------
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=self.labels, logits=self.logits)
    self.loss = tf.reduce_sum(per_example, name='loss')
    num_examples = tf.cast(tf.shape(self.labels)[0], dtype=tf.float32)
    mean_loss = self.loss / num_examples
    predict_labels = tf.argmax(self.logits, 1, 'predict_labels')
    hits = tf.cast(tf.equal(self.labels, predict_labels), tf.int64)
    self.accuracy = tf.reduce_sum(hits, name='accuracy')

    # ---- clipped SGD update -------------------------------------------------
    self.params = tf.trainable_variables()
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    grads = tf.gradients(mean_loss, self.params)
    clipped, self.gradient_norm = tf.clip_by_global_norm(
        grads, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped, self.params),
                                      global_step=self.global_step)

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_layers,
             num_labels,
             embed,
             learning_rate=0.5,
             max_gradient_norm=5.0,
             keep_prob=1.,
             weight_decay=1e-10,
             RNN_type="BasicRNN"):
    """Build a single-layer bidirectional RNN text classifier (TF 1.x graph).

    Args:
        num_symbols: vocabulary size; only used when ``embed`` is None.
        num_embed_units: embedding width; only used when ``embed`` is None.
        num_units: hidden size of the forward and backward cells.
        num_layers: accepted for interface compatibility; not used by this
            constructor.
        num_labels: number of output classes.
        embed: initial value for the embedding table (pre-trained word
            vectors), or None to create a ``[num_symbols,
            num_embed_units]`` table with the default initializer.
        learning_rate: initial value of the SGD learning-rate variable.
        max_gradient_norm: threshold for global-norm gradient clipping.
        keep_prob: initial value of the dropout keep-probability variable.
        weight_decay: initial value of the L2 penalty coefficient variable.
        RNN_type: ``"LSTM"`` selects ``BasicLSTMCell``; any other value
            selects ``GRUCell``.
    """
    self.texts = tf.placeholder(dtype=tf.string, shape=[None, None])  # batch*len
    self.texts_length = tf.placeholder(dtype=tf.int32, shape=[None])  # batch
    self.labels = tf.placeholder(dtype=tf.int64, shape=[None])  # batch

    # build the vocab table (string to index)
    self.symbol2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=UNK_ID,
        shared_name="in_table",
        name="in_table",
        checkpoint=True)

    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.weight_decay = tf.Variable(float(weight_decay),
                                    trainable=False,
                                    dtype=tf.float32)
    self.keep_prob = tf.Variable(float(keep_prob),
                                 trainable=False,
                                 dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)

    self.index_input = self.symbol2index.lookup(self.texts)  # batch*len

    # build the embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('embed',
                                     [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)

    self.embed_input = tf.nn.embedding_lookup(
        self.embed, self.index_input)  # batch*len*embed_unit

    # Bidirectional encoder. The scope names are kept exactly as they were.
    is_lstm = RNN_type == "LSTM"
    with tf.variable_scope("foward_cell"):
        if is_lstm:
            fw_cell = BasicLSTMCell(num_units)
        else:
            fw_cell = GRUCell(num_units)
    with tf.variable_scope("barkward_cell"):
        if is_lstm:
            bw_cell = BasicLSTMCell(num_units)
        else:
            bw_cell = GRUCell(num_units)

    outputs, states = tf.nn.bidirectional_dynamic_rnn(fw_cell,
                                                      bw_cell,
                                                      self.embed_input,
                                                      self.texts_length,
                                                      dtype=tf.float32,
                                                      scope="bi_lstm")

    # ``states`` is (forward_final_state, backward_final_state). An LSTM
    # final state is a (c, h) pair, so h is element 1. A GRU final state is
    # already the [batch, num_units] tensor; indexing it with [1] would pick
    # batch row 1 instead of the hidden state.
    if is_lstm:
        fw_final, bw_final = states[0][1], states[1][1]
    else:
        fw_final, bw_final = states
    self.y0 = fw_final + bw_final

    self.y0_dp = tf.nn.dropout(self.y0, keep_prob=self.keep_prob)
    # NOTE(review): y1 is built but the logits below are computed from
    # y0_dp, not y1. Its variables still enter the L2 penalty. Confirm
    # whether the hidden layer was meant to feed y2.
    self.y1 = tf.layers.dense(inputs=self.y0_dp,
                              units=128,
                              activation=tf.nn.sigmoid)
    self.y2 = tf.layers.dense(inputs=self.y0_dp, units=num_labels)
    logits = self.y2

    with tf.name_scope("l2_loss"):
        trainable = tf.trainable_variables()
        self.lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in trainable
                                ]) * self.weight_decay

    # Summed cross entropy over the batch plus the L2 penalty.
    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                       logits=logits),
        name='loss') + self.lossL2
    mean_loss = self.loss / tf.cast(tf.shape(self.labels)[0],
                                    dtype=tf.float32)
    predict_labels = tf.argmax(logits, 1, 'predict_labels')
    # Number of correct predictions in the batch (a count, not a ratio).
    self.accuracy = tf.reduce_sum(tf.cast(
        tf.equal(self.labels, predict_labels), tf.int32),
                                  name='accuracy')

    self.params = tf.trainable_variables()

    # calculate the gradient of parameters
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(mean_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.train_op = opt.apply_gradients(zip(clipped_gradients, self.params),
                                        global_step=self.global_step)

    tf.summary.scalar('loss/step', self.loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)

    self.merged_summary_op = tf.summary.merge_all()

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
class InteractiveMatchLSTM(object):
    """Two-text classifier that couples three hand-written LSTMs (TF 1.x).

    Each of the two input texts is read by a shared-weight LSTM
    (``lstm_s``) whose input at every step is the token embedding
    concatenated with the previous hidden state of a third LSTM
    (``lstm_r``). ``lstm_r`` in turn consumes attention summaries of both
    text LSTMs. The ``lstm_r`` hidden state at the position given by the
    longer text's length is projected to ``num_labels`` logits.
    """

    def __init__(self,
                 num_lstm_units,
                 num_labels,
                 embed,
                 max_gradient_norm=5.0):
        """Build placeholders, the recurrent loop, the loss and the update op.

        :param num_lstm_units: width of every hidden/cell state
        :param num_labels: number of output classes
        :param embed: initial value handed to the ``embed`` variable
        :param max_gradient_norm: threshold for global-norm gradient clipping
        """
        self.num_lstm_units = num_lstm_units

        self.texts1 = tf.placeholder(tf.string, [None, None],
                                     name='texts1')  # batch_size*max_len
        self.texts2 = tf.placeholder(
            tf.string, [None, None], name='texts2'
        )  # batch_size*max_len, PAD THE TWO TEXTS TO SAME LENGTH
        self.texts_length1 = tf.placeholder(
            tf.int32, [None], name='texts_length1')  # shape: batch
        self.texts_length2 = tf.placeholder(
            tf.int32, [None], name='texts_length2')  # shape: batch
        self.labels = tf.placeholder(tf.int64, [None],
                                     name='labels')  # shape: batch

        # vocab table (string to index); unknown tokens map to UNK_ID
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)

        # training bookkeeping (none of these are trainable)
        self.learning_rate = tf.Variable(0.01,
                                         trainable=False,
                                         dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)

        self.index_input1 = self.symbol2index.lookup(
            self.texts1)  # batch*max_len
        self.index_input2 = self.symbol2index.lookup(
            self.texts2)  # batch*max_len

        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)

        self.embed_input1 = tf.nn.embedding_lookup(
            self.embed, self.index_input1)  # batch*max_len*embed_unit
        self.embed_input2 = tf.nn.embedding_lookup(
            self.embed, self.index_input2)  # batch*max_len*embed_unit

        # zero padding
        self._batch_size = tf.shape(self.texts_length1)[0]
        self._max_length = tf.shape(self.texts1)[1]
        self.mask1 = tf.sequence_mask(self.texts_length1,
                                      maxlen=self._max_length,
                                      dtype=tf.float32)  # shape: batch*max_len
        # The extended masks get a leading zero column so that column 0 lines
        # up with the zero state stored at index 0 of the state histories.
        self.mask1_extended = tf.concat(
            [tf.zeros([self._batch_size, 1], tf.float32), self.mask1], 1)
        self.mask2 = tf.sequence_mask(self.texts_length2,
                                      maxlen=self._max_length,
                                      dtype=tf.float32)  # shape: batch*max_len
        self.mask2_extended = tf.concat(
            [tf.zeros([self._batch_size, 1], tf.float32), self.mask2], 1)
        # debug
        print("mask1 size: " + str(self.mask1.shape))

        # Multiply by the mask to zero padded positions (the transpose puts
        # batch*max_len last so the mask broadcasts), then go time-major.
        self.embed_input1 = tf.transpose(
            self.embed_input1,
            [2, 0, 1]) * self.mask1  # shape: embed_unit*batch*max_len
        self.embed_input1 = tf.transpose(
            self.embed_input1, [2, 1, 0])  # shape: max_len*batch*embed_units
        self.embed_input2 = tf.transpose(
            self.embed_input2,
            [2, 0, 1]) * self.mask2  # shape: embed_unit*batch*max_len
        self.embed_input2 = tf.transpose(
            self.embed_input2, [2, 1, 0])  # shape: max_len*batch*embed_units

        # State histories. Index 0 of every TensorArray holds the all-zero
        # initial state; step t of the loop writes index t.
        # clear_after_read=False because entries are read more than once.
        zero_state = tf.zeros(shape=[self._batch_size, self.num_lstm_units],
                              dtype=tf.float32)
        h_s1 = tf.TensorArray(dtype=tf.float32,
                              size=0,
                              dynamic_size=True,
                              clear_after_read=False)
        c_s1 = tf.TensorArray(dtype=tf.float32,
                              size=0,
                              dynamic_size=True,
                              clear_after_read=False)
        h_s1 = h_s1.write(0, zero_state)
        c_s1 = c_s1.write(0, zero_state)
        h_s2 = tf.TensorArray(dtype=tf.float32,
                              size=0,
                              dynamic_size=True,
                              clear_after_read=False)
        c_s2 = tf.TensorArray(dtype=tf.float32,
                              size=0,
                              dynamic_size=True,
                              clear_after_read=False)
        h_s2 = h_s2.write(0, zero_state)
        c_s2 = c_s2.write(0, zero_state)
        h_r = tf.TensorArray(dtype=tf.float32,
                             size=0,
                             dynamic_size=True,
                             clear_after_read=False)
        c_r = tf.TensorArray(dtype=tf.float32,
                             size=0,
                             dynamic_size=True,
                             clear_after_read=False)
        h_r = h_r.write(0, zero_state)
        c_r = c_r.write(0, zero_state)

        # Shared initializer for the attention and fully-connected weights.
        self._initializer = tf.truncated_normal_initializer(stddev=0.1)

        # The loop counter is 1-based and runs through max_length inclusive.
        t = tf.constant(1, dtype=tf.int32)
        # TO DO: check this
        c = lambda x, hs1, cs1, hs2, cs2, hr, cr: tf.less(
            x, self._max_length + 1)
        b = lambda x, hs1, cs1, hs2, cs2, hr, cr: self._match_step(
            x, hs1, cs1, hs2, cs2, hr, cr)
        t, self.h_s1, self.c_s1, self.h_s2, self.c_s2, self.h_r, self.c_r = tf.while_loop(
            cond=c, body=b, loop_vars=(t, h_s1, c_s1, h_s2, c_s2, h_r, c_r))

        # Axis 1 has max_len + 1 entries because index 0 is the zero state.
        self.h_r = tf.transpose(
            self.h_r.stack(),
            [1, 0, 2])  # shape: [batch_size, max_len, num_lstm_units]
        # get final states. don't need to subtract seqlen by 1 because we take zero states also in count
        self.final_h_r = tf.gather_nd(
            self.h_r,
            tf.stack([
                tf.range(self._batch_size),
                tf.maximum(self.texts_length1, self.texts_length2)
            ],
                     axis=1))  # shape: [batch_size, num_lstm_units]

        with tf.variable_scope('fully_connect'):
            self.w_fc = tf.get_variable(shape=[num_lstm_units, num_labels],
                                        initializer=self._initializer,
                                        name='w_fc')
            self.b_fc = tf.get_variable(shape=[num_labels],
                                        initializer=self._initializer,
                                        name='b_fc')
            self.logits = tf.matmul(self.final_h_r, self.w_fc) + self.b_fc

        # self.loss is the cross entropy summed over the batch; gradients
        # below are taken of the per-example mean.
        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                           logits=self.logits),
            name='loss')
        mean_loss = self.loss / tf.cast(tf.shape(self.labels)[0],
                                        dtype=tf.float32)
        predict_labels = tf.argmax(self.logits, 1, 'predict_labels')
        # Number of correct predictions in the batch (a count, not a ratio).
        self.accuracy = tf.reduce_sum(tf.cast(
            tf.equal(self.labels, predict_labels), tf.int64),
                                      name='accuracy')

        self.params = tf.trainable_variables()
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(mean_loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)

    def _match_step(self, t, h_s1, c_s1, h_s2, c_s2, h_r, c_r):
        """Body of the while loop: advance all three LSTMs by one time step.

        Reads ``self.embed_input1`` / ``self.embed_input2`` (time-major
        embeddings, [max_length, batch_size, embed_units]) and the extended
        masks from the instance.

        :param t: time index (starts from 1); every array is read at t - 1
            and written at t
        :param h_s1: TensorArray, hidden states of text1 up to the previous
            step, tensors of size [batch_size, num_lstm_units]
        :param c_s1: TensorArray, cell states of text1, same layout as h_s1
        :param h_s2: similar to h_s1, for text2
        :param c_s2: similar to c_s1, for text2
        :param h_r: TensorArray, hidden states of lstm_r up to the previous
            step, tensors of size [batch_size, num_lstm_units]
        :param c_r: TensorArray, cell states of lstm_r
        :return: (t + 1, h_s1, c_s1, h_s2, c_s2, h_r, c_r) with index t of
            every array filled in
        """
        # lstms calculate first
        # Each text LSTM sees its token embedding concatenated with the
        # previous lstm_r hidden state, so the width is the embedding width
        # plus num_lstm_units.
        inputs_s1 = tf.concat(
            [self.embed_input1[t - 1, :, :],
             h_r.read(t - 1)], axis=1)
        inputs_s2 = tf.concat(
            [self.embed_input2[t - 1, :, :],
             h_r.read(t - 1)], axis=1)
        # The second scope uses reuse=True so both texts share one set of
        # lstm_s weights.
        with tf.variable_scope('lstm_s'):
            newc_s1, newh_s1 = self._lstm(inputs=inputs_s1,
                                          states=(c_s1.read(t - 1),
                                                  h_s1.read(t - 1)))
        with tf.variable_scope('lstm_s', reuse=True):
            newc_s2, newh_s2 = self._lstm(inputs=inputs_s2,
                                          states=(c_s2.read(t - 1),
                                                  h_s2.read(t - 1)))
        c_s1 = c_s1.write(t, newc_s1)
        h_s1 = h_s1.write(t, newh_s1)
        c_s2 = c_s2.write(t, newc_s2)
        h_s2 = h_s2.write(t, newh_s2)

        # calculate attention
        # Attention weights are likewise shared between the two directions.
        with tf.variable_scope('attention'):
            at1 = self._attention(t, h_s1, h_s2, h_r,
                                  self.mask1_extended[:, :t + 1])
        with tf.variable_scope('attention', reuse=True):
            at2 = self._attention(t, h_s2, h_s1, h_r,
                                  self.mask2_extended[:, :t + 1])

        # lstmr update
        inputs_r = tf.concat([at1, at2],
                             axis=1)  # shape: [batch_size, num_lstm_units * 2]
        with tf.variable_scope('lstm_r'):
            newc_r, newh_r = self._lstm(inputs=inputs_r,
                                        states=(c_r.read(t - 1),
                                                h_r.read(t - 1)))
        c_r = c_r.write(t, newc_r)
        h_r = h_r.write(t, newh_r)

        t = tf.add(t, 1)
        return t, h_s1, c_s1, h_s2, c_s2, h_r, c_r

    def _attention(self, t, h_self, h_other, h_r, mask_self):
        """Attend over the hidden-state history of one text.

        Creates (or, under a reusing variable scope, fetches) the variables
        ``W_e``, ``W_other``, ``W_self`` and ``W_attention``.

        :param t: time index (starts from 1)
        :param h_self: TensorArray, hidden states of 'self' including step
            t, i.e. t + 1 tensors of size [batch_size, num_lstm_units]
        :param h_other: TensorArray, hidden states of the other text, size
            and tensor shape same as above; only entry t is read
        :param h_r: TensorArray, hidden states of lstm_r, t tensors of
            shape [batch_size, num_lstm_units]; only entry t - 1 is read
        :param mask_self: 0/1 float mask applied to the exponentiated
            scores; the caller passes the first t + 1 columns of the
            extended mask
        :return: an attention-based representation of 'self',
            shape: [batch_size, num_lstm_units]
        """
        We = tf.get_variable(shape=[self.num_lstm_units, 1],
                             initializer=self._initializer,
                             name='W_e')
        Wo = tf.get_variable(shape=[self.num_lstm_units, self.num_lstm_units],
                             initializer=self._initializer,
                             name='W_other')
        Ws = tf.get_variable(shape=[self.num_lstm_units, self.num_lstm_units],
                             initializer=self._initializer,
                             name='W_self')
        Wa = tf.get_variable(shape=[self.num_lstm_units, self.num_lstm_units],
                             initializer=self._initializer,
                             name='W_attention')

        # Score every stored 'self' state against the current 'other' state
        # and the previous lstm_r state; the two matmul terms broadcast over
        # the leading (time) axis of the stacked history.
        etj = tf.einsum('ijk,kl->ijl', h_self.stack(), Ws) + tf.matmul(
            h_other.read(t), Wo) + tf.matmul(h_r.read(t - 1), Wa)
        etj = tf.transpose(etj,
                           [1, 0, 2])  # shape: [batch_size, t, num_lstm_units]
        etj = tf.squeeze(tf.einsum('ijk,kl->ijl', tf.tanh(etj), We),
                         axis=2)  # shape: [batch_size, t]
        # Masked softmax written out by hand: exponentiate, zero the masked
        # positions, then divide by the per-row sum.
        etj = tf.exp(etj) * mask_self
        etj_sums = tf.reduce_sum(etj, axis=1)
        # The double transpose makes the per-row sums broadcast correctly.
        atj = tf.transpose(tf.transpose(etj) / etj_sums)
        # Weight the history by the attention and sum over time.
        at = tf.transpose(
            tf.transpose(h_self.stack(), [2, 1, 0]) * atj, [1, 2, 0])
        at = tf.reduce_sum(at, axis=1)  # shape: [batch_size, num_lstm_units]
        return at

    def _lstm(self, inputs, states):
        """Apply one step of a hand-written LSTM cell.

        Variables are created with ``tf.get_variable``, so the enclosing
        variable scope decides which weight set is used and whether it is
        reused. The forget-gate bias is initialised to 1.0, the other
        biases to 0.0; all weight matrices use an orthogonal initializer.

        :param inputs: tensor whose last dimension is statically known
        :param states: tuple (c, h) of the previous cell and hidden state
        :return: tuple (new_c, new_h)
        """
        c, h = states
        # input gate
        _wi = tf.get_variable('lstm_cell_wi',
                              dtype=tf.float32,
                              shape=[
                                  inputs.get_shape()[-1] + h.get_shape()[-1],
                                  self.num_lstm_units
                              ],
                              initializer=tf.orthogonal_initializer())
        _bi = tf.get_variable('lstm_cell_bi',
                              dtype=tf.float32,
                              shape=[self.num_lstm_units],
                              initializer=tf.constant_initializer(0.0))
        # output gate
        _wo = tf.get_variable('lstm_cell_wo',
                              dtype=tf.float32,
                              shape=[
                                  inputs.get_shape()[-1] + h.get_shape()[-1],
                                  self.num_lstm_units
                              ],
                              initializer=tf.orthogonal_initializer())
        _bo = tf.get_variable('lstm_cell_bo',
                              dtype=tf.float32,
                              shape=[self.num_lstm_units],
                              initializer=tf.constant_initializer(0.0))
        # forget gate
        _wf = tf.get_variable('lstm_cell_wf',
                              dtype=tf.float32,
                              shape=[
                                  inputs.get_shape()[-1] + h.get_shape()[-1],
                                  self.num_lstm_units
                              ],
                              initializer=tf.orthogonal_initializer())
        _bf = tf.get_variable('lstm_cell_bf',
                              dtype=tf.float32,
                              shape=[self.num_lstm_units],
                              initializer=tf.constant_initializer(1.0))
        # candidate cell value
        _wc = tf.get_variable('lstm_cell_wc',
                              dtype=tf.float32,
                              shape=[
                                  inputs.get_shape()[-1] + h.get_shape()[-1],
                                  self.num_lstm_units
                              ],
                              initializer=tf.orthogonal_initializer())
        _bc = tf.get_variable('lstm_cell_bc',
                              dtype=tf.float32,
                              shape=[self.num_lstm_units],
                              initializer=tf.constant_initializer(0.0))

        # Every gate acts on the concatenation [inputs, h].
        i = tf.nn.sigmoid(tf.matmul(tf.concat([inputs, h], 1), _wi) + _bi)
        o = tf.nn.sigmoid(tf.matmul(tf.concat([inputs, h], 1), _wo) + _bo)
        f = tf.nn.sigmoid(tf.matmul(tf.concat([inputs, h], 1), _wf) + _bf)
        _c = tf.tanh(tf.matmul(tf.concat([inputs, h], 1), _wc) + _bc)

        new_c = f * c + i * _c
        new_h = o * tf.tanh(new_c)
        return new_c, new_h

    def print_parameters(self):
        """Print the name and static shape of every trainable variable."""
        for item in self.params:
            print('%s: %s' % (item.name, item.get_shape()))

    def train_step(self, session, data):
        """Run one optimisation step.

        :param session: the ``tf.Session`` to run in
        :param data: dict with keys 'texts1', 'texts2', 'texts_length1',
            'texts_length2' and 'labels' holding the placeholder values
        :return: evaluated [loss, accuracy, update, final_h_r, logits]
        """
        input_feed = {
            self.texts1: data['texts1'],
            self.texts2: data['texts2'],
            self.texts_length1: data['texts_length1'],
            self.texts_length2: data['texts_length2'],
            self.labels: data['labels']
        }
        # for debug
        # output_feed = [self.loss, self.accuracy, self.update, self.embed_input1, self.embed_input2, self.h_r, self.final_h_r]
        output_feed = [
            self.loss, self.accuracy, self.update, self.final_h_r, self.logits
        ]
        return session.run(output_feed, input_feed)
class RNN(object):
    """Stacked basic-RNN text classifier built as a TensorFlow 1.x graph.

    String tokens are mapped to ids through a ``MutableHashTable``, embedded,
    and fed to ``num_layers`` stacked ``BasicRNNCell`` layers (optionally
    wrapped in output dropout). The final state of the top layer is linearly
    projected to ``num_labels`` logits.
    """

    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_layers,
                 num_labels,
                 embed,
                 learning_rate=0.005,
                 max_gradient_norm=5.0,
                 prob=1):
        """Build placeholders, the network, the loss and the training op.

        Args:
            num_symbols: vocabulary size; only used when ``embed`` is None.
            num_embed_units: embedding width; only used when ``embed`` is None.
            num_units: hidden size of every RNN layer.
            num_layers: number of stacked RNN layers.
            num_labels: number of output classes.
            embed: initial value for the embedding table (pre-trained word
                vectors), or None to create a ``[num_symbols,
                num_embed_units]`` table with the default initializer.
            learning_rate: initial value of the SGD learning-rate variable.
            max_gradient_norm: threshold for global-norm gradient clipping.
            prob: output keep probability; a ``DropoutWrapper`` is added
                only when this Python value is below 1.
        """
        self.texts = tf.placeholder(tf.string, (None, None),
                                    'texts')  # shape: [batch, length]
        # Sequence lengths are integer counts; they are passed to
        # dynamic_rnn as ``sequence_length``, so declare them as an int32
        # vector rather than a float tensor of unknown shape.
        self.texts_length = tf.placeholder(tf.int32, (None,),
                                           'texts_length')  # shape: [batch]
        self.labels = tf.placeholder(tf.int64, (None,),
                                     'labels')  # shape: [batch]

        # build the vocab table (string to index)
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)

        # initialize the training process
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        # Stored for reference; the dropout wrapper below uses the Python
        # value ``prob`` directly, not this variable.
        self.prob = tf.Variable(float(prob), trainable=False, dtype=tf.float32)

        self.index_input = self.symbol2index.lookup(
            self.texts)  # shape: [batch, length]

        # build the embedding table (index to vector)
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable('embed',
                                         [num_symbols, num_embed_units],
                                         tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable('embed',
                                         dtype=tf.float32,
                                         initializer=embed)

        self.embed_input = tf.nn.embedding_lookup(
            self.embed,
            self.index_input)  # shape: [batch, length, num_embed_units]

        # To obtain the GRU or LSTM variant of this model, replace
        # BasicRNNCell below with GRUCell or BasicLSTMCell.
        cell = MultiRNNCell(
            [BasicRNNCell(num_units) for _ in range(num_layers)])
        if prob < 1:
            cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=prob)
        outputs, states = dynamic_rnn(cell,
                                      self.embed_input,
                                      self.texts_length,
                                      dtype=tf.float32,
                                      scope="rnn")

        # Final state of the top layer. For an LSTM cell the state is a
        # (c, h) pair, so ``states[-1][1]`` would be needed instead.
        vectors = states[-1]

        with tf.variable_scope('logits'):
            weight = tf.get_variable("weights", [num_units, num_labels])
            bias = tf.get_variable("biases", [num_labels])
            # linear map: [batch, num_units] -> [batch, num_labels]
            logits = tf.matmul(vectors, weight) + bias

        # Cross entropy summed (not averaged) over the batch.
        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                           logits=logits),
            name='loss')
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        # Number of correct predictions in the batch (a count, not a ratio).
        self.accuracy = tf.reduce_sum(tf.cast(
            tf.equal(self.labels, predict_labels), tf.int32),
                                      name='accuracy')

        self.params = tf.trainable_variables()

        # calculate the gradient of parameters
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=5,
                                    pad_step_number=True)

    def print_parameters(self):
        """Print the name and static shape of every trainable variable."""
        for item in self.params:
            print('%s: %s' % (item.name, item.get_shape()))

    def train_step(self, session, data):
        """Run one optimisation step.

        Args:
            session: the ``tf.Session`` to run in.
            data: dict with keys ``'texts'``, ``'texts_length'`` and
                ``'labels'`` holding the values for the three placeholders.

        Returns:
            The evaluated ``[loss, accuracy, gradient_norm, update]`` list.
        """
        input_feed = {
            self.texts: data['texts'],
            self.texts_length: data['texts_length'],
            self.labels: data['labels']
        }
        output_feed = [
            self.loss, self.accuracy, self.gradient_norm, self.update
        ]
        return session.run(output_feed, input_feed)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_layers,
             num_labels,
             embed,
             learning_rate=0.001,
             max_gradient_norm=5.0,
             learning_rate_decay_factor=0.1):
    """Build a bidirectional RNN text classifier (TF 1.x graph).

    With ``num_layers == 1`` the encoder is a single bidirectional LSTM;
    otherwise it is ``num_layers`` stacked bidirectional GRU layers. Every
    cell is wrapped in output dropout driven by the ``output_keep_prob``
    placeholder, which must be fed on every run.

    Args:
        num_symbols: vocabulary size; only used when ``embed`` is None.
        num_embed_units: embedding width; only used when ``embed`` is None.
        num_units: hidden size of every recurrent cell.
        num_layers: number of recurrent layers per direction.
        num_labels: number of output classes.
        embed: initial value for the embedding table (pre-trained word
            vectors), or None to create a ``[num_symbols,
            num_embed_units]`` table with the default initializer.
        learning_rate: initial value of the SGD learning-rate variable.
        max_gradient_norm: threshold for global-norm gradient clipping.
        learning_rate_decay_factor: factor the learning rate is multiplied
            by each time ``learning_rate_update_op`` is run.
    """
    self.texts = tf.placeholder(dtype=tf.string,
                                shape=[None, None])  # shape: batch*len
    self.texts_length = tf.placeholder(dtype=tf.int32,
                                       shape=[None])  # shape: batch
    self.labels = tf.placeholder(dtype=tf.int32,
                                 shape=[None])  # shape: batch
    self.output_keep_prob = tf.placeholder(dtype=tf.float32, shape=[])

    # build the vocab table (string to index)
    self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=UNK_ID,
                                         shared_name="in_table",
                                         name="in_table",
                                         checkpoint=True)

    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.learning_rate_update_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)

    self.index_input = self.symbol2index.lookup(self.texts)  # batch*len

    # build the embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('embed',
                                     [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)

    self.embed_input = tf.nn.embedding_lookup(
        self.embed, self.index_input)  # batch*len*embed_unit

    if num_layers == 1:
        # final model: one bidirectional LSTM layer with output dropout
        cell = tf.nn.rnn_cell.DropoutWrapper(
            BasicLSTMCell(num_units),
            output_keep_prob=self.output_keep_prob)
        cell_bw = tf.nn.rnn_cell.DropoutWrapper(
            BasicLSTMCell(num_units),
            output_keep_prob=self.output_keep_prob)
        outputs, states = bidirectional_dynamic_rnn(cell,
                                                    cell_bw,
                                                    self.embed_input,
                                                    self.texts_length,
                                                    dtype=tf.float32,
                                                    scope="rnn")
        # Each direction's LSTM state is a (c, h) pair; sum the two h.
        states = states[0][1] + states[1][1]
    else:
        cells = []
        cells_bw = []
        for _ in range(num_layers):
            # The keep probability lives on the instance; the bare name
            # ``output_keep_prob`` is not defined in this scope.
            cell = tf.nn.rnn_cell.DropoutWrapper(
                GRUCell(num_units),
                output_keep_prob=self.output_keep_prob)
            cells.append(cell)
            cell_bw = tf.nn.rnn_cell.DropoutWrapper(
                GRUCell(num_units),
                output_keep_prob=self.output_keep_prob)
            cells_bw.append(cell_bw)
        cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
        cell_bw = tf.contrib.rnn.MultiRNNCell(cells_bw, state_is_tuple=True)
        outputs, states = bidirectional_dynamic_rnn(cell,
                                                    cell_bw,
                                                    self.embed_input,
                                                    self.texts_length,
                                                    dtype=tf.float32,
                                                    scope="stacked_rnn")
        # Sum the top-layer GRU states of the two directions.
        states = states[0][num_layers - 1] + states[1][num_layers - 1]

    # Output projection; the weight stddev is sqrt(2 / (fan_in + fan_out)).
    self.w1 = tf.Variable(
        tf.random_normal(shape=[num_units, num_labels],
                         stddev=tf.sqrt(2.0 / (num_units + num_labels))))
    self.b1 = tf.Variable(tf.constant(0.0, shape=[num_labels]))
    logits = tf.matmul(states, self.w1) + self.b1

    # self.loss is the cross entropy summed over the batch; gradients are
    # taken of the per-example mean.
    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                       logits=logits),
        name='loss')
    mean_loss = self.loss / tf.cast(tf.shape(self.labels)[0],
                                    dtype=tf.float32)
    # output_type=tf.int32 keeps the predictions comparable with the int32
    # labels placeholder.
    self.predict_labels = tf.argmax(logits,
                                    1,
                                    'predict_labels',
                                    output_type=tf.int32)
    # Number of correct predictions in the batch (a count, not a ratio).
    self.accuracy = tf.reduce_sum(tf.cast(
        tf.equal(self.labels, self.predict_labels), tf.int32),
                                  name='accuracy')

    self.params = tf.trainable_variables()

    # calculate the gradient of parameters
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(mean_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    tf.summary.scalar('loss/step', self.loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)

    self.merged_summary_op = tf.summary.merge_all()

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
class RNN(object):
    """Sentence-pair classifier built as a TensorFlow 1.x graph.

    Both token sequences are embedded and read by one shared LSTM cell
    (``lstm_s``).  A second cell (``lstm_r``) is then stepped ``max_length``
    times inside a ``tf.while_loop``; at each step :meth:`attention` feeds it
    attention-weighted sums of the two encodings.  The final hidden state of
    ``lstm_r`` is projected to ``num_labels`` logits and trained with plain
    gradient descent under global-norm clipping.
    """

    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_labels,
                 batch_size,
                 embed,
                 learning_rate=0.001,
                 max_gradient_norm=5.0,
                 learning_rate_decay_factor=0.9):
        """Build the whole training graph.

        Args:
            num_symbols: vocabulary size; only used when ``embed`` is None.
            num_embed_units: embedding width; only used when ``embed`` is None.
            num_units: hidden size of both LSTM cells.
            num_labels: number of output classes.
            batch_size: static batch size baked into the placeholders.
            embed: initial embedding matrix, or None for a fresh variable.
            learning_rate: initial value of the (non-trainable) learning rate.
            max_gradient_norm: threshold for ``tf.clip_by_global_norm``.
            learning_rate_decay_factor: multiplier applied by
                ``learning_rate_decay_op``.
        """
        # --- placeholders -------------------------------------------------
        self.texts1 = tf.placeholder(tf.string, [batch_size, None],
                                     name='texts1')
        self.texts2 = tf.placeholder(tf.string, [batch_size, None],
                                     name='texts2')  # shape: batch*len
        self.texts_length1 = tf.placeholder(
            tf.int32, [batch_size], name='texts_length1')  # shape: batch
        self.texts_length2 = tf.placeholder(tf.int32, [batch_size],
                                            name='texts_length2')
        # Number of attention steps run by the while loop below.
        self.max_length = tf.placeholder(tf.int32, name='max_length')
        self.labels = tf.placeholder(tf.int64, [batch_size],
                                     name='labels')  # shape: batch
        # Fed by train_step; not consumed by any op built in this class.
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        self.embed_units = num_embed_units
        self.num_units = num_units
        self.batch_size = batch_size
        self._initializer = tf.truncated_normal_initializer(stddev=0.1)

        # Vocab table (string to index); unknown strings map to UNK_ID.
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)

        # --- training bookkeeping ----------------------------------------
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)

        self.index_input1 = self.symbol2index.lookup(self.texts1)  # batch*len
        self.index_input2 = self.symbol2index.lookup(self.texts2)

        # One mask shared by both sentences, based on the longer of the two.
        self.long_length = tf.maximum(self.texts_length1, self.texts_length2)
        print(self.long_length.get_shape())
        self.mask_table = tf.sequence_mask(self.long_length, dtype=tf.float32)

        # --- embedding table (index to vector) ---------------------------
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable('embed',
                                         [num_symbols, num_embed_units],
                                         tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable('embed',
                                         dtype=tf.float32,
                                         initializer=embed)

        self.embed_input1 = tf.nn.embedding_lookup(
            self.embed, self.index_input1)  # batch*len*embed_unit
        self.embed_input2 = tf.nn.embedding_lookup(self.embed,
                                                   self.index_input2)

        # --- recurrent cells ---------------------------------------------
        with tf.variable_scope('lstm_s'):
            self.lstm_s = tf.contrib.rnn.LSTMCell(
                num_units=num_units,
                initializer=tf.orthogonal_initializer,
                forget_bias=0)
        with tf.variable_scope('lstm_r'):
            self.lstm_r = tf.contrib.rnn.LSTMCell(
                num_units=num_units,
                initializer=tf.orthogonal_initializer,
                forget_bias=0)

        # The same cell object reads both sentences, so its weights are shared.
        out_s1, state_s1 = dynamic_rnn(self.lstm_s,
                                       self.embed_input1,
                                       self.texts_length1,
                                       dtype=tf.float32,
                                       scope='rnn')
        out_s2, state_s2 = dynamic_rnn(self.lstm_s,
                                       self.embed_input2,
                                       self.texts_length2,
                                       dtype=tf.float32,
                                       scope='rnn')
        self.h_s1 = out_s1
        self.h_s2 = out_s2

        # Pre-compute the W_s projection of every encoder output once, then
        # lay it out as (len, num_units, batch) for use in attention().
        reshaped_s1 = tf.reshape(self.h_s1, [-1, self.num_units])
        reshaped_s2 = tf.reshape(self.h_s2, [-1, self.num_units])
        with tf.variable_scope('Attn_'):
            W_s = tf.get_variable(shape=[self.num_units, self.num_units],
                                  initializer=self._initializer,
                                  name='W_s')
        self.s_1 = tf.matmul(reshaped_s1, W_s)
        self.s_2 = tf.matmul(reshaped_s2, W_s)
        self.s_1 = tf.transpose(
            tf.reshape(self.s_1, [self.batch_size, -1, self.num_units]),
            [1, 2, 0])
        self.s_2 = tf.transpose(
            tf.reshape(self.s_2, [self.batch_size, -1, self.num_units]),
            [1, 2, 0])

        # --- attention loop ----------------------------------------------
        i = tf.constant(0)
        state_r = self.lstm_r.zero_state(batch_size=batch_size,
                                         dtype=tf.float32)

        def c(t, sr):
            return tf.less(t, self.max_length)

        def b(t, sr):
            return self.attention(t, sr)

        i, state_r = tf.while_loop(cond=c, body=b, loop_vars=(i, state_r))

        # --- classifier ---------------------------------------------------
        with tf.variable_scope('fully_connect'):
            w_fc = tf.get_variable(shape=[self.num_units, num_labels],
                                   initializer=self._initializer,
                                   name='w_fc')
            b_fc = tf.get_variable(shape=[num_labels],
                                   initializer=self._initializer,
                                   name='b_fc')
        logits = tf.matmul(state_r.h, w_fc) + b_fc

        # self.loss is the summed loss (reported); mean_loss is optimized.
        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.labels, logits=logits),
            name='loss')
        mean_loss = self.loss / \
            tf.cast(tf.shape(self.labels)[0], dtype=tf.float32)
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        # Count (not ratio) of correct predictions in the batch.
        self.accuracy = tf.reduce_sum(tf.cast(
            tf.equal(self.labels, predict_labels), tf.int64),
            name='accuracy')

        self.params = tf.trainable_variables()

        for item in tf.global_variables():
            print(item)

        # calculate the gradient of parameters
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(mean_loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        tf.summary.scalar('loss/step', self.loss)
        for each in tf.trainable_variables():
            tf.summary.histogram(each.name, each)
        self.merged_summary_op = tf.summary.merge_all()

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)

    def attention(self, t, hr):
        """Run one step of the attention loop.

        Args:
            t: scalar int tensor, the current step index.
            hr: current state of ``lstm_r`` (its ``.h`` field is read).

        Returns:
            ``(t + 1, new_state)`` -- the loop variables for the next step.
        """
        with tf.variable_scope('Attn_'):
            W_o = tf.get_variable(shape=[self.num_units, self.num_units],
                                  initializer=self._initializer,
                                  name='W_o')
            W_e = tf.get_variable(shape=[self.num_units, 1],
                                  initializer=self._initializer,
                                  name='W_e')
            W_a = tf.get_variable(shape=[self.num_units, self.num_units],
                                  initializer=self._initializer,
                                  name='W_a')

        # Scores over sentence 1 are conditioned on step t of sentence 2,
        # and vice versa; both also see the previous lstm_r hidden state.
        e1_tj = tf.tanh(self.s_1 + tf.transpose(
            tf.matmul(self.h_s2[:, t, :], W_o) + tf.matmul(hr.h, W_a)))
        e2_tj = tf.tanh(self.s_2 + tf.transpose(
            tf.matmul(self.h_s1[:, t, :], W_o) + tf.matmul(hr.h, W_a)))
        print(e1_tj.get_shape())  # (max_len, num_units, batch_size)

        e1_tj = tf.matmul(
            tf.reshape(tf.transpose(e1_tj, [2, 0, 1]), [-1, self.num_units]),
            W_e)
        e2_tj = tf.matmul(
            tf.reshape(tf.transpose(e2_tj, [2, 0, 1]), [-1, self.num_units]),
            W_e)
        print(e1_tj.get_shape())  # (max_len*batch_size, 1)

        e1_tj = tf.reshape(e1_tj, [self.batch_size, -1])
        e2_tj = tf.reshape(e2_tj, [self.batch_size, -1])
        print(e1_tj.get_shape())  # (batch_size, max_len)

        # Masked softmax: exponentiate, zero padded positions, normalize.
        alpha1_tj = tf.exp(e1_tj) * self.mask_table
        alpha2_tj = tf.exp(e2_tj) * self.mask_table
        alpha1_tj = tf.transpose(alpha1_tj) / tf.reduce_sum(alpha1_tj, 1)
        alpha2_tj = tf.transpose(alpha2_tj) / tf.reduce_sum(alpha2_tj, 1)
        print(alpha1_tj.get_shape())  # (max_len, batch_size)

        a1tj = alpha1_tj * tf.transpose(self.h_s1, [2, 1, 0])
        a2tj = alpha2_tj * tf.transpose(self.h_s2, [2, 1, 0])
        print(a1tj.get_shape())  # (num_units, max_len, batch_size)

        a1tj = tf.reduce_sum(a1tj, 1)
        a2tj = tf.reduce_sum(a2tj, 1)
        print(a1tj.get_shape())  # (num_units, batch_size)

        r_t = tf.transpose(tf.concat([a1tj, a2tj], 0))
        print(r_t.get_shape())  # (batch_size, 2*num_units)

        with tf.variable_scope('lstm_r'):
            _, hr = self.lstm_r(inputs=r_t, state=hr)
        t = tf.add(t, 1)
        return t, hr

    def print_parameters(self):
        """Print ``name: shape`` for every entry of ``self.params``."""
        for item in self.params:
            print('%s: %s' % (item.name, item.get_shape()))

    def train_step(self, session, data, summary=False):
        """Run one optimization step.

        Args:
            session: object whose ``run(fetches, feed_dict)`` is called.
            data: dict with keys ``texts1``, ``texts2``, ``texts_length1``,
                ``texts_length2``, ``max_length``, ``labels``, ``keep_prob``.
            summary: when true, the merged summary op is fetched as well.

        Returns:
            Whatever ``session.run`` returns for the fetch list
            ``[loss, accuracy, gradient_norm, update]`` (plus the merged
            summary as a fifth entry when ``summary`` is true).
        """
        input_feed = {
            self.texts1: data['texts1'],
            self.texts2: data['texts2'],
            self.texts_length1: data['texts_length1'],
            self.texts_length2: data['texts_length2'],
            self.max_length: data['max_length'],
            self.labels: data['labels'],
            self.keep_prob: data['keep_prob']
        }
        output_feed = [
            self.loss,
            self.accuracy,
            self.gradient_norm,
            self.update
        ]
        if summary:
            output_feed.append(self.merged_summary_op)
        return session.run(output_feed, input_feed)
def __init__(
        self,
        num_symbol,
        num_units,
        num_layers,
        embed,
        max_length=60,
        learning_rate=0.0001,
        max_gradient_norm=5.0,
        output_alignments=False):
    """Build an attention encoder/decoder graph for training and inference.

    Args:
        num_symbol: vocabulary size.
        num_units: hidden size of every LSTM layer.
        num_layers: number of stacked LSTM layers in encoder and decoder.
        embed: initial value of the word-embedding variable.
        max_length: maximum number of steps decoded at inference time.
        learning_rate: Adam learning rate.
        max_gradient_norm: threshold for ``tf.clip_by_global_norm``.
        output_alignments: forwarded to ``attention_decoder_fn_train``
            (original note: whether attention weights are kept).
    """
    # Hash table mapping word strings to indices.
    self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=UNK_ID,
                                         shared_name="in_table",
                                         name="in_table",
                                         checkpoint=True)
    # Hash table mapping indices back to word strings.
    self.index2symbol = MutableHashTable(key_dtype=tf.int64,
                                         value_dtype=tf.string,
                                         default_value='_UNK',
                                         shared_name="out_table",
                                         name="out_table",
                                         checkpoint=True)

    # Model inputs.
    self.posts_string = tf.placeholder(
        name="posts_string", shape=(None, None),
        dtype=tf.string)  # [batch_size, encoder_len]
    self.posts_len = tf.placeholder(name="posts_len",
                                    shape=(None),
                                    dtype=tf.int32)  # [batch_size]
    self.responses_string = tf.placeholder(
        name="responses_string", shape=(None, None),
        dtype=tf.string)  # [batch_size, decoder_len]
    self.responses_len = tf.placeholder(name="responses_len",
                                        shape=(None),
                                        dtype=tf.int32)  # [batch_size]
    self.embed = tf.get_variable("word_embed",
                                 dtype=tf.float32,
                                 initializer=embed)

    batch_size, _ = tf.unstack(tf.shape(self.posts_string))
    decoder_len = tf.shape(self.responses_string)[1]

    # Index representation of posts and responses.
    self.posts_index = self.symbol2index.lookup(
        self.posts_string)  # [batch_size, encoder_len]
    self.responses_index = self.symbol2index.lookup(
        self.responses_string)  # [batch_size, decoder_len]
    # Decoder input: GO_ID followed by the response without its last column.
    self.responses_input_index = tf.concat([
        tf.ones((batch_size, 1), dtype=tf.int64) * GO_ID,
        tf.split(self.responses_index, [decoder_len - 1, 1], axis=1)[0]
    ], axis=1)

    # Encoder and decoder inputs.  The lookups go through the self.embed
    # variable (not the raw `embed` initializer) so that gradients reach the
    # embedding and training stays consistent with the inference decoder,
    # which is also handed self.embed.
    self.encoder_input = tf.nn.embedding_lookup(
        self.embed,
        self.posts_index)  # [batch_size, encoder_len, embedding_size]
    self.decoder_input = tf.nn.embedding_lookup(
        self.embed, self.responses_input_index
    )  # [batch_size, decoder_len, embedding_size]
    # 1.0 for positions < responses_len, 0.0 afterwards.
    self.decoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.responses_len - 1, decoder_len),
                  reverse=True,
                  axis=1), [-1, decoder_len])  # [batch_size, decoder_len]

    encoder_cell = MultiRNNCell(
        [LSTMCell(num_units) for _ in range(num_layers)])
    decoder_cell = MultiRNNCell(
        [LSTMCell(num_units) for _ in range(num_layers)])

    encoder_output, encoder_state = tf.nn.dynamic_rnn(encoder_cell,
                                                      self.encoder_input,
                                                      self.posts_len,
                                                      dtype=tf.float32,
                                                      scope="encoder")

    output_fn, sequence_loss = output_projection_layer(
        num_units, num_symbol)

    # Training decoder.
    # NOTE(review): the loss is kept inside the "decoder" scope on the
    # assumption that it shares projection variables with output_fn, which
    # the inference decoder uses under the same scope with reuse=True.
    with tf.variable_scope("decoder"):
        keys, values, attention_score_fn, attention_construct_fn = \
            prepare_attention(encoder_output, num_units, reuse=False)
        decoder_fn_train = attention_decoder_fn_train(
            encoder_state,
            keys,
            values,
            attention_score_fn,
            attention_construct_fn,
            output_alignments=output_alignments,
            decoder_len=decoder_len)
        self.decoder_output, _, _ = dynamic_rnn_decoder(
            decoder_cell,
            decoder_fn_train,
            inputs=self.decoder_input,
            sequence_length=self.responses_len,
            scope="decoder_rnn")
        self.total_loss, self.loss = sequence_loss(self.decoder_output,
                                                   self.responses_index,
                                                   self.decoder_mask)

    # Inference decoder, reusing the variables created above.
    with tf.variable_scope("decoder", reuse=True):
        # Rebuild the attention functions against the shared variables.
        keys, values, attention_score_fn, attention_construct_fn = \
            prepare_attention(encoder_output, num_units, reuse=True)
        decoder_fn_inference = attention_decoder_fn_inference(
            output_fn, encoder_state, keys, values, attention_score_fn,
            attention_construct_fn, self.embed, GO_ID, EOS_ID, max_length,
            num_symbol)
        # decoder_distribution: [batch_size, decoder_len, num_symbol],
        #   the prediction distribution before softmax (per original note).
        # output_ids_ta: decoder_len entries of shape [batch_size].
        self.decoder_distribution, _, output_ids_ta = dynamic_rnn_decoder(
            decoder_cell, decoder_fn_inference, scope="decoder_rnn")

    output_len = tf.shape(self.decoder_distribution)[1]  # decoder_len
    self.output_ids = tf.transpose(
        output_ids_ta.gather(
            tf.range(output_len)))  # [batch_size, decoder_len]

    # Clip the decoded ids into [0, num_symbol] before the table lookup.
    self.word_ids = tf.cast(
        tf.clip_by_value(self.output_ids, 0, num_symbol),
        tf.int64)  # [batch_size, decoder_len]
    self.words = self.index2symbol.lookup(self.word_ids)

    self.global_step = tf.Variable(0, trainable=False, name="global_step")
    self.params = tf.global_variables()

    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    # Apply the *clipped* gradients; applying the raw ones made
    # max_gradient_norm a no-op.
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    self.saver = tf.train.Saver(max_to_keep=3)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_labels,
             batch_size,
             embed,
             learning_rate=0.001,
             max_gradient_norm=5.0,
             learning_rate_decay_factor=0.9):
    """Build the sentence-pair attention classifier graph.

    Args:
        num_symbols: vocabulary size; only used when ``embed`` is None.
        num_embed_units: embedding width; only used when ``embed`` is None.
        num_units: hidden size of both LSTM cells.
        num_labels: number of output classes.
        batch_size: static batch size baked into the placeholders.
        embed: initial embedding matrix, or None for a fresh variable.
        learning_rate: initial value of the (non-trainable) learning rate.
        max_gradient_norm: threshold for ``tf.clip_by_global_norm``.
        learning_rate_decay_factor: multiplier applied by
            ``learning_rate_decay_op``.
    """
    # --- placeholders -----------------------------------------------------
    self.texts1 = tf.placeholder(tf.string, [batch_size, None],
                                 name='texts1')
    self.texts2 = tf.placeholder(tf.string, [batch_size, None],
                                 name='texts2')  # shape: batch*len
    self.texts_length1 = tf.placeholder(
        tf.int32, [batch_size], name='texts_length1')  # shape: batch
    self.texts_length2 = tf.placeholder(tf.int32, [batch_size],
                                        name='texts_length2')
    # Number of attention steps run by the while loop below.
    self.max_length = tf.placeholder(tf.int32, name='max_length')
    self.labels = tf.placeholder(tf.int64, [batch_size],
                                 name='labels')  # shape: batch
    self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    self.embed_units = num_embed_units
    self.num_units = num_units
    self.batch_size = batch_size
    self._initializer = tf.truncated_normal_initializer(stddev=0.1)

    # Vocab table (string to index); unknown strings map to UNK_ID.
    self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=UNK_ID,
                                         shared_name="in_table",
                                         name="in_table",
                                         checkpoint=True)

    # --- training bookkeeping ---------------------------------------------
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)

    self.index_input1 = self.symbol2index.lookup(self.texts1)  # batch*len
    self.index_input2 = self.symbol2index.lookup(self.texts2)

    # One mask shared by both sentences, based on the longer of the two.
    self.long_length = tf.maximum(self.texts_length1, self.texts_length2)
    # print() call form works on both Python 2 and Python 3.
    print(self.long_length.get_shape())
    self.mask_table = tf.sequence_mask(self.long_length, dtype=tf.float32)

    # --- embedding table (index to vector) --------------------------------
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('embed',
                                     [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)

    self.embed_input1 = tf.nn.embedding_lookup(
        self.embed, self.index_input1)  # batch*len*embed_unit
    self.embed_input2 = tf.nn.embedding_lookup(self.embed,
                                               self.index_input2)

    # --- recurrent cells --------------------------------------------------
    with tf.variable_scope('lstm_s'):
        self.lstm_s = tf.contrib.rnn.LSTMCell(
            num_units=num_units,
            initializer=tf.orthogonal_initializer,
            forget_bias=0)
    with tf.variable_scope('lstm_r'):
        self.lstm_r = tf.contrib.rnn.LSTMCell(
            num_units=num_units,
            initializer=tf.orthogonal_initializer,
            forget_bias=0)

    # The same cell object reads both sentences, so its weights are shared.
    out_s1, state_s1 = dynamic_rnn(self.lstm_s,
                                   self.embed_input1,
                                   self.texts_length1,
                                   dtype=tf.float32,
                                   scope='rnn')
    out_s2, state_s2 = dynamic_rnn(self.lstm_s,
                                   self.embed_input2,
                                   self.texts_length2,
                                   dtype=tf.float32,
                                   scope='rnn')
    self.h_s1 = out_s1
    self.h_s2 = out_s2

    # Pre-compute the W_s projection of every encoder output once, then lay
    # it out as (len, num_units, batch) for the attention step.
    reshaped_s1 = tf.reshape(self.h_s1, [-1, self.num_units])
    reshaped_s2 = tf.reshape(self.h_s2, [-1, self.num_units])
    with tf.variable_scope('Attn_'):
        W_s = tf.get_variable(shape=[self.num_units, self.num_units],
                              initializer=self._initializer,
                              name='W_s')
    self.s_1 = tf.matmul(reshaped_s1, W_s)
    self.s_2 = tf.matmul(reshaped_s2, W_s)
    self.s_1 = tf.transpose(
        tf.reshape(self.s_1, [self.batch_size, -1, self.num_units]),
        [1, 2, 0])
    self.s_2 = tf.transpose(
        tf.reshape(self.s_2, [self.batch_size, -1, self.num_units]),
        [1, 2, 0])

    # --- attention loop: self.attention is stepped max_length times -------
    i = tf.constant(0)
    state_r = self.lstm_r.zero_state(batch_size=batch_size,
                                     dtype=tf.float32)

    def c(t, sr):
        return tf.less(t, self.max_length)

    def b(t, sr):
        return self.attention(t, sr)

    i, state_r = tf.while_loop(cond=c, body=b, loop_vars=(i, state_r))

    # --- classifier -------------------------------------------------------
    with tf.variable_scope('fully_connect'):
        w_fc = tf.get_variable(shape=[self.num_units, num_labels],
                               initializer=self._initializer,
                               name='w_fc')
        b_fc = tf.get_variable(shape=[num_labels],
                               initializer=self._initializer,
                               name='b_fc')
    logits = tf.matmul(state_r.h, w_fc) + b_fc

    # self.loss is the summed loss (reported); mean_loss is optimized.
    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                       logits=logits),
        name='loss')
    mean_loss = self.loss / \
        tf.cast(tf.shape(self.labels)[0], dtype=tf.float32)
    predict_labels = tf.argmax(logits, 1, 'predict_labels')
    # Count (not ratio) of correct predictions in the batch.
    self.accuracy = tf.reduce_sum(tf.cast(
        tf.equal(self.labels, predict_labels), tf.int64),
        name='accuracy')

    self.params = tf.trainable_variables()

    for item in tf.global_variables():
        print(item)

    # calculate the gradient of parameters
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(mean_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    tf.summary.scalar('loss/step', self.loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)
    self.merged_summary_op = tf.summary.merge_all()

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_labels,
             batch_size,
             embed,
             learning_rate=0.001,
             max_gradient_norm=5.0):
    """Build a sentence-pair classifier driven by a three-state while loop.

    Two ``BasicLSTMCell`` instances are created and their zero states are
    carried through a ``tf.while_loop`` whose body is ``self.attention``;
    the final ``lstm_r`` hidden state is projected to ``num_labels`` logits
    and optimized with Adam.

    Args:
        num_symbols: vocabulary size; only used when ``embed`` is None.
        num_embed_units: embedding width, also used as the LSTM cell size.
        num_units: accepted for interface compatibility; not read here.
        num_labels: number of output classes.
        batch_size: static batch size for text placeholders and zero states.
        embed: initial embedding matrix, or None for a fresh variable.
        learning_rate: initial value of the (non-trainable) learning rate.
        max_gradient_norm: accepted for interface compatibility; not read
            here (Adam's ``minimize`` is used without clipping).
    """
    # --- placeholders -----------------------------------------------------
    self.texts1 = tf.placeholder(tf.string, [batch_size, None],
                                 name='texts1')
    self.texts2 = tf.placeholder(tf.string, [batch_size, None],
                                 name='texts2')  # shape: batch*len
    self.texts_length = tf.placeholder(tf.int32, [None],
                                       name='texts_length')  # shape: batch
    self.len = tf.constant(1.0, shape=[batch_size])
    self.labels = tf.placeholder(
        tf.int64, [None], name='labels')  # shape: batch
    self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    self.embed_units = num_embed_units
    self.batch_size = batch_size
    self._initializer = tf.truncated_normal_initializer(stddev=0.1)

    # Vocab table (string to index); unknown strings map to UNK_ID.
    self.symbol2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=UNK_ID,
        shared_name="in_table",
        name="in_table",
        checkpoint=True)

    # --- training bookkeeping ---------------------------------------------
    self.learning_rate = tf.Variable(
        float(learning_rate), trainable=False, dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)

    self.index_input1 = self.symbol2index.lookup(self.texts1)  # batch*len
    self.index_input2 = self.symbol2index.lookup(self.texts2)

    # Per-step state histories; each is seeded with a zero state below.
    self.h_s1 = []
    self.h_s2 = []
    self.h_r = []
    self.a1 = []
    self.a2 = []

    # --- embedding table (index to vector) --------------------------------
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable(
            'embed', [num_symbols, num_embed_units], tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable(
            'embed', dtype=tf.float32, initializer=embed)

    self.embed_input1 = tf.nn.embedding_lookup(
        self.embed, self.index_input1)  # batch*len*embed_unit
    self.embed_input2 = tf.nn.embedding_lookup(
        self.embed, self.index_input2)

    # --- recurrent cells --------------------------------------------------
    with tf.variable_scope('lstm_s'):
        self.lstm_s = rnn_cell.BasicLSTMCell(num_units=num_embed_units,
                                             forget_bias=0)
    with tf.variable_scope('lstm_r'):
        self.lstm_r = rnn_cell.BasicLSTMCell(num_units=num_embed_units,
                                             forget_bias=0)

    self.h_s1.append(
        self.lstm_s.zero_state(batch_size=batch_size, dtype=tf.float32))
    self.h_s2.append(
        self.lstm_s.zero_state(batch_size=batch_size, dtype=tf.float32))
    self.h_r.append(
        self.lstm_s.zero_state(batch_size=batch_size, dtype=tf.float32))
    self.a1.append(
        self.lstm_r.zero_state(batch_size=batch_size, dtype=tf.float32))
    self.a2.append(
        self.lstm_r.zero_state(batch_size=batch_size, dtype=tf.float32))

    W = tf.Variable(
        self._initializer(shape=[num_embed_units, num_labels],
                          dtype=tf.float32))
    bias = tf.Variable(tf.constant(0.0, shape=[num_labels]),
                       dtype=tf.float32)

    # --- attention loop ---------------------------------------------------
    i = tf.constant(1, dtype=tf.int64)
    # print() call form works on both Python 2 and Python 3.
    print(self.index_input1[1].get_shape())
    # Loop bound comes from row 1 of the first input via self._length.
    length = self._length(self.index_input1[1])
    self.ind = 1
    state_s1 = self.lstm_s.zero_state(batch_size=batch_size,
                                      dtype=tf.float32)
    state_s2 = self.lstm_s.zero_state(batch_size=batch_size,
                                      dtype=tf.float32)
    state_r = self.lstm_r.zero_state(batch_size=batch_size,
                                     dtype=tf.float32)

    def c(t, s1, s2, sr):
        return tf.less(t, length + 1)

    def b(t, s1, s2, sr):
        return self.attention(t, s1, s2, sr)

    i, state_s1, state_s2, state_r = tf.while_loop(
        cond=c, body=b, loop_vars=(i, state_s1, state_s2, state_r))

    # --- classifier and loss ----------------------------------------------
    logits = tf.matmul(state_r.h, W) + bias

    # self.loss is the summed loss (reported); mean_loss is optimized.
    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.labels, logits=logits),
        name='loss')
    mean_loss = self.loss / \
        tf.cast(tf.shape(self.labels)[0], dtype=tf.float32)
    predict_labels = tf.argmax(logits, 1, 'predict_labels')
    # Count (not ratio) of correct predictions in the batch.
    self.accuracy = tf.reduce_sum(
        tf.cast(tf.equal(self.labels, predict_labels), tf.int32),
        name='accuracy')

    self.params = tf.trainable_variables()

    # NOTE(review): this rebinds self.global_step to a second variable; the
    # one created above stays in the graph but is never incremented.  Kept
    # as-is so the set of variables written to checkpoints does not change.
    self.global_step = tf.Variable(0, trainable=False)
    self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(
        mean_loss, global_step=self.global_step, var_list=self.params)

    tf.summary.scalar('loss/step', self.loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)
    self.merged_summary_op = tf.summary.merge_all()

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def __init__(
        self,
        embed,  # word embeddings [VOCABULARY_COUNT * 200]
        vocabulary,  # vocabulary [1 * VOCABULARY_COUNT]
        vocabulary_count,  # number of words in the vocabulary
        num_layers,  # number of encoder and decoder layers
        num_units,  # hidden-state size of encoder and decoder
        learning_rate,
        max_gradient_norm,
        max_len,  # maximum decoding length
):
    """Build the encoder/decoder graph for training and inference.

    Args:
        embed: initial value of the trainable word-embedding variable.
        vocabulary: list of vocabulary strings, stored as a constant tensor.
        vocabulary_count: vocabulary size, passed to ``get_project_funtion``.
        num_layers: number of stacked LSTM layers in encoder and decoder.
        num_units: hidden size of every LSTM layer.
        learning_rate: Adam learning rate.
        max_gradient_norm: threshold for ``tf.clip_by_global_norm``.
        max_len: maximum number of steps decoded at inference time.
    """
    # Placeholders receive the batch data through feed_dict at run time.
    self.post_string = tf.placeholder(
        dtype=tf.string, shape=(None, None),
        name="post_string")  # padded posts, batch_size * encoder_len
    self.response_string = tf.placeholder(
        dtype=tf.string, shape=(None, None), name="response_string"
    )  # padded responses, batch_size * decoder_len
    self.label_string = tf.placeholder(
        dtype=tf.string, shape=(None, None),
        name="label_string")  # batch_size * decoder_len
    self.post_len = tf.placeholder(
        dtype=tf.int32, shape=(None, ),
        name="post_len")  # length of each post before padding, batch_size
    self.response_len = tf.placeholder(
        dtype=tf.int32, shape=(None, ), name="reponse_len"
    )  # length of each response before padding, batch_size

    # tf.get_variable creates (or returns) the named shared variable.
    self.embed = tf.get_variable(
        dtype=tf.float32, initializer=embed,
        name="embed")  # word embeddings, trained as a variable
    self.vocabulary = tf.constant(vocabulary,
                                  dtype=tf.string)  # VOCABULARY_COUNT
    self.batch_size = tf.shape(self.post_string)[0]
    self.encoder_len = tf.shape(self.post_string)[1]
    self.decoder_len = tf.shape(self.response_string)[1]

    # The mask is a 0/1 matrix marking real data (1) versus padding (0):
    #   [[1. 1. 1. 0. 0.]
    #    [1. 1. 1. 1. 0.]
    #    [1. 1. 1. 1. 1.]]
    # It is built by one-hot encoding index (response_len - 1) and taking a
    # right-to-left cumulative sum along each row.  Example with
    # response_len = [3, 4, 5] and decoder_len = 5:
    #   onehot = [[0. 0. 1. 0. 0.]      cumsum = [[1. 1. 1. 0. 0.]
    #             [0. 0. 0. 1. 0.]                [1. 1. 1. 1. 0.]
    #             [0. 0. 0. 0. 1.]]               [1. 1. 1. 1. 1.]]
    self.mask = tf.cumsum(tf.one_hot(self.response_len - 1,
                                     self.decoder_len),
                          axis=1,
                          reverse=True)

    # Table converting strings (key) to ids (value); default id is 1.
    self.string_to_id = MutableHashTable(
        key_dtype=tf.string,  # key type
        value_dtype=tf.int64,  # value type
        default_value=1,  # returned when the key is missing
        shared_name="string_to_id",  # name under which the table is shared
        name="string_to_id",  # op name
        checkpoint=True)  # table is saved to / restored from checkpoints
    # Table converting ids back to strings; default string is "_NDW".
    self.id_to_string = MutableHashTable(key_dtype=tf.int64,
                                         value_dtype=tf.string,
                                         default_value="_NDW",
                                         shared_name="id_to_string",
                                         name="id_to_string",
                                         checkpoint=True)

    # Convert posts, responses and labels to ids via table lookup.
    self.post_id = self.string_to_id.lookup(
        self.post_string)  # batch_size * encoder_len
    self.response_id = self.string_to_id.lookup(
        self.response_string)  # batch_size * decoder_len
    self.label_id = self.string_to_id.lookup(
        self.label_string)  # batch_size * decoder_len

    # Convert ids to embeddings.  The lookups go through the self.embed
    # variable (not the raw `embed` initializer) so that gradients actually
    # reach the embedding, and so training uses the same table that is
    # handed to the inference decoder below.
    self.post_embed = tf.nn.embedding_lookup(
        self.embed, self.post_id)  # batch_size * encoder_len * embed_size
    self.response_embed = tf.nn.embedding_lookup(
        self.embed,
        self.response_id)  # batch_size * decoder_len * embed_size

    # MultiRNNCell stacks num_layers LSTM cells into a multi-layer RNN.
    encoder_cell = MultiRNNCell(
        [LSTMCell(num_units) for _ in range(num_layers)])
    decoder_cell = MultiRNNCell(
        [LSTMCell(num_units) for _ in range(num_layers)])

    projection_fn, loss_fn, inference_fn = get_project_funtion(
        vocabulary_count)

    # Encoder; variables live under the "encoder/" scope.
    with tf.variable_scope("encoder"):
        # encoder_output: one output per sample per time step
        #   [batch_size, encoder_len, num_units]
        # encoder_state: final (c, h) state of every layer
        self.encoder_output, self.encoder_state = tf.nn.dynamic_rnn(
            encoder_cell,  # RNN cell
            self.post_embed,  # padded posts
            self.post_len,  # valid length of each post
            dtype=tf.float32)

    # Training decoder.
    # NOTE(review): loss_fn is kept inside the "decoder" scope on the
    # assumption that it shares variables with projection_fn, which the
    # inference decoder uses under the same scope with reuse=True.
    with tf.variable_scope("decoder"):
        self.decoder_output, self.decoder_state, self.loop_state = \
            dynamic_decoder(
                decoder_cell,
                encoder_state=self.encoder_state,
                input=self.response_embed,
                response_len=self.response_len)
        self.loss, self.avg_loss = loss_fn(self.decoder_output,
                                           self.label_id, self.mask)

    # Inference decoder.  With reuse=True every tf.get_variable inside this
    # scope fetches the variable created by the training decoder and fails
    # if it does not exist.
    with tf.variable_scope("decoder", reuse=True):
        self.inference_output, self.inference_state, \
            self.inference_loop_state = dynamic_decoder(
                decoder_cell,
                encoder_state=self.encoder_state,
                projection_function=projection_fn,
                embed=self.embed,
                max_len=max_len)
        self.inference_maximum_likelihood_id = inference_fn(
            self.inference_output)  # [batch_size decoder_len]
        self.inference_string = self.id_to_string.lookup(
            self.inference_maximum_likelihood_id
        )  # [batch_size decoder_len]

    # Step counter; not trainable, incremented by apply_gradients.
    self.global_step = tf.Variable(0, trainable=False, name="global_step")

    # All global variables created so far.
    self.params = tf.global_variables()

    # Adam optimizer.
    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)

    # Gradients of the loss w.r.t. params; len(gradients) == len(params).
    gradients = tf.gradients(self.loss, self.params)

    # Gradient clipping by global norm.
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)

    # Op that applies the clipped gradients.
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    self.saver = tf.train.Saver()
def __init__(self,
             num_lstm_units,
             embed,
             neg_num=4,
             gradient_clip_threshold=5.0):
    """Build the LSTM-DSSM graph: inputs, encoders, loss and optimizer.

    Args:
        num_lstm_units: size handed to both ``SimpleLSTMCell`` encoders.
        embed: initial value of the ``'embed'`` variable, which is used
            for both the query and the document lookups.
        neg_num: number of negative documents per query. ``neg_num + 1``
            document slots are created; the loss maximises the softmax
            probability of slot 0.
        gradient_clip_threshold: bound passed to
            ``tf.clip_by_global_norm``.
    """
    doc_slots = neg_num + 1

    # ---- inputs -----------------------------------------------------
    self.queries = tf.placeholder(dtype=tf.string,
                                  shape=[None, None])  # [batch, len]
    self.queries_length = tf.placeholder(dtype=tf.int32,
                                         shape=[None])  # [batch]
    self.docs = tf.placeholder(
        dtype=tf.string,
        shape=[doc_slots, None, None])  # [neg_num + 1, batch, len]
    self.docs_length = tf.placeholder(
        dtype=tf.int32, shape=[doc_slots, None])  # [neg_num + 1, batch]

    # Token string -> id table; keys missing from the table give UNK_ID.
    self.word2index = MutableHashTable(key_dtype=tf.string,
                                       value_dtype=tf.int64,
                                       default_value=UNK_ID,
                                       shared_name="in_table",
                                       name="in_table",
                                       checkpoint=True)

    # ---- non-trainable training state -------------------------------
    # These variables are unnamed, so their creation order fixes their
    # graph names; keep the order stable.
    self.learning_rate = tf.Variable(0.001,
                                     trainable=False,
                                     dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)
    self.momentum = tf.Variable(0.9, trainable=False, dtype=tf.float32)

    # ---- token ids and embeddings -----------------------------------
    self.index_queries = self.word2index.lookup(
        self.queries)  # [batch, len]
    self.index_docs = []
    for doc in tf.unstack(self.docs):
        self.index_docs.append(self.word2index.lookup(doc))

    self.embed = tf.get_variable('embed',
                                 dtype=tf.float32,
                                 initializer=embed)
    self.embed_queries = tf.nn.embedding_lookup(self.embed,
                                                self.index_queries)
    self.embed_docs = []
    for index_doc in self.index_docs:
        self.embed_docs.append(
            tf.nn.embedding_lookup(self.embed, index_doc))

    # ---- encoders (separate cells for queries and documents) --------
    # NOTE(review): the original indentation was lost; each scope is
    # assumed to wrap only the cell construction -- confirm.
    with tf.variable_scope('query_lstm'):
        self.cell_q = SimpleLSTMCell(num_lstm_units)
    with tf.variable_scope('doc_lstm'):
        self.cell_d = SimpleLSTMCell(num_lstm_units)

    # dynamic_rnn(...)[1] is the final state; element [1] of it is kept.
    # Per the original comment it has shape [batch, num_units].
    query_final_state = dynamic_rnn(self.cell_q,
                                    self.embed_queries,
                                    self.queries_length,
                                    dtype=tf.float32,
                                    scope="simple_lstm_cell_query")[1]
    self.states_q = query_final_state[1]

    self.states_d = []  # neg_num + 1 tensors of [batch, num_units]
    for slot in range(doc_slots):
        doc_final_state = dynamic_rnn(self.cell_d,
                                      self.embed_docs[slot],
                                      self.docs_length[slot],
                                      dtype=tf.float32,
                                      scope="simple_lstm_cell_doc")[1]
        self.states_d.append(doc_final_state[1])

    # ---- cosine similarity between the query and every document -----
    self.queries_norm = tf.sqrt(
        tf.reduce_sum(tf.square(self.states_q), axis=1))
    self.docs_norm = [
        tf.sqrt(tf.reduce_sum(tf.square(doc_state), axis=1))
        for doc_state in self.states_d
    ]
    self.prods = [
        tf.reduce_sum(tf.multiply(self.states_q, doc_state), axis=1)
        for doc_state in self.states_d
    ]
    cosines = [
        dot / (self.queries_norm * doc_norm)
        for dot, doc_norm in zip(self.prods, self.docs_norm)
    ]  # neg_num + 1 tensors of [batch]
    self.sims = tf.convert_to_tensor(cosines)  # [neg_num + 1, batch]

    # Trainable scaling factor applied before the softmax (gamma in the
    # paper).
    self.gamma = tf.Variable(initial_value=1.0,
                             expected_shape=[],
                             dtype=tf.float32)
    self.sims = self.sims * self.gamma

    # ---- loss: negative mean log-probability of document slot 0 -----
    self.prob = tf.nn.softmax(self.sims, dim=0)  # [neg_num + 1, batch]
    self.hit_prob = tf.transpose(self.prob[0])
    self.loss = -tf.reduce_mean(tf.log(self.hit_prob))

    # ---- optimisation: Nesterov momentum with global-norm clipping --
    self.params = tf.trainable_variables()
    optimizer = tf.train.MomentumOptimizer(
        learning_rate=self.learning_rate,
        momentum=self.momentum,
        use_nesterov=True)  # Nesterov's method, according to the paper
    raw_grads = tf.gradients(self.loss, self.params)
    clipped_grads, self.gradient_norm = tf.clip_by_global_norm(
        raw_grads, gradient_clip_threshold)
    self.update = optimizer.apply_gradients(zip(clipped_grads,
                                                self.params),
                                            global_step=self.global_step)

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
class RNN(object):
    """Two-text LSTM classifier with a step-wise matching loop.

    The graph looks up two string inputs in a shared vocabulary table and
    embedding, runs a ``tf.while_loop`` whose body is :meth:`attention`,
    and classifies the final ``lstm_r`` hidden state into ``num_labels``
    classes. The original author marked the network as unfinished (see
    the TODO comments); several review notes below flag spots that look
    inconsistent and should be confirmed before relying on this model.
    """

    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_labels,
                 batch_size,
                 embed,
                 learning_rate=0.001,
                 max_gradient_norm=5.0):
        """Build the whole training graph.

        Args:
            num_symbols: vocabulary size, used only when ``embed`` is None.
            num_embed_units: embedding width; also used as the number of
                units of both LSTM cells.
            num_units: not used by the live code in this constructor.
            num_labels: number of output classes.
            batch_size: static batch size baked into the text
                placeholders and the LSTM zero states.
            embed: initial embedding matrix, or None for a randomly
                initialised ``[num_symbols, num_embed_units]`` variable.
            learning_rate: initial value of the learning-rate variable.
            max_gradient_norm: not used by the live code (the clipping
                variant of the optimizer was disabled).
        """
        # todo: implement placeholders
        self.texts1 = tf.placeholder(tf.string, [batch_size, None],
                                     name='texts1')  # [batch, len]
        self.texts2 = tf.placeholder(tf.string, [batch_size, None],
                                     name='texts2')  # [batch, len]
        self.texts_length = tf.placeholder(tf.int32, [None],
                                           name='texts_length')  # [batch]
        self.len = tf.constant(1.0, shape=[batch_size])
        self.labels = tf.placeholder(tf.int64, [None],
                                     name='labels')  # [batch]
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.embed_units = num_embed_units
        self.batch_size = batch_size
        self._initializer = tf.truncated_normal_initializer(stddev=0.1)

        # build the vocab table (string to index)
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)

        # initialize the training process
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)

        self.index_input1 = self.symbol2index.lookup(
            self.texts1)  # [batch, len]
        self.index_input2 = self.symbol2index.lookup(self.texts2)

        # Python-side histories that attention()/match() append to and
        # index while the loop bodies are being traced.
        self.h_s1 = []
        self.h_s2 = []
        self.h_r = []
        self.a1 = []
        self.a2 = []

        # build the embedding table (index to vector)
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable('embed',
                                         [num_symbols, num_embed_units],
                                         tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable('embed',
                                         dtype=tf.float32,
                                         initializer=embed)

        self.embed_input1 = tf.nn.embedding_lookup(
            self.embed, self.index_input1)  # [batch, len, embed_units]
        self.embed_input2 = tf.nn.embedding_lookup(self.embed,
                                                   self.index_input2)

        # NOTE(review): the original indentation was lost; each scope is
        # assumed to wrap only the cell construction -- confirm.
        with tf.variable_scope('lstm_s'):
            self.lstm_s = rnn_cell.BasicLSTMCell(num_units=num_embed_units,
                                                 forget_bias=0)
        with tf.variable_scope('lstm_r'):
            self.lstm_r = rnn_cell.BasicLSTMCell(num_units=num_embed_units,
                                                 forget_bias=0)

        # Seed every history with a zero state at index 0.
        # NOTE(review): h_r is seeded from lstm_s rather than lstm_r; both
        # cells are built with the same num_units, so the zero states
        # should have the same shape -- confirm this was intended.
        self.h_s1.append(
            self.lstm_s.zero_state(batch_size=batch_size, dtype=tf.float32))
        self.h_s2.append(
            self.lstm_s.zero_state(batch_size=batch_size, dtype=tf.float32))
        self.h_r.append(
            self.lstm_s.zero_state(batch_size=batch_size, dtype=tf.float32))
        self.a1.append(
            self.lstm_r.zero_state(batch_size=batch_size, dtype=tf.float32))
        self.a2.append(
            self.lstm_r.zero_state(batch_size=batch_size, dtype=tf.float32))

        # Output projection applied to the final lstm_r hidden state.
        W = tf.Variable(
            self._initializer(shape=[num_embed_units, num_labels],
                              dtype=tf.float32))
        bias = tf.Variable(tf.constant(0.0, shape=[num_labels]),
                           dtype=tf.float32)

        i = tf.constant(1, dtype=tf.int64)
        # Function-call form so the module also parses on Python 3 (the
        # original used the Python 2 print statement here).
        print(self.index_input1[1].get_shape())
        # NOTE(review): the loop bound is derived from row 1 of the batch
        # only, not from every example or from texts_length -- confirm.
        length = self._length(self.index_input1[1])
        self.ind = 1
        state_s1 = self.lstm_s.zero_state(batch_size=batch_size,
                                          dtype=tf.float32)
        state_s2 = self.lstm_s.zero_state(batch_size=batch_size,
                                          dtype=tf.float32)
        state_r = self.lstm_r.zero_state(batch_size=batch_size,
                                         dtype=tf.float32)

        def c(t, s1, s2, sr):
            # Run the body for t = 1 .. length.
            return tf.less(t, length + 1)

        def b(t, s1, s2, sr):
            return self.attention(t, s1, s2, sr)

        i, state_s1, state_s2, state_r = tf.while_loop(
            cond=c, body=b, loop_vars=(i, state_s1, state_s2, state_r))

        logits = tf.matmul(state_r.h, W) + bias
        # todo: implement unfinished networks

        # Summed (not averaged) cross-entropy is exposed as self.loss; the
        # optimizer minimises the per-example mean.
        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.labels, logits=logits),
            name='loss')
        mean_loss = self.loss / tf.cast(tf.shape(self.labels)[0],
                                        dtype=tf.float32)
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        # Count of correct predictions in the batch (an int, not a ratio).
        self.accuracy = tf.reduce_sum(
            tf.cast(tf.equal(self.labels, predict_labels), tf.int32),
            name='accuracy')

        self.params = tf.trainable_variables()

        # NOTE(review): this creates a second global_step variable and
        # rebinds the attribute; the one created above stays in the graph
        # but is never incremented. It is kept as-is because dropping
        # either variable would change the set of saved variables for
        # existing checkpoints.
        self.global_step = tf.Variable(0, trainable=False)
        self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(
            mean_loss, global_step=self.global_step, var_list=self.params)

        tf.summary.scalar('loss/step', self.loss)
        for each in tf.trainable_variables():
            tf.summary.histogram(each.name, each)
        self.merged_summary_op = tf.summary.merge_all()

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)

    def attention(self, t, s1, s2, sr):
        """Body of the outer ``tf.while_loop``: advance all cells one step.

        Args:
            t: int64 scalar step counter (starts at 1).
            s1, s2: current ``lstm_s`` states for the first / second text.
            sr: current ``lstm_r`` state.

        Returns:
            ``(t + 1, new_s1, new_s2, new_sr)``.

        NOTE(review): ``self.ind`` and the history lists are plain Python
        state updated while the body is traced, so they presumably do not
        advance per loop iteration at run time -- confirm.
        """
        # Each sentence cell sees its token embedding for position t-1
        # concatenated with the current lstm_r hidden state.
        s1_t = tf.concat([self.embed_input1[:, t - 1], sr.h], 1)
        s2_t = tf.concat([self.embed_input2[:, t - 1], sr.h], 1)
        r_t = tf.concat(
            [self.a1[self.ind - 1].h, self.a2[self.ind - 1].h], 1)

        # NOTE(review): scope extents reconstructed; each is assumed to
        # wrap only the cell calls -- confirm.
        with tf.variable_scope('lstm_s'):
            out_s1, state_s1 = self.lstm_s(inputs=s1_t, state=s1)
            out_s2, state_s2 = self.lstm_s(inputs=s2_t, state=s2)
        with tf.variable_scope('lstm_r'):
            out_r, state_r = self.lstm_r(inputs=r_t, state=sr)

        self.h_s1.append(state_s1)
        self.h_s2.append(state_s2)
        self.h_r.append(state_r)

        # Accumulators for the inner matching loop.
        a1t = tf.constant(0.0,
                          shape=[self.batch_size, self.embed_units],
                          dtype=tf.float32)
        a2t = tf.constant(0.0,
                          shape=[self.batch_size, self.embed_units],
                          dtype=tf.float32)

        def c1(j, t, a1tj, a2tj):
            return tf.less(j, t)

        def b1(j, t, a1tj, a2tj):
            return self.match(j, t, a1tj, a2tj)

        k = tf.constant(1, dtype=tf.int64)
        self.j = 1
        k, q, a1t, a2t = tf.while_loop(cond=c1,
                                       body=b1,
                                       loop_vars=[k, t, a1t, a2t],
                                       shape_invariants=None)

        # NOTE(review): index 0 of a1/a2 holds an LSTM state (with .h) but
        # plain tensors are appended here -- confirm.
        self.a1.append(a1t)
        self.a2.append(a2t)
        t = tf.add(t, 1)
        self.ind += 1
        return t, state_s1, state_s2, state_r

    def match(self, j, t, a1tj, a2tj):
        """Body of the inner ``tf.while_loop``: add one weighted state.

        Args:
            j: int64 scalar inner counter.
            t: outer step counter, passed through unchanged.
            a1tj, a2tj: running accumulators for the two texts.

        Returns:
            ``(j + 1, t, new_a1tj, new_a2tj)``.
        """
        with tf.variable_scope('Attn_'):
            W_s = tf.get_variable(shape=[self.embed_units, self.embed_units],
                                  initializer=self._initializer,
                                  name='W_s')
            W_o = tf.get_variable(shape=[self.embed_units, self.embed_units],
                                  initializer=self._initializer,
                                  name='W_o')
            W_e = tf.get_variable(shape=[self.embed_units, 1],
                                  initializer=self._initializer,
                                  name='W_e')
            W_a = tf.get_variable(shape=[self.embed_units, self.embed_units],
                                  initializer=self._initializer,
                                  name='W_a')

        # NOTE(review): in e1_tj the first term is h * W_s
        # ([batch, embed]) while the other terms are W * h^T
        # ([embed, batch]); e2_tj uses W * h^T throughout. These shapes
        # only agree when batch_size == embed_units -- confirm the
        # intended orientation.
        e1_tj = tf.matmul(
            tf.tanh(
                tf.matmul(self.h_s1[self.j].h, W_s) +
                tf.matmul(W_o, self.h_s2[self.ind].h, transpose_b=True) +
                tf.matmul(W_a, self.h_r[self.ind - 1].h, transpose_b=True)),
            W_e)
        e2_tj = tf.matmul(
            tf.tanh(
                tf.matmul(W_s, self.h_s2[self.j].h, transpose_b=True) +
                tf.matmul(W_o, self.h_s1[self.ind].h, transpose_b=True) +
                tf.matmul(W_a, self.h_r[self.ind - 1].h, transpose_b=True)),
            W_e)
        alpha1_tj = tf.reshape(tf.nn.softmax(e1_tj, dim=1), [-1])
        alpha2_tj = tf.reshape(tf.nn.softmax(e2_tj, dim=1), [-1])

        a1tj = tf.add(a1tj, tf.transpose(self.h_s1[self.j].h) * alpha1_tj)
        a2tj = tf.add(a2tj, tf.transpose(self.h_s2[self.j].h) * alpha2_tj)
        j = tf.add(j, 1)
        self.j += 1
        return j, t, a1tj, a2tj

    def _length(self, sequence):
        """Count the non-zero entries along the last axis of ``sequence``.

        Presumably id 0 is the padding id, which would make this the
        unpadded length -- verify against the vocabulary.
        """
        mask = tf.sign(tf.abs(sequence))
        length = tf.reduce_sum(mask, axis=-1)
        return length

    def print_parameters(self):
        """Print the name and static shape of every entry in ``params``."""
        for item in self.params:
            print('%s: %s' % (item.name, item.get_shape()))

    def train_step(self, session, data, summary=False):
        """Run one optimisation step.

        Args:
            session: object whose ``run(fetches, feed_dict)`` is called.
            data: dict with keys ``'texts1'``, ``'texts2'``,
                ``'texts_length'``, ``'labels'`` and ``'keep_prob'``.
            summary: when true, the merged summary op is fetched as well.

        Returns:
            Whatever ``session.run`` returns for
            ``[loss, accuracy, train_op]`` (plus the summary if requested).
        """
        input_feed = {
            self.texts1: data['texts1'],
            self.texts2: data['texts2'],
            self.texts_length: data['texts_length'],
            self.labels: data['labels'],
            self.keep_prob: data['keep_prob']
        }
        output_feed = [self.loss, self.accuracy, self.train_op]
        if summary:
            output_feed.append(self.merged_summary_op)
        return session.run(output_feed, input_feed)
class LSTMDSSM(object):
    """LSTM-DSSM sentence-matching model.

    Follows the paper "Deep Sentence Embedding Using Long Short-Term
    Memory Networks: Analysis and Application to Information Retrieval".
    Paper available at: https://arxiv.org/abs/1502.06922
    """

    def __init__(self,
                 num_lstm_units,
                 embed,
                 neg_num=4,
                 gradient_clip_threshold=5.0):
        """Build the graph: inputs, encoders, softmax loss and optimizer.

        Args:
            num_lstm_units: size handed to both ``SimpleLSTMCell``
                encoders.
            embed: initial value of the ``'embed'`` variable, which is
                used for both the query and the document lookups.
            neg_num: number of negative documents per query.
                ``neg_num + 1`` document slots are created; the loss
                maximises the softmax probability of slot 0.
            gradient_clip_threshold: bound passed to
                ``tf.clip_by_global_norm``.
        """
        doc_slots = neg_num + 1

        # ---- inputs -------------------------------------------------
        self.queries = tf.placeholder(dtype=tf.string,
                                      shape=[None, None])  # [batch, len]
        self.queries_length = tf.placeholder(dtype=tf.int32,
                                             shape=[None])  # [batch]
        self.docs = tf.placeholder(
            dtype=tf.string,
            shape=[doc_slots, None, None])  # [neg_num + 1, batch, len]
        self.docs_length = tf.placeholder(
            dtype=tf.int32,
            shape=[doc_slots, None])  # [neg_num + 1, batch]

        # Token string -> id table; missing keys give UNK_ID.
        self.word2index = MutableHashTable(key_dtype=tf.string,
                                           value_dtype=tf.int64,
                                           default_value=UNK_ID,
                                           shared_name="in_table",
                                           name="in_table",
                                           checkpoint=True)

        # ---- non-trainable training state ---------------------------
        # Unnamed variables: creation order fixes their graph names.
        self.learning_rate = tf.Variable(0.001,
                                         trainable=False,
                                         dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)
        self.momentum = tf.Variable(0.9, trainable=False, dtype=tf.float32)

        # ---- token ids and embeddings -------------------------------
        self.index_queries = self.word2index.lookup(
            self.queries)  # [batch, len]
        self.index_docs = []
        for doc in tf.unstack(self.docs):
            self.index_docs.append(self.word2index.lookup(doc))

        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)
        self.embed_queries = tf.nn.embedding_lookup(self.embed,
                                                    self.index_queries)
        self.embed_docs = []
        for index_doc in self.index_docs:
            self.embed_docs.append(
                tf.nn.embedding_lookup(self.embed, index_doc))

        # ---- encoders -----------------------------------------------
        # NOTE(review): the original indentation was lost; each scope is
        # assumed to wrap only the cell construction -- confirm.
        with tf.variable_scope('query_lstm'):
            self.cell_q = SimpleLSTMCell(num_lstm_units)
        with tf.variable_scope('doc_lstm'):
            self.cell_d = SimpleLSTMCell(num_lstm_units)

        # dynamic_rnn(...)[1] is the final state; element [1] of it is
        # kept. Per the original comment it is [batch, num_units].
        query_final_state = dynamic_rnn(self.cell_q,
                                        self.embed_queries,
                                        self.queries_length,
                                        dtype=tf.float32,
                                        scope="simple_lstm_cell_query")[1]
        self.states_q = query_final_state[1]

        self.states_d = []  # neg_num + 1 tensors of [batch, num_units]
        for slot in range(doc_slots):
            doc_final_state = dynamic_rnn(self.cell_d,
                                          self.embed_docs[slot],
                                          self.docs_length[slot],
                                          dtype=tf.float32,
                                          scope="simple_lstm_cell_doc")[1]
            self.states_d.append(doc_final_state[1])

        # ---- cosine similarity --------------------------------------
        self.queries_norm = tf.sqrt(
            tf.reduce_sum(tf.square(self.states_q), axis=1))
        self.docs_norm = [
            tf.sqrt(tf.reduce_sum(tf.square(doc_state), axis=1))
            for doc_state in self.states_d
        ]
        self.prods = [
            tf.reduce_sum(tf.multiply(self.states_q, doc_state), axis=1)
            for doc_state in self.states_d
        ]
        cosines = [
            dot / (self.queries_norm * doc_norm)
            for dot, doc_norm in zip(self.prods, self.docs_norm)
        ]  # neg_num + 1 tensors of [batch]
        self.sims = tf.convert_to_tensor(cosines)  # [neg_num + 1, batch]

        # Trainable scaling factor applied before the softmax (gamma in
        # the paper). The unscaled similarities stay reachable through
        # origin_sims, which test_step reads.
        self.gamma = tf.Variable(initial_value=1.0,
                                 expected_shape=[],
                                 dtype=tf.float32)
        self.origin_sims = self.sims
        self.sims = self.sims * self.gamma

        # ---- loss: negative mean log-probability of slot 0 ----------
        self.prob = tf.nn.softmax(self.sims,
                                  dim=0)  # [neg_num + 1, batch]
        self.hit_prob = tf.transpose(self.prob[0])
        self.loss = -tf.reduce_mean(tf.log(self.hit_prob))

        # ---- optimisation -------------------------------------------
        self.params = tf.trainable_variables()
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            use_nesterov=True)  # Nesterov's method, according to the paper
        raw_grads = tf.gradients(self.loss, self.params)
        clipped_grads, self.gradient_norm = tf.clip_by_global_norm(
            raw_grads, gradient_clip_threshold)
        self.update = optimizer.apply_gradients(
            zip(clipped_grads, self.params), global_step=self.global_step)

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)

    def print_parameters(self):
        """Print the name and static shape of every trainable parameter."""
        for param in self.params:
            print('%s: %s' % (param.name, param.get_shape()))

    def _feed(self, queries, docs):
        """Map the two input dicts onto the four input placeholders."""
        return {
            self.queries: queries['texts'],
            self.queries_length: queries['texts_length'],
            self.docs: docs['texts'],
            self.docs_length: docs['texts_length']
        }

    def train_step(self, session, queries, docs):
        """Run one update and return loss plus intermediate tensors.

        Args:
            session: object whose ``run(fetches, feed_dict)`` is called.
            queries: dict with ``'texts'`` and ``'texts_length'``.
            docs: dict with ``'texts'`` and ``'texts_length'``.

        Returns:
            The ``session.run`` result for, in order: loss, update op,
            query states, doc states, query norms, doc norms, dot
            products, scaled similarities, probabilities, hit
            probability.
        """
        fetches = [
            self.loss, self.update, self.states_q, self.states_d,
            self.queries_norm, self.docs_norm, self.prods, self.sims,
            self.prob, self.hit_prob
        ]
        return session.run(fetches, self._feed(queries, docs))

    def test_step(self, session, queries, docs, ground_truths):
        """Return the mean binary log-loss of slot-0 similarities.

        The unscaled cosine similarity of document slot 0 is mapped from
        [-1, 1] to [0, 1] and scored against ``ground_truths`` (0 means
        dissimilar, anything else similar).

        Args:
            session: object whose ``run(fetches, feed_dict)`` is called.
            queries: dict with ``'texts'`` and ``'texts_length'``.
            docs: dict with ``'texts'`` and ``'texts_length'``.
            ground_truths: one label per example in the batch.
        """
        fetched = session.run([self.origin_sims], self._feed(queries, docs))
        scores = (fetched[0][0] + 1) / 2

        eps = 1e-15  # keeps both log() arguments strictly positive
        count = len(ground_truths)
        log_likelihood = 0
        for idx in range(count):
            prob = min(max(scores[idx], eps), 1 - eps)
            if ground_truths[idx] == 0:
                log_likelihood += math.log(1 - prob)
            else:
                log_likelihood += math.log(prob)
        return -log_likelihood / count
class LSTMDSSM(object):
    """LSTM-DSSM sentence-matching model.

    Follows the paper "Deep Sentence Embedding Using Long Short-Term
    Memory Networks: Analysis and Application to Information Retrieval".
    Paper available at: https://arxiv.org/abs/1502.06922
    """

    def __init__(self,
                 num_lstm_units,
                 embed,
                 neg_num=4,
                 gradient_clip_threshold=5.0):
        """Build the graph: inputs, encoders, softmax loss and optimizer.

        Args:
            num_lstm_units: size handed to both ``SimpleLSTMCell``
                encoders.
            embed: initial value of the ``'embed'`` variable, which is
                used for both the query and the document lookups.
            neg_num: number of negative documents per query.
                ``neg_num + 1`` document slots are created; the loss
                maximises the softmax probability of slot 0.
            gradient_clip_threshold: bound passed to
                ``tf.clip_by_global_norm``.
        """
        self.queries = tf.placeholder(dtype=tf.string,
                                      shape=[None, None])  # [batch, len]
        self.queries_length = tf.placeholder(dtype=tf.int32,
                                             shape=[None])  # [batch]
        self.docs = tf.placeholder(
            dtype=tf.string,
            shape=[neg_num + 1, None, None])  # [neg_num + 1, batch, len]
        self.docs_length = tf.placeholder(
            dtype=tf.int32,
            shape=[neg_num + 1, None])  # [neg_num + 1, batch]

        # Token string -> id table; missing keys give UNK_ID.
        self.word2index = MutableHashTable(key_dtype=tf.string,
                                           value_dtype=tf.int64,
                                           default_value=UNK_ID,
                                           shared_name="in_table",
                                           name="in_table",
                                           checkpoint=True)

        # Non-trainable training state. These variables are unnamed, so
        # their creation order fixes their graph names.
        self.learning_rate = tf.Variable(0.001,
                                         trainable=False,
                                         dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)
        self.momentum = tf.Variable(0.9, trainable=False, dtype=tf.float32)

        self.index_queries = self.word2index.lookup(
            self.queries)  # [batch, len]
        self.index_docs = [
            self.word2index.lookup(doc) for doc in tf.unstack(self.docs)
        ]

        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)
        self.embed_queries = tf.nn.embedding_lookup(self.embed,
                                                    self.index_queries)
        self.embed_docs = [
            tf.nn.embedding_lookup(self.embed, index_doc)
            for index_doc in self.index_docs
        ]

        # NOTE(review): the original indentation was lost; each scope is
        # assumed to wrap only the cell construction -- confirm.
        with tf.variable_scope('query_lstm'):
            self.cell_q = SimpleLSTMCell(num_lstm_units)
        with tf.variable_scope('doc_lstm'):
            self.cell_d = SimpleLSTMCell(num_lstm_units)

        # dynamic_rnn(...)[1] is the final state; element [1] of it is
        # kept. Per the original comment it is [batch, num_units].
        self.states_q = dynamic_rnn(
            self.cell_q,
            self.embed_queries,
            self.queries_length,
            dtype=tf.float32,
            scope="simple_lstm_cell_query")[1][1]
        self.states_d = [
            dynamic_rnn(self.cell_d,
                        self.embed_docs[i],
                        self.docs_length[i],
                        dtype=tf.float32,
                        scope="simple_lstm_cell_doc")[1][1]
            for i in range(neg_num + 1)
        ]  # neg_num + 1 tensors of [batch, num_units]

        # Cosine similarity between the query and every document slot.
        self.queries_norm = tf.sqrt(
            tf.reduce_sum(tf.square(self.states_q), axis=1))
        self.docs_norm = [
            tf.sqrt(tf.reduce_sum(tf.square(self.states_d[i]), axis=1))
            for i in range(neg_num + 1)
        ]
        self.prods = [
            tf.reduce_sum(tf.multiply(self.states_q, self.states_d[i]),
                          axis=1) for i in range(neg_num + 1)
        ]
        self.sims = [(self.prods[i] /
                      (self.queries_norm * self.docs_norm[i]))
                     for i in range(neg_num + 1)]
        self.sims = tf.convert_to_tensor(self.sims)  # [neg_num + 1, batch]

        # Trainable scaling factor applied before the softmax (gamma in
        # the paper).
        self.gamma = tf.Variable(initial_value=1.0,
                                 expected_shape=[],
                                 dtype=tf.float32)
        # Keep a handle on the unscaled cosine values: test_step maps them
        # to [0, 1] with (x + 1) / 2, which is only valid before the gamma
        # scaling. This is an alias, so no extra op is added to the graph.
        self.origin_sims = self.sims
        self.sims = self.sims * self.gamma

        # Loss: negative mean log-probability of document slot 0.
        self.prob = tf.nn.softmax(self.sims,
                                  dim=0)  # [neg_num + 1, batch]
        self.hit_prob = tf.transpose(self.prob[0])
        self.loss = -tf.reduce_mean(tf.log(self.hit_prob))

        self.params = tf.trainable_variables()
        opt = tf.train.MomentumOptimizer(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            use_nesterov=True)  # Nesterov's method, according to the paper
        gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, gradient_clip_threshold)
        self.update = opt.apply_gradients(zip(clipped_gradients,
                                              self.params),
                                          global_step=self.global_step)

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)

    def print_parameters(self):
        """Print the name and static shape of every trainable parameter."""
        for item in self.params:
            print('%s: %s' % (item.name, item.get_shape()))

    def train_step(self, session, queries, docs):
        """Run one update and return loss plus intermediate tensors.

        Args:
            session: object whose ``run(fetches, feed_dict)`` is called.
            queries: dict with ``'texts'`` and ``'texts_length'``.
            docs: dict with ``'texts'`` and ``'texts_length'``.

        Returns:
            The ``session.run`` result for, in order: loss, update op,
            query states, doc states, query norms, doc norms, dot
            products, scaled similarities, probabilities, hit
            probability.
        """
        input_feed = {
            self.queries: queries['texts'],
            self.queries_length: queries['texts_length'],
            self.docs: docs['texts'],
            self.docs_length: docs['texts_length']
        }
        output_feed = [
            self.loss, self.update, self.states_q, self.states_d,
            self.queries_norm, self.docs_norm, self.prods, self.sims,
            self.prob, self.hit_prob
        ]
        return session.run(output_feed, input_feed)

    def test_step(self, session, queries, docs, ground_truths):
        """Return the mean binary log-loss of slot-0 similarities.

        The unscaled cosine similarity of document slot 0 is mapped from
        [-1, 1] to [0, 1] and scored against ``ground_truths`` (0 means
        dissimilar, anything else similar).

        Args:
            session: object whose ``run(fetches, feed_dict)`` is called.
            queries: dict with ``'texts'`` and ``'texts_length'``.
            docs: dict with ``'texts'`` and ``'texts_length'``.
            ground_truths: one label per example in the batch.
        """
        input_feed = {
            self.queries: queries['texts'],
            self.queries_length: queries['texts_length'],
            self.docs: docs['texts'],
            self.docs_length: docs['texts_length']
        }
        # Fetch the raw cosine values, not the gamma-scaled ones: once
        # gamma has moved away from 1.0 the scaled values leave [-1, 1]
        # and (x + 1) / 2 no longer yields a probability.
        output_feed = [self.origin_sims]
        scores = (session.run(output_feed, input_feed)[0][0] + 1) / 2
        l = len(ground_truths)
        loss = 0
        for i in range(l):
            predict = scores[i]
            ground_truth = ground_truths[i]
            # Clamp so that neither log() below receives 0.
            predict = max([min([predict, 1 - 1e-15]), 1e-15])
            if ground_truth == 0:
                loss += math.log(1 - predict)
            else:
                loss += math.log(predict)
        return -loss / l
class Model(object):
    """Encoder-decoder dialogue model that attends over knowledge triples.

    The constructor builds the whole graph: embeddings, input
    placeholders, vocabulary tables, the knowledge-graph embeddings, a
    GRU encoder, a GRU attention decoder (training and inference paths)
    and the Adam update op.
    """

    def __init__(self,
                 word_embed,
                 entity_embed,
                 vocab_size=30000,
                 num_embed_units=300,
                 num_units=512,
                 num_layers=2,
                 num_entities=0,
                 num_trans_units=100,
                 max_length=60,
                 learning_rate=0.0001,
                 learning_rate_decay_factor=0.95,
                 max_gradient_norm=5.0,
                 num_samples=500,
                 output_alignments=True):
        """Build the graph.

        Args:
            word_embed: initial word embedding matrix, or None for a
                randomly initialised ``[vocab_size, num_embed_units]``
                variable.
            entity_embed: initial entity/relation embedding matrix, or
                None for a ``[num_entities, num_trans_units]`` variable.
            vocab_size: word vocabulary size.
            num_embed_units: word embedding width.
            num_units: GRU units per layer (encoder and decoder).
            num_layers: number of stacked GRU layers.
            num_entities: rows of the entity table when ``entity_embed``
                is None.
            num_trans_units: entity embedding width.
            max_length: maximum decoding length at inference.
            learning_rate: Adam learning rate.
            learning_rate_decay_factor: accepted but not used by this
                class.
            max_gradient_norm: bound for ``tf.clip_by_global_norm``.
            num_samples: forwarded to ``loss_computation``.
            output_alignments: forwarded to the attention helpers.
        """
        # initialize params
        self.vocab_size = vocab_size
        self.num_embed_units = num_embed_units
        self.num_units = num_units
        self.num_layers = num_layers
        self.num_entities = num_entities
        self.num_trans_units = num_trans_units
        self.learning_rate = learning_rate
        self.max_gradient_norm = max_gradient_norm
        self.num_samples = num_samples
        self.max_length = max_length
        self.output_alignments = output_alignments

        # build the embedding table (index to vector)
        if word_embed is None:
            # initialize the embedding randomly
            self.word_embed = tf.get_variable(
                'word_embed', [self.vocab_size, self.num_embed_units],
                tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.word_embed = tf.get_variable('word_embed',
                                              dtype=tf.float32,
                                              initializer=word_embed)
        if entity_embed is None:
            # initialize the embedding randomly
            self.entity_trans = tf.get_variable(
                'entity_embed', [num_entities, num_trans_units],
                tf.float32,
                trainable=False)
        else:
            # initialize the embedding by pre-trained trans vectors
            self.entity_trans = tf.get_variable('entity_embed',
                                                dtype=tf.float32,
                                                initializer=entity_embed,
                                                trainable=False)

        # initialize inputs and outputs
        self.posts = tf.placeholder(tf.string, (None, None),
                                    'enc_inps')  # [batch, len]
        # The shape is written as (None,) -- a 1-tuple. A bare (None) is
        # just None and would leave the placeholder with unknown rank.
        self.posts_length = tf.placeholder(tf.int32, (None,),
                                           'enc_lens')  # [batch]
        self.responses = tf.placeholder(tf.string, (None, None),
                                        'dec_inps')  # [batch, len]
        self.responses_length = tf.placeholder(tf.int32, (None,),
                                               'dec_lens')  # [batch]
        self.entities = tf.placeholder(tf.string, (None, None, None),
                                       'entities')  # rank 3
        self.entity_masks = tf.placeholder(tf.string, (None, None),
                                           'entity_masks')  # rank 2
        self.triples = tf.placeholder(tf.string, (None, None, None, 3),
                                      'triples')  # rank 4, last dim 3
        self.posts_triple = tf.placeholder(tf.int32, (None, None, 1),
                                           'enc_triples')  # [batch, len, 1]
        self.responses_triple = tf.placeholder(
            tf.string, (None, None, 3), 'dec_triples')  # [batch, len, 3]
        self.match_triples = tf.placeholder(tf.int32, (None, None, None),
                                            'match_triples')  # rank 3

        # build the vocab table (string to index)
        self._init_vocabs()
        self.posts_word_id = self.symbol2index.lookup(
            self.posts)  # [batch, len]
        self.posts_entity_id = self.entity2index.lookup(
            self.posts)  # [batch, len]
        self.responses_target = self.symbol2index.lookup(
            self.responses)  # [batch, len]

        batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(
            self.responses)[1]
        # Decoder input = GO_ID followed by the target shifted right by
        # one (the last target column is dropped).
        self.responses_word_id = tf.concat([
            tf.ones([batch_size, 1], dtype=tf.int64) * GO_ID,
            tf.split(self.responses_target, [decoder_len - 1, 1], 1)[0]
        ], 1)  # [batch, len]
        # 1.0 for positions < responses_length, 0.0 afterwards: a one-hot
        # at the last valid position, reverse-cumsummed along time.
        self.decoder_mask = tf.reshape(
            tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len),
                      reverse=True,
                      axis=1), [-1, decoder_len])

        # build entity embeddings
        entity_trans_transformed = tf.layers.dense(
            self.entity_trans,
            self.num_trans_units,
            activation=tf.tanh,
            name='trans_transformation')
        # Seven zero-initialised rows are prepended to the entity table.
        # NOTE(review): presumably one per reserved/padding id at the
        # front of the entity vocabulary -- confirm the constant.
        padding_entity = tf.get_variable('entity_padding_embed',
                                         [7, self.num_trans_units],
                                         dtype=tf.float32,
                                         initializer=tf.zeros_initializer())
        self.entity_embed = tf.concat(
            [padding_entity, entity_trans_transformed], axis=0)

        # get knowledge graph embedding, knowledge triple embedding
        (self.triples_embedding, self.entities_word_embedding,
         self.graph_embedding) = self._build_kg_embedding()

        # build knowledge graph
        graph_embed_input, triple_embed_input = self._build_kg_graph()

        # build encoder
        encoder_output, encoder_state = self._build_encoder(
            graph_embed_input)

        # build decoder
        self._build_decoder(encoder_output, encoder_state,
                            triple_embed_input)

        # initialize training process
        self.global_step = tf.Variable(0, trainable=False)
        # NOTE(review): tf.global_variables() also returns variables
        # created with trainable=False (e.g. 'entity_embed'), so gradients
        # are requested for them and any that are connected to the loss
        # get updated -- confirm this is intended.
        self.params = tf.global_variables()
        gradients = tf.gradients(self.decoder_loss, self.params)
        self.clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, self.max_gradient_norm)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.update = optimizer.apply_gradients(
            zip(self.clipped_gradients, self.params),
            global_step=self.global_step)

        tf.summary.scalar('decoder_loss', self.decoder_loss)
        for each in tf.trainable_variables():
            tf.summary.histogram(each.name, each)
        self.merged_summary_op = tf.summary.merge_all()

    def _init_vocabs(self):
        """Create the four lookup tables (word/entity, each direction)."""
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)
        self.index2symbol = MutableHashTable(key_dtype=tf.int64,
                                             value_dtype=tf.string,
                                             default_value='_UNK',
                                             shared_name="out_table",
                                             name="out_table",
                                             checkpoint=True)
        self.entity2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=NONE_ID,
                                             shared_name="entity_in_table",
                                             name="entity_in_table",
                                             checkpoint=True)
        self.index2entity = MutableHashTable(key_dtype=tf.int64,
                                             value_dtype=tf.string,
                                             default_value='_NONE',
                                             shared_name="entity_out_table",
                                             name="entity_out_table",
                                             checkpoint=True)

    def _build_kg_embedding(self):
        """Embed triples and entities and pool each graph by attention.

        Returns:
            ``(triples_embedding, entities_word_embedding,
            graph_embedding)``. ``triples_embedding`` is reshaped to
            ``[batch, triple_num, -1, 3 * num_trans_units]``,
            ``entities_word_embedding`` to
            ``[batch, -1, num_embed_units]``, and ``graph_embedding`` is
            the attention-weighted sum of ``[head; tail]`` over axis 2.
        """
        encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts))
        triple_num = tf.shape(self.triples)[1]

        triples_embedding = tf.reshape(
            tf.nn.embedding_lookup(self.entity_embed,
                                   self.entity2index.lookup(self.triples)),
            [encoder_batch_size, triple_num, -1, 3 * self.num_trans_units])
        entities_word_embedding = tf.reshape(
            tf.nn.embedding_lookup(self.word_embed,
                                   self.symbol2index.lookup(self.entities)),
            [encoder_batch_size, -1, self.num_embed_units])

        head, relation, tail = tf.split(triples_embedding,
                                        [self.num_trans_units] * 3,
                                        axis=3)
        with tf.variable_scope('graph_attention', reuse=tf.AUTO_REUSE):
            head_tail = tf.concat([head, tail], axis=3)
            head_tail_transformed = tf.layers.dense(
                head_tail,
                self.num_trans_units,
                activation=tf.tanh,
                name='head_tail_transform')
            relation_transformed = tf.layers.dense(
                relation, self.num_trans_units, name='relation_transform')
            # Score each triple by <W_r r, tanh(W_ht [h; t])>, normalise
            # over the last axis and pool.
            e_weight = tf.reduce_sum(relation_transformed *
                                     head_tail_transformed,
                                     axis=3)
            alpha_weight = tf.nn.softmax(e_weight)
            graph_embedding = tf.reduce_sum(
                tf.expand_dims(alpha_weight, 3) * head_tail, axis=2)
        return triples_embedding, entities_word_embedding, graph_embedding

    def _build_kg_graph(self):
        """Select per-token graph vectors and per-token triple vectors.

        Returns:
            ``(graph_embed_input, triple_embed_input)``: the graph
            embedding gathered for every post position via
            ``posts_triple``, and the embedded ``responses_triple``
            reshaped to ``[batch, decoder_len, 3 * num_trans_units]``.
        """
        encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts))
        batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(
            self.responses)[1]

        # knowledge graph vectors: build (batch_index, graph_index) pairs
        # for every post position and gather.
        batch_index = tf.tile(
            tf.reshape(tf.range(encoder_batch_size, dtype=tf.int32),
                       [-1, 1, 1]), [1, encoder_len, 1])
        graph_embed_input = tf.gather_nd(
            self.graph_embedding,
            tf.concat([batch_index, self.posts_triple], axis=2))

        # knowledge triple vectors
        triple_embed_input = tf.reshape(
            tf.nn.embedding_lookup(
                self.entity_embed,
                self.entity2index.lookup(self.responses_triple)),
            [batch_size, decoder_len, 3 * self.num_trans_units])
        return graph_embed_input, triple_embed_input

    def _build_encoder(self, graph_embed_input):
        """Run the multi-layer GRU encoder over ``[word; graph]`` inputs.

        Returns:
            ``(encoder_output, encoder_state)`` from ``dynamic_rnn``.
        """
        post_word_input = tf.nn.embedding_lookup(
            self.word_embed, self.posts_word_id)  # [batch, len, unit]
        encoder_cell = MultiRNNCell(
            [GRUCell(self.num_units) for _ in range(self.num_layers)])

        # encoder input: e(x_t) = [w(x_t); g_i]
        encoder_input = tf.concat([post_word_input, graph_embed_input],
                                  axis=2)
        encoder_output, encoder_state = dynamic_rnn(encoder_cell,
                                                    encoder_input,
                                                    self.posts_length,
                                                    dtype=tf.float32,
                                                    scope="encoder")
        # encoder_output: [batch_size, max_time, cell.output_size]
        return encoder_output, encoder_state

    def _build_decoder(self, encoder_output, encoder_state,
                       triple_embed_input):
        """Build the training and inference decoders.

        Sets ``decoder_loss``, ``ppx_loss`` and ``sentence_ppx`` (named
        ``'ppx_loss'``) from the training path, and ``generation`` (named
        ``'generation'``) from the inference path.

        NOTE(review): the original indentation was lost; both
        ``variable_scope('decoder')`` blocks are assumed to extend to the
        end of their respective section. The final names of the
        ``'ppx_loss'`` and ``'generation'`` tensors depend on that --
        confirm before fetching them by name.
        """
        # decoder input: e(y_t) = [w(y_t); k_j]
        encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts))
        response_word_input = tf.nn.embedding_lookup(
            self.word_embed, self.responses_word_id)  # [batch, len, unit]
        decoder_input = tf.concat([response_word_input, triple_embed_input],
                                  axis=2)
        print("decoder_input:", decoder_input.shape)

        # define cell
        decoder_cell = MultiRNNCell(
            [GRUCell(self.num_units) for _ in range(self.num_layers)])

        # get loss functions
        sequence_loss, total_loss = loss_computation(
            self.vocab_size, num_samples=self.num_samples)

        # decoder training process
        with tf.variable_scope('decoder'):
            # prepare attention
            attention_keys, attention_values, attention_score_fn, \
                attention_construct_fn = prepare_attention(
                    encoder_output,
                    'bahdanau',
                    self.num_units,
                    scope_name="decoder",
                    imem=(self.graph_embedding, self.triples_embedding),
                    output_alignments=self.output_alignments)
            print("graph_embedding:", self.graph_embedding.shape)
            print("triples_embedding:", self.triples_embedding.shape)
            decoder_fn_train = attention_decoder_fn_train(
                encoder_state,
                attention_keys,
                attention_values,
                attention_score_fn,
                attention_construct_fn,
                output_alignments=self.output_alignments,
                max_length=tf.reduce_max(self.responses_length))

            # train decoder
            decoder_output, _, decoder_context_state = dynamic_rnn_decoder(
                decoder_cell,
                decoder_fn_train,
                decoder_input,
                self.responses_length,
                scope="decoder_rnn")
            output_fn, selector_fn = output_projection(
                self.vocab_size, scope_name="decoder_rnn")
            output_logits = output_fn(decoder_output)
            selector_logits = selector_fn(decoder_output)
            print("decoder_output:",
                  decoder_output.shape)  # [batch, seq, num_units]
            print("output_logits:", output_logits.shape)
            print("selector_fn:", selector_logits.name)

            triple_len = tf.shape(self.triples)[2]
            one_hot_triples = tf.one_hot(self.match_triples, triple_len)
            use_triples = tf.reduce_sum(one_hot_triples, axis=[2, 3])
            alignments = tf.transpose(decoder_context_state.stack(),
                                      perm=[1, 0, 2, 3])
            self.decoder_loss, self.ppx_loss, self.sentence_ppx \
                = total_loss(output_logits, selector_logits,
                             self.responses_target, self.decoder_mask,
                             alignments, use_triples, one_hot_triples)
            self.sentence_ppx = tf.identity(self.sentence_ppx,
                                            name="ppx_loss")

        # decoder inference process
        with tf.variable_scope('decoder', reuse=True):
            # prepare attention
            attention_keys, attention_values, attention_score_fn, \
                attention_construct_fn = prepare_attention(
                    encoder_output,
                    'bahdanau',
                    self.num_units,
                    scope_name="decoder",
                    imem=(self.graph_embedding, self.triples_embedding),
                    output_alignments=self.output_alignments,
                    reuse=True)
            output_fn, selector_fn = output_projection(self.vocab_size,
                                                       scope_name=None,
                                                       reuse=True)
            decoder_fn_inference = attention_decoder_fn_inference(
                output_fn,
                encoder_state,
                attention_keys,
                attention_values,
                attention_score_fn,
                attention_construct_fn,
                self.word_embed,
                GO_ID,
                EOS_ID,
                self.max_length,
                self.vocab_size,
                imem=(self.entities_word_embedding,
                      tf.reshape(self.triples_embedding, [
                          encoder_batch_size, -1, 3 * self.num_trans_units
                      ])),
                selector_fn=selector_fn)

            # get decoder output
            decoder_distribution, _, infer_context_state \
                = dynamic_rnn_decoder(decoder_cell,
                                      decoder_fn_inference,
                                      scope="decoder_rnn")
            output_len = tf.shape(decoder_distribution)[1]
            output_ids = tf.transpose(
                infer_context_state.gather(tf.range(output_len)))

            # Positive ids are looked up as words; negated ids are used
            # as (per-example) indices into the flattened entity strings.
            word_ids = tf.cast(
                tf.clip_by_value(output_ids, 0, self.vocab_size), tf.int64)
            entity_ids = tf.reshape(
                tf.clip_by_value(-output_ids, 0, self.vocab_size) +
                tf.reshape(
                    tf.range(encoder_batch_size) *
                    tf.shape(self.entities_word_embedding)[1], [-1, 1]),
                [-1])
            entities = tf.reshape(
                tf.gather(tf.reshape(self.entities, [-1]), entity_ids),
                [-1, output_len])
            words = self.index2symbol.lookup(word_ids)
            self.generation = tf.where(output_ids > 0, words, entities)
            self.generation = tf.identity(self.generation,
                                          name='generation')

    def set_vocabs(self, session, vocab, entity_vocab, relation_vocab):
        """Fill the four lookup tables and return ``session``.

        Words get ids ``0 .. vocab_size - 1`` in the order of ``vocab``;
        entities followed by relations get consecutive ids starting at 0.
        """
        op_in = self.symbol2index.insert(
            constant_op.constant(vocab),
            constant_op.constant(list(range(self.vocab_size)),
                                 dtype=tf.int64))
        session.run(op_in)
        op_out = self.index2symbol.insert(
            constant_op.constant(list(range(self.vocab_size)),
                                 dtype=tf.int64),
            constant_op.constant(vocab))
        session.run(op_out)
        op_in = self.entity2index.insert(
            constant_op.constant(entity_vocab + relation_vocab),
            constant_op.constant(list(
                range(len(entity_vocab) + len(relation_vocab))),
                                 dtype=tf.int64))
        session.run(op_in)
        op_out = self.index2entity.insert(
            constant_op.constant(list(
                range(len(entity_vocab) + len(relation_vocab))),
                                 dtype=tf.int64),
            constant_op.constant(entity_vocab + relation_vocab))
        session.run(op_out)
        return session

    def print_parameters(self):
        """Print the name and static shape of every entry in ``params``."""
        for item in self.params:
            print('%s: %s' % (item.name, item.get_shape().as_list()))

    def step_train(self, session, data, forward_only=False, summary=False):
        """Run one training (or evaluation) step.

        Args:
            session: object whose ``run(fetches, feed_dict)`` is called.
            data: dict with keys ``'posts'``, ``'posts_length'``,
                ``'responses'``, ``'responses_length'``, ``'triples'``,
                ``'posts_triple'``, ``'responses_triple'`` and
                ``'match_triples'``. The ``entities`` and
                ``entity_masks`` placeholders are not fed here.
            forward_only: when true, only ``sentence_ppx`` is fetched and
                no update is applied.
            summary: when true, the merged summary op is fetched as well.

        Returns:
            The ``session.run`` result for the selected fetches.
        """
        input_feed = {
            self.posts: data['posts'],
            self.posts_length: data['posts_length'],
            self.responses: data['responses'],
            self.responses_length: data['responses_length'],
            self.triples: data['triples'],
            self.posts_triple: data['posts_triple'],
            self.responses_triple: data['responses_triple'],
            self.match_triples: data['match_triples']
        }
        if forward_only:
            output_feed = [self.sentence_ppx]
        else:
            output_feed = [self.sentence_ppx, self.decoder_loss, self.update]
        if summary:
            output_feed.append(self.merged_summary_op)
        return session.run(output_feed, input_feed)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_layers,
             embed,
             entity_embed=None,
             num_entities=0,
             num_trans_units=100,
             learning_rate=0.0001,
             learning_rate_decay_factor=0.95,
             max_gradient_norm=5.0,
             num_samples=512,
             max_length=60,
             output_alignments=True,
             use_lstm=False):
    """Build the whole graph: inputs, tables, encoder, decoders, optimizer.

    Args:
        num_symbols: word vocabulary size; rows of the word embedding when
            `embed` is None, and passed to the output projection and to
            the inference decoder.
        num_embed_units: word embedding width.
        num_units: GRU size of every encoder/decoder layer.
        num_layers: number of stacked GRU layers.
        embed: initial word embedding matrix, or None for the default
            initializer.
        entity_embed: initial entity embedding matrix, or None.
        num_entities: rows of the entity embedding when `entity_embed` is
            None.
        num_trans_units: entity embedding width; a triple is reshaped to
            3 * num_trans_units.
        learning_rate: initial value of `self.learning_rate` and the value
            handed to Adam.
        learning_rate_decay_factor: multiplier applied by
            `self.learning_rate_decay_op`.
        max_gradient_norm: global-norm clipping threshold.
        num_samples: forwarded to `output_projection_layer`.
        max_length: forwarded to the inference decoder function.
        output_alignments: selects the alignment-aware loss and the
            word/entity generation path.
        use_lstm: not read anywhere in this method.

    NOTE(review): this source was whitespace-collapsed; the extent of the
    `with`/`if` blocks below was reconstructed from syntax -- confirm
    against the original indentation.
    """
    # Feed placeholders (all named, so they can be fetched by name).
    self.posts = tf.placeholder(tf.string, (None, None), 'enc_inps')  # batch*len
    self.posts_length = tf.placeholder(tf.int32, (None), 'enc_lens')  # batch
    self.responses = tf.placeholder(tf.string, (None, None), 'dec_inps')  # batch*len
    self.responses_length = tf.placeholder(tf.int32, (None), 'dec_lens')  # batch
    self.entities = tf.placeholder(tf.string, (None, None), 'entities')  # batch
    self.entity_masks = tf.placeholder(tf.string, (None, None), 'entity_masks')  # batch
    self.triples = tf.placeholder(tf.string, (None, None, 3), 'triples')  # batch
    self.posts_triple = tf.placeholder(tf.int32, (None, None, 1), 'enc_triples')  # batch
    self.responses_triple = tf.placeholder(tf.string, (None, None, 3), 'dec_triples')  # batch
    self.match_triples = tf.placeholder(tf.int32, (None, None), 'match_triples')  # batch

    encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts))
    triple_num = tf.shape(self.triples)[1]

    # One-hot over the triple axis, then summed over axis 2.
    #use_triples = tf.reduce_sum(tf.cast(tf.greater_equal(self.match_triples, 0), tf.float32), axis=-1)
    one_hot_triples = tf.one_hot(self.match_triples, triple_num)
    use_triples = tf.reduce_sum(one_hot_triples, axis=[2])

    # String<->id tables for words and for entities/relations; unknown keys
    # map to UNK_ID / '_UNK' and NONE_ID / '_NONE' respectively.
    self.symbol2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=UNK_ID,
        shared_name="in_table",
        name="in_table",
        checkpoint=True)
    self.index2symbol = MutableHashTable(
        key_dtype=tf.int64,
        value_dtype=tf.string,
        default_value='_UNK',
        shared_name="out_table",
        name="out_table",
        checkpoint=True)
    self.entity2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=NONE_ID,
        shared_name="entity_in_table",
        name="entity_in_table",
        checkpoint=True)
    self.index2entity = MutableHashTable(
        key_dtype=tf.int64,
        value_dtype=tf.string,
        default_value='_NONE',
        shared_name="entity_out_table",
        name="entity_out_table",
        checkpoint=True)

    # build the vocab table (string to index)
    self.posts_word_id = self.symbol2index.lookup(self.posts)  # batch*len
    self.posts_entity_id = self.entity2index.lookup(self.posts)  # batch*len
    self.responses_target = self.symbol2index.lookup(self.responses)  # batch*len

    batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(self.responses)[1]
    # Decoder input = GO_ID column followed by the targets without their
    # last column (teacher forcing shifted by one step).
    self.responses_word_id = tf.concat([tf.ones([batch_size, 1], dtype=tf.int64)*GO_ID,
        tf.split(self.responses_target, [decoder_len-1, 1], 1)[0]], 1)  # batch*len
    # One-hot at index (length - 1), reverse-cumsummed along time: ones up
    # to and including that index, zeros after it.
    self.decoder_mask = tf.reshape(tf.cumsum(tf.one_hot(self.responses_length-1,
        decoder_len), reverse=True, axis=1), [-1, decoder_len])

    # build the embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('word_embed', [num_symbols, num_embed_units], tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('word_embed', dtype=tf.float32, initializer=embed)
    if entity_embed is None:
        # initialize the embedding randomly
        self.entity_trans = tf.get_variable('entity_embed', [num_entities, num_trans_units], tf.float32, trainable=False)
    else:
        # initialize the embedding by pre-trained entity vectors
        self.entity_trans = tf.get_variable('entity_embed', dtype=tf.float32, initializer=entity_embed, trainable=False)

    # Entity vectors go through a tanh dense layer; 7 extra zero-initialised
    # rows are stacked in front of them.
    # NOTE(review): the 7 presumably matches the number of special entity
    # ids that precede real entities in the entity vocabulary -- confirm.
    self.entity_trans_transformed = tf.layers.dense(self.entity_trans, num_trans_units, activation=tf.tanh, name='trans_transformation')
    padding_entity = tf.get_variable('entity_padding_embed', [7, num_trans_units], dtype=tf.float32, initializer=tf.zeros_initializer())

    self.entity_embed = tf.concat([padding_entity, self.entity_trans_transformed], axis=0)

    # Each (head, relation, tail) triple becomes one 3*num_trans_units vector.
    triples_embedding = tf.reshape(tf.nn.embedding_lookup(self.entity_embed, self.entity2index.lookup(self.triples)), [encoder_batch_size, triple_num, 3 * num_trans_units])
    # Entity strings are also embedded with the *word* embedding table.
    entities_word_embedding = tf.reshape(tf.nn.embedding_lookup(self.embed, self.symbol2index.lookup(self.entities)), [encoder_batch_size, -1, num_embed_units])

    self.encoder_input = tf.nn.embedding_lookup(self.embed, self.posts_word_id)  # batch*len*unit
    self.decoder_input = tf.nn.embedding_lookup(self.embed, self.responses_word_id)  # batch*len*unit

    # Separate cell stacks for the encoder and the decoder.
    encoder_cell = MultiRNNCell([GRUCell(num_units) for _ in range(num_layers)])
    decoder_cell = MultiRNNCell([GRUCell(num_units) for _ in range(num_layers)])

    # rnn encoder
    encoder_output, encoder_state = dynamic_rnn(encoder_cell, self.encoder_input,
            self.posts_length, dtype=tf.float32, scope="encoder")

    # get output projection function
    output_fn, selector_fn, sequence_loss, sampled_sequence_loss, total_loss = output_projection_layer(num_units,
            num_symbols, num_samples)

    # Training decoder: creates the decoder variables.
    with tf.variable_scope('decoder'):
        # get attention function ('luong' was a previous alternative)
        attention_keys_init, attention_values_init, attention_score_fn_init, attention_construct_fn_init \
                = prepare_attention(encoder_output, 'bahdanau', num_units, imem=triples_embedding, output_alignments=output_alignments)

        decoder_fn_train = attention_decoder_fn_train(
                encoder_state, attention_keys_init, attention_values_init, attention_score_fn_init, attention_construct_fn_init, output_alignments=output_alignments, max_length=tf.reduce_max(self.responses_length))
        self.decoder_output, _, alignments_ta = dynamic_rnn_decoder(decoder_cell, decoder_fn_train,
                self.decoder_input, self.responses_length, scope="decoder_rnn")
        if output_alignments:
            # Stacked alignments are transposed with perm [1, 0, 2].
            self.alignments = tf.transpose(alignments_ta.stack(), perm=[1,0,2])
            self.decoder_loss, self.ppx_loss, self.sentence_ppx = total_loss(self.decoder_output, self.responses_target, self.decoder_mask, self.alignments, triples_embedding, use_triples, one_hot_triples)
            # Give the per-sentence perplexity a stable name for fetching.
            self.sentence_ppx = tf.identity(self.sentence_ppx, 'ppx_loss')
        else:
            self.decoder_loss, self.sentence_ppx = sequence_loss(self.decoder_output,
                    self.responses_target, self.decoder_mask)
            self.sentence_ppx = tf.identity(self.sentence_ppx, 'ppx_loss')

    # Inference decoder: reuses the variables created above.
    with tf.variable_scope('decoder', reuse=True):
        # get attention function ('luong' was a previous alternative)
        attention_keys, attention_values, attention_score_fn, attention_construct_fn \
                = prepare_attention(encoder_output, 'bahdanau', num_units, reuse=True, imem=triples_embedding, output_alignments=output_alignments)
        decoder_fn_inference = attention_decoder_fn_inference(
                output_fn, encoder_state, attention_keys, attention_values, attention_score_fn, attention_construct_fn, self.embed, GO_ID, EOS_ID, max_length, num_symbols, imem=entities_word_embedding, selector_fn=selector_fn)

        self.decoder_distribution, _, output_ids_ta = dynamic_rnn_decoder(decoder_cell,
                decoder_fn_inference, scope="decoder_rnn")
        if output_alignments:
            output_len = tf.shape(self.decoder_distribution)[1]
            output_ids = tf.transpose(output_ids_ta.gather(tf.range(output_len)))
            # Ids > 0 are looked up as words; for the rest, the negated id
            # indexes this example's row of `self.entities` (see tf.where).
            word_ids = tf.cast(tf.clip_by_value(output_ids, 0, num_symbols), tf.int64)
            # Add a per-example offset so the index addresses the flattened
            # [batch * entities_per_example] entity array.
            entity_ids = tf.reshape(tf.clip_by_value(-output_ids, 0, num_symbols) + tf.reshape(tf.range(encoder_batch_size) * tf.shape(entities_word_embedding)[1], [-1, 1]), [-1])
            entities = tf.reshape(tf.gather(tf.reshape(self.entities, [-1]), entity_ids), [-1, output_len])
            words = self.index2symbol.lookup(word_ids)
            self.generation = tf.where(output_ids > 0, words, entities, name='generation')
        else:
            self.generation_index = tf.argmax(self.decoder_distribution, 2)

            self.generation = self.index2symbol.lookup(self.generation_index, name='generation')

    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate),
            trainable=False, dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # NOTE(review): this is *all* global variables, including those created
    # with trainable=False (e.g. 'entity_embed'); gradients are requested
    # for every one of them below -- confirm that is intended.
    self.params = tf.global_variables()

    # calculate the gradient of parameters
    #opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    # NOTE(review): Adam receives the Python number `learning_rate`, not the
    # `self.learning_rate` variable, so running `learning_rate_decay_op`
    # does not change what was passed here -- confirm that is intended.
    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
    self.lr = opt._lr

    gradients = tf.gradients(self.decoder_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(gradients,
            max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
            global_step=self.global_step)

    # Summaries: scalar loss plus one histogram per trainable variable.
    tf.summary.scalar('decoder_loss', self.decoder_loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)

    self.merged_summary_op = tf.summary.merge_all()

    # Two savers: a rolling one (3 kept, plus one every hour) and a
    # long-history one (up to 1000 kept).
    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
            max_to_keep=3, pad_step_number=True, keep_checkpoint_every_n_hours=1.0)
    self.saver_epoch = tf.train.Saver(write_version=tf.train.SaverDef.V2, max_to_keep=1000, pad_step_number=True)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_layers,
             num_labels,
             embed,
             learning_rate=0.005,
             max_gradient_norm=5.0):
    """Build a multi-layer RNN text classifier graph.

    The original skeleton left the placeholders, the embedding lookup and
    the output projection as empty TODOs (which is not valid Python); they
    are filled in here.

    Args:
        num_symbols: vocabulary size; rows of the embedding when `embed`
            is None.
        num_embed_units: embedding width.
        num_units: hidden size of every RNN layer.
        num_layers: number of stacked RNN layers.
        num_labels: number of output classes.
        embed: initial embedding matrix, or None for the default
            initializer.
        learning_rate: initial value of `self.learning_rate`.
        max_gradient_norm: global-norm clipping threshold.

    NOTE(review): this source was whitespace-collapsed; the extent of the
    `with`/`if` blocks below was reconstructed from syntax -- confirm.
    """
    self.texts = tf.placeholder(tf.string, (None, None), 'texts')  # shape: [batch, length]
    # Lengths are int32 as expected by dynamic_rnn's sequence_length;
    # labels are int64 so they compare directly with tf.argmax's output.
    self.texts_length = tf.placeholder(tf.int32, (None,), 'texts_length')  # shape: [batch]
    self.labels = tf.placeholder(tf.int64, (None,), 'labels')  # shape: [batch]

    # build the vocab table (string to index)
    self.symbol2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=UNK_ID,
        shared_name="in_table",
        name="in_table",
        checkpoint=True)

    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)

    self.index_input = self.symbol2index.lookup(self.texts)  # shape: [batch, length]

    # build the embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('embed', [num_symbols, num_embed_units], tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('embed', dtype=tf.float32, initializer=embed)

    self.embed_input = tf.nn.embedding_lookup(
        self.embed, self.index_input)  # shape: [batch, length, num_embed_units]

    # TODO: other cell types can replace BasicRNNCell; the `vectors`
    # selection below must then be adapted to that cell's state structure.
    cell = MultiRNNCell([BasicRNNCell(num_units) for _ in range(num_layers)])
    _, states = dynamic_rnn(cell, self.embed_input, self.texts_length,
                            dtype=tf.float32, scope="rnn")

    # `states` holds one final state per layer; take the top layer's.
    vectors = states[-1]  # shape: [batch, num_units]

    with tf.variable_scope('logits'):
        weight = tf.get_variable("weights", [num_units, num_labels])
        bias = tf.get_variable("biases", [num_labels])
        # linear transformation: [batch, num_units] -> [batch, num_labels]
        logits = tf.matmul(vectors, weight) + bias

    # Summed (not averaged) cross-entropy and number of correct predictions.
    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels, logits=logits),
        name='loss')
    predict_labels = tf.argmax(logits, 1, 'predict_labels')
    self.accuracy = tf.reduce_sum(
        tf.cast(tf.equal(self.labels, predict_labels), tf.int32), name='accuracy')

    self.params = tf.trainable_variables()

    # calculate the gradient of parameters
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=5, pad_step_number=True)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_layers,
             num_labels,
             embed,
             learning_rate,
             max_gradient_norm=5.0,
             param_da=150,
             param_r=10):
    """Build a bidirectional-LSTM classifier with structured self-attention.

    Args:
        num_symbols: vocabulary size; rows of the embedding when `embed`
            is None.
        num_embed_units: embedding width.
        num_units: hidden size of each LSTM layer (per direction).
        num_layers: number of stacked LSTM layers per direction.
        num_labels: number of output classes.
        embed: initial embedding matrix, or None for the default
            initializer.
        learning_rate: initial value of `self.learning_rate`.
        max_gradient_norm: global-norm clipping threshold.
        param_da: width of the attention hidden layer (Ws1 columns).
        param_r: number of attention hops (Ws2 columns).

    Changes relative to the previous version:
      * the forward and backward stacks are built from separate cell
        objects; previously both `MultiRNNCell`s wrapped the very same
        `BasicLSTMCell` instances, which either raises or silently ties
        the two directions' weights depending on the TF 1.x release;
      * the attention softmax is taken over the time axis, so every hop is
        a distribution over positions; previously it normalised over the
        hop axis.
    Both changes alter the trained model, so old checkpoints may not match.

    NOTE(review): this source was whitespace-collapsed; the extent of the
    `with`/`if` blocks below was reconstructed from syntax -- confirm.
    """
    self.texts = tf.placeholder(tf.string, (None, None), 'texts')  # shape: [batch, length]
    self.texts_length = tf.placeholder(tf.int32, None, 'texts_length')  # shape: [batch]
    self.labels = tf.placeholder(tf.int32, None, 'labels')  # shape: [batch]

    # build the vocab table (string to index)
    self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=UNK_ID,
                                         shared_name="in_table",
                                         name="in_table",
                                         checkpoint=True)

    batch_size = tf.shape(self.texts)[0]

    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)

    self.index_input = self.symbol2index.lookup(self.texts)  # shape: [batch, length]

    # build the embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('embed', [num_symbols, num_embed_units], tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('embed', dtype=tf.float32, initializer=embed)

    self.embed_input = tf.nn.embedding_lookup(
        self.embed, self.index_input)  # shape: [batch, length, num_embed_units]

    def build_stack():
        # Fresh cell objects on every call so no two stacks share a cell.
        return MultiRNNCell([BasicLSTMCell(num_units) for _ in range(num_layers)])

    cell_fw = build_stack()
    cell_bw = build_stack()

    outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw,
                                                 cell_bw,
                                                 self.embed_input,
                                                 self.texts_length,
                                                 dtype=tf.float32,
                                                 scope="rnn")
    H = tf.concat(outputs, 2)  # shape: (batch, length, 2*num_units)

    with tf.variable_scope('logits'):
        Ws1 = tf.get_variable("Ws1", [2 * num_units, param_da])
        Ws2 = tf.get_variable("Ws2", [param_da, param_r])
        hidden = tf.tanh(tf.einsum('aij,jr->air', H, Ws1))  # (batch, length, param_da)
        scores = tf.einsum('aij,jr->air', hidden, Ws2)  # (batch, length, param_r)
        # Second positional argument is the softmax axis: normalise over
        # time (axis 1) so each of the param_r hops sums to 1 over positions.
        A = tf.nn.softmax(scores, 1)  # (batch, length, param_r)
        # A^T H is (batch, param_r, 2*num_units); the hops are then summed.
        M = tf.reduce_sum(tf.einsum('aij,aik->ajk', A, H), axis=1)  # (batch, 2*num_units)
        logits = tf.layers.dense(
            M, num_labels, activation=None, name='projection')  # shape: (batch, num_labels)

    # Redundancy penalty: norm of (A^T A - I) taken over the whole batch.
    identity = tf.reshape(
        tf.tile(tf.diag(tf.ones([param_r])), [batch_size, 1]),
        [batch_size, param_r, param_r])
    gram = tf.matmul(A, A, transpose_a=True)  # (batch, param_r, param_r)
    self.penalized_term = tf.norm(gram - identity)

    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels, logits=logits),
        name='loss') + 0.001 * self.penalized_term
    predict_labels = tf.argmax(logits, 1, 'predict_labels')
    # argmax yields int64 while the labels are int32, hence the cast.
    self.accuracy = tf.reduce_sum(tf.cast(
        tf.equal(self.labels, tf.cast(predict_labels, tf.int32)), tf.int32),
        name='accuracy')

    self.params = tf.trainable_variables()

    # calculate the gradient of parameters
    opt = tf.train.MomentumOptimizer(self.learning_rate, 0.9)
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=5,
                                pad_step_number=True)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_layers,
             beam_size,
             embed,
             learning_rate=0.5,
             remove_unk=False,
             learning_rate_decay_factor=0.95,
             max_gradient_norm=5.0,
             num_samples=512,
             max_length=8,
             use_lstm=False):
    """Build an attention seq2seq graph with greedy and beam-search decoding.

    Args:
        num_symbols: vocabulary size; rows of the embedding when `embed`
            is None, and passed to the projection layer and decoders.
        num_embed_units: embedding width.
        num_units: RNN cell size.
        num_layers: number of stacked layers in the shared cell.
        beam_size: beam width forwarded to the beam decoder; also used to
            reshape the beam outputs.
        embed: initial embedding matrix, or None for the default
            initializer.
        learning_rate: initial value of `self.learning_rate`.
        remove_unk: forwarded to the beam inference decoder function.
        learning_rate_decay_factor: multiplier applied by
            `self.learning_rate_decay_op`.
        max_gradient_norm: global-norm clipping threshold.
        num_samples: forwarded to `output_projection_layer`.
        max_length: forwarded to both inference decoders; beam outputs are
            reshaped to max_length + 1 steps.
        use_lstm: stack LSTMCell when true, GRUCell otherwise.

    NOTE(review): this source was whitespace-collapsed; the extent of the
    `with`/`if` blocks below was reconstructed from syntax -- confirm.
    """
    self.posts = tf.placeholder(tf.string, (None, None), 'enc_inps')  # batch*len
    self.posts_length = tf.placeholder(tf.int32, (None), 'enc_lens')  # batch
    self.responses = tf.placeholder(tf.string, (None, None), 'dec_inps')  # batch*len
    self.responses_length = tf.placeholder(tf.int32, (None), 'dec_lens')  # batch

    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # String<->id tables; unknown keys map to UNK_ID / '_UNK'.
    self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=UNK_ID,
                                         shared_name="in_table",
                                         name="in_table",
                                         checkpoint=True)
    self.index2symbol = MutableHashTable(key_dtype=tf.int64,
                                         value_dtype=tf.string,
                                         default_value='_UNK',
                                         shared_name="out_table",
                                         name="out_table",
                                         checkpoint=True)

    # build the vocab table (string to index)
    self.posts_input = self.symbol2index.lookup(self.posts)  # batch*len
    self.responses_target = self.symbol2index.lookup(
        self.responses)  # batch*len

    batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(
        self.responses)[1]
    # Decoder input = GO_ID column followed by the targets without their
    # last column.
    self.responses_input = tf.concat([
        tf.ones([batch_size, 1], dtype=tf.int64) * GO_ID,
        tf.split(self.responses_target, [decoder_len - 1, 1], 1)[0]
    ], 1)  # batch*len
    # One-hot at index (length - 1), reverse-cumsummed along time: ones up
    # to and including that index, zeros after it.
    self.decoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len),
                  reverse=True,
                  axis=1), [-1, decoder_len])

    # build the embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('embed',
                                     [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)

    self.encoder_input = tf.nn.embedding_lookup(
        self.embed, self.posts_input)  # batch*len*unit
    self.decoder_input = tf.nn.embedding_lookup(self.embed,
                                                self.responses_input)

    # NOTE(review): `[cell] * num_layers` repeats one cell object for every
    # layer, and the resulting `cell` is handed to the encoder and to all
    # three decoders below. Whether that is accepted, or shares weights,
    # depends on the TensorFlow release -- confirm against the version this
    # file targets.
    if use_lstm:
        cell = MultiRNNCell([LSTMCell(num_units)] * num_layers)
    else:
        cell = MultiRNNCell([GRUCell(num_units)] * num_layers)

    # rnn encoder
    encoder_output, encoder_state = dynamic_rnn(cell,
                                                self.encoder_input,
                                                self.posts_length,
                                                dtype=tf.float32,
                                                scope="encoder")

    # get output projection function
    output_fn, sampled_sequence_loss = output_projection_layer(
        num_units, num_symbols, num_samples)

    # get attention function
    attention_keys, attention_values, attention_score_fn, attention_construct_fn \
        = attention_decoder_fn.prepare_attention(encoder_output, 'luong', num_units)

    # Training decoder: creates the decoder variables.
    with tf.variable_scope('decoder'):
        decoder_fn_train = attention_decoder_fn.attention_decoder_fn_train(
            encoder_state, attention_keys, attention_values,
            attention_score_fn, attention_construct_fn)
        self.decoder_output, _, _ = dynamic_rnn_decoder(
            cell,
            decoder_fn_train,
            self.decoder_input,
            self.responses_length,
            scope="decoder_rnn")
        self.decoder_loss = sampled_sequence_loss(self.decoder_output,
                                                  self.responses_target,
                                                  self.decoder_mask)

    # Greedy inference decoder: reuses the variables created above.
    with tf.variable_scope('decoder', reuse=True):
        decoder_fn_inference = attention_decoder_fn.attention_decoder_fn_inference(
            output_fn, encoder_state, attention_keys, attention_values,
            attention_score_fn, attention_construct_fn, self.embed, GO_ID,
            EOS_ID, max_length, num_symbols)

        self.decoder_distribution, _, _ = dynamic_rnn_decoder(
            cell, decoder_fn_inference, scope="decoder_rnn")
        # The first two vocabulary entries are split off before the argmax
        # and the index is shifted back by 2, so ids 0 and 1 are never
        # generated here (original remark: "for removing UNK"). This is
        # applied regardless of `remove_unk`.
        self.generation_index = tf.argmax(
            tf.split(self.decoder_distribution, [2, num_symbols - 2],
                     2)[1], 2) + 2
        self.generation = self.index2symbol.lookup(self.generation_index,
                                                   name='generation')

    # Beam-search inference decoder: also reuses the decoder variables.
    with tf.variable_scope('decoder', reuse=True):
        decoder_fn_beam_inference = attention_decoder_fn_beam_inference(
            output_fn, encoder_state, attention_keys, attention_values,
            attention_score_fn, attention_construct_fn, self.embed, GO_ID,
            EOS_ID, max_length, num_symbols, beam_size, remove_unk)
        _, _, self.context_state = dynamic_rnn_decoder(
            cell, decoder_fn_beam_inference, scope="decoder_rnn")
        # The final context state carries six objects; five of them are
        # stacked below (log_beam_probs is unpacked but not used).
        (log_beam_probs, beam_parents, beam_symbols, result_probs,
         result_parents, result_symbols) = self.context_state

        # Each stack is reshaped to [max_length + 1, -1, width] and then
        # transposed with [1, 0, 2]; width is beam_size for the beam_*
        # tensors and beam_size * 2 for the result_* tensors.
        self.beam_parents = tf.transpose(tf.reshape(
            beam_parents.stack(), [max_length + 1, -1, beam_size]),
                                         [1, 0, 2],
                                         name='beam_parents')
        self.beam_symbols = tf.transpose(
            tf.reshape(beam_symbols.stack(),
                       [max_length + 1, -1, beam_size]), [1, 0, 2])
        # Symbol ids are converted back to strings.
        self.beam_symbols = self.index2symbol.lookup(tf.cast(
            self.beam_symbols, tf.int64),
                                                     name="beam_symbols")

        self.result_probs = tf.transpose(tf.reshape(
            result_probs.stack(), [max_length + 1, -1, beam_size * 2]),
                                         [1, 0, 2],
                                         name='result_probs')
        self.result_symbols = tf.transpose(
            tf.reshape(result_symbols.stack(),
                       [max_length + 1, -1, beam_size * 2]), [1, 0, 2])
        self.result_parents = tf.transpose(tf.reshape(
            result_parents.stack(), [max_length + 1, -1, beam_size * 2]),
                                           [1, 0, 2],
                                           name='result_parents')
        self.result_symbols = self.index2symbol.lookup(
            tf.cast(self.result_symbols, tf.int64), name='result_symbols')

    self.params = tf.trainable_variables()

    # calculate the gradient of parameters
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(self.decoder_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)

    # Exporter for serving: inputs are the two encoder placeholders,
    # outputs are the five beam-search tensors.
    self.model_exporter = exporter.Exporter(self.saver)
    inputs = {"enc_inps:0": self.posts, "enc_lens:0": self.posts_length}
    outputs = {
        "beam_symbols": self.beam_symbols,
        "beam_parents": self.beam_parents,
        "result_probs": self.result_probs,
        "result_symbols": self.result_symbols,
        "result_parents": self.result_parents
    }
    self.model_exporter.init(tf.get_default_graph().as_graph_def(),
                             named_graph_signatures={
                                 "inputs":
                                 exporter.generic_signature(inputs),
                                 "outputs":
                                 exporter.generic_signature(outputs)
                             })
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_layers,
             num_labels,
             embed,
             learning_rate=0.5,
             max_gradient_norm=5.0,
             model='LSTM'):
    """Build an LSTM / RNN / GRU text classifier graph.

    Args:
        num_symbols: vocabulary size; rows of the embedding when `embed`
            is None.
        num_embed_units: embedding width.
        num_units: hidden size of every recurrent layer.
        num_layers: number of recurrent layers (1 selects the single-cell
            path, anything else the stacked path).
        num_labels: number of output classes.
        embed: initial embedding matrix, or None for the default
            initializer.
        learning_rate: initial value of `self.learning_rate`.
        max_gradient_norm: global-norm clipping threshold.
        model: one of 'LSTM', 'RNN', 'GRU'.

    Raises:
        ValueError: if `model` is not one of the supported names.
            Previously this printed a message and returned a half-built
            object with no loss, update op or saver.

    Changes relative to the previous version (stacked path only):
      * every layer gets its own cell object instead of repeating one;
      * the initial state is no longer fixed to a batch of 16 -- passing
        `dtype` lets dynamic_rnn create a zero state for any batch size;
      * the feature is the output at each sequence's own last step rather
        than `outputs[:, -1, :]`, which is all zeros for every sequence
        shorter than the longest one in the batch.

    NOTE(review): this source was whitespace-collapsed; the extent of the
    `with`/`if` blocks below was reconstructed from syntax -- confirm.
    """
    cell_types = {'LSTM': BasicLSTMCell, 'RNN': BasicRNNCell, 'GRU': GRUCell}
    if model not in cell_types:
        # Fail before touching the graph so no partial model is left behind.
        raise ValueError("Wrong model!")

    self.texts = tf.placeholder(dtype=tf.string, shape=[None, None])  # shape: batch*len
    self.texts_length = tf.placeholder(dtype=tf.int32, shape=None)  # shape: batch
    self.labels = tf.placeholder(dtype=tf.int64, shape=None)  # shape: batch
    self.keep_prob = tf.placeholder(dtype=tf.float32)

    # build the vocab table (string to index)
    self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=UNK_ID,
                                         shared_name="in_table",
                                         name="in_table",
                                         checkpoint=True)

    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)

    self.index_input = self.symbol2index.lookup(self.texts)  # batch*len

    # build the embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('embed',
                                     [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)

    self.embed_input = tf.nn.embedding_lookup(
        self.embed, self.index_input)  # batch*len*embed_unit

    def make_cell():
        # A fresh cell of the requested type with dropout on its outputs.
        return tf.nn.rnn_cell.DropoutWrapper(
            cell_types[model](num_units),
            input_keep_prob=1.0,
            output_keep_prob=self.keep_prob)

    if num_layers == 1:
        outputs, states = dynamic_rnn(make_cell(),
                                      self.embed_input,
                                      self.texts_length,
                                      dtype=tf.float32,
                                      scope="rnn")
        if model == 'LSTM':
            # NOTE(review): a BasicLSTMCell state is a (c, h) pair, so
            # index 0 is the cell state c, not h. Kept as it was to avoid
            # changing this path's behaviour -- confirm which was intended.
            h_state = states[0]
        else:
            h_state = states
    else:
        multi_cell = tf.contrib.rnn.MultiRNNCell(
            [make_cell() for _ in range(num_layers)], state_is_tuple=True)
        outputs, _ = tf.nn.dynamic_rnn(multi_cell,
                                       self.embed_input,
                                       self.texts_length,
                                       dtype=tf.float32,
                                       scope="rnn",
                                       time_major=False)
        # Pick outputs[b, length_b - 1, :] for every example b. The index
        # is clamped at 0 so an empty sequence reads step 0.
        batch_size = tf.shape(outputs)[0]
        last_step = tf.maximum(self.texts_length - 1, 0)
        h_state = tf.gather_nd(
            outputs, tf.stack([tf.range(batch_size), last_step], axis=1))

    logits = tf.layers.dense(h_state, num_labels)

    # `self.loss` is the summed cross-entropy; the mean is what is optimised.
    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                       logits=logits),
        name='loss')
    mean_loss = self.loss / tf.cast(tf.shape(self.labels)[0],
                                    dtype=tf.float32)
    predict_labels = tf.argmax(logits, 1, 'predict_labels')
    self.accuracy = tf.reduce_sum(tf.cast(
        tf.equal(self.labels, predict_labels), tf.int32),
                                  name='accuracy')

    self.params = tf.trainable_variables()

    # calculate the gradient of parameters
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(mean_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    # Summaries: scalar loss plus one histogram per trainable variable.
    tf.summary.scalar('loss/step', self.loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)

    self.merged_summary_op = tf.summary.merge_all()

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_layers,
             num_labels,
             embed,
             learning_rate=0.005,
             max_gradient_norm=5.0,
             param_da=150,
             param_r=10):
    """Build a bidirectional-LSTM classifier with structured self-attention.

    Args:
        num_symbols: vocabulary size; rows of the embedding when `embed`
            is None.
        num_embed_units: embedding width.
        num_units: hidden size of each LSTM layer (per direction).
        num_layers: number of stacked LSTM layers per direction.
        num_labels: number of output classes.
        embed: initial embedding matrix, or None for the default
            initializer.
        learning_rate: initial value of `self.learning_rate`.
        max_gradient_norm: global-norm clipping threshold.
        param_da: width of the attention hidden layer (Ws1 columns).
        param_r: number of attention hops (Ws2 columns).

    Changes relative to the previous version:
      * the attention softmax is taken over the time axis, so every hop is
        a distribution over positions; previously it normalised over the
        hop axis (this alters the trained model);
      * the leftover debug print and the unused `vectors` local are gone;
      * the penalty trace(P^T P) is computed directly as the sum of the
        squared entries of P, which is the same quantity.

    NOTE(review): this source was whitespace-collapsed; the extent of the
    `with`/`if` blocks below was reconstructed from syntax -- confirm.
    """
    self.texts = tf.placeholder(tf.string, (None, None), 'texts')  # shape: [batch, length]
    self.texts_length = tf.placeholder(tf.int32, (None), 'texts_length')  # shape: [batch]
    self.labels = tf.placeholder(tf.int64, (None), 'labels')  # shape: [batch]

    # build the vocab table (string to index)
    self.symbol2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=UNK_ID,
        shared_name="in_table",
        name="in_table",
        checkpoint=True)

    batch_size = tf.shape(self.texts)[0]

    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)

    self.index_input = self.symbol2index.lookup(self.texts)  # shape: [batch, length]

    # build the embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('embed', [num_symbols, num_embed_units], tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('embed', dtype=tf.float32, initializer=embed)

    self.embed_input = tf.nn.embedding_lookup(
        self.embed, self.index_input)  # shape: [batch, length, num_embed_units]

    # Independent cell stacks for the two directions.
    cell_fw = MultiRNNCell([BasicLSTMCell(num_units) for _ in range(num_layers)])
    cell_bw = MultiRNNCell([BasicLSTMCell(num_units) for _ in range(num_layers)])

    outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw, cell_bw, self.embed_input, self.texts_length,
        dtype=tf.float32, scope="rnn")
    H = tf.concat(outputs, 2)  # shape: (batch, length, 2*num_units)

    with tf.variable_scope('logits'):
        Ws1 = tf.get_variable("Ws1", shape=[2 * num_units, param_da])
        Ws2 = tf.get_variable("Ws2", shape=[param_da, param_r])
        hidden = tf.nn.tanh(tf.einsum('aij,jk->aik', H, Ws1))  # (batch, length, param_da)
        scores = tf.einsum('aij,jk->aik', hidden, Ws2)  # (batch, length, param_r)
        # Second positional argument is the softmax axis: normalise over
        # time (axis 1) so each of the param_r hops sums to 1 over positions.
        A = tf.nn.softmax(scores, 1)  # (batch, length, param_r)
        # A^T H is (batch, param_r, 2*num_units); flatten the hops.
        M = tf.einsum('aij,aik->ajk', A, H)
        M = tf.reshape(M, [batch_size, param_r * 2 * num_units])
        logits = tf.layers.dense(
            M, num_labels, activation=None, name='projection')  # shape: (batch, num_labels)

    # Redundancy penalty: batch mean of the squared Frobenius norm of
    # P = A^T A - I.
    identity = tf.reshape(
        tf.tile(tf.diag(tf.ones([param_r])), [batch_size, 1]),
        [batch_size, param_r, param_r])
    P = tf.matmul(A, A, transpose_a=True) - identity  # (batch, param_r, param_r)
    self.penalized_term = tf.reduce_mean(tf.reduce_sum(tf.square(P), axis=[1, 2]))

    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels, logits=logits),
        name='loss') + 0.0001 * self.penalized_term
    predict_labels = tf.argmax(logits, 1, 'predict_labels')
    self.accuracy = tf.reduce_sum(
        tf.cast(tf.equal(self.labels, predict_labels), tf.int32), name='accuracy')

    self.params = tf.trainable_variables()

    # calculate the gradient of parameters
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=5, pad_step_number=True)