def calculate_loss(self, outputs):
    """Calculate the loss.

    :param outputs: in train mode, the raw decoder outputs (the sampled
        softmax projects them itself); otherwise the decoder outputs already
        projected by the output matrix w plus the bias v.
    :return: loss
    """
    with tf.variable_scope('loss'), tf.name_scope('loss'):

        def sampled_loss_func(inputs, labels):
            labels = tf.reshape(labels, [-1, 1])
            return tf.nn.sampled_softmax_loss(
                weights=self.w_t,
                biases=self.v,
                labels=labels,
                inputs=inputs,
                num_sampled=self.model_config.num_softmax_samples,
                num_classes=self.words_dict_len)

        if (self.model_config.num_softmax_samples != 0
                and self.model_config.model == 'train'):
            loss = seq2seq_lib.sampled_sequence_loss(
                outputs, self.targets, self.loss_weights, sampled_loss_func)
        else:
            loss = tf.contrib.legacy_seq2seq.sequence_loss(
                outputs, self.targets, self.loss_weights)
        return loss
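# Note: seq2seq_lib.sampled_sequence_loss is called throughout this file but
# defined elsewhere. For reference, a minimal sketch of its assumed behavior
# (apply the sampled loss per timestep, weight it, and average); treat this
# as an illustration, not the library's exact implementation.
def _sampled_sequence_loss_sketch(inputs, targets, weights, loss_function):
    # Per-timestep weighted losses; each term has shape [batch_size].
    log_perp_list = [loss_function(inp, target) * weight
                     for inp, target, weight in zip(inputs, targets, weights)]
    # Average across timesteps (guarding against all-zero weights),
    # then across the batch.
    log_perps = tf.add_n(log_perp_list) / (tf.add_n(weights) + 1e-12)
    return tf.reduce_mean(log_perps)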
def _add_seq2seq_old(self, sess):
    hps = self._hps
    vsize = self._vocab.NumIds()

    with tf.variable_scope('seq2seq'):
        encoder_inputs = tf.unpack(tf.transpose(self._articles))
        decoder_inputs = tf.unpack(tf.transpose(self._abstracts))
        targets = tf.unpack(tf.transpose(self._targets))
        loss_weights = tf.unpack(tf.transpose(self._loss_weights))
        article_lens = self._article_lens

        # Embedding shared by the input and outputs.
        with tf.variable_scope('Embedding'), tf.device('/gpu:0'):
            vsize = self._vocab.NumIds()
            embedding = tf.get_variable(
                'embedding', [vsize, hps.emb_dim], dtype=tf.float32,
                trainable=False,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            sess.run(tf.initialize_all_variables())

            if FLAGS.word2vec:
                # Initialize the matrix with random uniform values, then
                # overwrite rows for which word2vec provides a vector.
                initW = np.random.uniform(-0.25, 0.25, (vsize, hps.emb_dim))
                print('Load word2vec file {}\n'.format(FLAGS.word2vec))
                with open(FLAGS.word2vec, 'rb') as f:
                    header = f.readline()
                    vocab_size, layer1_size = map(int, header.split())
                    binary_len = np.dtype('float32').itemsize * layer1_size
                    for line in xrange(vocab_size):
                        word = []
                        while True:
                            ch = f.read(1)
                            if ch == ' ':
                                word = ''.join(word)
                                break
                            if ch != '\n':
                                word.append(ch)
                        idx = data.GetWordIds(word, self._vocab)
                        if idx is not None:
                            initW[idx] = np.fromstring(
                                f.read(binary_len), dtype='float32')
                        else:
                            f.read(binary_len)
                # Sanity check: look up one row before and after the assign.
                print('embedding row 2 before word2vec load:')
                print(sess.run(tf.nn.embedding_lookup(embedding, 2)))
                sess.run(embedding.assign(initW))
                print('embedding row 2 after word2vec load:')
                print(sess.run(tf.nn.embedding_lookup(embedding, 2)))

            # Embedding shared by the input and outputs.
            emb_encoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in encoder_inputs
            ]
            emb_decoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in decoder_inputs
            ]

        # A matrix-factorization (SVD) experiment on the embedded inputs was
        # sketched here in commented-out form: keep the singular values above
        # a threshold, rebuild a truncated eigenmatrix, and shrink either the
        # embedding dimension or the word length. See the FLAGS.svd_dim
        # variant later in this file for a working version.

        for layer_i in xrange(hps.enc_layers):
            with tf.variable_scope('encoder%d' % layer_i), tf.device(
                    self._next_device()):
                # Bidirectional RNN cells with dropout.
                cell_fw = tf.nn.rnn_cell.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123),
                    state_is_tuple=False)
                cell_bw = tf.nn.rnn_cell.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                    state_is_tuple=False)
                cell_fw = tf.nn.rnn_cell.DropoutWrapper(
                    cell_fw,
                    input_keep_prob=hps.input_dropout,
                    output_keep_prob=hps.output_dropout)
                cell_bw = tf.nn.rnn_cell.DropoutWrapper(
                    cell_bw,
                    input_keep_prob=hps.input_dropout,
                    output_keep_prob=hps.output_dropout)
                (emb_encoder_inputs, fw_state, _) = tf.nn.bidirectional_rnn(
                    cell_fw, cell_bw, emb_encoder_inputs, dtype=tf.float32,
                    sequence_length=article_lens)
        encoder_outputs = emb_encoder_inputs
        print('fw_state: %s' % str(fw_state))

        with tf.variable_scope('output_projection'):
            w = tf.get_variable(
                'w', [hps.num_hidden, vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            w_t = tf.transpose(w)
            v = tf.get_variable(
                'v', [vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        with tf.variable_scope('decoder'), tf.device(self._next_device()):
            # When decoding, use model output from the previous step
            # for the next step.
            loop_function = None
            if hps.mode == 'decode':
                loop_function = _extract_argmax_and_embed(
                    embedding, (w, v), update_embedding=False)

            cell = tf.nn.rnn_cell.LSTMCell(
                hps.num_hidden,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                state_is_tuple=False)
            cell = tf.nn.rnn_cell.DropoutWrapper(
                cell,
                input_keep_prob=hps.input_dropout,
                output_keep_prob=hps.output_dropout)

            encoder_outputs = [
                tf.reshape(x, [hps.batch_size, 1, 2 * hps.num_hidden])
                for x in encoder_outputs
            ]
            self._enc_top_states = tf.concat(1, encoder_outputs)
            self._dec_in_state = fw_state
            # During decoding, follow-up _dec_in_state values are fed from
            # beam_search; dec_out_state is stored by beam_search for the
            # next step's feeding.
            initial_state_attention = (hps.mode == 'decode')
            decoder_outputs, self._dec_out_state = tf.nn.seq2seq.attention_decoder(
                emb_decoder_inputs, self._dec_in_state, self._enc_top_states,
                cell, num_heads=1, loop_function=loop_function,
                initial_state_attention=initial_state_attention)
            print('====emb_decoder_inputs: %s' % emb_decoder_inputs)
            print('====self._dec_in_state: %s' % self._dec_in_state)
            print('====self._enc_top_states: %s' % self._enc_top_states)
            print('====decoder_outputs: %s' % decoder_outputs)
            print('====self._dec_out_state: %s' % self._dec_out_state)

        with tf.variable_scope('output'), tf.device(self._next_device()):
            model_outputs = []
            for i in xrange(len(decoder_outputs)):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                model_outputs.append(
                    tf.nn.xw_plus_b(decoder_outputs[i], w, v))

        if hps.mode == 'decode':
            with tf.variable_scope('decode_output'), tf.device('/gpu:0'):
                best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                tf.logging.info('best_outputs%s', best_outputs[0].get_shape())
                self._outputs = tf.concat(1, [
                    tf.reshape(x, [hps.batch_size, 1]) for x in best_outputs
                ])
                self._topk_log_probs, self._topk_ids = tf.nn.top_k(
                    tf.log(tf.nn.softmax(model_outputs[-1])),
                    hps.batch_size * 2)

        with tf.variable_scope('loss'), tf.device(self._next_device()):

            def sampled_loss_func(inputs, labels):
                with tf.device('/gpu:0'):  # Try gpu.
                    labels = tf.reshape(labels, [-1, 1])
                    tf.logging.info('num_sampled%s', hps.num_softmax_samples)
                    return tf.nn.sampled_softmax_loss(
                        w_t, v, inputs, labels, hps.num_softmax_samples,
                        vsize)

            if hps.num_softmax_samples != 0 and hps.mode == 'train':
                self._loss = seq2seq_lib.sampled_sequence_loss(
                    decoder_outputs, targets, loss_weights,
                    sampled_loss_func)
            else:
                self._loss = tf.nn.seq2seq.sequence_loss(
                    model_outputs, targets, loss_weights)
            tf.scalar_summary('loss', tf.minimum(12.0, self._loss))
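# Every variant in this file builds its decode-time loop_function with
# _extract_argmax_and_embed, which is defined elsewhere. For reference, a
# minimal sketch in the spirit of the legacy seq2seq helper (assumed, not
# necessarily this repo's exact code):
def _extract_argmax_and_embed_sketch(embedding, output_projection=None,
                                     update_embedding=True):
    """Return a loop_function that feeds back the previous step's argmax."""
    def loop_function(prev, _):
        if output_projection is not None:
            # Project the raw decoder output to vocabulary logits.
            prev = tf.nn.xw_plus_b(
                prev, output_projection[0], output_projection[1])
        prev_symbol = tf.argmax(prev, 1)
        emb_prev = tf.nn.embedding_lookup(embedding, prev_symbol)
        if not update_embedding:
            # Don't backpropagate through the greedy re-embedding.
            emb_prev = tf.stop_gradient(emb_prev)
        return emb_prev
    return loop_function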
def _add_seq2seq(self):
    hps = self._hps
    vsize = self._vocab.NumIds()

    with tf.variable_scope('seq2seq'):
        encoder_inputs = tf.unpack(tf.transpose(self._articles))
        decoder_inputs = tf.unpack(tf.transpose(self._abstracts))
        targets = tf.unpack(tf.transpose(self._targets))
        loss_weights = tf.unpack(tf.transpose(self._loss_weights))
        article_lens = self._article_lens

        # Embedding shared by the input and outputs.
        with tf.variable_scope('embedding'), tf.device('/cpu:0'):
            embedding = tf.get_variable(
                'embedding', [vsize, hps.emb_dim], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            emb_encoder_inputs = [tf.nn.embedding_lookup(embedding, x)
                                  for x in encoder_inputs]
            emb_decoder_inputs = [tf.nn.embedding_lookup(embedding, x)
                                  for x in decoder_inputs]

        for layer_i in xrange(hps.enc_layers):
            with tf.variable_scope('encoder%d' % layer_i), tf.device(
                    self._next_device()):
                cell_fw = tf.nn.rnn_cell.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123),
                    state_is_tuple=False)
                cell_bw = tf.nn.rnn_cell.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                    state_is_tuple=False)
                (emb_encoder_inputs, fw_state, _) = tf.nn.bidirectional_rnn(
                    cell_fw, cell_bw, emb_encoder_inputs, dtype=tf.float32,
                    sequence_length=article_lens)
        encoder_outputs = emb_encoder_inputs

        with tf.variable_scope('output_projection'):
            w = tf.get_variable(
                'w', [hps.num_hidden, vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            w_t = tf.transpose(w)
            v = tf.get_variable(
                'v', [vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        with tf.variable_scope('decoder'), tf.device(self._next_device()):
            # When decoding, use model output from the previous step
            # for the next step.
            loop_function = None
            if hps.mode == 'decode':
                loop_function = _extract_argmax_and_embed(
                    embedding, (w, v), update_embedding=False)

            cell = tf.nn.rnn_cell.LSTMCell(
                hps.num_hidden,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                state_is_tuple=False)

            encoder_outputs = [tf.reshape(x, [hps.batch_size, 1, 2 * hps.num_hidden])
                               for x in encoder_outputs]
            self._enc_top_states = tf.concat(1, encoder_outputs)
            self._dec_in_state = fw_state
            # During decoding, follow-up _dec_in_state values are fed from
            # beam_search; dec_out_state is stored by beam_search for the
            # next step's feeding.
            initial_state_attention = (hps.mode == 'decode')
            decoder_outputs, self._dec_out_state = tf.nn.seq2seq.attention_decoder(
                emb_decoder_inputs, self._dec_in_state, self._enc_top_states,
                cell, num_heads=1, loop_function=loop_function,
                initial_state_attention=initial_state_attention)

        with tf.variable_scope('output'), tf.device(self._next_device()):
            model_outputs = []
            for i in xrange(len(decoder_outputs)):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                model_outputs.append(
                    tf.nn.xw_plus_b(decoder_outputs[i], w, v))

        if hps.mode == 'decode':
            with tf.variable_scope('decode_output'), tf.device('/cpu:0'):
                best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                tf.logging.info('best_outputs%s', best_outputs[0].get_shape())
                self._outputs = tf.concat(
                    1, [tf.reshape(x, [hps.batch_size, 1]) for x in best_outputs])
                self._topk_log_probs, self._topk_ids = tf.nn.top_k(
                    tf.log(tf.nn.softmax(model_outputs[-1])),
                    hps.batch_size * 2)

        with tf.variable_scope('loss'), tf.device(self._next_device()):

            def sampled_loss_func(inputs, labels):
                with tf.device('/cpu:0'):  # Try gpu.
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        w_t, v, inputs, labels, hps.num_softmax_samples,
                        vsize)

            if hps.num_softmax_samples != 0 and hps.mode == 'train':
                self._loss = seq2seq_lib.sampled_sequence_loss(
                    decoder_outputs, targets, loss_weights,
                    sampled_loss_func)
            else:
                self._loss = tf.nn.seq2seq.sequence_loss(
                    model_outputs, targets, loss_weights)
            tf.scalar_summary('loss', tf.minimum(12.0, self._loss))
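# Note: the positional call above uses the pre-TF-1.0 argument order
# (weights, biases, inputs, labels, ...). TF 1.0 changed the signature so
# that labels precedes inputs, which is why the newer variants below pass
# keyword arguments. A version-proof call looks like:
#
#     tf.nn.sampled_softmax_loss(
#         weights=w_t, biases=v, labels=labels, inputs=inputs,
#         num_sampled=hps.num_softmax_samples, num_classes=vsize)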
def _add_seq2seq(self):
    vocab_size = self._vocab.num_ids()
    hyper_params = self._hyper_params
    embedding_size = hyper_params.emb_dim
    enc_layers = hyper_params.enc_layers

    with tf.variable_scope('seq2seq'):
        encoder_inputs = tf.unstack(tf.transpose(self._articles))
        decoder_inputs = tf.unstack(tf.transpose(self._abstracts))
        targets = tf.unstack(tf.transpose(self._targets))
        loss_weights = tf.unstack(tf.transpose(self._loss_weights))
        article_lens = self._article_lens

        # TODO: initialize using a pre-trained embedding.
        with tf.variable_scope('embedding'), tf.device('/cpu:0'):
            embedding = tf.get_variable(
                'embedding', [vocab_size, embedding_size], dtype=tf.float32,
                initializer=tf.random_normal_initializer(stddev=1e-4))
            emb_encoder_inputs = [tf.nn.embedding_lookup(embedding, x)
                                  for x in encoder_inputs]
            emb_decoder_inputs = [tf.nn.embedding_lookup(embedding, x)
                                  for x in decoder_inputs]

        for layer_i in xrange(enc_layers):
            with tf.variable_scope('encoder%d' % layer_i), tf.device(
                    self._next_device()):
                cell_fw = tf.contrib.rnn.LSTMCell(
                    hyper_params.num_hidden,
                    initializer=tf.contrib.layers.xavier_initializer(),
                    state_is_tuple=False)
                cell_bw = tf.contrib.rnn.LSTMCell(
                    hyper_params.num_hidden,
                    initializer=tf.contrib.layers.xavier_initializer(),
                    state_is_tuple=False)
                (emb_encoder_inputs, fw_state, _) = tf.contrib.rnn.static_bidirectional_rnn(
                    cell_fw, cell_bw, emb_encoder_inputs, dtype=tf.float32,
                    sequence_length=article_lens)
        encoder_outputs = emb_encoder_inputs

        with tf.variable_scope('output_projection'):
            w = tf.get_variable(
                'w', [hyper_params.num_hidden, vocab_size], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            w_t = tf.transpose(w)
            v = tf.get_variable(
                'v', [vocab_size], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        with tf.variable_scope('decoder'), tf.device(self._next_device()):
            # When decoding, use model output from the previous step
            # for the next step.
            loop_function = None
            if hyper_params.mode == 'decode':
                loop_function = self._extract_argmax_and_embed(
                    embedding, (w, v), update_embedding=False)

            cell = tf.contrib.rnn.LSTMCell(
                hyper_params.num_hidden,
                initializer=tf.contrib.layers.xavier_initializer(),
                state_is_tuple=False)

            encoder_outputs = [
                tf.reshape(x, [hyper_params.batch_size, 1, 2 * hyper_params.num_hidden])
                for x in encoder_outputs
            ]
            self._enc_top_states = tf.concat(axis=1, values=encoder_outputs)
            self._dec_in_state = fw_state
            # During decoding, follow-up _dec_in_state values are fed from
            # beam_search; dec_out_state is stored by beam_search for the
            # next feeding.
            initial_state_attention = (hyper_params.mode == 'decode')
            decoder_outputs, self._dec_out_state = tf.contrib.legacy_seq2seq.attention_decoder(
                emb_decoder_inputs, self._dec_in_state, self._enc_top_states,
                cell, num_heads=1, loop_function=loop_function,
                initial_state_attention=initial_state_attention)

        with tf.variable_scope('output'), tf.device(self._next_device()):
            model_outputs = []
            for i in xrange(len(decoder_outputs)):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                model_outputs.append(tf.nn.xw_plus_b(decoder_outputs[i], w, v))

        if hyper_params.mode == 'decode':
            with tf.variable_scope('decode_output'), tf.device('/cpu:0'):
                # tf.arg_max is deprecated; use tf.argmax.
                best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                tf.logging.info('best_outputs%s', best_outputs[0].get_shape())
                self._outputs = tf.concat(
                    axis=1,
                    values=[tf.reshape(x, [hyper_params.batch_size, 1])
                            for x in best_outputs])
                self._topk_log_probs, self._topk_ids = tf.nn.top_k(
                    tf.log(tf.nn.softmax(model_outputs[-1])),
                    hyper_params.batch_size * 2)

        with tf.variable_scope('loss'), tf.device(self._next_device()):

            def sample_loss_func(inputs, labels):
                with tf.device('/cpu:0'):  # TODO: try gpu.
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        weights=w_t, biases=v, labels=labels, inputs=inputs,
                        num_sampled=hyper_params.num_softmax_samples,
                        num_classes=vocab_size)

            if hyper_params.num_softmax_samples != 0 and hyper_params.mode == 'train':
                self._loss = seq2seq_lib.sampled_sequence_loss(
                    decoder_outputs, targets, loss_weights, sample_loss_func)
            else:
                self._loss = tf.contrib.legacy_seq2seq.sequence_loss(
                    model_outputs, targets, loss_weights)
            tf.summary.scalar('loss', tf.minimum(12.0, self._loss))
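# Every variant places subgraphs with self._next_device(), which is defined
# elsewhere in the model class. For reference, a minimal round-robin sketch
# in the spirit of the textsum original (assumed, not this repo's exact code):
def _next_device_sketch(self):
    """Round-robin over the available GPUs; return '' when there are none."""
    if self._num_gpus == 0:
        return ''
    dev = '/gpu:%d' % self._cur_gpu
    self._cur_gpu = (self._cur_gpu + 1) % self._num_gpus
    return dev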
def _add_seq2seq(self):
    hps = self._hps
    vsize = self._vocab.NumIds()

    with tf.variable_scope('seq2seq'):
        encoder_inputs = tf.unpack(tf.transpose(self._articles))
        decoder_inputs = tf.unpack(tf.transpose(self._abstracts))
        targets = tf.unpack(tf.transpose(self._targets))
        loss_weights = tf.unpack(tf.transpose(self._loss_weights))
        article_lens = self._article_lens

        emb_encoder_inputs = None
        emb_decoder_inputs = None
        # with tf.variable_scope('Embedding'), tf.device('/gpu:0'):
        # Embedding shared by the input and outputs.
        if FLAGS.word2vec is None:
            embedding = tf.get_variable(
                'embedding', [vsize, hps.emb_dim], dtype=tf.float32,
                trainable=False,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            emb_encoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in encoder_inputs
            ]
            emb_decoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in decoder_inputs
            ]
        else:
            # Pre-trained embedding shared by the input and outputs.
            emb_encoder_inputs = [
                tf.nn.embedding_lookup(self._embedding, x)
                for x in encoder_inputs
            ]
            emb_decoder_inputs = [
                tf.nn.embedding_lookup(self._embedding, x)
                for x in decoder_inputs
            ]

        if FLAGS.svd_dim:
            # SVD factorization: keep only the top svd_dim singular values
            # of the embedded encoder matrix.
            svd_dim = FLAGS.svd_dim
            emb_encoder_matrix = tf.pack(
                [tf.transpose(x) for x in emb_encoder_inputs])
            emb_encoder_matrix = tf.transpose(emb_encoder_matrix)
            s, u, v = tf.svd(emb_encoder_matrix, compute_uv=True)
            b = [tf.gather(x, range(svd_dim)) for x in tf.unpack(s)]
            b = tf.pack(b)
            # svd_dim singular values; hps.emb_dim embedding columns.
            c = [
                tf.slice(tf.transpose(x), [0, 0], [svd_dim, hps.emb_dim])
                for x in tf.unpack(u)
            ]
            c = tf.pack(c)
            d = [tf.diag(x) for x in tf.unpack(b)]
            d = tf.pack(d)
            e = tf.batch_matmul(d, c)
            # NOTE: this overwrites the *decoder* inputs with the compressed
            # encoder matrix; emb_encoder_inputs may have been intended.
            emb_decoder_inputs = tf.unpack(tf.transpose(e, perm=[1, 0, 2]))

        for layer_i in xrange(hps.enc_layers):
            with tf.variable_scope('encoder%d' % layer_i), tf.device(
                    self._next_device()):
                # Bidirectional RNN cells with dropout.
                cell_fw = tf.nn.rnn_cell.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123),
                    state_is_tuple=False)
                cell_bw = tf.nn.rnn_cell.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                    state_is_tuple=False)
                cell_fw = tf.nn.rnn_cell.DropoutWrapper(
                    cell_fw,
                    input_keep_prob=hps.input_dropout,
                    output_keep_prob=hps.output_dropout)
                cell_bw = tf.nn.rnn_cell.DropoutWrapper(
                    cell_bw,
                    input_keep_prob=hps.input_dropout,
                    output_keep_prob=hps.output_dropout)
                (emb_encoder_inputs, fw_state, _) = tf.nn.bidirectional_rnn(
                    cell_fw, cell_bw, emb_encoder_inputs, dtype=tf.float32,
                    sequence_length=article_lens)
        encoder_outputs = emb_encoder_inputs
        print('fw_state: %s' % str(fw_state))

        with tf.variable_scope('output_projection'):
            w = tf.get_variable(
                'w', [hps.num_hidden, vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            w_t = tf.transpose(w)
            v = tf.get_variable(
                'v', [vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        with tf.variable_scope('decoder'), tf.device(self._next_device()):
            # When decoding, use model output from the previous step
            # for the next step.
            loop_function = None
            if hps.mode == 'decode':
                if FLAGS.word2vec is None:
                    loop_function = _extract_argmax_and_embed(
                        embedding, (w, v), update_embedding=False)
                else:
                    loop_function = _extract_argmax_and_embed(
                        self._embedding, (w, v), update_embedding=False)

            cell = tf.nn.rnn_cell.LSTMCell(
                hps.num_hidden,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                state_is_tuple=False)
            cell = tf.nn.rnn_cell.DropoutWrapper(
                cell,
                input_keep_prob=hps.input_dropout,
                output_keep_prob=hps.output_dropout)

            encoder_outputs = [
                tf.reshape(x, [hps.batch_size, 1, 2 * hps.num_hidden])
                for x in encoder_outputs
            ]
            self._enc_top_states = tf.concat(1, encoder_outputs)
            self._dec_in_state = fw_state
            # During decoding, follow-up _dec_in_state values are fed from
            # beam_search; dec_out_state is stored by beam_search for the
            # next step's feeding.
            initial_state_attention = (hps.mode == 'decode')
            decoder_outputs, self._dec_out_state = tf.nn.seq2seq.attention_decoder(
                emb_decoder_inputs, self._dec_in_state, self._enc_top_states,
                cell, num_heads=1, loop_function=loop_function,
                initial_state_attention=initial_state_attention)
            print('====emb_decoder_inputs: %s' % emb_decoder_inputs)
            print('====self._dec_in_state: %s' % self._dec_in_state)
            print('====self._enc_top_states: %s' % self._enc_top_states)
            print('====decoder_outputs: %s' % decoder_outputs)
            print('====self._dec_out_state: %s' % self._dec_out_state)

        with tf.variable_scope('output'), tf.device(self._next_device()):
            model_outputs = []
            for i in xrange(len(decoder_outputs)):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                model_outputs.append(
                    tf.nn.xw_plus_b(decoder_outputs[i], w, v))

        if hps.mode == 'decode':
            with tf.variable_scope('decode_output'), tf.device('/gpu:0'):
                best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                tf.logging.info('best_outputs%s', best_outputs[0].get_shape())
                self._outputs = tf.concat(1, [
                    tf.reshape(x, [hps.batch_size, 1]) for x in best_outputs
                ])
                self._topk_log_probs, self._topk_ids = tf.nn.top_k(
                    tf.log(tf.nn.softmax(model_outputs[-1])),
                    hps.batch_size * 2)

        with tf.variable_scope('loss'), tf.device(self._next_device()):

            def sampled_loss_func(inputs, labels):
                with tf.device('/gpu:0'):  # Try gpu.
                    labels = tf.reshape(labels, [-1, 1])
                    tf.logging.info('num_sampled%s', hps.num_softmax_samples)
                    return tf.nn.sampled_softmax_loss(
                        w_t, v, inputs, labels, hps.num_softmax_samples,
                        vsize)

            if hps.num_softmax_samples != 0 and hps.mode == 'train':
                self._loss = seq2seq_lib.sampled_sequence_loss(
                    decoder_outputs, targets, loss_weights,
                    sampled_loss_func)
            else:
                self._loss = tf.nn.seq2seq.sequence_loss(
                    model_outputs, targets, loss_weights)
            tf.scalar_summary('loss', tf.minimum(12.0, self._loss))
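# The SVD branch above keeps, per batch item, the top svd_dim singular values
# of X = U * diag(S) * V^T and rebuilds diag(S_k) * U_k^T as a compressed
# input. A compact single-matrix equivalent (illustrative only):
def _lowrank_sketch(x, k):
    """Return the rank-k compression diag(s[:k]) @ u^T[:k, :] of matrix x."""
    s, u, v = tf.svd(x, compute_uv=True)
    return tf.matmul(tf.diag(s[:k]), tf.transpose(u)[:k, :])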
def _add_seq2seq(self):
    hps = self._hps
    vsize = self._vocab.NumIds()

    with tf.variable_scope('seq2seq'):
        # Xinchun: add an encoder for the stock-price sequence.
        encoderPrice_inputs = tf.unstack(tf.transpose(self._anomPrices))
        pricelist_lens = self._pricelist_lens

        encoder_inputs = tf.unstack(tf.transpose(self._articles))
        decoder_inputs = tf.unstack(tf.transpose(self._abstracts))
        targets = tf.unstack(tf.transpose(self._targets))
        loss_weights = tf.unstack(tf.transpose(self._loss_weights))
        article_lens = self._article_lens

        # Embedding shared by the input and outputs.
        with tf.variable_scope('embedding'), tf.device('/cpu:0'):
            embedding = tf.get_variable(
                'embedding', [vsize, hps.emb_dim], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            emb_encoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in encoder_inputs
            ]
            emb_decoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in decoder_inputs
            ]
            # Xinchun: added the price-encoder embedding. NOTE: the original
            # declared this with shape [vsize, 0], which yields empty
            # vectors; hps.emb_dim is assumed here instead.
            embeddingPrice = tf.get_variable(
                'embeddingPrice', [vsize, hps.emb_dim], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            emb_encoderPrice_inputs = [
                tf.nn.embedding_lookup(embeddingPrice, x)
                for x in encoderPrice_inputs
            ]

        for layer_i in xrange(hps.enc_layers):
            with tf.variable_scope('encoder%d' % layer_i), tf.device(
                    self._next_device()):
                cell_fw = tf.contrib.rnn.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123),
                    state_is_tuple=False)
                cell_bw = tf.contrib.rnn.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                    state_is_tuple=False)
                (emb_encoder_inputs, fw_state, _) = tf.contrib.rnn.static_bidirectional_rnn(
                    cell_fw, cell_bw, emb_encoder_inputs, dtype=tf.float32,
                    sequence_length=article_lens)
        encoder_outputs = emb_encoder_inputs

        # Xinchun: add another encoder for anomPrice.
        for layer_i in xrange(hps.enc_layers):
            with tf.variable_scope('encoderPrice%d' % layer_i), tf.device(
                    self._next_device()):
                cellPrice_fw = tf.contrib.rnn.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123),
                    state_is_tuple=False)
                cellPrice_bw = tf.contrib.rnn.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                    state_is_tuple=False)
                (emb_encoderPrice_inputs, fwPrice_state, _) = tf.contrib.rnn.static_bidirectional_rnn(
                    cellPrice_fw, cellPrice_bw, emb_encoderPrice_inputs,
                    dtype=tf.float32, sequence_length=pricelist_lens)
        encoderPrice_outputs = emb_encoderPrice_inputs

        with tf.variable_scope('output_projection'):
            w = tf.get_variable(
                'w', [hps.num_hidden, vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            w_t = tf.transpose(w)
            v = tf.get_variable(
                'v', [vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        with tf.variable_scope('decoder'), tf.device(self._next_device()):
            # When decoding, use model output from the previous step
            # for the next step.
            loop_function = None
            if hps.mode == 'decode':
                loop_function = _extract_argmax_and_embed(
                    embedding, (w, v), update_embedding=False)

            cell = tf.contrib.rnn.LSTMCell(
                hps.num_hidden,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                state_is_tuple=False)

            encoder_outputs = [
                tf.reshape(x, [hps.batch_size, 1, 2 * hps.num_hidden])
                for x in encoder_outputs
            ]
            # Xinchun: added encoderPrice_outputs.
            encoderPrice_outputs = [
                tf.reshape(x, [hps.batch_size, 1, 2 * hps.num_hidden])
                for x in encoderPrice_outputs
            ]

            # Jenkai: modified the shapes of self._enc_top_states
            # (now (4, 240, 512)) and self._dec_in_state (now (4, 1024));
            # correctness still needs to be verified.
            # Xinchun: modified _enc_top_states and _dec_in_state to fuse the
            # two encoders: concatenate their outputs along the time axis and
            # sum their final forward states elementwise.
            selfConcatEncoder1 = tf.concat(encoder_outputs, 1)
            selfConcatEncoder2 = tf.concat(encoderPrice_outputs, 1)
            self._enc_top_states = tf.concat(
                [selfConcatEncoder1, selfConcatEncoder2], 1)
            self._dec_in_state = tf.add(fw_state, fwPrice_state)
            # Single-encoder original, for reference:
            # self._enc_top_states = tf.concat(axis=1, values=encoder_outputs)
            # self._dec_in_state = fw_state

            # During decoding, follow-up _dec_in_state values are fed from
            # beam_search; dec_out_state is stored by beam_search for the
            # next step's feeding.
            initial_state_attention = (hps.mode == 'decode')
            decoder_outputs, self._dec_out_state = tf.contrib.legacy_seq2seq.attention_decoder(
                emb_decoder_inputs, self._dec_in_state, self._enc_top_states,
                cell, num_heads=1, loop_function=loop_function,
                initial_state_attention=initial_state_attention)

        with tf.variable_scope('output'), tf.device(self._next_device()):
            model_outputs = []
            for i in xrange(len(decoder_outputs)):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                model_outputs.append(
                    tf.nn.xw_plus_b(decoder_outputs[i], w, v))

        if hps.mode == 'decode':
            with tf.variable_scope('decode_output'), tf.device('/cpu:0'):
                best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                tf.logging.info('best_outputs%s', best_outputs[0].get_shape())
                self._outputs = tf.concat(
                    axis=1,
                    values=[tf.reshape(x, [hps.batch_size, 1])
                            for x in best_outputs])
                self._topk_log_probs, self._topk_ids = tf.nn.top_k(
                    tf.log(tf.nn.softmax(model_outputs[-1])),
                    hps.batch_size * 2)

        with tf.variable_scope('loss'), tf.device(self._next_device()):

            def sampled_loss_func(inputs, labels):
                with tf.device('/cpu:0'):  # Try gpu.
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        weights=w_t, biases=v, labels=labels, inputs=inputs,
                        num_sampled=hps.num_softmax_samples,
                        num_classes=vsize)

            if hps.num_softmax_samples != 0 and hps.mode == 'train':
                self._loss = seq2seq_lib.sampled_sequence_loss(
                    decoder_outputs, targets, loss_weights,
                    sampled_loss_func)
            else:
                self._loss = tf.contrib.legacy_seq2seq.sequence_loss(
                    model_outputs, targets, loss_weights)
            tf.summary.scalar('loss', tf.minimum(12.0, self._loss))
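# A minimal sketch of the two-encoder fusion used above, factored out for
# clarity (names are illustrative): both encoders must agree on batch size
# and hidden size for the time-axis concat and the elementwise state sum to
# be valid, which holds here because both use hps.num_hidden with
# state_is_tuple=False.
def _fuse_encoders_sketch(text_outputs, price_outputs, text_state, price_state):
    # [batch, T_text + T_price, 2*num_hidden]
    enc_top_states = tf.concat(
        [tf.concat(text_outputs, 1), tf.concat(price_outputs, 1)], 1)
    # Elementwise sum of the two final forward states; shapes must match.
    dec_in_state = tf.add(text_state, price_state)
    return enc_top_states, dec_in_state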
def _add_seq2seq(self):
    hps = self._hps
    vsize = self._vocab.NumIds()

    with tf.variable_scope('seq2seq'):
        encoder_inputs = tf.unstack(tf.transpose(self._articles))
        decoder_inputs = tf.unstack(tf.transpose(self._abstracts))
        targets = tf.unstack(tf.transpose(self._targets))
        loss_weights = tf.unstack(tf.transpose(self._loss_weights))
        article_lens = self._article_lens

        # Embedding shared by the input and outputs.
        with tf.variable_scope('embedding'), tf.device('/cpu:0'):
            embedding = tf.get_variable(
                'embedding', [vsize, hps.emb_dim], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            # Each element of emb_encoder_inputs has shape
            # [batch_size, emb_dim].
            emb_encoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in encoder_inputs
            ]
            emb_decoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in decoder_inputs
            ]

        for layer_i in xrange(hps.enc_layers):
            with tf.variable_scope('encoder%d' % layer_i), tf.device(
                    self._next_device()):
                cell_fw = tf.nn.rnn_cell.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123),
                    state_is_tuple=True)
                cell_bw = tf.nn.rnn_cell.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                    state_is_tuple=True)
                # After the bidirectional RNN, each element of
                # emb_encoder_inputs has shape [batch_size, 2*num_hidden].
                (emb_encoder_inputs, fw_state, _) = tf.nn.static_bidirectional_rnn(
                    cell_fw, cell_bw, emb_encoder_inputs, dtype=tf.float32,
                    sequence_length=article_lens)
        encoder_outputs = emb_encoder_inputs

        with tf.variable_scope('output_projection'):
            w = tf.get_variable(
                'w', [hps.num_hidden, vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            w_t = tf.transpose(w)
            v = tf.get_variable(
                'v', [vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        with tf.variable_scope('decoder'), tf.device(self._next_device()):
            # When decoding, use model output from the previous step
            # for the next step.
            loop_function = None
            if hps.mode == 'decode':
                loop_function = _extract_argmax_and_embed(
                    embedding, (w, v), update_embedding=False)

            cell = tf.nn.rnn_cell.LSTMCell(
                hps.num_hidden,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                state_is_tuple=True)

            # Expand each output from 2-D to 3-D.
            encoder_outputs = [
                tf.reshape(x, [hps.batch_size, 1, 2 * hps.num_hidden])
                for x in encoder_outputs
            ]
            # _enc_top_states has shape
            # [batch_size, len(encoder_outputs), 2*hps.num_hidden].
            self._enc_top_states = tf.concat(axis=1, values=encoder_outputs)
            self._dec_in_state = fw_state
            # During decoding, follow-up _dec_in_state values are fed from
            # beam_search; dec_out_state is stored by beam_search for the
            # next step's feeding.
            initial_state_attention = (hps.mode == 'decode')
            # decoder_outputs is a list whose elements have shape
            # [batch_size, output_size]; since output_size is not specified
            # it defaults to cell.output_size, i.e. hps.num_hidden.
            decoder_outputs, self._dec_out_state = tf.contrib.legacy_seq2seq.attention_decoder(
                emb_decoder_inputs, self._dec_in_state, self._enc_top_states,
                cell, num_heads=1, loop_function=loop_function,
                initial_state_attention=initial_state_attention)

        with tf.variable_scope('output'), tf.device(self._next_device()):
            model_outputs = []
            for i in xrange(len(decoder_outputs)):
                # tf.get_variable_scope() returns only the variable_scope,
                # independent of any name_scope, so reuse_variables() here
                # is unaffected by name scopes.
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                model_outputs.append(
                    tf.nn.xw_plus_b(decoder_outputs[i], w, v))

        if hps.mode == 'decode':
            with tf.variable_scope('decode_output'), tf.device('/cpu:0'):
                best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                tf.logging.info('best_outputs%s', best_outputs[0].get_shape())
                self._outputs = tf.concat(
                    axis=1,
                    values=[tf.reshape(x, [hps.batch_size, 1])
                            for x in best_outputs])
                # Take the last element, model_outputs[-1]: at test time,
                # beam_search expands using the top-k of the last attention
                # output at each step.
                self._topk_log_probs, self._topk_ids = tf.nn.top_k(
                    tf.log(tf.nn.softmax(model_outputs[-1])),
                    hps.batch_size * 2)

        with tf.variable_scope('loss'), tf.device(self._next_device()):

            def sampled_loss_func(inputs, labels):
                with tf.device('/cpu:0'):  # Try gpu.
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        weights=w_t, biases=v, labels=labels, inputs=inputs,
                        num_sampled=hps.num_softmax_samples,
                        num_classes=vsize)

            if hps.num_softmax_samples != 0 and hps.mode == 'train':
                self._loss = seq2seq_lib.sampled_sequence_loss(
                    decoder_outputs, targets, loss_weights,
                    sampled_loss_func)
            else:
                self._loss = tf.contrib.legacy_seq2seq.sequence_loss(
                    model_outputs, targets, loss_weights)
            tf.summary.scalar('loss', tf.minimum(12.0, self._loss))
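# A minimal, standalone illustration of the reuse_variables() pattern used in
# the output loop above (illustrative only):
def _reuse_demo():
    with tf.variable_scope('proj') as scope:
        for i in range(3):
            if i > 0:
                scope.reuse_variables()
            # Every iteration now resolves to the same variable 'proj/w',
            # regardless of any surrounding name_scope.
            w = tf.get_variable('w', [4, 4])
    return w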
def _add_seq2seq(self):
    hps = self._hps
    vsize = self._vocab.NumIds()

    with tf.variable_scope('seq2seq'):
        encoder_inputs = tf.unpack(tf.transpose(self._articles))
        decoder_inputs = tf.unpack(tf.transpose(self._abstracts))
        targets = tf.unpack(tf.transpose(self._targets))
        loss_weights = tf.unpack(tf.transpose(self._loss_weights))
        article_lens = self._article_lens

        # Embedding shared by the input and outputs.
        with tf.variable_scope('embedding'), tf.device('/cpu:0'):
            embedding = tf.get_variable(
                'embedding', [vsize, hps.emb_dim], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            emb_encoder_inputs = [tf.nn.embedding_lookup(embedding, x)
                                  for x in encoder_inputs]
            emb_decoder_inputs = [tf.nn.embedding_lookup(embedding, x)
                                  for x in decoder_inputs]

        for layer_i in xrange(hps.enc_layers):
            with tf.variable_scope('encoder%d' % layer_i), tf.device(
                    self._next_device()):
                cell_fw = tf.nn.rnn_cell.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123))
                cell_bw = tf.nn.rnn_cell.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113))
                (emb_encoder_inputs, fw_state, _) = tf.nn.bidirectional_rnn(
                    cell_fw, cell_bw, emb_encoder_inputs, dtype=tf.float32,
                    sequence_length=article_lens)
        encoder_outputs = emb_encoder_inputs

        with tf.variable_scope('output_projection'):
            w = tf.get_variable(
                'w', [hps.num_hidden, vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            w_t = tf.transpose(w)
            v = tf.get_variable(
                'v', [vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        with tf.variable_scope('decoder'), tf.device(self._next_device()):
            # When decoding, use model output from the previous step
            # for the next step.
            loop_function = None
            if hps.mode == 'decode':
                loop_function = _extract_argmax_and_embed(
                    embedding, (w, v), update_embedding=False)

            cell = tf.nn.rnn_cell.LSTMCell(
                hps.num_hidden,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113))

            encoder_outputs = [tf.reshape(x, [hps.batch_size, 1, 2 * hps.num_hidden])
                               for x in encoder_outputs]
            self._enc_top_states = tf.concat(1, encoder_outputs)
            self._dec_in_state = fw_state
            # During decoding, follow-up _dec_in_state values are fed from
            # beam_search; dec_out_state is stored by beam_search for the
            # next step's feeding.
            initial_state_attention = (hps.mode == 'decode')
            decoder_outputs, self._dec_out_state = tf.nn.seq2seq.attention_decoder(
                emb_decoder_inputs, self._dec_in_state, self._enc_top_states,
                cell, num_heads=1, loop_function=loop_function,
                initial_state_attention=initial_state_attention)

        with tf.variable_scope('output'), tf.device(self._next_device()):
            model_outputs = []
            for i in xrange(len(decoder_outputs)):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                model_outputs.append(
                    tf.nn.xw_plus_b(decoder_outputs[i], w, v))

        if hps.mode == 'decode':
            with tf.variable_scope('decode_output'), tf.device('/cpu:0'):
                best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                tf.logging.info('best_outputs%s', best_outputs[0].get_shape())
                self._outputs = tf.concat(
                    1, [tf.reshape(x, [hps.batch_size, 1]) for x in best_outputs])
                self._topk_log_probs, self._topk_ids = tf.nn.top_k(
                    tf.log(tf.nn.softmax(model_outputs[-1])),
                    hps.batch_size * 2)

        with tf.variable_scope('loss'), tf.device(self._next_device()):

            def sampled_loss_func(inputs, labels):
                with tf.device('/cpu:0'):  # Try gpu.
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        w_t, v, inputs, labels, hps.num_softmax_samples,
                        vsize)

            if hps.num_softmax_samples != 0 and hps.mode == 'train':
                self._loss = seq2seq_lib.sampled_sequence_loss(
                    decoder_outputs, targets, loss_weights,
                    sampled_loss_func)
            else:
                self._loss = tf.nn.seq2seq.sequence_loss(
                    model_outputs, targets, loss_weights)
            tf.scalar_summary('loss', tf.minimum(12.0, self._loss))
def _add_seq2seq(self):
    hps = self._hps
    vsize = self._vocab.NumIds()

    with tf.variable_scope('seq2seq'):
        # tf.transpose transposes; tf.stack() joins tensors while
        # tf.unstack() splits one apart (by rows by default). self._articles
        # has shape [hps.batch_size, hps.enc_timesteps], so after the
        # transpose-and-unstack, each element of encoder_inputs holds one
        # timestep for the whole batch (the first element is the first word
        # of every article): 120 tensors of shape [64].
        encoder_inputs = tf.unstack(tf.transpose(self._articles))
        decoder_inputs = tf.unstack(tf.transpose(self._abstracts))
        targets = tf.unstack(tf.transpose(self._targets))
        loss_weights = tf.unstack(tf.transpose(self._loss_weights))
        article_lens = self._article_lens

        # Embedding shared by the input and outputs: expand each word id
        # into its embedding vector.
        with tf.variable_scope('embedding'), tf.device('/cpu:0'):
            embedding = tf.get_variable(
                'embedding', [vsize, hps.emb_dim], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            # emb_encoder_inputs becomes 120 tensors of shape [64, 128].
            emb_encoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in encoder_inputs
            ]
            emb_decoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in decoder_inputs
            ]

        # Deep (stacked) bidirectional RNN.
        for layer_i in xrange(hps.enc_layers):
            with tf.variable_scope('encoder%d' % layer_i), tf.device(
                    self._next_device()):
                # Forward cell.
                cell_fw = tf.contrib.rnn.LSTMCell(
                    hps.num_hidden,  # 256
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123),
                    state_is_tuple=False)
                # Backward cell.
                cell_bw = tf.contrib.rnn.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                    state_is_tuple=False)
                # static_bidirectional_rnn runs a bidirectional RNN so that
                # each position can use both past and future context. The
                # forward and backward nodes at the same timestep are
                # separate; their outputs are concatenated into one result.
                # cell_fw is the forward cell and cell_bw the backward cell.
                # The outputs overwrite emb_encoder_inputs and feed the next
                # layer, while fw_state records the forward RNN's final
                # state. Each of the 120 output steps has shape [64, 512]
                # (forward and backward 256-dim outputs concatenated). There
                # are hps.enc_layers (4) stacked bidirectional layers.
                (emb_encoder_inputs, fw_state, _) = tf.contrib.rnn.static_bidirectional_rnn(
                    cell_fw, cell_bw, emb_encoder_inputs, dtype=tf.float32,
                    sequence_length=article_lens)
        encoder_outputs = emb_encoder_inputs

        with tf.variable_scope('output_projection'):
            # num_hidden is 256; vsize is the vocabulary size.
            w = tf.get_variable(
                'w', [hps.num_hidden, vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            # After transposing, each row of w_t corresponds to one
            # vocabulary entry's projection weights.
            w_t = tf.transpose(w)
            v = tf.get_variable(
                'v', [vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        with tf.variable_scope('decoder'), tf.device(self._next_device()):
            # When decoding, use model output from the previous step
            # for the next step.
            loop_function = None
            if hps.mode == 'decode':
                loop_function = _extract_argmax_and_embed(
                    embedding, (w, v), update_embedding=False)

            cell = tf.contrib.rnn.LSTMCell(
                hps.num_hidden,  # 256
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                state_is_tuple=False)

            # encoder_outputs holds 120 tensors of [64, 512]; reshape each
            # to [64, 1, 512] so they can be concatenated along axis 1.
            encoder_outputs = [
                tf.reshape(x, [hps.batch_size, 1, 2 * hps.num_hidden])
                for x in encoder_outputs
            ]
            # Concatenate encoder_outputs into [64, 120, 512].
            # _enc_top_states feeds the attention in the decode stage;
            # _dec_in_state is the decoder LSTM's initial state, taken from
            # fw_state, the forward RNN's final state.
            self._enc_top_states = tf.concat(axis=1, values=encoder_outputs)
            self._dec_in_state = fw_state
            # During decoding, follow-up _dec_in_state values are fed from
            # beam_search; dec_out_state is stored by beam_search for the
            # next step's feeding.
            initial_state_attention = (hps.mode == 'decode')
            # emb_decoder_inputs is the embedded decoder input (120 tensors
            # of [64, 128]); _dec_in_state is the forward RNN's final state;
            # _enc_top_states is the full set of encoder outputs.
            # decoder_outputs are the decode-stage outputs and
            # _dec_out_state is the decode stage's final LSTM state.
            decoder_outputs, self._dec_out_state = tf.contrib.legacy_seq2seq.attention_decoder(
                emb_decoder_inputs, self._dec_in_state, self._enc_top_states,
                cell, num_heads=1, loop_function=loop_function,
                initial_state_attention=initial_state_attention)

        with tf.variable_scope('output'), tf.device(self._next_device()):
            model_outputs = []
            for i in xrange(len(decoder_outputs)):
                # tf.get_variable() checks whether a variable already
                # exists; to share it across iterations you must opt in
                # explicitly via reuse_variables(), as here.
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                # w is [num_hidden, vsize] (num_hidden 256, vsize the
                # vocabulary size), so each of the 120 model_outputs has
                # shape [64, vsize].
                model_outputs.append(
                    tf.nn.xw_plus_b(decoder_outputs[i], w, v))

        if hps.mode == 'decode':
            with tf.variable_scope('decode_output'), tf.device('/cpu:0'):
                # Pick the word out of the vsize-way distribution:
                # model_outputs is 120 tensors of [64, vsize] and
                # best_outputs is 120 tensors of [64]; the argmax is taken
                # as the best output.
                best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                tf.logging.info('best_outputs%s', best_outputs[0].get_shape())
                # self._outputs is [64, 120].
                self._outputs = tf.concat(
                    axis=1,
                    values=[tf.reshape(x, [hps.batch_size, 1])
                            for x in best_outputs])
                # Take the last output's distribution ([64, vsize]).
                # tf.nn.top_k(input, k) returns the k largest entries of
                # each row together with their indices; beam_search uses
                # this top-k of the last attention output at every step.
                self._topk_log_probs, self._topk_ids = tf.nn.top_k(
                    tf.log(tf.nn.softmax(model_outputs[-1])),
                    hps.batch_size * 2)

        with tf.variable_scope('loss'), tf.device(self._next_device()):

            def sampled_loss_func(inputs, labels):
                with tf.device('/cpu:0'):  # Try gpu.
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        weights=w_t, biases=v, labels=labels, inputs=inputs,
                        num_sampled=hps.num_softmax_samples,
                        num_classes=vsize)

            # targets and loss_weights are both
            # [hps.batch_size, hps.dec_timesteps]; model_outputs is a list
            # of [batch_size, vsize] logits.
            if hps.num_softmax_samples != 0 and hps.mode == 'train':
                self._loss = seq2seq_lib.sampled_sequence_loss(
                    decoder_outputs, targets, loss_weights,
                    sampled_loss_func)
            else:
                self._loss = tf.contrib.legacy_seq2seq.sequence_loss(
                    model_outputs, targets, loss_weights)
            tf.summary.scalar('loss', tf.minimum(12.0, self._loss))
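# A standalone toy check of the shape bookkeeping described in the comments
# above (illustrative only; run in a fresh graph):
def _toy_shape_check():
    batch_size, enc_timesteps, emb_dim = 64, 120, 128
    ids = tf.zeros([batch_size, enc_timesteps], dtype=tf.int32)
    steps = tf.unstack(tf.transpose(ids))      # 120 tensors of shape [64]
    toy_emb = tf.zeros([1000, emb_dim])
    emb_steps = [tf.nn.embedding_lookup(toy_emb, x) for x in steps]
    print(emb_steps[0].get_shape())            # (64, 128)
    print(len(emb_steps))                      # 120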
def _add_seq2seq(self):
    hps = self._hps
    vsize = self._vocab.NumIds()

    with tf.variable_scope('seq2seq'):
        # Unstack the articles, abstracts, targets, etc. into lists of
        # length time_steps.
        encoder_inputs = tf.unstack(tf.transpose(self._articles))
        decoder_inputs = tf.unstack(tf.transpose(self._abstracts))
        targets = tf.unstack(tf.transpose(self._targets))
        loss_weights = tf.unstack(tf.transpose(self._loss_weights))
        article_lens = self._article_lens

        # Embedding shared by the input and outputs: embeds words for both
        # the encoder and the decoder.
        with tf.variable_scope('embedding'), tf.device('/cpu:0'):
            embedding = tf.get_variable(
                'embedding', [vsize, hps.emb_dim], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            emb_encoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in encoder_inputs
            ]
            emb_decoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in decoder_inputs
            ]

        # Stack n bidirectional LSTM layers for the encoder.
        for layer_i in xrange(hps.enc_layers):
            with tf.variable_scope('encoder%d' % layer_i), tf.device(
                    self._next_device()):
                cell_fw = tf.contrib.rnn.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123),
                    state_is_tuple=False)
                cell_bw = tf.contrib.rnn.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                    state_is_tuple=False)
                (emb_encoder_inputs, fw_state, _) = tf.contrib.rnn.static_bidirectional_rnn(
                    cell_fw, cell_bw, emb_encoder_inputs, dtype=tf.float32,
                    sequence_length=article_lens)
        encoder_outputs = emb_encoder_inputs

        # Define a projection from the hidden state to the vocabulary
        # (w = [num_hidden, vocab_size], bias = v).
        with tf.variable_scope('output_projection'):
            w = tf.get_variable(
                'w', [hps.num_hidden, vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            w_t = tf.transpose(w)
            v = tf.get_variable(
                'v', [vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        with tf.variable_scope('decoder'), tf.device(self._next_device()):
            # When decoding, use model output from the previous step for the
            # next step. In training, just use the direct inputs.
            loop_function = None
            if hps.mode == 'decode':
                loop_function = _extract_argmax_and_embed(
                    embedding, (w, v), update_embedding=False)

            cell = tf.contrib.rnn.LSTMCell(
                hps.num_hidden,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                state_is_tuple=False)

            # Reshape encoder_outputs: the outputs are a list of
            # [batch_size, 2*num_hidden] tensors (2*num_hidden because the
            # bidirectional RNN concatenates both directions). We want to
            # convert the list into a single tensor whose second dimension
            # is time_steps, so first add a new axis:
            # [batch_size, 1, 2*num_hidden].
            encoder_outputs = [
                tf.reshape(x, [hps.batch_size, 1, 2 * hps.num_hidden])
                for x in encoder_outputs
            ]
            # Then concat all the time_steps along that axis:
            # shape [batch_size, time_steps, 2*num_hidden].
            self._enc_top_states = tf.concat(axis=1, values=encoder_outputs)
            # The last step of the forward RNN initializes the decoder.
            self._dec_in_state = fw_state
            # During decoding, follow-up _dec_in_state values are fed from
            # beam_search; dec_out_state is stored by beam_search for the
            # next step's feeding.
            initial_state_attention = (hps.mode == 'decode')
            # During decoding, the RNN can look up information in the
            # additional attention_states tensor. decoder_outputs is a list
            # of tensors of shape [batch_size, output_size].
            # TODO: check how attention_decoder actually works.
            decoder_outputs, self._dec_out_state = tf.contrib.legacy_seq2seq.attention_decoder(
                # A list of 2-D tensors [batch_size, embedding_size].
                decoder_inputs=emb_decoder_inputs,
                # 2-D tensor [batch_size, cell.state_size].
                initial_state=self._dec_in_state,
                # 3-D tensor [batch_size, attn_length, attn_size];
                # attn_length here is time_steps and attn_size is the RNN
                # output size (2*num_hidden).
                attention_states=self._enc_top_states,
                cell=cell,
                # Number of attention heads that read from attention_states.
                num_heads=1,
                # Applied to the i-th output to generate the (i+1)-th input;
                # decoder_inputs are then ignored except for the first
                # element (the GO symbol). It can also be used in training
                # to emulate scheduled sampling.
                loop_function=loop_function,
                initial_state_attention=initial_state_attention)

        # Get the decoder output and project it onto the vocabulary
        # (output = w * decoder_output + v).
        with tf.variable_scope('output'), tf.device(self._next_device()):
            model_outputs = []
            for i in xrange(len(decoder_outputs)):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                model_outputs.append(
                    tf.nn.xw_plus_b(decoder_outputs[i], w, v))

        if hps.mode == 'decode':
            with tf.variable_scope('decode_output'), tf.device('/cpu:0'):
                # Get the most probable word over the vocabulary.
                best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                tf.logging.info('best_outputs%s', best_outputs[0].get_shape())
                self._outputs = tf.concat(
                    axis=1,
                    values=[tf.reshape(x, [hps.batch_size, 1])
                            for x in best_outputs])
                self._topk_log_probs, self._topk_ids = tf.nn.top_k(
                    tf.log(tf.nn.softmax(model_outputs[-1])),
                    hps.batch_size * 2)

        # Define the loss, using a sampled loss instead of the full softmax.
        with tf.variable_scope('loss'), tf.device(self._next_device()):

            def sampled_loss_func(inputs, labels):
                with tf.device('/cpu:0'):  # Try gpu.
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        weights=w_t, biases=v, labels=labels, inputs=inputs,
                        num_sampled=hps.num_softmax_samples,
                        num_classes=vsize)

            if hps.num_softmax_samples != 0 and hps.mode == 'train':
                self._loss = seq2seq_lib.sampled_sequence_loss(
                    decoder_outputs, targets, loss_weights,
                    sampled_loss_func)
            else:
                self._loss = tf.contrib.legacy_seq2seq.sequence_loss(
                    model_outputs, targets, loss_weights)
            tf.summary.scalar('loss', tf.minimum(12.0, self._loss))
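# For context, a sketch of how the decode-time tensors above are driven from
# beam search in textsum-style code (method and attribute names assumed,
# illustrative only):
def decode_topk_sketch(self, sess, latest_tokens, enc_top_states,
                       dec_init_states):
    """Run one decode step: feed the last tokens and states, get top-k back."""
    feed = {
        self._enc_top_states: enc_top_states,
        self._dec_in_state: np.squeeze(np.array(dec_init_states)),
        self._abstracts: np.transpose(np.array([latest_tokens])),
        self._abstract_lens: np.ones([len(dec_init_states)], np.int32),
    }
    return sess.run(
        [self._topk_ids, self._topk_log_probs, self._dec_out_state],
        feed_dict=feed)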
def _add_seq2seq(self):
    hps = self._hps
    # Vocabulary size.
    vsize = self._vocab.NumIds()

    with tf.variable_scope('seq2seq'):
        # max_steps arrays, each holding a batch-sized slice of the input.
        encoder_inputs = tf.unstack(tf.transpose(self._articles))
        decoder_inputs = tf.unstack(tf.transpose(self._abstracts))
        targets = tf.unstack(tf.transpose(self._targets))
        loss_weights = tf.unstack(tf.transpose(self._loss_weights))
        article_lens = self._article_lens

        # Embedding shared by the input and outputs (learned).
        with tf.variable_scope('embedding'), tf.device('/cpu:0'):
            # embedding has shape [vsize, emb_dim].
            embedding = tf.get_variable(
                'embedding', [vsize, hps.emb_dim], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            # Embed the inputs.
            emb_encoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in encoder_inputs
            ]
            emb_decoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in decoder_inputs
            ]

        # enc_layers stacked encoder layers.
        for layer_i in xrange(hps.enc_layers):
            with tf.variable_scope('encoder%d' % layer_i), tf.device(
                    self._next_device()):
                # Forward and backward cells.
                cell_fw = tf.contrib.rnn.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123),
                    state_is_tuple=False)
                cell_bw = tf.contrib.rnn.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                    state_is_tuple=False)
                # Stacking works by feeding each layer's outputs into the
                # next layer.
                (emb_encoder_inputs, fw_state, _) = tf.contrib.rnn.static_bidirectional_rnn(
                    cell_fw, cell_bw, emb_encoder_inputs, dtype=tf.float32,
                    sequence_length=article_lens)
        encoder_outputs = emb_encoder_inputs

        with tf.variable_scope('output_projection'):
            # Add an output projection layer.
            w = tf.get_variable(
                'w', [hps.num_hidden, vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            w_t = tf.transpose(w)
            v = tf.get_variable(
                'v', [vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        with tf.variable_scope('decoder'), tf.device(self._next_device()):
            # When decoding, use model output from the previous step for the
            # next step.
            loop_function = None
            if hps.mode == 'decode':
                # Only used when decoding.
                loop_function = _extract_argmax_and_embed(
                    embedding, (w, v), update_embedding=False)

            # Decoder cell.
            cell = tf.contrib.rnn.LSTMCell(
                hps.num_hidden,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                state_is_tuple=False)

            # Reshape for attention.
            encoder_outputs = [
                tf.reshape(x, [hps.batch_size, 1, 2 * hps.num_hidden])
                for x in encoder_outputs
            ]
            # Collect the encoder outputs and the decoder's initial state
            # (forward state only).
            self._enc_top_states = tf.concat(axis=1, values=encoder_outputs)
            self._dec_in_state = fw_state
            # During decoding, follow-up _dec_in_state values are fed from
            # beam_search; dec_out_state is stored by beam_search for the
            # next step's feeding.
            initial_state_attention = (hps.mode == 'decode')
            # Call attention_decoder directly.
            decoder_outputs, self._dec_out_state = tf.contrib.legacy_seq2seq.attention_decoder(
                emb_decoder_inputs, self._dec_in_state, self._enc_top_states,
                cell, num_heads=1, loop_function=loop_function,
                initial_state_attention=initial_state_attention)

        # Output layer.
        with tf.variable_scope('output'), tf.device(self._next_device()):
            model_outputs = []
            for i in xrange(len(decoder_outputs)):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                model_outputs.append(
                    tf.nn.xw_plus_b(decoder_outputs[i], w, v))

        # In decode mode, take the top-k directly for beam search.
        if hps.mode == 'decode':
            with tf.variable_scope('decode_output'), tf.device('/cpu:0'):
                best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                tf.logging.info('best_outputs%s', best_outputs[0].get_shape())
                self._outputs = tf.concat(
                    axis=1,
                    values=[tf.reshape(x, [hps.batch_size, 1])
                            for x in best_outputs])
                self._topk_log_probs, self._topk_ids = tf.nn.top_k(
                    tf.log(tf.nn.softmax(model_outputs[-1])),
                    hps.batch_size * 2)

        # Loss.
        with tf.variable_scope('loss'), tf.device(self._next_device()):

            def sampled_loss_func(inputs, labels):
                with tf.device('/cpu:0'):  # Try gpu.
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        weights=w_t, biases=v, labels=labels, inputs=inputs,
                        num_sampled=hps.num_softmax_samples,
                        num_classes=vsize)

            # Use the sampled loss during training.
            if hps.num_softmax_samples != 0 and hps.mode == 'train':
                self._loss = seq2seq_lib.sampled_sequence_loss(
                    decoder_outputs, targets, loss_weights,
                    sampled_loss_func)
            else:
                self._loss = tf.contrib.legacy_seq2seq.sequence_loss(
                    model_outputs, targets, loss_weights)
            tf.summary.scalar('loss', tf.minimum(12.0, self._loss))
def _add_seq2seq(self):
    hps = self._hps
    vsize = self._vocab.NumIds()

    with tf.variable_scope('seq2seq'):
        encoder_inputs = tf.unstack(tf.transpose(self._articles))
        decoder_inputs = tf.unstack(tf.transpose(self._abstracts))
        targets = tf.unstack(tf.transpose(self._targets))
        loss_weights = tf.unstack(tf.transpose(self._loss_weights))
        article_lens = self._article_lens

        # Embedding shared by the input and outputs.
        with tf.variable_scope('embedding'), tf.device('/cpu:0'):
            embedding = tf.get_variable(
                'embedding', [vsize, hps.emb_dim], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            emb_encoder_inputs_1 = [
                tf.nn.embedding_lookup(embedding, x) for x in encoder_inputs
            ]

            # ---------------------------------------------------------------
            # !!! ADD
            # emb_encoder_inputs_1 is [enc_timesteps, batch_size, word_emb],
            # where enc_timesteps counts words in the article; we convert it
            # to sentence granularity, so emb_encoder_inputs_2 is
            # [batch_size, num_sentences, sentence_emb].
            num_sentences = 300  # max
            num_word_in_sent = np.load(
                '/home/dell-u/Spyder/textsum/num_sent_num_words_matrix.npy')
            sentence_emb = 300  # NOTE: final embeddings are gru_size-dim.
            gru_size = 200
            gru = tf.contrib.rnn.GRUCell(gru_size)
            # Stack the per-timestep lookups into one tensor so it can be
            # sliced as [timestep, batch, emb_dim]. (The original indexed
            # the Python list directly, which does not work.)
            emb_words = tf.stack(emb_encoder_inputs_1)
            # The original read batch_size from the list's shape, which is
            # undefined; hps.batch_size is assumed here.
            batch_size = hps.batch_size
            emb_encoder_inputs_2 = []
            for ib in range(batch_size):
                # !!! TODO: len_sentences lists the end index of each
                # sentence in the sample.
                len_sentences = num_word_in_sent[0]
                sent_embs = []
                # Run the GRU over each sentence; its final state is the
                # sentence embedding. (Illustrative; variable reuse across
                # repeated cell calls needs care in graph-mode TF1.)
                for lo, hi in zip(np.append([0], len_sentences[:-1]),
                                  len_sentences):
                    state = tf.zeros([1, gru.state_size])  # initial state
                    for j in range(lo, hi):
                        output, state = gru(emb_words[j, ib:ib + 1, :], state)
                    sent_embs.append(state)
                emb_encoder_inputs_2.append(tf.concat(sent_embs, 0))
            # [batch_size, num_sentences, gru_size].
            emb_encoder_inputs_2 = tf.stack(emb_encoder_inputs_2)
            # Feed the sentence-level embeddings to the encoder below
            # (assumed; the original left emb_encoder_inputs undefined).
            # TODO: sequence_length should then be sentence counts rather
            # than article_lens (word counts).
            emb_encoder_inputs = tf.unstack(emb_encoder_inputs_2, axis=1)
            # ---------------------------------------------------------------

            emb_decoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in decoder_inputs
            ]

        for layer_i in xrange(hps.enc_layers):
            with tf.variable_scope('encoder%d' % layer_i), tf.device(
                    self._next_device()):
                cell_fw = tf.contrib.rnn.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123),
                    state_is_tuple=False)
                cell_bw = tf.contrib.rnn.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                    state_is_tuple=False)
                (emb_encoder_inputs, fw_state, _) = tf.contrib.rnn.static_bidirectional_rnn(
                    cell_fw, cell_bw, emb_encoder_inputs, dtype=tf.float32,
                    sequence_length=article_lens)
        encoder_outputs = emb_encoder_inputs

        with tf.variable_scope('output_projection'):
            w = tf.get_variable(
                'w', [hps.num_hidden, vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            w_t = tf.transpose(w)
            v = tf.get_variable(
                'v', [vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        with tf.variable_scope('decoder'), tf.device(self._next_device()):
            # When decoding, use model output from the previous step
            # for the next step.
            loop_function = None
            if hps.mode == 'decode':
                loop_function = _extract_argmax_and_embed(
                    embedding, (w, v), update_embedding=False)

            cell = tf.contrib.rnn.LSTMCell(
                hps.num_hidden,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                state_is_tuple=False)

            encoder_outputs = [
                tf.reshape(x, [hps.batch_size, 1, 2 * hps.num_hidden])
                for x in encoder_outputs
            ]
            self._enc_top_states = tf.concat(axis=1, values=encoder_outputs)
            self._dec_in_state = fw_state
            # During decoding, follow-up _dec_in_state values are fed from
            # beam_search; dec_out_state is stored by beam_search for the
            # next step's feeding.
            initial_state_attention = (hps.mode == 'decode')
            decoder_outputs, self._dec_out_state = tf.contrib.legacy_seq2seq.attention_decoder(
                emb_decoder_inputs, self._dec_in_state, self._enc_top_states,
                cell, num_heads=1, loop_function=loop_function,
                initial_state_attention=initial_state_attention)

        with tf.variable_scope('output'), tf.device(self._next_device()):
            model_outputs = []
            for i in xrange(len(decoder_outputs)):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                model_outputs.append(
                    tf.nn.xw_plus_b(decoder_outputs[i], w, v))

        if hps.mode == 'decode':
            with tf.variable_scope('decode_output'), tf.device('/cpu:0'):
                best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                tf.logging.info('best_outputs%s', best_outputs[0].get_shape())
                self._outputs = tf.concat(
                    axis=1,
                    values=[tf.reshape(x, [hps.batch_size, 1])
                            for x in best_outputs])
                self._topk_log_probs, self._topk_ids = tf.nn.top_k(
                    tf.log(tf.nn.softmax(model_outputs[-1])),
                    hps.batch_size * 2)

        with tf.variable_scope('loss'), tf.device(self._next_device()):

            def sampled_loss_func(inputs, labels):
                with tf.device('/cpu:0'):  # Try gpu.
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        weights=w_t, biases=v, labels=labels, inputs=inputs,
                        num_sampled=hps.num_softmax_samples,
                        num_classes=vsize)

            if hps.num_softmax_samples != 0 and hps.mode == 'train':
                self._loss = seq2seq_lib.sampled_sequence_loss(
                    decoder_outputs, targets, loss_weights,
                    sampled_loss_func)
            else:
                self._loss = tf.contrib.legacy_seq2seq.sequence_loss(
                    model_outputs, targets, loss_weights)
            tf.summary.scalar('loss', tf.minimum(12.0, self._loss))
def build_model(self):
    image = tf.placeholder(tf.float32, [self.batch_size, self.dim_image])
    image_emb = tf.matmul(image, self.encode_img_W) + self.encode_img_b
    captions = tf.placeholder(
        tf.int32, [self.batch_size, self.n_lstm_steps], name='captions')
    articles = tf.placeholder(
        tf.int32, [self.batch_size, None], name='articles')  # self.enc_timesteps
    news_len = tf.placeholder(tf.int32, [self.batch_size], name='news_len')
    mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])
    state = self.lstm.zero_state(self.batch_size, tf.float32)
    loss = 0.0

    with tf.variable_scope('encoder'):
        # Dealing with the news text.
        current_emb = tf.nn.embedding_lookup(self.Wemb, articles) + self.bemb
        # Prepend the image embedding as the first input step.
        current_emb = tf.concat(
            1, [tf.expand_dims(image_emb, 1), current_emb])
        encoder_outputs, state = tf.nn.bidirectional_dynamic_rnn(
            self.lstm, self.back_lstm, current_emb, news_len,
            dtype=tf.float32)
        state = state[0]
        encoder_outputs = tf.concat(1, encoder_outputs)

    with tf.variable_scope('decoder'):
        current_emb = tf.nn.embedding_lookup(self.Wemb, captions) + self.bemb
        current_emb = unpack_sequence(current_emb)
        cell = tf.nn.rnn_cell.LSTMCell(
            FLAGS.dim_hidden, state_is_tuple=True,
            initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113))
        # cell = tf.nn.rnn_cell.DropoutWrapper(
        #     self.lstm, output_keep_prob=FLAGS.dropout)
        decoder_outputs, dec_out_state = tf.nn.seq2seq.attention_decoder(
            decoder_inputs=current_emb,
            initial_state=state,
            attention_states=encoder_outputs,
            cell=cell,
            output_size=None,
            num_heads=1,
            dtype=None,
            scope=None,
            initial_state_attention=False)

    with tf.variable_scope('loss'):

        def sampled_loss_func(inputs, labels):
            with tf.device('/cpu:0'):  # Try gpu.
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(
                    tf.transpose(self.embed_word_W), self.embed_word_b,
                    inputs, labels, 4096, self.n_words)  # num_sampled=4096

        decoder_outputs = decoder_outputs[:-1]
        sentence_modif = tf.slice(captions, [0, 1], [-1, -1])
        mask_modif = tf.slice(mask, [0, 0], [-1, self.n_lstm_steps - 1],
                              name='mask')
        loss = seq2seq_lib.sampled_sequence_loss(
            decoder_outputs, unpack_sequence(sentence_modif),
            unpack_sequence(mask_modif), sampled_loss_func)
        variable_summaries('loss', loss)

    with tf.variable_scope('output'):
        model_outputs = []
        for i in range(len(decoder_outputs)):
            model_outputs.append(
                tf.nn.xw_plus_b(decoder_outputs[i], self.embed_word_W,
                                self.embed_word_b))

    with tf.variable_scope('decode_output'), tf.device('/cpu:0'):
        best_outputs = [tf.argmax(x, 1) for x in model_outputs]
        best_outputs = tf.transpose(best_outputs)

    return loss, image, captions, mask, articles, news_len
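# build_model relies on an unpack_sequence helper defined elsewhere. A
# minimal sketch under the assumption that it splits a batch-major tensor
# into a time-major list (illustrative, not necessarily the repo's exact
# helper; note it is applied to both 3-D embeddings and 2-D id matrices):
def unpack_sequence_sketch(tensor):
    """Split a batch-major tensor into a time-major list of tensors."""
    ndims = tensor.get_shape().ndims
    perm = [1, 0] + list(range(2, ndims))  # swap the batch and time axes
    return tf.unpack(tf.transpose(tensor, perm=perm))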
def _add_seq2seq(self):
    hps = self._hps
    vsize = hps.vocab_size

    with tf.variable_scope('seq2seq'):
        encoder_inputs = tf.unstack(tf.transpose(self._articles))
        decoder_inputs = tf.unstack(tf.transpose(self._abstracts))
        targets = tf.unstack(tf.transpose(self._targets))
        loss_weights = tf.unstack(tf.transpose(self._loss_weights))
        article_lens = self._article_lens

        # Embedding shared by the input and outputs, loaded from a
        # placeholder so it can be initialized with pre-trained vectors.
        with tf.variable_scope('embedding'), tf.device('/cpu:0'):
            W = tf.Variable(
                tf.constant(0.0, shape=[hps.vocab_size, hps.emb_dim]),
                trainable=True, name='W')
            # NOTE: embedding is the assign op's output, so every lookup
            # re-runs the assignment; the usual pattern is to run the assign
            # once (see the sketch after this function) and look up W.
            embedding = W.assign(self._embedding_placeholder)
            emb_encoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in encoder_inputs
            ]
            emb_decoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in decoder_inputs
            ]
            # Alternative: a randomly initialized, learned embedding:
            # embedding = tf.get_variable(
            #     'embedding', [vsize, hps.emb_dim], dtype=tf.float32,
            #     initializer=tf.truncated_normal_initializer(stddev=1e-4))

        for layer_i in xrange(hps.enc_layers):
            with tf.variable_scope('encoder%d' % layer_i), tf.device(
                    self._next_device()):
                cell_fw = tf.contrib.rnn.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123),
                    state_is_tuple=True)
                cell_bw = tf.contrib.rnn.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                    state_is_tuple=True)
                (emb_encoder_inputs, fw_state, _) = tf.contrib.rnn.static_bidirectional_rnn(
                    cell_fw, cell_bw, emb_encoder_inputs, dtype=tf.float32,
                    sequence_length=article_lens)
        encoder_outputs = emb_encoder_inputs

        with tf.variable_scope('output_projection'):
            # TODO: change the output vocabulary to use only the word set
            # coming from this batch rather than the whole dictionary.
            # REFERTO: (Abstractive Text Summarization using
            # Sequence-to-sequence RNNs and Beyond)
            w = tf.get_variable(
                'w', [hps.num_hidden, vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            w_t = tf.transpose(w)
            v = tf.get_variable(
                'v', [vsize], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        with tf.variable_scope('decoder'), tf.device(self._next_device()):
            # When decoding, use model output from the previous step
            # for the next step.
            loop_function = None
            if hps.mode == 'decode':
                loop_function = _extract_argmax_and_embed(
                    embedding, (w, v), update_embedding=False)

            cell = tf.contrib.rnn.LSTMCell(
                hps.num_hidden,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                state_is_tuple=True)

            encoder_outputs = [
                tf.reshape(x, [hps.batch_size, 1, 2 * hps.num_hidden])
                for x in encoder_outputs
            ]
            self._enc_top_states = tf.concat(axis=1, values=encoder_outputs)
            self._dec_in_state = fw_state
            # During decoding, follow-up _dec_in_state values are fed from
            # beam_search; dec_out_state is stored by beam_search for the
            # next step's feeding.
            initial_state_attention = (hps.mode == 'decode')
            decoder_outputs, self._dec_out_state = tf.contrib.legacy_seq2seq.attention_decoder(
                emb_decoder_inputs, self._dec_in_state, self._enc_top_states,
                cell, num_heads=1, loop_function=loop_function,
                initial_state_attention=initial_state_attention)

        with tf.variable_scope('output'), tf.device(self._next_device()):
            model_outputs = []
            for i in xrange(len(decoder_outputs)):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                model_outputs.append(tf.nn.xw_plus_b(decoder_outputs[i], w, v))

        if hps.mode == 'decode':
            with tf.variable_scope('decode_output'), tf.device('/cpu:0'):
                best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                tf.logging.info('best_outputs%s', best_outputs[0].get_shape())
                self._outputs = tf.concat(
                    axis=1,
                    values=[tf.reshape(x, [hps.batch_size, 1])
                            for x in best_outputs])
                self._topk_log_probs, self._topk_ids = tf.nn.top_k(
                    tf.log(tf.nn.softmax(model_outputs[-1])),
                    hps.batch_size * 2)

        with tf.variable_scope('loss'), tf.device(self._next_device()):

            def sampled_loss_func(inputs, labels):
                with tf.device('/cpu:0'):  # Try gpu.
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        weights=w_t, biases=v, labels=labels, inputs=inputs,
                        num_sampled=hps.num_softmax_samples,
                        num_classes=vsize)

            if hps.num_softmax_samples != 0 and hps.mode == 'train':
                self._loss = seq2seq_lib.sampled_sequence_loss(
                    decoder_outputs, targets, loss_weights,
                    sampled_loss_func)
            else:
                self._loss = tf.contrib.legacy_seq2seq.sequence_loss(
                    model_outputs, targets, loss_weights)
            tf.summary.scalar('loss', tf.minimum(12.0, self._loss))
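# The usual way to feed a pre-trained matrix into the placeholder-backed
# embedding above is to run the assign op exactly once after variable
# initialization. A sketch (attribute names such as _embedding_init are
# assumed, not shown in this class; pretrained_matrix is assumed to be a
# [vocab_size, emb_dim] numpy array):
def load_pretrained_embedding_sketch(sess, model, pretrained_matrix):
    # model._embedding_init would be the W.assign(...) op built in the graph.
    sess.run(model._embedding_init,
             feed_dict={model._embedding_placeholder: pretrained_matrix})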