def pointer_decoder(encoder_inputs_emb, decoder_inputs, initial_state,
                    attention_states, cell, feed_prev=True,
                    dtype=dtypes.float32, scope=None):
    encoder_inputs = encoder_inputs_emb
    attn_length = shape(attention_states, 1)
    attn_size = shape(attention_states, 2)
    with tf.name_scope('attention_setup'):
        attnw = tf.get_variable("AttnW", [1, attn_size, attn_size])
        attention_states = tf.nn.conv1d(attention_states, attnw, 1, 'SAME')
        attnv = tf.get_variable("AttnV", [attn_size])
    sys.stdout = sys.stderr

    def attention_weight(output):
        # Calculate attention weights for every encoder input by taking an inner
        # product between the weight vector (attnv) and the combination of the
        # decoder's state with the encoder's output.
        y = _linear(output, attn_size, True)
        y = tf.reshape(y, [-1, 1, attn_size])
        attention_vectors = tf.nn.softmax(
            tf.reduce_sum(attnv * tf.tanh(y + attention_states), axis=2))
        return attention_vectors

    states = [initial_state]
    outputs = []
    pointed_idxs = []
    for i, d in enumerate(tf.unstack(decoder_inputs, axis=1)):
        with tf.name_scope('Decode_%d' % i):
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            pointed_idx = d
            # At test time, inputs to the decoder other than the first one are not used.
            if feed_prev and i > 0:
                # Take the argmax, convert the pointed index into a one-hot vector, and
                # get the pointed encoder_inputs by multiplying and reduce_sum.
                pointed_idx = tf.argmax(output, axis=1, output_type=tf.int32)
            pointed_idxs.append(pointed_idx)
            with tf.name_scope('copy_from_encoder_inputs'):
                pointed_idx = tf.reshape(
                    tf.one_hot(pointed_idx, depth=attn_length),
                    [-1, attn_length, 1])
                inp = tf.reduce_sum(encoder_inputs * pointed_idx, axis=1)
                inp = tf.stop_gradient(inp)
            output, state = cell(inp, states[-1])
            with tf.name_scope('attention_weight'):
                output = attention_weight(output)
            states.append(state)
            outputs.append(output)
    outputs = tf.stack(outputs, axis=1)
    states = tf.stack(states, axis=1)
    return outputs, states, pointed_idxs
def setup_placeholder(self, config):
    '''
    Prepare tf.placeholders and their lengths. They are kept as instance variables.
    '''
    self.e_inputs_w_ph = tf.placeholder(tf.int32, [None, None],
                                        name="EncoderInputWords")
    self.e_inputs_c_ph = tf.placeholder(tf.int32, [None, None, None],
                                        name="EncoderInputChars")
    #self.d_outputs_ph = tf.placeholder(
    #    tf.int32, [None, None], name="DecoderOutput")
    self.d_outputs_ph = self.e_inputs_w_ph
    self.is_training = tf.placeholder(tf.bool, [], name='is_training')
    with tf.name_scope('keep_prob'):
        self.keep_prob = 1.0 - tf.to_float(self.is_training) * config.dropout_rate
    with tf.name_scope('batch_size'):
        self.batch_size = batch_size = shape(self.d_outputs_ph, 0)
    with tf.name_scope('start_tokens'):
        self.start_tokens = tf.tile(tf.constant([BOS_ID], dtype=tf.int32),
                                    [batch_size])
    with tf.name_scope('end_tokens'):
        self.end_token = PAD_ID
        end_tokens = tf.tile(tf.constant([self.end_token], dtype=tf.int32),
                             [batch_size])

    # Count the length of each dialogue, utterance, (word).
    with tf.name_scope('utterance_length'):
        self.uttr_lengths = tf.count_nonzero(self.e_inputs_w_ph, axis=1,
                                             dtype=tf.int32)

    '''
    # Example of the decoder's inputs and outputs.
    Given the input ['how', 'are', 'you', '?'] to the decoder's placeholder,
    - decoder's input           : ['_BOS', 'how', 'are', 'you', '?']
    - decoder's output (target) : ['how', 'are', 'you', '?', '_PAD']
    - target_length: 5
    - target_weights: [1, 1, 1, 1, 1]
    Here, the token _PAD behaves as EOS.
    '''
    with tf.name_scope('decoder_inputs'):
        self.decoder_inputs = tf.concat(
            [tf.expand_dims(self.start_tokens, 1), self.d_outputs_ph], axis=1)

    # The length of the decoder's inputs/outputs is increased by 1 because of BOS or EOS.
    with tf.name_scope('target_lengths'):
        self.target_length = tf.count_nonzero(self.d_outputs_ph, axis=1,
                                              dtype=tf.int32) + 1
    with tf.name_scope('target_weights'):
        self.target_weights = tf.sequence_mask(self.target_length, dtype=tf.float32)
    with tf.name_scope('targets'):
        self.targets = tf.concat(
            [self.d_outputs_ph, tf.expand_dims(end_tokens, 1)],
            axis=1)[:, :shape(self.target_weights, 1)]
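# A minimal numpy sketch (illustration only, not part of the model) of how the pieces
# built above relate for a single padded row, following the docstring example. The
# values BOS_ID = 1 and PAD_ID = 0 are assumed purely for illustration; the real
# constants come from the vocabulary module. The real graph also truncates `targets`
# to the longest target_length in the batch, which this single-row sketch omits.
import numpy as np

def _decoder_io_example(d_outputs_row, bos_id=1, pad_id=0):
    row = np.asarray(d_outputs_row, dtype=np.int32)
    decoder_inputs = np.concatenate([[bos_id], row])     # prepend _BOS
    targets = np.concatenate([row, [pad_id]])            # append _PAD acting as EOS
    target_length = int(np.count_nonzero(row)) + 1       # +1 for the EOS token
    target_weights = (np.arange(len(targets)) < target_length).astype(np.float32)
    return decoder_inputs, targets, target_length, target_weights

# _decoder_io_example([7, 8, 9, 10, 0]) ->
#   decoder_inputs = [1, 7, 8, 9, 10, 0], targets = [7, 8, 9, 10, 0, 0],
#   target_length  = 5,                   target_weights = [1, 1, 1, 1, 1, 0]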
def encode(self, inputs, sequence_length):
    with tf.variable_scope(self.shared_scope or "CNNEncoder"):
        target_rank = 3  # [*, max_sequence_length, hidden_size]
        flattened_inputs, prev_shape = flatten(inputs, target_rank)
        flattened_aggregated_outputs = cnn(flattened_inputs,
                                           activation=self.activation)
        target_shape = prev_shape[:-2] + [shape(flattened_aggregated_outputs, -1)]
        outputs = tf.reshape(flattened_aggregated_outputs, target_shape)
        outputs = tf.nn.dropout(outputs, self.keep_prob)
    return outputs, outputs
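# The `flatten`/`shape` helpers used above are defined elsewhere in the repository;
# the following is only an assumed sketch of their behavior, for readability. `shape`
# returns the static size of a dimension when known, falling back to the dynamic one,
# and `flatten` collapses the leading dimensions so the input becomes `target_rank`,
# returning the original shape so the caller can reshape the outputs back.
def shape_sketch(x, dim):
    static = x.get_shape()[dim].value
    return static if static is not None else tf.shape(x)[dim]

def flatten_sketch(x, target_rank):
    rank = len(x.get_shape())
    prev_shape = [shape_sketch(x, i) for i in range(rank)]
    # Collapse the leading (rank - target_rank + 1) dims into a single batch dim.
    flat_shape = [-1] + prev_shape[rank - target_rank + 1:]
    return tf.reshape(x, flat_shape), prev_shape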
def setup_decoder_states(self, config, encoder_outputs, encoder_state, scope=None):
    attention_states = encoder_outputs
    response_emb = tf.nn.embedding_lookup(self.w_embeddings, self.d_outputs_ph)
    response_lengths = tf.count_nonzero(self.d_outputs_ph, axis=1, dtype=tf.int32)
    _, h_future = self.uttr_encoder.encode(response_emb, response_lengths)

    def _get_distribution(state, output_size):
        h = state
        num_layers = 1
        for i in range(num_layers):
            with tf.variable_scope('linear%d' % i) as scope:
                h = linear(h, output_size, scope=scope)
        with tf.variable_scope('Mean'):
            mean = linear(h, output_size, activation=None)
        with tf.variable_scope('Var'):
            var = linear(h, output_size, activation=tf.nn.softplus)
        return tfd.MultivariateNormalDiag(mean, var)

    output_size = shape(encoder_state, -1)
    with tf.variable_scope('Prior'):
        self.prior = _get_distribution(encoder_state, output_size)
    with tf.variable_scope('Posterior'):
        self.posterior = _get_distribution(
            tf.concat([encoder_state, h_future], axis=-1), output_size)

    # At training time the latent variable is sampled from the posterior (which sees
    # the future response via h_future); at test time it is sampled from the prior.
    train_decoder_state = tf.concat([encoder_state, self.posterior.sample()], axis=-1)
    test_decoder_state = tf.concat([encoder_state, self.prior.sample()], axis=-1)
    #train_decoder_state = encoder_state + self.posterior.sample()
    #test_decoder_state = encoder_state + self.prior.sample()
    return train_decoder_state, test_decoder_state, attention_states
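# Note: this function only builds the prior/posterior pair and the sampled decoder
# states; a KL term between the two distributions (CVAE-style) is presumably added to
# the training loss elsewhere. A minimal sketch, assuming `tfd` is
# tensorflow_probability.distributions (or tf.contrib.distributions) as used above:
def kl_regularizer_sketch(posterior, prior, kl_weight=1.0):
    # Mean KL(posterior || prior) over the batch; both are MultivariateNormalDiag.
    return kl_weight * tf.reduce_mean(tfd.kl_divergence(posterior, prior))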
def __init__(self, sess, conf, vocab):
    ModelBase.__init__(self, sess, conf)
    self.vocab = vocab
    input_max_len, output_max_len = None, conf.output_max_len
    self.is_training = tf.placeholder(tf.bool, [], name='is_training')
    with tf.name_scope('keep_prob'):
        self.keep_prob = 1.0 - tf.to_float(self.is_training) * conf.dropout_rate
    with tf.name_scope('EncoderInput'):
        self.e_inputs_ph = tf.placeholder(tf.int32, [None, input_max_len],
                                          name="EncoderInput")
    with tf.name_scope('batch_size'):
        batch_size = shape(self.e_inputs_ph, 0)
    with tf.variable_scope('Embeddings') as scope:
        self.w_embeddings = self.initialize_embeddings(
            'Word', vocab.embeddings.shape,
            initializer=tf.constant_initializer(vocab.embeddings),
            trainable=conf.train_embedding)
def __init__(self, sess, conf, vocab):
    ModelBase.__init__(self, sess, conf)
    self.vocab = vocab
    input_max_len, output_max_len = None, conf.output_max_len
    self.is_training = tf.placeholder(tf.bool, [], name='is_training')
    with tf.name_scope('keep_prob'):
        self.keep_prob = 1.0 - tf.to_float(self.is_training) * conf.dropout_rate

    # <Sample input>
    # e_inputs: [1, 40, 44, 0, 0], d_outputs: [2, 0, 0] (target=44)
    with tf.name_scope('EncoderInput'):
        self.e_inputs_ph = tf.placeholder(tf.int32, [None, input_max_len],
                                          name="EncoderInput")
    with tf.name_scope('batch_size'):
        batch_size = shape(self.e_inputs_ph, 0)

    with tf.variable_scope('Embeddings') as scope:
        w_embeddings = self.initialize_embeddings(
            'Word', vocab.embeddings.shape,
            initializer=tf.constant_initializer(vocab.embeddings),
            trainable=conf.train_embedding)

    with tf.variable_scope('WordEncoder') as scope:
        word_encoder = WordEncoder(conf, w_embeddings, self.keep_prob,
                                   shared_scope=scope)
        e_inputs_emb = word_encoder.encode([self.e_inputs_ph])

    with tf.variable_scope('SentEncoder') as scope:
        sent_encoder = SentenceEncoder(conf, self.keep_prob, shared_scope=scope)
        e_inputs_length = tf.count_nonzero(self.e_inputs_ph, axis=1)
        e_outputs, e_state = sent_encoder.encode(e_inputs_emb, e_inputs_length)
        attention_states = e_outputs

    self.d_outputs_ph = []
    self.losses = []
    self.greedy_predictions = []
    self.copied_inputs = []
    for i, col_name in enumerate(conf.target_columns):
        with tf.name_scope('DecoderOutput%d' % i):
            d_outputs_ph = tf.placeholder(tf.int32, [None, output_max_len],
                                          name="DecoderOutput")
        ds_name = 'Decoder' if conf.share_decoder else 'Decoder%d' % i
        with tf.variable_scope(ds_name) as scope:
            d_cell = setup_cell(conf.cell_type, conf.rnn_size, conf.num_layers,
                                keep_prob=self.keep_prob)
            teacher_forcing = conf.teacher_forcing if 'teacher_forcing' in conf else False
            d_outputs, predictions, copied_inputs = setup_decoder(
                d_outputs_ph, e_inputs_emb, e_state, attention_states, d_cell,
                batch_size, output_max_len, scope=scope,
                teacher_forcing=teacher_forcing)
            self.copied_inputs.append(copied_inputs)
            d_outputs_length = tf.count_nonzero(d_outputs_ph, axis=1,
                                                name='outputs_length')
            with tf.name_scope('add_eos'):
                targets = tf.concat(
                    [d_outputs_ph, tf.zeros([batch_size, 1], dtype=tf.int32)],
                    axis=1)
            # The length of the outputs is also increased by 1 because of EOS.
            with tf.name_scope('output_weights'):
                d_outputs_weights = tf.sequence_mask(
                    d_outputs_length + 1,
                    maxlen=shape(d_outputs_ph, 1) + 1,
                    dtype=tf.float32)
            with tf.name_scope('loss%d' % i):
                loss = tf.contrib.seq2seq.sequence_loss(
                    d_outputs, targets, d_outputs_weights)
        self.d_outputs_ph.append(d_outputs_ph)
        self.losses.append(loss)
        self.greedy_predictions.append(predictions)

    with tf.name_scope('Loss'):
        self.loss = tf.reduce_mean(self.losses)
    self.updates = self.get_updates(self.loss)
def setup_decoder(self, config, train_decoder_state, test_decoder_state,
                  embeddings, encoder_input_lengths=None, attention_states=None,
                  projection_layer=None, scope=None):
    batch_size = self.batch_size
    decoder_inputs_emb = tf.nn.embedding_lookup(embeddings, self.decoder_inputs)
    # TODO: For multilingual support, set bias and trainable to False and make the
    # embeddings constant.
    decoder_cell = setup_cell(config.decoder.cell_type,
                              shape(train_decoder_state, -1),
                              config.decoder.num_layers,
                              keep_prob=self.keep_prob)
    if projection_layer is None:
        with tf.variable_scope('projection') as scope:
            # Tie the output projection to the (transposed) embedding matrix.
            kernel = tf.transpose(embeddings, perm=[1, 0])
            projection_layer = SharedKernelDense(shape(embeddings, 0),
                                                 use_bias=False,
                                                 trainable=False,
                                                 shared_kernel=kernel)

    with tf.name_scope('Training'):
        train_decoder_cell = decoder_cell
        decoder_initial_state = train_decoder_state
        helper = tf.contrib.seq2seq.TrainingHelper(
            decoder_inputs_emb, sequence_length=self.target_length,
            time_major=False)
        decoder = tf.contrib.seq2seq.BasicDecoder(
            train_decoder_cell, helper, decoder_initial_state,
            output_layer=projection_layer)
        train_decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder, impute_finished=True,
            maximum_iterations=tf.reduce_max(self.target_length),
            scope=scope)
        logits = train_decoder_outputs.rnn_output

    with tf.name_scope('Test'):
        beam_width = config.beam_width
        test_decoder_cell = decoder_cell
        decoder_initial_state = tf.contrib.seq2seq.tile_batch(
            test_decoder_state, multiplier=beam_width)
        decoder = tf.contrib.seq2seq.BeamSearchDecoder(
            test_decoder_cell, embeddings, self.start_tokens, self.end_token,
            decoder_initial_state, beam_width,
            output_layer=projection_layer,
            length_penalty_weight=config.length_penalty_weight)
        test_decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder, impute_finished=False,
            maximum_iterations=config.utterance_max_len,
            scope=scope)
        predictions = test_decoder_outputs.predicted_ids
        #self.beam_scores = test_decoder_outputs.beam_search_decoder_output.scores
        # memo: The outputs are ordered by beam_scores from lowest (they take negative
        # values), and that ordering seems to roughly match their correctness?
    return logits, predictions
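# SharedKernelDense is not defined in this excerpt. It is assumed to be a Dense-like
# output layer whose kernel is the provided (transposed) embedding matrix, i.e. a tied
# input/output projection. A hypothetical sketch of such a layer; above it is used with
# use_bias=False and trainable=False, so the projection simply multiplies by the
# transposed embeddings:
class SharedKernelDenseSketch(tf.layers.Dense):
    def __init__(self, units, shared_kernel=None, **kwargs):
        super(SharedKernelDenseSketch, self).__init__(units, **kwargs)
        self.shared_kernel = shared_kernel

    def build(self, input_shape):
        # Reuse the given kernel instead of creating a new trainable variable.
        self.kernel = self.shared_kernel
        self.bias = (self.add_variable('bias', shape=[self.units])
                     if self.use_bias else None)
        self.built = True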
def pointer_decoder(encoder_inputs_emb, decoder_inputs, initial_state,
                    attention_states, cell, feed_prev=True,
                    dtype=dtypes.float32, scope=None):
    encoder_inputs = encoder_inputs_emb
    attn_length = shape(attention_states, 1)
    attn_size = shape(attention_states, 2)
    with tf.name_scope('attention_setup'):
        # Prepare the weights for the attention calculation. We assume here that the
        # sizes of attention_states (the encoder's outputs), the encoder's state, and
        # the decoder's output are the same.
        attnw = tf.get_variable("AttnW1", [1, attn_size, attn_size])
        attnw2 = tf.get_variable("AttnW2", [attn_size, attn_size])
        attnv = tf.get_variable("AttnV", [attn_size])
        # Calculate W1 * attention_states in advance, since the encoder's outputs and
        # states are unchanged while decoding.
        attention_states = tf.nn.conv1d(attention_states, attnw, 1, 'SAME')
    sys.stdout = sys.stderr

    def attention_weight(output):
        """
        Calculate attention weights for every encoder input by taking the inner product
        of the weight vector (attnv) with the combined and transformed encoder output
        and decoder state.

        output_probabilities[i] = V・tanh(W1・attention_state[i] + W2・decoder_output[t])
        - i: the index of an input word
        - t: the current time-step in decoding
        - V: a tensor with the shape [attention_size]
        - W1: a tensor with the shape [attention_size, encoder's rnn_size]
        - W2: a tensor with the shape [attention_size, decoder's rnn_size]
        """
        y = tf.matmul(output, attnw2)
        y = tf.reshape(y, [-1, 1, attn_size])
        attention_vectors = tf.nn.softmax(
            tf.reduce_sum(attnv * tf.tanh(attention_states + y), axis=2))
        return attention_vectors

    states = [initial_state]
    outputs = []
    pointed_idxs = []
    with tf.name_scope('Decode_Timestep'):
        for i, d in enumerate(tf.unstack(decoder_inputs, axis=1)):
            with tf.name_scope('Decode_%d' % i):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                # The first input to the decoder is something like a _START (or just a
                # _PAD) token we prepared to start decoding.
                pointed_idx = d
                # If feed_prev == True, inputs to the decoder other than the first one
                # are not used; the model decides its next inputs by itself.
                if feed_prev and i > 0:
                    # Take the argmax to decide which index of the input is most probable.
                    pointed_idx = tf.argmax(output, axis=1, output_type=tf.int32)
                pointed_idxs.append(pointed_idx)
                with tf.name_scope('copy_from_encoder_inputs'):
                    # Convert the pointed index into a one-hot vector, and get the
                    # pointed encoder_inputs by multiplying and reduce_sum.
                    pointed_idx = tf.reshape(
                        tf.one_hot(pointed_idx, depth=attn_length),
                        [-1, attn_length, 1])
                    inp = tf.reduce_sum(encoder_inputs * pointed_idx, axis=1)
                    # In the original paper, gradients should not be propagated to the
                    # input embeddings through this copying; the embeddings should be
                    # updated only through the encoder.
                    inp = tf.stop_gradient(inp)
                output, state = cell(inp, states[-1])
                # Calculate the output (and the next input) distribution.
                with tf.name_scope('attention_weight'):
                    output = attention_weight(output)
                states.append(state)
                outputs.append(output)
    with tf.name_scope('outputs'):
        outputs = tf.stack(outputs, axis=1)
    with tf.name_scope('states'):
        states = tf.stack(states, axis=1)
    with tf.name_scope('pointed_idx'):
        pointed_idxs = tf.stack(pointed_idxs, axis=1)
    return outputs, states, pointed_idxs
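# A toy numpy sketch (illustration only) of the attention formula in the docstring:
# scores[i] = V・tanh(W1・attention_state[i] + W2・decoder_output[t]), then a softmax
# over the encoder positions i gives one probability per input token.
import numpy as np

def pointer_attention_sketch(enc_states, dec_output, W1, W2, V):
    # enc_states: [attn_length, encoder_rnn_size], dec_output: [decoder_rnn_size]
    # W1: [attention_size, encoder_rnn_size], W2: [attention_size, decoder_rnn_size]
    # V : [attention_size]
    scores = np.dot(np.tanh(np.dot(enc_states, W1.T) + np.dot(dec_output, W2.T)), V)
    exp = np.exp(scores - scores.max())
    return exp / exp.sum()   # one probability per encoder input position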
def setup_decoder(self, config, train_decoder_state, test_decoder_state,
                  encoder_input_lengths=None, attention_states=None,
                  projection_layer=None, scope=None):
    batch_size = self.batch_size
    decoder_inputs_emb = tf.nn.embedding_lookup(self.w_embeddings,
                                                self.decoder_inputs)
    # TODO: For multilingual support, set bias and trainable to False and make the
    # embeddings constant.
    decoder_cell = setup_cell(config.decoder.cell_type,
                              shape(train_decoder_state, -1),
                              config.decoder.num_layers,
                              keep_prob=self.keep_prob)
    if projection_layer is None:
        with tf.variable_scope('projection') as scope:
            projection_layer = tf.layers.Dense(config.w_vocab_size,
                                               use_bias=True, trainable=True)

    with tf.name_scope('Training'):
        if config.attention_type:
            assert attention_states is not None
            num_units = shape(attention_states, -1)
            attention = tf.contrib.seq2seq.LuongAttention(
                num_units, attention_states,
                memory_sequence_length=encoder_input_lengths)
            train_decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                decoder_cell, attention)
            # encoder_state can't be directly copied into decoder_cell when using the
            # attention mechanisms; initial_state must be an instance of
            # AttentionWrapperState. (https://github.com/tensorflow/nmt/issues/205)
            decoder_initial_state = train_decoder_cell.zero_state(
                batch_size, tf.float32).clone(cell_state=train_decoder_state)
        else:
            train_decoder_cell = decoder_cell
            decoder_initial_state = train_decoder_state
        helper = tf.contrib.seq2seq.TrainingHelper(
            decoder_inputs_emb, sequence_length=self.target_length,
            time_major=False)
        decoder = tf.contrib.seq2seq.BasicDecoder(
            train_decoder_cell, helper, decoder_initial_state,
            output_layer=projection_layer)
        train_decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder, impute_finished=True,
            maximum_iterations=tf.reduce_max(self.target_length),
            scope=scope)
        logits = train_decoder_outputs.rnn_output

    with tf.name_scope('Test'):
        beam_width = config.beam_width
        if config.attention_type:
            num_units = shape(attention_states, -1)
            attention = tf.contrib.seq2seq.LuongAttention(
                num_units,
                tf.contrib.seq2seq.tile_batch(attention_states,
                                              multiplier=beam_width),
                memory_sequence_length=tf.contrib.seq2seq.tile_batch(
                    encoder_input_lengths, multiplier=beam_width))
            test_decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                decoder_cell, attention)
            decoder_initial_state = test_decoder_cell.zero_state(
                batch_size * beam_width, tf.float32).clone(
                    cell_state=tf.contrib.seq2seq.tile_batch(
                        test_decoder_state, multiplier=beam_width))
        else:
            test_decoder_cell = decoder_cell
            decoder_initial_state = tf.contrib.seq2seq.tile_batch(
                test_decoder_state, multiplier=beam_width)
        decoder = tf.contrib.seq2seq.BeamSearchDecoder(
            test_decoder_cell, self.w_embeddings, self.start_tokens,
            self.end_token, decoder_initial_state, beam_width,
            output_layer=projection_layer,
            length_penalty_weight=config.length_penalty_weight)
        test_decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder, impute_finished=False,
            maximum_iterations=config.utterance_max_len,
            scope=scope)
        predictions = test_decoder_outputs.predicted_ids
    return logits, predictions
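# A hedged usage sketch: the logits returned above are typically scored against the
# targets and weights prepared in setup_placeholder (see above), e.g.:
#
#   logits, predictions = self.setup_decoder(
#       config, train_decoder_state, test_decoder_state,
#       encoder_input_lengths=self.uttr_lengths,
#       attention_states=attention_states)
#   loss = tf.contrib.seq2seq.sequence_loss(logits, self.targets, self.target_weights)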
def __init__(self, sess, config, vocab, encoder=None, is_training=None):
    PointerNetworkBase.__init__(self, sess, config, vocab,
                                is_training=is_training)
    input_max_len, output_max_len = None, config.output_max_len

    # <Sample input>
    # e_inputs: [1, 40, 44, 0, 0], d_outputs: [2, 0, 0] (target=44)
    with tf.name_scope('EncoderInput'):
        self.e_inputs_ph = tf.placeholder(tf.int32, [None, input_max_len],
                                          name="EncoderInput")
        self.pos_inputs_ph = tf.placeholder(tf.int32, [None, input_max_len],
                                            name="EncoderInputPOS")
        self.wtype_inputs_ph = tf.placeholder(tf.int32, [None, input_max_len],
                                              name="EncoderInputWordType")
    with tf.name_scope('batch_size'):
        batch_size = shape(self.e_inputs_ph, 0)

    with tf.variable_scope('Embeddings') as scope:
        e_inputs_emb = []
        w_embeddings = self.initialize_embeddings(
            'Word', vocab.word.embeddings.shape,
            initializer=tf.constant_initializer(vocab.word.embeddings),
            trainable=config.train_embedding)
        e_inputs_emb.append(
            tf.nn.embedding_lookup(w_embeddings, self.e_inputs_ph))

        if self.use_pos:
            pos_embeddings = self.initialize_embeddings(
                'POS', [vocab.pos.size, config.feature_size], trainable=True)
            e_inputs_emb.append(
                tf.nn.embedding_lookup(pos_embeddings, self.pos_inputs_ph))

        if self.use_wtype:
            wtype_embeddings = self.initialize_embeddings(
                'Wtype', [vocab.wtype.size, config.feature_size],
                trainable=True)
            e_inputs_emb.append(
                tf.nn.embedding_lookup(wtype_embeddings, self.wtype_inputs_ph))

        e_inputs_emb = tf.concat(e_inputs_emb, axis=-1)
        e_inputs_emb = tf.nn.dropout(e_inputs_emb, self.keep_prob)

    with tf.variable_scope('SentEncoder') as scope:
        # If an encoder is not given, prepare a new one.
        if encoder is None:
            encoder_type = getattr(encoder_class, config.encoder_type)
            sent_encoder = encoder_type(config, self.keep_prob,
                                        shared_scope=scope)
        else:
            sent_encoder = encoder
        e_inputs_length = tf.count_nonzero(self.e_inputs_ph, axis=1)
        e_outputs, e_state = sent_encoder.encode(e_inputs_emb, e_inputs_length)
        attention_states = e_outputs

    self.d_outputs_ph = []
    self.losses = []
    self.greedy_predictions = []
    self.copied_inputs = []
    for i, col_name in enumerate(self.target_columns):
        with tf.name_scope('DecoderOutput%d' % i):
            d_outputs_ph = tf.placeholder(tf.int32, [None, output_max_len],
                                          name="DecoderOutput")
        ds_name = 'Decoder' if config.share_decoder else 'Decoder%d' % i
        with tf.variable_scope(ds_name) as scope:
            d_cell = setup_cell(config.cell_type, config.rnn_size,
                                config.num_layers, keep_prob=self.keep_prob)
            teacher_forcing = config.teacher_forcing if 'teacher_forcing' in config else False
            d_outputs, predictions, copied_inputs = setup_decoder(
                d_outputs_ph, e_inputs_emb, e_state, attention_states, d_cell,
                batch_size, output_max_len, scope=scope,
                teacher_forcing=teacher_forcing)
            self.copied_inputs.append(copied_inputs)
            d_outputs_length = tf.count_nonzero(d_outputs_ph, axis=1,
                                                name='outputs_length')
            with tf.name_scope('add_eos'):
                targets = tf.concat(
                    [d_outputs_ph, tf.zeros([batch_size, 1], dtype=tf.int32)],
                    axis=1)
            # The length of the outputs is also increased by 1 because of EOS.
            with tf.name_scope('output_weights'):
                d_outputs_weights = tf.sequence_mask(
                    d_outputs_length + 1,
                    maxlen=shape(d_outputs_ph, 1) + 1,
                    dtype=tf.float32)
            with tf.name_scope('loss%d' % i):
                loss = tf.contrib.seq2seq.sequence_loss(
                    d_outputs, targets, d_outputs_weights)
        self.d_outputs_ph.append(d_outputs_ph)
        self.losses.append(loss)
        self.greedy_predictions.append(predictions)

    with tf.name_scope('Loss'):
        self.loss = tf.reduce_mean(self.losses)
    self.updates = self.get_updates(self.loss)
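# A hedged illustration of how this graph might be fed at training time, following the
# sample-input comment above. The POS/word-type feature values and the sess/model
# variable names are hypothetical:
#
#   feed_dict = {
#       model.e_inputs_ph: [[1, 40, 44, 0, 0]],
#       model.pos_inputs_ph: [[3, 5, 2, 0, 0]],      # only used when use_pos is True
#       model.wtype_inputs_ph: [[1, 1, 2, 0, 0]],    # only used when use_wtype is True
#   }
#   for ph in model.d_outputs_ph:                    # one placeholder per target column
#       feed_dict[ph] = [[2, 0, 0]]                  # indices pointing into the encoder input
#   step_loss, _ = sess.run([model.loss, model.updates], feed_dict)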