def _create_loss(self):
    """Build the per-bucket outputs and losses of the attention seq2seq model.

    The results of ``seq2seq.model_with_buckets`` are stored on
    ``self.outputs`` and ``self.losses``.  When ``self.fw_only`` is truthy the
    decoder is built with ``feed_previous=True`` and, if an output projection
    is set, every bucket's outputs are projected with it.  The time spent
    building the graph is printed at the end.
    """
    print('Creating loss... \nIt might take a couple of minutes depending on how many buckets you have.')
    started_at = time.time()
    decode_only = bool(self.fw_only)

    def _seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        # Thin adapter binding the model-wide settings to the seq2seq builder.
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            self.cell,
            num_encoder_symbols=config.ENC_VOCAB,
            num_decoder_symbols=config.DEC_VOCAB,
            embedding_size=config.HIDDEN_SIZE,
            output_projection=self.output_projection,
            feed_previous=do_decode)

    # Both modes build the same bucketed graph; only feed_previous differs.
    self.outputs, self.losses = seq2seq.model_with_buckets(
        self.encoder_inputs,
        self.decoder_inputs,
        self.targets,
        self.decoder_masks,
        config.BUCKETS,
        lambda enc, dec: _seq2seq_f(enc, dec, decode_only),
        softmax_loss_function=self.softmax_loss_function)

    # If we use output projection, we need to project outputs for decoding.
    if decode_only and self.output_projection:
        proj_w = self.output_projection[0]
        proj_b = self.output_projection[1]
        for bucket in xrange(len(config.BUCKETS)):
            projected = []
            for output in self.outputs[bucket]:
                projected.append(tf.matmul(output, proj_w) + proj_b)
            self.outputs[bucket] = projected

    print('Time:', time.time() - started_at)
def __init__(self,
             source_vocab_size,
             target_vocab_size,
             buckets,
             size,
             num_layers,
             max_gradient_norm,
             batch_size,
             learning_rate,
             learning_rate_decay_factor,
             wordEmbedding=None,
             num_samples=-1,
             embedding_size=100,
             forward_only=False,
             beam_search=False,
             beam_size=10,
             category=6,
             use_emb=False,
             use_imemory=False,
             use_ememory=False,
             emotion_size=100,
             imemory_size=256,
             dtype=tf.float32):
    """Create the bucketed attention seq2seq model with emotion inputs.

    Args:
      source_vocab_size: number of encoder symbols.
      target_vocab_size: number of decoder symbols; also the size of the
        output projection.
      buckets: list of (input_length, output_length) pairs.  The last bucket
        is used to size the placeholders, so it must be the largest.
      size: number of units in each GRU layer.
      num_layers: number of GRU layers; a MultiRNNCell is built when > 1.
      max_gradient_norm: global norm used to clip the gradients.
      batch_size: stored on the instance; not used while building the graph.
      learning_rate: initial value of the (non-trainable) learning rate.
      learning_rate_decay_factor: factor applied by
        ``learning_rate_decay_op``.
      num_samples: number of samples for sampled softmax.  Sampled softmax is
        only used when ``0 < num_samples < target_vocab_size``.
      forward_only: if set, the backward pass is not constructed and the
        decoder is built with ``feed_previous=True``.
      beam_search: together with ``forward_only`` selects
        ``seq2seq.decode_model_with_buckets``; in that mode ``self.losses``
        is not set.
      wordEmbedding, embedding_size, beam_size, category, use_emb,
      use_imemory, use_ememory, emotion_size, imemory_size, dtype: forwarded
        unchanged to ``seq2seq.embedding_attention_seq2seq`` (``category`` as
        ``emotion_category``); their exact semantics are defined there.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(
        float(learning_rate), trainable=False, dtype=dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # The output projection is needed in every configuration, so it is built
    # once here instead of being duplicated in both branches.
    w_t = tf.get_variable(
        "proj_w", [self.target_vocab_size, size], dtype=dtype)
    w = tf.transpose(w_t)
    proj_b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype)
    output_projection = (w, proj_b)

    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if 0 < num_samples < self.target_vocab_size:

        def sampled_loss(inputs, labels):
            labels = tf.reshape(labels, [-1, 1])
            # We need to compute the sampled_softmax_loss using 32bit floats
            # to avoid numerical instabilities.
            local_w_t = tf.cast(w_t, tf.float32)
            local_b = tf.cast(proj_b, tf.float32)
            local_inputs = tf.cast(inputs, tf.float32)
            # Keyword arguments: the positional order of `inputs`/`labels`
            # differs between TensorFlow releases, so naming them keeps the
            # call correct regardless of the installed version.
            return tf.cast(
                tf.nn.sampled_softmax_loss(
                    weights=local_w_t,
                    biases=local_b,
                    inputs=local_inputs,
                    labels=labels,
                    num_sampled=num_samples,
                    num_classes=self.target_vocab_size),
                dtype)

        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    def create_rnn_cell():
        return tf.contrib.rnn.GRUCell(size)

    encoder_cell = create_rnn_cell()
    if num_layers > 1:
        encoder_cell = tf.contrib.rnn.MultiRNNCell(
            [create_rnn_cell() for _ in range(num_layers)])
    # The decoder uses the very same cell object as the encoder.
    decoder_cell = encoder_cell
    print('===ok=====')

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, decoder_emotions,
                  do_decode):
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            decoder_emotions,
            decoder_cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=embedding_size,
            emotion_category=category,
            emotion_size=emotion_size,
            imemory_size=imemory_size,
            wordEmbedding=wordEmbedding,
            use_emb=use_emb,
            use_imemory=use_imemory,
            use_ememory=use_ememory,
            output_projection=output_projection,
            initial_state_attention=True,
            feed_previous=do_decode,
            dtype=dtype,
            beam_search=beam_search,
            beam_size=beam_size)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    self.target_weights1 = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(dtype, shape=[None],
                           name="weight{0}".format(i)))
        self.target_weights1.append(
            tf.placeholder(dtype, shape=[None],
                           name="weight1{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]
    self.decoder_emotions = tf.placeholder(
        tf.int32, shape=[None], name="decoder_emotion")

    # Training outputs and losses.
    if forward_only and beam_search:
        (self.outputs, self.beam_results, self.beam_symbols,
         self.beam_parents) = seq2seq.decode_model_with_buckets(
             self.encoder_inputs,
             self.decoder_inputs,
             targets,
             self.target_weights,
             self.decoder_emotions,
             buckets,
             lambda x, y, z: seq2seq_f(x, y, z, True),
             softmax_loss_function=softmax_loss_function)
    else:
        # Greedy decoding and training build the same graph; only
        # feed_previous differs.
        do_decode = bool(forward_only)
        self.outputs, self.losses, self.ppxes = seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            self.target_weights1,
            self.decoder_emotions,
            buckets,
            lambda x, y, z: seq2seq_f(x, y, z, do_decode),
            softmax_loss_function=softmax_loss_function,
            use_imemory=use_imemory,
            use_ememory=use_ememory)

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        # `bucket_id` rather than `b`, so the loop can never rebind a name
        # captured by the loss closure above.
        for bucket_id in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[bucket_id], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    # Variables whose names mention neither emotion nor memory go to the
    # pre-training saver; every other variable is listed in `initial_var`.
    excluded_tags = ('Emotion', 'emotion', 'memory', 'Memory')
    self.pretrain_var = [
        var for var in tf.trainable_variables()
        if not any(tag in var.name for tag in excluded_tags)
    ]
    self.initial_var = [
        var for var in tf.all_variables() if var not in self.pretrain_var
    ]
    self.pretrain_saver = tf.train.Saver(
        self.pretrain_var, write_version=tf.train.SaverDef.V2)
    self.saver = tf.train.Saver(tf.all_variables(),
                                write_version=tf.train.SaverDef.V2,
                                max_to_keep=200)
def __init__(self,
             source_vocab_size,
             target_vocab_size,
             buckets,
             dummy_set,
             size,
             num_layers,
             max_gradient_norm,
             batch_size,
             learning_rate,
             learning_rate_decay_factor,
             fixed_rate,
             weibo_rate,
             qa_rate,
             use_lstm=False,
             num_samples=512,
             forward_only=False,
             scope_name='seq2seq',
             dtype=tf.float32):
    """Create the seq2seq model used for reinforcement learning.

    The whole graph is built inside ``tf.variable_scope(scope_name)``.

    Args:
      source_vocab_size: number of encoder symbols.
      target_vocab_size: number of decoder symbols.
      buckets: list of (input_length, output_length) pairs; the last bucket
        sizes the placeholders and must be the largest.
      dummy_set: stored as ``self.dummy_dialogs``.
      size: number of units per layer; also used as the embedding size.
      num_layers: number of layers in the model.
      max_gradient_norm: global norm used to clip the gradients.
      batch_size: stored on the instance.
      learning_rate: initial value of the (non-trainable) learning rate.
      learning_rate_decay_factor: factor applied by
        ``learning_rate_decay_op``.
      fixed_rate, weibo_rate, qa_rate: stored on the instance; not used
        while building the graph.
      use_lstm: if true, BasicLSTMCell is used instead of GRUCell.
      num_samples: number of samples for sampled softmax, used only when
        ``0 < num_samples < target_vocab_size``.
      forward_only: accepted for interface compatibility; the backward pass
        is always constructed here.
      scope_name: variable scope for the graph; also the name prefix used
        to select the variables handed to the saver.
      dtype: dtype of the learning rate, projection variables and weight
        placeholders.
    """
    self.scope_name = scope_name
    with tf.variable_scope(self.scope_name):
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.fixed_rate = fixed_rate
        self.weibo_rate = weibo_rate
        self.qa_rate = qa_rate
        self.learning_rate = tf.Variable(
            float(learning_rate), trainable=False, dtype=dtype)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)
        self.dummy_dialogs = dummy_set

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary
        # size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
            w_t = tf.get_variable(
                "proj_w", [self.target_vocab_size, size], dtype=dtype)
            w = tf.transpose(w_t)
            proj_b = tf.get_variable(
                "proj_b", [self.target_vocab_size], dtype=dtype)
            output_projection = (w, proj_b)

            def sampled_loss(inputs, labels):
                labels = tf.reshape(labels, [-1, 1])
                # We need to compute the sampled_softmax_loss using 32bit
                # floats to avoid numerical instabilities.
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(proj_b, tf.float32)
                local_inputs = tf.cast(inputs, tf.float32)
                return tf.cast(
                    tf.nn.sampled_softmax_loss(local_w_t, local_b,
                                               local_inputs, labels,
                                               num_samples,
                                               self.target_vocab_size),
                    dtype)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        single_cell = tf.nn.rnn_cell.GRUCell(size)
        if use_lstm:
            single_cell = tf.nn.rnn_cell.BasicLSTMCell(size)
        cell = single_cell
        if num_layers > 1:
            cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return rl_seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=size,
                output_projection=output_projection,
                feed_previous=do_decode,
                dtype=dtype)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(dtype, shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = self.decoder_inputs[1:]

        # For reinforcement learning: both switches are fed at run time.
        self.force_dec_input = tf.placeholder(tf.bool, name="force_dec_input")
        self.en_output_proj = tf.placeholder(tf.bool, name="en_output_proj")

        # Training outputs and losses.  feed_previous is a tensor that is the
        # negation of `force_dec_input`.
        self.outputs, self.losses, self.encoder_state = (
            rl_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_f(
                    x, y, tf.select(self.force_dec_input, False, True)),
                softmax_loss_function=softmax_loss_function))

        # If we use output projection, we need to project outputs for
        # decoding.  Without this guard the loop indexes into `None`
        # whenever sampled softmax is disabled.
        if output_projection is not None:
            proj_w, proj_bias = output_projection
            for bucket_id in xrange(len(buckets)):
                self.outputs[bucket_id] = [
                    control_flow_ops.cond(
                        self.en_output_proj,
                        lambda output=output: (
                            tf.matmul(output, proj_w) + proj_bias),
                        lambda output=output: output)
                    for output in self.outputs[bucket_id]
                ]

        # Gradients and SGD update operation for training the model.
        self.tvars = tf.trainable_variables()
        self.gradient_norms = []
        self.updates = []
        # One scalar advantage placeholder per bucket, subtracted from that
        # bucket's loss before differentiating.
        self.advantage = [
            tf.placeholder(tf.float32, name="advantage_%i" % i)
            for i in range(len(buckets))
        ]
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_id in xrange(len(buckets)):
            adjusted_losses = tf.sub(self.losses[bucket_id],
                                     self.advantage[bucket_id])
            gradients = tf.gradients(adjusted_losses, self.tvars)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, self.tvars),
                                    global_step=self.global_step))

        # Only variables whose name starts with this model's scope are saved.
        all_variables = [
            k for k in tf.global_variables()
            if k.name.startswith(self.scope_name)
        ]
        self.saver = tf.train.Saver(all_variables)
def __init__(self,
             source_vocab_size,
             target_vocab_size,
             buckets,
             size,
             num_layers,
             max_gradient_norm,
             batch_size,
             embedding_size,
             learning_rate,
             learning_rate_decay_factor,
             use_lstm=False,
             num_samples=2048,
             forward_only=False,
             dtype=tf.float32):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I is the maximum input length
        handled by the bucket and O the maximum output length.  The list is
        assumed sorted, e.g. [(2, 4), (8, 16)]; the last bucket sizes the
        placeholders.
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training; stored on
        the instance and not used while building the graph.
      embedding_size: passed as ``embedding_size`` to the seq2seq function.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when
        needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the
        model; outputs are projected and turned into log-probabilities.
      dtype: passed as ``dtype`` to the seq2seq function.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # Sampled softmax (and with it the output projection) is only set up
    # when fewer classes are sampled than the vocabulary holds.
    output_projection = None
    softmax_loss_function = None
    use_sampled_softmax = (
        num_samples > 0 and num_samples < self.target_vocab_size)
    if use_sampled_softmax:
        w = tf.get_variable("proj_w", [size, self.target_vocab_size])
        w_t = tf.transpose(w)
        b = tf.get_variable("proj_b", [self.target_vocab_size])
        output_projection = (w, b)

        def sampled_loss(inputs, labels):
            column_labels = tf.reshape(labels, [-1, 1])
            return tf.nn.sampled_softmax_loss(w_t, b, inputs, column_labels,
                                              num_samples,
                                              self.target_vocab_size)

        softmax_loss_function = sampled_loss

    # Recurrent cell: GRU by default, LSTM on request, stacked if needed.
    if use_lstm:
        single_cell = tf.nn.rnn_cell.GRUCell(size)
        single_cell = tf.nn.rnn_cell.BasicLSTMCell(size, state_is_tuple=True)
    else:
        single_cell = tf.nn.rnn_cell.GRUCell(size)
    if num_layers > 1:
        cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers,
                                           state_is_tuple=True)
    else:
        cell = single_cell

    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        # Embedding + attention seq2seq; `do_decode` is handed through
        # positionally as the fourth argument.
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            cell,
            do_decode,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=embedding_size,
            output_projection=output_projection,
            dtype=dtype)

    # Input feeds; the last bucket is the biggest one.
    max_encoder_len = buckets[-1][0]
    max_decoder_len = buckets[-1][1] + 1
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    self.decode = []
    for step in xrange(max_encoder_len):
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder_{0}".format(step)))
    for step in xrange(max_decoder_len):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder_{0}".format(step)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight_{0}".format(step)))
        # One boolean switch per decoder step.
        self.decode.append(
            tf.placeholder(tf.bool, name='decode_{0}'.format(step)))

    # Targets are the decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    # Both modes build the identical bucketed graph.
    self.states, self.outputs, self.losses = seq2seq.model_with_buckets(
        self.encoder_inputs,
        self.decoder_inputs,
        targets,
        self.target_weights,
        buckets,
        lambda x, y: seq2seq_f(x, y, self.decode),
        softmax_loss_function=softmax_loss_function)

    if forward_only:
        # Project the outputs when a projection exists, then convert every
        # bucket's outputs to log-probabilities.
        if output_projection is not None:
            for bucket_id in xrange(len(buckets)):
                self.outputs[bucket_id] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1]
                    for output in self.outputs[bucket_id]
                ]
        for bucket_id in xrange(len(buckets)):
            self.outputs[bucket_id] = [
                tf.nn.log_softmax(output)
                for output in self.outputs[bucket_id]
            ]

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_id in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[bucket_id], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    self.saver = tf.train.Saver(tf.all_variables())
def __init__(self,
             encoder_masks,
             encoder_inputs_tensor,
             decoder_inputs,
             target_weights,
             target_vocab_size,
             buckets,
             target_embedding_size,
             attn_num_layers,
             attn_num_hidden,
             forward_only,
             use_gru):
    """Create the model: a bidirectional LSTM encoder feeding an attention decoder.

    Args:
      encoder_masks: sequence of per-step masks; the first ``seq_length``
        entries are multiplied element-wise with the encoder outputs.
      encoder_inputs_tensor: encoder inputs handed to ``model_with_buckets``
        and from there to the bidirectional RNN.
      decoder_inputs: list of decoder inputs; the targets are this list
        shifted by one.
      target_weights: weights handed to ``model_with_buckets``.
      target_vocab_size: size of the target vocabulary; used both as the
        number of decoder symbols and as the decoder output size.
      buckets: a list of pairs (I, O) handed to ``model_with_buckets``.
      target_embedding_size: embedding size of the decoder.
      attn_num_layers: number of decoder layers; the encoder LSTMs use
        ``attn_num_layers * attn_num_hidden`` units per direction.
      attn_num_hidden: number of units in each decoder layer.
      forward_only: if set, the decoder is built with
        ``feed_previous=True``.
      use_gru: if true, GRU cells are used in the decoder instead of LSTM
        cells.
    """
    self.encoder_inputs_tensor = encoder_inputs_tensor
    self.decoder_inputs = decoder_inputs
    self.target_weights = target_weights
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.encoder_masks = encoder_masks

    # Create the internal multi-layer cell for our RNN.
    def make_cell():
        if use_gru:
            return tf.contrib.rnn.GRUCell(attn_num_hidden)
        return tf.contrib.rnn.BasicLSTMCell(
            attn_num_hidden, forget_bias=0.0, state_is_tuple=False)

    if use_gru:
        print("using GRU CELL in decoder")
    if attn_num_layers > 1:
        # Each layer gets its own cell object: reusing one instance for all
        # layers is rejected (or silently shares weights) on TensorFlow
        # releases newer than 1.0.
        cell = tf.contrib.rnn.MultiRNNCell(
            [make_cell() for _ in range(attn_num_layers)],
            state_is_tuple=False)
    else:
        cell = make_cell()

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(lstm_inputs, decoder_inputs, seq_length, do_decode):
        num_hidden = attn_num_layers * attn_num_hidden
        # Forward direction cell
        lstm_fw_cell = tf.contrib.rnn.BasicLSTMCell(
            num_hidden, forget_bias=0.0, state_is_tuple=False)
        # Backward direction cell
        lstm_bw_cell = tf.contrib.rnn.BasicLSTMCell(
            num_hidden, forget_bias=0.0, state_is_tuple=False)

        pre_encoder_inputs, output_state_fw, output_state_bw = (
            tf.contrib.rnn.static_bidirectional_rnn(
                lstm_fw_cell,
                lstm_bw_cell,
                lstm_inputs,
                initial_state_fw=None,
                initial_state_bw=None,
                dtype=tf.float32,
                sequence_length=None,
                scope=None))

        # Apply the per-step masks to the encoder outputs.
        encoder_inputs = [
            e * f
            for e, f in zip(pre_encoder_inputs, encoder_masks[:seq_length])
        ]
        # Each step holds forward and backward outputs side by side, hence
        # the `num_hidden * 2` feature width.
        top_states = [
            array_ops.reshape(e, [-1, 1, num_hidden * 2])
            for e in encoder_inputs
        ]
        attention_states = array_ops.concat(top_states, 1)
        initial_state = tf.concat(
            axis=1, values=[output_state_fw, output_state_bw])
        outputs, _, attention_weights_history = embedding_attention_decoder(
            decoder_inputs,
            initial_state,
            attention_states,
            cell,
            num_symbols=target_vocab_size,
            embedding_size=target_embedding_size,
            num_heads=1,
            output_size=target_vocab_size,
            output_projection=None,
            feed_previous=do_decode,
            initial_state_attention=False,
            attn_num_hidden=attn_num_hidden)
        return outputs, attention_weights_history

    # Our targets are decoder inputs shifted by one.
    targets = decoder_inputs[1:]

    # None selects the default loss inside model_with_buckets.
    softmax_loss_function = None

    # Training outputs and losses.  Decoding and training build the same
    # graph; only feed_previous differs.
    do_decode = bool(forward_only)
    self.output, self.loss, self.attention_weights_history = (
        model_with_buckets(
            encoder_inputs_tensor,
            decoder_inputs,
            targets,
            self.target_weights,
            buckets,
            lambda x, y, z: seq2seq_f(x, y, z, do_decode),
            softmax_loss_function=softmax_loss_function))
def __init__(self,
             source_vocab_size,
             target_vocab_size,
             buckets,
             size,
             num_layers,
             max_gradient_norm,
             batch_size,
             learning_rate,
             learning_rate_decay_factor,
             use_lstm=False,
             num_samples=512,
             forward_only=False,
             config=None,
             corrective_tokens_mask=None):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input
        length that will be processed in that bucket, and O specifies
        maximum output length.  Training instances that have inputs longer
        than I or outputs longer than O will be pushed to the next bucket
        and padded accordingly.  We assume that the list is sorted, e.g.,
        [(2, 4), (8, 16)].
      size: number of units in each layer of the model; also used as the
        embedding size.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when
        needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the
        model.
      config: optional configuration object; only its ``use_rms_prop``
        attribute is read here.  Plain SGD is used when it is missing.
      corrective_tokens_mask: optional sequence of ``target_vocab_size``
        values; an all-zero mask is used when it is None or empty.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)
    self.config = config

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in range(buckets[-1][0]):  # Last bucket is the biggest one.
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder{0}".format(i)))
    for i in range(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight{0}".format(i)))

    # One hot encoding of corrective tokens.  The mask is tested with
    # `is None`/`len` because the truth value of a NumPy array is ambiguous
    # and raises.
    if corrective_tokens_mask is None or len(corrective_tokens_mask) == 0:
        mask_values = np.zeros(self.target_vocab_size)
    else:
        mask_values = corrective_tokens_mask
    corrective_tokens_tensor = tf.constant(
        mask_values, shape=[self.target_vocab_size], dtype=tf.float32)
    # NOTE(review): this stacked tensor is not read anywhere in this
    # constructor; it is kept so the constructed graph stays unchanged.
    batched_corrective_tokens = tf.stack(
        [corrective_tokens_tensor] * self.batch_size)
    self.batch_corrective_tokens_mask = batch_corrective_tokens_mask = \
        tf.placeholder(
            tf.float32,
            shape=[None, None],
            name="corrective_tokens")

    # Our targets are decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary
    # size.
    if 0 < num_samples < self.target_vocab_size:
        w = tf.get_variable("proj_w", [size, self.target_vocab_size])
        w_t = tf.transpose(w)
        proj_b = tf.get_variable("proj_b", [self.target_vocab_size])
        output_projection = (w, proj_b)

        def sampled_loss(labels, inputs):
            labels = tf.reshape(labels, [-1, 1])
            return tf.nn.sampled_softmax_loss(
                weights=w_t,
                biases=proj_b,
                labels=labels,
                inputs=inputs,
                num_sampled=num_samples,
                num_classes=self.target_vocab_size)

        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    def make_cell():
        if use_lstm:
            return core_rnn_cell_impl.BasicLSTMCell(size)
        return core_rnn_cell_impl.GRUCell(size)

    if num_layers > 1:
        # A separate cell object per layer: one instance must not be reused
        # across the layers of a MultiRNNCell.
        cell = core_rnn_cell_impl.MultiRNNCell(
            [make_cell() for _ in range(num_layers)])
    else:
        cell = make_cell()

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        """Build the embedding attention seq2seq graph for one bucket.

        :param encoder_inputs: list of length equal to the input bucket
            length of 1-D tensors (of length equal to the batch size) whose
            elements consist of the token index of each sample in the batch
            at a given index in the input.
        :param decoder_inputs: list of decoder input tensors for the bucket.
        :param do_decode: if true, the decoder is fed its previous output
            and the input bias is applied when choosing the next token.
        :return: whatever ``seq2seq.embedding_attention_seq2seq`` returns.
        """
        if do_decode:
            # Modify bias here to bias the model towards selecting words
            # present in the input sentence.
            input_bias = self.build_input_bias(
                encoder_inputs, batch_corrective_tokens_mask)

            # The seq2seq module accepts a loop-function factory so this
            # decoding function can be injected.
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=size,
                output_projection=output_projection,
                feed_previous=do_decode,
                loop_fn_factory=(
                    apply_input_bias_and_extract_argmax_fn_factory(
                        input_bias)))
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=size,
            output_projection=output_projection,
            feed_previous=do_decode)

    # Training outputs and losses.  Both modes build the same bucketed
    # graph; only `do_decode` differs.
    do_decode = bool(forward_only)
    self.outputs, self.losses = seq2seq.model_with_buckets(
        self.encoder_inputs,
        self.decoder_inputs,
        targets,
        self.target_weights,
        buckets,
        lambda x, y: seq2seq_f(x, y, do_decode),
        softmax_loss_function=softmax_loss_function)

    if forward_only and output_projection is not None:
        for bucket_id, bucket in enumerate(buckets):
            # We need to apply the same input bias used during model
            # evaluation when decoding.
            input_bias = self.build_input_bias(
                self.encoder_inputs[:bucket[0]],
                batch_corrective_tokens_mask)
            self.outputs[bucket_id] = [
                project_and_apply_input_bias(output, output_projection,
                                             input_bias)
                for output in self.outputs[bucket_id]
            ]

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        # `config` defaults to None, so read the flag defensively and fall
        # back to plain SGD.
        if getattr(self.config, "use_rms_prop", False):
            opt = tf.train.RMSPropOptimizer(0.001)
        else:
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_id in range(len(buckets)):
            gradients = tf.gradients(self.losses[bucket_id], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    self.saver = tf.train.Saver(tf.global_variables())
def __init__(self,
             source_vocab_size,
             target_vocab_size,
             buckets,
             size,
             num_layers,
             max_gradient_norm,
             batch_size,
             learning_rate,
             learning_rate_decay_factor,
             use_lstm=False,
             num_samples=512,
             forward_only=False,
             dtype=tf.float32):
    """Create the bucketed attention seq2seq model.

    Args:
      source_vocab_size: number of encoder symbols.
      target_vocab_size: number of decoder symbols.
      buckets: list of (input_length, output_length) pairs; the last bucket
        sizes the placeholders and must be the largest.
      size: number of units per layer; also used as the embedding size.
      num_layers: number of layers in the encoder and in the decoder.
      max_gradient_norm: global norm used to clip the gradients.
      batch_size: stored on the instance; not used while building the graph.
      learning_rate: initial value of the (non-trainable) learning rate.
      learning_rate_decay_factor: factor applied by
        ``learning_rate_decay_op``.
      use_lstm: if true, BasicLSTMCell is used instead of GRUCell.
      num_samples: number of samples for sampled softmax, used only when
        ``0 < num_samples < target_vocab_size``.
      forward_only: if set, no backward pass is built and the decoder is fed
        its previous output.
      dtype: dtype of the learning rate, projection variables and weight
        placeholders; also passed to the seq2seq function.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(
        float(learning_rate), trainable=False, dtype=dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # Sampled softmax needs an output projection; both stay None otherwise.
    output_projection = None
    softmax_loss_function = None
    if 0 < num_samples < self.target_vocab_size:
        w_t = tf.get_variable(
            "proj_w", [self.target_vocab_size, size], dtype=dtype)
        w = tf.transpose(w_t)
        b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype)
        output_projection = (w, b)

        def sampled_loss(labels, logits):
            # The loss is computed in float32 and cast back to `dtype`.
            column_labels = tf.reshape(labels, [-1, 1])
            weights32 = tf.cast(w_t, tf.float32)
            biases32 = tf.cast(b, tf.float32)
            inputs32 = tf.cast(logits, tf.float32)
            loss = tf.nn.sampled_softmax_loss(
                weights=weights32,
                biases=biases32,
                labels=column_labels,
                inputs=inputs32,
                num_sampled=num_samples,
                num_classes=self.target_vocab_size)
            return tf.cast(loss, dtype)

        softmax_loss_function = sampled_loss

    def single_cell():
        # A fresh cell object on every call.
        if use_lstm:
            return tf.contrib.rnn.BasicLSTMCell(size)
        return tf.contrib.rnn.GRUCell(size)

    # Decoder and encoder get independent cells (and independent stacks).
    cell = single_cell()
    encoder_cell = single_cell()
    if num_layers > 1:
        cell = tf.contrib.rnn.MultiRNNCell(
            [single_cell() for _ in range(num_layers)])
        encoder_cell = tf.contrib.rnn.MultiRNNCell(
            [single_cell() for _ in range(num_layers)])

    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            encoder_cell,
            cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=size,
            output_projection=output_projection,
            feed_previous=do_decode,
            dtype=dtype)

    # Input feeds; the last bucket is the biggest one.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for step in xrange(buckets[-1][0]):
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder{0}".format(step)))
    for step in xrange(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(step)))
        self.target_weights.append(
            tf.placeholder(dtype, shape=[None],
                           name="weight{0}".format(step)))

    # Targets are the decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    # NOTE(review): decoding goes through the project's `seq2seq` module
    # while training goes through `tf.contrib.legacy_seq2seq` -- confirm
    # that the difference is intentional.
    do_decode = bool(forward_only)
    if do_decode:
        build_model = seq2seq.model_with_buckets
    else:
        build_model = tf.contrib.legacy_seq2seq.model_with_buckets
    self.outputs, self.losses = build_model(
        self.encoder_inputs,
        self.decoder_inputs,
        targets,
        self.target_weights,
        buckets,
        lambda x, y: seq2seq_f(x, y, do_decode),
        softmax_loss_function=softmax_loss_function)

    # If we use output projection, we need to project outputs for decoding.
    if do_decode and output_projection is not None:
        for bucket_id in xrange(len(buckets)):
            self.outputs[bucket_id] = [
                tf.matmul(output, output_projection[0]) +
                output_projection[1]
                for output in self.outputs[bucket_id]
            ]

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_id in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[bucket_id], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    self.saver = tf.train.Saver(tf.global_variables())
def __init__(self, source_target_vocab_size, buckets, size, num_layers,
             max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, scheduling_rate,
             scheduling_rate_decay_factor, num_samples=4096,
             forward_only=False):
    """Create the model.

    Args:
      source_target_vocab_size: size of the source/target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      scheduling_rate: initial value of the non-trainable scheduling-rate
        variable that is forwarded to seq2seq.embedding_attention_seq2seq
        (presumably the scheduled-sampling probability; confirm against the
        project's seq2seq module).
      scheduling_rate_decay_factor: factor the scheduling rate is multiplied
        by each time `scheduling_rate_decay_op` is run.
      num_samples: number of classes sampled by the sampled-softmax loss; the
        sampled loss is only used when 0 < num_samples < vocabulary size.
      forward_only: if set, we do not construct the backward pass in the model.
    """
    self.source_target_vocab_size = source_target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.scheduling_rate = tf.Variable(float(scheduling_rate), trainable=False)
    self.scheduling_rate_decay_op = self.scheduling_rate.assign(
        self.scheduling_rate * scheduling_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.source_target_vocab_size:
        with tf.device("/cpu:0"):
            w = tf.get_variable("proj_w", [size, self.source_target_vocab_size])
            w_t = tf.transpose(w)
            proj_bias = tf.get_variable("proj_b", [self.source_target_vocab_size])
        output_projection = (w, proj_bias)

        def sampled_loss(inputs, labels):
            with tf.device("/cpu:0"):
                labels = tf.reshape(labels, [-1, 1])
                # Keyword arguments make the call independent of the
                # positional order of `inputs`/`labels`, which differs
                # between TensorFlow releases.
                return tf.nn.sampled_softmax_loss(
                    weights=w_t,
                    biases=proj_bias,
                    inputs=inputs,
                    labels=labels,
                    num_sampled=num_samples,
                    num_classes=self.source_target_vocab_size)

        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    single_cell = tf.nn.rnn_cell.BasicLSTMCell(size)
    cell = single_cell
    if num_layers > 1:
        # NOTE(review): the same cell object is repeated for every layer;
        # newer TensorFlow releases may reject this — confirm against the
        # TensorFlow version this file targets.
        cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)

    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs, decoder_inputs, cell,
            num_encoder_symbols=source_target_vocab_size,
            num_decoder_symbols=source_target_vocab_size,
            embedding_size=size,
            output_projection=output_projection,
            feed_previous=do_decode,
            scheduling_rate=self.scheduling_rate)

    # Feeds for inputs; the last bucket determines how many are needed.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight{0}".format(i)))

    # Targets are the decoder inputs shifted by one step.
    targets = [
        self.decoder_inputs[i + 1]
        for i in xrange(len(self.decoder_inputs) - 1)
    ]

    if forward_only:
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, buckets,
            lambda x, y: seq2seq_f(x, y, True),
            softmax_loss_function=softmax_loss_function)
        # Project the decoder outputs to vocabulary logits for decoding.
        if output_projection is not None:
            for bucket_id in xrange(len(buckets)):
                self.outputs[bucket_id] = [
                    tf.matmul(output, output_projection[0]) + output_projection[1]
                    for output in self.outputs[bucket_id]
                ]
    else:
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, buckets,
            lambda x, y: seq2seq_f(x, y, False),
            softmax_loss_function=softmax_loss_function)

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_id in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[bucket_id], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    self.saver = tf.train.Saver(tf.all_variables())
def __init__(self, vocab_size, buckets, size, num_layers, batch_size, mode):
    """Build the seq2seq graph for one of the modes 'MLE', 'TEST' or 'RL'.

    Args:
        vocab_size: vocabulary size shared by encoder and decoder.
        buckets: list of (input_length, output_length) pairs; the last pair
            sizes the placeholder lists.
        size: number of GRU units; also used as the embedding width.
        num_layers: number of stacked GRU cells (stacking applied when > 1).
        batch_size: stored on the instance; not used while building the graph.
        mode: 'MLE' builds teacher-forced outputs plus SGD updates, 'TEST'
            builds outputs with the decoder fed its own predictions, 'RL'
            builds sampled decoding plus reward-weighted updates. For any
            other value no outputs, losses or updates are created.
    """
    self.vocab_size = vocab_size
    self.buckets =buckets
    # `size` is both the RNN unit count and the word-embedding dimension.
    self.size = size
    self.num_layers = num_layers
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(0.5, trainable=False)
    self.mode = mode
    self.dummy_reply = ["what ?", "yeah .", "you are welcome ! ! ! !"]

    # learning rate decay: each run multiplies the learning rate by 0.99
    self.learning_rate_decay = self.learning_rate.assign(self.learning_rate * 0.99)

    # input for Reinforcement part
    # loop_or_not selects, inside the RL loop function, between the sampled
    # token and the fed decoder input as the next decoder step's input.
    self.loop_or_not = tf.placeholder(tf.bool)
    self.reward = tf.placeholder(tf.float32, [None])
    # The reward only scales the loss; no gradient flows through it.
    batch_reward = tf.stop_gradient(self.reward)
    # One slot per bucket, filled with the token indices sampled in RL mode.
    self.RL_index = [None for _ in self.buckets]

    # projection function
    w_t = tf.get_variable('proj_w', [self.vocab_size, self.size])
    w = tf.transpose(w_t)
    b = tf.get_variable('proj_b', [self.vocab_size])
    output_projection = (w, b)

    def sample_loss(labels, inputs):
        # Sampled-softmax loss with a fixed 512 sampled classes.
        labels = tf.reshape(labels, [-1, 1])
        local_w_t = tf.cast(w_t, tf.float32)
        local_b = tf.cast(b, tf.float32)
        local_inputs = tf.cast(inputs, tf.float32)
        return tf.cast(tf.nn.sampled_softmax_loss(weights = local_w_t, biases = local_b, inputs = local_inputs, labels = labels, num_sampled = 512, num_classes = self.vocab_size), dtype = tf.float32)
    softmax_loss_function = sample_loss

    #FIXME add RL function
    def seq2seq_multi(encoder_inputs, decoder_inputs, mode):
        # NOTE: `cell` is resolved when this function is called; it is
        # assigned further down, before model_with_buckets invokes this.
        embedding = tf.get_variable("embedding", [self.vocab_size, self.size])
        loop_function_RL = None
        if mode == 'MLE':
            feed_previous = False
        elif mode == 'TEST':
            feed_previous = True
        # need loop_function
        elif mode == 'RL':
            feed_previous = True

            def loop_function_RL(prev, i):
                # Project the previous decoder output to vocabulary logits
                # and sample one token index per row.
                prev = tf.matmul(prev, output_projection[0]) + output_projection[1]
                prev_index = tf.multinomial(tf.log(tf.nn.softmax(prev)), 1)
                if i == 1:
                    # First sampled step: claim the first still-empty slot and
                    # remember it in self.index for the following steps.
                    for index, RL in enumerate(self.RL_index):
                        if RL is None:
                            self.RL_index[index] = prev_index
                            self.index = index
                            break
                else:
                    # Later steps: append the new sample as another column.
                    self.RL_index[self.index] = tf.concat([self.RL_index[self.index], prev_index], axis = 1)
                prev_index = tf.reshape(prev_index, [-1])
                # decide which to be the next time step input
                sample = tf.nn.embedding_lookup(embedding, prev_index)
                from_decoder = tf.nn.embedding_lookup(embedding, decoder_inputs[i])
                return tf.where(self.loop_or_not, sample, from_decoder)
        # NOTE(review): for a mode outside MLE/TEST/RL `feed_previous` is
        # unbound here; the callers below only pass those three modes.
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            cell,
            num_encoder_symbols = self.vocab_size,
            num_decoder_symbols = self.vocab_size,
            embedding_size = self.size,
            output_projection = output_projection,
            feed_previous = feed_previous,
            dtype = tf.float32,
            embedding = embedding,
            loop = loop_function_RL)

    # inputs
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):
        self.encoder_inputs.append(tf.placeholder(tf.int32, shape = [None], name = 'encoder{0}'.format(i)))
    for i in xrange(buckets[-1][1] + 1):
        self.decoder_inputs.append(tf.placeholder(tf.int32, shape = [None], name = 'decoder{0}'.format(i)))
        self.target_weights.append(tf.placeholder(tf.float32, shape = [None], name = 'weight{0}'.format(i)))
    # Targets are the decoder inputs shifted by one step.
    targets = [self.decoder_inputs[i + 1] for i in xrange(len(self.decoder_inputs) - 1)]

    def single_cell():
        return tf.contrib.rnn.GRUCell(self.size)
    cell = single_cell()
    if self.num_layers > 1:
        cell = tf.contrib.rnn.MultiRNNCell([single_cell() for _ in range(self.num_layers)])

    if self.mode == 'MLE':
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets, self.target_weights,
            self.buckets, lambda x, y: seq2seq_multi(x, y, self.mode),
            softmax_loss_function = softmax_loss_function)
        # Project decoder outputs to vocabulary logits.
        for b in xrange(len(self.buckets)):
            self.outputs[b] = [tf.matmul(output, output_projection[0]) + output_projection[1] for output in self.outputs[b]]
        # One SGD update op per bucket, gradients clipped to global norm 5.0.
        self.update = []
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        for b in xrange(len(self.buckets)):
            gradients = tf.gradients(self.losses[b], tf.trainable_variables())
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            self.update.append(optimizer.apply_gradients(zip(clipped_gradients, tf.trainable_variables())))
    elif self.mode == 'TEST':
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets, self.target_weights,
            self.buckets, lambda x, y: seq2seq_multi(x, y, self.mode),
            softmax_loss_function = softmax_loss_function)
        # Project decoder outputs to vocabulary logits.
        for b in xrange(len(self.buckets)):
            self.outputs[b] = [tf.matmul(output, output_projection[0]) + output_projection[1] for output in self.outputs[b]]
    elif self.mode == 'RL':
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets, self.target_weights,
            self.buckets, lambda x, y: seq2seq_multi(x, y, self.mode),
            softmax_loss_function = softmax_loss_function, per_example_loss = True)
        # Project decoder outputs to vocabulary logits.
        for b in xrange(len(self.buckets)):
            self.outputs[b] = [tf.matmul(output, output_projection[0]) + output_projection[1] for output in self.outputs[b]]
        # Sample from the last decoder step of each bucket and append it to
        # that bucket's sampled indices.
        for i, b in enumerate(self.outputs):
            prev_index = tf.multinomial(tf.log(tf.nn.softmax(b[self.buckets[i][1] - 1])), 1)
            self.RL_index[i] = tf.concat([self.RL_index[i], prev_index], axis = 1)
        self.update = []
        # RL updates use a fixed 0.01 step size instead of self.learning_rate.
        optimizer = tf.train.GradientDescentOptimizer(0.01)
        #optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        for b in xrange(len(self.buckets)):
            # Weight the per-example loss by the reward, then average.
            scaled_loss = tf.multiply(self.losses[b], batch_reward)
            self.losses[b] = tf.reduce_mean(scaled_loss)
            gradients = tf.gradients(self.losses[b], tf.trainable_variables())
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            self.update.append(optimizer.apply_gradients(zip(clipped_gradients, tf.trainable_variables())))

    # specify saver
    self.saver = tf.train.Saver(max_to_keep = 2)
def __init__(self, source_vocab_size, target_vocab_size, buckets, size,
             state_size, num_layers, max_gradient_norm, batch_size,
             learning_rate, learning_rate_decay_factor, keep_prob=1.0,
             forward_only=False):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O) giving the maximum input and output
        length handled by each bucket. The last pair sizes the placeholder
        lists, so the list is expected to be sorted, e.g. [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      state_size: size of environment representation.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches; it is used as the static first
        dimension of every placeholder and is forwarded to
        seq2seq.embedding_attention_seq2seq.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      keep_prob: probability of keeping an output unit (i.e. NOT dropping
        it); dropout is only added when building the training graph.
      forward_only: if set, we do not construct the backward pass in the model.
    """
    training = not forward_only

    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.state_size = state_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # Recurrent stack: LSTM, optional output dropout, optional layering.
    stack = rnn_cell.BasicLSTMCell(size)
    if training and keep_prob < 1.0:
        stack = rnn_cell.DropoutWrapper(stack, output_keep_prob=keep_prob)
    if num_layers > 1:
        stack = rnn_cell.MultiRNNCell([stack] * num_layers)

    def seq2seq_f(encoder_inputs, decoder_inputs, decoder_inputs_positions,
                  decoder_inputs_maps, do_decode):
        # Embedding + attention seq2seq with the position/map extensions.
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs, decoder_inputs, stack, source_vocab_size,
            target_vocab_size, batch_size, self.state_size,
            decoder_inputs_positions=decoder_inputs_positions,
            decoder_inputs_maps=decoder_inputs_maps,
            feed_previous=do_decode)

    # Input feeds; the largest (last) bucket decides how many are created.
    max_encoder_len, max_decoder_len = buckets[-1][0], buckets[-1][1]
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    self.decoder_inputs_positions = []
    for step in xrange(max_encoder_len):
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[self.batch_size],
                           name="encoder{0}".format(step)))
    for step in xrange(max_decoder_len + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[self.batch_size],
                           name="decoder{0}".format(step)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[self.batch_size],
                           name="weight{0}".format(step)))
        self.decoder_inputs_positions.append(
            tf.placeholder(tf.int32, shape=[self.batch_size, 3],
                           name="position{0}".format(step)))
    self.decoder_inputs_maps = tf.placeholder(
        tf.int32, shape=[self.batch_size], name="mapNo")

    # Targets are the decoder inputs shifted left by one step.
    targets = self.decoder_inputs[1:]

    # Outputs and losses; decoding feeds back the previous prediction.
    decode = bool(forward_only)
    (self.outputs, self.losses, self.attentions, self.environments,
     predicted_positions) = seq2seq.model_with_buckets(
        self.encoder_inputs, self.decoder_inputs, targets,
        self.target_weights, buckets, self.target_vocab_size,
        lambda x, y, p, m: seq2seq_f(x, y, p, m, decode),
        decoder_inputs_positions=self.decoder_inputs_positions,
        decoder_inputs_maps=self.decoder_inputs_maps)
    # Positions are only exposed when decoding.
    self.positions = predicted_positions if decode else None

    # Clipped Adam updates, one op per bucket, for training only.
    params = tf.trainable_variables()
    if training:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        for bucket_id in xrange(len(buckets)):
            bucket_grads = tf.gradients(self.losses[bucket_id], params)
            clipped, global_norm = tf.clip_by_global_norm(
                bucket_grads, max_gradient_norm)
            self.gradient_norms.append(global_norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped, params),
                                    global_step=self.global_step))

    self.saver = tf.train.Saver(tf.all_variables())
def __init__(self, src_vocab_size, trg_vocab_size, buckets, size, num_layers,
             batch_size, mode, input_keep_prob, output_keep_prob,
             state_keep_prob, beam_search, beam_size,
             schedule_sampling='linear', sampling_decay_rate=0.99,
             sampling_global_step=150000, sampling_decay_steps=500,
             pretrain_vec=None, pretrain_trainable=False,
             length_penalty=None, length_penalty_factor=0.6,
             feed_previous=False):
    """Build the seq2seq graph with optional scheduled sampling and beam search.

    Args:
        src_vocab_size: number of encoder symbols (also the embedding rows).
        trg_vocab_size: number of decoder symbols (output projection size).
        buckets: list of (input_length, output_length) pairs; its last pair
            sizes the placeholder lists. Note that `self.buckets` is
            overwritten with a fixed bucket list before the model is built.
        size: number of GRU units; also the embedding width.
        num_layers: number of stacked GRU cells (stacking applied when > 1).
        batch_size: stored on the instance; not used while building the graph.
        mode: stored on the instance and passed to the inner model function,
            which does not read it.
        input_keep_prob, output_keep_prob, state_keep_prob: keep
            probabilities handed to the DropoutWrapper around the cell.
        beam_search, beam_size, length_penalty, length_penalty_factor:
            forwarded unchanged to seq2seq.embedding_attention_seq2seq.
        schedule_sampling: 'linear', 'exp', 'inverse_sigmoid', or False (the
            string 'False' is accepted too) to disable scheduled sampling.
        sampling_decay_rate, sampling_global_step, sampling_decay_steps:
            parameters of the sampling-probability decay op.
        pretrain_vec: optional array of pretrained embeddings; it is padded
            with zero rows up to `src_vocab_size`.
        pretrain_trainable: whether the non-special-tag pretrained rows are
            trainable.
        feed_previous: forwarded to seq2seq.embedding_attention_seq2seq.

    Raises:
        ValueError: if `schedule_sampling` is not one of the accepted values.
    """
    self.feed_previous = feed_previous
    self.decoder_max_len = tf.placeholder(tf.int32, [None])
    self.src_vocab_size = src_vocab_size
    self.trg_vocab_size = trg_vocab_size
    self.buckets = buckets
    # `size` is both the RNN unit count and the word-embedding dimension.
    self.size = size
    self.num_layers = num_layers
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(0.5, trainable=False)
    self.mode = mode
    self.dummy_reply = ["what ?", "yeah .", "you are welcome ! ! ! !"]

    # learning rate decay
    self.learning_rate_decay = self.learning_rate.assign(
        self.learning_rate * 0.99)

    # input for Reinforcement part
    self.loop_or_not = tf.placeholder(tf.bool)
    self.reward = tf.placeholder(tf.float32, [None])
    batch_reward = tf.stop_gradient(self.reward)  # not used further here
    self.RL_index = [None for _ in self.buckets]

    # dropout
    self.input_keep_prob = input_keep_prob
    self.output_keep_prob = output_keep_prob
    self.state_keep_prob = state_keep_prob

    # beam search
    self.beam_search = beam_search
    self.beam_size = beam_size
    self.length_penalty = length_penalty
    self.length_penalty_factor = length_penalty_factor

    # if load pretrain word vector
    self.pretrain_vec = pretrain_vec
    self.pretrain_trainable = pretrain_trainable

    # schedule sampling
    self.sampling_probability_clip = None
    self.schedule_sampling = schedule_sampling
    if self.schedule_sampling == 'False':
        self.schedule_sampling = False
    self.init_sampling_probability = 1.0
    self.sampling_global_step = sampling_global_step
    self.sampling_decay_steps = sampling_decay_steps
    self.sampling_decay_rate = sampling_decay_rate

    if self.schedule_sampling == 'linear':
        # Force true division: with two ints and Python 2 semantics the
        # quotient would be 0 and the probability would never decay.
        self.decay_fixed = self.init_sampling_probability * (
            float(self.sampling_decay_steps) / self.sampling_global_step)
        with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
            self.sampling_probability = tf.get_variable(
                name=self.schedule_sampling,
                initializer=tf.constant(self.init_sampling_probability),
                trainable=False)
        self.sampling_probability_decay = tf.assign_sub(
            self.sampling_probability, self.decay_fixed)
        self.sampling_probability_clip = tf.clip_by_value(
            self.sampling_probability, 0.0, 1.0)
    elif self.schedule_sampling == 'exp':
        with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
            self.sampling_probability = tf.get_variable(
                name=self.schedule_sampling,
                initializer=tf.constant(self.init_sampling_probability),
                trainable=False)
        # NOTE(review): the fixed `sampling_global_step` is passed as the
        # step argument, so every run applies the same decay to the current
        # value — confirm this is the intended schedule.
        self.sampling_probability_decay = tf.assign(
            self.sampling_probability,
            tf.train.natural_exp_decay(self.sampling_probability,
                                       self.sampling_global_step,
                                       self.sampling_decay_steps,
                                       self.sampling_decay_rate,
                                       staircase=True))
        self.sampling_probability_clip = tf.clip_by_value(
            self.sampling_probability, 0.0, 1.0)
    elif self.schedule_sampling == 'inverse_sigmoid':
        with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
            self.sampling_probability = tf.get_variable(
                name=self.schedule_sampling,
                initializer=tf.constant(self.init_sampling_probability),
                trainable=False)
        # NOTE(review): this branch uses a linear-cosine decay and passes
        # decay_steps/global_step in the opposite order to the 'exp'
        # branch — confirm both are intended.
        self.sampling_probability_decay = tf.assign(
            self.sampling_probability,
            tf.train.linear_cosine_decay(
                self.sampling_probability,
                self.sampling_decay_steps,
                self.sampling_global_step,
            ))
        self.sampling_probability_clip = tf.clip_by_value(
            self.sampling_probability, 0.0, 1.0)
    elif not self.schedule_sampling:
        pass
    else:
        raise ValueError(
            "schedule_sampling must be one of the following: [linear|exp|inverse_sigmoid|False]"
        )

    # Output projection shared by the loss and the final log-softmax.
    w_t = tf.get_variable('proj_w', [self.trg_vocab_size, self.size])
    w = tf.transpose(w_t)
    proj_bias = tf.get_variable('proj_b', [self.trg_vocab_size])
    output_projection = (w, proj_bias)

    def sample_loss(labels, inputs):
        # Sampled-softmax loss with a fixed 512 sampled classes.
        labels = tf.reshape(labels, [-1, 1])
        local_w_t = tf.cast(w_t, tf.float32)
        local_b = tf.cast(proj_bias, tf.float32)
        local_inputs = tf.cast(inputs, tf.float32)
        return tf.cast(tf.nn.sampled_softmax_loss(
            weights=local_w_t,
            biases=local_b,
            inputs=local_inputs,
            labels=labels,
            num_sampled=512,
            num_classes=self.trg_vocab_size),
                       dtype=tf.float32)

    softmax_loss_function = sample_loss

    #FIXME add RL function
    def seq2seq_multi(encoder_inputs, decoder_inputs, mode, pretrain_vec=None):
        # `mode` is accepted for interface compatibility but not read.
        if pretrain_vec is not None:
            # Pad the pretrained matrix with zero rows up to the vocabulary
            # size, then split off the leading special-tag rows, which are
            # always trainable.
            pad_num = self.src_vocab_size - pretrain_vec.shape[0]
            pretrain_vec = np.pad(pretrain_vec, [(0, pad_num), (0, 0)],
                                  mode='constant')
            tag_vec = pretrain_vec[:data_utils.SPECIAL_TAGS_COUNT]
            pretrain_vec = pretrain_vec[data_utils.SPECIAL_TAGS_COUNT:]
            special_tags = tf.get_variable(name="special_tags",
                                           initializer=tag_vec,
                                           trainable=True)
            embedding = tf.get_variable(name="embedding",
                                        initializer=pretrain_vec,
                                        trainable=self.pretrain_trainable)
            embedding = tf.concat([special_tags, embedding], 0)
        else:
            embedding = tf.get_variable("embedding",
                                        [self.src_vocab_size, self.size])
        loop_function_RL = None
        self.loop_function_RL = loop_function_RL
        # `cell` is assigned further down, before this function is invoked.
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            cell,
            num_encoder_symbols=self.src_vocab_size,
            num_decoder_symbols=self.trg_vocab_size,
            embedding_size=self.size,
            output_projection=output_projection,
            feed_previous=self.feed_previous,
            dtype=tf.float32,
            embedding=embedding,
            beam_search=self.beam_search,
            beam_size=self.beam_size,
            loop=loop_function_RL,
            schedule_sampling=self.schedule_sampling,
            sampling_probability=self.sampling_probability_clip,
            length_penalty=self.length_penalty,
            length_penalty_factor=self.length_penalty_factor)

    # inputs
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in range(buckets[-1][0]):
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name='encoder{0}'.format(i)))
    for i in range(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name='decoder{0}'.format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name='weight{0}'.format(i)))
    # Targets are the decoder inputs shifted by one step.
    targets = [
        self.decoder_inputs[i + 1]
        for i in range(len(self.decoder_inputs) - 1)
    ]

    def single_cell():
        return tf.contrib.rnn.GRUCell(self.size)

    cell = single_cell()
    if self.num_layers > 1:
        cell = tf.contrib.rnn.MultiRNNCell(
            [single_cell() for _ in range(self.num_layers)])
    cell = rnn.DropoutWrapper(cell,
                              input_keep_prob=self.input_keep_prob,
                              output_keep_prob=self.output_keep_prob,
                              state_keep_prob=self.state_keep_prob)

    # NOTE(review): the `buckets` argument is replaced by this fixed list,
    # while the placeholders above were sized from the argument — confirm
    # callers always pass buckets at least this large.
    self.buckets = [(10, 50), (15, 50), (25, 50), (50, 50)]
    self.outputs, self.losses = seq2seq.model_with_buckets(
        self.encoder_inputs,
        self.decoder_inputs,
        targets,
        self.target_weights,
        self.buckets,
        lambda x, y: seq2seq_multi(x, y, self.mode, self.pretrain_vec),
        softmax_loss_function=softmax_loss_function)

    # Project every decoder output and convert it to log-probabilities.
    for bucket_id in range(len(self.buckets)):
        self.outputs[bucket_id] = [
            tf.nn.log_softmax(
                tf.matmul(output, output_projection[0]) +
                output_projection[1])
            for output in self.outputs[bucket_id]
        ]

    self.saver = tf.train.Saver(max_to_keep=2)
def __init__(self, source_vocab_size, target_vocab_size, buckets, hidden_edim,
             hidden_units, num_layers, keep_prob, max_gradient_norm,
             batch_size, learning_rate, learning_rate_decay_factor, beam_size,
             forward_only=False):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      hidden_edim: number of dimensions for word embedding
      hidden_units: number of hidden units for each layer
      num_layers: number of layers in the model.
      keep_prob: keep probability used for dropout.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      beam_size: the beam size for beam search decoding
      forward_only: if set, we do not construct the backward pass in the model.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # Output projection applied before the softmax. The weight has
    # hidden_units // 2 rows and the bias is a non-trainable zero vector.
    w = tf.get_variable(
        "proj_w", [hidden_units // 2, self.target_vocab_size],
        initializer=tf.random_normal_initializer(0, 0.01, seed=123))
    proj_bias = tf.get_variable(
        "proj_b", [self.target_vocab_size],
        initializer=tf.constant_initializer(0.0), trainable=False)
    output_projection = (w, proj_bias)

    def softmax_loss_function(logit, target):
        # Loss of the seq2seq model: project, then sparse cross entropy.
        logit = nn_ops.xw_plus_b(logit, output_projection[0],
                                 output_projection[1])
        target = array_ops.reshape(target, [-1])
        # Keyword arguments keep the call valid regardless of the positional
        # order (or named-argument requirement) of the installed TensorFlow.
        return nn_ops.sparse_softmax_cross_entropy_with_logits(
            logits=logit, labels=target)

    single_cell = rnn_cell.GRUCell(hidden_units)
    cell = single_cell
    if num_layers > 1:
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
    # Dropout is only applied when the training graph is built.
    if not forward_only:
        cell = rnn_cell.DropoutWrapper(
            cell, output_keep_prob=float(keep_prob), seed=123)

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, encoder_mask, decoder_inputs, do_decode):
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs, encoder_mask, decoder_inputs, cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=hidden_edim,
            beam_size=beam_size,
            output_projection=output_projection,
            num_layers=num_layers,
            feed_previous=do_decode)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight{0}".format(i)))
    self.encoder_mask = tf.placeholder(tf.int32, shape=[None, None],
                                       name="encoder_mask")

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]

    # Training outputs and losses. The decoding and training graphs differ
    # only in whether the decoder is fed its own previous output.
    do_decode = bool(forward_only)
    self.outputs, self.losses, self.symbols = seq2seq.model_with_buckets(
        self.encoder_inputs, self.encoder_mask, self.decoder_inputs, targets,
        self.target_weights, buckets,
        lambda x, y, z: seq2seq_f(x, y, z, do_decode),
        softmax_loss_function=softmax_loss_function)

    # Gradients and update operation (Adam) for training the model.
    params_to_update = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.gradient_norms_print = []
        self.updates = []
        opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        for bucket_id in xrange(len(buckets)):
            gradients = tf.gradients(
                self.losses[bucket_id], params_to_update,
                aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(opt.apply_gradients(
                zip(clipped_gradients, params_to_update),
                global_step=self.global_step))

    self.saver = tf.train.Saver(
        tf.all_variables(),
        max_to_keep=1000,  # keep all checkpoints
        keep_checkpoint_every_n_hours=6)
def __init__(self, vocab_size, embedding_dim, buckets, size, num_layers,
             max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, use_lstm=False, num_samples=512,
             forward_only=False, train_mode=True, name='Seq2SeqModel'):
    """Create the model.

    Args:
      vocab_size: size of the vocabulary shared by encoder and decoder.
      embedding_dim: width of the shared embedding matrix `self.embeddings`.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: accepted but not read; the cells are always GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
      train_mode: accepted but not read by this constructor.
      name: variable scope the model is built in; also the prefix used to
        select this model's variables for training and saving.
    """
    # NOTE(review): the extent of this scope is assumed to cover the whole
    # constructor body — confirm against the original layout.
    with tf.variable_scope(name):
        self.vocab_size = vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        self.embeddings = tf.get_variable(
            name='embeddings',
            shape=[self.vocab_size, embedding_dim],
            initializer=tf.random_uniform_initializer())

        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.vocab_size:
            w = tf.get_variable("proj_w", [size, self.vocab_size])
            w_t = tf.transpose(w)
            proj_bias = tf.get_variable("proj_b", [self.vocab_size])
            output_projection = (w, proj_bias)

            # NOTE(review): `weight` and its two cells are never used in this
            # constructor. They are kept because constructing the cells may
            # have graph-naming side effects — confirm before removing.
            wi_cell = tf.contrib.rnn.GRUCell(10)
            wo_cell = tf.contrib.rnn.GRUCell(10)

            def weight(inputs, outputs):
                q, _ = tf.contrib.rnn.static_rnn(wi_cell, inputs,
                                                 dtype=tf.float32)
                a, _ = tf.contrib.rnn.static_rnn(wo_cell, outputs,
                                                 dtype=tf.float32)
                return tf.reduce_mean(q[-1], axis=-1) - tf.reduce_mean(
                    a[-1], axis=-1)

            def sampled_loss(labels, inputs):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(w_t, proj_bias, labels,
                                                  inputs, num_samples,
                                                  self.vocab_size)

            softmax_loss_function = sampled_loss

        # The seq2seq function: attention seq2seq over pre-embedded inputs.
        # A fresh multi-layer GRU cell is built on every call.
        def seq2seq_f(encoder_inputs=None, decoder_inputs=None,
                      do_decode=False):
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                tf.contrib.rnn.MultiRNNCell([
                    tf.contrib.rnn.GRUCell(size) for _ in range(num_layers)
                ]),
                num_symbols=self.vocab_size,
                embeddings=self.embeddings,
                output_projection=output_projection,
                feed_previous=do_decode)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in range(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="encoder{0}".format(i)))
        for i in range(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32, shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = [
            self.decoder_inputs[i + 1]
            for i in range(len(self.decoder_inputs) - 1)
        ]

        # The model consumes embedded inputs rather than raw token ids.
        emb_encoder_inputs = [
            tf.nn.embedding_lookup(self.embeddings, ele)
            for ele in self.encoder_inputs
        ]
        emb_decoder_inputs = [
            tf.nn.embedding_lookup(self.embeddings, ele)
            for ele in self.decoder_inputs
        ]

        # Training outputs and losses.
        if forward_only:
            self.outputs, self.losses = seq2seq.model_with_buckets(
                emb_encoder_inputs, emb_decoder_inputs, targets,
                self.target_weights, buckets,
                lambda x, y: seq2seq_f(x, y, True),
                softmax_loss_function=softmax_loss_function)
            # If we use output projection, we need to project outputs for
            # decoding. Each bucket's outputs are iterated two levels deep.
            if output_projection is not None:
                for bucket_id in range(len(buckets)):
                    self.outputs[bucket_id] = [[
                        tf.matmul(output_, output_projection[0]) +
                        output_projection[1] for output_ in output
                    ] for output in self.outputs[bucket_id]]
        else:
            self.outputs, self.losses = seq2seq.model_with_buckets(
                emb_encoder_inputs, emb_decoder_inputs, targets,
                self.target_weights, buckets,
                lambda x, y: seq2seq_f(x, y, False),
                softmax_loss_function=softmax_loss_function)

        # Gradients and SGD update operation for training the model.
        # Only trainable variables whose name starts with `name` are used.
        params = [
            ele for ele in tf.trainable_variables()
            if ele.name.startswith(name)
        ]
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for bucket_id in range(len(buckets)):
                gradients = tf.gradients(self.losses[bucket_id], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        # The saver covers only the trainable variables selected above.
        self.saver = tf.train.Saver(params)
        # All global variables (trainable or not) under this model's prefix.
        self.variables = [
            ele for ele in tf.global_variables()
            if ele.name.startswith(name)
        ]
def __init__(self, mode, length_penalty=None, length_penalty_factor=0.6):
    """Build the seq2seq graph for one of the modes 'MLE', 'TEST' or 'RL'.

    Hyper-parameters are read from the module-level `FLAGS`, and the bucket
    list from the module-level `buckets`.

    Args:
      mode: 'MLE' builds the model with teacher forcing plus SGD updates;
        'TEST' builds a decoder that feeds back its previous output; 'RL'
        builds a sampling decoder whose per-example losses are scaled by
        the `reward` placeholder before the SGD updates. For any other
        value only the shared variables, placeholders and the saver are
        created.
      length_penalty: forwarded unchanged to
        seq2seq.embedding_attention_seq2seq.
      length_penalty_factor: forwarded unchanged to
        seq2seq.embedding_attention_seq2seq.

    Raises:
      ValueError: if FLAGS.schedule_sampling is neither falsy, 'False',
        'linear', 'exp' nor 'inverse_sigmoid'.
    """
    self.src_vocab_size = FLAGS.src_vocab_size
    self.trg_vocab_size = FLAGS.trg_vocab_size
    self.buckets = buckets
    self.size = FLAGS.hidden_size
    self.num_layers = FLAGS.num_layers
    # Only the two training modes use the configured batch size.
    self.batch_size = FLAGS.batch_size if mode in ('RL', 'MLE') else 1
    self.learning_rate = tf.Variable(0.5, trainable=False)
    self.mode = mode
    self.dummy_reply = ["哈哈 , 是啊 。", "怎麼 了 ?", "你 在 哪 ?"]
    self.r1 = FLAGS.r1
    self.r2 = FLAGS.r2
    self.r3 = FLAGS.r3

    # Learning rate decay op: multiplies the rate by 0.99 when run.
    self.learning_rate_decay = self.learning_rate.assign(
        self.learning_rate * 0.99)

    # Inputs for the reinforcement-learning part.
    self.loop_or_not = tf.placeholder(tf.bool)
    self.reward = tf.placeholder(tf.float32, [None])
    batch_reward = tf.stop_gradient(self.reward)
    self.RL_index = [None] * len(self.buckets)

    # Dropout keep probabilities.
    self.input_keep_prob = FLAGS.input_keep_prob
    self.output_keep_prob = FLAGS.output_keep_prob
    self.state_keep_prob = FLAGS.state_keep_prob

    # Beam search settings.
    self.beam_search = FLAGS.beam_search
    self.beam_size = FLAGS.beam_size
    self.length_penalty = length_penalty
    self.length_penalty_factor = length_penalty_factor

    # Optional pre-trained word vectors.
    self.pretrain_vec = FLAGS.pretrain_vec
    self.pretrain_trainable = FLAGS.pretrain_trainable

    # Scheduled sampling configuration.
    self.sampling_probability_clip = None
    self.schedule_sampling = FLAGS.schedule_sampling
    if self.schedule_sampling == 'False':
        self.schedule_sampling = False
    self.init_sampling_probability = 1.0
    self.sampling_global_step = FLAGS.sampling_global_step
    self.sampling_decay_steps = FLAGS.sampling_decay_steps
    self.sampling_decay_rate = FLAGS.sampling_decay_rate

    def make_sampling_probability():
        # Non-trainable scalar named after the chosen schedule.
        with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
            return tf.get_variable(
                name=self.schedule_sampling,
                initializer=tf.constant(self.init_sampling_probability),
                trainable=False)

    def clipped_sampling_probability():
        return tf.clip_by_value(self.sampling_probability, 0.0, 1.0)

    if self.schedule_sampling == 'linear':
        self.decay_fixed = self.init_sampling_probability * (
            self.sampling_decay_steps / self.sampling_global_step)
        self.sampling_probability = make_sampling_probability()
        self.sampling_probability_decay = tf.assign_sub(
            self.sampling_probability, self.decay_fixed)
        self.sampling_probability_clip = clipped_sampling_probability()
    elif self.schedule_sampling == 'exp':
        self.sampling_probability = make_sampling_probability()
        decayed = tf.train.natural_exp_decay(self.sampling_probability,
                                             self.sampling_global_step,
                                             self.sampling_decay_steps,
                                             self.sampling_decay_rate,
                                             staircase=True)
        self.sampling_probability_decay = tf.assign(
            self.sampling_probability, decayed)
        self.sampling_probability_clip = clipped_sampling_probability()
    elif self.schedule_sampling == 'inverse_sigmoid':
        self.sampling_probability = make_sampling_probability()
        # NOTE(review): the step arguments are passed in the opposite
        # order to the 'exp' branch - confirm this is intended.
        decayed = tf.train.linear_cosine_decay(
            self.sampling_probability,
            self.sampling_decay_steps,
            self.sampling_global_step,
        )
        self.sampling_probability_decay = tf.assign(
            self.sampling_probability, decayed)
        self.sampling_probability_clip = clipped_sampling_probability()
    elif self.schedule_sampling:
        raise ValueError(
            "schedule_sampling must be one of the following: [linear|exp|inverse_sigmoid|False]"
        )

    # Output projection and sampled-softmax loss.
    w_t = tf.get_variable('proj_w', [self.trg_vocab_size, self.size])
    w = tf.transpose(w_t)
    b = tf.get_variable('proj_b', [self.trg_vocab_size])
    output_projection = (w, b)

    def sample_loss(labels, inputs):
        column_labels = tf.reshape(labels, [-1, 1])
        loss = tf.nn.sampled_softmax_loss(
            weights=tf.cast(w_t, tf.float32),
            biases=tf.cast(b, tf.float32),
            inputs=tf.cast(inputs, tf.float32),
            labels=column_labels,
            num_sampled=512,
            num_classes=self.trg_vocab_size)
        return tf.cast(loss, dtype=tf.float32)

    softmax_loss_function = sample_loss

    # FIXME add RL function
    def seq2seq_multi(encoder_inputs, decoder_inputs, mode,
                      pretrain_vec=None):
        if pretrain_vec is None:
            embedding = tf.get_variable("embedding",
                                        [self.src_vocab_size, self.size])
        else:
            # Zero-pad the pre-trained matrix up to the vocabulary size,
            # then keep the special-tag rows in their own always-trainable
            # variable.
            missing_rows = self.src_vocab_size - pretrain_vec.shape[0]
            padded = np.pad(pretrain_vec, [(0, missing_rows), (0, 0)],
                            mode='constant')
            tag_rows = padded[:data_utils.SPECIAL_TAGS_COUNT]
            word_rows = padded[data_utils.SPECIAL_TAGS_COUNT:]
            special_tags = tf.get_variable(name="special_tags",
                                           initializer=tag_rows,
                                           trainable=True)
            embedding = tf.get_variable(name="embedding",
                                        initializer=word_rows,
                                        trainable=self.pretrain_trainable)
            embedding = tf.concat([special_tags, embedding], 0)

        loop_function_RL = None
        if mode == 'MLE':
            feed_previous = False
        elif mode == 'TEST':
            feed_previous = True
        elif mode == 'RL':
            # RL decoding needs a custom loop function.
            feed_previous = True

            def loop_function_RL(prev, i):
                logits = tf.matmul(
                    prev, output_projection[0]) + output_projection[1]
                prev_index = tf.multinomial(
                    tf.log(tf.nn.softmax(logits)), 1)
                if i == 1:
                    # First decoding step of a bucket: claim the first
                    # RL_index slot that is still empty.
                    for index, RL in enumerate(self.RL_index):
                        if RL is None:
                            self.RL_index[index] = prev_index
                            self.index = index
                            break
                else:
                    self.RL_index[self.index] = tf.concat(
                        [self.RL_index[self.index], prev_index], axis=1)
                # RL_index holds the index of every sampled word.
                prev_index = tf.reshape(prev_index, [-1])
                # Decide which one becomes the next time-step input.
                sample = tf.nn.embedding_lookup(embedding, prev_index)
                from_decoder = tf.nn.embedding_lookup(embedding,
                                                      decoder_inputs[i])
                return tf.where(self.loop_or_not, sample, from_decoder)

        self.loop_function_RL = loop_function_RL
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            cell,
            num_encoder_symbols=self.src_vocab_size,
            num_decoder_symbols=self.trg_vocab_size,
            embedding_size=self.size,
            output_projection=output_projection,
            feed_previous=feed_previous,
            dtype=tf.float32,
            embedding=embedding,
            beam_search=self.beam_search,
            beam_size=self.beam_size,
            loop=loop_function_RL,
            schedule_sampling=self.schedule_sampling,
            sampling_probability=self.sampling_probability_clip,
            length_penalty=self.length_penalty,
            length_penalty_factor=self.length_penalty_factor)

    # Input placeholders, sized for the last (largest) bucket.
    self.encoder_inputs = [
        tf.placeholder(tf.int32, shape=[None], name='encoder{0}'.format(i))
        for i in range(buckets[-1][0])
    ]
    self.decoder_inputs = []
    self.target_weights = []
    for i in range(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name='decoder{0}'.format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name='weight{0}'.format(i)))
    # Targets are the decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    def single_cell():
        return tf.contrib.rnn.GRUCell(self.size)

    cell = single_cell()
    if self.num_layers > 1:
        cell = tf.contrib.rnn.MultiRNNCell(
            [single_cell() for _ in range(self.num_layers)])
    cell = rnn.DropoutWrapper(cell,
                              input_keep_prob=self.input_keep_prob,
                              output_keep_prob=self.output_keep_prob,
                              state_keep_prob=self.state_keep_prob)

    def clipped_update(optimizer, loss):
        # One SGD step on `loss` with the global gradient norm clipped
        # to 5.0.
        variables = tf.trainable_variables()
        gradients = tf.gradients(loss, variables)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        return optimizer.apply_gradients(zip(clipped_gradients, variables))

    if self.mode in ('MLE', 'TEST', 'RL'):
        # All three modes share the bucketed model and the projection of
        # its outputs; only RL asks for per-example losses.
        extra_args = {}
        if self.mode == 'RL':
            extra_args['per_example_loss'] = True
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            self.buckets,
            lambda x, y: seq2seq_multi(x, y, self.mode, self.pretrain_vec),
            softmax_loss_function=softmax_loss_function,
            **extra_args)
        for bucket_id in range(len(self.buckets)):
            self.outputs[bucket_id] = [
                tf.matmul(output, output_projection[0]) +
                output_projection[1]
                for output in self.outputs[bucket_id]
            ]

    if self.mode == 'MLE':
        self.update = []
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_id in range(len(self.buckets)):
            self.update.append(
                clipped_update(optimizer, self.losses[bucket_id]))
    elif self.mode == 'RL':
        # The loop function only appends to RL_index when it is called, and
        # the output of the last decoder input is never fed back into it,
        # so the final sampled word is taken from the bucket's last output.
        for bucket_id, bucket_outputs in enumerate(self.outputs):
            last_step = self.buckets[bucket_id][1] - 1
            prev_index = tf.multinomial(
                tf.log(tf.nn.softmax(bucket_outputs[last_step])), 1)
            self.RL_index[bucket_id] = tf.concat(
                [self.RL_index[bucket_id], prev_index], axis=1)

        self.update = []
        optimizer = tf.train.GradientDescentOptimizer(0.01)
        for bucket_id in range(len(self.buckets)):
            # Scale each example's loss by its reward, then average.
            scaled_loss = tf.multiply(self.losses[bucket_id], batch_reward)
            self.losses[bucket_id] = tf.reduce_mean(scaled_loss)
            self.update.append(
                clipped_update(optimizer, self.losses[bucket_id]))

    # Saver keeps at most ten checkpoints.
    self.saver = tf.train.Saver(max_to_keep=10)
def __init__(self, source_vocab_size, target_vocab_size, buckets, size,
             num_layers, max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, use_lstm=False, num_samples=512,
             forward_only=False, dtype=tf.float32, output_keep_prob=0.8):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input
        length that will be processed in that bucket, and O specifies
        maximum output length. Training instances that have inputs longer
        than I or outputs longer than O will be pushed to the next bucket
        and padded accordingly. We assume that the list is sorted, e.g.,
        [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training; it is only
        stored on the instance here.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when
        needed.
      use_lstm: if true, we use plain LSTM cells (without dropout) instead
        of dropout-wrapped GRU cells.
      num_samples: number of samples for sampled softmax. Sampled softmax
        and the output projection are only created when
        0 < num_samples < target_vocab_size.
      forward_only: if set, the decoder is fed its own previous output and
        the outputs are projected. The gradient and update ops are built in
        either case.
      dtype: the data type to use to store internal variables.
      output_keep_prob: output keep probability of the dropout wrapper
        around each GRU cell. Defaults to 0.8, the value that used to be
        hard-coded. The wrapper is applied in forward_only mode as well, so
        pass 1.0 to decode without dropout.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(
        float(learning_rate), trainable=False, dtype=dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary
    # size.
    if 0 < num_samples < self.target_vocab_size:
        proj_w_t = tf.get_variable(
            "proj_w", [self.target_vocab_size, size], dtype=dtype)
        proj_w = tf.transpose(proj_w_t)
        proj_b = tf.get_variable(
            "proj_b", [self.target_vocab_size], dtype=dtype)
        output_projection = (proj_w, proj_b)

        def sampled_loss(labels, logits):
            labels = tf.reshape(labels, [-1, 1])
            # We need to compute the sampled_softmax_loss using 32bit
            # floats to avoid numerical instabilities.
            local_w_t = tf.cast(proj_w_t, tf.float32)
            local_b = tf.cast(proj_b, tf.float32)
            local_inputs = tf.cast(logits, tf.float32)
            return tf.cast(
                tf.nn.sampled_softmax_loss(
                    weights=local_w_t,
                    biases=local_b,
                    labels=labels,
                    inputs=local_inputs,
                    num_sampled=num_samples,
                    num_classes=self.target_vocab_size),
                dtype)

        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    if use_lstm:
        def single_cell():
            return tf.contrib.rnn.BasicLSTMCell(size)
    else:
        def single_cell():
            return tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.GRUCell(size),
                output_keep_prob=output_keep_prob)

    cell = single_cell()
    if num_layers > 1:
        cell = tf.contrib.rnn.MultiRNNCell(
            [single_cell() for _ in range(num_layers)])

    # The seq2seq function: we use embedding for the input.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return seq2seq.embedding_rnn_seq2seq(
            encoder_inputs,
            decoder_inputs,
            cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=size,
            output_projection=output_projection,
            feed_previous=do_decode,
            dtype=dtype)

    # Feeds for inputs. The last bucket is the biggest one.
    self.encoder_inputs = [
        tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i))
        for i in range(buckets[-1][0])
    ]
    self.decoder_inputs = []
    self.target_weights = []
    for i in range(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(dtype, shape=[None],
                           name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    # Outputs, losses, states and encoder outputs. The project-local
    # model_with_buckets returns four values here.
    do_decode = bool(forward_only)
    (self.outputs, self.losses, self.states,
     self.enc_outputs) = seq2seq.model_with_buckets(
         self.encoder_inputs,
         self.decoder_inputs,
         targets,
         self.target_weights,
         buckets,
         lambda x, y: seq2seq_f(x, y, do_decode),
         softmax_loss_function=softmax_loss_function)

    # If we use output projection, we need to project outputs for decoding.
    if forward_only and output_projection is not None:
        out_w, out_b = output_projection
        for bucket_id in range(len(buckets)):
            self.outputs[bucket_id] = [
                tf.matmul(output, out_w) + out_b
                for output in self.outputs[bucket_id]
            ]

    # Gradients and SGD update operation for training the model. These are
    # built unconditionally; the `if not forward_only` guard was removed on
    # purpose in this file.
    params = tf.trainable_variables()
    self.gradient_norms = []
    self.updates = []
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    for bucket_id in range(len(buckets)):
        gradients = tf.gradients(self.losses[bucket_id], params)
        clipped_gradients, norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.gradient_norms.append(norm)
        self.updates.append(
            opt.apply_gradients(zip(clipped_gradients, params),
                                global_step=self.global_step))

    self.saver = tf.train.Saver(tf.global_variables())
def __init__(self, source_target_vocab_size, buckets, size, num_layers,
             max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, scheduling_rate,
             scheduling_rate_decay_factor, num_samples = 4096,
             forward_only=False):
    """Create the model.

    Args:
      source_target_vocab_size: size of the vocabulary, used for both the
        encoder and the decoder symbols.
      buckets: a list of pairs (I, O), where I specifies maximum input
        length that will be processed in that bucket, and O specifies
        maximum output length. We assume that the list is sorted, e.g.,
        [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training; it is only
        stored on the instance here.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: factor the learning rate is multiplied by
        when `learning_rate_decay_op` is run.
      scheduling_rate: initial value of the non-trainable `scheduling_rate`
        variable that is handed to the seq2seq function.
      scheduling_rate_decay_factor: factor the scheduling rate is
        multiplied by when `scheduling_rate_decay_op` is run.
      num_samples: number of samples for sampled softmax; only used when
        0 < num_samples < source_target_vocab_size.
      forward_only: if set, we do not construct the backward pass in the
        model.
    """
    self.source_target_vocab_size = source_target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size

    # Non-trainable rates, each with an op that decays it multiplicatively.
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    decayed_learning_rate = self.learning_rate * learning_rate_decay_factor
    self.learning_rate_decay_op = self.learning_rate.assign(
        decayed_learning_rate)
    self.scheduling_rate = tf.Variable(float(scheduling_rate),
                                       trainable=False)
    decayed_scheduling_rate = (
        self.scheduling_rate * scheduling_rate_decay_factor)
    self.scheduling_rate_decay_op = self.scheduling_rate.assign(
        decayed_scheduling_rate)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary
    # size.
    if 0 < num_samples < self.source_target_vocab_size:
        with tf.device("/cpu:0"):
            proj_w = tf.get_variable(
                "proj_w", [size, self.source_target_vocab_size])
            proj_w_t = tf.transpose(proj_w)
            proj_b = tf.get_variable(
                "proj_b", [self.source_target_vocab_size])
        output_projection = (proj_w, proj_b)

        def sampled_loss(inputs, labels):
            with tf.device("/cpu:0"):
                column_labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(
                    proj_w_t, proj_b, inputs, column_labels, num_samples,
                    self.source_target_vocab_size)

        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN. The same cell
    # object is repeated for every layer.
    single_cell = tf.nn.rnn_cell.BasicLSTMCell(size)
    if num_layers > 1:
        cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)
    else:
        cell = single_cell

    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            cell,
            num_encoder_symbols=source_target_vocab_size,
            num_decoder_symbols=source_target_vocab_size,
            embedding_size=size,
            output_projection=output_projection,
            feed_previous=do_decode,
            scheduling_rate=self.scheduling_rate)

    # Feeds for inputs, sized for the last (largest) bucket.
    self.encoder_inputs = [
        tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i))
        for i in range(buckets[-1][0])
    ]
    self.decoder_inputs = []
    self.target_weights = []
    for i in range(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight{0}".format(i)))
    # Targets are the decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    do_decode = True if forward_only else False
    self.outputs, self.losses = seq2seq.model_with_buckets(
        self.encoder_inputs,
        self.decoder_inputs,
        targets,
        self.target_weights,
        buckets,
        lambda x, y: seq2seq_f(x, y, do_decode),
        softmax_loss_function=softmax_loss_function)

    if forward_only:
        print("i'm in here")
        # Project the raw outputs for decoding.
        if output_projection is not None:
            for bucket_id in range(len(buckets)):
                self.outputs[bucket_id] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1]
                    for output in self.outputs[bucket_id]
                ]

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_id in range(len(buckets)):
            gradients = tf.gradients(self.losses[bucket_id], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    self.saver = tf.train.Saver(tf.all_variables())
def __init__(self, buckets, source_vocab_sizes, target_vocab_sizes, size,
             source_embedding_sizes, target_embedding_sizes,
             target_data_types, max_gradient_norm, batch_size,
             learning_rate, learning_rate_decay_factor, decoder_type,
             use_lstm=True, average_loss_across_timesteps=True,
             forward_only=False, feed_previous=False,
             predict_span_end_pointers=False, use_adam=False,
             restrict_decoder_structure=False, transition_vocab_sets=None,
             transition_state_map=None, encoder_decoder_vocab_map=None,
             use_bidirectional_encoder=False,
             pretrained_word_embeddings=None, word_embeddings=None,
             dtype=tf.float32):
    """Create the model.

    Args:
      buckets: a list of pairs (I, O), where I specifies maximum input
        length that will be processed in that bucket, and O specifies
        maximum output length. We assume that the list is sorted, e.g.,
        [(2, 4), (8, 16)].
      source_vocab_sizes: dict mapping each source input type to its
        vocabulary size.
      target_vocab_sizes: dict mapping each target type to its vocabulary
        size; one output projection is created per key.
      size: number of units in each RNN cell.
      source_embedding_sizes: dict mapping each source input type to its
        embedding width; one embedding variable is created per key.
      target_embedding_sizes: forwarded to
        seq2seq.embedding_attention_seq2seq.
      target_data_types: iterable of decoder input types; a decoder
        placeholder is created per type and time step.
      max_gradient_norm: gradients will be clipped to maximally this norm;
        clipping is skipped when it is not positive.
      batch_size: the size of the batches used during training; it is only
        stored on the instance here.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when
        needed (only used without Adam).
      decoder_type: decoder variant, compared against the data_utils
        decoder-state constants.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      average_loss_across_timesteps: forwarded to
        seq2seq.model_with_buckets as `average_across_timesteps`.
      forward_only: if set, we do not construct the backward pass in the
        model.
      feed_previous: passed to the seq2seq function as its decode flag.
      predict_span_end_pointers: forwarded to the seq2seq function.
      use_adam: if true, train with Adam at the fixed `learning_rate`
        instead of SGD with a decayable learning-rate variable.
      restrict_decoder_structure: if true, data_utils.NUM_TR_STATES
        restriction placeholders are created.
      transition_vocab_sets: if given, used to construct
        `decoder_transition_map`.
      transition_state_map: optional value wrapped in a constant and
        forwarded to the seq2seq function.
      encoder_decoder_vocab_map: optional value wrapped in a constant and
        forwarded to the seq2seq function.
      use_bidirectional_encoder: forwarded to the seq2seq function.
      pretrained_word_embeddings: initial vectors for the non-trainable
        'em' embeddings; required when 'em' is a source type.
      word_embeddings: initial vectors for the 'en' embeddings; required
        when 'en' is a source type.
      dtype: the data type to use to store internal variables.

    Raises:
      ValueError: if the initial vectors of a source type do not have shape
        (vocabulary size, embedding size).
    """
    self.buckets = buckets
    self.batch_size = batch_size
    self.decoder_type = decoder_type
    self.transition_vocab_sets = transition_vocab_sets

    # Both maps are optional; only wrap them in constants when supplied.
    if transition_state_map is None:
        self.transition_state_map = None
    else:
        self.transition_state_map = tf.constant(transition_state_map)
    if encoder_decoder_vocab_map is None:
        self.encoder_decoder_vocab_map = None
    else:
        self.encoder_decoder_vocab_map = tf.constant(
            encoder_decoder_vocab_map)

    self.use_stack_decoder = decoder_type == data_utils.STACK_DECODER_STATE
    self.average_loss_across_timesteps = average_loss_across_timesteps

    self.input_keep_prob = tf.placeholder(tf.float32,
                                          name="input_keep_probability")
    self.output_keep_prob = tf.placeholder(tf.float32,
                                           name="output_keep_probability")

    if not use_adam:
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=dtype)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # Encoder embeddings. The intended vectors are used as the variables'
    # initial values: in graph mode the result of `Variable.assign` is only
    # an op, so discarding it would leave the embeddings all-zero.
    self.embedding_weights = {}
    for source_type in source_embedding_sizes:
        vocab_size = source_vocab_sizes[source_type]
        embedding_size = source_embedding_sizes[source_type]
        if source_type == 'en':
            assert word_embeddings is not None
            assert embedding_size == word_embeddings.shape[1]
            init_vectors = word_embeddings
        elif source_type == 'em':
            assert pretrained_word_embeddings is not None
            assert embedding_size == pretrained_word_embeddings.shape[1]
            init_vectors = pretrained_word_embeddings
        else:
            init_vectors = np.random.uniform(-np.sqrt(3), np.sqrt(3),
                                             (vocab_size, embedding_size))
        init_vectors = np.asarray(init_vectors, dtype=np.float32)
        if init_vectors.shape != (vocab_size, embedding_size):
            raise ValueError(
                "Initial embeddings for %r have shape %s, expected %s" %
                (source_type, init_vectors.shape,
                 (vocab_size, embedding_size)))
        # The pre-trained 'em' embeddings are kept fixed during training.
        self.embedding_weights[source_type] = tf.Variable(
            init_vectors,
            trainable=(source_type != 'em'),
            name=source_type + "_encoder_embeddings")

    # One output projection per target type.
    output_projections = {}
    for target_type in target_vocab_sizes:
        vocab_size = target_vocab_sizes[target_type]
        proj_w = tf.get_variable(
            target_type + "_proj_w", [size, vocab_size],
            initializer=tf.uniform_unit_scaling_initializer(),
            dtype=dtype)
        proj_b = tf.get_variable(target_type + "_proj_b", [vocab_size],
                                 dtype=dtype)
        output_projections[target_type] = (proj_w, proj_b)

    def full_loss(logits, labels):
        # Full (non-sampled) softmax cross entropy on flattened labels.
        labels = tf.reshape(labels, [-1])
        return tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels)

    softmax_loss_function = full_loss

    def create_cell(use_dropout=True):
        # Create the internal cell for our RNN.
        if use_lstm:
            cell = tf.nn.rnn_cell.LSTMCell(
                size,
                use_peepholes=False,
                state_is_tuple=True,
                initializer=tf.uniform_unit_scaling_initializer())
        else:
            cell = tf.nn.rnn_cell.GRUCell(size)
        if use_dropout:
            cell = tf.nn.rnn_cell.DropoutWrapper(cell,
                                                 self.input_keep_prob,
                                                 self.output_keep_prob)
        return cell

    with tf.variable_scope("encoder_fw"):
        fw_cell = create_cell()
    with tf.variable_scope("encoder_bw"):
        bw_cell = create_cell()
    with tf.variable_scope("decoder_main"):
        dec_cell = create_cell()
    with tf.variable_scope("decoder_aux"):
        dec_aux_cell = create_cell(False)
    if self.decoder_type == data_utils.MEMORY_STACK_DECODER_STATE:
        with tf.variable_scope("decoder_lin_mem"):
            dec_mem_cell = create_cell()
    else:
        dec_mem_cell = None

    # Optional placeholders restricting the decoder structure.
    num_decoder_restrictions = 0
    if restrict_decoder_structure:
        num_decoder_restrictions = data_utils.NUM_TR_STATES
    self.decoder_restrictions = [
        tf.placeholder(tf.int32, shape=[None],
                       name="restrictions{0}".format(i))
        for i in range(num_decoder_restrictions)
    ]

    if self.transition_vocab_sets is None:
        self.decoder_transition_map = None
    else:
        self.decoder_transition_map = data_utils.construct_transition_map(
            self.transition_vocab_sets, False)

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return seq2seq.embedding_attention_seq2seq(
            self.decoder_type,
            encoder_inputs,
            decoder_inputs,
            fw_cell,
            bw_cell,
            dec_cell,
            dec_aux_cell,
            dec_mem_cell,
            source_vocab_sizes,
            target_vocab_sizes,
            source_embedding_sizes,
            target_embedding_sizes,
            predict_span_end_pointers=predict_span_end_pointers,
            decoder_restrictions=self.decoder_restrictions,
            output_projections=output_projections,
            word_vectors=self.embedding_weights,
            transition_state_map=self.transition_state_map,
            encoder_decoder_vocab_map=self.encoder_decoder_vocab_map,
            use_bidirectional_encoder=use_bidirectional_encoder,
            feed_previous=do_decode,
            dtype=dtype)

    # Feeds for inputs. For now assume that we only have embedding inputs,
    # and a single sequence of target weights. Each time step holds a dict
    # of placeholders keyed by input type; the last bucket is the biggest.
    self.encoder_inputs = []
    for i in range(buckets[-1][0]):
        step_inputs = {}
        for key in source_vocab_sizes:
            step_inputs[key] = tf.placeholder(
                tf.int32, shape=[None],
                name="encoder_{0}_{1}".format(key, i))
        self.encoder_inputs.append(step_inputs)

    self.decoder_inputs = []
    for i in range(buckets[-1][1] + 1):
        step_inputs = {}
        for key in target_data_types:
            step_inputs[key] = tf.placeholder(
                tf.int32, shape=[None],
                name="decoder_{0}_{1}".format(key, i))
        self.decoder_inputs.append(step_inputs)

    # Only these target types carry a loss weight.
    weighted_types = ("parse", "predicate", "ind")
    self.target_weights = []
    for i in range(buckets[-1][1] + 1):
        step_weights = {}
        for key in target_data_types:
            if key in weighted_types:
                step_weights[key] = tf.placeholder(
                    dtype, shape=[None],
                    name="weight_{0}_{1}".format(key, i))
        self.target_weights.append(step_weights)

    # Our targets are decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    # Training outputs and losses.
    self.outputs, self.losses = seq2seq.model_with_buckets(
        self.encoder_inputs,
        self.decoder_inputs,
        targets,
        self.target_weights,
        buckets,
        lambda x, y: seq2seq_f(x, y, feed_previous),
        forward_only,
        softmax_loss_function=softmax_loss_function,
        average_across_timesteps=self.average_loss_across_timesteps)

    # Gradients and update operations for training the model.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        if use_adam:
            opt = tf.train.AdamOptimizer(learning_rate, epsilon=1e-02)
        else:
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_id in range(len(buckets)):
            gradients = tf.gradients(self.losses[bucket_id], params)
            if max_gradient_norm > 0:
                gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
            else:
                # No clipping requested: report a zero norm placeholder.
                norm = tf.zeros([1])
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(gradients, params),
                                    global_step=self.global_step))

    self.saver = tf.train.Saver(tf.all_variables())