def SampleGRUSeq2Seq(enc_inp, dec_inp, weights):
    """Example sequence-to-sequence model that uses GRU cells."""

    def GRUSeq2Seq(enc_inp, dec_inp):
        cell = rnn_cell.MultiRNNCell([rnn_cell.GRUCell(24)] * 2)
        return seq2seq.embedding_attention_seq2seq(
            enc_inp, dec_inp, cell, classes, classes, output_projection=(w, b))

    targets = [dec_inp[i + 1] for i in xrange(len(dec_inp) - 1)] + [0]

    def SampledLoss(inputs, labels):
        labels = tf.reshape(labels, [-1, 1])
        return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, 8, classes)

    return seq2seq.model_with_buckets(
        enc_inp, dec_inp, targets, weights, buckets, classes, GRUSeq2Seq,
        softmax_loss_function=SampledLoss)
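# A standalone sketch (no TensorFlow needed) of the target-shifting convention
# used above: targets are the decoder inputs shifted left by one step, with a
# PAD id (assumed here to be 0, matching the `+ [0]` above) appended so both
# lists stay the same length. The token ids below are purely illustrative.
def shift_targets(dec_inp, pad_id=0):
    """Return decoder targets: inputs shifted by one, padded at the end."""
    return [dec_inp[i + 1] for i in range(len(dec_inp) - 1)] + [pad_id]

assert shift_targets([1, 5, 7, 2]) == [5, 7, 2, 0]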
def __init__(self, source_vocab_size, target_vocab_size, buckets,
             source_proj_size, target_proj_size, encoder_size, decoder_size,
             max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, optimizer='sgd', input_feeding=False,
             combine_inp_attn=False, dropout=0.0, attention_f=None,
             window_size=10, content_function=None, decoder_attention_f="None",
             num_samples=512, forward_only=False, max_len=100, cpu_only=False,
             early_stop_patience=0, save_best_model=True, dtype=tf.float32):
    super(NMTModel, self).__init__()

    if cpu_only:
        device = "/cpu:0"
    else:
        device = "/gpu:0"

    with tf.device(device):
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.attention_f = attention_f
        self.content_function = content_function
        self.window_size = window_size
        self.combine_inp_attn = combine_inp_attn
        if decoder_attention_f == "None":
            self.decoder_attention_f = None
        else:
            self.decoder_attention_f = decoder_attention_f

        # learning rate ops
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)

        # epoch ops
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_update_op = self.epoch.assign(self.epoch + 1)

        # samples seen ops
        self.samples_seen = tf.Variable(0, trainable=False)
        self.samples_seen_update_op = self.samples_seen.assign(
            self.samples_seen + batch_size)
        self.samples_seen_reset_op = self.samples_seen.assign(0)

        # global step variable - controlled by the model
        self.global_step = tf.Variable(0.0, trainable=False)

        # average loss ops
        self.current_loss = tf.Variable(0.0, trainable=False)
        self.current_loss_update_op = None
        self.avg_loss = tf.Variable(0.0, trainable=False)
        self.avg_loss_update_op = self.avg_loss.assign(
            tf.div(self.current_loss, self.global_step))

        if early_stop_patience > 0 or save_best_model:
            self.best_eval_loss = tf.Variable(numpy.inf, trainable=False)
            self.estop_counter = tf.Variable(0, trainable=False)
            self.estop_counter_update_op = self.estop_counter.assign(
                self.estop_counter + 1)
            self.estop_counter_reset_op = self.estop_counter.assign(0)
        else:
            self.best_eval_loss = None
            self.estop_counter = None
            self.estop_counter_update_op = None
            self.estop_counter_reset_op = None

        self.source_proj_size = source_proj_size
        self.target_proj_size = target_proj_size
        self.encoder_size = encoder_size
        self.decoder_size = decoder_size
        self.input_feeding = input_feeding
        self.max_len = max_len
        self.dropout = dropout
        self.dropout_feed = tf.placeholder(tf.float32, name="dropout_rate")
        self.step_num = tf.Variable(0, trainable=False)
        self.dtype = dtype

        # If we use sampled softmax, we need an output projection.
        loss_function = None
        with tf.device("/cpu:0"):
            w = tf.get_variable("proj_w",
                                [decoder_size, self.target_vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [self.target_vocab_size])
        self.output_projection = (w, b)
        self.sampled_softmax = False

        # Sampled softmax only makes sense if we sample less than vocabulary
        # size.
        if 0 < num_samples < self.target_vocab_size:
            self.sampled_softmax = True

            def sampled_loss(inputs, labels):
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        w_t, b, inputs, labels, num_samples,
                        self.target_vocab_size)

            loss_function = sampled_loss

        # create the embedding matrix - this must be done on the CPU for now
        with tf.device("/cpu:0"):
            self.src_embedding = tf.Variable(
                tf.truncated_normal(
                    [source_vocab_size, source_proj_size], stddev=0.01),
                name='embedding_src')

        # decoder with attention
        with tf.name_scope('decoder_with_attention') as scope:
            # create this variable to be used inside the
            # embedding_attention_decoder
            self.tgt_embedding = tf.Variable(
                tf.truncated_normal(
                    [target_vocab_size, target_proj_size], stddev=0.01),
                name='embedding')

        # Create the internal multi-layer cell for our RNN.
        self.encoder_cell_fw, self.encoder_cell_bw, self.decoder_cell = \
            cells.build_nmt_bidirectional_cell(encoder_size, decoder_size,
                                               source_proj_size,
                                               target_proj_size,
                                               dropout=dropout)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs):
            return self.inference(encoder_inputs, decoder_inputs)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32, shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = [self.decoder_inputs[i + 1]
                   for i in xrange(len(self.decoder_inputs) - 1)]

        self.decoder_states_holders = None

        # Training outputs and losses.
        if forward_only:
            # self.batch_size = beam_size
            for i in xrange(len(self.encoder_inputs), self.max_len):
                self.encoder_inputs.append(
                    tf.placeholder(tf.int32, shape=[None],
                                   name="encoder{0}".format(i)))

            b_size = array_ops.shape(self.encoder_inputs[0])[0]

            # context, decoder_initial_state, attention_states, input_length
            self.ret0, self.ret1, self.ret2 = self.encode(
                self.encoder_inputs, b_size)

            self.decoder_init_plcholder = tf.placeholder(
                tf.float32, shape=[None, target_proj_size * 2],
                name="decoder_init")

            # shape of this placeholder: the first None indicates the batch
            # size and the second the input length
            self.attn_plcholder = tf.placeholder(
                tf.float32,
                shape=[None, self.ret2.get_shape()[1], target_proj_size],
                name="attention_states")

            # decoder_states = None
            if self.decoder_attention_f is not None:
                self.decoder_states_holders = tf.placeholder(
                    tf.float32, shape=[None, None, 1, decoder_size],
                    name="decoder_state")
                decoder_states = self.decoder_states_holders

            self.logits, self.states = attention_decoder_nmt(
                decoder_inputs=[self.decoder_inputs[0]],
                initial_state=self.decoder_init_plcholder,
                attention_states=self.attn_plcholder,
                cell=self.decoder_cell,
                num_symbols=target_vocab_size,
                attention_f=attention_f,
                window_size=window_size,
                content_function=content_function,
                decoder_attention_f=decoder_attention_f,
                combine_inp_attn=combine_inp_attn,
                input_feeding=input_feeding,
                dropout=self.dropout_feed,
                initializer=None,
                dtype=dtype)

            # If we use output projection, we need to project outputs for
            # decoding.
            self.logits = tf.nn.xw_plus_b(self.logits[-1],
                                          self.output_projection[0],
                                          self.output_projection[1])
            self.logits = nn_ops.softmax(self.logits)
        else:
            tf_version = pkg_resources.get_distribution("tensorflow").version
            if tf_version == "0.6.0" or tf_version == "0.5.0":
                self.outputs, self.losses = seq2seq.model_with_buckets(
                    encoder_inputs=self.encoder_inputs,
                    decoder_inputs=self.decoder_inputs,
                    targets=targets,
                    weights=self.target_weights,
                    num_decoder_symbols=self.target_vocab_size,
                    buckets=buckets,
                    seq2seq=lambda x, y: seq2seq_f(x, y),
                    softmax_loss_function=loss_function)
            else:
                self.outputs, self.losses = model_with_buckets(
                    encoder_inputs=self.encoder_inputs,
                    decoder_inputs=self.decoder_inputs,
                    targets=targets,
                    weights=self.target_weights,
                    buckets=buckets,
                    seq2seq_f=lambda x, y: seq2seq_f(x, y),
                    softmax_loss_function=loss_function)

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            # opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            opt = optimization_ops.get_optimizer(optimizer, learning_rate)
            for b in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        self.saver = tf.train.Saver(tf.all_variables())
        self.saver_best = tf.train.Saver(tf.all_variables())
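# A minimal NumPy sketch (an illustration, not the TensorFlow implementation)
# of the clip-by-global-norm rule applied via tf.clip_by_global_norm above:
# every gradient tensor is rescaled by max_norm / max(global_norm, max_norm),
# so the joint L2 norm of all gradients never exceeds max_norm.
import numpy as np

def clip_by_global_norm(grads, max_norm):
    """Rescale a list of gradient arrays so their joint L2 norm <= max_norm."""
    global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    scale = max_norm / max(global_norm, max_norm)
    return [g * scale for g in grads], global_norm

clipped, norm = clip_by_global_norm([np.array([3.0, 4.0]), np.array([12.0])],
                                    max_norm=5.0)
# global norm was 13.0, so each gradient is scaled by 5/13 before the update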
def __init__(self, source_vocab_size, target_vocab_size, buckets, size,
             num_layers, max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, use_lstm=False, num_samples=512,
             forward_only=False):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training; the model
        construction is independent of batch_size, so it can be changed after
        initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.target_vocab_size:
        with tf.device("/cpu:0"):
            w = tf.get_variable("proj_w", [size, self.target_vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [self.target_vocab_size])
            output_projection = (w, b)

        def sampled_loss(inputs, labels):
            with tf.device("/cpu:0"):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                                  num_samples,
                                                  self.target_vocab_size)

        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    single_cell = rnn_cell.GRUCell(size)
    if use_lstm:
        single_cell = rnn_cell.BasicLSTMCell(size)
    cell = single_cell
    if num_layers > 1:
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs, decoder_inputs, cell, source_vocab_size,
            target_vocab_size, output_projection=output_projection,
            feed_previous=do_decode)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]

    # Training outputs and losses.
    if forward_only:
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, True),
            softmax_loss_function=softmax_loss_function)
        # If we use output projection, we need to project outputs for decoding.
        if output_projection is not None:
            for b in xrange(len(buckets)):
                self.outputs[b] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1]
                    for output in self.outputs[b]
                ]
    else:
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, False),
            softmax_loss_function=softmax_loss_function)

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for b in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[b], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    self.saver = tf.train.Saver(tf.all_variables())
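# A standalone sketch of the bucketing contract described in the docstring
# above: a training pair goes into the smallest bucket whose (input, output)
# limits it fits and is padded up to that bucket's sizes. The PAD id of 0 and
# the helper names are illustrative assumptions; GO/EOS handling done by the
# real data pipeline is omitted here.
def pick_bucket(buckets, src_len, tgt_len):
    """Return the index of the smallest bucket that fits the pair, or None."""
    for i, (max_in, max_out) in enumerate(buckets):
        if src_len <= max_in and tgt_len <= max_out:
            return i
    return None

def pad(seq, length, pad_id=0):
    return seq + [pad_id] * (length - len(seq))

buckets = [(2, 4), (8, 16)]                       # sorted, as the docstring assumes
b = pick_bucket(buckets, src_len=3, tgt_len=3)    # -> 1: too long for (2, 4)
print(b, pad([5, 6, 7], buckets[b][0]))           # 1 [5, 6, 7, 0, 0, 0, 0, 0]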
def __init__(self, source_vocab_size, target_vocab_size, buckets, size,
             num_layers, max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, use_lstm=False, num_samples=512,
             forward_only=False):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training; the model
        construction is independent of batch_size, so it can be changed after
        initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.target_vocab_size:
        with tf.device("/cpu:0"):
            w = tf.get_variable("proj_w", [size, self.target_vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [self.target_vocab_size])
            output_projection = (w, b)

        def sampled_loss(inputs, labels):
            with tf.device("/cpu:0"):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                                  num_samples,
                                                  self.target_vocab_size)

        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    single_cell = rnn_cell.GRUCell(size)
    if use_lstm:
        single_cell = rnn_cell.BasicLSTMCell(size)
    cell = single_cell
    if num_layers > 1:
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs, decoder_inputs, cell, source_vocab_size,
            target_vocab_size, output_projection=output_projection,
            feed_previous=do_decode)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]

    # Training outputs and losses.
    if forward_only:
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, buckets, self.target_vocab_size,
            lambda x, y: seq2seq_f(x, y, True),
            softmax_loss_function=softmax_loss_function)
        # If we use output projection, we need to project outputs for decoding.
        if output_projection is not None:
            for b in xrange(len(buckets)):
                self.outputs[b] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1]
                    for output in self.outputs[b]
                ]
    else:
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, buckets, self.target_vocab_size,
            lambda x, y: seq2seq_f(x, y, False),
            softmax_loss_function=softmax_loss_function)

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for b in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[b], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    self.saver = tf.train.Saver(tf.all_variables())
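# A small NumPy sketch of the output projection used for decoding above
# (tf.matmul(output, w) + b): RNN outputs of shape [batch, size] are mapped to
# logits of shape [batch, target_vocab_size]. The shapes below are
# illustrative only.
import numpy as np

batch, size, vocab = 2, 4, 7
output = np.random.randn(batch, size)        # decoder cell output
w = np.random.randn(size, vocab)             # "proj_w"
b = np.zeros(vocab)                          # "proj_b"
logits = output.dot(w) + b                   # [batch, vocab]
assert logits.shape == (batch, vocab)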
def __init__(self, vocab_size, buckets_or_sentence_length, size, num_layers,
             max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, model_type, use_lstm=True,
             num_samples=512, forward_only=False):
    """Create the model.

    This constructor can be used to create an embedded or embedded-attention,
    bucketed or non-bucketed model made of single or multi-layer RNN cells.

    Args:
      vocab_size: Size of the vocabulary.
      buckets_or_sentence_length:
        If using buckets: a list of pairs (I, O), where I specifies maximum
          input length that will be processed in that bucket, and O specifies
          maximum output length. Training instances that have inputs longer
          than I or outputs longer than O will be pushed to the next bucket
          and padded accordingly. We assume that the list is sorted, e.g.,
          [(2, 4), (8, 16)].
        Else: the maximum number of words per sentence.
      size: Number of units in each layer of the model.
      num_layers: Number of layers in the model.
      max_gradient_norm: Gradients will be clipped to maximally this norm.
      batch_size: The size of the batches used during training; the model
        construction is independent of batch_size, so it can be changed after
        initialization if this is convenient, e.g., for decoding.
      learning_rate: Learning rate to start with.
      learning_rate_decay_factor: Decay learning rate by this much when needed.
      num_samples: Number of samples for sampled softmax.
      forward_only: If set, we do not construct the backward pass in the model.
    """
    # Need to determine if we're using buckets or not:
    if type(buckets_or_sentence_length) == list:
        self.buckets = buckets_or_sentence_length
    else:
        self.max_sentence_length = buckets_or_sentence_length

    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # Summary variables. NOTE: added these.
    # self.summary_op_learning_rate = tf.scalar_summary('learning rate', self.learning_rate)
    # self.summary_op_global_step = tf.scalar_summary('global step', self.global_step)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.vocab_size:
        with tf.device("/cpu:0"):
            w = tf.get_variable("proj_w", [size, self.vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [self.vocab_size])
            output_projection = (w, b)

        def sampled_loss(inputs, labels):
            with tf.device("/cpu:0"):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                                  num_samples,
                                                  self.vocab_size)

        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    single_cell = rnn_cell.GRUCell(size)
    if use_lstm:
        single_cell = rnn_cell.BasicLSTMCell(size)
    cell = single_cell  # i, j, f, o = array_ops.split(1, 4, concat)
    if num_layers > 1:
        # cur_inp, array_ops.concat(1, new_states)
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

    # The seq2seq function: we use embedding for the input and attention
    # (if applicable).
    if model_type == 'embedding_attention':
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs, decoder_inputs, cell, vocab_size, vocab_size,
                output_projection=output_projection, feed_previous=do_decode)
    else:  # just build an embedding model; this should probably raise an error
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return seq2seq.embedding_rnn_seq2seq(
                encoder_inputs, decoder_inputs, cell, vocab_size, vocab_size,
                output_projection=output_projection, feed_previous=do_decode)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []

    # NOTE: If the model is not bucketed, these try blocks will throw an
    # AttributeError and execute code to build a non-bucketed model.
    try:
        encoder_range = self.buckets[-1][0]
        decoder_range = self.buckets[-1][1]
    except AttributeError:
        encoder_range, decoder_range = (self.max_sentence_length,
                                        self.max_sentence_length)

    for i in xrange(encoder_range):  # Last bucket is the biggest one.
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    for i in xrange(decoder_range + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]

    # Training outputs and losses.
    try:
        if forward_only:
            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs, self.decoder_inputs, targets,
                self.target_weights, self.buckets, self.vocab_size,
                lambda x, y: seq2seq_f(x, y, True),
                softmax_loss_function=softmax_loss_function)
            # If we use output projection, we need to project outputs for
            # decoding.
            if output_projection is not None:
                for b in xrange(len(self.buckets)):
                    self.outputs[b] = [
                        tf.nn.xw_plus_b(output, output_projection[0],
                                        output_projection[1])
                        for output in self.outputs[b]
                    ]
        else:
            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs, self.decoder_inputs, targets,
                self.target_weights, self.buckets, self.vocab_size,
                lambda x, y: seq2seq_f(x, y, False),
                softmax_loss_function=softmax_loss_function)
    except AttributeError:
        if forward_only:
            self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                  self.decoder_inputs[:-1],
                                                  True)
            self.losses = seq2seq.sequence_loss(
                self.outputs, targets, self.target_weights[:-1],
                self.vocab_size, softmax_loss_function=softmax_loss_function)
            # Project outputs for decoding
            if output_projection is not None:
                self.outputs = [
                    tf.nn.xw_plus_b(output, output_projection[0],
                                    output_projection[1])
                    for output in self.outputs
                ]
        else:
            self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                  self.decoder_inputs[:-1],
                                                  False)
            self.losses = seq2seq.sequence_loss(
                self.outputs, targets, self.target_weights[:-1],
                self.vocab_size, softmax_loss_function=softmax_loss_function)

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    self.params = params  # Hold onto this for Woz
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        try:
            for b in xrange(len(self.buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))
        except AttributeError:
            gradients = tf.gradients(self.losses, params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms = norm
            self.updates = opt.apply_gradients(
                zip(clipped_gradients, params), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.all_variables())
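# A tiny standalone illustration of the try/except AttributeError pattern used
# above to support both bucketed and non-bucketed construction: self.buckets
# is only set when a list was passed, so touching it on a non-bucketed model
# raises AttributeError and the code falls back to max_sentence_length. The
# _Cfg class and ranges() helper are hypothetical names for this sketch only.
class _Cfg(object):
    def __init__(self, buckets_or_sentence_length):
        if type(buckets_or_sentence_length) == list:
            self.buckets = buckets_or_sentence_length
        else:
            self.max_sentence_length = buckets_or_sentence_length

def ranges(cfg):
    try:
        return cfg.buckets[-1][0], cfg.buckets[-1][1]
    except AttributeError:
        return cfg.max_sentence_length, cfg.max_sentence_length

print(ranges(_Cfg([(2, 4), (8, 16)])))  # (8, 16)  - bucketed
print(ranges(_Cfg(25)))                 # (25, 25) - non-bucketed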
def __init__(self, vocab_size, buckets, size, num_layers, max_gradient_norm,
             batch_size, learning_rate, learning_rate_decay_factor,
             use_lstm=False, num_samples=512):
    """Create the model.

    Args:
      vocab_size: size of the source/target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training; the model
        construction is independent of batch_size, so it can be changed after
        initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
    """
    self.vocab_size = vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # Create the internal multi-layer cell for our RNN.
    single_cell = rnn_cell.GRUCell(size)
    if use_lstm:
        single_cell = rnn_cell.BasicLSTMCell(size)
    cell = single_cell
    if num_layers > 1:
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

    # The seq2seq function: we use embedding for the input and attention.
    # def seq2seq_f(encoder_inputs, decoder_inputs):
    #     return seq2seq.tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell)
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode=False):
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs, decoder_inputs, cell, vocab_size, vocab_size,
            feed_previous=False)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]

    # Training outputs and losses.
    self.outputs, self.losses = seq2seq.model_with_buckets(
        self.encoder_inputs, self.decoder_inputs, targets,
        self.target_weights, buckets, self.vocab_size,
        lambda x, y: seq2seq_f(x, y), softmax_loss_function=None)

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    self.gradient_norms = []
    self.updates = []
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    for b in xrange(len(buckets)):
        gradients = tf.gradients(self.losses[b], params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                         max_gradient_norm)
        self.gradient_norms.append(norm)
        self.updates.append(
            opt.apply_gradients(zip(clipped_gradients, params),
                                global_step=self.global_step))

    self.saver = tf.train.Saver(tf.all_variables())
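# A plain-Python sketch of what running learning_rate_decay_op repeatedly does
# in the constructors above: each run multiplies the current learning rate by
# the decay factor (the training loop, not shown here, triggers this "when
# needed", e.g. when the loss stops improving). Values are illustrative only.
learning_rate = 0.5
learning_rate_decay_factor = 0.99
for _ in range(3):
    learning_rate *= learning_rate_decay_factor
print(learning_rate)  # ~0.4851495, i.e. 0.5 * 0.99 ** 3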
def __init__(self,
             num_input_tokens, num_target_tokens,          # vocabulary sizes
             max_input_seq_length, max_target_seq_length,  # max tokens per input/output sentence
             input_word2idx, target_word2idx,              # {token: index}
             input_idx2word, target_idx2word,              # {index: token}
             NUM_HIDDEN_UNITS, NUM_HIDDEN_LAYERS,          # hidden units and hidden layers
             buckets, batch_size, learning_rate, learning_rate_decay_factor,
             max_gradient_norm,
             num_samples=512, use_lstm=False,              # sampled-softmax samples; cell type (LSTM or GRU)
             forward_only=False):
    """Information and hyperparameters needed to build the seq2seq network:

    1. Input data shaping (vocabulary size, padding, bucketing)
    2. Input/output layer lengths
    3. Hidden-layer structure (single or multi layer) and cell type (LSTM, RNN)
    4. Loss function and gradient-descent optimization algorithm
    5. Hyperparameters such as the learning rate and mini-batch size
    """
    self.num_input_tokens = num_input_tokens
    self.num_output_tokens = num_target_tokens
    self.max_input_seq_length = max_input_seq_length
    self.max_output_seq_length = max_target_seq_length
    self.input_word2idx = input_word2idx
    self.output_word2idx = target_word2idx
    self.input_idx2word = input_idx2word
    self.output_idx2word = target_idx2word
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # Output layer (projection) definition.
    output_projection = None
    softmax_loss_function = None
    # Build sampled softmax only when the number of samples (512 by default)
    # is smaller than the target vocabulary size.
    if num_samples > 0 and num_samples < self.num_output_tokens:
        with tf.device("/cpu:0"):
            # tf.get_variable returns the variable if it already exists and
            # creates it otherwise; the first argument is the variable name.
            weight_matrix = tf.get_variable(
                "proj_w", [NUM_HIDDEN_UNITS, self.num_output_tokens])
            # transpose weight_matrix
            weight_matrix_T = tf.transpose(weight_matrix)
            bias = tf.get_variable("proj_b", [self.num_output_tokens])
            # The RNN cell returns vectors of shape batch_size x size, not
            # batch_size x target_vocab_size, so to obtain logits we multiply
            # by the weight matrix and add the bias.
            # pair of weight matrix and bias vector
            output_projection = (weight_matrix, bias)

        # Loss function used with the sampled softmax.
        # inputs, labels = input_idx2word.keys(), input_idx2word.values()
        def sampled_loss(inputs, labels):
            with tf.device("/cpu:0"):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(
                    weight_matrix_T, bias, inputs, labels, num_samples,
                    self.num_output_tokens)

        softmax_loss_function = sampled_loss

    # Hidden-layer architecture: the cell type and the number of layers.
    single_cell = rnn_cell.GRUCell(NUM_HIDDEN_UNITS)
    if use_lstm:
        single_cell = rnn_cell.BasicLSTMCell(NUM_HIDDEN_UNITS)
    cell = single_cell
    if NUM_HIDDEN_LAYERS > 1:
        cell = rnn_cell.MultiRNNCell([single_cell] * NUM_HIDDEN_LAYERS)

    # Integrate each part of the neural network architecture.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs, decoder_inputs, cell, num_input_tokens,
            num_target_tokens, output_projection=output_projection,
            feed_previous=do_decode)

    # Input/output sentence shaping, part 1 (bucketing):
    # bucketing exists so that short sentences do not require an unnecessarily
    # large amount of PAD filling; the input/output lengths are fixed to a few
    # sizes (e.g. [(5, 10), (10, 15), (20, 25), (40, 50)]) and one bucket is
    # prepared per size.

    # Input layer definition.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):
        # tf.placeholder is a container for a value in the computation graph
        # that does not have to be defined in advance; arbitrary values are
        # fed in at run time, which is how each training batch is supplied to
        # the input layer.
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):
        # decoder inputs
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
        # target weights
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]

    # Training outputs and losses.
    if forward_only:
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, buckets, self.num_output_tokens,
            lambda x, y: seq2seq_f(encoder_inputs=x, decoder_inputs=y,
                                   do_decode=True),
            softmax_loss_function=softmax_loss_function)
    else:
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, buckets, self.num_output_tokens,
            lambda x, y: seq2seq_f(encoder_inputs=x, decoder_inputs=y,
                                   do_decode=False),
            softmax_loss_function=softmax_loss_function)

    # (cut off)

    # Gradient-descent setup for backpropagation.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for b in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[b], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    self.saver = tf.train.Saver(tf.all_variables())