def __init__(self, is_training, learning_rate=1.0, optimizer="sgd",
             max_grad_norm=5, num_layers=2, use_lstm=True, num_steps=35,
             num_steps_valid=120, proj_size=650, hidden_size=650,
             hidden_proj=650, num_samples=512, init_scale=0.1,
             dropout_rate=0.0, lr_decay=0.8, batch_size=20, attentive=False,
             projection_attention_f=None, output_form=lm_ops.OUTPUT_CONCAT,
             vocab_size=10000):
    """Build the RNN language-model graph.

    Creates per-timestep input/target/mask placeholders for the training
    and validation unrolls, the (optionally attentive) multi-layer RNN
    cell, the output projection, train/valid costs, and — when
    ``is_training`` is true — the gradient-clipped update op.

    Args:
        is_training: if False, construction stops after the cost ops
            (no optimizer/update ops are built).
        learning_rate: initial learning rate (stored in a TF variable).
        optimizer: name passed to ``optimization_ops.get_optimizer``.
        max_grad_norm: global-norm clipping threshold for gradients.
        num_layers: number of RNN layers.
        use_lstm: use LSTM cells if True.
        num_steps: unroll length for training.
        num_steps_valid: unroll length for validation.
        proj_size: embedding (input projection) size.
        hidden_size: RNN hidden size.
        hidden_proj: if > 0, size of the hidden-state projection; also
            selects the output-projection input width.
        num_samples: samples for sampled softmax; sampled softmax is
            only used when ``0 < num_samples < vocab_size``.
        init_scale: scale of the uniform weight initialization.
        dropout_rate: dropout applied inside the cell.
        lr_decay: multiplicative factor applied by the LR decay op.
        batch_size: training batch size (validation uses batch size 1).
        attentive: build an attentive LM (requires
            ``projection_attention_f``).
        projection_attention_f: attention projection function for the
            attentive LM.
        output_form: how the attentive LM forms its outputs.
        vocab_size: vocabulary / softmax size.
    """
    with tf.device("/gpu:0"):
        if attentive:
            assert projection_attention_f is not None

        self.batch_size = batch_size
        self.num_steps = num_steps
        self.num_steps_valid = num_steps_valid

        # Per-timestep placeholders for the training unroll.
        # NOTE(review): the lists are created under underscore names but
        # appended to via the non-underscore names — presumably
        # properties defined elsewhere on the class; confirm.
        self._input_data_train = []
        self._targets_train = []
        self.mask_train = []
        for i in xrange(num_steps):
            self.input_data_train.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="input_train{0}".format(i)))
            self.targets_train.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="target_train{0}".format(i)))
            self.mask_train.append(
                tf.placeholder(tf.float32, shape=[None],
                               name="mask_train{0}".format(i)))

        # Per-timestep placeholders for the (longer) validation unroll.
        self._input_data_valid = []
        self._targets_valid = []
        self.mask_valid = []
        for i in xrange(num_steps_valid):
            self.input_data_valid.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="input_valid{0}".format(i)))
            self.targets_valid.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="target_valid{0}".format(i)))
            self.mask_valid.append(
                tf.placeholder(tf.float32, shape=[None],
                               name="mask_valid{0}".format(i)))

        hidden_projection = None
        if hidden_proj > 0:
            hidden_projection = hidden_proj

        self.cell = cells.build_lm_multicell_rnn(
            num_layers, hidden_size, proj_size, use_lstm=use_lstm,
            hidden_projection=hidden_projection, dropout=dropout_rate)

        # Fed at run time so dropout can be switched off for evaluation.
        self.dropout_feed = tf.placeholder(tf.float32, name="dropout_rate")

        self._initial_state_train = self.cell.zero_state(batch_size,
                                                         tf.float32)
        self._initial_state_valid = self.cell.zero_state(1, tf.float32)

        # learning rate ops
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * lr_decay)

        # epoch ops
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_update_op = self.epoch.assign(self.epoch + 1)

        # samples seen ops
        self.samples_seen = tf.Variable(0, trainable=False)
        self.samples_seen_update_op = self.samples_seen.assign(
            self.samples_seen + batch_size)
        self.samples_seen_reset_op = self.samples_seen.assign(0)

        # global step variable - controlled by the model
        self.global_step = tf.Variable(0.0, trainable=False)

        # average loss / early-stopping bookkeeping
        self.current_ppx = tf.Variable(1.0, trainable=False)
        self.current_loss = tf.Variable(0.0, trainable=False)
        self.best_eval_ppx = tf.Variable(numpy.inf, trainable=False)
        self.estop_counter = tf.Variable(0, trainable=False)
        self.estop_counter_update_op = self.estop_counter.assign(
            self.estop_counter + 1)
        self.estop_counter_reset_op = self.estop_counter.assign(0)

        # BUG FIX: the original passed minval=init_scale, collapsing the
        # uniform range to the single point [init_scale, init_scale].
        # Use the symmetric range, matching the embedding init below.
        initializer = tf.random_uniform_initializer(minval=-init_scale,
                                                    maxval=init_scale,
                                                    seed=_SEED)

        # Width of the softmax input: the hidden size, or the hidden
        # projection when one is used.
        out_proj = hidden_size
        if hidden_proj > 0:
            out_proj = hidden_proj

        with tf.device("/cpu:0"):
            w = tf.get_variable("proj_w", [out_proj, vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [vocab_size])
        self.output_projection = (w, b)

        sampled_softmax = False
        loss_function = None
        # Sampled softmax only makes sense if we sample less than
        # vocabulary size.
        if 0 < num_samples < vocab_size:
            sampled_softmax = True

            def sampled_loss(logits, labels):
                # Sampled softmax is kept on the CPU alongside the
                # projection weights it gathers from.
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        w_t, b, logits, labels, num_samples, vocab_size)

            loss_function = sampled_loss

        with tf.device("/cpu:0"):
            # Inputs arrive as symbol ids; look up their embeddings
            # (embedding ops live on the CPU).
            embedding = tf.Variable(
                tf.random_uniform([vocab_size, proj_size],
                                  minval=-init_scale, maxval=init_scale),
                name="embedding")
            inputs_train = [tf.nn.embedding_lookup(embedding, i)
                            for i in self.input_data_train]
            inputs_valid = [tf.nn.embedding_lookup(embedding, i)
                            for i in self.input_data_valid]

        with tf.variable_scope("RNN", initializer=initializer):
            if attentive:
                outputs_train, state_train, _ = lm_ops.apply_attentive_lm(
                    self.cell, inputs_train,
                    sequence_length=array_ops.squeeze(
                        math_ops.add_n(self.mask_train)),
                    projection_attention_f=projection_attention_f,
                    output_form=output_form,
                    dropout=self.dropout_feed,
                    initializer=initializer,
                    dtype=tf.float32)
                outputs_valid, state_valid, _ = lm_ops.apply_attentive_lm(
                    self.cell, inputs_valid,
                    sequence_length=array_ops.squeeze(
                        math_ops.add_n(self.mask_valid)),
                    projection_attention_f=projection_attention_f,
                    output_form=output_form,
                    dropout=self.dropout_feed,
                    initializer=initializer,
                    dtype=tf.float32)
            else:
                outputs_train, state_train = lm_ops.apply_lm(
                    self.cell, inputs_train,
                    sequence_length=math_ops.add_n(self.mask_train),
                    dropout=self.dropout_feed,
                    dtype=tf.float32)
                outputs_valid, state_valid = lm_ops.apply_lm(
                    self.cell, inputs_valid,
                    sequence_length=math_ops.add_n(self.mask_valid),
                    dropout=self.dropout_feed,
                    dtype=tf.float32)

            if not sampled_softmax:
                # Full softmax: project RNN outputs to vocab-sized logits.
                logits_train = [
                    tf.nn.xw_plus_b(o, self.output_projection[0],
                                    self.output_projection[1])
                    for o in outputs_train]
                logits_valid = [
                    tf.nn.xw_plus_b(o, self.output_projection[0],
                                    self.output_projection[1])
                    for o in outputs_valid]
            else:
                # Sampled softmax projects internally, so feed it the
                # raw RNN outputs.
                logits_train = outputs_train
                logits_valid = outputs_valid

            # BUG FIX: `loss_function` was defined but never used, so the
            # sampled-softmax path fed unprojected RNN outputs to the
            # default full softmax. Pass it through explicitly; it is
            # None (default behavior, unchanged) for the full softmax.
            loss_train = seq2seq.sequence_loss_by_example(
                logits_train, self.targets_train, self.mask_train,
                average_across_timesteps=True,
                softmax_loss_function=loss_function)
            loss_valid = seq2seq.sequence_loss_by_example(
                logits_valid, self.targets_valid, self.mask_valid,
                average_across_timesteps=True,
                softmax_loss_function=loss_function)

            self._cost_train = cost = tf.reduce_sum(loss_train) / float(
                batch_size)
            self._final_state_train = state_train
            # NOTE(review): validation runs with batch size 1 (see
            # zero_state(1, ...)) yet the cost is normalized by the
            # training batch_size — confirm this is intended.
            self._cost_valid = tf.reduce_sum(loss_valid) / float(batch_size)
            self._final_state_valid = state_valid

        if not is_training:
            return

        # Gradient-clipped training op.
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          max_grad_norm)
        opt = optimization_ops.get_optimizer(optimizer, learning_rate)
        self._train_op = opt.apply_gradients(zip(grads, tvars),
                                             global_step=self.global_step)
        self._valid_op = tf.no_op()

        self.saver = tf.train.Saver(tf.all_variables())
        self.saver_best = tf.train.Saver(tf.all_variables())
def __init__(self, source_vocab_size, target_vocab_size, buckets,
             source_proj_size, target_proj_size, encoder_size, decoder_size,
             max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, optimizer='sgd', input_feeding=False,
             combine_inp_attn=False, dropout=0.0, attention_f=None,
             window_size=10, content_function=None, decoder_attention_f="None",
             num_samples=512, forward_only=False, max_len=100, cpu_only=False,
             early_stop_patience=0, save_best_model=True, dtype=tf.float32):
    """Build the attention-based NMT graph.

    Creates bucketed encoder/decoder placeholders, the bidirectional
    encoder and decoder cells, a sampled-softmax loss when
    ``0 < num_samples < target_vocab_size``, and then either a
    single-step decode path (``forward_only=True``) or per-bucket
    losses plus gradient-clipped update ops (training).
    """
    super(NMTModel, self).__init__()
    # Place the whole graph on one device; CPU is the fallback for
    # machines without a GPU.
    if cpu_only:
        device = "/cpu:0"
    else:
        device = "/gpu:0"
    with tf.device(device):
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.attention_f = attention_f
        self.content_function = content_function
        self.window_size = window_size
        self.combine_inp_attn = combine_inp_attn
        # NOTE(review): the default is the *string* "None" (not the None
        # singleton), normalized here — presumably so the value can come
        # straight from a command-line flag; confirm.
        if decoder_attention_f == "None":
            self.decoder_attention_f = None
        else:
            self.decoder_attention_f = decoder_attention_f
        # learning rate ops
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        # epoch ops
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_update_op = self.epoch.assign(self.epoch + 1)
        # samples seen ops
        self.samples_seen = tf.Variable(0, trainable=False)
        self.samples_seen_update_op = self.samples_seen.assign(
            self.samples_seen + batch_size)
        self.samples_seen_reset_op = self.samples_seen.assign(0)
        # global step variable - controled by the model
        self.global_step = tf.Variable(0.0, trainable=False)
        # average loss ops
        self.current_loss = tf.Variable(0.0, trainable=False)
        self.current_loss_update_op = None
        self.avg_loss = tf.Variable(0.0, trainable=False)
        # NOTE(review): global_step starts at 0.0, so running this op
        # before any update divides by zero (NaN) — confirm callers only
        # run it after at least one step.
        self.avg_loss_update_op = self.avg_loss.assign(
            tf.div(self.current_loss, self.global_step))
        # Early-stopping bookkeeping, only materialized when needed.
        if early_stop_patience > 0 or save_best_model:
            self.best_eval_loss = tf.Variable(numpy.inf, trainable=False)
            self.estop_counter = tf.Variable(0, trainable=False)
            self.estop_counter_update_op = self.estop_counter.assign(
                self.estop_counter + 1)
            self.estop_counter_reset_op = self.estop_counter.assign(0)
        else:
            self.best_eval_loss = None
            self.estop_counter = None
            self.estop_counter_update_op = None
            self.estop_counter_reset_op = None
        self.source_proj_size = source_proj_size
        self.target_proj_size = target_proj_size
        self.encoder_size = encoder_size
        self.decoder_size = decoder_size
        self.input_feeding = input_feeding
        self.max_len = max_len
        self.dropout = dropout
        # Fed at run time so dropout can be switched off for evaluation.
        self.dropout_feed = tf.placeholder(tf.float32, name="dropout_rate")
        self.step_num = tf.Variable(0, trainable=False)
        self.dtype = dtype
        # If we use sampled softmax, we need an output projection.
        loss_function = None
        with tf.device("/cpu:0"):
            w = tf.get_variable("proj_w",
                                [decoder_size, self.target_vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [self.target_vocab_size])
        self.output_projection = (w, b)
        self.sampled_softmax = False
        # Sampled softmax only makes sense if we sample less than
        # vocabulary size.
        if 0 < num_samples < self.target_vocab_size:
            self.sampled_softmax = True

            def sampled_loss(inputs, labels):
                # Sampled softmax stays on the CPU alongside the
                # projection weights it gathers from.
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        w_t, b, inputs, labels, num_samples,
                        self.target_vocab_size)

            loss_function = sampled_loss
        # create the embedding matrix - this must be done in the CPU for now
        with tf.device("/cpu:0"):
            self.src_embedding = tf.Variable(
                tf.truncated_normal(
                    [source_vocab_size, source_proj_size], stddev=0.01
                ),
                name='embedding_src'
            )
        # decoder with attention
        with tf.name_scope('decoder_with_attention') as scope:
            # create this variable to be used inside the
            # embedding_attention_decoder
            self.tgt_embedding = tf.Variable(
                tf.truncated_normal(
                    [target_vocab_size, target_proj_size], stddev=0.01
                ),
                name='embedding'
            )
        # Create the internal multi-layer cell for our RNN.
        self.encoder_cell_fw, self.encoder_cell_bw, self.decoder_cell = \
            cells.build_nmt_bidirectional_cell(encoder_size, decoder_size,
                                               source_proj_size,
                                               target_proj_size,
                                               dropout=dropout)

        # The seq2seq function: we use embedding for the input and
        # attention.
        def seq2seq_f(encoder_inputs, decoder_inputs):
            return self.inference(encoder_inputs, decoder_inputs)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="encoder{0}".format(i)))
        # One extra decoder slot so targets can be the inputs shifted by 1.
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32, shape=[None, ],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32, shape=[None],
                               name="weight{0}".format(i)))
        # Our targets are decoder inputs shifted by one.
        targets = [self.decoder_inputs[i + 1]
                   for i in xrange(len(self.decoder_inputs) - 1)]
        self.decoder_states_holders = None
        # Training outputs and losses.
        if forward_only:
            # self.batch_size = beam_size
            # Pad the encoder placeholders out to max_len for decoding.
            for i in xrange(len(self.encoder_inputs), self.max_len):
                self.encoder_inputs.append(
                    tf.placeholder(tf.int32, shape=[None],
                                   name="encoder{0}".format(i)))
            b_size = array_ops.shape(self.encoder_inputs[0])[0]
            # NOTE(review): an earlier comment listed four return values
            # (context, decoder_initial_state, attention_states,
            # input_length) but only three are unpacked — confirm what
            # self.encode() actually returns.
            self.ret0, self.ret1, self.ret2 = self.encode(
                self.encoder_inputs, b_size)
            self.decoder_init_plcholder = tf.placeholder(
                tf.float32, shape=[None, (target_proj_size) * 2],
                name="decoder_init")
            # shape of this placeholder: the first None indicate the
            # batch size and the second the input length
            self.attn_plcholder = tf.placeholder(
                tf.float32,
                shape=[None, self.ret2.get_shape()[1], target_proj_size],
                name="attention_states")
            # decoder_states = None
            if self.decoder_attention_f is not None:
                self.decoder_states_holders = tf.placeholder(
                    tf.float32, shape=[None, None, 1, decoder_size],
                    name="decoder_state")
                decoder_states = self.decoder_states_holders
            # Single-step decoder: only decoder_inputs[0] is fed; the
            # session loop feeds states back via the placeholders above.
            self.logits, self.states = attention_decoder_nmt(
                decoder_inputs=[self.decoder_inputs[0]],
                initial_state=self.decoder_init_plcholder,
                attention_states=self.attn_plcholder,
                cell=self.decoder_cell,
                num_symbols=target_vocab_size,
                attention_f=attention_f,
                window_size=window_size,
                content_function=content_function,
                decoder_attention_f=decoder_attention_f,
                combine_inp_attn=combine_inp_attn,
                input_feeding=input_feeding,
                dropout=self.dropout_feed,
                initializer=None,
                dtype=dtype
            )
            # If we use output projection, we need to project outputs
            # for decoding.
            self.logits = tf.nn.xw_plus_b(self.logits[-1],
                                          self.output_projection[0],
                                          self.output_projection[1])
            self.logits = nn_ops.softmax(self.logits)
        else:
            # model_with_buckets moved/changed signature after TF 0.6;
            # dispatch on the installed version.
            tf_version = pkg_resources.get_distribution("tensorflow").version
            if tf_version == "0.6.0" or tf_version == "0.5.0":
                self.outputs, self.losses = seq2seq.model_with_buckets(
                    encoder_inputs=self.encoder_inputs,
                    decoder_inputs=self.decoder_inputs,
                    targets=targets,
                    weights=self.target_weights,
                    num_decoder_symbols=self.target_vocab_size,
                    buckets=buckets,
                    seq2seq=lambda x, y: seq2seq_f(x, y),
                    softmax_loss_function=loss_function)
            else:
                self.outputs, self.losses = model_with_buckets(
                    encoder_inputs=self.encoder_inputs,
                    decoder_inputs=self.decoder_inputs,
                    targets=targets,
                    weights=self.target_weights,
                    buckets=buckets,
                    seq2seq_f=lambda x, y: seq2seq_f(x, y),
                    softmax_loss_function=loss_function)
        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            # opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            opt = optimization_ops.get_optimizer(optimizer, learning_rate)
            # NOTE(review): the loop variable `b` shadows the projection
            # bias `b` defined above; harmless here since the bias is
            # not referenced afterwards, but worth renaming.
            for b in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step))
        self.saver = tf.train.Saver(tf.all_variables())
        self.saver_best = tf.train.Saver(tf.all_variables())
def __init__(self, is_training, learning_rate=1.0, optimizer="sgd",
             max_grad_norm=5, num_layers=2, use_lstm=True, num_steps=35,
             num_steps_valid=120, proj_size=650, hidden_size=650,
             hidden_proj=650, num_samples=512, init_scale=0.1,
             dropout_rate=0.0, lr_decay=0.8, batch_size=20, attentive=False,
             projection_attention_f=None, output_form=lm_ops.OUTPUT_CONCAT,
             vocab_size=10000):
    """Build the RNN language-model graph.

    Creates per-timestep input/target/mask placeholders for the training
    and validation unrolls, the (optionally attentive) multi-layer RNN
    cell, the output projection, train/valid costs, and — when
    ``is_training`` is true — the gradient-clipped update op.

    Args:
        is_training: if False, construction stops after the cost ops
            (no optimizer/update ops are built).
        learning_rate: initial learning rate (stored in a TF variable).
        optimizer: name passed to ``optimization_ops.get_optimizer``.
        max_grad_norm: global-norm clipping threshold for gradients.
        num_layers: number of RNN layers.
        use_lstm: use LSTM cells if True.
        num_steps: unroll length for training.
        num_steps_valid: unroll length for validation.
        proj_size: embedding (input projection) size.
        hidden_size: RNN hidden size.
        hidden_proj: if > 0, size of the hidden-state projection; also
            selects the output-projection input width.
        num_samples: samples for sampled softmax; sampled softmax is
            only used when ``0 < num_samples < vocab_size``.
        init_scale: scale of the uniform weight initialization.
        dropout_rate: dropout applied inside the cell.
        lr_decay: multiplicative factor applied by the LR decay op.
        batch_size: training batch size (validation uses batch size 1).
        attentive: build an attentive LM (requires
            ``projection_attention_f``).
        projection_attention_f: attention projection function for the
            attentive LM.
        output_form: how the attentive LM forms its outputs.
        vocab_size: vocabulary / softmax size.
    """
    with tf.device("/gpu:0"):
        if attentive:
            assert projection_attention_f is not None

        self.batch_size = batch_size
        self.num_steps = num_steps
        self.num_steps_valid = num_steps_valid

        # Per-timestep placeholders for the training unroll.
        # NOTE(review): the lists are created under underscore names but
        # appended to via the non-underscore names — presumably
        # properties defined elsewhere on the class; confirm.
        self._input_data_train = []
        self._targets_train = []
        self.mask_train = []
        for i in xrange(num_steps):
            self.input_data_train.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="input_train{0}".format(i)))
            self.targets_train.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="target_train{0}".format(i)))
            self.mask_train.append(
                tf.placeholder(tf.float32, shape=[None],
                               name="mask_train{0}".format(i)))

        # Per-timestep placeholders for the (longer) validation unroll.
        self._input_data_valid = []
        self._targets_valid = []
        self.mask_valid = []
        for i in xrange(num_steps_valid):
            self.input_data_valid.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="input_valid{0}".format(i)))
            self.targets_valid.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="target_valid{0}".format(i)))
            self.mask_valid.append(
                tf.placeholder(tf.float32, shape=[None],
                               name="mask_valid{0}".format(i)))

        hidden_projection = None
        if hidden_proj > 0:
            hidden_projection = hidden_proj

        self.cell = cells.build_lm_multicell_rnn(
            num_layers, hidden_size, proj_size, use_lstm=use_lstm,
            hidden_projection=hidden_projection, dropout=dropout_rate)

        # Fed at run time so dropout can be switched off for evaluation.
        self.dropout_feed = tf.placeholder(tf.float32, name="dropout_rate")

        self._initial_state_train = self.cell.zero_state(batch_size,
                                                         tf.float32)
        self._initial_state_valid = self.cell.zero_state(1, tf.float32)

        # learning rate ops
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * lr_decay)

        # epoch ops
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_update_op = self.epoch.assign(self.epoch + 1)

        # samples seen ops
        self.samples_seen = tf.Variable(0, trainable=False)
        self.samples_seen_update_op = self.samples_seen.assign(
            self.samples_seen + batch_size)
        self.samples_seen_reset_op = self.samples_seen.assign(0)

        # global step variable - controlled by the model
        self.global_step = tf.Variable(0.0, trainable=False)

        # average loss / early-stopping bookkeeping
        self.current_ppx = tf.Variable(1.0, trainable=False)
        self.current_loss = tf.Variable(0.0, trainable=False)
        self.best_eval_ppx = tf.Variable(numpy.inf, trainable=False)
        self.estop_counter = tf.Variable(0, trainable=False)
        self.estop_counter_update_op = self.estop_counter.assign(
            self.estop_counter + 1)
        self.estop_counter_reset_op = self.estop_counter.assign(0)

        # BUG FIX: the original passed minval=init_scale, collapsing the
        # uniform range to the single point [init_scale, init_scale].
        # Use the symmetric range, matching the embedding init below.
        initializer = tf.random_uniform_initializer(minval=-init_scale,
                                                    maxval=init_scale,
                                                    seed=_SEED)

        # Width of the softmax input: the hidden size, or the hidden
        # projection when one is used.
        out_proj = hidden_size
        if hidden_proj > 0:
            out_proj = hidden_proj

        with tf.device("/cpu:0"):
            w = tf.get_variable("proj_w", [out_proj, vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [vocab_size])
        self.output_projection = (w, b)

        sampled_softmax = False
        loss_function = None
        # Sampled softmax only makes sense if we sample less than
        # vocabulary size.
        if 0 < num_samples < vocab_size:
            sampled_softmax = True

            def sampled_loss(logits, labels):
                # Sampled softmax is kept on the CPU alongside the
                # projection weights it gathers from.
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        w_t, b, logits, labels, num_samples, vocab_size)

            loss_function = sampled_loss

        with tf.device("/cpu:0"):
            # Inputs arrive as symbol ids; look up their embeddings
            # (embedding ops live on the CPU).
            embedding = tf.Variable(
                tf.random_uniform([vocab_size, proj_size],
                                  minval=-init_scale, maxval=init_scale),
                name="embedding")
            inputs_train = [tf.nn.embedding_lookup(embedding, i)
                            for i in self.input_data_train]
            inputs_valid = [tf.nn.embedding_lookup(embedding, i)
                            for i in self.input_data_valid]

        with tf.variable_scope("RNN", initializer=initializer):
            if attentive:
                outputs_train, state_train, _ = lm_ops.apply_attentive_lm(
                    self.cell, inputs_train,
                    sequence_length=array_ops.squeeze(
                        math_ops.add_n(self.mask_train)),
                    projection_attention_f=projection_attention_f,
                    output_form=output_form,
                    dropout=self.dropout_feed,
                    initializer=initializer,
                    dtype=tf.float32)
                outputs_valid, state_valid, _ = lm_ops.apply_attentive_lm(
                    self.cell, inputs_valid,
                    sequence_length=array_ops.squeeze(
                        math_ops.add_n(self.mask_valid)),
                    projection_attention_f=projection_attention_f,
                    output_form=output_form,
                    dropout=self.dropout_feed,
                    initializer=initializer,
                    dtype=tf.float32)
            else:
                outputs_train, state_train = lm_ops.apply_lm(
                    self.cell, inputs_train,
                    sequence_length=math_ops.add_n(self.mask_train),
                    dropout=self.dropout_feed,
                    dtype=tf.float32)
                outputs_valid, state_valid = lm_ops.apply_lm(
                    self.cell, inputs_valid,
                    sequence_length=math_ops.add_n(self.mask_valid),
                    dropout=self.dropout_feed,
                    dtype=tf.float32)

            if not sampled_softmax:
                # Full softmax: project RNN outputs to vocab-sized logits.
                logits_train = [
                    tf.nn.xw_plus_b(o, self.output_projection[0],
                                    self.output_projection[1])
                    for o in outputs_train]
                logits_valid = [
                    tf.nn.xw_plus_b(o, self.output_projection[0],
                                    self.output_projection[1])
                    for o in outputs_valid]
            else:
                # Sampled softmax projects internally, so feed it the
                # raw RNN outputs.
                logits_train = outputs_train
                logits_valid = outputs_valid

            # BUG FIX: `loss_function` was defined but never used, so the
            # sampled-softmax path fed unprojected RNN outputs to the
            # default full softmax. Pass it through explicitly; it is
            # None (default behavior, unchanged) for the full softmax.
            loss_train = seq2seq.sequence_loss_by_example(
                logits_train, self.targets_train, self.mask_train,
                average_across_timesteps=True,
                softmax_loss_function=loss_function)
            loss_valid = seq2seq.sequence_loss_by_example(
                logits_valid, self.targets_valid, self.mask_valid,
                average_across_timesteps=True,
                softmax_loss_function=loss_function)

            self._cost_train = cost = tf.reduce_sum(loss_train) / float(
                batch_size)
            self._final_state_train = state_train
            # NOTE(review): validation runs with batch size 1 (see
            # zero_state(1, ...)) yet the cost is normalized by the
            # training batch_size — confirm this is intended.
            self._cost_valid = tf.reduce_sum(loss_valid) / float(batch_size)
            self._final_state_valid = state_valid

        if not is_training:
            return

        # Gradient-clipped training op.
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          max_grad_norm)
        opt = optimization_ops.get_optimizer(optimizer, learning_rate)
        self._train_op = opt.apply_gradients(zip(grads, tvars),
                                             global_step=self.global_step)
        self._valid_op = tf.no_op()

        self.saver = tf.train.Saver(tf.all_variables())
        self.saver_best = tf.train.Saver(tf.all_variables())