def __init__(self, rnn_size, num_layers, batch_size, seq_length, vocab_size, grad_clip,
             infer=False):
    """
    Build the TensorFlow graph of a stacked recurrent language model.

    Note: the layers are GRU cells (rnn_cell.GRUCell), not LSTMs.

    @param rnn_size: Number of units per recurrent layer; also the embedding width
    @param num_layers: How many times the cell is repeated in the MultiRNNCell
    @param batch_size: The batch size to train with (overridden to 1 when infer)
    @param seq_length: The sequence length used in training (overridden to 1 when infer)
    @param vocab_size: The size of the vocab
    @param grad_clip: Global-norm threshold passed to tf.clip_by_global_norm
    @param infer: When truthy, batch_size and seq_length become 1 and the decoder
                  receives a loop function that feeds back its own argmax prediction
    """
    # TODO: During training (and when sampling), the input to the RNN should be
    # the list of ingredients that goes with that recipe text.
    if infer:
        batch_size, seq_length = 1, 1

    # The original author noted BasicLSTMCell as the alternative cell type.
    layer = rnn_cell.GRUCell(rnn_size)
    stacked = rnn_cell.MultiRNNCell([layer] * num_layers)
    self.cell = stacked

    self.input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
    self.targets = tf.placeholder(tf.int32, [batch_size, seq_length])
    self.initial_state = stacked.zero_state(batch_size, tf.float32)

    with tf.variable_scope("rnnlm"):
        proj_w = tf.get_variable("softmax_w", [rnn_size, vocab_size])
        proj_b = tf.get_variable("softmax_b", [vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            # One tensor per time step, with the length-1 time axis removed.
            step_inputs = [tf.squeeze(piece, [1])
                           for piece in tf.split(1, seq_length, embedded)]

    def feed_previous(prev, _):
        """Embed the argmax symbol predicted from the previous output."""
        scores = tf.matmul(prev, proj_w) + proj_b
        best_symbol = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(embedding, best_symbol)

    if infer:
        loop_function = feed_previous
    else:
        loop_function = None

    step_outputs, last_state = seq2seq.rnn_decoder(
        step_inputs, self.initial_state, stacked,
        loop_function=loop_function, scope="rnnlm")
    flat_output = tf.reshape(tf.concat(1, step_outputs), [-1, rnn_size])

    self.logits = tf.matmul(flat_output, proj_w) + proj_b
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([batch_size * seq_length])
    per_example_loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], vocab_size)

    # Summed loss divided by batch size, then by sequence length.
    self.cost = tf.reduce_sum(per_example_loss) / batch_size / seq_length
    self.final_state = last_state

    # Created at 0.0 and excluded from training.
    self.lr = tf.Variable(0.0, trainable=False)
    trainables = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(tf.gradients(self.cost, trainables), grad_clip)
    adam = tf.train.AdamOptimizer(self.lr)
    self.train_op = adam.apply_gradients(zip(clipped, trainables))
def build_graph(self, test):
    """
    Assemble the TensorFlow graph for a stacked-LSTM language model.

    Args:
        test: when truthy, self.batch_size and self.seq_len are overwritten
            with 1 and the decoder is handed a loop function that feeds back
            the embedding of its own argmax prediction.
    """
    if test:
        self.batch_size = self.seq_len = 1

    base_cell = rnn_cell.BasicLSTMCell(self.cell_size)
    self.cell = rnn_cell.MultiRNNCell([base_cell] * self.num_layers)

    self.inputs = tf.placeholder(tf.int32, [self.batch_size, self.seq_len])
    self.targets = tf.placeholder(tf.int32, [self.batch_size, self.seq_len])
    self.initial_state = self.cell.zero_state(self.batch_size, tf.float32)

    with tf.variable_scope('lstm_vars'):
        self.ws = tf.get_variable('ws', [self.cell_size, self.vocab_size])
        self.bs = tf.get_variable('bs', [self.vocab_size])
        with tf.device('/cpu:0'):
            self.embeddings = tf.get_variable(
                'embeddings', [self.vocab_size, self.cell_size])
            embedded = tf.nn.embedding_lookup(self.embeddings, self.inputs)
            # Split along axis 1 into seq_len pieces and drop that axis.
            per_step = [tf.squeeze(piece, [1])
                        for piece in tf.split(1, self.seq_len, embedded)]

    def feed_previous(prev, _):
        """Project the previous output to vocab scores and embed the argmax."""
        scores = tf.matmul(prev, self.ws) + self.bs
        best_symbol = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(self.embeddings, best_symbol)

    if test:
        loop_function = feed_previous
    else:
        loop_function = None

    step_outputs, self.final_state = seq2seq.rnn_decoder(
        per_step, self.initial_state, self.cell,
        loop_function=loop_function, scope='lstm_vars')
    flat_outputs = tf.reshape(tf.concat(1, step_outputs), [-1, self.cell_size])

    logits = tf.matmul(flat_outputs, self.ws) + self.bs
    self.probs = tf.nn.softmax(logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([self.batch_size * self.seq_len])
    per_example_loss = seq2seq.sequence_loss_by_example(
        [logits], [flat_targets], [unit_weights], self.vocab_size)
    # Summed loss divided by batch size, then by sequence length.
    self.loss = tf.reduce_sum(per_example_loss) / self.batch_size / self.seq_len

    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.optimizer = tf.train.AdamOptimizer(learning_rate=c.L_RATE,
                                            name='optimizer')
    self.train_op = self.optimizer.minimize(self.loss,
                                            global_step=self.global_step,
                                            name='train_op')
def build_graph(self):
    """
    Build the fast-weights RNN graph: embedding, unrolled cell, softmax, loss,
    accuracy, clipped Adam update and summaries.

    Reads everything from self.config (seq_length, batch_size, data_filename,
    rnn_size, input_length, vocab_size, embedding_size, grad_clip) and stores
    the resulting placeholders, tensors and ops on self.
    """
    config = self.config
    self.reader = utils.DataReader(seq_len=config.seq_length,
                                   batch_size=config.batch_size,
                                   data_filename=config.data_filename)

    self.cell = LayerNormFastWeightsBasicRNNCell(num_units=config.rnn_size)

    # The batch dimension is left open (None); zero states are sized from the
    # fed targets at run time.
    self.input_data = tf.placeholder(tf.int32, [None, config.input_length])
    self.targets = tf.placeholder(tf.int32, [None, 1])
    self.initial_state = self.cell.zero_state(
        tf.shape(self.targets)[0], tf.float32)
    self.initial_fast_weights = self.cell.zero_fast_weights(
        tf.shape(self.targets)[0], tf.float32)

    with tf.variable_scope("input_embedding"):
        embedding = tf.get_variable(
            "embedding", [config.vocab_size, config.embedding_size])
        inputs = tf.split(
            1, config.input_length,
            tf.nn.embedding_lookup(embedding, self.input_data))
        inputs = [tf.squeeze(step_input, [1]) for step_input in inputs]

    with tf.variable_scope("send_to_rnn"):
        state = (self.initial_state, self.initial_fast_weights)
        output = None
        for i, step_input in enumerate(inputs):
            # Every step after the first reuses the cell's variables.
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            output, state = self.cell(step_input, state)

    with tf.variable_scope("softmax"):
        softmax_w = tf.get_variable("softmax_w",
                                    [config.rnn_size, config.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [config.vocab_size])

    # Only the output of the last unrolled step is classified.
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    self.output = tf.cast(
        tf.reshape(tf.arg_max(self.probs, 1), [-1, 1]), tf.int32)
    self.accuracy = tf.reduce_mean(
        tf.cast(tf.equal(self.output, self.targets), tf.float32))

    flat_targets = tf.reshape(self.targets, [-1])
    # Weights follow the actual number of fed targets instead of a fixed
    # config.batch_size, so the loss agrees with the dynamically sized
    # placeholders and zero states above.
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [flat_targets],
        [tf.ones_like(flat_targets, dtype=tf.float32)],
        config.vocab_size)
    self.cost = tf.reduce_mean(loss)
    self.final_state = state

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      config.grad_clip)
    # Adam with its default learning rate.
    optimizer = tf.train.AdamOptimizer()
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    self.summary_accuracy = tf.scalar_summary('accuracy', self.accuracy)
    tf.scalar_summary('cost', self.cost)
    self.summary_all = tf.merge_all_summaries()
def __init__(self, vocabularySize, config_param):
    """
    Build a multi-layer BasicRNNCell language-model graph.

    vocabularySize: number of distinct symbols; sizes the embedding table and
        the softmax projection.
    config_param: object providing batch_size, sequence_size, embeddingSize,
        hidden_size and num_layers.
    """
    self.vocabularySize = vocabularySize
    self.config = config_param
    cfg = self.config

    self._inputX = tf.placeholder(
        tf.int32, [cfg.batch_size, cfg.sequence_size], "InputsX")
    self._inputTargetsY = tf.placeholder(
        tf.int32, [cfg.batch_size, cfg.sequence_size], "InputTargetsY")

    # Embed the input ids; these ops are pinned to the CPU device.
    with tf.device("/cpu:0"):
        embedding = tf.get_variable(
            "embedding", [self.vocabularySize, cfg.embeddingSize])
        embedded = tf.nn.embedding_lookup(embedding, self._inputX)
        # One tensor per time step, with the length-1 time axis removed.
        per_step_inputs = [
            tf.squeeze(piece, [1])
            for piece in tf.split(1, cfg.sequence_size, embedded)
        ]

    # Recurrent stack: the same BasicRNNCell repeated num_layers times.
    base_cell = rnn_cell.BasicRNNCell(cfg.hidden_size)
    self.multilayerRNN = rnn_cell.MultiRNNCell([base_cell] * cfg.num_layers)
    self._initial_state = self.multilayerRNN.zero_state(cfg.batch_size, tf.float32)

    # Unroll the network and flatten all step outputs to [-1, hidden_size].
    step_outputs, last_state = rnn.rnn(
        self.multilayerRNN, per_step_inputs, initial_state=self._initial_state)
    flat_hidden = tf.reshape(tf.concat(1, step_outputs), [-1, cfg.hidden_size])

    # Logits and softmax predictions.
    softmax_w = tf.get_variable("softmax_w", [cfg.hidden_size, self.vocabularySize])
    softmax_b = tf.get_variable("softmax_b", [self.vocabularySize])
    self._logits = tf.nn.xw_plus_b(flat_hidden, softmax_w, softmax_b)
    self._predictionSoftmax = tf.nn.softmax(self._logits)

    # Loss: every position weighted 1; the sum is divided by batch size only.
    flat_targets = tf.reshape(self._inputTargetsY, [-1])
    unit_weights = tf.ones([cfg.batch_size * cfg.sequence_size])
    per_example_loss = seq2seq.sequence_loss_by_example(
        [self._logits], [flat_targets], [unit_weights], self.vocabularySize)
    self._cost = tf.div(tf.reduce_sum(per_example_loss), cfg.batch_size)
    self._final_state = last_state
def __init__(self, args, infer=False):
    """
    Build an RNN language-model graph with an optional attention decoder.

    Args:
        args: configuration object. Reads rnncell ('rnn' | 'gru' | 'lstm'),
            rnn_size, num_layers, batch_size, seq_length, vocab_size,
            embedding_size, attention, grad_clip and log_dir.
            NOTE: when infer is truthy, args.batch_size and args.seq_length
            are overwritten with 1 on the caller's object.
        infer: when truthy, the decoder receives a loop function that feeds
            back the embedding of its own argmax prediction.

    Raises:
        Exception: if args.rnncell is not one of the supported cell types.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    if args.rnncell == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.rnncell == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.rnncell == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("rnncell type not supported: {}".format(args.rnncell))

    cell = cell_fn(args.rnn_size)
    self.cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

    self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

    # Attention memory is fed from outside; its dimensions are fixed here.
    self.attn_length = 5
    self.attn_size = 32
    self.attention_states = tf.placeholder(
        tf.float32, [args.batch_size, self.attn_length, self.attn_size])

    with tf.variable_scope('rnnlm'):
        softmax_w = build_weight([args.rnn_size, args.vocab_size], name='soft_w')
        softmax_b = build_weight([args.vocab_size], name='soft_b')
        word_embedding = build_weight([args.vocab_size, args.embedding_size],
                                      name='word_embedding')
        inputs_list = tf.split(
            1, args.seq_length,
            tf.nn.embedding_lookup(word_embedding, self.input_data))
        inputs_list = [tf.squeeze(input_, [1]) for input_ in inputs_list]

    def loop(prev, _):
        """Embed the argmax symbol predicted from the previous output."""
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        # Fixed: this used the undefined name `embedding`, so building the
        # graph with infer=True raised NameError.
        return tf.nn.embedding_lookup(word_embedding, prev_symbol)

    loop_function = loop if infer else None
    if not args.attention:
        outputs, last_state = seq2seq.rnn_decoder(
            inputs_list, self.initial_state, self.cell,
            loop_function=loop_function, scope='rnnlm')
    else:
        outputs, last_state = attention_decoder(
            inputs_list, self.initial_state, self.attention_states, self.cell,
            loop_function=loop_function, scope='rnnlm')

    self.final_state = last_state
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])],
        args.vocab_size)

    # Average loss for each word of each timestep.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length

    # Created at 0.0 and excluded from training.
    self.lr = tf.Variable(0.0, trainable=False)
    self.var_trainable_op = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, self.var_trainable_op), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, self.var_trainable_op))
    self.initial_op = tf.global_variables_initializer()

    # Log file name: log_dir + timestamp with spaces and slashes removed.
    timestamp = datetime.datetime.strftime(datetime.datetime.now(),
                                           '%Y-%m-%d %H:%M:%S')
    self.logfile = args.log_dir + str(timestamp + '.txt').replace(' ', '').replace('/', '')

    self.var_op = tf.global_variables()
    self.saver = tf.train.Saver(self.var_op, max_to_keep=4,
                                keep_checkpoint_every_n_hours=1)
def __init__(self, args, infer=False):
    """
    Build an RNN language-model graph with optional dropout.

    Args:
        args: configuration object. Reads model ('rnn' | 'gru' | 'lstm'),
            rnn_size, num_layers, batch_size, seq_length, vocab_size,
            dropout and grad_clip. args.dropout is treated as a drop
            probability (the cell wrapper keeps 1.0 - args.dropout).
            NOTE: when infer is truthy, args.batch_size and args.seq_length
            are overwritten with 1 on the caller's object.
        infer: when truthy, dropout and the training ops are skipped and the
            decoder feeds back its own argmax prediction.

    Raises:
        Exception: if args.model is not one of the supported cell types.
    """
    self.args = args
    training = not infer
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    if args.model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    use_dropout = training and args.dropout > 0
    keep_prob = 1.0 - args.dropout

    cell = cell_fn(args.rnn_size)
    if use_dropout:
        cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)
    self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

    self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            self.embedding = tf.get_variable("embedding",
                                             [args.vocab_size, args.rnn_size])
            inputs = tf.nn.embedding_lookup(self.embedding, self.input_data)
            if use_dropout:
                # Fixed: tf.nn.dropout takes a KEEP probability. Passing
                # args.dropout directly inverted the rate relative to the
                # cell wrapper above.
                inputs = tf.nn.dropout(inputs, keep_prob)
        inputs = tf.split(1, args.seq_length, inputs)
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop(prev, _):
        """Embed the argmax symbol predicted from the previous output."""
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(self.embedding, prev_symbol)

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if infer else None, scope='rnnlm')
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])],
        args.vocab_size)
    # Summed loss divided by batch size, then by sequence length.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # Training ops are only created when not sampling.
    if not infer:
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          args.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def build_model(self):
    """
    Build the captioning graph: image embedding, unrolled RNN decoder, masked
    sequence loss, and two separately clipped Adam updates (one for variables
    under scope 'rnnlm', one for variables under scope 'vgg16').

    Reads self.images, self.fc7, self.labels, self.masks, the projection and
    embedding variables, self.cell, self.seq_length, self.seq_per_img,
    self.input_encoding_size and self.opt.grad_clip.
    """
    with tf.name_scope("batch_size"):
        self.batch_size = tf.shape(self.images)[0]

    n_steps = self.seq_length + 1

    with tf.variable_scope("rnnlm"):
        image_emb = tf.matmul(self.fc7, self.encode_img_W) + self.encode_img_b

        # Replicate self.seq_per_img times for each image embedding
        tiled = tf.tile(tf.expand_dims(image_emb, 1), [1, self.seq_per_img, 1])
        image_emb = tf.reshape(
            tiled,
            [self.batch_size * self.seq_per_img, self.input_encoding_size])

        word_emb = tf.nn.embedding_lookup(self.Wemb, self.labels[:, :n_steps])
        word_steps = [tf.squeeze(piece, [1])
                      for piece in tf.split(1, n_steps, word_emb)]
        # The image embedding is the first decoder input.
        rnn_inputs = [image_emb] + word_steps

        initial_state = self.cell.zero_state(
            self.batch_size * self.seq_per_img, tf.float32)

        outputs, last_state = seq2seq.rnn_decoder(
            rnn_inputs, initial_state, self.cell, loop_function=None)

        # The output produced for the image step is dropped.
        self.logits = [tf.matmul(step_out, self.embed_word_W) + self.embed_word_b
                       for step_out in outputs[1:]]

    with tf.variable_scope("loss"):
        # self.labels[:, 1:] is the target
        target_steps = [tf.squeeze(piece, [1])
                        for piece in tf.split(1, n_steps, self.labels[:, 1:])]
        mask_steps = [tf.squeeze(piece, [1])
                      for piece in tf.split(1, n_steps, self.masks[:, 1:])]
        loss = seq2seq.sequence_loss_by_example(
            self.logits, target_steps, mask_steps)
        self.cost = tf.reduce_mean(loss)

    self.final_state = last_state
    self.lr = tf.Variable(0.0, trainable=False)
    self.cnn_lr = tf.Variable(0.0, trainable=False)

    # Collect the rnn variables, and create the optimizer of rnn
    tvars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='rnnlm')
    optimizer = tf.train.AdamOptimizer(self.lr, beta1=0.8)
    grads = optimizer.compute_gradients(self.cost, tvars)
    # Element-wise clipping; pairs without a gradient are dropped.
    grads_cliped = []
    for grad, var in grads:
        if grad is not None:
            clipped = tf.clip_by_value(
                grad, -self.opt.grad_clip, self.opt.grad_clip)
            grads_cliped.append((clipped, var))
    self.train_op = optimizer.apply_gradients(grads_cliped)

    # Collect the cnn variables, and create the optimizer of cnn
    cnn_tvars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='vgg16')
    cnn_optimizer = tf.train.AdamOptimizer(self.cnn_lr, beta1=0.8)
    cnn_grads = cnn_optimizer.compute_gradients(self.cost, cnn_tvars)
    cnn_grads_cliped = []
    for grad, var in cnn_grads:
        if grad is not None:
            clipped = tf.clip_by_value(
                grad, -self.opt.grad_clip, self.opt.grad_clip)
            cnn_grads_cliped.append((clipped, var))
    self.cnn_train_op = cnn_optimizer.apply_gradients(cnn_grads_cliped)

    tf.scalar_summary('training loss', self.cost)
    tf.scalar_summary('learning rate', self.lr)
    tf.scalar_summary('cnn learning rate', self.cnn_lr)

    self.summaries = tf.merge_all_summaries()
def __init__(self, args, infer=False):
    """
    Build an RNN language-model graph that also reports the loss in bits.

    Args:
        args: configuration object. Reads model ('rnn' | 'gru' | 'lstm'),
            rnn_size, num_layers, batch_size, seq_length, vocab_size and
            grad_clip.
            NOTE: when infer is truthy, args.batch_size and args.seq_length
            are overwritten with 1 on the caller's object.
        infer: when truthy, the decoder feeds back its own argmax prediction.

    Raises:
        Exception: if args.model is not one of the supported cell types.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    # Fixed: state_is_tuple is an LSTM-only constructor argument. Passing it
    # to BasicRNNCell or GRUCell raised TypeError, so only the 'lstm' model
    # could ever be built.
    cell_kwargs = {}
    if args.model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
        cell_kwargs['state_is_tuple'] = True
    else:
        raise Exception("model type not supported: {}".format(args.model))

    cell = cell_fn(args.rnn_size, **cell_kwargs)
    self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers,
                                             state_is_tuple=True)

    self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [args.vocab_size, args.rnn_size])
            inputs = tf.split(1, args.seq_length,
                              tf.nn.embedding_lookup(embedding, self.input_data))
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop(prev, _):
        """Embed the argmax symbol predicted from the previous output."""
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if infer else None, scope='rnnlm')
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])],
        args.vocab_size)
    # Summed loss divided by batch size, then by sequence length.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length

    # Cost divided by ln(2).
    self.empirical_entropy = self.cost / np.log(2)
    # The tag spelling is kept as-is so existing summary logs stay comparable.
    tf.summary.scalar('Empircal_Entropy', self.empirical_entropy)

    self.final_state = last_state
    # Created at 0.0 and excluded from training.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    self.merged_summaries = tf.summary.merge_all()
def __init__(self, config):
    """
    Build a stacked-LSTM language-model graph trained with plain SGD.

    config: object providing batch_size, seq_length, hidden_size, vocab_size,
        num_layers and max_grad_norm.
    """
    self.batch_size = config.batch_size
    self.seq_length = config.seq_length
    hidden = config.hidden_size
    n_symbols = config.vocab_size

    self._input_data = tf.placeholder(tf.int32, [self.batch_size, self.seq_length])
    self._targets = tf.placeholder(tf.int32, [self.batch_size, self.seq_length])

    # Recurrent stack: the same BasicLSTMCell repeated num_layers times.
    base_cell = rnn_cell.BasicLSTMCell(hidden, state_is_tuple=True)
    self.cells = rnn_cell.MultiRNNCell([base_cell] * config.num_layers,
                                       state_is_tuple=True)
    self._initial_state = self.cells.zero_state(self.batch_size, tf.float32)

    # Embed the input ids; these ops are pinned to the CPU device.
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [n_symbols, hidden])
        embedded = tf.nn.embedding_lookup(embedding, self._input_data)
        # One tensor per time step, with the length-1 time axis removed.
        per_step_inputs = [
            tf.squeeze(piece, [1])
            for piece in tf.split(1, self.seq_length, embedded)
        ]

    # Softmax projection parameters.
    softmax_w = tf.get_variable("softmax_w", [hidden, n_symbols])
    softmax_b = tf.get_variable("softmax_b", [n_symbols])

    # Unroll the network and flatten all step outputs to [-1, hidden].
    step_outputs, last_state = rnn.rnn(
        self.cells, per_step_inputs, initial_state=self._initial_state)
    flat_hidden = tf.reshape(tf.concat(1, step_outputs), [-1, hidden])
    self._logits = tf.nn.xw_plus_b(flat_hidden, softmax_w, softmax_b)
    self._predictionSoftmax = tf.nn.softmax(self._logits)

    # Loss: every position weighted 1; the sum is divided by batch size only.
    flat_targets = tf.reshape(self._targets, [-1])
    unit_weights = tf.ones([self.batch_size * self.seq_length])
    per_example_loss = seq2seq.sequence_loss_by_example(
        [self._logits], [flat_targets], [unit_weights], n_symbols)
    self._cost = tf.div(tf.reduce_sum(per_example_loss), self.batch_size)
    self._final_state = last_state

    # Gradient descent with global-norm clipping; the rate variable is
    # created at 0.0 and excluded from training.
    self._learning_rate = tf.Variable(0.0, trainable=False)
    trainables = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(tf.gradients(self._cost, trainables),
                                        config.max_grad_norm)
    sgd = tf.train.GradientDescentOptimizer(self._learning_rate)
    self._train_op = sgd.apply_gradients(zip(clipped, trainables))
def __init__(self, args, infer=False):
    """
    Build a stacked-LSTM language-model graph.

    args: configuration object providing rnn_size, num_layers, batch_size,
        seq_length, vocab_size and grad_clip. When infer is truthy,
        args.batch_size and args.seq_length are overwritten with 1 on the
        caller's object.
    infer: when truthy, the decoder feeds back its own argmax prediction.
    """
    self.args = args

    # When sampling, the batch size and sequence length are both 1.
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    # Internal cell structure: one BasicLSTMCell repeated num_layers times.
    layer = rnn_cell.BasicLSTMCell(args.rnn_size, state_is_tuple=True)
    stacked = rnn_cell.MultiRNNCell([layer] * args.num_layers,
                                    state_is_tuple=True)
    self.cell = stacked

    # Input/target placeholders and an all-zero initial state.
    self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.initial_state = stacked.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        proj_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])  # final weights
        proj_b = tf.get_variable("softmax_b", [args.vocab_size])  # final bias
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            # One tensor per time step, with the length-1 time axis removed.
            step_inputs = [tf.squeeze(piece, [1])
                           for piece in tf.split(1, args.seq_length, embedded)]

    def feed_previous(prev, _):
        """Embed the argmax symbol predicted from the previous output."""
        scores = tf.matmul(prev, proj_w) + proj_b
        best_symbol = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(embedding, best_symbol)

    if infer:
        loop_function = feed_previous
    else:
        loop_function = None

    step_outputs, last_state = seq2seq.rnn_decoder(
        step_inputs, self.initial_state, stacked,
        loop_function=loop_function, scope='rnnlm')
    flat_output = tf.reshape(tf.concat(1, step_outputs), [-1, args.rnn_size])

    self.logits = tf.matmul(flat_output, proj_w) + proj_b
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size * args.seq_length])
    per_example_loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)
    # Summed loss divided by batch size, then by sequence length.
    self.cost = tf.reduce_sum(per_example_loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # Created at 0.0 and excluded from training.
    self.lr = tf.Variable(0.0, trainable=False)
    trainables = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(tf.gradients(self.cost, trainables),
                                        args.grad_clip)
    adam = tf.train.AdamOptimizer(self.lr)
    self.train_op = adam.apply_gradients(zip(clipped, trainables))
def __init__(self, args, data, infer=False):
    """
    Build a stacked-LSTM language-model graph on top of tf.nn.dynamic_rnn,
    including loss / learning-rate / gradient summaries.

    Args:
        args: configuration object. Reads batch_size, seq_length, state_size,
            num_layers and grad_clip.
            NOTE: when infer is truthy, args.batch_size and args.seq_length
            are overwritten with 1 on the caller's object.
        data: object providing vocab_size.
        infer: when truthy, build the graph for batch size 1 and length 1.
    """
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    with tf.name_scope('inputs'):
        self.input_data = tf.placeholder(
            tf.int32, [args.batch_size, args.seq_length])
        self.target_data = tf.placeholder(
            tf.int32, [args.batch_size, args.seq_length])

    with tf.name_scope('model'):
        self.cell = rnn_cell.BasicLSTMCell(args.state_size)
        self.cell = rnn_cell.MultiRNNCell([self.cell] * args.num_layers)
        self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

        with tf.variable_scope('rnnlm'):
            w = tf.get_variable('softmax_w', [args.state_size, data.vocab_size])
            b = tf.get_variable('softmax_b', [data.vocab_size])
            with tf.device("/cpu:0"):
                embedding = tf.get_variable(
                    'embedding', [data.vocab_size, args.state_size])
                inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        outputs, last_state = tf.nn.dynamic_rnn(
            self.cell, inputs, initial_state=self.initial_state)

    with tf.name_scope('loss'):
        output = tf.reshape(outputs, [-1, args.state_size])
        self.logits = tf.matmul(output, w) + b
        self.probs = tf.nn.softmax(self.logits)
        self.last_state = last_state

        targets = tf.reshape(self.target_data, [-1])
        loss = seq2seq.sequence_loss_by_example(
            [self.logits], [targets],
            [tf.ones_like(targets, dtype=tf.float32)])
        # Summed loss divided by batch size only.
        self.cost = tf.reduce_sum(loss) / args.batch_size
        tf.scalar_summary('loss', self.cost)

    with tf.name_scope('optimize'):
        # The learning rate is fed at run time.
        self.lr = tf.placeholder(tf.float32, [])
        tf.scalar_summary('learning_rate', self.lr)

        optimizer = tf.train.AdamOptimizer(self.lr)
        tvars = tf.trainable_variables()
        grads = tf.gradients(self.cost, tvars)
        for g in grads:
            # tf.gradients yields None for variables the cost does not
            # depend on; skip those instead of failing on g.name.
            if g is not None:
                tf.histogram_summary(g.name, g)
        grads, _ = tf.clip_by_global_norm(grads, args.grad_clip)

        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
        self.merged_op = tf.merge_all_summaries()
def __init__(self, args, infer=False):
    """
    Build an RNN language-model graph over variable-length sequences using
    tf.nn.dynamic_rnn.

    Args:
        args: configuration object. Reads model ('rnn' | 'gru' | 'lstm'),
            rnn_size, num_layers, batch_size, vocab_size and grad_clip.
            NOTE: when infer is truthy, args.batch_size is overwritten with 1
            on the caller's object.
        infer: when truthy, build the graph for batch size 1.

    Raises:
        Exception: if args.model is not one of the supported cell types.
    """
    self.args = args
    if infer:
        args.batch_size = 1

    # Fixed: state_is_tuple is an LSTM-only constructor argument. Passing it
    # to BasicRNNCell or GRUCell raised TypeError, so only the 'lstm' model
    # could ever be built.
    cell_kwargs = {}
    if args.model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
        cell_kwargs['state_is_tuple'] = False
    else:
        raise Exception("model type not supported: {}".format(args.model))

    cell = cell_fn(args.rnn_size, **cell_kwargs)
    self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers,
                                             state_is_tuple=False)

    # The length of the input sequence is variable (second dimension None).
    self.input_data = tf.placeholder(tf.int32, [args.batch_size, None])
    self.targets = tf.placeholder(tf.int32, [args.batch_size, None])
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [args.vocab_size, args.rnn_size])
            inputs = tf.nn.embedding_lookup(embedding, self.input_data)

    outputs, last_state = tf.nn.dynamic_rnn(
        cell, inputs, initial_state=self.initial_state, scope='rnnlm')
    output = tf.reshape(outputs, [-1, args.rnn_size])

    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    targets = tf.reshape(self.targets, [-1])
    # Weights are derived from the targets because the length is dynamic.
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [targets],
        [tf.ones_like(targets, dtype=tf.float32)],
        args.vocab_size)
    self.cost = tf.reduce_mean(loss)
    self.final_state = last_state

    # Created at 0.0 and excluded from training.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, rnn_size, num_layers, batch_size, seq_length,
             vocabulary_size, gradient_clip, sample=False):
    """
    Build a stacked-LSTM language-model graph.

    Args:
        rnn_size: units per LSTM layer; also the embedding width.
        num_layers: how many times the LSTM cell is repeated.
        batch_size: first dimension of the input/target placeholders.
        seq_length: second dimension of the placeholders (unrolled steps).
        vocabulary_size: number of distinct symbols.
        gradient_clip: global-norm threshold for tf.clip_by_global_norm.
        sample: when truthy, the decoder feeds back its own argmax
            prediction. batch_size and seq_length are NOT changed here.
    """
    lstm_cell = rnn_cell.BasicLSTMCell(num_units=rnn_size)

    # The RNN cell is built from multiple LSTM cells by repeating lstm_cell.
    self.cell = rnn_cell.MultiRNNCell([lstm_cell] * num_layers)

    # All-zero initial state.
    self.initial_state = self.cell.zero_state(batch_size, tf.float32)

    # Placeholders for the input ids and their targets.
    self.input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
    self.targets = tf.placeholder(tf.int32, [batch_size, seq_length])

    # The variable scope namespaces the variable names created below.
    with tf.variable_scope(VARIABLE_SCOPE):
        softmax_w = tf.get_variable("softmax_w", [rnn_size, vocabulary_size])
        softmax_b = tf.get_variable("softmax_b", [vocabulary_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocabulary_size, rnn_size])
            inputs = tf.split(1, seq_length,
                              tf.nn.embedding_lookup(embedding, self.input_data))
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop_function(prev, _):
        """Embed the argmax symbol predicted from the previous output."""
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, self.cell,
        loop_function=loop_function if sample else None,
        scope=VARIABLE_SCOPE)

    # Fixed: `tf.result_sentencehape` (a mangled rename of `tf.reshape`) does
    # not exist and raised AttributeError, here and in the loss below.
    output = tf.reshape(tf.concat(1, outputs), [-1, rnn_size])

    # Logits and probabilities for every position.
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probabilities = tf.nn.softmax(self.logits)

    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([batch_size * seq_length])],
        vocabulary_size)
    # Summed loss divided by batch size, then by sequence length.
    self.cost = tf.reduce_sum(loss) / batch_size / seq_length
    self.final_state = last_state

    # Created at 0.0 and excluded from training.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      gradient_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, config=None, mode=None):
    """
    Build a single-layer LSTM graph that predicts one target symbol from the
    output of the last unrolled step.

    Args:
        config: configuration object. Reads seq_length, batch_size,
            data_filename, rnn_size, input_length, vocab_size and grad_clip.
            Although it defaults to None, its attributes are read
            immediately, so a real config must be supplied.
        mode: stored on self.mode; not otherwise used in this constructor.
    """
    self.config = config
    self.mode = mode

    self.reader = utils.DataReader(seq_len=config.seq_length,
                                   batch_size=config.batch_size,
                                   data_filename=config.data_filename)

    self.cell = rnn_cell.BasicLSTMCell(config.rnn_size, state_is_tuple=True)

    # The batch dimension is left open (None); the zero state is sized from
    # the fed targets at run time.
    self.input_data = tf.placeholder(tf.int32, [None, config.input_length])
    self.targets = tf.placeholder(tf.int32, [None, 1])
    self.initial_state = self.cell.zero_state(tf.shape(self.targets)[0],
                                              tf.float32)

    with tf.variable_scope("input_embedding"):
        embedding = tf.get_variable("embedding",
                                    [config.vocab_size, config.rnn_size])
        inputs = tf.split(1, config.input_length,
                          tf.nn.embedding_lookup(embedding, self.input_data))
        inputs = [tf.squeeze(step_input, [1]) for step_input in inputs]

    with tf.variable_scope("send_to_rnn"):
        state = self.initial_state
        output = None
        for i, step_input in enumerate(inputs):
            # Every step after the first reuses the cell's variables.
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            output, state = self.cell(step_input, state)

    with tf.variable_scope("softmax"):
        softmax_w = tf.get_variable("softmax_w",
                                    [config.rnn_size, config.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [config.vocab_size])

    # Only the output of the last unrolled step is classified.
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    # Weights follow the actual number of fed targets instead of a fixed
    # config.batch_size, so the loss agrees with the dynamically sized
    # placeholders and zero state above.
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [flat_targets],
        [tf.ones_like(flat_targets, dtype=tf.float32)],
        config.vocab_size)
    self.cost = tf.reduce_mean(loss)
    self.final_state = state

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      config.grad_clip)
    # Adam with its default learning rate.
    optimizer = tf.train.AdamOptimizer()
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def train_neural_network():
    """
    Train the graph returned by neural_network() for 50 epochs and, after
    epoch 3, print one prediction for the first training example.

    Uses module-level names: output_targets, input_data, datalen, n_chunk,
    x_batches, y_batches, datas and GetPredata. A checkpoint named
    'thundermodule' is saved whenever epoch % 7 == 0.
    """
    logits, last_state, probs, cell, initial_state, _inputs = neural_network()

    targets = tf.reshape(output_targets, [-1])
    loss = seq2seq.sequence_loss_by_example(
        [logits], [targets],
        [tf.ones_like(targets, dtype=tf.float32)], datalen)
    cost = tf.reduce_mean(loss)

    learning_rate = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(zip(grads, tvars))

    # Build the learning-rate update once. Creating tf.assign inside the
    # epoch loop added new nodes to the graph on every epoch.
    new_learning_rate = tf.placeholder(tf.float32, shape=[])
    update_learning_rate = tf.assign(learning_rate, new_learning_rate)

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        saver = tf.train.Saver(tf.all_variables())

        for epoch in range(50):
            # Exponential decay: 0.002 * 0.97 ** epoch.
            sess.run(update_learning_rate,
                     feed_dict={new_learning_rate: 0.002 * (0.97 ** epoch)})

            for batche in range(n_chunk):
                train_loss, _, _ = sess.run(
                    [cost, last_state, train_op],
                    feed_dict={input_data: x_batches[batche],
                               output_targets: y_batches[batche]})
                print(epoch, batche, train_loss)

            if epoch % 7 == 0:
                saver.save(sess, 'thundermodule', global_step=epoch)

            if epoch == 3:
                # Run the model on a single example from a zero state.
                # NOTE(review): reshape(1, 5) hard-codes a sequence length of
                # 5 — confirm it matches the input_data placeholder.
                state_ = sess.run(cell.zero_state(1, tf.float32))
                probs_, state_ = sess.run(
                    [probs, last_state],
                    feed_dict={
                        input_data: np.array(x_batches[0][0]).reshape(1, 5),
                        initial_state: state_,
                    })
                out = GetPredata(probs_, datas)
                # Call form works on both Python 2 and 3; the old
                # `print out` statement was Python-2-only syntax.
                print(out)
def __init__(self, args, infer=False):
    """
    Build a stacked-LSTM language-model graph on top of tf.nn.dynamic_rnn.

    Args:
        args: configuration object. Reads batch_size, seq_length, state_size,
            rnn_size, num_layers, vocab_size and grad_clip.
            NOTE: when infer is truthy, args.batch_size and args.seq_length
            are overwritten with 1 on the caller's object.
        infer: when truthy, build the graph for batch size 1 and length 1.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    # Fixed: the cell construction, the self.cell assignment and the
    # optimizer construction further down were hidden behind stray `#`
    # markers, yet `cell`, `self.cell` and `optimizer` are all used below
    # (NameError / AttributeError). They are live statements again.
    # NOTE(review): the cell is sized by args.state_size while the softmax,
    # embedding and reshape use args.rnn_size — confirm the two are equal.
    cell = rnn_cell.BasicLSTMCell(args.state_size)
    self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

    self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        w = tf.get_variable('softmax_w', [args.rnn_size, args.vocab_size])
        b = tf.get_variable('softmax_b', [args.vocab_size])
        with tf.device('/cpu:0'):
            embedding = tf.get_variable('embedding',
                                        [args.vocab_size, args.rnn_size])
            inputs = tf.nn.embedding_lookup(embedding, self.input_data)

    outputs, last_state = tf.nn.dynamic_rnn(
        self.cell, inputs, initial_state=self.initial_state, scope='rnnlm')
    output = tf.reshape(outputs, [-1, args.rnn_size])

    self.logits = tf.matmul(output, w) + b
    self.probs = tf.nn.softmax(self.logits)

    targets = tf.reshape(self.targets, [-1])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [targets],
        [tf.ones_like(targets, dtype=tf.float32)])
    self.cost = tf.reduce_mean(loss)
    self.last_state = last_state

    # Created at 0.0 and excluded from training.
    self.lr = tf.Variable(0.0, trainable=False)
    optimizer = tf.train.AdamOptimizer(self.lr)

    tvars = tf.trainable_variables()
    grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(grads, args.grad_clip)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args):
    """Build the graph for an LSTM sequence classifier (scope ``rnn_audio``).

    Args:
        args: Hyper-parameter object. Reads ``hidden``, ``num_layers``,
            ``batch_size``, ``seq_length``, ``seq_dim`` and ``num_classes``.

    The float input placeholder has shape
    ``[batch_size, seq_length, seq_dim]`` and the integer label placeholder
    has shape ``[batch_size]``. Only the output of the final time step is
    projected to class logits.
    """
    self.args = args
    self.dropout = tf.Variable(
        initial_value=0, trainable=False, dtype=tf.float32)

    lstm = rnn_cell.LSTMCell(args.hidden, state_is_tuple=True)
    cell = rnn_cell.MultiRNNCell(
        [lstm] * args.num_layers, state_is_tuple=True)
    # NOTE(review): the dropout-wrapped cell is stored on self.cell, but the
    # decoder below is built from the unwrapped `cell` -- confirm whether
    # dropout is meant to be active in the forward pass.
    self.cell = tf.nn.rnn_cell.DropoutWrapper(
        cell, output_keep_prob=self.dropout)

    input_shape = [args.batch_size, args.seq_length, args.seq_dim]
    self.input_data = tf.placeholder(tf.float32, input_shape)
    self.output_data = tf.placeholder(tf.int32, [args.batch_size])
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnn_audio'):
        rnn_weights = tf.get_variable(
            "rnn_weights", [args.hidden, args.num_classes])
        rnn_bias = tf.get_variable("rnn_bias", [args.num_classes])
        with tf.device("/cpu:0"):
            # One [batch_size, seq_dim] tensor per time step.
            steps = tf.split(1, args.seq_length, self.input_data)
            inputs = [tf.squeeze(step, [1]) for step in steps]

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell, scope='rnn_audio')

    # Classify from the last time step only.
    final_output = outputs[-1]
    self.logits = tf.matmul(final_output, rnn_weights) + rnn_bias
    self.probabilities = tf.nn.softmax(self.logits)

    example_weights = tf.ones([args.batch_size])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [self.output_data], [example_weights],
        args.num_classes)
    self.cost = tf.reduce_mean(loss)
    self.final_state = last_state

    self.lr = tf.Variable(0.0, trainable=False)
    train_vars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, train_vars)
    grads, _ = tf.clip_by_global_norm(raw_grads, 5)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, train_vars))
def _init_tensorflow(self, infer: bool = False):
    """
    Deferred importing of tensorflow and initializing model for training
    or sampling.

    This is necessary for two reasons: first, the tensorflow graph is
    different for training and inference, so must be reset when switching
    between modes. Second, importing tensorflow takes a long time, so we
    only want to do it if we actually need to.

    Arguments:
        infer (bool): If True, initialize model for inference. If False,
            initialize model for training.

    Returns:
        module: imported TensorFlow module

    Raises:
        clgen.UserError: If ``self.model_type`` is not one of "lstm",
            "gru" or "rnn".
    """
    import tensorflow as tf
    from tensorflow.python.ops import rnn_cell
    from tensorflow.python.ops import seq2seq

    # Use self.tensorflow_state to mark whether or not model is configured
    # for training or inference. The attribute does not exist until the
    # first successful build, hence the AttributeError guard.
    try:
        if self.tensorflow_state == infer:
            return tf
    except AttributeError:
        pass

    self.cell_fn = {
        "lstm": rnn_cell.BasicLSTMCell,
        "gru": rnn_cell.GRUCell,
        "rnn": rnn_cell.BasicRNNCell
    }.get(self.model_type, None)
    if self.cell_fn is None:
        raise clgen.UserError("Unrecognized model type")

    # reset the graph when switching between training and inference
    tf.reset_default_graph()

    # corpus info: inference consumes a single token at a time
    batch_size = 1 if infer else self.corpus.batch_size
    seq_length = 1 if infer else self.corpus.seq_length
    vocab_size = self.corpus.vocab_size

    fs.mkdir(self.cache.path)

    # Only the LSTM cell constructor takes `state_is_tuple`; passing it to
    # GRUCell or BasicRNNCell raises TypeError, which made the "gru" and
    # "rnn" model types unusable.
    if self.cell_fn is rnn_cell.BasicLSTMCell:
        cell = self.cell_fn(self.rnn_size, state_is_tuple=True)
    else:
        cell = self.cell_fn(self.rnn_size)
    self.cell = cell = rnn_cell.MultiRNNCell([cell] * self.num_layers,
                                             state_is_tuple=True)
    self.input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
    self.targets = tf.placeholder(tf.int32, [batch_size, seq_length])
    self.initial_state = self.cell.zero_state(batch_size, tf.float32)

    scope_name = 'rnnlm'
    with tf.variable_scope(scope_name):
        softmax_w = tf.get_variable("softmax_w",
                                    [self.rnn_size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [vocab_size, self.rnn_size])
            # One [batch_size, rnn_size] tensor per time step.
            inputs = tf.split(
                1, seq_length,
                tf.nn.embedding_lookup(embedding, self.input_data))
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop(prev, _):
        # Project the previous output to logits, take the arg-max symbol
        # (no gradient flows through the choice) and re-embed it as the
        # next input.
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if infer else None,
        scope=scope_name)
    output = tf.reshape(tf.concat(1, outputs), [-1, self.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([batch_size * seq_length])],
        vocab_size)
    # Mean loss per predicted token.
    self.cost = tf.reduce_sum(loss) / batch_size / seq_length
    self.final_state = last_state
    self.learning_rate = tf.Variable(0.0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, tvars), self.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    # set model status
    self.tensorflow_state = infer

    return tf
def __init__(self, args, infer=False):
    """Build the character-level RNN language-model graph.

    Args:
        args: Hyper-parameter object. Reads ``model``, ``rnn_size``,
            ``num_layers``, ``batch_size``, ``seq_length``, ``vocab_size``,
            ``grad_clip`` and ``learning_rate``.
        infer: True during sampling. ``args.batch_size`` and
            ``args.seq_length`` are then overwritten with 1 on the caller's
            object: one character at a time, no batching or BPTT.

    Raises:
        Exception: If ``args.model`` is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args
    if infer:
        # Worry about one character at a time during sampling.
        args.batch_size = 1
        args.seq_length = 1

    # Pick the basic cell type: RNN, GRU or LSTM.
    if args.model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    # Create one layer of rnn_size units of the chosen type.
    # Only the LSTM constructor accepts `state_is_tuple`; passing it to
    # BasicRNNCell or GRUCell raises TypeError, so it is supplied for the
    # LSTM case only.
    if cell_fn is rnn_cell.BasicLSTMCell:
        cell = cell_fn(args.rnn_size, state_is_tuple=True)
    else:
        cell = cell_fn(args.rnn_size)

    # Stack num_layers of them. [cell] * n repeats the same cell object n
    # times in a Python list ([5, 6] * 3 == [5, 6, 5, 6, 5, 6]).
    self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers,
                                             state_is_tuple=True)

    # Two int32 (NOT float) placeholders of shape batch_size x seq_length.
    # input_data receives the input batches; targets is what the output is
    # compared against to compute the loss.
    # NOTE(review): presumably matches the x/y batches produced by
    # create_batches in utils.py -- confirm.
    self.input_data = tf.placeholder(
        tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(
        tf.int32, [args.batch_size, args.seq_length])

    # All-zero state that can be fed in to reset the network's state.
    # With state_is_tuple=True this is a tuple with one entry per layer
    # rather than a single concatenated tensor.
    self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

    # Scope the new variables under "rnnlm".
    with tf.variable_scope('rnnlm'):
        # Output projection from the top layer (rnn_size) to the
        # vocabulary (vocab_size): weight matrix and bias vector.
        softmax_w = tf.get_variable(
            "softmax_w", [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        # TODO: Why pin to CPU? Same as the TF translation tutorial.
        with tf.device("/cpu:0"):
            # Trainable embedding from a character index to the rnn_size
            # input of the lowest layer; conceptually the inverse of
            # softmax_w.
            embedding = tf.get_variable(
                "embedding", [args.vocab_size, args.rnn_size])
            # The lookup is batch_size x seq_length x rnn_size. Split it
            # along the time axis into seq_length tensors of
            # batch_size x 1 x rnn_size ...
            inputs = tf.split(
                1, args.seq_length,
                tf.nn.embedding_lookup(embedding, self.input_data))
            # ... and squeeze out the singleton axis, leaving a list of
            # seq_length tensors of shape batch_size x rnn_size.
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    # Feed-previous function handed to rnn_decoder only when sampling.
    # NOTE(review): during inference seq_length == 1, and rnn_decoder
    # appears to apply loop_function only to steps after the first, so this
    # function is effectively never invoked -- confirm against seq2seq.py.
    # prev is [batch_size x cell.output_size]; the return value is
    # [batch_size x cell.input_size].
    def loop(prev, _):
        # Convert the top-layer output into character logits.
        prev = tf.matmul(prev, softmax_w) + softmax_b
        # Take the arg-max character (no sampling). NOTE(review): actual
        # sampling is done probabilistically elsewhere, so this would not
        # match the sampled sequence if it were ever used.
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        # Re-embed that symbol as the next step's input.
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    # Build the recurrent network. `outputs` is a list (length seq_length)
    # of [batch_size x rnn_size] tensors: raw top-layer activations, not
    # yet projected into character space. This call is also where the
    # cell's trainable weights and biases get created.
    outputs, self.final_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if infer else None, scope='rnnlm')

    # Concatenate along the feature axis to get
    # [batch_size x (seq_length * rnn_size)], i.e. row b holds
    #   (batch b, seq 0) (batch b, seq 1) ... (batch b, seq seq_len-1)
    # then reshape to [(batch_size * seq_length) x rnn_size], giving rows
    #   batch 0 seq 0, batch 0 seq 1, ..., batch 0 seq seq_len-1,
    #   batch 1 seq 0, ..., batch batch_size-1 seq seq_len-1.
    # rnn_cell.py notes that projecting the whole batch-concatenated
    # sequence at once can be more efficient than a per-step projection
    # wrapper.
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])

    # [(batch_size * seq_length) x rnn_size] times [rnn_size x vocab_size]
    # gives [(batch_size * seq_length) x vocab_size]; softmax_b is added to
    # every row.
    self.logits = tf.matmul(output, softmax_w) + softmax_b

    # Probabilities, same shape as logits ([1 x vocab_size] when sampling).
    # Not needed by the training op.
    self.probs = tf.nn.softmax(self.logits)

    # Per-position loss, a 1D float tensor of size batch_size * seq_length.
    # Targets are flattened to the same row order as `output`:
    #   (batch 0, seq 0), (batch 0, seq 1), ..., (batch 1, seq 0), ...
    # Each target index is compared against the matching row of logits.
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],                                     # 1-item list of 2D logits
        [tf.reshape(self.targets, [-1])],                  # 1-item list of 1D int32 targets
        [tf.ones([args.batch_size * args.seq_length])],    # uniform weights
        args.vocab_size)                                   # number of output classes

    # Arithmetic mean of the loss values: a scalar the optimizer minimises.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    # Summary node for the cost.
    tf.scalar_summary("cost", self.cost)

    # Non-trainable variables tracking learning rate and training progress.
    self.lr = tf.Variable(args.learning_rate, trainable=False)
    self.global_epoch_fraction = tf.Variable(0.0, trainable=False)
    self.global_seconds_elapsed = tf.Variable(0.0, trainable=False)

    # Python list of all trainable variables.
    tvars = tf.trainable_variables()
    # tf.gradients returns one gradient tensor per variable; clip them
    # jointly by global norm.
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, tvars), args.grad_clip)
    # Adam optimizer driven by the current learning rate.
    optimizer = tf.train.AdamOptimizer(self.lr)
    # zip pairs each clipped gradient with its variable, as
    # (gradient, variable) tuples. Running train_op applies one update step.
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    self.summary_op = tf.merge_all_summaries()
def __init__(self, CellType, is_training, config):
    """Build an unrolled multi-layer RNN language model.

    Args:
        CellType: Callable that builds a recurrent cell from a hidden size.
        is_training: When true, dropout is applied (if
            ``config.keep_prob < 1``) and a gradient-descent training op is
            created; otherwise ``train_op`` is a no-op.
        config: Reads ``batch_size``, ``num_steps``, ``hidden_size``,
            ``vocab_size``, ``keep_prob``, ``num_layers``, ``init_scale``
            and ``max_grad_norm``.
    """
    batch_size = config.batch_size
    num_steps = config.num_steps
    self.batch_size = batch_size
    self.num_steps = num_steps
    size = config.hidden_size
    vocab_size = config.vocab_size

    token_shape = [batch_size, num_steps]
    self.input_data = tf.placeholder(tf.int32, token_shape,
                                     name="input_data")
    self.targets = tf.placeholder(tf.int32, token_shape, name="targets")

    # Dropout is only active while training with keep_prob below 1.
    use_dropout = is_training and config.keep_prob < 1

    layer_cell = CellType(size)
    if use_dropout:
        layer_cell = rnn_cell.DropoutWrapper(
            layer_cell, output_keep_prob=config.keep_prob)
    cell = rnn_cell.MultiRNNCell([layer_cell] * config.num_layers)
    self.initial_state = cell.zero_state(batch_size, tf.float32)

    # Uniform initializer shared by every variable created below.
    initializer = tf.random_uniform_initializer(-config.init_scale,
                                                config.init_scale)

    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, size],
                                    initializer=initializer)
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)
    if use_dropout:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Unroll the network by hand, sharing variables after the first step.
    outputs = []
    state = self.initial_state
    with tf.variable_scope("RNN", initializer=initializer):
        for step in range(num_steps):
            if step > 0:
                tf.get_variable_scope().reuse_variables()
            cell_output, state = cell(inputs[:, step, :], state)
            outputs.append(cell_output)
    # State after the last unrolled step.
    self.final_state = state

    output = tf.reshape(tf.concat(1, outputs), [-1, size])
    w = tf.get_variable("softmax_w", [size, vocab_size],
                        initializer=initializer)
    b = tf.get_variable("softmax_b", [vocab_size], initializer=initializer)

    logits = tf.nn.xw_plus_b(output, w, b)
    flat_targets = tf.reshape(self.targets, [-1])
    uniform_weights = tf.ones([batch_size * num_steps])
    loss = sequence_loss_by_example(
        [logits], [flat_targets], [uniform_weights], vocab_size)
    # Summed loss divided by the batch size.
    cost = tf.div(tf.reduce_sum(loss), batch_size, name="cost")
    self.cost = cost

    if not is_training:
        # Evaluation-only model: nothing to optimise.
        self.train_op = tf.no_op()
    else:
        # Non-trainable so the caller can decay it between epochs.
        self.lr = tf.Variable(1.0, trainable=False)
        tvars = tf.trainable_variables()
        clipped, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                            config.max_grad_norm)
        sgd = tf.train.GradientDescentOptimizer(self.lr)
        self.train_op = sgd.apply_gradients(zip(clipped, tvars),
                                            name="train")
def __init__(self, config, mode='TRAIN', loaded_word_embed=None):
    """Builds the computing graph and initializes all variables.

    Args:
        config: Configuration object containing all model configuration.
        mode: String from {'TRAIN', 'EVAL', 'INFER'}.
        loaded_word_embed: A numpy array of pretrained word embeddings.

    Raises:
        ValueError: In 'TRAIN' mode, if ``config.opt_method`` is not one of
            'SGD', 'AdaDelta' or 'Adam'.
    """
    # Initializes model parameters.
    self.batch_size = batch_size = config.batch_size
    self.vocab_size = vocab_size = config.vocab_size
    self.embed_dim = embed_dim = config.embed_dim
    self.hidden_dim = hidden_dim = config.hidden_dim
    self.num_hiddens = num_hiddens = config.num_hiddens
    self.num_modes = num_modes = config.num_modes
    self.mode_dim = mode_dim = config.mode_dim
    self.cmt_seq_len = cmt_seq_len = config.cmt_seq_len
    self.reply_seq_len = reply_seq_len = config.reply_seq_len
    # Objective weight for reply language modeling.
    self.alpha = alpha = config.alpha

    # Initializes placeholders for inputs: one [batch_size] token tensor and
    # one [batch_size] weight tensor per time step.
    self.comment_inputs = []
    self.comment_weights = []
    self.reply_inputs = []
    self.reply_weights = []
    self._lr = tf.Variable(0.0, trainable=False)
    for i in xrange(cmt_seq_len):
        self.comment_inputs.append(
            tf.placeholder(tf.int32, name='comment_input_{0}'.format(i),
                           shape=[batch_size]))
        self.comment_weights.append(
            tf.placeholder(tf.float32, name='comment_weight_{0}'.format(i),
                           shape=[batch_size]))
    for i in xrange(reply_seq_len):
        self.reply_inputs.append(
            tf.placeholder(tf.int32, name='reply_input_{0}'.format(i),
                           shape=[batch_size]))
        self.reply_weights.append(
            tf.placeholder(tf.float32, name='reply_weight_{0}'.format(i),
                           shape=[batch_size]))

    self.comment_embeds = []
    self.mix_mode_embeds = []
    self.mode_probs = []
    self.init_reply_embed = []

    # Initializes mode_rnn (dropout on outputs only while training).
    if mode == 'TRAIN' and config.keep_prob < 1.0:
        mode_rnn = tf.nn.rnn_cell.MultiRNNCell([
            tf.nn.rnn_cell.DropoutWrapper(
                tf.nn.rnn_cell.BasicLSTMCell(
                    hidden_dim, forget_bias=config.forget_bias,
                    state_is_tuple=True),
                output_keep_prob=config.keep_prob)
            for _ in xrange(num_hiddens)], state_is_tuple=True)
    else:
        mode_rnn = tf.nn.rnn_cell.MultiRNNCell([
            tf.nn.rnn_cell.BasicLSTMCell(
                hidden_dim, forget_bias=config.forget_bias,
                state_is_tuple=True)
            for _ in xrange(num_hiddens)], state_is_tuple=True)

    # Defines the modes: every example attends over all mode indices.
    batch_mode_inds = tf.constant(
        [range(num_modes) for _ in range(batch_size)])

    # Defines the embeddings on CPU.
    with tf.device('/cpu:0'):
        mode_embedding = tf.get_variable(
            'mode_embedding', [num_modes, mode_dim], dtype=tf.float32)
        att_mode_vecs = tf.nn.embedding_lookup(
            mode_embedding, batch_mode_inds)

    att_states = tf.reshape(
        att_mode_vecs, [-1, num_modes, 1, mode_dim])
    att_mode_weight = tf.get_variable('att_mode_weight',
                                      [1, 1, mode_dim, hidden_dim])
    # 1x1 convolution projects each mode vector to hidden_dim.
    mode_feat = tf.nn.conv2d(
        att_states, att_mode_weight, [1, 1, 1, 1], 'SAME')
    att_v = tf.get_variable('att_v', [hidden_dim])

    def single_attention(query):
        """Return (attention scores over modes, attended mode vector)."""
        with tf.variable_scope('attention_mlp'):
            y = linear(query, hidden_dim, True)
        y = tf.reshape(y, [-1, 1, 1, hidden_dim])
        s = tf.reduce_sum(att_v * tf.tanh(mode_feat + y), [2, 3])
        a_score = tf.nn.softmax(s)
        weighted_sum = tf.reduce_sum(
            tf.reshape(a_score, [-1, num_modes, 1, 1]) * att_states,
            [1, 2])
        a_score = tf.reshape(a_score, [-1, num_modes])
        weighted_sum = tf.reshape(weighted_sum, [-1, mode_dim])
        return a_score, weighted_sum

    with tf.device('/cpu:0'):
        if loaded_word_embed is None:
            embed_weight = tf.get_variable('word_embedding',
                                           [vocab_size, embed_dim])
        else:
            pretrain_word_embed = tf.constant(loaded_word_embed)
            embed_weight = tf.get_variable(
                'word_embedding', initializer=pretrain_word_embed)

    cmt_state = mode_rnn.zero_state(batch_size, tf.float32)
    # NOTE(review): only layer 0's state is unpacked and the state is
    # rebuilt below as a one-element list, so this presumably assumes
    # num_hiddens == 1 -- confirm.
    c_prev, cell_output = cmt_state[0]

    # Computes the residual value of content and global modes.
    att_proj_weight = tf.get_variable('att_proj_weight',
                                      [mode_dim, hidden_dim])
    att_probs, attns = single_attention(cell_output)
    cell_output += tf.matmul(attns, att_proj_weight)
    cmt_state = [tf.nn.rnn_cell.LSTMStateTuple(c_prev, cell_output)]

    mode_rnn_cell_output = []
    mode_probs = []
    lm_logits = []
    with tf.variable_scope('mode_rnn'):
        for i, cmt_in in enumerate(self.comment_inputs):
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            cmt_embeds = tf.reshape(
                tf.nn.embedding_lookup(embed_weight, cmt_in),
                [batch_size, embed_dim])
            cell_output, cmt_state = mode_rnn(cmt_embeds, cmt_state)
            mode_rnn_cell_output.append(cell_output)
            att_probs, attns = single_attention(cell_output)
            c_prev, _ = cmt_state[0]
            # Add the projected attended mode vector to the hidden state.
            cell_output += tf.matmul(attns, att_proj_weight)
            cmt_state = [tf.nn.rnn_cell.LSTMStateTuple(c_prev, cell_output)]
            with tf.variable_scope('attention_projection'):
                attention_proj = linear(cell_output, vocab_size, True)
            lm_logits.append(attention_proj)
            mode_probs.append(att_probs)
            if mode == 'INFER':
                self.mix_mode_embeds.append(attns)

    if mode == 'INFER':
        self.comment_embeds = mode_rnn_cell_output
        self.mode_probs = mode_probs

    top_states = [tf.reshape(e, [-1, 1, mode_rnn.output_size])
                  for e in mode_rnn_cell_output]
    states_for_reply_rnn = tf.concat(1, top_states)
    # The last reply token is only ever a target, never an input.
    reply_embeds = [
        tf.reshape(tf.nn.embedding_lookup(embed_weight, reply_i),
                   [batch_size, embed_dim])
        for reply_i in self.reply_inputs[:-1]]

    # Initializes reply_rnn.
    if mode == 'TRAIN' and config.keep_prob < 1.0:
        reply_rnn = tf.nn.rnn_cell.MultiRNNCell([
            tf.nn.rnn_cell.DropoutWrapper(
                tf.nn.rnn_cell.BasicLSTMCell(
                    hidden_dim, forget_bias=config.forget_bias,
                    state_is_tuple=True),
                output_keep_prob=config.keep_prob)
            for _ in xrange(num_hiddens)], state_is_tuple=True)
    else:
        reply_rnn = tf.nn.rnn_cell.MultiRNNCell([
            tf.nn.rnn_cell.BasicLSTMCell(
                hidden_dim, forget_bias=config.forget_bias,
                state_is_tuple=True)
            for _ in xrange(num_hiddens)], state_is_tuple=True)

    reply_rnn_output, reply_rnn_final_state = attention_decoder(
        reply_embeds, cmt_state, states_for_reply_rnn, reply_rnn)

    if mode == 'INFER':
        self.init_reply_embed = reply_rnn_output[0]

    # Computes the language model loss for the comment: the logits at
    # step t are scored against the token at step t + 1.
    comment_targets = [cc for cc in self.comment_inputs[1:]]
    lm_loss = tf.reduce_sum(sequence_loss_by_example(
        lm_logits[:-1], comment_targets, self.comment_weights[1:]))

    gen_logits = []
    with tf.variable_scope('gen_logit_projection'):
        for i, rnn_out in enumerate(reply_rnn_output):
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            logits = linear(rnn_out, vocab_size, True)
            gen_logits.append(logits)

    # Computes the language model loss for the reply.
    reply_targets = [tt for tt in self.reply_inputs[1:]]
    gen_loss = tf.reduce_sum(sequence_loss_by_example(
        gen_logits, reply_targets, self.reply_weights[1:]))

    loss = lm_loss + alpha * gen_loss
    self.total_loss = loss
    self.saver = tf.train.Saver(tf.all_variables())

    # No optimizer is needed outside training.
    if mode != 'TRAIN':
        return

    tvars = tf.trainable_variables()
    grads = tf.gradients(loss, tvars)
    if config.opt_method == 'SGD':
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
    elif config.opt_method == 'AdaDelta':
        optimizer = tf.train.AdadeltaOptimizer(self._lr)
    elif config.opt_method == 'Adam':
        optimizer = tf.train.AdamOptimizer(self._lr)
    else:
        # The exception was previously constructed but never raised, so an
        # unknown method fell through to an unbound `optimizer` below.
        raise ValueError(
            'Unknown optimizer {}'.format(config.opt_method))
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args, infer=False):
    """Build the word-level LSTM language-model graph.

    Args:
        args: Hyper-parameter object. Reads ``rnn_size``, ``num_layers``,
            ``batch_size``, ``seq_length``, ``vocab_size`` and
            ``grad_clip``.
        infer: Whether the model is used for inference rather than
            training. For inference two things change:
            1. One word is fed at a time (``args.batch_size`` and
               ``args.seq_length`` are overwritten with 1).
            2. The decoder is given a loop function that feeds the previous
               step's prediction into the next step; gradients are stopped
               inside it.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    # A different cell class from rnn_cell could be swapped in here.
    layer = rnn_cell.BasicLSTMCell(args.rnn_size)
    cell = rnn_cell.MultiRNNCell([layer] * args.num_layers)
    self.cell = cell

    # For training, targets is input_data shifted by one word
    # (see the example in text_loader_tests).
    batch_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, batch_shape)
    self.targets = tf.placeholder(tf.int32, batch_shape)

    # Hidden state starts as all zeroes.
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        # Output projection: output * w + b, i.e.
        # [batch_size, rnn_size] * [rnn_size, vocab_size] + [vocab_size]
        softmax_w = tf.get_variable('softmax_w',
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable('softmax_b', args.vocab_size)

        # Keep the embedding lookup on the CPU and leave the GPU for the
        # forward/backward pass of the LSTM.
        with tf.device("/cpu:0"):
            # Learned during training; each row is one word's vector.
            # TODO: consider visualizing this embedding using TSNE after
            # training.
            embedding = tf.get_variable('embedding',
                                        [args.vocab_size, args.rnn_size])
            # [batch_size, seq_length, rnn_size]
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            # The decoder expects a list with one 2D tensor per time step:
            # split along time, then drop the singleton axis.
            per_step = tf.split(1, args.seq_length, embedded)
            inputs = [tf.squeeze(step, [1]) for step in per_step]

    def loop(prev, _):
        # Feed-previous: project the last output to vocabulary logits, take
        # the arg-max word without backpropagating through the choice, and
        # embed it as the next input. The second argument (step number) is
        # unused.
        logits = tf.matmul(prev, softmax_w) + softmax_b
        symbol = tf.stop_gradient(tf.argmax(logits, 1))
        return tf.nn.embedding_lookup(embedding, symbol)

    # outputs: one [batch_size, rnn_size] tensor per time step.
    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if infer else None, scope='rnnlm')

    # outputs is a list and cannot be multiplied by w directly; flatten it
    # to [batch_size * seq_length, rnn_size] first.
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # Non-trainable so the training loop can lower the learning rate over
    # time without it being touched by backprop.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def build_model(words_size, embedding_size, oseq_len, source_len,
                simplified_len, encoder_hidden, decoder_hidden, lstm_layer,
                batch_size, source_nfilters, source_width, is_train):
    """Assemble the encoder/decoder graph and return its main tensors.

    Calls ``construct_data`` for the input tensors and parameters, runs the
    convolutional and recurrent encoders, feeds both into the recurrent
    decoder, and builds the mean sequence loss plus an arg-max word
    prediction.

    Returns:
        dict with keys 'outputs', 'embedding', 'cost', 'sample_rate',
        'words_prediction', 'source', 'defendant', 'defendant_length',
        'label', 'decoder_inputs', 'loss_weights' and 'keep_prob'.
    """
    data = construct_data(
        words_size=words_size,
        embedding_size=embedding_size,
        source_len=source_len,
        simplified_len=simplified_len,
        oseq_len=oseq_len,
        encoder_hidden=encoder_hidden,
        decoder_hidden=decoder_hidden,
        source_nfilters=source_nfilters,
        source_width=source_width)

    embedding = data['embedding']
    conv_args = data['conv_args']
    weigth_generation = data['weigth_generation']
    bias_generation = data['bias_generation']
    source = data['source']
    defendant = data['defendant']
    defendant_length = data['defendant_length']
    label = data['label']
    decoder_inputs = data['decoder_inputs']
    loss_weights = data['loss_weights']
    keep_prob = data['keep_prob']
    sample_rate = data['sample_rate']

    conv_encoder = encoder_conv(
        source=source,
        defendant=defendant,
        conv_args=conv_args,
        keep_prob=keep_prob,
        embedding=embedding,
        is_train=is_train)

    rnn_encoder, encoder_states = encoder_rnn(
        defendant=defendant,
        defendant_length=defendant_length,
        encoder_hidden=encoder_hidden,
        keep_prob=keep_prob,
        batch_size=batch_size,
        embedding=embedding)

    rnn_decoder, _decoder_state = decoder_rnn(
        conv_encoder=conv_encoder,
        rnn_encoder=rnn_encoder,
        encoder_states=encoder_states,
        defendant=defendant,
        decoder_inputs=decoder_inputs,
        decoder_hidden=decoder_hidden,
        weigth_generation=weigth_generation,
        bias_generation=bias_generation,
        n_steps=oseq_len,
        batch_size=batch_size,
        lstm_layer=lstm_layer,
        keep_prob=keep_prob,
        embedding=embedding,
        sample_rate=sample_rate,
        is_train=is_train)

    # Transpose labels and weights so the first axis can be unpacked into
    # one tensor per decoder step.
    step_targets = tf.unpack(tf.transpose(label, [1, 0]))
    weight_tensor = tf.convert_to_tensor(loss_weights, dtype=tf.float32)
    step_weights = tf.unpack(tf.transpose(weight_tensor, [1, 0]))
    per_example_loss = seq2seq.sequence_loss_by_example(
        logits=rnn_decoder, targets=step_targets, weights=step_weights)
    cost = tf.reduce_mean(per_example_loss)

    # Stack the per-step decoder outputs, swap the first two axes, and pick
    # the highest-scoring index along the last axis.
    stacked = tf.transpose(tf.pack(rnn_decoder), [1, 0, 2])
    words_prediction = tf.argmax(stacked, 2)

    print('build model ')

    result = {}
    result['outputs'] = rnn_decoder
    result['embedding'] = embedding
    result['cost'] = cost
    result['sample_rate'] = sample_rate
    result['words_prediction'] = words_prediction
    result['source'] = source
    result['defendant'] = defendant
    result['defendant_length'] = defendant_length
    result['label'] = label
    result['decoder_inputs'] = decoder_inputs
    result['loss_weights'] = loss_weights
    result['keep_prob'] = keep_prob
    return result
def __init__(self,
             config,
             pretrained_embeddings=None,
             update_embeddings=True,
             is_training=False):
    """Build the attend / compare / aggregate entailment graph.

    Args:
        config: options object; reads batch_size, hidden_size, vocab_size,
            prem_steps and hyp_steps, plus embedding_size when
            ``pretrained_embeddings`` is given and learning_rate /
            max_grad_norm when ``is_training`` is true.
        pretrained_embeddings: only tested against None here.  When given,
            the embedding table is sized by config.embedding_size, an assign
            op (``self.embedding_init``) fed from
            ``self.embedding_placeholder`` is created, and the looked-up
            inputs go through ``input_projection3D``.
        update_embeddings: ``trainable`` flag of the embedding variable in
            the pretrained case.
        is_training: when true, the clipped-gradient Adagrad ``train_op`` is
            built.
    """
    self.config = config
    self.batch_size = batch_size = config.batch_size
    self.hidden_size = hidden_size = config.hidden_size
    self.num_layers = 1
    self.vocab_size = config.vocab_size
    self.prem_steps = config.prem_steps
    self.hyp_steps = config.hyp_steps
    self.is_training = is_training

    # placeholders for inputs
    self.premise = tf.placeholder(tf.int32, [batch_size, self.prem_steps])
    self.hypothesis = tf.placeholder(tf.int32, [batch_size, self.hyp_steps])
    self.targets = tf.placeholder(tf.int32, [batch_size, 3])

    if pretrained_embeddings is not None:
        embedding = tf.get_variable(
            'embedding', [self.vocab_size, self.config.embedding_size],
            dtype=tf.float32,
            trainable=update_embeddings)
        self.embedding_placeholder = tf.placeholder(
            tf.float32, [self.vocab_size, self.config.embedding_size])
        self.embedding_init = embedding.assign(self.embedding_placeholder)
    else:
        embedding = tf.get_variable(
            'embedding', [self.vocab_size, self.hidden_size],
            dtype=tf.float32)

    # (batch, steps, embedding dim) inputs for both sentences
    premise_inputs = tf.nn.embedding_lookup(embedding, self.premise)
    hypothesis_inputs = tf.nn.embedding_lookup(embedding, self.hypothesis)

    if pretrained_embeddings is not None:
        with tf.variable_scope("input_projection"):
            premise_inputs = input_projection3D(premise_inputs,
                                                self.hidden_size)
        with tf.variable_scope("input_projection", reuse=True):
            hypothesis_inputs = input_projection3D(hypothesis_inputs,
                                                   self.hidden_size)

    # run FF networks over inputs (parameters shared between sentences)
    with tf.variable_scope("FF"):
        prem_attn = self.feed_forward_attention(premise_inputs)
    with tf.variable_scope("FF", reuse=True):
        hyp_attn = self.feed_forward_attention(hypothesis_inputs)

    # All dot products for the feed-forward attention at once,
    # shape: (batch, prem_steps, hyp_steps)
    dot = tf.batch_matmul(prem_attn, hyp_attn, adj_y=True)

    # Row b * prem_steps + p holds the weights of premise step p of example b
    # over the hypothesis steps.
    hypothesis_softmax = tf.reshape(dot, [batch_size * self.prem_steps, -1])
    hypothesis_softmax = tf.expand_dims(tf.nn.softmax(hypothesis_softmax), 2)

    # switch dimensions so the reshape groups rows by hypothesis step
    dot = tf.transpose(dot, [0, 2, 1])
    premise_softmax = tf.reshape(dot, [batch_size * self.hyp_steps, -1])
    premise_softmax = tf.expand_dims(tf.nn.softmax(premise_softmax), 2)

    def repeat_each_example(inputs, times):
        """Repeat every example `times` times in consecutive rows.

        (batch, steps, hidden) -> (batch * times, steps, hidden) where row
        b * times + t is a copy of inputs[b].  This matches the row order of
        the reshaped softmax weights above.  A plain
        tf.tile(inputs, [times, 1, 1]) yields row order t * batch + b instead,
        which pairs weights with the wrong example whenever batch > 1.
        """
        tiled = tf.tile(tf.expand_dims(inputs, 1), [1, times, 1, 1])
        return tf.reshape(tiled, [batch_size * times, -1, hidden_size])

    # Soft-aligned vector for every step of the opposite sentence: weight the
    # copied inputs with the softmax and sum over their steps.
    alphas = tf.reduce_sum(
        premise_softmax * repeat_each_example(premise_inputs,
                                              self.hyp_steps), [1])
    betas = tf.reduce_sum(
        hypothesis_softmax * repeat_each_example(hypothesis_inputs,
                                                 self.prem_steps), [1])

    # list of hyp_steps tensors of shape (batch, hidden dim); squeeze only
    # the split axis so a size-1 batch or hidden dim is not dropped as well
    alphas = [
        tf.squeeze(x, [1]) for x in tf.split(
            1, self.hyp_steps,
            tf.reshape(alphas, [batch_size, -1, self.hidden_size]))
    ]
    # list of prem_steps tensors of shape (batch, hidden dim)
    betas = [
        tf.squeeze(x, [1]) for x in tf.split(
            1, self.prem_steps,
            tf.reshape(betas, [batch_size, -1, self.hidden_size]))
    ]

    # list of original premise vecs to go with betas
    prem_list = [
        tf.squeeze(single_input, [1])
        for single_input in tf.split(1, self.prem_steps, premise_inputs)
    ]
    # list of original hypothesis vecs to go with alphas
    hyp_list = [
        tf.squeeze(single_input, [1])
        for single_input in tf.split(1, self.hyp_steps, hypothesis_inputs)
    ]

    # append the relevant alpha/beta to the original word representation
    beta_concat_prems = [
        tf.concat(1, [word_vec, rep]) for word_vec, rep in zip(prem_list, betas)
    ]
    alpha_concat_hyps = [
        tf.concat(1, [word_vec, rep]) for word_vec, rep in zip(hyp_list, alphas)
    ]

    # send both through a feedforward network with shared parameters; all
    # steps are stacked along axis 0 for one call, then split back per step
    with tf.variable_scope("compare"):
        prem_comparison_vecs = tf.split(
            0, self.prem_steps,
            self.feedforward_network(tf.concat(0, beta_concat_prems)))
    with tf.variable_scope("compare", reuse=True):
        hyp_comparison_vecs = tf.split(
            0, self.hyp_steps,
            self.feedforward_network(tf.concat(0, alpha_concat_hyps)))

    # add representations and send through last classifier
    sum_prem_vec = tf.add_n(prem_comparison_vecs)
    sum_hyp_vec = tf.add_n(hyp_comparison_vecs)

    with tf.variable_scope("final_representation"):
        final_representation = self.feedforward_network(
            tf.concat(1, [sum_prem_vec, sum_hyp_vec]))

    # softmax over outputs to generate distribution over
    # [neutral, entailment, contradiction]
    softmax_w = tf.get_variable("softmax_w", [4 * hidden_size, 3])
    softmax_b = tf.get_variable("softmax_b", [3])
    self.logits = tf.matmul(final_representation,
                            softmax_w) + softmax_b  # dim (batch_size, 3)

    # top_k with the default k=1 returns the index of the largest entry of
    # each target row, i.e. the class id.
    _, targets = tf.nn.top_k(self.targets)
    loss = seq2seq.sequence_loss_by_example([self.logits], [targets],
                                            [tf.ones([batch_size])], 3)
    self.cost = tf.reduce_mean(loss)

    _, logit_max_index = tf.nn.top_k(self.logits)
    self.accuracy = tf.reduce_mean(
        tf.cast(tf.equal(logit_max_index, targets), tf.float32))

    if is_training:
        self.lr = tf.Variable(self.config.learning_rate, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, tvars), self.config.max_grad_norm)
        optimizer = tf.train.AdagradOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, config):
    """Build the RNN language-model graph described by ``config``.

    Args:
        config (dict): Model configuration.  Every key below is read:
            rnn_size: size of the RNN hidden state
            num_layers: number of stacked RNN layers
            rnn_type: "rnn", "gru" or "lstm"; anything else logs an error
                and exits the process
            batch_size: batch size
            seq_length: sequence length
            grad_clip: global-norm threshold used to clip the gradients
            vocab_size: size of the vocabulary
            infer: if true, batch_size and seq_length are forced to 1 and
                the predicted symbol is fed back into the RNN instead of
                the gold input
            is_train: if true, the optimizer and ``train_op`` are built
            reuse: passed as ``reuse`` to the model's variable scope
    """
    logger.info("Create model with options: \n{}".format(pprint.pformat(config)))

    for option in ("rnn_size", "num_layers", "rnn_type", "batch_size",
                   "seq_length", "grad_clip", "vocab_size", "infer",
                   "is_train", "reuse"):
        setattr(self, option, config[option])

    if self.infer:
        # Sampling runs one symbol of one sequence at a time.
        self.batch_size = 1
        self.seq_length = 1

    for type_name, cell_attr in (("rnn", "BasicRNNCell"),
                                 ("gru", "GRUCell"),
                                 ("lstm", "LSTMCell")):
        if self.rnn_type == type_name:
            cell_fn = getattr(rnn_cell, cell_attr)
            break
    else:
        msg = "Rnn type should be either rnn, gru or lstm"
        logger.error(msg)
        sys.exit(msg)

    # Define the cell, then stack it into a multi-layer RNN
    single_cell = cell_fn(self.rnn_size)
    self.cell = cell = rnn_cell.MultiRNNCell([single_cell] * self.num_layers)

    io_shape = [self.batch_size, self.seq_length]
    self.input_data = tf.placeholder(tf.int32, io_shape)
    self.targets = tf.placeholder(tf.int32, io_shape)
    self.initial_state = cell.zero_state(self.batch_size, tf.float32)

    with tf.variable_scope(MODEL_SCOPE, reuse=self.reuse):
        softmax_w = tf.get_variable("softmax_w",
                                    [self.rnn_size, self.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [self.vocab_size])
        # Embedding table and lookup are placed on DEVICE_SCOPE
        with tf.device(DEVICE_SCOPE):
            embeddings = tf.get_variable("embeddings",
                                         [self.vocab_size, self.rnn_size])
            embedded = tf.nn.embedding_lookup(embeddings, self.input_data)
            # Split along dimension 1 into a list of per-step inputs, each
            # of shape (batch_size, 1, rnn_size); the size-1 axis is then
            # squeezed away.
            inputs = [
                tf.squeeze(step, [1])
                for step in tf.split(1, self.seq_length, embedded)
            ]

    # Instead of writing the network by hand, use seq2seq.rnn_decoder.  At
    # test time the predicted output is fed back to the RNN instead of the
    # gold input used at training time.
    def loop(prev, _):
        scores = tf.matmul(prev, softmax_w) + softmax_b
        # stop_gradient keeps the arg-max choice out of back-propagation
        best_symbol = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(embeddings, best_symbol)

    loop_function = loop if self.infer else None
    outputs, last_state = seq2seq.rnn_decoder(
        inputs,
        self.initial_state,
        cell,
        loop_function=loop_function,
        scope=MODEL_SCOPE)

    # Concatenate the step outputs of each sequence:
    # (batch_size x seq_length) x rnn_size
    flat_output = tf.reshape(tf.concat(1, outputs), [-1, self.rnn_size])
    self.logits = tf.matmul(flat_output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    uniform_weights = tf.ones([self.batch_size * self.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [uniform_weights])
    self.cost = tf.reduce_sum(loss) / (self.batch_size * self.seq_length)
    self.final_state = last_state

    if self.is_train:
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, tvars), self.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args, embedding):
    """Build an RNN language model over a caller-supplied embedding.

    Two decoders sharing the 'STAND' variables are unrolled over the same
    inputs: one is fed the given inputs at every step (``self.probs``, used
    for the loss), the other feeds its own arg-max prediction back in
    (``self.self_feed_probs``).

    Args:
        args: options object; reads model ('rnn', 'gru' or 'lstm'), rnn_size,
            num_layers, batch_size, seq_length, vocab_size and grad_clip.
        embedding: embedding table passed to ``tf.nn.embedding_lookup``; it
            is stored as ``self.embedding`` and not created here.

    Raises:
        Exception: if ``args.model`` is not one of the supported types.
    """
    self.args = args
    if args.model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    cell = cell_fn(args.rnn_size)
    self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

    self.input_data = tf.placeholder(
        tf.int32, [args.batch_size, args.seq_length], name='STAND_input')
    self.targets = tf.placeholder(
        tf.int32, [args.batch_size, args.seq_length], name='STAND_targets')
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)
    self.embedding = embedding

    with tf.variable_scope('STAND'):
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        inputs = tf.split(
            1, args.seq_length,
            tf.nn.embedding_lookup(self.embedding, self.input_data))
        squeezed = [tf.squeeze(input_, [1]) for input_ in inputs]
        # Must be a real list: it is consumed by both rnn_decoder calls
        # below, and on Python 3 a map() object would be exhausted after the
        # first one.
        inputs = [tf.nn.l2_normalize(step, 1) for step in squeezed]

    def loop(prev, _):
        """Embed (and normalise) the arg-max prediction of the last step."""
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.l2_normalize(
            tf.nn.embedding_lookup(embedding, prev_symbol), 1)

    # Decoder fed with the given inputs at every step.
    o, _ = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell, loop_function=None, scope='STAND')
    # Same weights (reuse=True), but fed with its own predictions.
    with tf.variable_scope('STAND', reuse=True) as scope:
        sf_o, _ = seq2seq.rnn_decoder(
            inputs, self.initial_state, cell, loop_function=loop, scope=scope)

    output = tf.reshape(tf.concat(1, o), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    sf_output = tf.reshape(tf.concat(1, sf_o), [-1, args.rnn_size])
    self_feed_logits = tf.matmul(sf_output, softmax_w) + softmax_b
    self.self_feed_probs = tf.nn.softmax(self_feed_logits)

    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])], args.vocab_size)
    self.loss = tf.reduce_sum(loss) / args.batch_size / args.seq_length

    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.loss, tvars), args.grad_clip)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    for v in tvars:
        print(v.name)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self,
             config,
             pretrained_embeddings=None,
             update_embeddings=True,
             is_training=False):
    """Build the attention + GRU-inference entailment graph.

    Args:
        config: options object; reads batch_size, hidden_size, vocab_size,
            prem_steps, hyp_steps and inference_size, plus embedding_size
            when ``pretrained_embeddings`` is given and learning_rate /
            max_grad_norm when ``is_training`` is true.
        pretrained_embeddings: only tested against None here.  When given,
            the embedding table is sized by config.embedding_size, an assign
            op (``self.embedding_init``) fed from
            ``self.embedding_placeholder`` is created, and the looked-up
            inputs go through ``input_projection3D``.
        update_embeddings: ``trainable`` flag of the embedding variable in
            the pretrained case.
        is_training: when true, the clipped-gradient Adam ``train_op`` is
            built.
    """
    self.config = config
    self.batch_size = batch_size = config.batch_size
    self.hidden_size = hidden_size = config.hidden_size
    self.num_layers = 1
    self.vocab_size = config.vocab_size
    self.prem_steps = config.prem_steps
    self.hyp_steps = config.hyp_steps
    self.is_training = is_training

    # placeholders for inputs
    self.premise = tf.placeholder(tf.int32, [batch_size, self.prem_steps])
    self.hypothesis = tf.placeholder(tf.int32, [batch_size, self.hyp_steps])
    self.targets = tf.placeholder(tf.int32, [batch_size, 3])

    if pretrained_embeddings is not None:
        embedding = tf.get_variable(
            'embedding', [self.vocab_size, self.config.embedding_size],
            dtype=tf.float32,
            trainable=update_embeddings)
        self.embedding_placeholder = tf.placeholder(
            tf.float32, [self.vocab_size, self.config.embedding_size])
        self.embedding_init = embedding.assign(self.embedding_placeholder)
    else:
        embedding = tf.get_variable(
            'embedding', [self.vocab_size, self.hidden_size],
            dtype=tf.float32)

    # (batch, steps, embedding dim) inputs for both sentences
    premise_inputs = tf.nn.embedding_lookup(embedding, self.premise)
    hypothesis_inputs = tf.nn.embedding_lookup(embedding, self.hypothesis)

    if pretrained_embeddings is not None:
        with tf.variable_scope("input_projection"):
            premise_inputs = input_projection3D(premise_inputs,
                                                self.hidden_size)
        with tf.variable_scope("input_projection", reuse=True):
            hypothesis_inputs = input_projection3D(hypothesis_inputs,
                                                   self.hidden_size)

    # run FF networks over inputs (parameters shared between sentences)
    with tf.variable_scope("FF"):
        prem_attn = self.feed_forward_attention(premise_inputs)
    with tf.variable_scope("FF", reuse=True):
        hyp_attn = self.feed_forward_attention(hypothesis_inputs)

    # get activations, shape: (batch, prem_steps, hyp_steps)
    dot = tf.batch_matmul(prem_attn, hyp_attn, adj_y=True)

    # Row b * prem_steps + p holds the weights of premise step p of example b
    # over the hypothesis steps.
    hypothesis_softmax = tf.reshape(dot, [batch_size * self.prem_steps, -1])
    hypothesis_softmax = tf.expand_dims(tf.nn.softmax(hypothesis_softmax), 2)

    dot = tf.transpose(dot, [0, 2, 1])
    premise_softmax = tf.reshape(dot, [batch_size * self.hyp_steps, -1])
    premise_softmax = tf.expand_dims(tf.nn.softmax(premise_softmax), 2)

    def repeat_each_example(inputs, times):
        """Repeat every example `times` times in consecutive rows.

        (batch, steps, hidden) -> (batch * times, steps, hidden) where row
        b * times + t is a copy of inputs[b].  This matches the row order of
        the reshaped softmax weights above.  A plain
        tf.tile(inputs, [times, 1, 1]) yields row order t * batch + b instead,
        which pairs weights with the wrong example whenever batch > 1.
        """
        tiled = tf.tile(tf.expand_dims(inputs, 1), [1, times, 1, 1])
        return tf.reshape(tiled, [batch_size * times, -1, hidden_size])

    # Soft-aligned vector for every step of the opposite sentence: weight the
    # copied inputs with the softmax and sum over their steps.
    alphas = tf.reduce_sum(
        premise_softmax * repeat_each_example(premise_inputs,
                                              self.hyp_steps), [1])
    betas = tf.reduce_sum(
        hypothesis_softmax * repeat_each_example(hypothesis_inputs,
                                                 self.prem_steps), [1])

    # list of hyp_steps tensors of shape (batch, hidden dim)
    alphas = [
        tf.squeeze(x, [1]) for x in tf.split(
            1, self.hyp_steps,
            tf.reshape(alphas, [batch_size, -1, self.hidden_size]))
    ]
    # list of prem_steps tensors of shape (batch, hidden dim)
    betas = [
        tf.squeeze(x, [1]) for x in tf.split(
            1, self.prem_steps,
            tf.reshape(betas, [batch_size, -1, self.hidden_size]))
    ]

    # list of original premise vecs to go with betas
    prem_list = [
        tf.squeeze(single_input, [1])
        for single_input in tf.split(1, self.prem_steps, premise_inputs)
    ]
    # list of original hypothesis vecs to go with alphas
    hyp_list = [
        tf.squeeze(single_input, [1])
        for single_input in tf.split(1, self.hyp_steps, hypothesis_inputs)
    ]

    # append the aligned representation to each original word vector
    beta_concat_prems = [
        tf.concat(1, [word_vec, rep]) for word_vec, rep in zip(prem_list, betas)
    ]
    alpha_concat_hyps = [
        tf.concat(1, [word_vec, rep]) for word_vec, rep in zip(hyp_list, alphas)
    ]

    # re-stack the per-step vectors along a new axis 1
    prem_comparison_vecs = tf.concat(
        1, [tf.expand_dims(x, 1) for x in beta_concat_prems])
    hyp_comparison_vecs = tf.concat(
        1, [tf.expand_dims(x, 1) for x in alpha_concat_hyps])

    with tf.variable_scope("gru_inference"):
        inference = rnn_cell.GRUCell(self.config.inference_size)
        self.inference_cell = rnn_cell.MultiRNNCell(
            [inference] * self.num_layers)
    self.inference_state = self.inference_cell.zero_state(
        self.batch_size, tf.float32)

    with tf.variable_scope("inference"):
        # The second return value is not used in this method.
        final_representation, _remainders, self.iterations = \
            self.do_inference_steps(self.inference_state,
                                    prem_comparison_vecs,
                                    hyp_comparison_vecs)

    # softmax over outputs to generate distribution over
    # [neutral, entailment, contradiction]
    softmax_w = tf.get_variable("softmax_w", [self.config.inference_size, 3])
    softmax_b = tf.get_variable("softmax_b", [3])
    self.logits = tf.matmul(final_representation,
                            softmax_w) + softmax_b  # dim (batch_size, 3)

    # top_k with the default k=1 returns the index of the largest entry of
    # each target row, i.e. the class id.
    _, targets = tf.nn.top_k(self.targets)
    loss = seq2seq.sequence_loss_by_example([self.logits], [targets],
                                            [tf.ones([self.batch_size])], 3)
    self.cost = tf.reduce_mean(loss)

    _, logit_max_index = tf.nn.top_k(self.logits)
    self.accuracy = tf.reduce_mean(
        tf.cast(tf.equal(logit_max_index, targets), tf.float32))
    self.per_step_accs, self.per_step_dists = self.evaluate_representation()

    if is_training:
        self.lr = tf.Variable(self.config.learning_rate, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, tvars), self.config.max_grad_norm)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def main():
    """Train a stacked-LSTM character model or sample text from a checkpoint.

    Everything is driven by FLAGS:
      * mode: "train" runs ``epoch_number`` optimisation steps on one fixed
        batch and checkpoints every ``steps_to_validate`` steps; "inference"
        restores the latest checkpoint and greedily generates 100 characters
        starting from ``inference_start_word``.
      * model: only "stacked_lstm" is implemented.
      * optimizer: sgd, adadelta, adagrad, adam, ftrl or rmsprop.
    The corpus is read from ./data/shakespeare.txt; the vocabulary is the
    sorted set of its characters.
    """
    import codecs

    print("Start generating lycrics")

    # Initialize train and test data
    batch_size = FLAGS.batch_size
    epoch_number = FLAGS.epoch_number
    sequence_length = 20
    rnn_hidden_units = 100
    stacked_layer_number = 3

    # TODO: Use python 3 for encoding for Chinese
    lyrics_filepath = "./data/shakespeare.txt"
    # Close the corpus file as soon as it has been read.
    with codecs.open(lyrics_filepath, encoding='utf-8') as f:
        lyrics_data = f.read()

    words = sorted(set(lyrics_data))
    vocabulary_size = len(words)
    id_char_map = dict(enumerate(words))
    char_id_map = {char: index for index, char in enumerate(words)}

    # One batch of consecutive, non-overlapping windows; the labels are the
    # same windows shifted by one character.
    train_dataset = []
    train_labels = []
    for row in range(batch_size):
        start = row * sequence_length
        features = lyrics_data[start:start + sequence_length]
        labels = lyrics_data[start + 1:start + sequence_length + 1]
        train_dataset.append([char_id_map[char] for char in features])
        train_labels.append([char_id_map[char] for char in labels])

    # Define the model
    mode = FLAGS.mode
    if mode == "inference":
        # Sampling feeds one character of one sequence at a time.
        batch_size = 1
        sequence_length = 1

    x = tf.placeholder(tf.int32, shape=(None, sequence_length))
    y = tf.placeholder(tf.int32, shape=(None, sequence_length))

    checkpoint_dir = FLAGS.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    tensorboard_dir = FLAGS.tensorboard_dir
    checkpoint_file = checkpoint_dir + "/checkpoint.ckpt"
    steps_to_validate = FLAGS.steps_to_validate

    def lstm_inference(x):
        # Previously an empty stub whose None result failed later with an
        # opaque unpacking TypeError; fail here with a clear message instead.
        raise NotImplementedError(
            "The 'lstm' model is not implemented, use 'stacked_lstm'")

    def stacked_lstm_inference(x):
        """Return (logits, cell, initial_state, last_state) for input ids x."""
        lstm_cell = rnn_cell.BasicLSTMCell(rnn_hidden_units)
        lstm_cells = rnn_cell.MultiRNNCell([lstm_cell] * stacked_layer_number)
        initial_state = lstm_cells.zero_state(batch_size, tf.float32)

        with tf.variable_scope("stacked_lstm"):
            weights = tf.get_variable("weights",
                                      [rnn_hidden_units, vocabulary_size])
            bias = tf.get_variable("bias", [vocabulary_size])
            embedding = tf.get_variable("embedding",
                                        [vocabulary_size, rnn_hidden_units])

        inputs = tf.nn.embedding_lookup(embedding, x)
        outputs, last_state = tf.nn.dynamic_rnn(
            lstm_cells, inputs, initial_state=initial_state)
        output = tf.reshape(outputs, [-1, rnn_hidden_units])
        logits = tf.add(tf.matmul(output, weights), bias)
        return logits, lstm_cells, initial_state, last_state

    def inference(inputs):
        print("Use the model: {}".format(FLAGS.model))
        if FLAGS.model == "lstm":
            return lstm_inference(inputs)
        elif FLAGS.model == "stacked_lstm":
            return stacked_lstm_inference(inputs)
        else:
            print("Unknow model, exit now")
            exit(1)

    # Define train op
    logits, lstm_cells, initial_state, last_state = inference(x)

    targets = tf.reshape(y, [-1])
    loss = seq2seq.sequence_loss_by_example(
        [logits], [targets], [tf.ones_like(targets, dtype=tf.float32)])
    loss = tf.reduce_sum(loss)
    predict_softmax = tf.nn.softmax(logits)

    learning_rate = FLAGS.learning_rate
    print("Use the optimizer: {}".format(FLAGS.optimizer))
    # Flag value -> name of the optimizer class in tf.train.
    optimizer_names = {
        "sgd": "GradientDescentOptimizer",
        "adadelta": "AdadeltaOptimizer",
        "adagrad": "AdagradOptimizer",
        "adam": "AdamOptimizer",
        "ftrl": "FtrlOptimizer",
        "rmsprop": "RMSPropOptimizer",
    }
    if FLAGS.optimizer not in optimizer_names:
        print("Unknow optimizer: {}, exit now".format(FLAGS.optimizer))
        exit(1)
    optimizer = getattr(tf.train,
                        optimizer_names[FLAGS.optimizer])(learning_rate)

    global_step = tf.Variable(0, name='global_step', trainable=False)
    train_op = optimizer.minimize(loss, global_step=global_step)

    saver = tf.train.Saver()
    tf.scalar_summary('loss', loss)
    init_op = tf.initialize_all_variables()

    # Create session to run graph
    with tf.Session() as sess:
        # The merged summary op is built and the writer is created with the
        # graph, but no summaries are written during training.
        summary_op = tf.merge_all_summaries()
        writer = tf.train.SummaryWriter(tensorboard_dir, sess.graph)
        sess.run(init_op)

        if mode == "train":
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
            if ckpt and ckpt.model_checkpoint_path:
                print("Continue training from the model {}".format(
                    ckpt.model_checkpoint_path))
                saver.restore(sess, ckpt.model_checkpoint_path)

            start_time = datetime.datetime.now()
            for epoch in range(epoch_number):
                _, loss_value, step = sess.run(
                    [train_op, loss, global_step],
                    feed_dict={x: train_dataset,
                               y: train_labels})

                if epoch % steps_to_validate == 0:
                    end_time = datetime.datetime.now()
                    print("[{}] Epoch: {}, loss: {}".format(
                        end_time - start_time, epoch, loss_value))
                    saver.save(sess, checkpoint_file, global_step=step)
                    start_time = end_time

        elif mode == "inference":
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
            if ckpt and ckpt.model_checkpoint_path:
                print("Load the model {}".format(ckpt.model_checkpoint_path))
                saver.restore(sess, ckpt.model_checkpoint_path)

            start_time = datetime.datetime.now()
            word = FLAGS.inference_start_word
            generate_word_number = 100
            generate_lyrics = word

            # Carry the RNN state from one generated character to the next.
            state = sess.run(lstm_cells.zero_state(1, tf.float32))
            for _ in range(generate_word_number):
                x2 = np.zeros((1, 1))
                x2[0, 0] = char_id_map[word]
                prediction, state = sess.run(
                    [predict_softmax, last_state],
                    feed_dict={x: x2,
                               initial_state: state})
                # Greedy decoding: always take the most probable character.
                predict_word_id = np.argmax(prediction[0])
                word = id_char_map[predict_word_id]
                generate_lyrics += word

            end_time = datetime.datetime.now()
            print("[{}] Generated lyrics:\n{}".format(end_time - start_time,
                                                      generate_lyrics))

        else:
            print("Unknow mode, please choose 'train' or 'inference'")

    print("End of generating lycrics")
def __init__(self,args,infer=False):
    """Build an embedding-attention seq2seq graph over integer token ids.

    The same token sequence is passed as both encoder and decoder input to
    ``seq2seq.embedding_attention_seq2seq``; its outputs are used directly as
    logits.

    Args:
        args: options object; reads model ('rnn', 'gru' or 'lstm'), rnn_size,
            num_layers, batch_size, seq_length, vocab_size and grad_clip.
            When ``infer`` is true its batch_size and seq_length are
            overwritten with 1.
        infer: if true, build the graph for a single token of a single
            sequence.

    Raises:
        Exception: if ``args.model`` is not one of the supported types.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    for model_name, cell_attr in (('rnn', 'BasicRNNCell'),
                                  ('gru', 'GRUCell'),
                                  ('lstm', 'BasicLSTMCell')):
        if args.model == model_name:
            cell_fn = getattr(rnn_cell, cell_attr)
            break
    else:
        raise Exception("模型不支持:{}".format(args.model))

    base_cell = cell_fn(args.rnn_size)
    self.cell = cell = rnn_cell.MultiRNNCell([base_cell] * args.num_layers)

    io_shape = [args.batch_size, args.seq_length]  # e.g. (10, 25)
    self.input_data = tf.placeholder(tf.int32, io_shape)
    self.targets = tf.placeholder(tf.int32, io_shape)
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    # To share variables they must be created and fetched with
    # tf.get_variable() inside a tf.variable_scope().  Unlike tf.Variable(),
    # which makes a new variable on every call, tf.get_variable() returns the
    # existing variable when one with that name already exists and only
    # creates it otherwise.
    with tf.variable_scope("rnnlm"):
        # args.vocab_size is the number of output classes (19 "methods" in
        # the author's data set).  These two variables are still created,
        # but the projection that used them is disabled further down.
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])

    # Disabled earlier variant: the ids were looked up in an explicit
    # `embedding` variable on /cpu:0 (giving a (batch, steps, rnn_size)
    # tensor that was split per time step), and a `loop` function fed the
    # arg-max prediction back in at inference time.  The raw ids are now
    # handed to embedding_attention_seq2seq instead.

    # Cut the (batch, seq_length) ids into seq_length slices of shape
    # (batch, 1), then drop the size-1 axis so each step is a (batch,) tensor.
    id_slices = tf.split(1, args.seq_length, self.input_data)
    inputs = [tf.squeeze(step_ids, [1]) for step_ids in id_slices]

    outputs, last_state = seq2seq.embedding_attention_seq2seq(
        inputs, inputs, cell, args.vocab_size, args.vocab_size,
        args.rnn_size)
    self.saved_outputs = outputs

    # `outputs` has one entry per time step; concatenate them and flatten to
    # (batch * seq_length, vocab_size) rows.
    flat_output = tf.reshape(tf.concat(1, outputs), [-1, args.vocab_size])

    # The final fully connected layer (matmul with softmax_w plus softmax_b)
    # is disabled; the decoder output is used as the logits directly.
    self.logits = flat_output
    # Pass through a softmax
    self.probs = tf.nn.softmax(self.logits)

    # Arguments expected by sequence_loss_by_example:
    #   logits:  [batch * num_steps, vocab_size]
    #   targets: [batch_size, num_steps], flattened here
    #   weights: tf.ones([batch_size * num_steps])
    # A detailed walk-through of the output dimensions:
    # https://blog.csdn.net/xyz1584172808/article/details/83056179?depth_1-utm_source=distribute.pc_relevant.none-task&utm_source=distribute.pc_relevant.none-task
    flat_targets = tf.reshape(self.targets, [-1])
    uniform_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [uniform_weights], args.vocab_size)
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    clipped_grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, tvars), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(clipped_grads, tvars))
def __init__(self, args):
    """Build an RNN that maps dense per-step input vectors to output symbols.

    Args:
        args: options object; reads model ('rnn', 'gru' or 'lstm'), rnn_size,
            num_layers, batch_size, seq_length, char_size (width of each
            input vector), phvocab_size (number of output classes) and
            grad_clip.

    Raises:
        Exception: if ``args.model`` is not one of the supported types.
    """
    self.args = args
    if args.model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    cell = cell_fn(args.rnn_size)
    self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

    # Input data: float vectors of shape (batch_size, seq_length, char_size).
    self.input_data = tf.placeholder(
        tf.float32, [args.batch_size, args.seq_length, args.char_size])

    # Target data: class ids of shape (batch_size, seq_length).  The number
    # of output classes is not specified here but by the softmax layer below.
    self.targets = tf.placeholder(tf.int32,
                                  [args.batch_size, args.seq_length])

    # Zero initial state for a batch of batch_size sequences.
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    # Final softmax layer parameters; this is where the output dimension
    # (phvocab_size) is fixed.
    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.phvocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.phvocab_size])

    # Unroll the input into seq_length per-step tensors and remove the
    # size-1 step axis left by the split.
    inputs = tf.split(1, args.seq_length, self.input_data)
    inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    # Plain RNN decoder (no attention).  last_state is the RNN state after
    # the final step of the sequence.
    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell, scope='rnnlm')

    # outputs is a list with one (batch_size, rnn_size) tensor per step;
    # flatten it to (batch_size * seq_length, rnn_size) rows.
    outconcat = tf.concat(1, outputs)
    output = tf.reshape(outconcat, [-1, args.rnn_size])

    # Final logit layer: x * W + b, with x of shape (rows, rnn_size).
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    # Cost function.  The original line ended in a stray comma, which made
    # reshaped_target a 1-tuple wrapping the tensor instead of the flat
    # (batch_size * seq_length,) tensor the loss expects.
    reshaped_target = tf.reshape(self.targets, [-1])
    seq_weight = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [reshaped_target], [seq_weight], args.phvocab_size)
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # Optimizer: Adam with gradients clipped by global norm.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, tvars), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args, infer=False):
    """Build the RNN language-model graph with TensorBoard summaries.

    Args:
        args: options object; reads model ('rnn', 'gru' or 'lstm'), rnn_size,
            num_layers, batch_size, seq_length, vocab_size and grad_clip.
            When ``infer`` is true its batch_size and seq_length are
            overwritten with 1.
        infer: if true, the decoder feeds its own arg-max prediction back in
            as the next input.

    Raises:
        Exception: if ``args.model`` is not one of the supported types.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    for model_name, cell_attr in (('rnn', 'BasicRNNCell'),
                                  ('gru', 'GRUCell'),
                                  ('lstm', 'BasicLSTMCell')):
        if args.model == model_name:
            cell_fn = getattr(rnn_cell, cell_attr)
            break
    else:
        raise Exception("model type not supported: {}".format(args.model))

    single_cell = cell_fn(args.rnn_size)
    self.cell = cell = rnn_cell.MultiRNNCell([single_cell] * args.num_layers)

    io_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, io_shape)
    self.targets = tf.placeholder(tf.int32, io_shape)
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    # Bookkeeping variables stored in the graph (none of them trainable).
    self.batch_pointer = tf.Variable(
        0, name="batch_pointer", trainable=False, dtype=tf.int32)
    self.inc_batch_pointer_op = tf.assign(self.batch_pointer,
                                          self.batch_pointer + 1)
    self.epoch_pointer = tf.Variable(0, name="epoch_pointer", trainable=False)
    self.batch_time = tf.Variable(0.0, name="batch_time", trainable=False)
    tf.summary.scalar("time_batch", self.batch_time)

    def variable_summaries(var):
        """Attach mean / max / min scalar summaries to a tensor."""
        with tf.name_scope('summaries'):
            mean = tf.reduce_mean(var)
            tf.summary.scalar('mean', mean)
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        variable_summaries(softmax_w)
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        variable_summaries(softmax_b)
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            # One tensor per time step, with the size-1 step axis removed.
            inputs = [
                tf.squeeze(step, [1])
                for step in tf.split(1, args.seq_length, embedded)
            ]

    def loop(prev, _):
        """Embed the arg-max prediction computed from the previous output."""
        scores = tf.matmul(prev, softmax_w) + softmax_b
        best_symbol = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(embedding, best_symbol)

    loop_function = loop if infer else None
    outputs, last_state = seq2seq.rnn_decoder(
        inputs,
        self.initial_state,
        cell,
        loop_function=loop_function,
        scope='rnnlm')

    flat_output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.matmul(flat_output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    uniform_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [uniform_weights], args.vocab_size)
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    tf.summary.scalar("cost", self.cost)
    self.final_state = last_state

    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    clipped_grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, tvars), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(clipped_grads, tvars))
def build_graph(self, test):
    """
    Builds an LSTM graph in TensorFlow.

    @param test: When truthy, `self.batch_size` and `self.seq_len` are
                 reset to 1 and the decoder receives a loop function that
                 embeds the argmax of its previous output.
    """
    if test:
        self.batch_size = 1
        self.seq_len = 1

    ##
    # LSTM Cells
    ##

    # The same BasicLSTMCell object is repeated num_layers times.
    single_layer = rnn_cell.BasicLSTMCell(self.cell_size)
    self.cell = rnn_cell.MultiRNNCell([single_layer] * self.num_layers)

    ##
    # Data
    ##

    # inputs and targets are 2D int32 tensors of shape [batch_size, seq_len]
    token_shape = [self.batch_size, self.seq_len]
    self.inputs = tf.placeholder(tf.int32, token_shape)
    self.targets = tf.placeholder(tf.int32, token_shape)
    self.initial_state = self.cell.zero_state(self.batch_size, tf.float32)

    ##
    # Variables
    ##

    with tf.variable_scope('lstm_vars'):
        self.ws = tf.get_variable('ws', [self.cell_size, self.vocab_size])
        self.bs = tf.get_variable('bs', [self.vocab_size])  # TODO: initializer?

        # Pinned to the CPU (original author's note: to parallelize for
        # faster training).
        with tf.device('/cpu:0'):
            self.embeddings = tf.get_variable(
                'embeddings', [self.vocab_size, self.cell_size])

            # [batch_size, seq_len, cell_size] embeddings of every input id.
            embedded = tf.nn.embedding_lookup(self.embeddings, self.inputs)

            # Splitting along axis 1 gives seq_len tensors of shape
            # [batch_size, 1, cell_size]; the squeeze drops that axis, so
            # element t of the list holds the step-t embedding of every
            # batch element.
            per_step_inputs = [
                tf.squeeze(step, [1])
                for step in tf.split(1, self.seq_len, embedded)
            ]

    def feed_back(prev, _):
        """Embed the most likely word given the previous cell output."""
        scores = tf.matmul(prev, self.ws) + self.bs
        best_word = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(self.embeddings, best_word)

    loop_function = feed_back if test else None
    per_step_outputs, self.final_state = seq2seq.rnn_decoder(
        per_step_inputs,
        self.initial_state,
        self.cell,
        loop_function=loop_function,
        scope='lstm_vars')

    # per_step_outputs has one [batch_size, cell_size] tensor per time
    # step. Concatenating along axis 1 and reshaping to rows of width
    # cell_size orders the rows batch-major: every step of batch element 0,
    # then every step of batch element 1, and so on.
    flat_outputs = tf.reshape(
        tf.concat(1, per_step_outputs), [-1, self.cell_size])

    logits = tf.matmul(flat_outputs, self.ws) + self.bs
    self.probs = tf.nn.softmax(logits)

    ##
    # Train
    ##

    flat_targets = tf.reshape(self.targets, [-1])
    uniform_weights = tf.ones([self.batch_size * self.seq_len])
    per_token_loss = seq2seq.sequence_loss_by_example(
        [logits], [flat_targets], [uniform_weights], self.vocab_size)
    # Summed loss divided by the number of (batch element, step) positions.
    self.loss = tf.reduce_sum(per_token_loss) / self.batch_size / self.seq_len

    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.optimizer = tf.train.AdamOptimizer(
        learning_rate=c.L_RATE, name='optimizer')
    self.train_op = self.optimizer.minimize(
        self.loss, global_step=self.global_step, name='train_op')
def __init__(self, args, infer=False):
    """
    Build the seq2seq model once data preprocessing is done.

    Construction has three stages:
      1. Fix the cell structure: which recurrent unit to use, how many
         units it has and how many layers are stacked.
      2. Convert the input into the format the seq2seq decoder expects
         and run it to obtain the per-step outputs and the last state.
      3. Pass the outputs through a softmax layer to get a probability
         distribution, build the loss and set up the optimizer.

    Three cell kinds are offered (RNN, GRU, LSTM), and each is wrapped in
    a MultiRNNCell so that it can be stacked.

    :param args: hyper-parameter object; this method reads ``rnncell``,
        ``rnn_size``, ``num_layers``, ``batch_size``, ``seq_length``,
        ``vocab_size``, ``embedding_size``, ``attention``, ``grad_clip``
        and ``log_dir``. When ``infer`` is true, ``batch_size`` and
        ``seq_length`` are overwritten with 1 on this object.
    :param infer: when true, build a one-step graph and give the decoder
        a loop function that embeds the argmax of its previous output.
    :raises Exception: if ``args.rnncell`` is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    # Map the configured cell name onto the matching constructor.
    cell_types = {
        'rnn': rnn_cell.BasicRNNCell,
        'gru': rnn_cell.GRUCell,
        'lstm': rnn_cell.BasicLSTMCell,
    }
    cell_fn = cell_types.get(args.rnncell)
    if cell_fn is None:
        raise Exception("rnncell type not supported: {}".format(
            args.rnncell))

    # The same cell object is repeated num_layers times in the stack.
    self.cell = rnn_cell.MultiRNNCell(
        [cell_fn(args.rnn_size)] * args.num_layers)

    # Integer id placeholders, both [batch_size, seq_length].
    batch_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, batch_shape)
    self.targets = tf.placeholder(tf.int32, batch_shape)
    self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = build_weight([args.rnn_size, args.vocab_size],
                                 name='soft_w')
        softmax_b = build_weight([args.vocab_size], name='soft_b')
        word_embedding = build_weight(
            [args.vocab_size, args.embedding_size], name='word_embedding')
        # The decoder wants a list of seq_length tensors of shape
        # [batch_size, input_size], while the raw input is only
        # [batch_size, seq_length] ids. The missing input_size axis is the
        # word-embedding size, so the ids are embedded by hand and then
        # split into one tensor per time step.
        embedded = tf.nn.embedding_lookup(word_embedding, self.input_data)
        inputs_list = [
            tf.squeeze(step, [1])
            for step in tf.split(1, args.seq_length, embedded)
        ]

    def feed_previous(prev, _):
        """Embed the most likely word given the previous cell output."""
        scores = tf.matmul(prev, softmax_w) + softmax_b
        best_word = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(word_embedding, best_word)

    # At generation time the previous step's output has to become the
    # next step's input; that is what the hand-written loop function does.
    loop_function = feed_previous if infer else None

    # Build the decoder: plain rnn_decoder or attention_decoder.
    if args.attention:
        self.attn_length = 5
        self.attn_size = 32
        self.attention_states = build_weight(
            [args.batch_size, self.attn_length, self.attn_size])
        outputs, last_state = seq2seq.attention_decoder(
            inputs_list,
            self.initial_state,
            self.attention_states,
            self.cell,
            loop_function=loop_function,
            scope='rnnlm')
    else:
        # rnn_decoder takes the per-step input list, the initial cell
        # state (produced by the cell's own zero_state above), the cell,
        # and the optional loop function. It returns one output per step
        # plus the last state. The outputs feed the loss below; the last
        # state is exposed as final_state (original author's note: it
        # seeds the next step when sampling).
        outputs, last_state = seq2seq.rnn_decoder(
            inputs_list,
            self.initial_state,
            self.cell,
            loop_function=loop_function,
            scope='rnnlm')
    self.final_state = last_state

    # sequence_loss_by_example computes a weighted cross-entropy per
    # example. It needs logits, so the outputs first go through the
    # softmax projection; the softmax distribution itself is kept in
    # self.probs.
    flat_output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.matmul(flat_output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size * args.seq_length])
    per_example_loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)

    # average loss for each word of each timestep
    self.cost = (tf.reduce_sum(per_example_loss)
                 / args.batch_size / args.seq_length)

    # The learning rate starts at 0.0; nothing in this method assigns it.
    self.lr = tf.Variable(0.0, trainable=False)
    self.var_trainable_op = tf.trainable_variables()
    clipped_grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, self.var_trainable_op), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)

    # train_op is the op to run for one training step.
    self.train_op = optimizer.apply_gradients(
        zip(clipped_grads, self.var_trainable_op))
    self.initial_op = tf.global_variables_initializer()
    self.saver = tf.train.Saver(tf.global_variables(),
                                max_to_keep=5,
                                keep_checkpoint_every_n_hours=1)

    # Log file name: log_dir followed by the current timestamp with
    # spaces and slashes stripped, plus a '.txt' suffix.
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    log_name = (timestamp + '.txt').replace(' ', '').replace('/', '')
    self.logfile = args.log_dir + log_name
    self.var_op = tf.global_variables()
def __init__(self, config, pretrained_embeddings=None,
             update_embeddings=True, is_training=False):
    """
    Build the premise/hypothesis classification graph.

    @param config: Configuration object. This method reads
                   `bidirectional`, `batch_size`, `hidden_size`,
                   `vocab_size`, `prem_steps`, `hyp_steps`,
                   `embedding_size`, `no_cell`, `encoder_size`,
                   `step_penalty`, `embedding_reg`, `learning_rate` and
                   `max_grad_norm` from it.
    @param pretrained_embeddings: Only compared against None here. When
                   it is not None, the embedding variable gets
                   `config.embedding_size` columns, an assign op
                   (`embedding_init`, fed through `embedding_placeholder`)
                   is created, and the inputs are projected to
                   `hidden_size`.
    @param update_embeddings: Whether the pretrained embedding variable
                   is trainable; also gates the embedding regulariser.
    @param is_training: When true, the learning-rate variable and the
                   training op are created.
    """
    batch_size = config.batch_size
    use_pretrained = pretrained_embeddings is not None

    self.config = config
    self.bidirectional = config.bidirectional
    self.batch_size = batch_size
    self.hidden_size = config.hidden_size
    self.num_layers = 1
    self.vocab_size = config.vocab_size
    self.prem_steps = config.prem_steps
    self.hyp_steps = config.hyp_steps
    self.is_training = is_training

    # placeholders for inputs
    self.premise = tf.placeholder(tf.int32, [batch_size, self.prem_steps])
    self.hypothesis = tf.placeholder(tf.int32, [batch_size, self.hyp_steps])
    self.targets = tf.placeholder(tf.int32, [batch_size, 3])

    if use_pretrained:
        embedding_shape = [self.vocab_size, self.config.embedding_size]
        embedding = tf.get_variable('embedding', embedding_shape,
                                    dtype=tf.float32,
                                    trainable=update_embeddings)
        # Running embedding_init with embedding_placeholder fed overwrites
        # the embedding variable.
        self.embedding_placeholder = tf.placeholder(tf.float32,
                                                    embedding_shape)
        self.embedding_init = embedding.assign(self.embedding_placeholder)
    else:
        embedding = tf.get_variable('embedding',
                                    [self.vocab_size, self.hidden_size],
                                    dtype=tf.float32)

    # Embed both sentences with the shared embedding table.
    premise_inputs = tf.nn.embedding_lookup(embedding, self.premise)
    hypothesis_inputs = tf.nn.embedding_lookup(embedding, self.hypothesis)

    if use_pretrained:
        # The second scope is opened with reuse=True, so both projections
        # use the same variables.
        with tf.variable_scope("input_projection"):
            premise_inputs = input_projection3D(premise_inputs,
                                                self.hidden_size)
        with tf.variable_scope("input_projection", reuse=True):
            hypothesis_inputs = input_projection3D(hypothesis_inputs,
                                                   self.hidden_size)

    def split_steps(tensor, num_steps):
        """Split axis 1 into `num_steps` tensors and drop that axis."""
        return [tf.squeeze(step, [1])
                for step in tf.split(1, num_steps, tensor)]

    def gru_stack(scope_name):
        """Create a num_layers-deep GRU stack inside `scope_name`."""
        with tf.variable_scope(scope_name):
            gru = rnn_cell.GRUCell(self.config.encoder_size)
            return rnn_cell.MultiRNNCell([gru] * self.num_layers)

    def encode(cell_f, cell_b, steps, scope_name):
        """Run the (bi)directional RNN and re-stack its outputs on axis 1."""
        if self.bidirectional:
            step_outputs, _, _ = rnn.bidirectional_rnn(
                cell_f, cell_b, steps, dtype=tf.float32, scope=scope_name)
        else:
            step_outputs, _ = rnn.rnn(
                cell_f, steps, dtype=tf.float32, scope=scope_name)
        return tf.concat(1, [tf.expand_dims(x, 1) for x in step_outputs])

    if self.config.no_cell:
        # No recurrent encoder: the embedded inputs are used directly.
        hyp_outputs = hypothesis_inputs
        premise_outputs = premise_inputs
    else:
        # create lists of per-step inputs for the RNNs
        premise_steps = split_steps(premise_inputs, self.prem_steps)
        hypothesis_steps = split_steps(hypothesis_inputs, self.hyp_steps)

        # run GRUs over premise + hypothesis
        self.prem_cell_f = gru_stack("premise_f")
        self.prem_cell_b = gru_stack("premise_b")
        premise_outputs = encode(self.prem_cell_f, self.prem_cell_b,
                                 premise_steps, "gru_premise")

        self.hyp_cell_f = gru_stack("hypothesis_f")
        self.hyp_cell_b = gru_stack("hypothesis_b")
        hyp_outputs = encode(self.hyp_cell_f, self.hyp_cell_b,
                             hypothesis_steps, "gru_hypothesis")

    with tf.variable_scope("prediction"):
        prediction, stopping_probs, iterations = self.do_act_steps(
            premise_outputs, hyp_outputs)

    # make it easy to get this info out of the model later
    self.remainder = 1.0 - stopping_probs
    self.iterations = iterations

    # softmax over outputs to generate distribution over
    # [neutral, entailment, contradiction]
    # NOTE(review): self.rep_size is not assigned in this method; it is
    # presumably set by do_act_steps or on the class - confirm.
    softmax_w = tf.get_variable("softmax_w", [2 * self.rep_size, 3])
    softmax_b = tf.get_variable("softmax_b", [3])
    self.logits = tf.matmul(prediction, softmax_w) + softmax_b

    # Index of the largest entry in each target row, used as the class id.
    _, target_index = tf.nn.top_k(self.targets)
    per_example_loss = seq2seq.sequence_loss_by_example(
        [self.logits], [target_index], [tf.ones([batch_size])], 3)

    # Classification loss plus a penalty on the remainder and the number
    # of iterations returned by do_act_steps.
    classification_loss = tf.reduce_mean(per_example_loss)
    step_cost = tf.reduce_mean(
        self.remainder + tf.cast(iterations, tf.float32))
    self.cost = classification_loss + self.config.step_penalty * step_cost

    if self.config.embedding_reg and update_embeddings:
        self.cost += self.config.embedding_reg * (
            tf.reduce_mean(tf.square(embedding)))

    _, predicted_index = tf.nn.top_k(self.logits)
    hits = tf.cast(tf.equal(predicted_index, target_index), tf.float32)
    self.accuracy = tf.reduce_mean(hits)

    if is_training:
        self.lr = tf.Variable(config.learning_rate, trainable=False)
        trainable = tf.trainable_variables()
        clipped_grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, trainable), self.config.max_grad_norm)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(
            zip(clipped_grads, trainable))
def model(words_size, embedding_size, oseq_len, source_len, simplified_len,
          defendant_nfilters, defendant_width, encoder_hidden,
          decoder_hidden, lstm_layer, batch_size, source_nfilters,
          source_width, is_train):
    """
    Wire the convolutional encoder, the RNN encoder and the RNN decoder
    together and attach the training cost and the argmax prediction.

    The size arguments are forwarded to `construct_data`, which supplies
    the embedding, the projection parameters and the input tensors.
    `encoder_hidden`, `decoder_hidden`, `lstm_layer`, `batch_size`,
    `oseq_len` (as the decoder's `n_steps`) and `is_train` are also
    forwarded to the encoder/decoder builders.

    @return: dict with the keys 'outputs', 'embedding', 'cost',
             'sample_rate', 'words_prediction', 'source', 'defendant',
             'defendant_length', 'label', 'decoder_inputs',
             'loss_weights' and 'keep_prob'.
    """
    graph_inputs = construct_data(
        words_size=words_size,
        embedding_size=embedding_size,
        source_len=source_len,
        simplified_len=simplified_len,
        oseq_len=oseq_len,
        encoder_hidden=encoder_hidden,
        decoder_hidden=decoder_hidden,
        source_nfilters=source_nfilters,
        source_width=source_width,
        defendant_nfilters=defendant_nfilters,
        defendant_width=defendant_width)

    # Parameters produced by construct_data.
    embedding = graph_inputs['embedding']
    conv_args = graph_inputs['conv_args']
    weigth_generation = graph_inputs['weigth_generation']
    bias_generation = graph_inputs['bias_generation']
    weigth_copy = graph_inputs['weigth_copy']
    bias_copy = graph_inputs['bias_copy']

    # Input tensors produced by construct_data.
    source = graph_inputs['source']
    defendant = graph_inputs['defendant']
    defendant_length = graph_inputs['defendant_length']
    label = graph_inputs['label']
    decoder_inputs = graph_inputs['decoder_inputs']
    loss_weights = graph_inputs['loss_weights']
    keep_prob = graph_inputs['keep_prob']
    sample_rate = graph_inputs['sample_rate']

    conv_encoder = encoder_conv(
        source=source,
        defendant=defendant,
        conv_args=conv_args,
        keep_prob=keep_prob,
        embedding=embedding,
        is_train=is_train)
    rnn_encoder = encoder_rnn(
        defendant=defendant,
        defendant_length=defendant_length,
        encoder_hidden=encoder_hidden,
        keep_prob=keep_prob,
        batch_size=batch_size,
        embedding=embedding)
    # The decoder state is returned as well but is not used here.
    rnn_decoder, _ = decoder_rnn(
        conv_encoder=conv_encoder,
        rnn_encoder=rnn_encoder,
        defendant=defendant,
        decoder_inputs=decoder_inputs,
        decoder_hidden=decoder_hidden,
        weigth_generation=weigth_generation,
        weigth_copy=weigth_copy,
        bias_generation=bias_generation,
        bias_copy=bias_copy,
        n_steps=oseq_len,
        batch_size=batch_size,
        lstm_layer=lstm_layer,
        keep_prob=keep_prob,
        embedding=embedding,
        sample_rate=sample_rate,
        is_train=is_train)

    # label and loss_weights are transposed with [1, 0] and unpacked
    # along the new leading axis before being handed to the loss.
    step_targets = tf.unpack(tf.transpose(label, [1, 0]))
    float_weights = tf.convert_to_tensor(loss_weights, dtype=tf.float32)
    step_weights = tf.unpack(tf.transpose(float_weights, [1, 0]))
    cost = tf.reduce_mean(seq2seq.sequence_loss_by_example(
        logits=rnn_decoder,
        targets=step_targets,
        weights=step_weights))

    # Pack the decoder outputs, swap the first two axes and take the
    # argmax over the last axis.
    stacked_outputs = tf.transpose(tf.pack(rnn_decoder), [1, 0, 2])
    words_prediction = tf.argmax(stacked_outputs, 2)

    print('build model ')
    return {
        'outputs': rnn_decoder,
        'embedding': embedding,
        'cost': cost,
        'sample_rate': sample_rate,
        'words_prediction': words_prediction,
        'source': source,
        'defendant': defendant,
        'defendant_length': defendant_length,
        'label': label,
        'decoder_inputs': decoder_inputs,
        'loss_weights': loss_weights,
        'keep_prob': keep_prob,
    }
def __init__(self, embedding, max_length, initial_state, attention_states,
             cell, num_samples=512, feed_previous=False,
             update_embedding_for_previous=True, dtype=dtypes.float32,
             scope=None, initial_state_attention=False, **kwargs):
    """
    Build an embedding attention decoder together with its loss and its
    greedy (argmax) decoding outputs.

    @param embedding: Embedding tensor; its first dimension is taken as
                      the number of output symbols and it is used to embed
                      the decoder inputs.
    @param max_length: Maximum sequence length; 2 is added to account for
                       _GO and _EOS.
    @param initial_state: Forwarded to `attention_decoder`.
    @param attention_states: Forwarded to `attention_decoder`.
    @param cell: RNN cell. When no sampled softmax is used it is wrapped
                 in an `OutputProjectionWrapper`; `self.cell` always keeps
                 the unwrapped cell.
    @param num_samples: Sample count for the sampled softmax loss, which
                        is only used when
                        0 < num_samples < number of symbols.
    @param feed_previous: When true, the decoder gets a loop function
                          from `self._extract_argmax_and_embed`.
    @param update_embedding_for_previous: Forwarded to
                          `self._extract_argmax_and_embed`.
    @param dtype: dtype used to convert the output projection to tensors.
    @param scope: Variable scope; defaults to
                  "embedding_attention_decoder".
    @param initial_state_attention: Forwarded to `attention_decoder`.
    @param kwargs: Optional 'lengths', 'inputs' and 'weights' tensors to
                   use in place of freshly created placeholders.
    """
    # account for _GO and _EOS
    self.max_length = max_length + 2

    # dict.get() evaluates its default argument eagerly, which would add
    # unused placeholders to the graph even when the caller supplies its
    # own tensors. Look the keys up explicitly so the defaults are only
    # created when they are needed.
    if 'lengths' in kwargs:
        self.lengths = kwargs['lengths']
    else:
        self.lengths = tf.placeholder(
            tf.int32, shape=[None], name="decoder_lengths")

    if 'inputs' in kwargs:
        self.inputs = kwargs['inputs']
    else:
        self.inputs = [
            tf.placeholder(
                tf.int32, shape=[None], name="decoder_input{0}".format(i))
            for i in range(self.max_length)
        ]

    if 'weights' in kwargs:
        self.weights = kwargs['weights']
    else:
        self.weights = [
            tf.placeholder(
                tf.float32, shape=[None],
                name="decoder_weight{0}".format(i))
            for i in range(self.max_length)
        ]

    # The target at step i is the input at step i + 1; the last step is
    # padded with zeros. The list is built once and shared by
    # self.targets and the loss. Padding from the last input (rather than
    # from targets[0]) also works when there is only one input step.
    self.targets = list(self.inputs[1:])
    self.targets.append(tf.zeros_like(self.inputs[-1]))

    num_symbols = embedding.get_shape()[0].value
    output_projection = None
    loss_function = None
    self.cell = cell
    self.feed_previous = feed_previous

    if 0 < num_samples < num_symbols:
        # Sampled softmax: keep an explicit projection so the loss can
        # sample from it.
        with tf.device('/cpu:0'):
            w = tf.get_variable('proj_w', [cell.output_size, num_symbols])
            w_t = tf.transpose(w)
            b = tf.get_variable('proj_b', [num_symbols])
        output_projection = (w, b)

        def sampled_loss(inputs, labels):
            with tf.device('/cpu:0'):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(
                    w_t, b, inputs, labels, num_samples, num_symbols)

        loss_function = sampled_loss

    if output_projection is None:
        # No explicit projection: let the cell emit num_symbols logits.
        cell = rnn_cell.OutputProjectionWrapper(cell, num_symbols)
        output_size = num_symbols
    else:
        output_size = cell.output_size
        # Check that the projection shapes agree with the cell output.
        proj_weights = ops.convert_to_tensor(output_projection[0],
                                             dtype=dtype)
        proj_weights.get_shape().assert_is_compatible_with(
            [cell.output_size, num_symbols])
        proj_biases = ops.convert_to_tensor(output_projection[1],
                                            dtype=dtype)
        proj_biases.get_shape().assert_is_compatible_with([num_symbols])

    with variable_scope.variable_scope(
            scope or "embedding_attention_decoder"):
        loop_function = self._extract_argmax_and_embed(
            embedding, output_projection,
            update_embedding_for_previous) if feed_previous else None
        emb_inp = [
            embedding_ops.embedding_lookup(embedding, i)
            for i in self.inputs
        ]
        self.outputs, self.state = attention_decoder(
            emb_inp,
            self.lengths,
            initial_state,
            attention_states,
            cell,
            output_size=output_size,
            loop_function=loop_function,
            initial_state_attention=initial_state_attention)

    # loss for each instance in batch
    self.instance_loss = sequence_loss_by_example(
        self.outputs,
        self.targets,
        self.weights,
        softmax_loss_function=loss_function)

    # aggregated average loss per instance for batch
    batch_size = math_ops.cast(
        array_ops.shape(self.targets[0])[0], self.instance_loss.dtype)
    self.loss = tf.reduce_sum(self.instance_loss) / batch_size

    # Greedy decoding: argmax over the (projected) outputs of every step.
    if output_projection is not None:
        self.projected_output = [
            tf.matmul(o, output_projection[0]) + output_projection[1]
            for o in self.outputs
        ]
        step_scores = self.projected_output
    else:
        step_scores = self.outputs
    self.decoded_outputs = tf.unpack(tf.argmax(tf.pack(step_scores), 2))

    # Per-row sum of the signs of the decoded ids. The misspelled
    # attribute name is kept because callers may already read it.
    self.decoded_lenghts = tf.reduce_sum(
        tf.sign(tf.transpose(tf.pack(self.decoded_outputs))), 1)
    self.decoded_batch = tf.transpose(tf.pack(self.decoded_outputs))