def discriminator_stego_nn(self, img, reuse=False):
    with tf.variable_scope('S_network'):
        if reuse:
            tf.get_variable_scope().reuse_variables()
        net = self.image_processing_layer(img)
        net = self.batch_norm(net, scope='d_s_bn0')
        net = conv2d(net, self.df_dim, kernel_size=[5, 5], stride=[2, 2],
                     activation_fn=self.leaky_relu, scope='d_s_h0_conv')
        net = self.batch_norm(net, scope='d_s_bn1')
        net = conv2d(net, self.df_dim * 2, kernel_size=[5, 5], stride=[2, 2],
                     activation_fn=self.leaky_relu, scope='d_s_h1_conv')
        net = self.batch_norm(net, scope='d_s_bn2')
        net = conv2d(net, self.df_dim * 4, kernel_size=[5, 5], stride=[2, 2],
                     activation_fn=self.leaky_relu, scope='d_s_h2_conv')
        net = self.batch_norm(net, scope='d_s_bn3')
        net = conv2d(net, self.df_dim * 8, kernel_size=[5, 5], stride=[2, 2],
                     activation_fn=self.leaky_relu, scope='d_s_h3_conv')
        net = self.batch_norm(net, scope='d_s_bn4')
        net = tf.reshape(net, [self.conf.batch_size, -1])
        net = linear(net, 1, activation_fn=tf.nn.sigmoid, scope='d_s_h4_lin',
                     weights_initializer=tf.random_normal_initializer(stddev=0.02))
        return net
def generator_nn(self, noise, train=True):
    with tf.variable_scope('G_network'):
        if not train:
            tf.get_variable_scope().reuse_variables()
        gen = linear(noise, self.gf_dim * 8 * 4 * 4, scope='g_h0_lin',
                     activation_fn=None,
                     weights_initializer=tf.random_normal_initializer(stddev=0.02))
        gen = tf.reshape(gen, [-1, 4, 4, self.gf_dim * 8])
        # gen = self.batch_norm(gen, reuse=(not train), scope='g_bn0')
        gen = self.g_bn0(gen, train=train)
        gen = tf.nn.relu(gen)
        gen = self.conv2d_transpose(gen, [self.conf.batch_size, 8, 8, self.gf_dim * 4],
                                    name='g_h1')
        # gen = self.batch_norm(gen, reuse=(not train), scope='g_bn1')
        gen = self.g_bn1(gen, train=train)
        gen = tf.nn.relu(gen)
        gen = self.conv2d_transpose(gen, [self.conf.batch_size, 16, 16, self.gf_dim * 2],
                                    name='g_h2')
        # gen = self.batch_norm(gen, reuse=(not train), scope='g_bn2')
        gen = self.g_bn2(gen, train=train)
        gen = tf.nn.relu(gen)
        gen = self.conv2d_transpose(gen, [self.conf.batch_size, 32, 32, self.gf_dim * 1],
                                    name='g_h3')
        # gen = self.batch_norm(gen, reuse=(not train), scope='g_bn3')
        gen = self.g_bn3(gen, train=train)
        gen = tf.nn.relu(gen)
        out = self.conv2d_transpose(gen, [self.conf.batch_size, 64, 64, self.c_dim],
                                    name='g_out')
        return tf.nn.tanh(out)
def softmax_model(X, Y_, mode):
    Ylogits = layers.linear(X, 10)
    predict = tf.nn.softmax(Ylogits)
    classes = tf.cast(tf.argmax(predict, 1), tf.uint8)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits,
                                                labels=tf.one_hot(Y_, 10))) * 100
    train_op = layers.optimize_loss(loss, framework.get_global_step(), 0.003, "Adam")
    return {"predictions": predict, "classes": classes}, loss, train_op
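
# Hedged usage sketch (not from the original source): the (predictions, loss,
# train_op) tuple returned above appears to match the legacy tf.contrib.learn
# model_fn contract, so the model could be wrapped in an Estimator. `mnist` is
# a hypothetical dataset object with image arrays and integer labels.
classifier = learn.Estimator(model_fn=softmax_model)
classifier.fit(x=mnist.train.images, y=mnist.train.labels, steps=1000)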
def conv_model(X, Y_, mode):
    XX = tf.reshape(X, [-1, 28, 28, 1])
    biasInit = tf.constant_initializer(0.1, dtype=tf.float32)
    Y1 = layers.conv2d(XX, num_outputs=6, kernel_size=[6, 6],
                       biases_initializer=biasInit)
    Y2 = layers.conv2d(Y1, num_outputs=12, kernel_size=[5, 5], stride=2,
                       biases_initializer=biasInit)
    Y3 = layers.conv2d(Y2, num_outputs=24, kernel_size=[4, 4], stride=2,
                       biases_initializer=biasInit)
    Y4 = layers.flatten(Y3)
    Y5 = layers.relu(Y4, 200, biases_initializer=biasInit)
    # to deactivate dropout on the dense layer, set keep_prob=1
    Y5d = layers.dropout(Y5, keep_prob=0.75, noise_shape=None,
                         is_training=mode == learn.ModeKeys.TRAIN)
    Ylogits = layers.linear(Y5d, 10)
    predict = tf.nn.softmax(Ylogits)
    classes = tf.cast(tf.argmax(predict, 1), tf.uint8)
    loss = conv_model_loss(Ylogits, Y_, mode)
    train_op = conv_model_train_op(loss, mode)
    eval_metrics = conv_model_eval_metrics(classes, Y_, mode)
    return learn.ModelFnOps(
        mode=mode,
        # You can name the fields of your predictions dictionary as you like.
        predictions={"predictions": predict, "classes": classes},
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_metrics)
def discriminator_2layer(H, opt, dropout, prefix='', num_outputs=1, is_reuse=None):
    # last layer must be linear
    # H = tf.squeeze(H, [1,2])
    # pdb.set_trace()
    biasInit = tf.constant_initializer(0.001, dtype=tf.float32)
    H_dis = layers.fully_connected(tf.nn.dropout(H, keep_prob=dropout),
                                   num_outputs=opt.H_dis,
                                   biases_initializer=biasInit,
                                   activation_fn=tf.nn.relu,
                                   scope=prefix + 'dis_1',
                                   reuse=is_reuse)
    logits = layers.linear(tf.nn.dropout(H_dis, keep_prob=dropout),
                           num_outputs=num_outputs,
                           biases_initializer=biasInit,
                           scope=prefix + 'dis_2',
                           reuse=is_reuse)
    return logits
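
# Hedged usage sketch (assumed wiring, not from the source): score real and
# fake feature batches with shared discriminator weights via `is_reuse`.
# `H_real`, `H_fake`, and `opt` (an object with an `H_dis` hidden-width
# attribute) are hypothetical.
logits_real = discriminator_2layer(H_real, opt, dropout=0.75)
logits_fake = discriminator_2layer(H_fake, opt, dropout=0.75, is_reuse=True)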
def network(self):
    net = self.images
    net = self.image_processing_layer(net)

    def get_init():
        return tf.truncated_normal_initializer(stddev=0.02)

    net = conv2d(net, 10, [7, 7], activation_fn=tf.nn.relu, name='conv1',
                 weights_initializer=get_init())
    net = conv2d(net, 20, [5, 5], activation_fn=tf.nn.relu, name='conv2',
                 weights_initializer=get_init())
    net = tf.nn.max_pool(net, [1, 4, 4, 1], [1, 1, 1, 1], padding='SAME')
    net = conv2d(net, 30, [3, 3], activation_fn=tf.nn.relu, name='conv3',
                 weights_initializer=get_init())
    net = conv2d(net, 40, [3, 3], activation_fn=tf.nn.relu, name='conv4',
                 weights_initializer=get_init())
    net = tf.nn.max_pool(net, [1, 2, 2, 1], [1, 1, 1, 1], padding='SAME')
    net = tf.reshape(net, [self.conf.batch_size, -1])
    net = linear(net, 100, activation_fn=tf.nn.tanh, name='FC1')
    out = linear(net, 2, activation_fn=tf.nn.softmax, name='out')
    return out
def _logistic_regression_model_fn(features, labels, mode):
    _ = mode
    logits = layers.linear(
        features,
        1,
        weights_initializer=init_ops.zeros_initializer(),
        # Intentionally uses really awful initial values so that
        # AUC/precision/recall/etc will change meaningfully even on a toy dataset.
        biases_initializer=init_ops.constant_initializer(-10.0))
    predictions = math_ops.sigmoid(logits)
    loss = loss_ops.sigmoid_cross_entropy(logits, labels)
    train_op = optimizers.optimize_loss(
        loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
    return predictions, loss, train_op
def conv_model(X, Y_):
    XX = tf.reshape(X, [-1, 28, 28, 1])
    Y1 = layers.conv2d(XX, num_outputs=6, kernel_size=[6, 6])
    Y2 = layers.conv2d(Y1, num_outputs=12, kernel_size=[5, 5], stride=2)
    Y3 = layers.conv2d(Y2, num_outputs=24, kernel_size=[4, 4], stride=2)
    Y4 = layers.flatten(Y3)
    Y5 = layers.relu(Y4, 200)
    Ylogits = layers.linear(Y5, 10)
    predict = tf.nn.softmax(Ylogits)
    classes = tf.cast(tf.argmax(predict, 1), tf.uint8)
    # optimize_loss expects a scalar, so reduce the per-example cross-entropy.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits,
                                                labels=tf.one_hot(Y_, 10)))
    train_op = layers.optimize_loss(loss, framework.get_global_step(), 0.003, "Adam")
    return {"predictions": predict, "classes": classes}, loss, train_op
def conv_model(X, Y_, mode):
    XX = tf.reshape(X, [-1, 28, 28, 1])
    biasInit = tf.constant_initializer(0.1, dtype=tf.float32)
    Y1 = layers.conv2d(XX, num_outputs=6, kernel_size=[6, 6],
                       biases_initializer=biasInit)
    Y2 = layers.conv2d(Y1, num_outputs=12, kernel_size=[5, 5], stride=2,
                       biases_initializer=biasInit)
    Y3 = layers.conv2d(Y2, num_outputs=24, kernel_size=[4, 4], stride=2,
                       biases_initializer=biasInit)
    Y4 = layers.flatten(Y3)
    Y5 = layers.relu(Y4, 200, biases_initializer=biasInit)
    Ylogits = layers.linear(Y5, 10)
    predict = tf.nn.softmax(Ylogits)
    classes = tf.cast(tf.argmax(predict, 1), tf.uint8)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits,
                                                labels=tf.one_hot(Y_, 10))) * 100
    train_op = layers.optimize_loss(loss, framework.get_global_step(), 0.001, "Adam")
    return {"predictions": predict, "classes": classes}, loss, train_op
# Define the zero state of the cell
initial_state = cell.zero_state(mini_batch_size, tf.float32)

# Launch dynamic RNN Network with specified cell and initial state.
# We use time_major=False because we would need to transpose the input on our own otherwise.
rnn_outputs, _rnn_states = tf.nn.dynamic_rnn(cell, x,
                                             initial_state=initial_state,
                                             time_major=False)

# Get the last $eval_time_steps timestep(s) for training
rnn_outputs_on_last_t_step = tf.slice(
    rnn_outputs,
    [0, n_in_time_steps - (1 + eval_time_steps), 0],
    [mini_batch_size, eval_time_steps, n_units])

# Project output from rnn output size to n_output
final_projection = lambda z: layers.linear(z, num_outputs=n_output,
                                           activation_fn=tf.nn.sigmoid)

# Apply projection to every time step
predicted = tf.map_fn(final_projection, rnn_outputs_on_last_t_step)

# Error and backprop
error = tf.nn.l2_loss(tf.subtract(tf.abs(y), tf.abs(predicted)))
train_step = tf.train.AdamOptimizer(learning_rate).minimize(error)

# Prediction error and accuracy
accuracy = tf.reduce_mean(tf.subtract(tf.abs(y), tf.abs(predicted)))

# -------------------------------------------------------------------------------
# RUN THE NETWORK
# -------------------------------------------------------------------------------
def generator(x, hidden_size):
    with tf.variable_scope('Generator'):
        h0 = tf.nn.softplus(layers.linear(x, hidden_size))
        return layers.linear(h0, 1)
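
# Hedged usage sketch (assumed wiring, not from the source): feed uniform
# noise through the generator to draw 1-D samples; the `batch_size` and
# `hidden_size` values are hypothetical.
z = tf.random_uniform([batch_size, 1], minval=-1.0, maxval=1.0)
samples = generator(z, hidden_size=4)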
def inference_net(x, latent_size):
    return layers.linear(x, latent_size)
# naive dropout
dropcells = [rnn.DropoutWrapper(each, input_keep_prob=pkeep) for each in cells]
multicell = rnn.MultiRNNCell(dropcells, state_is_tuple=False)
multicell = rnn.DropoutWrapper(multicell, output_keep_prob=pkeep)  # dropout for the softmax layer

Yr, H = tf.nn.dynamic_rnn(multicell, Xo, dtype=tf.float32, initial_state=Hin)
H = tf.identity(H, name='H')

# Softmax layer implementation:
# Flatten the first two dimensions of the output [ BATCHSIZE, SEQLEN, ALPHASIZE ]
# => [ BATCHSIZE x SEQLEN, ALPHASIZE ], then apply the softmax readout layer.
# This way, the weights and biases are shared across unrolled time steps.
# From the readout point of view, a value coming from a sequence time step or a
# minibatch item is the same thing.
Y_flat = tf.reshape(Yr, [-1, INTERNALSIZE])    # [ BATCHSIZE x SEQLEN, INTERNALSIZE ]
Ylogits = layers.linear(Y_flat, ALPHASIZE)     # [ BATCHSIZE x SEQLEN, ALPHASIZE ]
Y_flat_ = tf.reshape(Yo_, [-1, ALPHASIZE])     # [ BATCHSIZE x SEQLEN, ALPHASIZE ]
loss = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits,
                                               labels=Y_flat_)  # [ BATCHSIZE x SEQLEN ]
loss = tf.reshape(loss, [batchsize, -1])       # [ BATCHSIZE, SEQLEN ]
Yo = tf.nn.softmax(Ylogits, name='Yo')         # [ BATCHSIZE x SEQLEN, ALPHASIZE ]
Y = tf.argmax(Yo, 1)                           # [ BATCHSIZE x SEQLEN ]
Y = tf.reshape(Y, [batchsize, -1], name="Y")   # [ BATCHSIZE, SEQLEN ]
train_step = tf.train.AdamOptimizer(lr).minimize(loss)

# stats for display
seqloss = tf.reduce_mean(loss, 1)
batchloss = tf.reduce_mean(seqloss)
accuracy = tf.reduce_mean(tf.cast(tf.equal(Y_, tf.cast(Y, tf.uint8)), tf.float32))
loss_summary = tf.summary.scalar("batch_loss", batchloss)
accuracy_summary = tf.summary.scalar("batch_accuracy", accuracy)
summaries = tf.summary.merge([loss_summary, accuracy_summary])
def _build_network(self, input_width):
    with self.session.graph.as_default():
        if self.rnn_cell_type == "LSTM":
            self.rnn_cell = tf.contrib.rnn.LSTMCell(self.rnn_cell_dim)
        elif self.rnn_cell_type == "GRU":
            self.rnn_cell = tf.contrib.rnn.GRUCell(self.rnn_cell_dim)
        else:
            raise ValueError("Unknown rnn_cell {}".format(self.rnn_cell_type))

        self.global_step = tf.Variable(0, dtype=tf.int64, trainable=False,
                                       name='global_step')
        self.tokens = tf.placeholder(tf.int32, [None, None, None], name="tokens")
        self.token_lens = tf.placeholder(tf.int32, [None, None], name="token_lens")
        self.features = tf.placeholder(tf.float32, [None, None], name="features")
        self.labels = tf.placeholder(tf.int64, [None], name="labels")
        self.alphabet_size = len(self.char_vocabulary.classes_)
        self.dropout_keep = tf.placeholder(tf.float32)
        self.input_width = input_width

        char_embedding_matrix = tf.get_variable(
            "char_embeddings", [self.alphabet_size, self.EMBEDDING_SIZE],
            initializer=tf.random_normal_initializer(stddev=0.01),
            dtype=tf.float32)

        with tf.variable_scope("token_encoder"):
            tokens_flat = tf.reshape(self.tokens, [-1, tf.shape(self.tokens)[-1]])
            token_lens_flat = tf.reshape(self.token_lens, [-1])
            char_embeddings = tf.nn.embedding_lookup(char_embedding_matrix,
                                                     tokens_flat)
            hidden_states, final_states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=self.rnn_cell,
                cell_bw=self.rnn_cell,
                inputs=char_embeddings,
                sequence_length=token_lens_flat,
                dtype=tf.float32,
                scope="char_BiRNN")
            tokens_encoded = tf_layers.linear(tf.concat(final_states, 1),
                                              self.EMBEDDING_SIZE,
                                              scope="tokens_encoded")
            tokens_encoded = tf.reshape(tokens_encoded,
                                        [tf.shape(self.features)[0], -1])

        self.input_layer = tf.concat((tokens_encoded, self.features), 1)
        self.input_layer = tf.reshape(self.input_layer, [-1, self.input_width])

        # input transform
        self.hidden_layer = tf.nn.dropout(
            tf_layers.fully_connected(self.input_layer,
                                      num_outputs=self.h_width,
                                      activation_fn=None,
                                      scope="input_layer"),
            self.dropout_keep)

        # hidden layers
        for i in range(self.h_depth):
            if self.layer_type == "FeedForward":
                self.hidden_layer = tf.nn.dropout(
                    tf_layers.fully_connected(self.hidden_layer,
                                              num_outputs=self.h_width,
                                              activation_fn=tf.nn.relu,
                                              scope="ff_layer_{}".format(i)),
                    self.dropout_keep)
            elif self.layer_type == "Highway":
                self.hidden_layer = tf.nn.dropout(
                    highway_layer(self.hidden_layer,
                                  num_outputs=self.h_width,
                                  activation_fn=tf.nn.relu,
                                  scope="highway_layer_{}".format(i)),
                    self.dropout_keep)
            else:
                raise ValueError("Unknown hidden layer type.")

        self.output_layer = tf_layers.fully_connected(
            self.hidden_layer,
            num_outputs=len(self.target_encoder.classes_),
            activation_fn=None,
            scope="output_layer")
        self.predictions = tf.argmax(self.output_layer, 1)

        self.loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.output_layer, labels=self.labels),
            name="loss")
        self.training = tf.train.AdamOptimizer().minimize(
            self.loss, global_step=self.global_step)
        self.accuracy = tf_metrics.accuracy(self.predictions, self.labels)

        self.summary = tf.summary.merge([
            tf.summary.scalar("train/loss", self.loss),
            tf.summary.scalar("train/accuracy", self.accuracy)
        ])

        self._initialize_variables()
def __init_decoder(self):
    '''Initializes the decoder part of the model.'''
    with tf.variable_scope('decoder') as scope:
        output_fn = lambda outs: layers.linear(
            outs, self.__get_vocab_size(), scope=scope)

        if self.cfg.get('use_attention'):
            attention_states = tf.transpose(self.encoder_outputs, [1, 0, 2])
            (attention_keys, attention_values, attention_score_fn,
             attention_construct_fn) = seq2seq.prepare_attention(
                 attention_states=attention_states,
                 attention_option='bahdanau',
                 num_units=self.decoder_cell.output_size)

            decoder_fn_train = seq2seq.attention_decoder_fn_train(
                encoder_state=self.encoder_state,
                attention_keys=attention_keys,
                attention_values=attention_values,
                attention_score_fn=attention_score_fn,
                attention_construct_fn=attention_construct_fn,
                name='attention_decoder')

            decoder_fn_inference = seq2seq.attention_decoder_fn_inference(
                output_fn=output_fn,
                encoder_state=self.encoder_state,
                attention_keys=attention_keys,
                attention_values=attention_values,
                attention_score_fn=attention_score_fn,
                attention_construct_fn=attention_construct_fn,
                embeddings=self.embeddings,
                start_of_sequence_id=Config.EOS_WORD_IDX,
                end_of_sequence_id=Config.EOS_WORD_IDX,
                maximum_length=tf.reduce_max(self.encoder_inputs_length) + 3,
                num_decoder_symbols=self.__get_vocab_size())
        else:
            decoder_fn_train = seq2seq.simple_decoder_fn_train(
                encoder_state=self.encoder_state)
            decoder_fn_inference = seq2seq.simple_decoder_fn_inference(
                output_fn=output_fn,
                encoder_state=self.encoder_state,
                embeddings=self.embeddings,
                start_of_sequence_id=Config.EOS_WORD_IDX,
                end_of_sequence_id=Config.EOS_WORD_IDX,
                maximum_length=tf.reduce_max(self.encoder_inputs_length) + 3,
                num_decoder_symbols=self.__get_vocab_size())

        (self.decoder_outputs_train,
         self.decoder_state_train,
         self.decoder_context_state_train) = seq2seq.dynamic_rnn_decoder(
             cell=self.decoder_cell,
             decoder_fn=decoder_fn_train,
             inputs=self.decoder_train_inputs_embedded,
             sequence_length=self.decoder_train_length,
             time_major=True,
             scope=scope)

        self.decoder_logits_train = output_fn(self.decoder_outputs_train)
        self.decoder_prediction_train = tf.argmax(
            self.decoder_logits_train, axis=-1, name='decoder_prediction_train')

        scope.reuse_variables()

        (self.decoder_logits_inference,
         decoder_state_inference,
         self.decoder_context_state_inference) = seq2seq.dynamic_rnn_decoder(
             cell=self.decoder_cell,
             decoder_fn=decoder_fn_inference,
             time_major=True,
             scope=scope)
        self.decoder_prediction_inference = tf.argmax(
            self.decoder_logits_inference, axis=-1,
            name='decoder_prediction_inference')
# to do for LSTM's tuple state, but can be achieved by creating two vector
# Variables, which are then tiled along batch dimension and grouped into tuple.
batch_size = tf.shape(inputs)[1]
initial_state = cell.zero_state(batch_size, tf.float32)

# Given inputs (time, batch, input_size) outputs a tuple
#  - outputs: (time, batch, output_size)  [do not mistake with OUTPUT_SIZE]
#  - states:  (time, batch, hidden_size)
rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, inputs,
                                            initial_state=initial_state,
                                            time_major=True)

# project output from rnn output size to OUTPUT_SIZE. Sometimes it is worth adding
# an extra layer here.
final_projection = lambda x: layers.linear(
    x, num_outputs=OUTPUT_SIZE, activation_fn=tf.nn.sigmoid)

# apply projection to every timestep.
predicted_outputs = map_fn(final_projection, rnn_outputs)

# compute elementwise cross entropy.
error = -(outputs * tf.log(predicted_outputs + TINY) +
          (1.0 - outputs) * tf.log(1.0 - predicted_outputs + TINY))
error = tf.reduce_mean(error)

# optimize
train_fn = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(error)

# assuming that absolute difference between output and correct answer is 0.5
# or less we can round it to the correct output.
# (assumed completion of a truncated expression, based on the comment above)
accuracy = tf.reduce_mean(
    tf.cast(tf.abs(outputs - predicted_outputs) < 0.5, tf.float32))
def build(self):
    hparam = self.hparam
    if hparam.code_ndim == 3:
        code_shape = [None, hparam.timesteps, hparam.code_dim]
        cond_shape = [None, hparam.timesteps, hparam.cond_dim]
    elif hparam.code_ndim == 2:
        code_shape = [None, hparam.code_dim]
        cond_shape = [None, hparam.cond_dim]
    code = ori_code = tf.placeholder("float32", code_shape, name='code')
    if hparam.conditional:
        assert hparam.onehot is False, "NotImplemented: cond with onehot"
        cond = tf.placeholder("float32", cond_shape, name='condition')
        code = tf.concat([code, cond], -1)
    real_seq = tf.placeholder("int32", [None, hparam.timesteps], name='real_seq')
    real_seq_img = tf.one_hot(real_seq, hparam.vocab_size)
    dis_train = tf.placeholder('bool', name='is_train')
    bs = tf.shape(ori_code)[0]

    # generator
    final_states = []
    init_states = []
    with tf.variable_scope('generator'):
        # play with code
        step = int(hparam.timesteps / np.prod(hparam.repeats))
        first_input = code if hparam.code_ndim == 3 else \
            tf.tile(tf.expand_dims(code, 1), (1, step, 1))
        if hparam.timestep_pad:
            first_input = tf.concat([
                first_input,
                tf.tile(
                    tf.expand_dims(
                        tf.expand_dims(tf.lin_space(0., 1., step), 0), -1),
                    (bs, 1, 1)),
            ], -1)
        outputs = [first_input]
        for ind in range(len(hparam.cells)):
            repeat = hparam.repeats[ind]
            cell_size = hparam.cells[ind]
            bi = hparam.bidirection[ind]
            with tf.variable_scope('layer{}'.format(ind)):
                # if ind == len(hparam.repeats) and \
                #         hparam.last_bidirectional:
                if bi:
                    # assert(repeat == 1)
                    fw_cell = hparam.basic_cell(cell_size)
                    bw_cell = hparam.basic_cell(cell_size)
                    fw_init = fw_cell.zero_state(bs, tf.float32)
                    bw_init = bw_cell.zero_state(bs, tf.float32)
                    output, state = tf.nn.bidirectional_dynamic_rnn(
                        fw_cell,
                        bw_cell,
                        outputs[-1],
                        initial_state_fw=fw_init,
                        initial_state_bw=bw_init,
                        dtype=tf.float32,
                    )
                    output = tf.concat(output, 2)
                    cell_size *= 2
                    init_states.extend([fw_init, bw_init])
                    final_states.append(state)
                else:
                    cell = hparam.basic_cell(cell_size)
                    init = cell.zero_state(bs, tf.float32)
                    output, state = tf.nn.dynamic_rnn(
                        cell,
                        outputs[-1],
                        dtype=tf.float32,
                    )
                    init_states.append(init)
                    final_states.append(state)
                # output = output * 2
            if repeat != 1:
                step *= repeat
                output = tf.reshape(tf.tile(output, (1, 1, repeat)),
                                    [bs, step, cell_size])
            outputs.append(output)
        outputs[-1] = outputs[-1][:, :hparam.timesteps, :]
        for o in outputs:
            print(o)

        with tf.variable_scope('decision'):
            if hparam.deconv_decision:
                fake_seq_img = outputs[-1]
                right = slim.fully_connected(fake_seq_img, 32 * 10,
                                             activation_fn=None)
                right = tf.reshape(right, [bs, hparam.timesteps, 10, 32])
                # right = tf.nn.softmax(right, -1)
                right = slim.conv2d_transpose(
                    right, 1, [1, 12], stride=(1, 12),
                    activation_fn=tf.tanh)[:, :, :, 0]
                fake_seq_img = right
                fake_seq_img = tf.concat([
                    tf.ones([bs, hparam.timesteps, 4]) * -1,
                    fake_seq_img,
                    tf.ones([bs, hparam.timesteps, 4]) * -1
                ], axis=2)
                print(fake_seq_img)
            else:
                fake_seq_img = outputs[-1]
                fake_seq_img = layers.linear(fake_seq_img, hparam.vocab_size)
                outputs.append(fake_seq_img)
                fake_seq_img = tf.tanh(fake_seq_img)
                # fake_seq_img = tf.nn.softmax(fake_seq_img, -1)
            outputs.append(fake_seq_img)
            fake_seq = tf.argmax(fake_seq_img, -1)
            if hparam.plus_code:
                fake_seq_img = tf.clip_by_value(fake_seq_img + code, -1., +1.)

    # discriminator
    if hparam.rnn_dis:
        def dis(seq_img, bn_scope, reuse=False, cond_vec=None):
            # cond_vec is accepted (and ignored) so the call sites below work
            # for both discriminator variants.
            with tf.variable_scope('discriminator', reuse=reuse):
                print('dis')
                slices = tf.unstack(seq_img, axis=1)
                fw_cell = hparam.basic_cell(32)
                # bw_cell = hparam.basic_cell(64)
                x, state = tf.nn.static_rnn(
                    fw_cell,
                    # bw_cell,
                    slices,
                    dtype=tf.float32,
                )
                x = tf.stack(x, axis=1)
                print(x)
                # x = tf.concat(x, 2)
                x = slim.linear(x, 1)
                print(x)
                x = slim.flatten(x)
                print(x)
                x = slim.linear(x, 1)
                print(x)
                # x = tf.nn.sigmoid(x)
                return x
    else:
        def dis(seq_img, bn_scope, reuse=False, cond_vec=None):
            with tf.variable_scope('discriminator', reuse=reuse):
                fs = 32
                covariance = tf.matmul(seq_img, seq_img, transpose_b=True)
                x = tf.expand_dims(covariance, -1)
                x = lrelu(slim.conv2d(x, fs * 1, [5, 5]))
                x = slim.max_pool2d(x, (2, 2))
                x = lrelu(slim.conv2d(x, fs * 2, [5, 5]))
                x = slim.max_pool2d(x, (2, 2))
                x = lrelu(slim.conv2d(x, fs * 4, [5, 5]))
                x = slim.max_pool2d(x, (2, 2))
                x = lrelu(slim.conv2d(x, fs * 4, [5, 5]))
                x = slim.max_pool2d(x, (2, 2))
                covariance_feat = slim.flatten(x)

                # x = tf.nn.embedding_lookup(embeddings, seq)
                # x = ResNetBuilder(dis_train,
                #                   bn_scopes=['fake', 'real'],
                #                   bn_scope=bn_scope).\
                #     resnet(x, structure=[2, 2, 2, 2], filters=8, nb_class=1)
                # note axis
                fs = 32
                x = seq_img
                x = tf.expand_dims(seq_img, -1)
                x = lrelu(slim.conv2d(x, fs * 1, [5, 5]))
                x = slim.max_pool2d(x, (2, 2))
                x = lrelu(slim.conv2d(x, fs * 2, [5, 5]))
                x = slim.max_pool2d(x, (2, 2))
                x = lrelu(slim.conv2d(x, fs * 4, [5, 5]))
                x = slim.max_pool2d(x, (2, 2))
                x = lrelu(slim.conv2d(x, fs * 4, [5, 5]))
                x = slim.max_pool2d(x, (2, 2))
                seq_feat = slim.flatten(x)

                feat = tf.concat([covariance_feat, seq_feat], axis=1)
                if cond_vec is not None:
                    feat = tf.concat([feat, cond_vec], axis=-1)
                feat = lrelu(slim.linear(feat, 200))
                x = slim.linear(feat, 1)
                # x = tf.nn.sigmoid(x)
                return x

    # opt
    # problematic with the reuse bn
    # fake_seq_img = tf.where(
    #     tf.greater(fake_seq_img, 0.5),
    #     fake_seq_img,
    #     tf.zeros_like(fake_seq_img))
    if hparam.conditional:
        if len(cond_shape) == 3:
            raise Exception("NotImplemented: cond with ndim3 (DisNet)")
        cond_real = tf.placeholder("float32", [None, hparam.cond_dim],
                                   name='cond_real')
    fake_dis_pred = dis(fake_seq_img,
                        cond_vec=cond if hparam.conditional else None,
                        bn_scope='fake')
    real_dis_pred = dis(real_seq_img,
                        cond_vec=cond_real if hparam.conditional else None,
                        bn_scope='real',
                        reuse=True)

    # traditional GAN loss
    # G_loss = tf.reduce_mean(-safe_log(fake_dis_pred))
    # D_loss = tf.reduce_mean(-safe_log(real_dis_pred)) + \
    #     tf.reduce_mean(-safe_log(1 - fake_dis_pred))

    # IWGAN
    epsilon = tf.random_uniform(minval=0, maxval=1.0, shape=())
    print('grad')
    interpolation = fake_seq_img * epsilon + real_seq_img * (1.0 - epsilon)
    inte_dis_pred = dis(interpolation,
                        cond_vec=(cond * epsilon + cond_real * (1.0 - epsilon)) / 2.
                        if hparam.conditional else None,
                        bn_scope='interpolation',
                        reuse=True)
    grad = tf.gradients(inte_dis_pred, interpolation)[0]
    print(grad)
    grad = tf.reshape(grad, (-1, hparam.timesteps * hparam.vocab_size))
    print(grad)
    D_loss = tf.reduce_mean(fake_dis_pred) - \
        tf.reduce_mean(real_dis_pred) + \
        10 * tf.reduce_mean(tf.square(tf.norm(grad, ord=2, axis=1) - 1))
    G_loss = -tf.reduce_mean(fake_dis_pred)
    print(D_loss)
    print(G_loss)
    fake_seq_img_grad = tf.gradients(G_loss, fake_seq_img)[0]

    G_opt = tf.train.AdamOptimizer(learning_rate=hparam.G_lr, beta1=0.5, beta2=0.9)
    # D_opt = tf.train.GradientDescentOptimizer(learning_rate=hparam.D_lr)
    D_opt = tf.train.AdamOptimizer(learning_rate=hparam.D_lr, beta1=0.5, beta2=0.9)
    D_iter = tf.Variable(0, name='D_iter')
    G_iter = tf.Variable(0, name='G_iter')
    trainable_gen_var = reduce(lambda x, y: x + y, [
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, ele)
        for ele in hparam.trainable_gen
    ], [])
    G_train_op = slim.learning.create_train_op(
        G_loss,
        G_opt,
        variables_to_train=trainable_gen_var,
        global_step=G_iter,
        clip_gradient_norm=hparam.G_clipnorm)
    D_train_op = slim.learning.create_train_op(
        D_loss,
        D_opt,
        variables_to_train=tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, "discriminator"),
        global_step=D_iter,
    )
    iter_step = tf.Variable(0, name='iter_step')
    iter_step_op = iter_step.assign_add(1)

    # input
    self.ori_code = ori_code
    self.code = code
    self.real_seq = real_seq
    self.real_seq_img = real_seq_img
    if hparam.conditional:
        self.cond = cond
        self.cond_real = cond_real

    # summary
    self.summary_fake_img = tf.summary.image(
        'fake_img', tf.expand_dims(fake_seq_img, -1))
    self.summary_real_img = tf.summary.image(
        'real_img', tf.expand_dims(real_seq_img, -1))
    self.summary_G_loss = tf.summary.scalar('G_loss', G_loss)
    self.summary_D_loss = tf.summary.scalar('D_loss', D_loss)
    self.summary_fake_dis_pred = tf.summary.scalar(
        'fake_dis_pred', tf.reduce_mean(fake_dis_pred))
    self.summary_real_dis_pred = tf.summary.scalar(
        'real_dis_pred', tf.reduce_mean(real_dis_pred))
    self.summary_fake_img_grad = tf.summary.image(
        'gradient_map', tf.expand_dims(fake_seq_img_grad, -1))
    self.summary_first_input = tf.summary.image(
        'noise', tf.expand_dims(first_input, -1))
    self.gen_outputs = outputs

    # debug
    self.fake_seq_img = fake_seq_img
    self.first_input = first_input
    self.init_states = tuple(init_states)
    self.final_states = tuple(final_states)
    self.bs_tensor = bs

    # train
    self.dis_train = dis_train
    self.G_train_op = G_train_op
    self.D_train_op = D_train_op
    self.iter_step = iter_step
    self.iter_step_op = iter_step_op

    # output
    self.fake_seq = fake_seq
    self.built = True
def train_rnn(args):
    SEQLEN = 50
    BATCHSIZE = args.batch_size
    ALPHASIZE = txt.ALPHASIZE
    INTERNALSIZE = 512
    NLAYERS = 5
    learning_rate = 0.0002  # small learning rate
    dropout_keep = .9  # only some dropout; they use .8 but .9 is my preference

    # get all of the text files from data_path
    text_files = args.data_path + '*.txt'
    codetext, valitext, bookranges = txt.read_data_files(text_files, validation=True)

    # set epoch size based on batchsize and sequence len
    epoch_size = len(codetext) // (BATCHSIZE * SEQLEN)

    # model placeholders
    lr = tf.placeholder(tf.float32, name='learning_rate')
    p_keep = tf.placeholder(tf.float32, name='p_keep')
    batch_size = tf.placeholder(tf.int32, name='batch_size')

    # input placeholders
    X = tf.placeholder(tf.uint8, [None, None], name='X')
    Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)
    # expected outputs = same sequence shifted by 1
    Y_ = tf.placeholder(tf.uint8, [None, None], name='Y_')
    Yo_ = tf.one_hot(Y_, ALPHASIZE, 1.0, 0.0)
    # input state
    Hin = tf.placeholder(tf.float32, [None, INTERNALSIZE * NLAYERS], name='Hin')

    # Using NLAYERS of GRU cells, unrolled SEQLEN times.
    # dynamic_rnn infers SEQLEN from the size of the inputs Xo.
    cells = [rnn.GRUCell(INTERNALSIZE) for _ in range(NLAYERS)]
    # simple "naive" dropout on the inputs of each cell
    drop_cells = [
        rnn.DropoutWrapper(cell, input_keep_prob=p_keep) for cell in cells
    ]
    multi_cell = rnn.MultiRNNCell(drop_cells, state_is_tuple=False)
    multi_cell = rnn.DropoutWrapper(multi_cell, output_keep_prob=p_keep)
    # ^ The last layer is for the softmax dropout

    Yr, H = tf.nn.dynamic_rnn(multi_cell, Xo, dtype=tf.float32, initial_state=Hin)
    # H is the last state
    H = tf.identity(H, name='H')  # give it a tf name

    # Softmax layer implementation:
    # Flatten the first two dimensions of the output [ BATCHSIZE, SEQLEN, ALPHASIZE ]
    # => [ BATCHSIZE x SEQLEN, ALPHASIZE ], then apply the softmax readout layer.
    # This way, the weights and biases are shared across unrolled time steps.
    # From the readout point of view, a value coming from a sequence time step or a
    # minibatch item is the same thing.
    Yflat = tf.reshape(Yr, [-1, INTERNALSIZE])
    Ylogits = layers.linear(Yflat, ALPHASIZE)
    Yflat_ = tf.reshape(Yo_, [-1, ALPHASIZE])
    loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=Ylogits, labels=Yflat_)
    loss = tf.reshape(loss, [batch_size, -1])
    Yo = tf.nn.softmax(Ylogits, name='Yo')
    Y = tf.argmax(Yo, 1)
    Y = tf.reshape(Y, [batch_size, -1], name="Y")
    train_step = tf.train.AdamOptimizer(lr).minimize(loss)

    # stats for display
    seqloss = tf.reduce_mean(loss, 1)
    batchloss = tf.reduce_mean(seqloss)
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(Y_, tf.cast(Y, tf.uint8)), tf.float32))
    loss_summary = tf.summary.scalar("batch_loss", batchloss)
    acc_summary = tf.summary.scalar("batch_accuracy", accuracy)
    summaries = tf.summary.merge([loss_summary, acc_summary])

    # Init Tensorboard stuff. This will save Tensorboard information into a different
    # folder at each run named 'log/<timestamp>/'. Two sets of data are saved so that
    # you can compare training and validation curves visually in Tensorboard.
    timestamp = str(math.trunc(time.time()))
    # summary_writer = tf.summary.FileWriter("log/" + timestamp + "-training")
    # validation_writer = tf.summary.FileWriter("log/" + timestamp + "-validation")

    # For saving models
    os.makedirs(args.save_dir, exist_ok=True)
    saver = tf.train.Saver(max_to_keep=10)  # keep the 10 most recent checkpoints
    gen_file = open(args.save_dir + 'generated.txt', 'w')

    # For displaying progress: init the progress bar
    # (own implementation; theirs produced too much output).
    # TODO: Change this eventually
    DISPLAY_FREQ = 50
    _50_BATCHES = DISPLAY_FREQ * BATCHSIZE * SEQLEN
    progress = txt.Progress(DISPLAY_FREQ,
                            size=111 + 2,
                            msg="Training on next " + str(DISPLAY_FREQ) + " batches")

    # initial zero input state
    istate = np.zeros([BATCHSIZE, INTERNALSIZE * NLAYERS])
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)
    step = 0

    # Training loop
    for x, y_, epoch in txt.rnn_minibatch_sequencer(codetext, BATCHSIZE, SEQLEN,
                                                    nb_epochs=args.epochs):
        # train on one minibatch
        feed_dict = {
            X: x,
            Y_: y_,
            Hin: istate,
            lr: learning_rate,
            p_keep: dropout_keep,
            batch_size: BATCHSIZE
        }
        _, y, ostate = sess.run([train_step, Y, H], feed_dict=feed_dict)

        # display training progress every 50 batches
        if step % _50_BATCHES == 0:
            feed_dict = {
                X: x,
                Y_: y_,
                Hin: istate,
                p_keep: 1.0,  # no dropout for evaluation
                batch_size: BATCHSIZE
            }
            y, l, bl, acc, smm = sess.run(
                [Y, seqloss, batchloss, accuracy, summaries],
                feed_dict=feed_dict)
            txt.print_learning_learned_comparison(x, y, l, bookranges, bl, acc,
                                                  epoch_size, step, epoch)
            # summary_writer.add_summary(smm, step)

        # run a validation step every 50 batches
        # The validation text should be a single sequence but that's too slow (1s per 1024 chars!),
        # so we cut it up and batch the pieces (slightly inaccurate)
        # tested: validating with 5K sequences instead of 1K is only slightly more
        # accurate, but a lot slower.
        if step % _50_BATCHES == 0 and len(valitext) > 0:
            # Sequence length for validation. State will be wrong at the start
            # of each sequence.
            VALI_SEQLEN = 1 * 1024
            bsize = len(valitext) // VALI_SEQLEN
            txt.print_validation_header(len(codetext), bookranges)
            vali_x, vali_y, _ = next(
                txt.rnn_minibatch_sequencer(valitext, bsize, VALI_SEQLEN, 1))  # all data in 1 batch
            vali_nullstate = np.zeros([bsize, INTERNALSIZE * NLAYERS])
            feed_dict = {
                X: vali_x,
                Y_: vali_y,
                Hin: vali_nullstate,
                p_keep: 1.0,  # no dropout for validation
                batch_size: bsize
            }
            ls, acc, smm = sess.run([batchloss, accuracy, summaries],
                                    feed_dict=feed_dict)
            txt.print_validation_stats(ls, acc)
            # save validation data for Tensorboard
            # validation_writer.add_summary(smm, step)
            # saver.save(sess, '{}/rnn_val_save'.format(args.save_dir + 'val/'),
            #            global_step=step)

        # display a short text generated with the current weights and biases
        # (every 150 batches)
        if step // 3 % _50_BATCHES == 0:
            txt.print_text_generation_header()
            ry = np.array([[txt.convert_from_alphabet(ord("K"))]])
            rh = np.zeros([1, INTERNALSIZE * NLAYERS])
            gen_file.write(
                '----------------- STEP {} -----------------\n'.format(step))
            for k in range(1000):
                ryo, rh = sess.run([Yo, H],
                                   feed_dict={
                                       X: ry,
                                       p_keep: 1.0,
                                       Hin: rh,
                                       batch_size: 1
                                   })
                rc = txt.sample_from_probabilities(ryo,
                                                   topn=10 if epoch <= 1 else 2)
                letter = chr(txt.convert_to_alphabet(rc))
                gen_file.write(letter)
                print(letter, end='')
                ry = np.array([[rc]])
            txt.print_text_generation_footer()
            gen_file.write('\n')

        # display progress bar
        progress.step(reset=step % _50_BATCHES == 0)

        # loop state around
        istate = ostate
        step += BATCHSIZE * SEQLEN

    gen_file.close()
    saved_file = saver.save(sess,
                            '{}rnn_{}'.format(args.save_dir, args.epochs),
                            global_step=step)
    print("Saved file: " + saved_file)
def __init__(self, inputs_tf, dimo, dimz, dimg, dimu, max_u, o_stats, g_stats,
             hidden, layers, env_name, **kwargs):
    """The discriminator network and related training code.

    Args:
        inputs_tf (dict of tensors): all necessary inputs for the network: the
            observation (o), the goal (g), and the action (u)
        dimo (int): the dimension of the observations
        dimz (int): the dimension of the latent skill variable
        dimg (int): the dimension of the goals
        dimu (int): the dimension of the actions
        max_u (float): the maximum magnitude of actions; action outputs will be
            scaled accordingly
        o_stats (baselines.her.Normalizer): normalizer for observations
        g_stats (baselines.her.Normalizer): normalizer for goals
        hidden (int): number of hidden units that should be used in hidden layers
        layers (int): number of hidden layers
        env_name (str): name of the environment, used to split observations
    """
    self.o_tf = tf.placeholder(tf.float32, shape=(None, self.dimo))
    self.z_tf = tf.placeholder(tf.float32, shape=(None, self.dimz))
    self.g_tf = tf.placeholder(tf.float32, shape=(None, self.dimg))

    obs_tau_excludes_goal, obs_tau_achieved_goal = split_observation_tf(
        self.env_name, self.o_tau_tf)
    obs_excludes_goal, obs_achieved_goal = split_observation_tf(
        self.env_name, self.o_tf)

    # Discriminator networks
    with tf.variable_scope('state_mi'):
        # Mutual Information Neural Estimation
        # shuffle and concatenate
        x_in = obs_tau_excludes_goal
        y_in = obs_tau_achieved_goal
        y_in_tran = tf.transpose(y_in, perm=[1, 0, 2])
        y_shuffle_tran = tf.random_shuffle(y_in_tran)
        y_shuffle = tf.transpose(y_shuffle_tran, perm=[1, 0, 2])
        x_conc = tf.concat([x_in, x_in], axis=-2)
        y_conc = tf.concat([y_in, y_shuffle], axis=-2)

        # propagate the forward pass
        layerx = tf_layers.linear(x_conc, int(self.hidden / 2))
        layery = tf_layers.linear(y_conc, int(self.hidden / 2))
        layer2 = tf.nn.relu(layerx + layery)
        output = tf_layers.linear(layer2, 1)
        output = tf.nn.tanh(output)

        # split in T_xy and T_x_y predictions
        N_samples = tf.shape(x_in)[-2]
        T_xy = output[:, :N_samples, :]
        T_x_y = output[:, N_samples:, :]

        # compute the negative loss (maximise loss == minimise -loss)
        mean_exp_T_x_y = tf.reduce_mean(tf.math.exp(T_x_y), axis=-2)
        neg_loss = -(tf.reduce_mean(T_xy, axis=-2) - tf.math.log(mean_exp_T_x_y))
        neg_loss = tf.check_numerics(neg_loss,
                                     'check_numerics caught bad neg_loss')
        self.mi_tf = neg_loss

    with tf.variable_scope('skill_ds'):
        self.logits_tf = nn(obs_achieved_goal,
                            [int(self.hidden / 2)] * self.layers + [self.dimz])
        self.sk_tf = tf.nn.softmax_cross_entropy_with_logits(
            labels=self.z_tf, logits=self.logits_tf)
        self.sk_r_tf = -1 * self.sk_tf
]
dropcells = [rnn.DropoutWrapper(cell, input_keep_prob=pkeep) for cell in cells]
stacked_cells = rnn.MultiRNNCell(dropcells, state_is_tuple=True)
stacked_cells = rnn.DropoutWrapper(stacked_cells, output_keep_prob=pkeep)

# Let dynamic_rnn do all the work
init_state = stacked_cells.zero_state(batchsize, tf.float32)
Y_out, last_state = tf.nn.dynamic_rnn(stacked_cells, Xo, dtype=tf.float32,
                                      initial_state=init_state)

# Flatten and set up softmax layer
Y_flat = tf.reshape(Y_out, [-1, state_size])   # batch_size*time_steps, state_size
Ylogits = layers.linear(Y_flat, NUM_CHARS)     # batch_size*time_steps, NUM_CHARS
Yflat_ = tf.reshape(Yo_, [-1, NUM_CHARS])      # batch_size*time_steps, NUM_CHARS
Yo = tf.nn.softmax(Ylogits)                    # batch_size*time_steps, NUM_CHARS
Y = tf.argmax(Yo, 1)                           # batch_size*time_steps
Y = tf.reshape(Y, [batchsize, -1])             # batch_size, time_steps

# Define our loss function
loss = tf.nn.softmax_cross_entropy_with_logits(
    logits=Ylogits, labels=Yflat_)             # batch_size*time_steps
loss = tf.reshape(loss, [batchsize, -1])       # batch_size, time_steps

# Define the training step using AdamOptimizer
train_step = tf.train.AdamOptimizer(learn_rate).minimize(loss)

# Define saver to create checkpoints during training
if not os.path.exists("checkpoints"):
def discriminator_0layer(H, opt, dropout, prefix='', num_outputs=1, is_reuse=None):
    H = tf.squeeze(H)
    biasInit = tf.constant_initializer(0.001, dtype=tf.float32)
    logits = layers.linear(tf.nn.dropout(H, keep_prob=dropout),
                           num_outputs=num_outputs,
                           biases_initializer=biasInit,
                           scope=prefix + 'dis',
                           reuse=is_reuse)
    return logits
if args.cell_type == 1:
    net = [rnn.BasicLSTMCell(args.internal_size, state_is_tuple=False)
           for _ in range(args.layers)]
else:
    net = [rnn.GRUCell(args.internal_size) for _ in range(args.layers)]
net = [rnn.DropoutWrapper(cell, input_keep_prob=dropout_prob) for cell in net]
multi_rnn = rnn.MultiRNNCell(net, state_is_tuple=False)
drop_multi_rnn = rnn.DropoutWrapper(multi_rnn, output_keep_prob=dropout_prob)

Yr, H = tf.nn.dynamic_rnn(drop_multi_rnn, Xo,
                          initial_state=initial_state, dtype=tf.float32)
H = tf.identity(H, name="H")

Yflat = tf.reshape(Yr, [-1, args.internal_size])
Ylogits = layers.linear(Yflat, VOCAB_SIZE)
Yflat_ = tf.reshape(Yo_, [-1, VOCAB_SIZE])
loss = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Yflat_)
loss = tf.reshape(loss, [batchsize, -1])
Yo = tf.nn.softmax(Ylogits, name="Yo")
Y = tf.argmax(Yo, 1)
Y = tf.reshape(Y, [batchsize, -1], name="Y")
train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

# stats for display
def test_dynamic_rnn_decoder_time_major(self):
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)) as varscope:
            # Define inputs/outputs to model
            batch_size = 2
            encoder_embedding_size = 3
            decoder_embedding_size = 4
            encoder_hidden_size = 5
            decoder_hidden_size = encoder_hidden_size
            input_sequence_length = 6
            decoder_sequence_length = 7
            num_decoder_symbols = 20
            start_of_sequence_id = end_of_sequence_id = 1
            decoder_embeddings = variable_scope.get_variable(
                "decoder_embeddings",
                [num_decoder_symbols, decoder_embedding_size],
                initializer=init_ops.random_normal_initializer(stddev=0.1))
            inputs = constant_op.constant(
                0.5,
                shape=[input_sequence_length, batch_size, encoder_embedding_size])
            decoder_inputs = constant_op.constant(
                0.4,
                shape=[decoder_sequence_length, batch_size, decoder_embedding_size])
            decoder_length = constant_op.constant(
                decoder_sequence_length, dtype=dtypes.int32, shape=[batch_size,])

            with variable_scope.variable_scope("rnn") as scope:
                # setting up weights for computing the final output
                output_fn = lambda x: layers.linear(
                    x, num_decoder_symbols, scope=scope)

                # Define model
                encoder_outputs, encoder_state = rnn.dynamic_rnn(
                    cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
                    inputs=inputs,
                    dtype=dtypes.float32,
                    time_major=True,
                    scope=scope)

            with variable_scope.variable_scope("decoder") as scope:
                # Train decoder
                decoder_cell = core_rnn_cell_impl.GRUCell(decoder_hidden_size)
                decoder_fn_train = Seq2SeqTest._decoder_fn_with_context_state(
                    decoder_fn_lib.simple_decoder_fn_train(
                        encoder_state=encoder_state))
                (decoder_outputs_train,
                 decoder_state_train,
                 decoder_context_state_train) = seq2seq.dynamic_rnn_decoder(
                     cell=decoder_cell,
                     decoder_fn=decoder_fn_train,
                     inputs=decoder_inputs,
                     sequence_length=decoder_length,
                     time_major=True,
                     scope=scope)
                decoder_outputs_train = output_fn(decoder_outputs_train)

                # Setup variable reuse
                scope.reuse_variables()

                # Inference decoder
                decoder_fn_inference = Seq2SeqTest._decoder_fn_with_context_state(
                    decoder_fn_lib.simple_decoder_fn_inference(
                        output_fn=output_fn,
                        encoder_state=encoder_state,
                        embeddings=decoder_embeddings,
                        start_of_sequence_id=start_of_sequence_id,
                        end_of_sequence_id=end_of_sequence_id,
                        # TODO: find out why it goes to +1
                        maximum_length=decoder_sequence_length - 1,
                        num_decoder_symbols=num_decoder_symbols,
                        dtype=dtypes.int32))
                (decoder_outputs_inference,
                 decoder_state_inference,
                 decoder_context_state_inference) = seq2seq.dynamic_rnn_decoder(
                     cell=decoder_cell,
                     decoder_fn=decoder_fn_inference,
                     time_major=True,
                     scope=scope)

            # Run model
            variables.global_variables_initializer().run()
            (decoder_outputs_train_res,
             decoder_state_train_res,
             decoder_context_state_train_res) = sess.run([
                 decoder_outputs_train, decoder_state_train,
                 decoder_context_state_train
             ])
            (decoder_outputs_inference_res,
             decoder_state_inference_res,
             decoder_context_state_inference_res) = sess.run([
                 decoder_outputs_inference, decoder_state_inference,
                 decoder_context_state_inference
             ])

            # Assert outputs
            self.assertEqual(
                (decoder_sequence_length, batch_size, num_decoder_symbols),
                decoder_outputs_train_res.shape)
            self.assertEqual((batch_size, num_decoder_symbols),
                             decoder_outputs_inference_res.shape[1:3])
            self.assertEqual(decoder_sequence_length,
                             decoder_context_state_inference_res)
            self.assertEqual((batch_size, decoder_hidden_size),
                             decoder_state_train_res.shape)
            self.assertEqual((batch_size, decoder_hidden_size),
                             decoder_state_inference_res.shape)
            self.assertEqual(decoder_sequence_length,
                             decoder_context_state_train_res)
            # The dynamic decoder might end earlier than `maximal_length`
            # under inference
            self.assertGreaterEqual(decoder_sequence_length,
                                    decoder_state_inference_res.shape[0])
def generative_net(z, data_size):
    return layers.linear(z, data_size)
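
# Hedged usage sketch (assumed wiring, not from the source): inference_net and
# generative_net compose into a linear autoencoder; `data` is a hypothetical
# [batch, 784] float tensor.
latent = inference_net(data, latent_size=32)
reconstruction = generative_net(latent, data_size=784)
recon_loss = tf.reduce_mean(tf.square(data - reconstruction))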
def build():
    """Builds the Tensorflow graph."""
    inputs, labels, lengths = None, None, None

    if mode in ('train', 'eval'):
        if isinstance(no_event_label, numbers.Number):
            label_shape = []
        else:
            label_shape = [len(no_event_label)]
        inputs, labels, lengths = magenta.common.get_padded_batch(
            sequence_example_file_paths,
            hparams.batch_size,
            input_size,
            label_shape=label_shape,
            shuffle=mode == 'train')
    elif mode == 'generate':
        inputs = tf.placeholder(tf.float32,
                                [hparams.batch_size, None, input_size])

    if isinstance(encoder_decoder,
                  magenta.music.OneHotIndexEventSequenceEncoderDecoder):
        expanded_inputs = tf.one_hot(
            tf.cast(tf.squeeze(inputs, axis=-1), tf.int64),
            encoder_decoder.input_depth)
    else:
        expanded_inputs = inputs

    dropout_keep_prob = 1.0 if mode == 'generate' else hparams.dropout_keep_prob

    if hparams.use_cudnn:
        outputs, initial_state, final_state = make_cudnn(
            expanded_inputs,
            hparams.rnn_layer_sizes,
            hparams.batch_size,
            mode,
            dropout_keep_prob=dropout_keep_prob,
            residual_connections=hparams.residual_connections)
    else:
        cell = make_rnn_cell(
            hparams.rnn_layer_sizes,
            dropout_keep_prob=dropout_keep_prob,
            attn_length=hparams.attn_length,
            residual_connections=hparams.residual_connections)
        initial_state = cell.zero_state(hparams.batch_size, tf.float32)
        outputs, final_state = tf.nn.dynamic_rnn(
            cell,
            inputs,
            sequence_length=lengths,
            initial_state=initial_state,
            swap_memory=True)

    outputs_flat = magenta.common.flatten_maybe_padded_sequences(
        outputs, lengths)
    if isinstance(num_classes, numbers.Number):
        num_logits = num_classes
    else:
        num_logits = sum(num_classes)
    logits_flat = contrib_layers.linear(outputs_flat, num_logits)

    if mode in ('train', 'eval'):
        labels_flat = magenta.common.flatten_maybe_padded_sequences(
            labels, lengths)

        if isinstance(num_classes, numbers.Number):
            softmax_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels_flat, logits=logits_flat)
            predictions_flat = tf.argmax(logits_flat, axis=1)
        else:
            logits_offsets = np.cumsum([0] + num_classes)
            softmax_cross_entropy = []
            predictions = []
            for i in range(len(num_classes)):
                softmax_cross_entropy.append(
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=labels_flat[:, i],
                        logits=logits_flat[:, logits_offsets[i]:
                                           logits_offsets[i + 1]]))
                predictions.append(
                    tf.argmax(
                        logits_flat[:, logits_offsets[i]:logits_offsets[i + 1]],
                        axis=1))
            predictions_flat = tf.stack(predictions, 1)

        correct_predictions = tf.to_float(
            tf.equal(labels_flat, predictions_flat))
        event_positions = tf.to_float(
            tf.not_equal(labels_flat, no_event_label))
        no_event_positions = tf.to_float(
            tf.equal(labels_flat, no_event_label))

        # Compute the total number of time steps across all sequences in the
        # batch. For some models this will be different from the number of RNN
        # steps.
        def batch_labels_to_num_steps(batch_labels, lengths):
            num_steps = 0
            for labels, length in zip(batch_labels, lengths):
                num_steps += encoder_decoder.labels_to_num_steps(labels[:length])
            return np.float32(num_steps)

        num_steps = tf.py_func(batch_labels_to_num_steps,
                               [labels, lengths], tf.float32)

        if mode == 'train':
            loss = tf.reduce_mean(softmax_cross_entropy)
            perplexity = tf.exp(loss)
            accuracy = tf.reduce_mean(correct_predictions)
            event_accuracy = (
                tf.reduce_sum(correct_predictions * event_positions) /
                tf.reduce_sum(event_positions))
            no_event_accuracy = (
                tf.reduce_sum(correct_predictions * no_event_positions) /
                tf.reduce_sum(no_event_positions))
            loss_per_step = tf.reduce_sum(softmax_cross_entropy) / num_steps
            perplexity_per_step = tf.exp(loss_per_step)

            optimizer = tf.train.AdamOptimizer(
                learning_rate=hparams.learning_rate)
            train_op = contrib_slim.learning.create_train_op(
                loss, optimizer, clip_gradient_norm=hparams.clip_norm)
            tf.add_to_collection('train_op', train_op)

            vars_to_summarize = {
                'loss': loss,
                'metrics/perplexity': perplexity,
                'metrics/accuracy': accuracy,
                'metrics/event_accuracy': event_accuracy,
                'metrics/no_event_accuracy': no_event_accuracy,
                'metrics/loss_per_step': loss_per_step,
                'metrics/perplexity_per_step': perplexity_per_step,
            }
        elif mode == 'eval':
            vars_to_summarize, update_ops = contrib_metrics.aggregate_metric_map({
                'loss':
                    tf.metrics.mean(softmax_cross_entropy),
                'metrics/accuracy':
                    tf.metrics.accuracy(labels_flat, predictions_flat),
                'metrics/per_class_accuracy':
                    tf.metrics.mean_per_class_accuracy(labels_flat,
                                                       predictions_flat,
                                                       num_classes),
                'metrics/event_accuracy':
                    tf.metrics.recall(event_positions, correct_predictions),
                'metrics/no_event_accuracy':
                    tf.metrics.recall(no_event_positions, correct_predictions),
                'metrics/loss_per_step':
                    tf.metrics.mean(
                        tf.reduce_sum(softmax_cross_entropy) / num_steps,
                        weights=num_steps),
            })
            for updates_op in update_ops.values():
                tf.add_to_collection('eval_ops', updates_op)

            # Perplexity is just exp(loss) and doesn't need its own update op.
            vars_to_summarize['metrics/perplexity'] = tf.exp(
                vars_to_summarize['loss'])
            vars_to_summarize['metrics/perplexity_per_step'] = tf.exp(
                vars_to_summarize['metrics/loss_per_step'])

        for var_name, var_value in six.iteritems(vars_to_summarize):
            tf.summary.scalar(var_name, var_value)
            tf.add_to_collection(var_name, var_value)
    elif mode == 'generate':
        temperature = tf.placeholder(tf.float32, [])
        if isinstance(num_classes, numbers.Number):
            softmax_flat = tf.nn.softmax(
                tf.div(logits_flat, tf.fill([num_classes], temperature)))
            softmax = tf.reshape(softmax_flat,
                                 [hparams.batch_size, -1, num_classes])
        else:
            logits_offsets = np.cumsum([0] + num_classes)
            softmax = []
            for i in range(len(num_classes)):
                sm = tf.nn.softmax(
                    tf.div(
                        logits_flat[:, logits_offsets[i]:logits_offsets[i + 1]],
                        tf.fill([num_classes[i]], temperature)))
                sm = tf.reshape(sm, [hparams.batch_size, -1, num_classes[i]])
                softmax.append(sm)

        tf.add_to_collection('inputs', inputs)
        tf.add_to_collection('temperature', temperature)
        tf.add_to_collection('softmax', softmax)
        # Flatten state tuples for metagraph compatibility.
        for state in tf_nest.flatten(initial_state):
            tf.add_to_collection('initial_state', state)
        for state in tf_nest.flatten(final_state):
            tf.add_to_collection('final_state', state)
def vgg_16(inputs,
           num_classes=1000,
           is_training=True,
           dropout_keep_prob=0.5,
           spatial_squeeze=True,
           dataset='cifar',
           scope='vgg_16'):
    """Oxford Net VGG 16-Layers version D Example.

    Note: All the fully_connected layers have been transformed to conv2d layers.
          To use in classification mode, resize input to 224x224.

    Args:
        inputs: a tensor of size [batch_size, height, width, channels].
        num_classes: number of predicted classes.
        is_training: whether or not the model is being trained.
        dropout_keep_prob: the probability that activations are kept in the
            dropout layers during training.
        spatial_squeeze: whether or not should squeeze the spatial dimensions of
            the outputs. Useful to remove unnecessary dimensions for classification.
        dataset: 'cifar' or 'f_mnist'; selects the filter widths and pooling.
        scope: Optional scope for the variables.

    Returns:
        the last op containing the log predictions and end_points dict.
    """
    with variable_scope.variable_scope(scope, 'vgg_16', [inputs]) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with arg_scope(
                [layers.conv2d, layers_lib.fully_connected, layers_lib.max_pool2d],
                outputs_collections=end_points_collection):

            def ConvBatchRelu(layer_input, n_output_plane, name):
                with variable_scope.variable_scope(name):
                    output = layers.conv2d(layer_input, n_output_plane, [3, 3],
                                           scope='conv')
                    output = layers.batch_norm(output,
                                               center=True,
                                               scale=True,
                                               activation_fn=tf.nn.relu,
                                               is_training=is_training)
                return output

            filters = [
                64, 64, 128, 128, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512
            ]
            if dataset == 'f_mnist':
                filters = [_ // 4 for _ in filters]
            elif dataset != 'cifar':
                raise NotImplementedError(
                    "Dataset {} is not supported!".format(dataset))

            net = ConvBatchRelu(inputs, filters[0], 'conv1_1')
            net = ConvBatchRelu(net, filters[1], 'conv1_2')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool1')
            net = ConvBatchRelu(net, filters[2], 'conv2_1')
            net = ConvBatchRelu(net, filters[3], 'conv2_2')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool2')
            net = ConvBatchRelu(net, filters[4], 'conv3_1')
            net = ConvBatchRelu(net, filters[5], 'conv3_2')
            net = ConvBatchRelu(net, filters[6], 'conv3_3')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool3')
            net = ConvBatchRelu(net, filters[7], 'conv4_1')
            net = ConvBatchRelu(net, filters[8], 'conv4_2')
            net = ConvBatchRelu(net, filters[9], 'conv4_3')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool4')
            net = ConvBatchRelu(net, filters[10], 'conv5_1')
            net = ConvBatchRelu(net, filters[11], 'conv5_2')
            net = ConvBatchRelu(net, filters[12], 'conv5_3')
            if dataset == 'cifar':
                net = layers_lib.max_pool2d(net, [2, 2], scope='pool5')

            # Use conv2d instead of fully_connected layers.
            net = layers.flatten(net, scope='flatten6')
            net = layers_lib.dropout(net, 0.5, is_training=is_training,
                                     scope='dropout6')
            net = layers.relu(net, filters[13])
            net = layers_lib.dropout(net, 0.5, is_training=is_training,
                                     scope='dropout7')
            net = layers.linear(net, num_classes)

            # Convert end_points_collection into a end_point dict.
            end_points = utils.convert_collection_to_dict(end_points_collection)
            end_points[sc.name + '/fc8'] = net
            return net, end_points
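
# Hedged usage sketch (not from the source): classify a batch of CIFAR images;
# `images` is a hypothetical [batch, 32, 32, 3] float tensor.
logits, end_points = vgg_16(images, num_classes=10, is_training=True,
                            dataset='cifar')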
def make_cudnn(inputs,
               rnn_layer_sizes,
               batch_size,
               mode,
               dropout_keep_prob=1.0,
               residual_connections=False):
    """Builds a sequence of cuDNN LSTM layers from the given hyperparameters.

    Args:
        inputs: A tensor of RNN inputs.
        rnn_layer_sizes: A list of integer sizes (in units) for each layer of
            the RNN.
        batch_size: The number of examples per batch.
        mode: 'train', 'eval', or 'generate'. For 'generate',
            CudnnCompatibleLSTMCell will be used.
        dropout_keep_prob: The float probability to keep the output of any given
            sub-cell.
        residual_connections: Whether or not to use residual connections.

    Returns:
        outputs: A tensor of RNN outputs, with shape
            `[batch_size, inputs.shape[1], rnn_layer_sizes[-1]]`.
        initial_state: The initial RNN states, a tuple with length
            `len(rnn_layer_sizes)` of LSTMStateTuples.
        final_state: The final RNN states, a tuple with length
            `len(rnn_layer_sizes)` of LSTMStateTuples.
    """
    cudnn_inputs = tf.transpose(inputs, [1, 0, 2])

    if len(set(rnn_layer_sizes)) == 1 and not residual_connections:
        initial_state = tuple(
            contrib_rnn.LSTMStateTuple(
                h=tf.zeros([batch_size, num_units], dtype=tf.float32),
                c=tf.zeros([batch_size, num_units], dtype=tf.float32))
            for num_units in rnn_layer_sizes)

        if mode != 'generate':
            # We can make a single call to CudnnLSTM since all layers are the same
            # size and we aren't using residual connections.
            cudnn_initial_state = state_tuples_to_cudnn_lstm_state(initial_state)
            cell = contrib_cudnn_rnn.CudnnLSTM(
                num_layers=len(rnn_layer_sizes),
                num_units=rnn_layer_sizes[0],
                direction='unidirectional',
                dropout=1.0 - dropout_keep_prob)
            cudnn_outputs, cudnn_final_state = cell(
                cudnn_inputs,
                initial_state=cudnn_initial_state,
                training=mode == 'train')
            final_state = cudnn_lstm_state_to_state_tuples(cudnn_final_state)
        else:
            # At generation time we use CudnnCompatibleLSTMCell.
            cell = contrib_rnn.MultiRNNCell([
                contrib_cudnn_rnn.CudnnCompatibleLSTMCell(num_units)
                for num_units in rnn_layer_sizes
            ])
            cudnn_outputs, final_state = tf.nn.dynamic_rnn(
                cell,
                cudnn_inputs,
                initial_state=initial_state,
                time_major=True,
                scope='cudnn_lstm/rnn')
    else:
        # We need to make multiple calls to CudnnLSTM, keeping the initial and
        # final states at each layer.
        initial_state = []
        final_state = []

        for i in range(len(rnn_layer_sizes)):
            # If we're using residual connections and this layer is not the same
            # size as the previous layer, we need to project into the new size so
            # the (projected) input can be added to the output.
            if residual_connections:
                if i == 0 or rnn_layer_sizes[i] != rnn_layer_sizes[i - 1]:
                    cudnn_inputs = contrib_layers.linear(cudnn_inputs,
                                                         rnn_layer_sizes[i])

            layer_initial_state = (contrib_rnn.LSTMStateTuple(
                h=tf.zeros([batch_size, rnn_layer_sizes[i]], dtype=tf.float32),
                c=tf.zeros([batch_size, rnn_layer_sizes[i]], dtype=tf.float32)),)

            if mode != 'generate':
                cudnn_initial_state = state_tuples_to_cudnn_lstm_state(
                    layer_initial_state)
                cell = contrib_cudnn_rnn.CudnnLSTM(
                    num_layers=1,
                    num_units=rnn_layer_sizes[i],
                    direction='unidirectional',
                    dropout=1.0 - dropout_keep_prob)
                cudnn_outputs, cudnn_final_state = cell(
                    cudnn_inputs,
                    initial_state=cudnn_initial_state,
                    training=mode == 'train')
                layer_final_state = cudnn_lstm_state_to_state_tuples(
                    cudnn_final_state)
            else:
                # At generation time we use CudnnCompatibleLSTMCell.
                cell = contrib_rnn.MultiRNNCell([
                    contrib_cudnn_rnn.CudnnCompatibleLSTMCell(rnn_layer_sizes[i])
                ])
                cudnn_outputs, layer_final_state = tf.nn.dynamic_rnn(
                    cell,
                    cudnn_inputs,
                    initial_state=layer_initial_state,
                    time_major=True,
                    scope='cudnn_lstm/rnn' if i == 0 else 'cudnn_lstm_%d/rnn' % i)

            if residual_connections:
                cudnn_outputs += cudnn_inputs

            cudnn_inputs = cudnn_outputs
            initial_state += layer_initial_state
            final_state += layer_final_state

    outputs = tf.transpose(cudnn_outputs, [1, 0, 2])

    return outputs, tuple(initial_state), tuple(final_state)
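
# Hedged usage sketch (not from the source), following the docstring above:
# two stacked 256-unit cuDNN LSTM layers over a hypothetical `inputs` tensor
# of shape [64, time, input_depth].
outputs, initial_state, final_state = make_cudnn(
    inputs, rnn_layer_sizes=[256, 256], batch_size=64, mode='train',
    dropout_keep_prob=0.9)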
def train():
    samples = tf.placeholder(tf.float32, [None, None, INPUT_SIZE])  # (batch, time, in)
    ground_truth = tf.placeholder(tf.float32, [None, OUTPUT_SIZE])  # (batch, out)

    cell, initial_state = create_model(model=FLAGS.model,
                                       num_cells=[FLAGS.rnn_cells] * FLAGS.rnn_layers,
                                       batch_size=FLAGS.batch_size)
    rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, samples, dtype=tf.float32,
                                                initial_state=initial_state)

    # Split the outputs of the RNN into the actual outputs and the state update gate
    rnn_outputs, updated_states = split_rnn_outputs(FLAGS.model, rnn_outputs)

    out = layers.linear(inputs=rnn_outputs[:, -1, :], num_outputs=OUTPUT_SIZE)

    # Compute L2 loss
    mse = tf.nn.l2_loss(ground_truth - out) / FLAGS.batch_size

    # Compute loss for each updated state
    budget_loss = compute_budget_loss(FLAGS.model, mse, updated_states,
                                      FLAGS.cost_per_sample)

    # Combine all losses
    loss = mse + budget_loss

    # Optimizer
    opt, grads_and_vars = compute_gradients(loss, FLAGS.learning_rate,
                                            FLAGS.grad_clip)
    train_fn = opt.apply_gradients(grads_and_vars)

    sess = tf.Session()

    log_dir = os.path.join(FLAGS.logdir,
                           datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    valid_writer = tf.summary.FileWriter(log_dir + '/val')

    sess.run(tf.global_variables_initializer())

    try:
        num_iters = 0
        while True:
            # Generate new batch and perform SGD update
            x, y = generate_batch(min_val=MIN_VAL,
                                  max_val=MAX_VAL,
                                  seq_length=FLAGS.sequence_length,
                                  batch_size=FLAGS.batch_size)
            sess.run([train_fn], feed_dict={samples: x, ground_truth: y})
            num_iters += 1

            # Evaluate on validation data generated on the fly
            if num_iters % FLAGS.evaluate_every == 0:
                valid_error, valid_steps = 0., 0.
                for _ in range(FLAGS.validation_batches):
                    valid_x, valid_y = generate_batch(
                        min_val=MIN_VAL,
                        max_val=MAX_VAL,
                        seq_length=FLAGS.sequence_length,
                        batch_size=FLAGS.batch_size)
                    valid_iter_error, valid_used_inputs = sess.run(
                        [mse, updated_states],
                        feed_dict={
                            samples: valid_x,
                            ground_truth: valid_y
                        })
                    valid_error += valid_iter_error
                    if valid_used_inputs is not None:
                        valid_steps += compute_used_samples(valid_used_inputs)
                    else:
                        valid_steps += FLAGS.sequence_length
                valid_error /= FLAGS.validation_batches
                valid_steps /= FLAGS.validation_batches

                valid_writer.add_summary(scalar_summary('error', valid_error),
                                         num_iters)
                valid_writer.add_summary(
                    scalar_summary('used_samples',
                                   valid_steps / FLAGS.sequence_length),
                    num_iters)
                valid_writer.flush()

                print("Iteration %d, "
                      "validation error: %.7f, "
                      "validation samples: %.2f%%" %
                      (num_iters, valid_error,
                       100. * valid_steps / FLAGS.sequence_length))
    except KeyboardInterrupt:
        pass
def main(args): """Main function to train the model. Args: args: Parsed arguments. Returns: Execution status defined by `constants.ExitCode`. """ # Validate paths. if not validate_paths(args): return constants.ExitCode.INVALID_PATH # Extract paths. input_dir = args.input_dir model_dir = args.model_dir log_dir = args.log_dir existing_model = args.existing_model # Extract model parameters. batch_size = args.batch_size dropout_pkeep = args.dropout_pkeep hidden_state_size = args.hidden_state_size hidden_layer_size = args.hidden_layer_size learning_rate = args.learning_rate # Extract additional flags. debug = args.debug validation = args.validation # Split corpus for training and validation. # validation_text will be empty if validation is False. code_text, validation_text, input_ranges = utils.read_data_files( input_dir, validation=validation) # Bail out if we don't have enough corpus for training. if len(code_text) < batch_size * constants.TRAINING_SEQLEN + 1: return constants.ExitCode.CORPUS_TOO_SMALL # Get corpus files info. Will be used in debug mode to generate sample text. files_info_list = [] if debug: files_info_list = utils.get_files_info(input_dir) assert files_info_list # Calculate validation batch size. It will be 0 if we choose not to validate. validation_batch_size = len(validation_text) // constants.VALIDATION_SEQLEN # Display some stats on the data. epoch_size = len(code_text) // (batch_size * constants.TRAINING_SEQLEN) utils.print_data_stats(len(code_text), len(validation_text), epoch_size) # Set graph-level random seed, so any random sequence generated in this # graph is repeatable. It could also be removed. tf.set_random_seed(0) # Define placeholder for learning rate, dropout and batch size. lr = tf.placeholder(tf.float32, name='lr') pkeep = tf.placeholder(tf.float32, name='pkeep') batchsize = tf.placeholder(tf.int32, name='batchsize') # Input data. input_bytes = tf.placeholder(tf.uint8, [None, None], name='input_bytes') input_onehot = tf.one_hot(input_bytes, constants.ALPHA_SIZE, 1.0, 0.0) # Expected outputs = same sequence shifted by 1, since we are trying to # predict the next character. expected_bytes = tf.placeholder(tf.uint8, [None, None], name='expected_bytes') expected_onehot = tf.one_hot(expected_bytes, constants.ALPHA_SIZE, 1.0, 0.0) # Input state. hidden_state = tf.placeholder( tf.float32, [None, hidden_state_size * hidden_layer_size], name='hidden_state') # "naive dropout" implementation. cells = [rnn.GRUCell(hidden_state_size) for _ in range(hidden_layer_size)] dropcells = [ rnn.DropoutWrapper(cell, input_keep_prob=pkeep) for cell in cells ] multicell = rnn.MultiRNNCell(dropcells, state_is_tuple=False) multicell = rnn.DropoutWrapper(multicell, output_keep_prob=pkeep) output_raw, next_state = tf.nn.dynamic_rnn(multicell, input_onehot, dtype=tf.float32, initial_state=hidden_state) next_state = tf.identity(next_state, name='next_state') # Reshape training outputs. output_flat = tf.reshape(output_raw, [-1, hidden_state_size]) output_logits = layers.linear(output_flat, constants.ALPHA_SIZE) # Reshape expected outputs. expected_flat = tf.reshape(expected_onehot, [-1, constants.ALPHA_SIZE]) # Compute training loss. loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=output_logits, labels=expected_flat) loss = tf.reshape(loss, [batchsize, -1]) # Use softmax to normalize training outputs. output_onehot = tf.nn.softmax(output_logits, name='output_onehot') # Use argmax to get the max value, which is the predicted bytes. 
  output_bytes = tf.argmax(output_onehot, 1)
  output_bytes = tf.reshape(output_bytes, [batchsize, -1], name='output_bytes')

  # Choose Adam optimizer to compute gradients.
  optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

  # Stats for display.
  seqloss = tf.reduce_mean(loss, 1)
  batchloss = tf.reduce_mean(seqloss)
  accuracy = tf.reduce_mean(
      tf.cast(tf.equal(expected_bytes, tf.cast(output_bytes, tf.uint8)),
              tf.float32))
  loss_summary = tf.summary.scalar('batch_loss', batchloss)
  acc_summary = tf.summary.scalar('batch_accuracy', accuracy)
  summaries = tf.summary.merge([loss_summary, acc_summary])

  # Init Tensorboard stuff.
  # This will save Tensorboard information in the folder specified in command line.
  # Two sets of data are saved so that you can compare training and
  # validation curves visually in Tensorboard.
  timestamp = str(math.trunc(time.time()))
  summary_writer = tf.summary.FileWriter(
      os.path.join(log_dir, timestamp + '-training'))
  validation_writer = tf.summary.FileWriter(
      os.path.join(log_dir, timestamp + '-validation'))

  # Init for saving models.
  # They will be saved into a directory specified in command line.
  saver = tf.train.Saver(max_to_keep=constants.MAX_TO_KEEP)

  # For display: init the progress bar.
  step_size = batch_size * constants.TRAINING_SEQLEN
  frequency = constants.DISPLAY_FREQ * step_size
  progress = utils.Progress(constants.DISPLAY_FREQ,
                            size=constants.DISPLAY_LEN,
                            msg='Training on next {} batches'.format(
                                constants.DISPLAY_FREQ))

  # Set initial state.
  state = np.zeros([batch_size, hidden_state_size * hidden_layer_size])
  session = tf.Session()

  # We continue training on an existing model, or start with a new model.
  if existing_model:
    print('Continue training on existing model: {}'.format(existing_model))
    try:
      saver.restore(session, existing_model)
    except Exception:
      print(('Failed to restore existing model since model '
             'parameters do not match.'),
            file=sys.stderr)
      return constants.ExitCode.TENSORFLOW_ERROR
  else:
    print('No existing model provided. Start training with a new model.')
    session.run(tf.global_variables_initializer())

  # Num of bytes we have trained so far.
  steps = 0

  # Training loop.
  for input_batch, expected_batch, epoch in utils.rnn_minibatch_sequencer(
      code_text,
      batch_size,
      constants.TRAINING_SEQLEN,
      nb_epochs=constants.EPOCHS):
    # Train on one mini-batch.
    feed_dict = {
        input_bytes: input_batch,
        expected_bytes: expected_batch,
        hidden_state: state,
        lr: learning_rate,
        pkeep: dropout_pkeep,
        batchsize: batch_size
    }
    _, predicted, new_state = session.run(
        [optimizer, output_bytes, next_state], feed_dict=feed_dict)

    # Log training data for Tensorboard display a mini-batch of sequences
    # every `frequency` batches.
    if debug and steps % frequency == 0:
      feed_dict = {
          input_bytes: input_batch,
          expected_bytes: expected_batch,
          hidden_state: state,
          pkeep: 1.0,
          batchsize: batch_size
      }
      predicted, seq_loss, batch_loss, acc_value, summaries_value = session.run(
          [output_bytes, seqloss, batchloss, accuracy, summaries],
          feed_dict=feed_dict)
      utils.print_learning_learned_comparison(input_batch, predicted, seq_loss,
                                              input_ranges, batch_loss,
                                              acc_value, epoch_size, steps,
                                              epoch)
      summary_writer.add_summary(summaries_value, steps)

    # Run a validation step every `frequency` batches.
    # The validation text should be a single sequence but that's too slow.
    # We cut it up and batch the pieces (slightly inaccurate).
if validation and steps % frequency == 0 and validation_batch_size: utils.print_validation_header(len(code_text), input_ranges) validation_x, validation_y, _ = next( utils.rnn_minibatch_sequencer(validation_text, validation_batch_size, constants.VALIDATION_SEQLEN, 1)) null_state = np.zeros( [validation_batch_size, hidden_state_size * hidden_layer_size]) feed_dict = { input_bytes: validation_x, expected_bytes: validation_y, hidden_state: null_state, pkeep: 1.0, batchsize: validation_batch_size } batch_loss, acc_value, summaries_value = session.run( [batchloss, accuracy, summaries], feed_dict=feed_dict) utils.print_validation_stats(batch_loss, acc_value) # Save validation data for Tensorboard. validation_writer.add_summary(summaries_value, steps) # Display a short text generated with the current weights and biases. # If enabled, there will be a large output. if debug and steps // 4 % frequency == 0: utils.print_text_generation_header() file_info = utils.random_element_from_list(files_info_list) first_byte, file_size = file_info['first_byte'], file_info[ 'file_size'] ry = np.array([[first_byte]]) rh = np.zeros([1, hidden_state_size * hidden_layer_size]) sample = [first_byte] for _ in range(file_size - 1): feed_dict = { input_bytes: ry, pkeep: 1.0, hidden_state: rh, batchsize: 1 } ryo, rh = session.run([output_onehot, next_state], feed_dict=feed_dict) rc = utils.sample_from_probabilities( ryo, topn=10 if epoch <= 1 else 2) sample.append(rc) ry = np.array([[rc]]) print(repr(utils.decode_to_text(sample))) utils.print_text_generation_footer() # Save a checkpoint every `10 * frequency` batches. Each checkpoint is # a version of model. if steps // 10 % frequency == 0: saved_model_name = constants.RNN_MODEL_NAME + '_' + timestamp saved_model_path = os.path.join(model_dir, saved_model_name) saved_model = saver.save(session, saved_model_path, global_step=steps) print('Saved model: {}'.format(saved_model)) # Display progress bar. if debug: progress.step(reset=steps % frequency == 0) # Update state. state = new_state steps += step_size # Save the model after training is done. saved_model_name = constants.RNN_MODEL_NAME + '_' + timestamp saved_model_path = os.path.join(model_dir, saved_model_name) saved_model = saver.save(session, saved_model_path, global_step=steps) print('Saved model: {}'.format(saved_model)) return constants.ExitCode.SUCCESS
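# `utils.rnn_minibatch_sequencer` is assumed to yield aligned (input, target)
# windows over the corpus, with targets shifted by one byte. A simplified numpy
# sketch (the real helper may additionally keep each batch row continuous
# across epochs):
import numpy as np

def rnn_minibatch_sequencer(data, batch_size, sequence_length, nb_epochs):
    data = np.array(data)
    nb_batches = (data.shape[0] - 1) // (batch_size * sequence_length)
    rounded_len = nb_batches * batch_size * sequence_length
    xdata = data[:rounded_len].reshape([batch_size, -1])
    ydata = data[1:rounded_len + 1].reshape([batch_size, -1])  # shifted by 1
    for epoch in range(nb_epochs):
        for batch in range(nb_batches):
            x = xdata[:, batch * sequence_length:(batch + 1) * sequence_length]
            y = ydata[:, batch * sequence_length:(batch + 1) * sequence_length]
            yield x, y, epoch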
def discriminator(x, hidden_size, scope='Discriminator', reuse=False): with tf.variable_scope(scope, reuse=reuse): h0 = tf.tanh(layers.linear(x, hidden_size * 2)) h1 = tf.tanh(layers.linear(h0, hidden_size * 2)) h2 = tf.tanh(layers.linear(h1, hidden_size * 2)) return tf.sigmoid(layers.linear(h2, 1))
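# Usage sketch for the discriminator above (input shapes are assumptions):
# score a real and a generated batch with shared weights via `reuse`.
import tensorflow as tf

real_samples = tf.random_normal([32, 2])
fake_samples = tf.random_normal([32, 2])
d_real = discriminator(real_samples, hidden_size=16)
# reuse=True makes the second call share the variables created by the first.
d_fake = discriminator(fake_samples, hidden_size=16, reuse=True)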
def create_model(input_tensor, mode, hyper_params):
    """Creates a function classifier model using a GRU.

    :param input_tensor: A dictionary containing all input tensors.
    :param mode: If the network is training or evaluating (tf.estimator.ModeKeys)
    :param hyper_params: The hyper parameters object containing
        {"arch": {"pkeep": Float, "sequence_length": Int, "hidden_layer_depth": Int,
                  "hidden_layer_size": Int, "output_dimension": Int}}
    :return: The model as a dictionary of output tensors.
    """
    outputs = {}
    with tf.variable_scope('GruFunctionClassifier') as scope:
        batch_size = hyper_params.train.batch_size
        if mode == tf.estimator.ModeKeys.EVAL:
            batch_size = hyper_params.train.validation_batch_size
        if mode == tf.estimator.ModeKeys.PREDICT:
            batch_size = 1

        # Define inputs
        input_tensor = tf.reshape(
            input_tensor["feature"],
            (batch_size, hyper_params.arch.sequence_length, 1))
        Hin = tf.zeros([
            batch_size,
            hyper_params.arch.hidden_layer_size *
            hyper_params.arch.hidden_layer_depth
        ], tf.float32, name="Hin")

        # Define the actual cells
        cells = [
            rnn.GRUCell(hyper_params.arch.hidden_layer_size)
            for _ in range(hyper_params.arch.hidden_layer_depth)
        ]

        # "naive dropout" implementation
        if mode == tf.estimator.ModeKeys.TRAIN:
            cells = [
                rnn.DropoutWrapper(cell, input_keep_prob=hyper_params.arch.pkeep)
                for cell in cells
            ]
        multicell = rnn.MultiRNNCell(cells, state_is_tuple=False)
        if mode == tf.estimator.ModeKeys.TRAIN:
            multicell = rnn.DropoutWrapper(
                multicell, output_keep_prob=hyper_params.arch.pkeep
            )  # dropout for the softmax layer

        Yr, H = tf.nn.dynamic_rnn(multicell,
                                  input_tensor,
                                  dtype=tf.float32,
                                  initial_state=Hin)
        H = tf.identity(H, name='H')  # just to give it a name

        # Softmax layer implementation:
        # Flatten the first two dimensions of the output
        # [ BATCHSIZE, SEQLEN, self.hyper_params.arch.output_dim ]
        #     => [ BATCHSIZE x SEQLEN, self.hyper_params.arch.output_dim ]
        # then apply softmax readout layer. This way, the weights and biases are
        # shared across unrolled time steps. From the readout point of view, a
        # value coming from a sequence time step or a minibatch item is the same
        # thing.

        # Select last output.
        output = tf.transpose(Yr, [1, 0, 2])
        last = tf.gather(output, int(output.get_shape()[0]) - 1)
        outputs["logits"] = layers.linear(last,
                                          hyper_params.arch.output_dimension)
        outputs["probs"] = tf.nn.softmax(outputs["logits"], name="probs")
    return outputs
def __init__(self, params): """ :param params: dictionary with fields: "N_HIDDEN": number of hidden states "N_BINS": number of bins on input/ output "LEARNING_RATE": learning rate in optimizer """ self.params = params tf.reset_default_graph() self.session = tf.Session() self.inputs = tf.placeholder(tf.float32, (None, None, params['N_BINS'])) self.cell = tf.contrib.rnn.LSTMCell(params['N_HIDDEN'], state_is_tuple=True) self.batch_size = tf.shape(self.inputs)[1] self.h_init = tf.Variable(tf.zeros([1, params['N_HIDDEN']]), trainable=True) self.h_init_til = tf.tile(self.h_init, [self.batch_size, 1]) self.c_init = tf.Variable(tf.zeros([1, params['N_HIDDEN']]), trainable=True) self.c_init_til = tf.tile(self.c_init, [self.batch_size, 1]) self.initial_state = LSTMStateTuple(self.c_init_til, self.h_init_til) self.rnn_outputs, self.rnn_states = \ tf.nn.dynamic_rnn(self.cell, self.inputs, initial_state=self.initial_state, time_major=True) with tf.variable_scope("output"): self.intermediate_projection = \ lambda x: layers.fully_connected(x, num_outputs=params['N_HIDDEN']) self.final_projection = \ lambda x: layers.linear(x, num_outputs=params['N_BINS']) self.intermediate_features = tf.map_fn( self.intermediate_projection, self.rnn_outputs) self.final_features = tf.map_fn(self.final_projection, self.intermediate_features) self.predicted_outputs = layers.softmax(self.final_features) with tf.variable_scope("train"): self.outputs = \ tf.placeholder(tf.float32, (None, None, params['N_BINS'])) self.mask = tf.placeholder(tf.float32, (None, None, params['N_BINS'])) self.all_errors = losses.categorical_crossentropy( self.outputs * self.mask, self.predicted_outputs) self.error = tf.reduce_mean(self.all_errors) self.train_fn = \ tf.train.AdamOptimizer(learning_rate=params['LEARNING_RATE']) \ .minimize(self.error)
# RNN # [ BATCHSIZE, SEQLEN, ALPHASIZE ] input_x = tf.one_hot(inputs, VOCAB_SIZE) input_y = tf.one_hot(targets, VOCAB_SIZE) # creating RNN cell rnn_cell = tf.contrib.rnn.GRUCell(HIDDEN_SIZE) # run RNN rnn_outputs, final_state = tf.nn.dynamic_rnn(rnn_cell, input_x, initial_state=init_state, dtype=tf.float32) # add dense layer on top of the RNN outputs rnn_outputs_flat = tf.reshape(rnn_outputs, [-1, HIDDEN_SIZE]) dense_layer = layers.linear(rnn_outputs_flat, VOCAB_SIZE) labels = tf.reshape(input_y, [-1, VOCAB_SIZE]) output_softmax = tf.nn.softmax(dense_layer) # Loss loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=dense_layer, labels=labels)) # Minimizer minimizer = tf.train.AdagradOptimizer(learning_rate=LEARNING_RATE).minimize(loss) # Gradient clipping ''' # Here's where the magic happens! grads_and_vars = minimizer.compute_gradients(loss) grad_clipping = tf.constant(5.0, name="grad_clipping")
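# The gradient-clipping block above is commented out and truncated; a hedged
# sketch of what it appears to be building toward: clip each gradient to
# [-grad_clipping, grad_clipping] before applying the Adagrad update.
import tensorflow as tf

optimizer = tf.train.AdagradOptimizer(learning_rate=LEARNING_RATE)
grads_and_vars = optimizer.compute_gradients(loss)
grad_clipping = tf.constant(5.0, name="grad_clipping")
clipped_grads_and_vars = [(tf.clip_by_value(g, -grad_clipping, grad_clipping), v)
                          for g, v in grads_and_vars if g is not None]
minimizer = optimizer.apply_gradients(clipped_grads_and_vars)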
def train(self): samples = tf.placeholder(tf.float32, shape=[ self.BATCH_SIZE, self.SEQUENCE_LENGTH, self.EMBEDDING_LENGTH ], name='Samples') # (batch, time, in) ground_truth = tf.placeholder(tf.int64, shape=[self.BATCH_SIZE], name='GroundTruth') probs = tf.placeholder( tf.float32, shape=[self.BATCH_SIZE, self.SEQUENCE_LENGTH, 1], name='Probs') mask = tf.placeholder(tf.float32, shape=[self.BATCH_SIZE, self.SEQUENCE_LENGTH, 1], name='padding_mask') cell, initial_state = create_model(model='skip_lstm', num_cells=[self.HIDDEN_UNITS], batch_size=self.BATCH_SIZE) rnn_outputs, rnn_states = tf.nn.dynamic_rnn( cell, samples, dtype=tf.float32, initial_state=initial_state) # Split the outputs of the RNN into the actual outputs and the state update gate rnn_outputs, updated_states = split_rnn_outputs( 'skip_lstm', rnn_outputs) # print(f"\nUpdated states are {updated_states}.\n") logits = layers.linear(inputs=rnn_outputs[:, -1, :], num_outputs=self.OUTPUT_SIZE) predictions = tf.argmax(logits, 1) # Compute cross-entropy loss printer_lab = tf.cond( tf.math.reduce_any( tf.logical_or( tf.equal(tf.zeros_like(ground_truth), ground_truth), tf.equal(tf.ones_like(ground_truth), ground_truth))), lambda: tf.no_op(), lambda: tf.print("Found a label out of range: ", [ground_truth])) with tf.control_dependencies([printer_lab]): cross_entropy_per_sample = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits, labels=ground_truth) # cross_entropy_per_sample = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=ground_truth) # max_ce = tf.math.maximum(cross_entropy_per_sample) # median_ce = tf.math.median(cross_entropy_per_sample) # printer_max = tf.Print(max_ce, [max_ce], "The maximum cross entropy is ") # printer_median = tf.Print(median_ce, [median_ce], "The median cross entropy is ") printer_Nan = tf.cond( tf.math.reduce_any(tf.math.is_nan(cross_entropy_per_sample)), lambda: tf.print("Found NaN in entropy loss", output_stream=sys.stderr), lambda: tf.no_op()) with tf.control_dependencies([printer_Nan]): cross_entropy = tf.reduce_mean( tf.boolean_mask(cross_entropy_per_sample, tf.is_finite(cross_entropy_per_sample))) # tf.where(tf.math.is_nan(cross_entropy_per_sample), # tf.ones(cross_entropy_per_sample.get_shape()), # cross_entropy_per_sample)) # Compute accuracy accuracy = tf.reduce_mean( tf.cast(tf.equal(predictions, ground_truth), tf.float32)) # updated_states = tf.boolean_mask(updated_states, mask) # Compute loss for each updated state budget_loss = compute_budget_loss('skip_lstm', cross_entropy, updated_states, self.COST_PER_SAMPLE, mask) # printer_Nan = tf.cond(tf.math.reduce_any(tf.math.is_nan(budget_loss)), # lambda: tf.print("Found NaN in budget loss"), lambda: tf.no_op()) # with tf.control_dependencies([printer_Nan]): # budget_loss = tf.where(tf.math.is_nan(budget_loss), # tf.ones(budget_loss.get_shape()), # budget_loss) # Compute loss for the amount of surprisal surprisal_loss = compute_surprisal_loss('skip_lstm', cross_entropy, updated_states, probs, self.SURPRISAL_COST, mask) # Avoid encouraging to not skip. 
# printer_Nan = tf.cond(tf.math.reduce_any(tf.math.is_nan(surprisal_loss)), # lambda: tf.print("Found NaN in surprisal loss"), lambda: tf.no_op()) # with tf.control_dependencies([printer_Nan]): # surprisal_loss = tf.where(tf.math.logical_or(tf.equal(surprisal_loss, tf.zeros_like(surprisal_loss)), # tf.math.is_nan(surprisal_loss)), tf.ones_like(surprisal_loss), # surprisal_loss) loss = cross_entropy + budget_loss + surprisal_loss loss = tf.reshape(loss, []) loss = tf.where(tf.is_nan(loss), tf.ones_like(loss), loss) # Optimizer opt, grads_and_vars = compute_gradients( loss, self.LEARNING_RATE, 1) # used to be 1 is for gradient clipping train_fn = opt.apply_gradients(grads_and_vars) sess = tf.Session() # log_dir = os.path.join(self.LOG_DIR, datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) # val_writer = tf.summary.FileWriter(log_dir + '/validation') # Initialize weights sess.run(tf.global_variables_initializer()) # Results train_loss_plt = np.zeros((self.NUM_EPOCHS)) loss_plt = np.zeros((self.NUM_EPOCHS, self.ITERATIONS_PER_EPOCH, 3)) val_acc_df = np.zeros((self.NUM_EPOCHS)) train_acc_df = np.zeros((self.NUM_EPOCHS)) test_acc_df = np.zeros((self.NUM_EPOCHS)) train_update_df = np.zeros((self.NUM_EPOCHS)) val_update_df = np.zeros((self.NUM_EPOCHS)) test_update_df = np.zeros((self.NUM_EPOCHS)) test_time_df = np.zeros((self.NUM_EPOCHS)) read_embs = np.zeros( (self.TEST_ITERS * self.BATCH_SIZE * self.SEQUENCE_LENGTH, self.EMBEDDING_LENGTH)) non_read_embs = np.zeros( (self.TEST_ITERS * self.BATCH_SIZE * self.SEQUENCE_LENGTH, self.EMBEDDING_LENGTH)) read_surps = np.ones( (self.TEST_ITERS * self.BATCH_SIZE * self.SEQUENCE_LENGTH)) non_read_surps = np.ones( (self.TEST_ITERS * self.BATCH_SIZE * self.SEQUENCE_LENGTH)) # FILE_NAME = f'hu{self.HIDDEN_UNITS}_bs{self.BATCH_SIZE}_lr{self.LEARNING_RATE}_b{self.COST_PER_SAMPLE}_s{self.SURPRISAL_COST}_t{self.TRIAL}' try: train_matrix, train_labels, train_probs, train_mask = self.input_fn( split='train') val_matrix, val_labels, val_probs, val_mask = self.input_fn( split='val') test_matrix, test_labels, test_probs, test_mask = self.input_fn( split='test') # train_loss_plt = np.empty((self.NUM_EPOCHS, self.ITERATIONS_PER_EPOCH) for epoch in range(self.NUM_EPOCHS): # Load the training dataset into the pipeline # sess.run(train_model_spec['iterator_init_op']) # sess.run(train_model_spec['samples']) start_time = time.time() train_accuracy, train_steps, train_loss = 0, 0, 0 for iteration in range(self.ITERATIONS_PER_EPOCH): # Perform SGD update # print(iteration, train_probs[iteration].shape) out = sess.run( [ train_fn, loss, accuracy, updated_states, cross_entropy, budget_loss, surprisal_loss ], feed_dict={ samples: train_matrix[iteration], ground_truth: train_labels[iteration], probs: train_probs[iteration], mask: train_mask[iteration] }) train_accuracy += out[2] train_loss += out[1] loss_plt[epoch][iteration] = out[ 4:] # entropy, budget, surprisal if out[3] is not None: train_steps += compute_used_samples( out[3] * train_mask[iteration]) else: train_steps += np.count_nonzero(train_mask[iteration]) duration = time.time() - start_time train_accuracy /= self.ITERATIONS_PER_EPOCH train_loss /= self.ITERATIONS_PER_EPOCH train_steps /= (np.count_nonzero(train_mask) / self.BATCH_SIZE) train_loss_plt[epoch] = train_loss train_acc_df[epoch] = train_accuracy train_update_df[epoch] = train_steps val_accuracy, val_loss, val_steps = 0, 0, 0 for iteration in range(self.VAL_ITERS): val_iter_accuracy, val_iter_loss, val_used_inputs = sess.run( [accuracy, loss, updated_states], 
feed_dict={ samples: val_matrix[iteration], ground_truth: val_labels[iteration], probs: val_probs[iteration], mask: val_mask[iteration] }) val_accuracy += val_iter_accuracy val_loss += val_iter_loss if val_used_inputs is not None: val_steps += compute_used_samples(val_used_inputs * val_mask[iteration]) else: val_steps += np.count_nonzero(val_mask[iteration]) val_accuracy /= self.VAL_ITERS val_loss /= self.VAL_ITERS val_steps /= (np.count_nonzero(val_mask) / self.BATCH_SIZE) val_acc_df[epoch] = val_accuracy val_update_df[epoch] = val_steps # val_writer.add_summary(scalar_summary('accuracy', val_accuracy), epoch) # val_writer.add_summary(scalar_summary('loss', val_loss), epoch) # val_writer.add_summary(scalar_summary('used_samples', val_steps / self.SEQUENCE_LENGTH), epoch) # val_writer.flush() # print("Epoch %d/%d, " # "duration: %.2f seconds, " # "train accuracy: %.2f%%, " # "train samples: %.2f (%.2f%%), " # "val accuracy: %.2f%%, " # "val samples: %.2f (%.2f%%)" % (epoch + 1, # self.NUM_EPOCHS, # duration, # 100. * train_accuracy, # train_steps, # 100. * train_steps / self.SEQUENCE_LENGTH, # 100. * val_accuracy, # val_steps, # 100. * val_steps / self.SEQUENCE_LENGTH)) # # print("Absolute losses: entropy: %.3f, budget: %.3f, surprisal: %.3f." % (loss_abs[0], loss_abs[1], loss_abs[2])) # # print("Percentage losses: entropy: %.2f%%, budget: %.2f%%, surprisal: %.2f%%.\n" % (loss_perc[0], loss_perc[1], loss_perc[2])) loss_abs = loss_plt[epoch].mean(axis=0) loss_perc = np.divide(loss_abs, (loss_abs.sum())) * 100 self.logger.info("\nEpoch %d/%d, " "duration: %.2f seconds, " "train accuracy: %.2f%%, " "train samples: %.2f%%, " "val accuracy: %.2f%%, " "val samples: %.2f%%" % (epoch + 1, self.NUM_EPOCHS, duration, 100. * train_accuracy, 100. * train_steps, 100. * val_accuracy, 100. * val_steps)) self.logger.info( "Absolute losses: entropy: %.3f, budget: %.3f, surprisal: %.3f." % (loss_abs[0], loss_abs[1], loss_abs[2])) self.logger.info( "Percentage losses: entropy: %.2f%%, budget: %.2f%%, surprisal: %.2f%%." % (loss_perc[0], loss_perc[1], loss_perc[2])) print( f"entropy: {loss_plt[epoch, :, 0].mean()}, budget: {loss_plt[epoch, :, 1].mean()}, surprisal: {loss_plt[epoch, :, 2].mean()}." 
) analysis_update = val_accuracy + 1e-4 > val_acc_df.max() if analysis_update: self.logger.info("Updating Analysis") read_embs = np.zeros( (self.TEST_ITERS * self.BATCH_SIZE * self.SEQUENCE_LENGTH, self.EMBEDDING_LENGTH)) non_read_embs = np.zeros( (self.TEST_ITERS * self.BATCH_SIZE * self.SEQUENCE_LENGTH, self.EMBEDDING_LENGTH)) read_surps = np.full((self.TEST_ITERS * self.BATCH_SIZE * self.SEQUENCE_LENGTH), -1) non_read_surps = np.full( (self.TEST_ITERS * self.BATCH_SIZE * self.SEQUENCE_LENGTH), -1) test_accuracy, test_loss, test_steps, t = 0, 0, 0, 0 for iteration in range(self.TEST_ITERS): t0 = time.time() test_iter_accuracy, test_iter_loss, test_used_inputs = sess.run( [accuracy, loss, updated_states], feed_dict={ samples: test_matrix[iteration], ground_truth: test_labels[iteration], probs: test_probs[iteration], mask: test_mask[iteration] }) t += time.time() - t0 test_accuracy += test_iter_accuracy test_loss += test_iter_loss if test_used_inputs is not None: test_steps += compute_used_samples( test_used_inputs * test_mask[iteration]) if analysis_update: try: re, nre, rs, nrs = stats_used_samples( test_used_inputs, test_matrix[iteration], test_probs[iteration], test_mask[iteration]) if len(re) > 0: read_embs[ self.BATCH_SIZE * iteration * self.SEQUENCE_LENGTH:self.BATCH_SIZE * iteration * self.SEQUENCE_LENGTH + len(re)] = re if len(nre) > 0: non_read_embs[ self.BATCH_SIZE * iteration * self.SEQUENCE_LENGTH:self.BATCH_SIZE * iteration * self.SEQUENCE_LENGTH + len(nre)] = nre if len(rs) > 0: read_surps[ self.BATCH_SIZE * iteration * self.SEQUENCE_LENGTH:self.BATCH_SIZE * iteration * self.SEQUENCE_LENGTH + len(rs.flatten())] = rs.flatten( ) # take out flatten but should not be the problem if len(nrs) > 0: non_read_surps[ self.BATCH_SIZE * iteration * self.SEQUENCE_LENGTH:self.BATCH_SIZE * iteration * self.SEQUENCE_LENGTH + len(nrs.flatten())] = nrs.flatten() except Exception as e: self.logger.info("Could not update analysis") self.logger.error(e) pass else: test_steps += np.count_nonzero(test_mask[iteration]) test_accuracy /= self.TEST_ITERS test_loss /= self.TEST_ITERS test_steps /= (np.count_nonzero(test_mask) / self.BATCH_SIZE) test_time_df[epoch] = t test_acc_df[epoch] = test_accuracy test_update_df[epoch] = test_steps self.logger.info("Test time: %.2f seconds, " "test accuracy: %.2f%%, " "test samples: %.2f%%.\n" % (test_time_df[epoch], 100. * test_accuracy, 100. 
* test_steps)) if self.EARLY_STOPPING and epoch > 15: if epoch == 16: best_accuracy = val_acc_df.max() best_idx = val_acc_df.argmax() if best_accuracy < val_acc_df[epoch] + 1e-4: best_accuracy = val_acc_df[epoch] best_idx = epoch elif best_idx + 15 < epoch: val_update_df = val_update_df[:epoch] val_acc_df = val_acc_df[:epoch] train_acc_df = train_acc_df[:epoch] train_update_df = train_update_df[:epoch] loss_plt = loss_plt[:epoch] test_acc_df = test_acc_df[:epoch] test_update_df = test_update_df[:epoch] test_time_df = test_time_df[:epoch] self.logger.info( "Training was interrupted with early stopping") break except KeyboardInterrupt: self.logger.info("Training was interrupted manually") pass try: df_dict = self.CONFIG_DICT df_dict['val_acc'] = val_acc_df df_dict['val_updates'] = val_update_df df_dict['train_acc'] = train_acc_df df_dict['train_updates'] = train_update_df df_dict['test_acc'] = test_acc_df df_dict['test_updates'] = test_update_df df_dict['test_time'] = test_time_df loss_plt_mean = loss_plt.mean(axis=1).transpose() df_dict['entropy_loss'] = loss_plt_mean[0] df_dict['budget_loss'] = loss_plt_mean[1] df_dict['surprisal_loss'] = loss_plt_mean[2] df = pd.DataFrame(df_dict) df.drop(columns=['epochs', 'file_name'], inplace=True) csv_loc = '../csvs' if not os.path.exists(csv_loc): os.makedirs(csv_loc) df.to_csv(f"{csv_loc}/{self.FILE_NAME}.csv") except Exception as e: print(e) self.logger.info("Could not create csvs") pass ## Saving analysis statistics try: analysis_loc = '../analysis' if not os.path.exists(analysis_loc): os.makedirs(analysis_loc) print("Read words") read_words = get_words_from_embedding(self.EMBEDDING_DICT, read_embs) print("Skipped words") non_read_words = get_words_from_embedding(self.EMBEDDING_DICT, non_read_embs) pickle.dump(read_words, open(f"{analysis_loc}/{self.FILE_NAME}_read_vocab.pkl", 'wb'), protocol=0) pickle.dump( non_read_words, open(f"{analysis_loc}/{self.FILE_NAME}_non_read_vocab.pkl", 'wb'), protocol=0) read_surps = np.vstack(read_surps).flatten() non_read_surps = np.vstack(non_read_surps).flatten() np.save( open(f"{analysis_loc}/{self.FILE_NAME}_read_surprisals.npy", 'wb'), read_surps[read_surps >= 0]) np.save( open( f"{analysis_loc}/{self.FILE_NAME}_non_read_surprisals.npy", 'wb'), non_read_surps[non_read_surps >= 0]) except Exception as e: print(e) self.logger.info( "Something went wrong when reporting analysis results") pass sess.close() tf.reset_default_graph()
def __init__(self, sequence_length: int, batch_size: int,
             gru_internal_size: int, num_hidden_layers: int,
             stats_log_dir: str):
    """Construct `RNNTextModel` with specified hyperparameters.

    This model is a deep recurrent neural network that uses GRU cells for
    long-term memory. `sequence_length` is the length of each input sequence.
    Longer sequences mean the model has longer-term memory, but networks that
    use longer sequences (and thus, have more time steps) are harder/take
    longer to learn.

    `batch_size` is the number of sequences to put in each mini-batch. The
    network's weights are only adjusted after each mini-batch.

    `gru_internal_size` specifies the number of nodes inside each hidden GRU
    cell layer. `num_hidden_layers` specifies the number of hidden layers
    (number of GRU cell layers) to use in the deep RNN.

    The model's loss and graph will be periodically logged to the
    `stats_log_dir` directory.
    """
    # Mark time this text model was initially created
    self._timestamp = str(math.trunc(time.time()))

    # Store hyperparameters
    self._sequence_length = sequence_length
    self._batch_size = batch_size
    self._gru_internal_size = gru_internal_size
    self._num_hidden_layers = num_hidden_layers

    # ---------------------------------------------------------------------
    # Graph Inputs
    # ---------------------------------------------------------------------
    X = tf.placeholder(tf.uint8, [None, None], name='X')
    self._inputs = {
        # Hyperparameters.
        'learning_rate': tf.placeholder(tf.float32, name='learning_rate'),
        'batch_size': tf.placeholder(tf.int32, name='batch_size'),
        # Dimensions: [ batch_size, sequence_length ]
        'X': X,
        # Dimensions: [ batch_size, sequence_length, ALPHABET_SIZE ]
        'Xo': tf.one_hot(X, ALPHABET_SIZE, 1.0, 0.0),
        # Input cell state.
        # Dimensions: [batch_size, gru_internal_size * num_hidden_layers]
        'H_in': tf.placeholder(
            tf.float32,
            [None, self._gru_internal_size * self._num_hidden_layers],
            name='Hin')
    }

    # Define expected RNN outputs. This is used for training.
    # This is the same sequence as the input sequence, but shifted by 1
    # since we are trying to predict the next character.
    Y_exp = tf.placeholder(tf.uint8, [None, None], name='Y_exp')
    self._expected_outputs = {
        # Dimensions: [ batch_size, sequence_length ]
        'Y': Y_exp,
        # Dimensions: [ batch_size, sequence_length, ALPHABET_SIZE ]
        'Yo': tf.one_hot(Y_exp, ALPHABET_SIZE, 1.0, 0.0)
    }

    # ---------------------------------------------------------------------
    # Hidden Layers
    # ---------------------------------------------------------------------
    # Define internal/hidden RNN layers. The RNN is composed of a certain
    # number of hidden layers, where each node is a `GRUCell` that uses
    # `gru_internal_size` as the internal state size of a single cell. A
    # higher `gru_internal_size` means more complex state can be stored in
    # a single cell.
    self._cells = [
        rnn.GRUCell(self._gru_internal_size)
        for _ in range(self._num_hidden_layers)
    ]
    self._multicell = rnn.MultiRNNCell(self._cells, state_is_tuple=False)

    # Using `dynamic_rnn` means Tensorflow "performs fully dynamic
    # unrolling" of the network. This is faster than compiling the full
    # graph at initialisation time.
    #
    # Note that compiling the full graph at train time isn't that big of
    # an issue for training, because we only need to build the graph once.
    # It could be a big issue, however, if we need to build the graph
    # multiple times at test time. And remember, this training loop does
    # occasionally process inputs via test time, through the occasional
    # reports it outputs.
    #
    # Yr: [ batch_size, sequence_length, gru_internal_size ]
    # H:  [ batch_size, gru_internal_size * num_hidden_layers ]
    # H_out is the last state in the sequence.
    Yr, H_out = tf.nn.dynamic_rnn(self._multicell,
                                  self._inputs['Xo'],
                                  dtype=tf.float32,
                                  initial_state=self._inputs['H_in'])

    # ---------------------------------------------------------------------
    # Outputs
    # ---------------------------------------------------------------------
    # Softmax layer implementation:
    # Flatten the first two dimensions of the output. This performs the
    # following transformation:
    #
    # [ batch_size, sequence_length, ALPHABET_SIZE ]
    #     => [ batch_size x sequence_length, ALPHABET_SIZE ]
    Yflat = tf.reshape(Yr, [-1, self._gru_internal_size])

    # After this transformation, apply softmax readout layer. This way, the
    # weights and biases are shared across unrolled time steps. From the
    # readout point of view, a value coming from a cell or a minibatch is
    # the same thing.
    Ylogits = layers.linear(
        Yflat, ALPHABET_SIZE)  # [ batch_size x sequence_length, ALPHABET_SIZE ]
    Yflat_ = tf.reshape(  # [ batch_size x sequence_length, ALPHABET_SIZE ]
        self._expected_outputs['Yo'], [-1, ALPHABET_SIZE])
    Yo = tf.nn.softmax(
        Ylogits, name='Yo')  # [ batch_size x sequence_length, ALPHABET_SIZE ]
    Y = tf.argmax(Yo, 1)  # [ batch_size x sequence_length ]
    Y = tf.reshape(Y, [self._inputs['batch_size'], -1],
                   name='Y')  # [ batch_size, sequence_length ]

    # Store the output nodes in a dictionary for easy access later.
    self._outputs = {
        'Y': Y,
        # Output cell state after running a time step of the recurrent
        # network. We specify this just to give H_out an identifiable name.
        'H_out': tf.identity(H_out, name='H_out')
    }

    # Compute the loss (error) of the network.
    self._loss = tf.nn.softmax_cross_entropy_with_logits(  # [ batch_size x sequence_length ]
        logits=Ylogits, labels=Yflat_)
    self._loss = tf.reshape(  # [ batch_size, sequence_length ]
        self._loss, [self._inputs['batch_size'], -1])

    # Used to adjust the weights at each training step, such that the
    # loss function is minimised.
    self._train_step = tf.train.AdamOptimizer().minimize(self._loss)

    # Stats not used to directly train the network, but are logged so they
    # can be viewed by the human user.
    self._sequence_loss = tf.reduce_mean(self._loss, 1)
    self._batch_loss = tf.reduce_mean(self._sequence_loss)
    self._batch_accuracy = tf.reduce_mean(
        tf.cast(
            tf.equal(self._expected_outputs['Y'],
                     tf.cast(self._outputs['Y'], tf.uint8)), tf.float32))

    self._initialise_tf_session()
    self._build_statistics(stats_log_dir)
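# `_initialise_tf_session` is not shown above; a minimal sketch of what it is
# assumed to do (create the session and initialise all graph variables):
import tensorflow as tf

def _initialise_tf_session(self):
    self._session = tf.Session()
    self._session.run(tf.global_variables_initializer())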
def __call__(self, content_words, content_len, date, target, target_len):
    embeddings, reg_loss = self._encoder(content_words, content_len, date)
    for i in range(self.num_blocks):
        with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
            embeddings = multihead_attention(queries=embeddings,
                                             keys=embeddings,
                                             values=embeddings,
                                             num_heads=self.num_heads,
                                             dropout_rate=self.dropout_rate,
                                             training=self.training,
                                             causality=False)
            # feed forward
            embeddings = ff(embeddings,
                            num_units=[self.feed_forward_in_dim,
                                       self.model_dim])  # shape (?, 300, 512)
    outputs = tf.reduce_max(embeddings, axis=1)
    output_feature = outputs
    if self.training and self.dropout_rate > 0:
        print("In training mode, use dropout")
        outputs = tf.nn.dropout(outputs, keep_prob=1 - self.dropout_rate)
    with tf.variable_scope("MlpLayer") as hidden_layer_scope:
        outputs = layers.fully_connected(outputs,
                                         num_outputs=self.model_dim,
                                         activation_fn=tf.nn.tanh,
                                         scope=hidden_layer_scope,
                                         reuse=tf.AUTO_REUSE)
    outputs = layers.linear(outputs,
                            self.num_vocabulary,
                            scope="Logit_layer",
                            reuse=tf.AUTO_REUSE)
    loss = None
    training_list = []
    if target is not None:
        non_zero_indices = tf.where(tf.not_equal(target, 0))
        col_indices = tf.cast(tf.gather_nd(target, non_zero_indices), tf.int64)
        expanded_target = to_dense(
            SparseTensor(
                indices=tf.concat([
                    tf.reshape(non_zero_indices[:, 0], [-1, 1]),
                    tf.reshape(col_indices, [-1, 1]),
                ], axis=1),
                values=tf.ones([tf.shape(non_zero_indices)[0]],
                               dtype=tf.float32),
                dense_shape=[self._batch_size, self.num_vocabulary]))
        target_dist = expanded_target / tf.cast(
            tf.reshape(target_len, [-1, 1]), tf.float32)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=outputs, labels=tf.stop_gradient(target_dist)) +
            reg_loss)
    # set different lr
    print("**** learning_rate ****")
    tvars = tf.trainable_variables()
    for var in tvars:
        if not var.op.name.startswith("bert"):
            training_list.append(var)
    return Namespace(logit=outputs,
                     feature=output_feature,
                     loss=loss,
                     training_list=training_list)
def output_fn(x): return layers.linear(x, num_decoder_symbols, scope=scope)
def train(): samples = tf.placeholder(tf.float32, [None, None, INPUT_SIZE]) # (batch, time, in) ground_truth = tf.placeholder(tf.int64, [None]) # (batch, out) cell, initial_state = create_model(model=FLAGS.model, num_cells=[FLAGS.rnn_cells] * FLAGS.rnn_layers, batch_size=FLAGS.batch_size) rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, samples, dtype=tf.float32, initial_state=initial_state) # Split the outputs of the RNN into the actual outputs and the state update gate rnn_outputs, updated_states = split_rnn_outputs(FLAGS.model, rnn_outputs) out = layers.linear(inputs=rnn_outputs[:, -1, :], num_outputs=OUTPUT_SIZE) # Compute cross-entropy loss cross_entropy_per_sample = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=out, labels=ground_truth) cross_entropy = tf.reduce_mean(cross_entropy_per_sample) # Compute accuracy accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(out, 1), ground_truth), tf.float32)) # Compute loss for each updated state budget_loss = compute_budget_loss(FLAGS.model, cross_entropy, updated_states, FLAGS.cost_per_sample) # Combine all losses loss = cross_entropy + budget_loss # Optimizer opt, grads_and_vars = compute_gradients(loss, FLAGS.learning_rate, FLAGS.grad_clip) train_fn = opt.apply_gradients(grads_and_vars) sess = tf.Session() log_dir = os.path.join(FLAGS.logdir, datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) valid_writer = tf.summary.FileWriter(log_dir + '/val') sess.run(tf.global_variables_initializer()) try: num_iters = 0 while True: # Generate new batch and perform SGD update x, y = generate_batch(FLAGS.batch_size, FLAGS.sampling_period, FLAGS.signal_duration, START_PERIOD, END_PERIOD, START_TARGET_PERIOD, END_TARGET_PERIOD) sess.run([train_fn], feed_dict={samples: x, ground_truth: y}) num_iters += 1 # Evaluate on validation data generated on the fly if num_iters % FLAGS.evaluate_every == 0: valid_accuracy, valid_steps = 0., 0. for _ in range(FLAGS.validation_batches): valid_x, valid_y = generate_batch(FLAGS.batch_size, FLAGS.sampling_period, FLAGS.signal_duration, START_PERIOD, END_PERIOD, START_TARGET_PERIOD, END_TARGET_PERIOD) valid_iter_accuracy, valid_used_inputs = sess.run( [accuracy, updated_states], feed_dict={ samples: valid_x, ground_truth: valid_y}) valid_accuracy += valid_iter_accuracy if valid_used_inputs is not None: valid_steps += compute_used_samples(valid_used_inputs) else: valid_steps += SEQUENCE_LENGTH valid_accuracy /= FLAGS.validation_batches valid_steps /= FLAGS.validation_batches valid_writer.add_summary(scalar_summary('accuracy', valid_accuracy), num_iters) valid_writer.add_summary(scalar_summary('used_samples', valid_steps / SEQUENCE_LENGTH), num_iters) valid_writer.flush() print("Iteration %d, " "validation accuracy: %.2f%%, " "validation samples: %.2f (%.2f%%)" % (num_iters, 100. * valid_accuracy, valid_steps, 100. * valid_steps / SEQUENCE_LENGTH)) except KeyboardInterrupt: pass
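# `compute_used_samples` runs on the numpy gate values returned by sess.run;
# a minimal sketch, assuming a binary [batch, time] array of update gates:
import numpy as np

def compute_used_samples(update_state):
    # Average number of non-skipped steps per sequence in the batch.
    return np.sum(update_state) / update_state.shape[0]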
def build_rnn(movies_cnt, cell_type='gru', user_aware=True, user_cnt=None, rating_aware=True, rnn_unit=300, user_embedding=300, movie_emb_dim=300, feed_previous=True, loss_weights=None, rating_with_user=False, batch_size=32): loss_weights = loss_weights or [10, 2] movie_idx_ph = tf.placeholder(tf.int32, [None, None]) _, maxlen = tf.unstack(tf.shape(movie_idx_ph)) if_training = tf.placeholder_with_default(True, []) cell = cells[cell_type](num_units=rnn_unit) movie_embeddings = build_embedding(movies_cnt, movie_emb_dim, 'movie_embedding') if user_aware and user_cnt is None: raise ValueError if user_aware: user_idx_ph = tf.placeholder(tf.int32, [None]) if cell_type == 'lstm': c_user_embedding = build_embedding(user_cnt, user_embedding, name='user_c_embedding') h_user_embedding = build_embedding(user_cnt, user_embedding, name='user_h_embedding') state = LSTMStateTuple( c=tf.nn.embedding_lookup(c_user_embedding, user_idx_ph), h=tf.nn.embedding_lookup(h_user_embedding, user_idx_ph)) elif cell_type == 'gru': user_embedding = build_embedding(user_cnt, user_embedding, name='user_embedding') state = tf.nn.embedding_lookup(user_embedding, user_idx_ph) else: state = cell.zero_state(batch_size=batch_size, dtype=tf.float32) def _choose_best(vec, reuse=False): with tf.variable_scope(name_or_scope='chooser', reuse=reuse) as scope: w = tf.get_variable(name='weights', shape=[movie_emb_dim, movies_cnt]) b = tf.get_variable(name='bias', shape=[movies_cnt]) return tf.matmul(vec, w) + b # not using dynamic_rnn since I want to feed previous output def walker(idx, input, outputs, state, fprev): output, state = cell(input, state) new_idx = tf.cond( fprev[idx], lambda: tf.cast(tf.argmax(_choose_best(output), 1), tf.int32), lambda: movie_idx_ph[:, idx + 1]) input = tf.nn.embedding_lookup(movie_embeddings, new_idx) return idx + 1, input, tf.concat( (outputs, tf.expand_dims(output, axis=1)), axis=1), state, fprev def cond(idx, input, outputs, state, fprev): return idx < maxlen - 1 idx = tf.Variable(0) input = tf.nn.embedding_lookup(movie_embeddings, movie_idx_ph[:, 0]) feed_prev = tf.placeholder(tf.bool, [None], name='feed_prev_ph') loop_vars = [ idx, input, tf.zeros((batch_size, 0, movie_emb_dim), dtype=tf.float32), state, feed_prev ] shape_invs = [ idx.get_shape(), input.get_shape(), tf.TensorShape((batch_size, None, movie_emb_dim)), state.get_shape(), feed_prev.get_shape() ] print(len(loop_vars), len(shape_invs)) idx, last_output, outputs, state, fp = tf.while_loop( cond, walker, loop_vars=loop_vars, shape_invariants=shape_invs) logits = tf.reshape(outputs, (-1, rnn_unit)) logits = _choose_best(logits, reuse=True) logits = tf.reshape(logits, (batch_size, -1, movies_cnt)) clf_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits, labels=movie_idx_ph[:, 1:]) def training_mask(): clf_mask = tf.greater(movie_idx_ph[:, 1:], tf.cast(0, dtype=tf.int32)) clf_mask = tf.cast(clf_mask, tf.float32) return clf_mask def val_mask(): clf_mask = tf.greater(movie_idx_ph[:, 1:], tf.cast(0, dtype=tf.int32)) clf_mask = tf.cast(clf_mask, tf.float32) clf_mask = tf.multiply(clf_mask, tf.cast(feed_prev[1:], tf.float32)) return clf_mask clf_mask = tf.cond(if_training, training_mask, val_mask) clf_loss *= clf_mask clf_loss = tf.reduce_sum(clf_loss) / tf.reduce_sum(clf_mask) total_loss = loss_weights[0] * clf_loss if rating_aware: true_ratings = tf.placeholder('float', [None, None]) ratings = linear(outputs, 1) ratings = tf.squeeze(ratings, axis=2) rat_loss = tf.square(ratings - true_ratings) mask = tf.greater(true_ratings, 
tf.cast(0, dtype=tf.float32)) mask = tf.cast(mask, tf.float32) rat_loss *= mask rat_loss = tf.reduce_sum(rat_loss) / tf.reduce_sum(mask) total_loss += loss_weights[1] * rat_loss bag = { 'base': [movie_idx_ph, total_loss, clf_loss], 'feed_prev': feed_prev, 'if_training': if_training, 'movie_embeddings': movie_embeddings } if user_aware: bag['user'] = user_idx_ph if rating_aware: bag['ratings'] = [true_ratings, rat_loss] return bag
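# `build_embedding` is assumed to create a trainable lookup table; a minimal
# sketch (the initializer range is an assumption):
import tensorflow as tf

def build_embedding(vocab_size, dim, name):
    return tf.get_variable(
        name, shape=[vocab_size, dim],
        initializer=tf.random_uniform_initializer(-0.05, 0.05))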
lr = tf.placeholder(tf.float32, name='lr')  # learning rate, used by the optimizer below
pkeep = tf.placeholder(tf.float32, name='pkeep')
batchsize = tf.placeholder(tf.int32, name='batchsize')

# Inputs (tf.one_hot expects uint8/int32/int64 indices, hence uint8 here)
X = tf.placeholder(tf.uint8, [None, None], name='X')    # [ BATCHSIZE, SEQUENCE_LENGTH ]
Xo = tf.one_hot(X, ALPHABET_LENGTH, 1.0, 0.0)           # [ BATCHSIZE, SEQUENCE_LENGTH, ALPHABET_LENGTH ]
# Expected outputs
Y_ = tf.placeholder(tf.uint8, [None, None], name='Y_')  # [ BATCHSIZE, SEQUENCE_LENGTH ]
Yo_ = tf.one_hot(Y_, ALPHABET_LENGTH, 1.0, 0.0)         # [ BATCHSIZE, SEQUENCE_LENGTH, ALPHABET_LENGTH ]
# Initial internal cell state
Hin = tf.placeholder(tf.float32, [None, INTERNAL_SIZE * LAYERS], name='Hin')  # [ BATCHSIZE, INTERNAL_SIZE * LAYERS ]

# Deep stacked GRU cell
deep_drop_cell = tfu.rnn.MultiDropoutGRUCell(size=INTERNAL_SIZE,
                                             pkeep=DROPOUT_KEEP_RATE,
                                             layers=LAYERS)
# Output predictions and output state
Yr, H = tf.nn.dynamic_rnn(deep_drop_cell, Xo, dtype=tf.float32, initial_state=Hin)
H = tf.identity(H, name='H')

Yflat = tf.reshape(Yr, [-1, INTERNAL_SIZE])
Ylogits = layers.linear(Yflat, ALPHABET_LENGTH)
Yflat_ = tf.reshape(Yo_, [-1, ALPHABET_LENGTH])
loss = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Yflat_)
loss = tf.reshape(loss, [batchsize, -1])
Yo = tf.nn.softmax(Ylogits, name='Yo')
Y = tf.argmax(Yo, 1)
Y = tf.reshape(Y, [batchsize, -1], name='Y')

train_step = tf.train.AdamOptimizer(lr).minimize(loss)
batchsize = tf.placeholder(tf.int32, name='batchsize') lr = tf.placeholder(tf.float32, name='lr') pkeep = tf.placeholder(tf.float32, name='pkeep') X = tf.placeholder(tf.uint8, [None, None], name='X') # Input vector Xo = tf.one_hot(X, ALPHA_SIZE, 1.0, 0.0) # One Hots create vector size ALPHA_SIZE, all set 0 except character Y_ = tf.placeholder(tf.uint8, [None, None], name='Y_') # Output tensor Yo_ = tf.one_hot(Y_, ALPHA_SIZE, 1.0, 0.0) # OneHot our output also Hin = tf.placeholder(tf.float32, [None, NUM_OF_GRUS*NUM_LAYERS], name='Hin') # Recurrent input states cells = [rnn.GRUCell(NUM_OF_GRUS) for _ in range(NUM_LAYERS)] # Create all our GRU cells per layer dropcells = [rnn.DropoutWrapper(cell,input_keep_prob=pkeep) for cell in cells] # DropOut inside RNN multicell = rnn.MultiRNNCell(dropcells, state_is_tuple=False) multicell = rnn.DropoutWrapper(multicell, output_keep_prob=pkeep) # DropOut for SoftMax layer Yr, H = tf.nn.dynamic_rnn(multicell, Xo, dtype=tf.float32, initial_state=Hin) # Unrolling through time happens here H = tf.identity(H, name='H') # Last state of sequence Yflat = tf.reshape(Yr, [-1, NUM_OF_GRUS]) Ylogits = layers.linear(Yflat, ALPHA_SIZE) Yflat_ = tf.reshape(Yo_, [-1, ALPHA_SIZE]) loss = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Yflat_) loss = tf.reshape(loss, [batchsize, -1]) Yo = tf.nn.softmax(Ylogits, name='Yo') Y = tf.argmax(Yo, 1) Y = tf.reshape(Y, [batchsize, -1], name="Y") train_step = tf.train.AdamOptimizer(lr).minimize(loss) # Calculate Statistics for Analysis seqloss = tf.reduce_mean(loss, 1) batchloss = tf.reduce_mean(seqloss) accuracy = tf.reduce_mean(tf.cast(tf.equal(Y_, tf.cast(Y, tf.uint8)), tf.float32)) loss_summary = tf.summary.scalar("batch_loss", batchloss) acc_summary = tf.summary.scalar("batch_accuracy", accuracy) summaries = tf.summary.merge([loss_summary, acc_summary])
def main(_):
    # load data, either shakespeare, or the Python source of Tensorflow itself
    shakedir = FLAGS.text_dir
    # shakedir = "../tensorflow/**/*.py"
    codetext, valitext, bookranges = txt.read_data_files(shakedir, validation=True)

    # display some stats on the data
    epoch_size = len(codetext) // (FLAGS.train_batch_size * FLAGS.seqlen)
    txt.print_data_stats(len(codetext), len(valitext), epoch_size)

    #
    # the model (see FAQ in README.md)
    #
    lr = tf.placeholder(tf.float32, name='lr')  # learning rate
    pkeep = tf.placeholder(tf.float32, name='pkeep')  # dropout parameter
    batchsize = tf.placeholder(tf.int32, name='batchsize')

    # inputs
    X = tf.placeholder(tf.uint8, [None, None], name='X')  # [ BATCHSIZE, FLAGS.seqlen ]
    Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)  # [ BATCHSIZE, FLAGS.seqlen, ALPHASIZE ]
    # expected outputs = same sequence shifted by 1 since we are trying to predict the next character
    Y_ = tf.placeholder(tf.uint8, [None, None], name='Y_')  # [ BATCHSIZE, FLAGS.seqlen ]
    Yo_ = tf.one_hot(Y_, ALPHASIZE, 1.0, 0.0)  # [ BATCHSIZE, FLAGS.seqlen, ALPHASIZE ]
    # input state
    Hin = tf.placeholder(tf.float32, [None, INTERNALSIZE * NLAYERS], name='Hin')  # [ BATCHSIZE, INTERNALSIZE * NLAYERS ]

    # using NLAYERS=3 layers of GRU cells, unrolled FLAGS.seqlen=30 times
    # dynamic_rnn infers FLAGS.seqlen from the size of the inputs Xo
    onecell = rnn.GRUCell(INTERNALSIZE)
    dropcell = rnn.DropoutWrapper(onecell, input_keep_prob=pkeep)
    multicell = rnn.MultiRNNCell([dropcell] * NLAYERS, state_is_tuple=False)
    multicell = rnn.DropoutWrapper(multicell, output_keep_prob=pkeep)
    Yr, H = tf.nn.dynamic_rnn(multicell, Xo, dtype=tf.float32, initial_state=Hin)
    # Yr: [ BATCHSIZE, FLAGS.seqlen, INTERNALSIZE ]
    # H:  [ BATCHSIZE, INTERNALSIZE*NLAYERS ] # this is the last state in the sequence
    H = tf.identity(H, name='H')  # just to give it a name

    # Softmax layer implementation:
    # Flatten the first two dimensions of the output
    # [ BATCHSIZE, FLAGS.seqlen, ALPHASIZE ] => [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    # then apply softmax readout layer. This way, the weights and biases are shared
    # across unrolled time steps. From the readout point of view, a value coming
    # from a cell or a minibatch is the same thing.
    Yflat = tf.reshape(Yr, [-1, INTERNALSIZE])  # [ BATCHSIZE x FLAGS.seqlen, INTERNALSIZE ]
    Ylogits = layers.linear(Yflat, ALPHASIZE)  # [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    Yflat_ = tf.reshape(Yo_, [-1, ALPHASIZE])  # [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Yflat_)  # [ BATCHSIZE x FLAGS.seqlen ]
    loss = tf.reshape(loss, [batchsize, -1])  # [ BATCHSIZE, FLAGS.seqlen ]
    Yo = tf.nn.softmax(Ylogits, name='Yo')  # [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    Y = tf.argmax(Yo, 1)  # [ BATCHSIZE x FLAGS.seqlen ]
    Y = tf.reshape(Y, [batchsize, -1], name="Y")  # [ BATCHSIZE, FLAGS.seqlen ]
    train_step = tf.train.AdamOptimizer(lr).minimize(loss)

    # stats for display
    seqloss = tf.reduce_mean(loss, 1)
    batchloss = tf.reduce_mean(seqloss)
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(Y_, tf.cast(Y, tf.uint8)), tf.float32))
    loss_summary = tf.summary.scalar("batch_loss", batchloss)
    acc_summary = tf.summary.scalar("batch_accuracy", accuracy)
    summaries = tf.summary.merge([loss_summary, acc_summary])

    # Init Tensorboard stuff. This will save Tensorboard information into a different
    # folder at each run named 'log/<timestamp>/'. Two sets of data are saved so that
    # you can compare training and validation curves visually in Tensorboard.
timestamp = str(math.trunc(time.time())) summary_writer = tf.summary.FileWriter( os.path.join(FLAGS.summaries_dir, timestamp + "-training")) validation_writer = tf.summary.FileWriter( os.path.join(FLAGS.summaries_dir, timestamp + "-validation")) # Init for saving models. They will be saved into a directory named 'checkpoints'. # Only the last checkpoint is kept. if not os.path.exists(FLAGS.checkpoint_dir): os.mkdir(FLAGS.checkpoint_dir) saver = tf.train.Saver(max_to_keep=1) # for display: init the progress bar DISPLAY_FREQ = 50 _50_BATCHES = DISPLAY_FREQ * FLAGS.train_batch_size * FLAGS.seqlen progress = txt.Progress(DISPLAY_FREQ, size=111 + 2, msg="Training on next " + str(DISPLAY_FREQ) + " batches") # init istate = np.zeros([FLAGS.train_batch_size, INTERNALSIZE * NLAYERS]) # initial zero input state init = tf.global_variables_initializer() sess = tf.Session() sess.run(init) step = 0 # training loop for x, y_, epoch in txt.rnn_minibatch_sequencer(codetext, FLAGS.train_batch_size, FLAGS.seqlen, nb_epochs=1000): # train on one minibatch feed_dict = { X: x, Y_: y_, Hin: istate, lr: FLAGS.learning_rate, pkeep: FLAGS.dropout_pkeep, batchsize: FLAGS.train_batch_size } _, y, ostate, smm = sess.run([train_step, Y, H, summaries], feed_dict=feed_dict) # save training data for Tensorboard summary_writer.add_summary(smm, step) # display a visual validation of progress (every 50 batches) if step % _50_BATCHES == 0: feed_dict = { X: x, Y_: y_, Hin: istate, pkeep: 1.0, batchsize: FLAGS.train_batch_size } # no dropout for validation y, l, bl, acc = sess.run([Y, seqloss, batchloss, accuracy], feed_dict=feed_dict) txt.print_learning_learned_comparison(x, y, l, bookranges, bl, acc, epoch_size, step, epoch) # run a validation step every 50 batches # The validation text should be a single sequence but that's too slow (1s per 1024 chars!), # so we cut it up and batch the pieces (slightly inaccurate) # tested: validating with 5K sequences instead of 1K is only slightly more accurate, but a lot slower. if step % _50_BATCHES == 0 and len(valitext) > 0: VALI_SEQLEN = 1 * 1024 # Sequence length for validation. State will be wrong at the start of each sequence. 
bsize = len(valitext) // VALI_SEQLEN txt.print_validation_header(len(codetext), bookranges) vali_x, vali_y, _ = next( txt.rnn_minibatch_sequencer(valitext, bsize, VALI_SEQLEN, 1)) # all data in 1 batch vali_nullstate = np.zeros([bsize, INTERNALSIZE * NLAYERS]) feed_dict = { X: vali_x, Y_: vali_y, Hin: vali_nullstate, pkeep: 1.0, # no dropout for validation batchsize: bsize } ls, acc, smm = sess.run([batchloss, accuracy, summaries], feed_dict=feed_dict) txt.print_validation_stats(ls, acc) # save validation data for Tensorboard validation_writer.add_summary(smm, step) # display a short text generated with the current weights and biases (every 150 batches) if step // 3 % _50_BATCHES == 0: txt.print_text_generation_header() ry = np.array([[txt.convert_from_alphabet(ord("K"))]]) rh = np.zeros([1, INTERNALSIZE * NLAYERS]) for k in range(1000): ryo, rh = sess.run([Yo, H], feed_dict={ X: ry, pkeep: 1.0, Hin: rh, batchsize: 1 }) rc = txt.sample_from_probabilities( ryo, topn=10 if epoch <= 1 else 2) print(chr(txt.convert_to_alphabet(rc)), end="") ry = np.array([[rc]]) txt.print_text_generation_footer() # save a checkpoint (every 500 batches) if step // 10 % _50_BATCHES == 0: saver.save(sess, FLAGS.checkpoint_dir + '/rnn_train_' + timestamp, global_step=step) # display progress bar progress.step(reset=step % _50_BATCHES == 0) # loop state around istate = ostate step += FLAGS.train_batch_size * FLAGS.seqlen
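# `txt.sample_from_probabilities` is assumed to sample a character index from
# the top-n entries of the softmax output; a minimal numpy sketch:
import numpy as np

def sample_from_probabilities(probabilities, topn):
    p = np.squeeze(probabilities)
    p[np.argsort(p)[:-topn]] = 0  # zero out all but the top-n probabilities
    p = p / np.sum(p)             # renormalise
    return np.random.choice(len(p), p=p)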
def _inference(self):
    logging.info('...create inference')
    #fw_state_list = tf.unstack(self.fw_state, axis=0)
    #fw_state_tuple = tuple(
    #    [tf.contrib.rnn.LSTMStateTuple(fw_state_list[idx][0], fw_state_list[idx][1])
    #     for idx in range(self.num_layers)])
    #bw_state_list = tf.unstack(self.bw_state, axis=0)
    #bw_state_tuple = tuple(
    #    [tf.contrib.rnn.LSTMStateTuple(bw_state_list[idx][0], bw_state_list[idx][1])
    #     for idx in range(self.num_layers)])

    fw_cells = list()
    for i in range(0, self.num_layers):
        if self.cell_type == 'lstm':
            cell = rnn.LSTMCell(num_units=self.cell_sizes[i], state_is_tuple=True)
        elif self.cell_type == 'gru':
            cell = rnn.GRUCell(num_units=self.cell_sizes[i])
        else:
            cell = rnn.BasicRNNCell(num_units=self.cell_sizes[i])
        cell = rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)
        fw_cells.append(cell)
    self.fw_cells = rnn.MultiRNNCell(fw_cells, state_is_tuple=True)

    if self.direction == 2:  # bidirectional
        print('bidirectional')
        bw_cells = list()
        for i in range(0, self.num_layers):
            if self.cell_type == 'lstm':
                cell = rnn.LSTMCell(num_units=self.cell_sizes[i], state_is_tuple=True)
            elif self.cell_type == 'gru':
                cell = rnn.GRUCell(num_units=self.cell_sizes[i])
            else:
                cell = rnn.BasicRNNCell(num_units=self.cell_sizes[i])
            cell = rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)
            bw_cells.append(cell)
        self.bw_cells = rnn.MultiRNNCell(bw_cells, state_is_tuple=True)

    if self.direction == 1:
        rnn_outputs, states = tf.nn.dynamic_rnn(
            self.fw_cells,
            self.inputs,
            #initial_state=fw_state_tuple,
            sequence_length=self.seq_lengths,
            dtype=tf.float32,
            time_major=True)
    else:  # self.direction == 2: bidirectional rnn
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=self.fw_cells,
            cell_bw=self.bw_cells,
            #initial_state_fw=fw_state_tuple,
            #initial_state_bw=bw_state_tuple,
            dtype=tf.float32,
            sequence_length=self.seq_lengths,
            inputs=self.inputs,
            time_major=True)
        rnn_outputs = tf.concat(outputs, axis=2)

    # Project output from rnn output size to the number of label classes.
    # Sometimes it is worth adding an extra layer here.
    self.projection = lambda x: layers.linear(
        x, num_outputs=self.label_classes, activation_fn=tf.nn.sigmoid)
    self.logits = tf.map_fn(self.projection, rnn_outputs, name="logits")
    self.probs = tf.nn.softmax(self.logits, name="probs")
    self.states = states
    tf.add_to_collection('probs', self.probs)
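# Because the readout above is time-independent, the tf.map_fn projection can
# equivalently be written as one linear layer over the flattened time x batch
# axis, which avoids per-step dispatch. A sketch, assuming the same
# `rnn_outputs` and `self` context as in _inference:
num_units = rnn_outputs.get_shape()[-1].value
flat = tf.reshape(rnn_outputs, [-1, num_units])
flat_logits = layers.linear(flat, num_outputs=self.label_classes,
                            activation_fn=tf.nn.sigmoid)
# Restore the (time, batch, classes) layout expected downstream.
logits = tf.reshape(flat_logits,
                    [tf.shape(rnn_outputs)[0], -1, self.label_classes])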