import numpy as np
import tensorflow as tf

# YFOptimizer, tune_everything, and the constants n_dim / n_iter are used
# below but are assumed to be provided by the surrounding test module;
# their definitions are not part of this excerpt.


def test_measurement():
  opt = YFOptimizer(zero_debias=False)
  w = tf.Variable(np.ones([n_dim, ]), dtype=tf.float32, name="w", trainable=True)
  b = tf.Variable(np.ones([1, ], dtype=np.float32), dtype=tf.float32,
                  name="b", trainable=True)
  x = tf.constant(np.ones([n_dim, ], dtype=np.float32), dtype=tf.float32)
  loss = tf.multiply(w, x) + b
  tvars = tf.trainable_variables()

  # Gradients are fed manually through placeholders so the measured
  # statistics can be checked against closed-form reference values.
  w_grad_val = tf.placeholder(tf.float32, shape=(n_dim, ))
  b_grad_val = tf.placeholder(tf.float32, shape=(1, ))
  apply_op = opt.apply_gradients(list(zip([w_grad_val, b_grad_val], tvars)))

  init_op = tf.global_variables_initializer()
  with tf.Session() as sess:
    sess.run(init_op)
    target_h_max = 0.0
    target_h_min = 0.0
    g_norm_squared_avg = 0.0
    g_norm_avg = 0.0
    g_avg = 0.0
    target_dist = 0.0
    for i in range(n_iter):
      feed_dict = {
          w_grad_val: (i + 1) * np.ones([n_dim, ], dtype=np.float32),
          b_grad_val: (i + 1) * np.ones([1, ], dtype=np.float32)
      }
      res = sess.run([opt._curv_win, opt._h_max, opt._h_min, opt._grad_var,
                      opt._dist_to_opt_avg, apply_op], feed_dict=feed_dict)

      # Reference values: exponential moving averages (0.999 decay) of the
      # synthetic gradient statistics, compared against the optimizer's
      # measured quantities below.
      g_norm_squared_avg = 0.999 * g_norm_squared_avg \
          + 0.001 * np.sum(((i + 1) * np.ones([n_dim + 1, ]))**2)
      g_norm_avg = 0.999 * g_norm_avg \
          + 0.001 * np.linalg.norm((i + 1) * np.ones([n_dim + 1, ]))
      g_avg = 0.999 * g_avg + 0.001 * (i + 1)

      target_h_max = 0.999 * target_h_max + 0.001 * (i + 1)**2 * (n_dim + 1)
      target_h_min = 0.999 * target_h_min \
          + 0.001 * max(1, i + 2 - 20)**2 * (n_dim + 1)
      target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
      target_dist = 0.999 * target_dist + 0.001 * g_norm_avg / g_norm_squared_avg

      # print("iter ", i, " h max ", res[1], target_h_max,
      #       " h min ", res[2], target_h_min,
      #       " var ", res[3], target_var, " dist ", res[4], target_dist)

      assert np.abs(target_h_max - res[1]) < np.abs(target_h_max) * 1e-3
      assert np.abs(target_h_min - res[2]) < np.abs(target_h_min) * 1e-3
      assert np.abs(target_var - res[3]) < np.abs(res[3]) * 1e-3
      assert np.abs(target_dist - res[4]) < np.abs(res[4]) * 1e-3
  print("sync measurement test passed!")
def test_lr_mu():
  opt = YFOptimizer(zero_debias=False)
  w = tf.Variable(np.ones([n_dim, ]), dtype=tf.float32, name="w", trainable=True)
  b = tf.Variable(np.ones([1, ], dtype=np.float32), dtype=tf.float32,
                  name="b", trainable=True)
  x = tf.constant(np.ones([n_dim, ], dtype=np.float32), dtype=tf.float32)
  loss = tf.multiply(w, x) + b
  tvars = tf.trainable_variables()

  # Here the fake gradients are non-trainable variables that are reassigned
  # every iteration, rather than placeholders.
  w_grad_val = tf.Variable(np.zeros([n_dim, ]), dtype=tf.float32, trainable=False)
  b_grad_val = tf.Variable(np.zeros([1, ]), dtype=tf.float32, trainable=False)
  apply_op = opt.apply_gradients(list(zip([w_grad_val, b_grad_val], tvars)))

  init_op = tf.global_variables_initializer()
  with tf.Session() as sess:
    sess.run(init_op)
    target_h_max = 0.0
    target_h_min = 0.0
    g_norm_squared_avg = 0.0
    g_norm_avg = 0.0
    g_avg = 0.0
    target_dist = 0.0
    target_lr = 0.1
    target_mu = 0.0
    for i in range(n_iter):
      sess.run(tf.assign(w_grad_val,
                         (i + 1) * np.ones([n_dim, ], dtype=np.float32)))
      sess.run(tf.assign(b_grad_val,
                         (i + 1) * np.ones([1, ], dtype=np.float32)))
      res = sess.run([opt._curv_win, opt._h_max, opt._h_min, opt._grad_var,
                      opt._dist_to_opt_avg, opt._lr_var, opt._mu_var, apply_op])
      # Re-read lr and mu after apply_op has run so the asserts compare the
      # post-update values rather than whatever the fetch happened to see.
      res[5] = opt._lr_var.eval()
      res[6] = opt._mu_var.eval()

      g_norm_squared_avg = 0.999 * g_norm_squared_avg \
          + 0.001 * np.sum(((i + 1) * np.ones([n_dim + 1, ]))**2)
      g_norm_avg = 0.999 * g_norm_avg \
          + 0.001 * np.linalg.norm((i + 1) * np.ones([n_dim + 1, ]))
      g_avg = 0.999 * g_avg + 0.001 * (i + 1)

      target_h_max = 0.999 * target_h_max + 0.001 * (i + 1)**2 * (n_dim + 1)
      target_h_min = 0.999 * target_h_min \
          + 0.001 * max(1, i + 2 - 20)**2 * (n_dim + 1)
      target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
      target_dist = 0.999 * target_dist + 0.001 * g_norm_avg / g_norm_squared_avg

      if i > 0:
        lr, mu = tune_everything(target_dist**2, target_var, 1,
                                 target_h_min, target_h_max)
        target_lr = 0.999 * target_lr + 0.001 * lr
        target_mu = 0.999 * target_mu + 0.001 * mu

      # print("iter ", i, " h max ", res[1], target_h_max,
      #       " h min ", res[2], target_h_min,
      #       " var ", res[3], target_var, " dist ", res[4], target_dist)
      # print("iter ", i, " lr ", res[5], target_lr, " mu ", res[6], target_mu)

      assert np.abs(target_h_max - res[1]) < np.abs(target_h_max) * 1e-3
      assert np.abs(target_h_min - res[2]) < np.abs(target_h_min) * 1e-3
      assert np.abs(target_var - res[3]) < np.abs(res[3]) * 1e-3
      assert np.abs(target_dist - res[4]) < np.abs(res[4]) * 1e-3
      assert target_lr == 0.0 or np.abs(target_lr - res[5]) < np.abs(res[5]) * 1e-3
      assert target_mu == 0.0 or np.abs(target_mu - res[6]) < np.abs(res[6]) * 5e-3
  print("lr and mu computing test passed!")
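# A minimal driver for the two tests above. This block is a sketch rather
# than part of the original script: it assumes n_dim and n_iter are set at
# module level (see the note at the top), and it resets the default graph
# between the tests so the second call to tf.trainable_variables() does not
# also pick up the "w" and "b" variables built by the first test.
if __name__ == "__main__":
  test_measurement()
  tf.reset_default_graph()
  test_lr_mu()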
def __init__(self, is_training, config, input_, opt_method='sgd'):
  self._input = input_

  batch_size = input_.batch_size
  num_steps = input_.num_steps
  size = config.hidden_size
  vocab_size = config.vocab_size

  # Slightly better results can be obtained with forget gate biases
  # initialized to 1, but the hyperparameters of the model would need to be
  # different than reported in the paper.
  def lstm_cell():
    # With the latest TensorFlow source code (as of Mar 27, 2017),
    # BasicLSTMCell needs a reuse parameter, which is unfortunately not
    # defined in TensorFlow 1.0. To maintain backwards compatibility, we add
    # an argument check here:
    if 'reuse' in inspect.getargspec(
        tf.contrib.rnn.BasicLSTMCell.__init__).args:
      return tf.contrib.rnn.BasicLSTMCell(
          size, forget_bias=0.0, state_is_tuple=True,
          reuse=tf.get_variable_scope().reuse)
    else:
      return tf.contrib.rnn.BasicLSTMCell(
          size, forget_bias=0.0, state_is_tuple=True)

  attn_cell = lstm_cell
  if is_training and config.keep_prob < 1:
    def attn_cell():
      return tf.contrib.rnn.DropoutWrapper(
          lstm_cell(), output_keep_prob=config.keep_prob)
  cell = tf.contrib.rnn.MultiRNNCell(
      [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True)

  self._initial_state = cell.zero_state(batch_size, data_type())

  with tf.device("cpu:0"):
    embedding = tf.get_variable(
        "embedding", [vocab_size, size], dtype=data_type())
    inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

  if is_training and config.keep_prob < 1:
    inputs = tf.nn.dropout(inputs, config.keep_prob)

  # Simplified version of tensorflow.models.rnn.rnn.py's rnn().
  # This builds an unrolled LSTM for tutorial purposes only.
  # In general, use the rnn() or state_saving_rnn() from rnn.py.
  #
  # The alternative version of the code below is:
  #
  # inputs = tf.unstack(inputs, num=num_steps, axis=1)
  # outputs, state = tf.nn.rnn(cell, inputs, initial_state=self._initial_state)
  outputs = []
  state = self._initial_state
  with tf.variable_scope("RNN"):
    for time_step in range(num_steps):
      if time_step > 0:
        tf.get_variable_scope().reuse_variables()
      (cell_output, state) = cell(inputs[:, time_step, :], state)
      outputs.append(cell_output)

  output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
  softmax_w = tf.get_variable(
      "softmax_w", [size, vocab_size], dtype=data_type())
  softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())
  logits = tf.matmul(output, softmax_w) + softmax_b
  loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
      [logits],
      [tf.reshape(input_.targets, [-1])],
      [tf.ones([batch_size * num_steps], dtype=data_type())])
  # Average the loss over both the batch and the unrolled time steps.
  # self._cost = cost = tf.reduce_sum(loss) / batch_size
  self._cost = cost = tf.reduce_sum(loss) / (batch_size * num_steps)
  self._final_state = state

  if not is_training:
    return

  self._lr = tf.Variable(0.0, trainable=False)
  self._mu = tf.Variable(0.0, trainable=False)
  self._grad_norm_thresh = tf.Variable(0.0, trainable=False)

  tvars = tf.trainable_variables()
  self.tvars = tvars
  self.grads = tf.gradients(cost, tvars)
  grads_clip, self.grad_norm = tf.clip_by_global_norm(
      self.grads, self._grad_norm_thresh)

  if opt_method == 'sgd':
    optimizer = tf.train.GradientDescentOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(
        list(zip(grads_clip, tvars)),
        global_step=tf.contrib.framework.get_or_create_global_step())
  elif opt_method == 'mom':
    print("using sgd mom")
    optimizer = tf.train.MomentumOptimizer(self._lr, self._mu)
    self._train_op = optimizer.apply_gradients(
        list(zip(grads_clip, tvars)),
        global_step=tf.contrib.framework.get_or_create_global_step())
  elif opt_method == 'adam':
    optimizer = tf.train.AdamOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(
        list(zip(grads_clip, tvars)),
        global_step=tf.contrib.framework.get_or_create_global_step())
  elif opt_method == 'YF':
    # YellowFin receives the unclipped gradients and does not use self._lr
    # or self._mu; it starts from lr=1.0, mu=0.0 and tunes them itself.
    optimizer = YFOptimizer(lr=1.0, mu=0.0)
    self._train_op = optimizer.apply_gradients(list(zip(self.grads, tvars)))
  else:
    raise Exception("optimizer not supported")

  self._new_lr = tf.placeholder(
      tf.float32, shape=[], name="new_learning_rate")
  self._lr_update = tf.assign(self._lr, self._new_lr)
  self._new_mu = tf.placeholder(
      tf.float32, shape=[], name="new_momentum")
  self._mu_update = tf.assign(self._mu, self._new_mu)
  self._new_grad_norm_thresh = tf.placeholder(
      tf.float32, shape=[], name="new_grad_norm_thresh")
  self._grad_norm_thresh_update = tf.assign(
      self._grad_norm_thresh, self._new_grad_norm_thresh)
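# The placeholders and assign ops built at the end of __init__ are meant to
# be driven from the training loop. Below is a minimal sketch of the
# corresponding setters, in the style of the PTB tutorial's
# PTBModel.assign_lr; the method names are illustrative assumptions and may
# differ from the actual repo.
def assign_lr(self, session, lr_value):
  # Feed the new learning rate into the assign op created in __init__.
  session.run(self._lr_update, feed_dict={self._new_lr: lr_value})

def assign_mu(self, session, mu_value):
  # Only the 'mom' branch (tf.train.MomentumOptimizer) reads self._mu.
  session.run(self._mu_update, feed_dict={self._new_mu: mu_value})

def assign_grad_norm_thresh(self, session, thresh_value):
  # Threshold consumed by tf.clip_by_global_norm in __init__; the 'YF'
  # branch bypasses clipping, since it applies the unclipped self.grads.
  session.run(self._grad_norm_thresh_update,
              feed_dict={self._new_grad_norm_thresh: thresh_value})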