import numpy as np
import tensorflow as tf

# NOTE: these tests also assume the package's own utilities (load_iris, LinearModel,
# vectorize_model, hvp, cross_entropy_loss, the hyper-gradient classes, etc.) are
# importable; the exact import paths depend on the package layout.


def test_hv_with_builtin():
    """Compare Hessian-vector products and full Hessians of the cross-entropy loss
    computed with the built-in softmax_cross_entropy_with_logits against a manually
    written cross-entropy."""
    iris = load_iris()
    x = tf.placeholder(tf.float32, name='x')
    y = tf.placeholder(tf.float32, name='y')
    model = LinearModel(x, 4, 3)
    net_w, net_out = vectorize_model(model.var_list, model.inp[-1])

    v = tf.constant(np.ones(net_w.tensor.get_shape()), dtype=tf.float32)  # vector of ones of the right shape

    # Built-in TensorFlow op for cross-entropy with a softmax output.
    ce_builtin = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=net_out, labels=y))

    # Manually written cross-entropy loss.
    ce_standard = tf.reduce_mean(
        -tf.reduce_sum(y * tf.log(tf.nn.softmax(net_out)), reduction_indices=[1]))

    # With r0.11 the HVP of the built-in loss was silently 0, later versions raised
    # an error; since r1.2 it works as expected.
    hvp_builtin = hvp(ce_builtin, net_w.tensor, v)
    hessian_builtin = tf.hessians(ce_builtin, net_w.tensor)[0]

    hvp_standard = hvp(ce_standard, net_w.tensor, v)
    hessian_standard = tf.hessians(ce_standard, net_w.tensor)[0]

    def training_supplier():
        return {x: iris.train.data, y: iris.train.target}

    ts = tf.train.GradientDescentOptimizer(.1).minimize(ce_standard, var_list=model.var_list)

    with tf.Session().as_default() as ss:
        tf.global_variables_initializer().run()

        print('builtin, standard:',
              ss.run([ce_builtin, ce_standard], feed_dict=training_supplier()))

        for _ in range(2000):
            ts.run(feed_dict=training_supplier())

        # was wrongly 0 with older TF versions
        print('builtin',
              ss.run([hvp_builtin, hessian_builtin], feed_dict=training_supplier()))
        print('standard',
              ss.run([hvp_standard, hessian_standard], feed_dict=training_supplier()))
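# Illustrative sketch (not part of the original tests, assumes TensorFlow 1.x):
# the check above relies on the identity H v = d/dw (grad(f, w) . v), i.e. a
# Hessian-vector product can be taken by double backprop without materializing
# the full Hessian.
def _hvp_identity_sketch():
    w0 = tf.Variable(np.array([1., 2., 3.]), dtype=tf.float32)
    f = tf.reduce_sum(w0 ** 4)                         # toy scalar loss
    v0 = tf.constant([1., 1., 1.], dtype=tf.float32)

    g = tf.gradients(f, w0)[0]                         # df/dw
    hv = tf.gradients(tf.reduce_sum(g * v0), w0)[0]    # H v via double backprop
    full_h = tf.hessians(f, w0)[0]                     # full Hessian, for comparison

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        hv_val, h_val = sess.run([hv, full_h])
        print(hv_val, h_val.dot(np.ones(3)))           # the two should coincide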
def iris_logistic_regression(augment=0):
    iris = load_iris()
    x = tf.placeholder(tf.float32, name='x')
    y = tf.placeholder(tf.float32, name='y')
    model = LinearModel(x, 4, 3)
    model_w, model_y = vectorize_model(model.var_list, model.inp[-1], augment=augment)
    error = tf.reduce_mean(cross_entropy_loss(model_y, y))

    correct_prediction = tf.equal(tf.argmax(model_y, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    return iris, x, y, model, model_w, model_y, error, accuracy
def iris_logistic_regression(augment=0): """ Simple model for testing purposes :param augment: :return: """ iris = load_iris(partitions_proportions=(.3, .3)) x = tf.placeholder(tf.float32, name='x') y = tf.placeholder(tf.float32, name='y') model = rf.LinearModel(x, 4, 3) model_w, model_y = rf.vectorize_model(model.var_list, model.inp[-1], augment=augment) error = tf.reduce_mean(rf.cross_entropy_loss(model_y, y)) correct_prediction = tf.equal(tf.argmax(model_y, 1), tf.argmax(y, 1)) accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) return iris, x, y, model, model_w, model_y, error, accuracy
def _test_doh_iris(self):  # FIXME to check. Probably now does not work
    tf.reset_default_graph()

    for _ in range(1):
        with tf.Graph().as_default():
            iris = load_iris()
            x = tf.placeholder(tf.float32, name='x')
            y = tf.placeholder(tf.float32, name='y')
            # net = FFNN(x, [4, 20, 20, 20, 3])
            net = LinearModel(x, 4, 3)

            net_w, net_out = vectorize_model(net.var_list, net.inp[-1])

            l2_factor = tf.Variable(.001, name='l2_factor')
            eta = tf.Variable(.1, name='learning_rate')

            error = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=net_out))
            tr_error = error + l2_factor * dot(net_w, net_w)

            hyper_list = [l2_factor, eta]

            doh = ReverseHG(
                GradientDescentOptimizer.create(net_w, eta, loss=tr_error),
                hyper_list, error, [])

            T = 2000

            def training_supplier(step):
                return {x: iris.train.data, y: iris.train.target}

            def validation_supplier(step=None):
                return {x: iris.test.data, y: iris.test.target}

            with tf.name_scope('summaries'):  # write summaries here?
                s_err = tf.summary.scalar('error', error)
                s_np = tf.summary.scalar('squared_norm_p', dot(doh.p_dict, doh.p_dict))
                s_hyper_der = [
                    tf.summary.scalar('hyper/' + hy.name, hy_der)
                    for hy, hy_der in zip(hyper_list, doh.hyper_derivatives)
                ]
                fw_merged = tf.summary.merge([s_err])
                bw_merged = tf.summary.merge([s_np] + s_hyper_der)

            su = SummaryUtil(ops=fw_merged, writer='summary_test/fw',
                             condition=lambda step: step % 10 == 0,
                             fd_supplier=training_supplier)
            su_bk = SummaryUtil(ops=bw_merged, writer='summary_test/bw',
                                condition=lambda step: step % 10 == 0,
                                fd_supplier=training_supplier)

            pn = norm(doh.p_dict)
            pu_bk = PrintUtils(stepwise_pu(lambda ss, step: ss.run(pn), 100))
            bk_merged = MergedUtils(pu_bk, SSU(su_bk))

            with tf.Session().as_default():
                tf.variables_initializer(hyper_list).run()
                hyper_grads = doh.run_all(T, training_supplier, validation_supplier,
                                          forward_su=SSU(su), backward_su=bk_merged)
                print(hyper_grads)
def test_single_hp(self):
    tf.reset_default_graph()

    T = 100
    lr = .01
    hyper_iterations = 10
    hyper_learning_rate = .001

    iris = load_iris([.4, .4])
    x = tf.placeholder(tf.float32, name='x')
    y = tf.placeholder(tf.float32, name='y')
    model = LinearModel(x, 4, 3)

    w, model_out = vectorize_model(model.var_list, model.inp[-1])

    error = tf.reduce_mean(cross_entropy_loss(model_out, y))

    correct_prediction = tf.equal(tf.argmax(model_out, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    eta = tf.Variable(lr, name='eta')
    dynamics_dict = GradientDescentOptimizer.create(w, lr=eta, loss=error)

    doh = ReverseHG(dynamics_dict, hyper_dict={error: eta})

    grad = tf.gradients(error, w.tensor)[0]
    hyper_dict = {error: (eta, -grad)}
    direct_doh = ForwardHG(dynamics_dict, hyper_dict=hyper_dict)

    # noinspection PyUnusedLocal
    def all_training_supplier(step=None):
        return {x: iris.train.data, y: iris.train.target}

    training_supplier = all_training_supplier

    # noinspection PyUnusedLocal
    def validation_supplier(step=None):
        return {x: iris.validation.data, y: iris.validation.target}

    # noinspection PyUnusedLocal
    def test_supplier(step=None):
        return {x: iris.test.data, y: iris.test.target}

    psu = PrintUtils(
        stepwise_pu(
            lambda ses, step: print('test accuracy',
                                    ses.run(accuracy, feed_dict=test_supplier())),
            T - 1))
    psu2 = None

    history_test_accuracy = []
    history_eta = []

    # noinspection PyUnusedLocal
    def save_accuracies(ses, step):
        history_test_accuracy.append(ses.run(accuracy, feed_dict=test_supplier()))
        history_eta.append(ses.run(eta))

    after_forward_su = PrintUtils(
        unconditional_pu(save_accuracies),
        unconditional_pu(lambda ses, step: print(
            'training error', error.eval(feed_dict=all_training_supplier()))))

    delta_hyper = tf.placeholder(tf.float32)
    # projected update: hyperparameters are clipped to the interval [0, 1]
    hyper_upd_ops = [
        hyp.assign(tf.minimum(tf.maximum(hyp - delta_hyper, tf.zeros_like(hyp)),
                              tf.ones_like(hyp)))
        for hyp in doh.hyper_list
    ]  # check the sign of gradient

    diffs = []
    with tf.Session(config=TestDohDirectDoh.config).as_default() as ss:
        tf.variables_initializer([eta]).run()

        for _ in range(hyper_iterations):
            direct_doh.initialize()
            for _k in range(T):
                direct_doh.step_forward(train_feed_dict_supplier=training_supplier,
                                        summary_utils=psu)
            direct_res = direct_doh.hyper_gradient_vars(
                validation_suppliers=training_supplier)

            res = doh.run_all(T, train_feed_dict_supplier=training_supplier,
                              after_forward_su=after_forward_su,
                              val_feed_dict_suppliers=training_supplier,
                              forward_su=psu, backward_su=psu2)

            collected_hyper_gradients = list(
                ReverseHG.std_collect_hyper_gradients(res).values())

            [ss.run(hyper_upd_ops[j],
                    feed_dict={delta_hyper:
                               hyper_learning_rate * collected_hyper_gradients[j]})
             for j in range(len(doh.hyper_list))]

            self.assertLess(
                np.linalg.norm(np.array(direct_res[eta]) -
                               np.array(collected_hyper_gradients)),
                1.e-5)

            diffs.append(np.array(direct_res[eta]) -
                         np.array(collected_hyper_gradients))

    # repeat the forward/reverse comparison with mini-batch training data
    ev_data = ExampleVisiting(iris, 10, 10)
    T = ev_data.T
    training_supplier = ev_data.create_feed_dict_supplier(x, y)

    with tf.Session(config=TestDohDirectDoh.config).as_default() as ss:
        tf.variables_initializer([eta]).run()

        for _ in range(hyper_iterations):
            ev_data.generate_visiting_scheme()

            direct_doh.initialize()
            for _k in range(T):
                direct_doh.step_forward(train_feed_dict_supplier=training_supplier,
                                        summary_utils=psu)
            direct_res = direct_doh.hyper_gradient_vars(
                validation_suppliers=all_training_supplier)

            res = doh.run_all(T, train_feed_dict_supplier=training_supplier,
                              after_forward_su=after_forward_su,
                              val_feed_dict_suppliers=all_training_supplier,
                              forward_su=psu, backward_su=psu2)

            collected_hyper_gradients = list(
                ReverseHG.std_collect_hyper_gradients(res).values())

            [ss.run(hyper_upd_ops[j],
                    feed_dict={delta_hyper:
                               hyper_learning_rate * collected_hyper_gradients[j]})
             for j in range(len(doh.hyper_list))]

            self.assertLess(
                np.linalg.norm(np.array(direct_res[eta]) -
                               np.array(collected_hyper_gradients)),
                1.e-5)
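# Plain-NumPy sketch (an assumption, mirroring hyper_upd_ops above) of the
# projected hyperparameter update applied at each hyper-iteration: one gradient
# step on the hyperparameter, clipped to the box [0, 1].
def _projected_hyper_update_sketch(hyp_value, hyper_grad, hyper_lr=1e-3):
    return float(np.clip(hyp_value - hyper_lr * hyper_grad, 0.0, 1.0))
# e.g. _projected_hyper_update_sketch(0.01, -2.5) -> 0.0125 (eta grows, stays in range)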
def _test_multiple_hp(self, momentum=False):
    tf.reset_default_graph()

    T = 100
    lr = .01
    hyper_iterations = 10
    hyper_learning_rate = .001

    mu = None
    if momentum:
        mu = tf.Variable(.7, name='mu')

    iris = load_iris([.4, .4])
    x = tf.placeholder(tf.float32, name='x')
    y = tf.placeholder(tf.float32, name='y')
    model = LinearModel(x, 4, 3)

    w, model_out, mat_W, b = vectorize_model(model.var_list, model.inp[-1],
                                             model.Ws[0], model.bs[0],
                                             augment=momentum)

    error = tf.reduce_mean(cross_entropy_loss(model_out, y))

    gamma = tf.Variable([0., 0.], name='gamma')
    regularizer = gamma[0] * tf.reduce_sum(mat_W ** 2) + gamma[1] * tf.reduce_sum(b ** 2)
    training_error = error + regularizer

    correct_prediction = tf.equal(tf.argmax(model_out, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    eta = tf.Variable(lr, name='eta')
    if momentum:
        dynamics_dict = MomentumOptimizer.create(w, lr=eta, mu=mu, loss=training_error)
    else:
        dynamics_dict = GradientDescentOptimizer.create(w, lr=eta, loss=training_error)

    if momentum:
        doh = ReverseHyperGradient(dynamics_dict,
                                   hyper_dict={training_error: [eta, mu],
                                               error: [gamma]})
    else:
        doh = ReverseHyperGradient(dynamics_dict,
                                   hyper_dict={training_error: [eta],
                                               error: [gamma]})

    true_w = w.var_list(Vl_Mode.TENSOR)[0]
    grad = tf.gradients(training_error, true_w)[0]

    _grad_reg = tf.gradients(regularizer, gamma)[0]
    grad_reg = tf.stack([tf.gradients(_grad_reg[0], true_w)[0],
                         tf.gradients(_grad_reg[1], true_w)[0]], axis=1)

    if momentum:
        w_b, m = w.var_list(Vl_Mode.TENSOR)
        # noinspection PyUnresolvedReferences
        grad = ZMergedMatrix([-tf.transpose([mu * m + grad]),
                              tf.zeros([m.get_shape().as_list()[0], 1])])
        grad_reg = ZMergedMatrix([-eta * grad_reg, grad_reg])
        grad_mu = ZMergedMatrix([-(eta * m), m])
    else:
        grad_mu = None
        grad_reg *= eta

    if momentum:
        hyper_dict = {training_error: [(eta, grad), (mu, grad_mu)],
                      error: (gamma, grad_reg)}
        direct_doh = ForwardHyperGradient(dynamics_dict, hyper_dict=hyper_dict)
    else:
        hyper_dict = {training_error: (eta, -grad), error: (gamma, -grad_reg)}
        direct_doh = ForwardHyperGradient(dynamics_dict, hyper_dict=hyper_dict)

    # noinspection PyUnusedLocal
    def all_training_supplier(step=None):
        return {x: iris.train.data, y: iris.train.target}

    training_supplier = all_training_supplier

    # noinspection PyUnusedLocal
    def validation_supplier(step=None):
        return {x: iris.validation.data, y: iris.validation.target}

    # noinspection PyUnusedLocal
    def test_supplier(step=None):
        return {x: iris.test.data, y: iris.test.target}

    psu = PrintUtils(
        stepwise_pu(
            lambda ses, step: print('test accuracy',
                                    ses.run(accuracy, feed_dict=test_supplier())),
            T - 1))

    norm_p = norm(tf.concat(list(doh.p_dict.values()), 0))
    psu2 = PrintUtils(
        stepwise_pu(lambda ses, step: print('norm of costate', ses.run(norm_p)),
                    T - 1))

    history_test_accuracy = []
    history_eta = []

    # noinspection PyUnusedLocal
    def save_accuracies(ses, step):
        history_test_accuracy.append(ses.run(accuracy, feed_dict=test_supplier()))
        history_eta.append(ses.run(eta))

    after_forward_su = PrintUtils(
        unconditional_pu(save_accuracies),
        unconditional_pu(lambda ses, step: print(
            'training error', error.eval(feed_dict=all_training_supplier()))))

    delta_hyper = tf.placeholder(tf.float32)
    hyper_upd_ops = {
        hyp: hyp.assign(tf.maximum(hyp - delta_hyper, tf.zeros_like(hyp)))
        for hyp in doh.hyper_list
    }  # check the sign of gradient

    with tf.Session(config=TestDohDirectDoh.config).as_default() as ss:
        tf.variables_initializer(doh.hyper_list).run()

        for _ in range(hyper_iterations):
            direct_doh.initialize()
            for _k in range(T):
                direct_doh.step_forward(train_feed_dict_supplier=training_supplier,
                                        summary_utils=psu)

            validation_suppliers = {training_error: training_supplier,
                                    error: validation_supplier}
            # (the original momentum / no-momentum branches here were identical)
            direct_res = direct_doh.hyper_gradient_vars(
                validation_suppliers=validation_suppliers)

            res = doh.run_all(T, train_feed_dict_supplier=training_supplier,
                              after_forward_su=after_forward_su,
                              val_feed_dict_suppliers={error: validation_supplier,
                                                       training_error: training_supplier},
                              forward_su=psu, backward_su=psu2)

            collected_hyper_gradients = ReverseHyperGradient.std_collect_hyper_gradients(res)

            [ss.run(hyper_upd_ops[hyp],
                    feed_dict={delta_hyper:
                               hyper_learning_rate * collected_hyper_gradients[hyp]})
             for hyp in doh.hyper_list]

            for hyp in doh.hyper_list:
                self.assertLess(
                    np.linalg.norm(np.array(direct_res[hyp]) -
                                   np.array(collected_hyper_gradients[hyp])),
                    1.e-5)