def _optimizers(lr, mlr0, mlr_decay, learn_lr=True):
    # inner optimizer: gradient descent with a (possibly learnable) step size
    io_optim = far.GradientDescentOptimizer(
        far.get_hyperparameter('lr', lr) if learn_lr
        else tf.constant(lr, name='lr'))
    gs = tf.get_variable('global_step', initializer=0, trainable=False)
    # outer (meta) learning rate with inverse-time decay
    meta_lr = tf.train.inverse_time_decay(mlr0, gs, 1., mlr_decay)
    oo_optim = tf.train.AdamOptimizer(meta_lr)
    farho = far.HyperOptimizer()
    return io_optim, gs, meta_lr, oo_optim, farho
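
# Hypothetical call sketch for the helper above (the argument values are
# illustrative, not from the source):
# io_optim, gs, meta_lr, oo_optim, farho = _optimizers(
#     lr=0.1, mlr0=0.01, mlr_decay=1.e-5, learn_lr=True)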
def get_stc_hyperparameter(name, initializer=None, shape=None,
                           constraints=None, sample_func=None, hyper_probs=None):
    """Get a stochastic hyperparameter. Defaults to a Bernoulli hyperparameter.
    Mostly follows the signature of `tf.get_variable`.

    :param name: a name for the hyperparameter
    :param initializer: an initializer (or initial value) for the parameters of the distribution
    :param shape: a shape for the stochastic hyperparameter
    :param constraints: additional (simple) constraints for the parameters of the distribution
    :param sample_func: a function that takes the distribution parameters and returns a sample
    :param hyper_probs: the variables used for the underlying probability distribution

    :return: the stochastic hyperparameter (not the distribution variables!)
    """
    if constraints is None:  # clip the distribution parameters to [0, 1]
        constraints = lambda _v: tf.maximum(tf.minimum(_v, 1.), 0.)
    if hyper_probs is None:
        # creates the variable (distribution parameters) that is also used for sampling
        hyper_probs = tf.get_variable(
            name + '/' + GraphKeys.STOCHASTIC_HYPER,
            trainable=False,
            constraint=constraints,
            initializer=initializer,
            shape=shape,
            collections=[GraphKeys.GLOBAL_VARIABLES,
                         GraphKeys.STOCHASTIC_HYPER])
    if sample_func is None:
        sample_func = bernoulli_hard_sample

    hyper_sample = far.get_hyperparameter(
        name,
        initializer=sample_func(hyper_probs),
        collections=GraphKeys.STOCHASTIC_HYPER)
    far.utils.remove_from_collection(GraphKeys.GLOBAL_VARIABLES, hyper_sample)

    with tf.control_dependencies([tf.variables_initializer([hyper_sample])]):
        # re-initialize and return the value
        _STC_INITIALIZERs[hyper_sample] = hyper_sample.read_value()
    _STC_MAP[hyper_sample] = hyper_probs
    return hyper_sample
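
# Hypothetical usage sketch (the names `drop_gates` and `example_inputs` are
# illustrative, not from the source): a Bernoulli gate over 10 features. The
# gate probabilities are the distribution parameters stored in the
# STOCHASTIC_HYPER collection, while the returned tensor is a sampled 0/1
# hyperparameter that enters the inner objective.
# example_inputs = tf.ones((32, 10))
# drop_gates = get_stc_hyperparameter('drop_gates',
#                                     initializer=tf.constant_initializer(0.5),
#                                     shape=(10,))
# gated = example_inputs * drop_gates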
h1_hyp = tcl.fully_connected(x, 300,
                             variables_collections=far.HYPERPARAMETERS_COLLECTIONS,
                             trainable=False)
out_hyp = tcl.fully_connected(h1_hyp, datasets.train.dim_target,
                              variables_collections=far.HYPERPARAMETERS_COLLECTIONS,
                              trainable=False)

print('Initial model weights (hyperparameters)')
[print(e) for e in far.utils.hyperparameters()]
# far.utils.remove_from_collection(far.GraphKeys.MODEL_VARIABLES, *far.utils.hyperparameters())

# get a hyperparameter for weighting the examples in the inner objective (training error)
weights = far.get_hyperparameter('ex_weights', tf.zeros(batch))

# build loss and accuracy
# inner objective (training error): weighted mean of cross-entropy errors,
# with the weights passed through a sigmoid so they are positive
with tf.name_scope('errors'):
    tr_loss = tf.reduce_mean(
        tf.sigmoid(weights) *
        tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=out))
    # outer objective (validation error), not weighted
    val_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=out))
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(out, 1)), tf.float32))

# optimizers
# get a hyperparameter for the learning rate
# In[16]:

try:
    ss.close()
except:
    pass
tf.reset_default_graph()
ss = tf.InteractiveSession()

v1 = tf.Variable([1., 3])
v2 = tf.Variable([[-1., -2], [1., 0.]])

# In[17]:

lmbd = far.get_hyperparameter('lambda', initializer=tf.ones_initializer,
                              shape=v2.get_shape())
cost = tf.reduce_mean(v1**2) + tf.reduce_sum(lmbd * v2**2)

io_optim = far.AdamOptimizer(epsilon=1.e-6)
# io_optim = far.MomentumOptimizer(far.get_hyperparameter('eta', 0.1),
#                                  far.get_hyperparameter('mu', .9))
io_optim_dict = io_optim.minimize(cost)

oo = tf.reduce_mean(v1 * v2)

# In[18]:

rhg = far.ReverseHG()
    return datasets.train, datasets.validation


def g_logits(x, y):
    with tf.variable_scope('model'):
        h1 = layers.fully_connected(x, 300)
        logits = layers.fully_connected(h1, int(y.shape[1]))
    return logits


x = tf.placeholder(tf.float32, shape=(None, 28**2), name='x')
y = tf.placeholder(tf.float32, shape=(None, 10), name='y')
logits = g_logits(x, y)
train_set, validation_set = get_data()

lambdas = far.get_hyperparameter('lambdas', tf.zeros(train_set.num_examples))
lr = far.get_hyperparameter('lr', initializer=0.01)

ce = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits)
L = tf.reduce_mean(tf.sigmoid(lambdas) * ce)  # inner objective: weighted training error
E = tf.reduce_mean(ce)                        # outer objective: validation error

inner_optimizer = far.GradientDescentOptimizer(lr)
outer_optimizer = tf.train.AdamOptimizer()
hyper_step = far.HyperOptimizer().minimize(E, outer_optimizer, L, inner_optimizer)

T = 200  # number of inner iterations
train_set_supplier = train_set.create_supplier(x, y)
validation_set_supplier = validation_set.create_supplier(x, y)
tf.global_variables_initializer().run()
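
# Minimal driver loop for the bilevel problem above: each call to hyper_step
# runs T inner (training) iterations and then one hypergradient/outer update.
# The number of outer iterations (10) is an illustrative choice, not from the
# source.
for _ in range(10):
    hyper_step(T,
               inner_objective_feed_dicts=train_set_supplier,
               outer_objective_feed_dicts=validation_set_supplier)
    print('validation error:', E.eval(validation_set_supplier()))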
import far_ho as far
import tensorflow as tf
import numpy as np

run_gd = False  # True for constant step size
right_step = False

tf.reset_default_graph()
ss = tf.InteractiveSession()

L = tf.constant(10.65158)
kappa = .25
lmbd = far.get_hyperparameter('lmbd', .008921)

sol = L / (1 + lmbd)  # closed-form minimizer of the inner objective in w

# w = tf.get_variable('w', initializer=sol)
w = tf.get_variable('w', initializer=tf.zeros_initializer, shape=(1,))
b = tf.get_variable('b', initializer=tf.ones_initializer, shape=(2,))

outer_obj = (w - 2.)**2 / 2. + lmbd**2


# the inner objective must be a callable
def inner_obj(var_list):
    w = var_list[0]
    obj = (w - L)**2 / 2. + lmbd * w**2 / 2 + tf.reduce_sum(var_list[1]**2)
    return obj[0]


io_lip = 1. + lmbd  # Lipschitz constant of the inner objective's gradient in w
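
# Sketch of one plausible use of the flags above (an assumption, not from the
# source): for a quadratic inner objective whose gradient is io_lip-Lipschitz,
# gradient descent converges for any step below 2 / io_lip, and 1 / io_lip is
# the classical "right" step; kappa / io_lip would give a smaller constant step.
# eta = 1. / io_lip if right_step else kappa / io_lip
# io_optim = far.GradientDescentOptimizer(eta)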
try:
    ss.close()
except:
    pass
tf.reset_default_graph()
ss = tf.InteractiveSession()

v1 = tf.Variable([1., 3])
v2 = tf.Variable([[-1., -2], [1., 0.]])

# In[17]:

lmbd = far.get_hyperparameter('lambda', initializer=tf.ones_initializer,
                              shape=v2.get_shape())
cost = tf.reduce_mean(v1**2) + tf.reduce_sum(lmbd * v2**2)

io_optim = far.AdamOptimizer(epsilon=1.e-6)
# io_optim = far.MomentumOptimizer(far.get_hyperparameter('eta', 0.1),
#                                  far.get_hyperparameter('mu', .9))
io_optim_dict = io_optim.minimize(cost)

oo = tf.reduce_mean(v1 * v2)

# In[18]:

rhg = far.ReverseHG()
rhg.compute_gradients(oo, io_optim_dict)
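
# To obtain hypergradient values for this toy problem one would run the
# forward (inner) dynamics and then the reverse pass. The lines below assume
# the `run` interface of FAR-HO's hypergradient classes (T inner steps; no
# feed dicts are needed since the problem has no placeholders):
# tf.global_variables_initializer().run()
# rhg.run(100)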
# build loss and accuracy
# inner objective (training error): weighted mean of cross-entropy errors,
# with the weights passed through a sigmoid so they are positive
with tf.name_scope('errors'):
    # tr_loss = tf.reduce_mean(tf.sigmoid(weights) *
    #                          tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=out))
    # tr_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=out))

    # outer objective (validation error), not weighted
    val_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=out))
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(out, 1)), tf.float32))

# optimizers
# get a hyperparameter for the learning rate
lr = far.get_hyperparameter('lr', 0.01)
# for training-error minimization an optimizer from far_ho is needed
io_optim = far.GradientDescentOptimizer(lr)
# for the outer objective any optimizer from tf is valid
oo_optim = tf.train.AdamOptimizer()

print('hyperparameters to optimize')
[print(h) for h in far.hyperparameters()]

# build hyperparameter optimizer; with the weighted training losses commented
# out above, val_loss serves as both the inner and the outer objective here
farho = far.HyperOptimizer()
run = farho.minimize(val_loss, oo_optim,
                     val_loss, io_optim)  # (outer obj, outer opt, inner obj, inner opt)
import tensorflow as tf
import far_ho as far

tf.reset_default_graph()
ss = tf.InteractiveSession()

v1 = tf.Variable([10., 3])
v2 = tf.Variable([[-1., -2], [1., -21.]])

# In[17]:

lmbd = far.get_hyperparameter('lambda', initializer=tf.ones_initializer,
                              shape=v2.get_shape())
reg2 = far.get_hyperparameter('reg2', 0.1)
eta = far.get_hyperparameter('eta', 0.1)
beta1 = far.get_hyperparameter('beta1', 1.)
beta2 = far.get_hyperparameter('beta2', 2.)

# noinspection PyTypeChecker
cost = tf.reduce_mean(v1**2) + tf.reduce_sum(lmbd * v2**2) + reg2 * tf.nn.l2_loss(v1)

io_optim = far.AdamOptimizer(eta, tf.nn.sigmoid(beta1), tf.nn.sigmoid(beta2),
                             epsilon=1.e-4)

oo = tf.reduce_mean(v1 * v2)

rhg = far.ReverseHG()
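
# The snippet ends before wiring the pieces together; following the pattern of
# the previous example, the low-level pipeline would continue as:
# io_optim_dict = io_optim.minimize(cost)
# rhg.compute_gradients(oo, io_optim_dict)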
                         initializer=w_init,
                         collections=far.HYPERPARAMETERS_COLLECTIONS,
                         trainable=False)
fb_hyp = tf.get_variable('fb_hyp', (t_feature,), tf.float32,
                         initializer=b_init,
                         collections=far.HYPERPARAMETERS_COLLECTIONS,
                         trainable=False)

fe_emb_hyp = tf.tensordot(tf.one_hot(x, t_feature), fe_hyp, axes=1)
fe_emb_hyp = tf.reduce_sum(tf.reduce_prod(fe_emb_hyp, axis=1), axis=1)
fb_emb_hyp = tf.tensordot(tf.one_hot(x, t_feature), fb_hyp, axes=1)
fb_emb_hyp = tf.reduce_sum(fb_emb_hyp, axis=1)
out_hyp = tf.add_n([fe_emb_hyp, fb_emb_hyp])

print('Initial model weights (hyperparameters)')
[print(e) for e in far.utils.hyperparameters()];

weights = far.get_hyperparameter('ex_weights',
                                 tf.zeros(datasets.train.num_examples))

with tf.name_scope('errors'):
    # tr_loss = tf.reduce_mean(tf.sigmoid(weights) * tf.losses.mean_squared_error(y, out))
    # val_loss = tf.reduce_mean(tf.losses.mean_squared_error(y, out))
    tr_loss = 0.5 * tf.reduce_sum(tf.sigmoid(weights) * tf.square(y - out))
    tr_loss += 0.01 * tf.reduce_sum(tf.square(fe))
    tr_loss += 0.01 * tf.reduce_sum(tf.square(fb))
    val_loss = 0.5 * tf.reduce_sum(tf.square(y - out))
    val_loss += 0.01 * tf.reduce_sum(tf.square(weights))
    accuracy = tf.keras.metrics.mean_squared_error(y, tf.clip_by_value(out, 1.0, 5.0))

lr = far.get_hyperparameter('lr', 0.01)
# lr = tf.constant(0.01, name='lr')
# for training-error minimization an optimizer from far_ho is needed
io_optim = far.GradientDescentOptimizer(lr)
# for the outer objective any optimizer from tf is valid
oo_optim = tf.train.AdamOptimizer()
def build(metasets, hyper_model_builder, learn_lr, lr0, MBS, mlr0, mlr_decay,
          batch_norm_before_classifier, weights_initializer, process_fn=None):
    exs = [em.SLExperiment(metasets) for _ in range(MBS)]

    hyper_repr_model = hyper_model_builder(exs[0].x, 'HyperRepr')

    if learn_lr:
        lr = far.get_hyperparameter('lr', lr0)
    else:
        lr = tf.constant(lr0, name='lr')

    gs = tf.get_variable('global_step', initializer=0, trainable=False)
    meta_lr = tf.train.inverse_time_decay(mlr0, gs, decay_steps=1.,
                                          decay_rate=mlr_decay)

    io_opt = far.GradientDescentOptimizer(lr)
    oo_opt = tf.train.AdamOptimizer(meta_lr)
    far_ho = far.HyperOptimizer()

    for k, ex in enumerate(exs):
        # print(k)  # DEBUG
        with tf.device(available_devices[k % len(available_devices)]):
            repr_out = hyper_repr_model.for_input(ex.x).out

            other_train_vars = []
            if batch_norm_before_classifier:
                batch_mean, batch_var = tf.nn.moments(repr_out, [0])
                scale = tf.Variable(tf.ones_like(repr_out[0]))
                beta = tf.Variable(tf.zeros_like(repr_out[0]))
                other_train_vars.append(scale)
                other_train_vars.append(beta)
                repr_out = tf.nn.batch_normalization(repr_out, batch_mean,
                                                     batch_var, beta, scale, 1e-3)

            ex.model = em.models.FeedForwardNet(
                repr_out, metasets.train.dim_target,
                output_weight_initializer=weights_initializer,
                name='Classifier_%s' % k)

            ex.errors['training'] = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=ex.y,
                                                        logits=ex.model.out))
            ex.errors['validation'] = ex.errors['training']
            ex.scores['accuracy'] = tf.reduce_mean(
                tf.cast(tf.equal(tf.argmax(ex.y, 1), tf.argmax(ex.model.out, 1)),
                        tf.float32),
                name='accuracy')

            # simple training step used for testing
            ex.optimizers['ts'] = tf.train.GradientDescentOptimizer(lr).minimize(
                ex.errors['training'], var_list=ex.model.var_list)

            optim_dict = far_ho.inner_problem(
                ex.errors['training'], io_opt,
                var_list=ex.model.var_list + other_train_vars)
            far_ho.outer_problem(
                ex.errors['validation'], optim_dict, oo_opt,
                hyper_list=tf.get_collection(far.GraphKeys.HYPERPARAMETERS),
                global_step=gs)

    far_ho.finalize(process_fn=process_fn)
    saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES),
                           max_to_keep=240)
    return exs, far_ho, saver
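
# Hypothetical invocation of build (`metasets` and `hyper_conv_layers` stand
# in for objects supplied by the surrounding experiment code and are
# assumptions here, as are the numeric values):
# exs, far_ho, saver = build(metasets, hyper_conv_layers, learn_lr=True,
#                            lr0=0.1, MBS=4, mlr0=0.001, mlr_decay=1.e-5,
#                            batch_norm_before_classifier=False,
#                            weights_initializer=tf.zeros_initializer,
#                            process_fn=None)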