def __init__(self, optimizer, hyper_dict, method, hyper_grad_kwargs=None,
             hyper_optimizer_class=AdamOptimizer, **optimizers_kwargs):
    """
    Interface instance of gradient-based hyperparameter optimization methods.

    :param optimizer: parameter optimization dynamics (obtained from `Optimizer.create` methods)
    :param hyper_dict: dictionary of validation errors and list of hyperparameters to be optimized
    :param method: method with which to compute hyper-gradients: Forward or Reverse-Ho
                   (must be one of `ReverseHG`, `ForwardHG`)
    :param hyper_grad_kwargs: dictionary of keyword arguments for `HyperGradient` classes (usually None).
                              The dictionary is copied, the caller's object is left untouched.
    :param hyper_optimizer_class: (default Adam) Optimizer class for optimization of the hyperparameters;
                                  if None no hyperparameter optimizer is created
    :param optimizers_kwargs: keyword arguments for hyperparameter optimizers (like hyper-learning rate)
    """
    assert method in [ReverseHG, ForwardHG]
    assert hyper_optimizer_class is None or issubclass(hyper_optimizer_class, Optimizer)
    assert isinstance(hyper_dict, dict)
    assert isinstance(optimizer, Optimizer)

    # Work on a private copy: a 'global_step' entry is added below and the caller's
    # dictionary must not be modified as a side effect of building this object.
    hyper_grad_kwargs = dict(hyper_grad_kwargs) if hyper_grad_kwargs else {}

    self.hyper_iteration_step = GlobalStep(name='hyper_iteration_step')
    # evaluated in `initialize` to detect whether the hyper-iteration counter was ever initialized
    self._report_hyper_it_init = tf.report_uninitialized_variables(
        [self.hyper_iteration_step.var])
    # self.hyper_batch_step = GlobalStep(name='hyper_batch_step')
    self.hyper_batch_step = GlobalStep(name='batch_step')

    # automatically links eventual optimizer global step (like in Adam) to HyperGradient global step.
    # A fresh GlobalStep is built only when neither the caller nor the optimizer provides one.
    if 'global_step' not in hyper_grad_kwargs:
        hyper_grad_kwargs['global_step'] = (
            optimizer.global_step if hasattr(optimizer, 'global_step') else GlobalStep())

    # automatically links eventual hyper-optimizer global step (like in Adam) to batch_step
    if hyper_optimizer_class == AdamOptimizer:
        optimizers_kwargs['global_step'] = self.hyper_batch_step
        optimizers_kwargs.setdefault('eps', 1.e-14)

    self.hyper_gradients = method(optimizer, hyper_dict, **hyper_grad_kwargs)

    if hyper_optimizer_class:
        # noinspection PyTypeChecker
        self.hyper_optimizers = create_hyperparameter_optimizers(
            self.hyper_gradients, optimizer_class=hyper_optimizer_class, **optimizers_kwargs)
    else:
        self.hyper_optimizers = None
def __init__(self, forward_hyper_grad, hyperparameter_optimizers, hyper_projections=None, hyper_step=None):
    """
    Helper class to perform Real Time Hyperparameter optimization.
    See section 3.3 of Forward and Reverse Gradient-Based Hyperparameter Optimization
    (https://arxiv.org/abs/1703.01785)

    :param forward_hyper_grad: instance of `ForwardHyperGradient`. Used to compute hyper-gradients
    :param hyperparameter_optimizers: single or list of Optimizer for the hyper-parameter descent procedure
    :param hyper_projections: (optional) list of assign ops that perform projection onto a convex
                              subset of the hyperparameter space.
    :param hyper_step: (optional) instance of `GlobalStep` class that keeps track of the number of
                       hyper-batches performed so far. A new `GlobalStep` is created when omitted.
    """
    assert isinstance(forward_hyper_grad, ForwardHyperGradient)
    self.direct_doh = forward_hyper_grad

    # The offending value is wrapped in a 1-tuple: a bare tuple argument would otherwise be
    # unpacked by `%` and raise TypeError instead of the intended AssertionError.
    assert isinstance(hyperparameter_optimizers, (list, Optimizer)), \
        "hyperparameter_optimizers should be a single Optimizer or a list of Optimizer. " \
        "Instead is %s" % (hyperparameter_optimizers,)
    self.hyper_opt_dicts = as_list(hyperparameter_optimizers)

    self.hyper_projections = hyper_projections or []
    self.hyper_step = hyper_step or GlobalStep()
def create(w, lr=1.e-3, beta1=.9, beta2=.999, eps=1.e-8, global_step=None,
           loss=None, grad=None, w_is_state=True, name='Adam'):  # FIXME rewrite this
    """
    Build the Adam update dynamics for `w` and wrap them in an `AdamOptimizer`.

    :param w: the state to optimize. When `w_is_state` is True it must be a `MergedVariable`
              exposing exactly three tensors (weights, first moment, second moment); otherwise it
              is used as the weights and the two moment variables are created here.
    :param lr: learning rate
    :param beta1: decay factor of the first moment
    :param beta2: decay factor of the second moment
    :param eps: constant added to the second moment inside the square root
    :param global_step: `GlobalStep` used for the bias correction (created if None)
    :param loss: objective; used to compute `grad` when `grad` is not given
    :param grad: gradient of the objective with respect to the weights
    :param w_is_state: whether `w` already contains the two moment components
    :param name: name scope under which the operations are created
    :return: an `AdamOptimizer` (its `jac_z` is None: not implemented in this version)
    """
    # beta1_pow = tf.Variable(beta1)  # for the moment skip the implementation of this optimization.
    assert grad is not None or loss is not None, "One between grad or loss must be given"

    with tf.name_scope(name):
        if not w_is_state:
            w_base = w
            m = tf.Variable(tf.zeros(w.get_shape()))
            v = tf.Variable(tf.zeros(w.get_shape()))
        else:
            assert isinstance(w, MergedVariable), "%s is not instance of MergedVariable" % w
            assert len(w.var_list(Vl_Mode.TENSOR)) == 3, \
                "%s is not augmented correctly, len of w.var_list(" \
                "Vl_Mode.TENSOR should be 3, but is " \
                "%d" % (w, len(w.var_list(Vl_Mode.TENSOR)))
            w_base, m, v = w.var_list(Vl_Mode.TENSOR)

        if grad is None:
            grad = tf.gradients(loss, w_base)[0]
        if global_step is None:
            global_step = GlobalStep()

        def _next_step_as_float():
            # index of the step being taken, as a float tensor
            return tf.to_float(global_step.var + 1)

        # updated first and second moments
        m_next = beta1 * m + (1. - beta1) * grad
        v_next = beta2 * v + (1. - beta2) * grad ** 2

        # bias-corrected step size
        scaled_numerator = lr * tf.sqrt(1. - tf.pow(beta2, _next_step_as_float()))
        step_size = scaled_numerator / (1. - tf.pow(beta1, _next_step_as_float()))

        # updated weights (moment expressions are rebuilt here, as in the moment updates above)
        scaled_moment = step_size * (beta1 * m + (1. - beta1) * grad)
        w_next = w_base - scaled_moment / tf.sqrt(beta2 * v + (1. - beta2) * grad ** 2 + eps)

        jac_z = None  # TODO!!!!!

        # noinspection PyUnresolvedReferences
        if w_next.get_shape().ndims != 0:
            dynamics = tf.concat([w_next, m_next, v_next], 0)
        else:  # scalar case
            dynamics = tf.stack([w_next, m_next, v_next], 0)

        # objects on which the assignments are performed
        if w_is_state:
            w_target, m_target, v_target = w.var_list(Vl_Mode.RAW)
        else:
            w_target, m_target, v_target = w_base, m, v

        updates = [
            w_target.assign(w_next),
            m_target.assign(m_next),
            v_target.assign(v_next),
        ]

        return AdamOptimizer(w=w_base, m=m, v=v, global_step=global_step,
                             assign_ops=updates,
                             dynamics=dynamics, jac_z=jac_z,
                             gradient=grad, learning_rate=lr,
                             momentum_factor=beta1, second_momentum_factor=beta2,
                             raw_w=w)
def create(w, lr=1.e-3, beta1=.9, beta2=.999, eps=1.e-6, global_step=None,
           loss=None, grad=None, w_is_state=True, name='Adam',
           _debug_jac_z=False):  # FIXME rewrite this
    """
    Build the Adam update dynamics for `w`, together with the quantities needed by
    forward-mode hyper-gradient computation, and wrap them in an `AdamOptimizer`.

    :param w: all weight vector. When `w_is_state` is True it must be a `MergedVariable`
              exposing exactly three tensors (weights, first moment, second moment); otherwise it
              is used as the weights and the two moment variables are created here.
    :param lr: learning rate
    :param beta1: first momentum factor
    :param beta2: second momentum factor
    :param eps: term for numerical stability (higher than proposed default); it is added
                inside the square root
    :param global_step: `GlobalStep` used for the bias correction (created if None)
    :param loss: scalar tensor; required by the non-debug `jac_z`
    :param grad: vector tensor; computed from `loss` when not given
    :param w_is_state: whether `w` already contains the two moment components
    :param name: name scope under which the operations are created
    :param _debug_jac_z: if True `jac_z` builds the Jacobian entry by entry with `tf.gradients`
    :return: an `AdamOptimizer`
    """
    # beta1_pow = tf.Variable(beta1)  # for the moment skip the implementation of this optimization.
    assert grad is not None or loss is not None, "One between grad or loss must be given"

    with tf.name_scope(name):
        if not w_is_state:
            w_base = w
            m = tf.Variable(tf.zeros(w.get_shape()))
            v = tf.Variable(tf.zeros(w.get_shape()))
        else:
            assert isinstance(w, MergedVariable), "%s is not instance of MergedVariable" % w
            assert len(w.var_list(VlMode.TENSOR)) == 3, \
                "%s is not augmented correctly, len of w.var_list(" \
                "VlMode.TENSOR should be 3, but is " \
                "%d" % (w, len(w.var_list(VlMode.TENSOR)))
            w_base, m, v = w.var_list(VlMode.TENSOR)

        if grad is None:
            grad = tf.gradients(loss, w_base)[0]
        if global_step is None:
            global_step = GlobalStep()

        def _next_step_as_float():
            # index of the step being taken, as a float tensor
            return tf.to_float(global_step.var + 1)

        # updated first and second moments
        m_next = tf.multiply(beta1, m) + (1. - beta1) * grad
        v_next = tf.multiply(beta2, v) + (1. - beta2) * grad ** 2

        correction_numerator = tf.sqrt(1. - tf.pow(beta2, _next_step_as_float()))
        bias_correction = correction_numerator / (1. - tf.pow(beta1, _next_step_as_float()))
        step_size = lr * bias_correction

        v_plus_eps = tf.multiply(beta2, v) + (1. - beta2) * grad ** 2 + eps
        root_v = tf.sqrt(v_plus_eps)  # + eps
        # To make it the same as tensorflow adam optimizer the eps should go after the square
        # root... this brings however some problems in the computation of the hypergradient,
        # therefore we put it inside!
        # SHOULD BETTER INVESTIGATE THE ISSUE. (maybe the jacobian computation should be done again)

        # TODO THESE QUANTITIES ARE NEEDED FOR FORWARD-HG IN VARIOUS PLACES...
        # FIND A BETTER WAY TO COMPUTE THEM ONLY IF NEEDED
        v_pow_32 = tf.pow(v_plus_eps, 1.5)
        # diagonal of d(w_next)/d(grad)
        diag_dw_dgrad = -step_size * ((1. - beta1) / root_v
                                      - ((1. - beta2) * grad * m_next) / v_pow_32)
        # diagonal of d(v_next)/d(grad)
        diag_dv_dgrad = 2. * (1. - beta2) * grad

        w_next = w_base - step_size * (tf.multiply(beta1, m) + (1. - beta1) * grad) / root_v

        def _autodiff_jac_z(z):
            # Debug path: builds every Jacobian block row by row with tf.gradients.
            # I guess this would take an incredible long time to compile for large systems
            d = dynamics.get_shape().as_list()[0] // 3
            r, u, s = z.var_list(VlMode.TENSOR)

            def _rows(target, source):
                return tf.stack([tf.gradients(target[i], source)[0] for i in range(d)])

            block_rows, products = [], []
            for target in (w_next, m_next, v_next):  # first, second and third block
                wrt_w = _rows(target, w_base)
                wrt_m = _rows(target, m)
                wrt_v = _rows(target, v)
                block_rows.append(tf.concat([wrt_w, wrt_m, wrt_v], axis=1))
                products.append(tf.matmul(wrt_w, r) + tf.matmul(wrt_m, u) + tf.matmul(wrt_v, s))

            # the full Jacobian is only added to the graph under this name, it is not returned
            tf.concat(block_rows, axis=0, name='Jacobian')

            return ZMergedMatrix(products)

        def _analytic_jac_z(z):
            # Jacobian-times-z product built from Hessian-vector products and diagonal terms.
            assert loss is not None, 'Should specify loss to use jac_z'

            r, u, s = z.var_list(VlMode.TENSOR)

            # NOTE(review): extent of this name scope reconstructed from a
            # whitespace-collapsed source - confirm
            with tf.name_scope('Jac_Z'):
                hess_r = hvp(loss=loss, w=w_base, v=r, name='hessian_r_product')
                # hessian_r_product = hvp(loss=loss, w=w.tensor, v=z.tensor, name='hessian_r_product')[:d, :d]

                # first block
                j_11_r = tf.identity(
                    l_diag_mul(diag_dw_dgrad, hess_r, name='j_11_r_tilde') + r, 'j_11_r')
                j_12_u = l_diag_mul(
                    tf.identity(-step_size * beta1 / root_v, name='j_12_u_hat'),
                    u, name='j_12_u')
                j_13_s = l_diag_mul(
                    tf.identity(step_size * beta2 * m_next / (2 * v_pow_32), name='j_13_s_hat'),
                    s, name='j_13_s')
                first_block = tf.identity(j_11_r + j_12_u + j_13_s, name='jac_z_1')

                # second block (the term in s would be zero)
                j_21_r = tf.identity((1. - beta1) * hess_r, name='j_21_r')
                j_22_u = tf.identity(beta1 * u, name='j_22_u')
                second_block = tf.identity(j_21_r + j_22_u, name='jac_z_2')

                # third block (the term in u would be zero)
                j_31_r = l_diag_mul(diag_dv_dgrad, hess_r, name='j_31_r')
                j_33_s = tf.identity(beta2 * s, name='j_33_s')
                third_block = tf.identity(j_31_r + j_33_s, name='jac_z_3')

                return ZMergedMatrix([first_block, second_block, third_block])

        # noinspection PyUnresolvedReferences
        def _jac_z(z):
            if _debug_jac_z:
                return _autodiff_jac_z(z)
            return _analytic_jac_z(z)

        # algorithmic partial derivatives (as functions so that we do not create unnecessary nodes)
        def _d_dyn_d_lr(_name):
            wrt_lr = [
                -bias_correction * m_next / root_v,
                tf.zeros_like(m_next),
                tf.zeros_like(v_next),  # just aesthetics
            ]
            return ZMergedMatrix(wrt_lr, name=_name)

        def _d_dyn_d_hyp_gl(cross_der_l, _name):
            weights_part = l_diag_mul(diag_dw_dgrad, cross_der_l)
            first_moment_part = (1 - beta1) * cross_der_l
            second_moment_part = l_diag_mul(diag_dv_dgrad, cross_der_l)
            return ZMergedMatrix([weights_part, first_moment_part, second_moment_part], name=_name)

        # noinspection PyUnresolvedReferences
        if w_next.get_shape().ndims != 0:
            dynamics = tf.concat([w_next, m_next, v_next], 0)
        else:  # scalar case
            dynamics = tf.stack([w_next, m_next, v_next], 0)

        # objects on which the assignments are performed
        if w_is_state:
            w_target, m_target, v_target = w.var_list(VlMode.RAW)
        else:
            w_target, m_target, v_target = w_base, m, v

        updates = [
            w_target.assign(w_next),
            m_target.assign(m_next),
            v_target.assign(v_next),
        ]

        return AdamOptimizer(w=w_base, m=m, v=v, global_step=global_step,
                             assign_ops=updates,
                             dynamics=dynamics, jac_z=_jac_z,
                             gradient=grad, learning_rate=lr,
                             momentum_factor=beta1, second_momentum_factor=beta2,
                             raw_w=w, loss=loss,
                             d_dyn_d_lr=_d_dyn_d_lr, d_dyn_d_hyper=_d_dyn_d_hyp_gl)
def __init__(self, optimizer, hyper_dict, global_step=None):
    """
    Creates a new object that computes the hyper-gradient of validation errors in forward mode.
    See section 3.2 of Forward and Reverse Gradient-Based Hyperparameter Optimization
    (https://arxiv.org/abs/1703.01785)
    Note that this class only computes the hyper-gradient and does not perform hyperparameter optimization.

    :param optimizer: instance of Optimizer class, which represent the dynamics with which the model
                      parameters are updated
    :param hyper_dict: A dictionary of `{validation_error: hyper_pairs_list}` where
                       `validation_error` is a scalar tensor and `hyper_pairs_list` is single or a
                       list of pairs (hyperparameter, derivative_of_dynamics_w.r.t hyperparameter)
                       (matrix B_t in the paper). Unfortunately tensorflow does not compute
                       Jacobians efficiently yet (suggestions or pointers are welcome)
    :param global_step: (optional) instance of `GlobalStep` to keep track of the optimization step
    """
    assert isinstance(optimizer, Optimizer)

    self.w = optimizer.raw_w  # might be variable or MergedVariable (never tested on Variables actually) ...
    self.w_t = MergedVariable.get_tensor(self.w)  # this is always a tensor

    self.tr_dynamics = optimizer.dynamics

    # the value is wrapped in a 1-tuple so that a tuple argument is reported by the
    # assertion instead of breaking the `%` formatting
    assert isinstance(hyper_dict, dict), '%s not allowed type. Should be a dict of (tf.Tensor,' \
                                         'list[(hyper-parameter, d_dynamics_d_hyper-parameter)]' % (hyper_dict,)

    self.hyper_list = []  # more comfortable to use
    self.d_dynamics_d_hypers = []
    self.hyper_dict = {}  # standardizes hyper_dict parameter
    for k, v in hyper_dict.items():
        list_v = as_list(v)
        # only the first entry is inspected; an empty list has nothing to check
        # (and indexing it would raise IndexError)
        assert not list_v or isinstance(list_v[0], tuple), \
            "Something's wrong in hyper_dict %s, at least in entry%s. Check!" % (hyper_dict, list_v[0])
        self.hyper_dict[k] = list_v  # be sure values are lists!
        self.hyper_list += [pair[0] for pair in list_v]
        self.d_dynamics_d_hypers += [pair[1] for pair in list_v]

    self.val_errors = []  # will follow the same order as hyper_list
    for hyp in self.hyper_list:  # find the right validation error for hyp!
        for k, v in hyper_dict.items():
            all_hypers = [pair[0] for pair in as_list(v)]
            if hyp in all_hypers:
                self.val_errors.append(k)
                break

    for i, der in enumerate(self.d_dynamics_d_hypers):  # this automatic casting at the moment works only for SGD
        if not isinstance(der, ZMergedMatrix):
            print('Try casting d_dynamics_d_hyper to ZMergedMatrix')
            self.d_dynamics_d_hypers[i] = ZMergedMatrix(der)
            print('Successful')

    # NOTE(review): nesting of the name scopes below reconstructed from a
    # whitespace-collapsed source - confirm
    with self.w_t.graph.as_default():
        # global step
        self.global_step = global_step or GlobalStep()

        self.fw_ops = self.w.assign(self.tr_dynamics)  # TODO add here when hypers are sequence

        with tf.name_scope('direct_HO'):
            # Creates one z per hyper-parameter and assumes that each hyper-parameter is a vector
            self.zs = [self._create_z(hyp) for hyp in self.hyper_list]

            self.zs_dynamics = [optimizer.jac_z(z) + dd_dh
                                for z, dd_dh in zip(self.zs, self.d_dynamics_d_hypers)]

            # debug output; guarded because there is nothing to show without hyperparameters
            if self.zs:
                print('z dynamics', self.zs_dynamics[0])
                print('z', self.zs[0])

            self.zs_assigns = [z.assign(z_dyn) for z, z_dyn in zip(self.zs, self.zs_dynamics)]

            self.grad_val_err = [tf.gradients(v_e, self.w_t)[0] for v_e in self.val_errors]
            assert all([g is not None for g in self.grad_val_err]), \
                'Some gradient of the validation error is None!'

            self.grad_wrt_hypers = [dot(gve, z.tensor) for z, gve in zip(self.zs, self.grad_val_err)]

            with tf.name_scope('hyper_gradients'):  # ADDED 28/3/17 keeps track of hyper-gradients as tf.Variable
                self.hyper_gradient_vars = [tf.Variable(tf.zeros_like(hyp), name=simple_name(hyp))
                                            for hyp in self.hyper_list]
                self.hyper_gradients_dict = {hyp: hgv for hyp, hgv  # redundant.. just for comfort ..
                                             in zip(self.hyper_list, self.hyper_gradient_vars)}
                self._hyper_assign_ops = [v.assign(ght)
                                          for v, ght in zip(self.hyper_gradient_vars, self.grad_wrt_hypers)]
def __init__(self, optimizer, hyper_dict, state_history=None, global_step=None):
    """
    Creates a new object that computes the hyper-gradient of validation errors in reverse mode.
    See section 3.1 of Forward and Reverse Gradient-Based Hyperparameter Optimization
    (https://arxiv.org/abs/1703.01785)
    Note that this class only computes the hyper-gradient and does not perform hyperparameter optimization.

    :param optimizer: instance of Optimizer class, which contains the dynamics with which the model
                      parameters are updated
    :param hyper_dict: A dictionary of `{validation_error: hyperparameter or list_of_hyperparameters}`
                       where `validation_error` is a scalar tensor and `list_of_hyperparameters` is a
                       list of tensorflow variables that represents the hyperparameters.
                       The dictionary itself is not modified.
    :param state_history: (default: empty list) state history manager: should implement methods
                          `clear`, `append`, `__getitem__`. Any object that is not None is used as
                          given, even when it is empty.
    :param global_step: optional instance of GlobalStep class
    """
    assert isinstance(optimizer, Optimizer)

    self.w = optimizer.raw_w  # might be variable or MergedVariable
    # TODO check if it works also with w as simple Variable
    self.w_t = MergedVariable.get_tensor(self.w)  # this is always a tensor

    self.tr_dynamics = optimizer.dynamics

    # the value is wrapped in a 1-tuple so that a tuple argument is reported by the
    # assertion instead of breaking the `%` formatting
    assert isinstance(hyper_dict, dict), '%s not allowed type. Should be a dict of ' \
                                         '(tf.Tensor, hyperparameters)' % (hyper_dict,)

    # standardized copy (be sure that values are all lists) that leaves the caller's dict untouched
    self.val_error_dict = {ve: as_list(hyps) for ve, hyps in hyper_dict.items()}
    self.hyper_list = []
    for hyps in self.val_error_dict.values():
        self.hyper_list += hyps

    # `is not None` instead of a truthiness test: a history manager is empty when it is
    # handed over, and a container defining `__len__` would be replaced by a plain list
    self.w_hist = state_history if state_history is not None else []

    # NOTE(review): nesting of the name scopes below reconstructed from a
    # whitespace-collapsed source - confirm
    with self.w_t.graph.as_default():
        # global step
        self.global_step = global_step or GlobalStep()

        self._fw_ops = optimizer.assign_ops  # TODO add here when hyper-parameters are sequence

        # backward assign ops
        with tf.name_scope('backward'):
            # equation (9)
            p_T = {ve: tf.gradients(ve, self.w_t)[0] for ve in self.val_error_dict}  # deltaE(s_t)

            self.p_dict = {ve: tf.Variable(pt, name='p') for ve, pt in p_T.items()}

            # for nullity check
            self._abs_sum_p = tf.reduce_sum(
                tf.stack([tf.reduce_sum(tf.abs(p), name='l1_p') for p in self.p_dict.values()]))

            # build Lagrangian function
            with tf.name_scope('lagrangian'):
                self.lagrangians_dict = {ve: dot(p, self.tr_dynamics) for ve, p in self.p_dict.items()}

            # TODO read below
            # In the following {if else} block there are two ways of computing the dynamics of
            # the update of the Lagrangian multipliers. The procedures SHOULD produce the same
            # result, however, for some strange reason, if w is indeed a state variable that
            # contains auxiliary components (e.g. velocity in Momentum algorithm, ...) there is a
            # difference in the two methods and the right one is the first one. This is possibly
            # due to the order in which the derivatives are taken by tensorflow, but further
            # investigation is necessary.

            # detects if some auxiliary variables are used.
            if isinstance(self.w, MergedVariable) and \
                    any([isinstance(v, MergedVariable) for v in self.w.var_list(Vl_Mode.RAW)]):
                state_components = self.w.var_list(Vl_Mode.TENSOR)

                # equation (8)
                self.p_dynamics = {ve: tf.concat(tf.gradients(lagrangian, state_components), 0)
                                   for ve, lagrangian in self.lagrangians_dict.items()}
            else:
                # equation (8)
                self.p_dynamics = {ve: tf.gradients(lagrangian, self.w_t)[0]
                                   for ve, lagrangian in self.lagrangians_dict.items()}

            # equation (7)
            self._bk_ops = [self.p_dict[ve].assign(self.p_dynamics[ve])
                            for ve in self.val_error_dict]  # TODO add here when hp are sequ.

        with tf.name_scope('w_history_ops'):
            self._w_placeholder = tf.placeholder(self.w_t.dtype)

            self._back_hist_op = self.w.assign(self._w_placeholder)

        with tf.name_scope('hyper_derivatives'):
            # equation (10) without summation.
            self.hyper_derivatives = [
                (self.val_error_dict[ve], tf.gradients(lagrangian, self.val_error_dict[ve]))
                for ve, lagrangian in self.lagrangians_dict.items()
            ]  # list of couples (hyper_list, list of symbolic hyper_gradients)  (lists are unhashable!)

        with tf.name_scope('hyper_gradients'):  # ADDED 28/3/17 keeps track of hyper-gradients as tf.Variable
            self._grad_wrt_hypers_placeholder = tf.placeholder(tf.float32, name='placeholder')
            # TODO this placeholder is not really necessary... just added to minimize the changes needed
            # (merge with RICCARDO)

            self.hyper_gradient_vars = [tf.Variable(tf.zeros_like(hyp), name=simple_name(hyp))
                                        for hyp in self.hyper_list]
            self.hyper_gradients_dict = {hyp: hgv for hyp, hgv  # redundant.. just for comfort ..
                                         in zip(self.hyper_list, self.hyper_gradient_vars)}

            self._hyper_assign_ops = {h: v.assign(self._grad_wrt_hypers_placeholder)
                                      for h, v in self.hyper_gradients_dict.items()}
class HyperOptimizer:
    """
    Interface class for gradient-based hyperparameter optimization methods.
    """

    def __init__(self, optimizer, hyper_dict, method, hyper_grad_kwargs=None,
                 hyper_optimizer_class=AdamOptimizer, **optimizers_kwargs):
        """
        Interface instance of gradient-based hyperparameter optimization methods.

        :param optimizer: parameter optimization dynamics (obtained from `Optimizer.create` methods)
        :param hyper_dict: dictionary of validation errors and list of hyperparameters to be optimized
        :param method: method with which to compute hyper-gradients: Forward or Reverse-Ho
                       (must be one of `ReverseHG`, `ForwardHG`)
        :param hyper_grad_kwargs: dictionary of keyword arguments for `HyperGradient` classes
                                  (usually None). The dictionary is copied, the caller's object is
                                  left untouched.
        :param hyper_optimizer_class: (default Adam) Optimizer class for optimization of the
                                      hyperparameters; if None no hyperparameter optimizer is created
        :param optimizers_kwargs: keyword arguments for hyperparameter optimizers (like hyper-learning rate)
        """
        assert method in [ReverseHG, ForwardHG]
        assert hyper_optimizer_class is None or issubclass(hyper_optimizer_class, Optimizer)
        assert isinstance(hyper_dict, dict)
        assert isinstance(optimizer, Optimizer)

        # Work on a private copy: a 'global_step' entry is added below and the caller's
        # dictionary must not be modified as a side effect of building this object.
        hyper_grad_kwargs = dict(hyper_grad_kwargs) if hyper_grad_kwargs else {}

        self.hyper_iteration_step = GlobalStep(name='hyper_iteration_step')
        # evaluated in `initialize` to detect whether the hyper-iteration counter was ever initialized
        self._report_hyper_it_init = tf.report_uninitialized_variables(
            [self.hyper_iteration_step.var])
        # self.hyper_batch_step = GlobalStep(name='hyper_batch_step')
        self.hyper_batch_step = GlobalStep(name='batch_step')

        # automatically links eventual optimizer global step (like in Adam) to HyperGradient global step.
        # A fresh GlobalStep is built only when neither the caller nor the optimizer provides one.
        if 'global_step' not in hyper_grad_kwargs:
            hyper_grad_kwargs['global_step'] = (
                optimizer.global_step if hasattr(optimizer, 'global_step') else GlobalStep())

        # automatically links eventual hyper-optimizer global step (like in Adam) to batch_step
        if hyper_optimizer_class == AdamOptimizer:
            optimizers_kwargs['global_step'] = self.hyper_batch_step
            optimizers_kwargs.setdefault('eps', 1.e-14)

        self.hyper_gradients = method(optimizer, hyper_dict, **hyper_grad_kwargs)

        if hyper_optimizer_class:
            # noinspection PyTypeChecker
            self.hyper_optimizers = create_hyperparameter_optimizers(
                self.hyper_gradients, optimizer_class=hyper_optimizer_class, **optimizers_kwargs)
        else:
            self.hyper_optimizers = None

    @property
    def hyper_list(self):
        """
        :return: list of hyperparameters that are/will be optimized
        """
        return self.hyper_gradients.hyper_list

    def initialize(self, session=None, complete_reinitialize=False):
        """
        Initialize all tensorflow variables. This method has two behaviours:

        - first time it is called (after entering a Session run block) or when flag
          `complete_reinitialize` is `True` initializes all the relevant variables
        - subsequent times, reinitialize only model variables (next hyper-iteration).

        :param session: optional tensorflow session (if None default session is used)
        :param complete_reinitialize: (default `False`) if True reinitialize hyper-step counts and
                                      hyperparameter optimizers regardless of previous initializations
        :return: True if this is the first initialization
        """
        ss = session if session is not None else tf.get_default_session()
        assert ss, 'No default session.'

        # `as_default` makes every `.run()` / `.eval()` below use the session that was asked for
        # (it changes nothing when `ss` already is the default session)
        with ss.as_default():
            never_initialized = bool(self._report_hyper_it_init.eval())

            if complete_reinitialize or never_initialized:
                # never initialized or subsequent run of
                # Session run block (for instance in a Ipython book)
                tf.variables_initializer(self.hyper_gradients.hyper_list).run()
                if self.hyper_optimizers:
                    for opt in self.hyper_optimizers:
                        opt.support_variables_initializer().run()
                tf.variables_initializer(
                    [self.hyper_iteration_step.var, self.hyper_batch_step.var]).run()
            else:
                self.hyper_iteration_step.increase.eval()

            self.hyper_gradients.initialize(session=session)

        return never_initialized

    def run(self, T, train_feed_dict_supplier=None, val_feed_dict_suppliers=None,
            hyper_constraints_ops=None,
            _debug_no_hyper_update=False):  # TODO add session parameter
        """
        Computes the hyper-gradients and, unless `_debug_no_hyper_update` is set, performs one
        update of the hyperparameters. Operations are run in the default session.

        :param T: number of steps
        :param train_feed_dict_supplier: forwarded to `hyper_gradients.run_all`
        :param val_feed_dict_suppliers: forwarded to `hyper_gradients.run_all`
        :param hyper_constraints_ops: (list of) either callable (no parameters) or tensorflow ops,
                                      executed after the hyperparameter update
        :param _debug_no_hyper_update: if True only the hyper-gradients are computed: no update,
                                       no constraint and no increase of the hyper-batch step
        :return: None
        """
        # idea: if steps == T then do full reverse, or forward, otherwise do trho and rtho
        # after all the main difference is that if we go with the full version, after the gradient has been
        # computed, the method `initialize()` is called.

        self.hyper_gradients.run_all(
            T, train_feed_dict_supplier=train_feed_dict_supplier,
            val_feed_dict_suppliers=val_feed_dict_suppliers,
            hyper_batch_step=self.hyper_batch_step.eval())

        # NOTE(review): the constraint ops and the step increase are taken to belong to the
        # hyper-update branch (nesting reconstructed from a whitespace-collapsed source) - confirm
        if _debug_no_hyper_update:
            return

        # `hyper_optimizers` is None when the object was built with `hyper_optimizer_class=None`
        # (same guard as in `initialize`)
        if self.hyper_optimizers:
            for hyper_opt in self.hyper_optimizers:
                tf.get_default_session().run(hyper_opt.assign_ops)

        # explicit checks instead of `if hyper_constraints_ops:` since the truth value of a
        # single tf.Tensor cannot be taken
        no_constraints = hyper_constraints_ops is None or (
            isinstance(hyper_constraints_ops, (list, tuple)) and not hyper_constraints_ops)
        if not no_constraints:
            for op in as_list(hyper_constraints_ops):
                if callable(op):
                    op()
                else:
                    op.eval()

        self.hyper_batch_step.increase.eval()
def __init__(self, optimizer, hyper_dict, global_step=None, devices=None):
    """
    Creates a new object that computes the hyper-gradient of validation errors in forward mode.
    See section 3.2 of Forward and Reverse Gradient-Based Hyperparameter Optimization
    (https://arxiv.org/abs/1703.01785)
    Note that this class only computes the hyper-gradient and does not perform hyperparameter optimization.

    :param optimizer: instance of Optimizer class, which represent the dynamics with which the model
                      parameters are updated
    :param hyper_dict: A dictionary of `{validation_error: hyper_pairs_list}` where
                       `validation_error` is a scalar tensor and `hyper_pairs_list` is single or a
                       list of pairs (hyperparameter, derivative_of_dynamics_w.r.t hyperparameter)
                       (matrix B_t in the paper). An entry that is not a tuple or a list is taken
                       as the hyperparameter itself and the derivative is requested to
                       `optimizer.auto_d_dynamics_d_hyper`.
                       Unfortunately tensorflow does not compute Jacobians efficiently yet
                       (suggestions or pointers are welcome)
    :param global_step: (optional) instance of `GlobalStep` to keep track of the optimization step
    :param devices: (optional) device or list of devices: the operations built for the k-th
                    hyperparameter are placed on `devices[k % len(devices)]`
    """
    assert isinstance(optimizer, Optimizer)

    self.w = optimizer.raw_w  # might be variable or MergedVariable (never tested on Variables actually) ...
    self.w_t = self.w  # MergedVariable.get_tensor(self.w)  # this is always a tensor

    self.tr_dynamics = optimizer.dynamics

    # the value is wrapped in a 1-tuple so that a tuple argument is reported by the
    # assertion instead of breaking the `%` formatting
    assert isinstance(hyper_dict, dict), '%s not allowed type. Should be a dict of (tf.Tensor,' \
                                         'list[(hyper-parameter, d_dynamics_d_hyper-parameter)]' % (hyper_dict,)

    self.hyper_list = []  # more comfortable to use
    self.d_dynamics_d_hypers = []
    self.hyper_dict = {}  # standardizes hyper_dict parameter
    self._inverse_hyper_dict = {}  # hyperparameter-validation error pairs
    for k, v in hyper_dict.items():
        list_v = as_list(v)
        # assert isinstance(list_v[0], tuple), "Something's wrong in hyper_dict %s, at least in entry%s. Check!"\
        #                                      % (hyper_dict, list_v[0])
        self.hyper_dict[k] = list_v  # be sure values are lists!
        for entry in list_v:
            is_pair = isinstance(entry, (tuple, list))
            hyp = entry[0] if is_pair else entry
            # keyed by the hyperparameter itself (not by the whole pair): this is the key
            # used for the look-up when the hyper-gradients are built below
            self._inverse_hyper_dict[hyp] = k
            self.hyper_list.append(hyp)
            self.d_dynamics_d_hypers.append(
                entry[1] if is_pair
                else optimizer.auto_d_dynamics_d_hyper(entry))  # try to compute it automatically

    self.val_errors = []  # will follow the same order as hyper_list
    for hyp in self.hyper_list:  # find the right validation error for hyp!
        for k, v in hyper_dict.items():
            all_hypers = [pair[0] if isinstance(pair, (list, tuple)) else pair for pair in as_list(v)]
            if hyp in all_hypers:
                self.val_errors.append(k)
                break

    for i, der in enumerate(self.d_dynamics_d_hypers):  # this automatic casting at the moment works only for SGD
        if not isinstance(der, ZMergedMatrix):
            print('Try casting d_dynamics_d_hyper to ZMergedMatrix')
            self.d_dynamics_d_hypers[i] = ZMergedMatrix(der)
            print('Successful')

    devices = as_list(devices)  # at most will be [None]

    # NOTE(review): nesting of the scopes below reconstructed from a
    # whitespace-collapsed source - confirm
    with self.w_t.graph.as_default():
        # global step
        self.global_step = global_step or GlobalStep()

        self.fw_ops = optimizer.assign_ops  # add here when hypers are sequence (...)

        with tf.name_scope('ForwardHG'):
            # Creates one z per hyper-parameter and assumes that each hyper-parameter is a vector
            self.grad_wrt_hypers, self.zs, self.zs_dynamics, self._zs_assigns = [], [], [], []
            self.hyper_gradient_vars, self._hyper_assign_ops = [], []

            self.grad_val_err = {}
            for ve in self.hyper_dict.keys():
                gve = tf.gradients(ve, self.w_t)[0]
                # hyper-gradient check: performed on the gradient itself, before it is
                # wrapped by tf.identity
                assert gve is not None, 'Some gradient of the validation error is None!'
                self.grad_val_err[ve] = tf.identity(
                    gve, name='grad_val_err_%s' % simple_name(ve.name))

            self._gve_inv_dict = {hyp: self.grad_val_err[ve]
                                  for hyp, ve in self._inverse_hyper_dict.items()}

            for k, hyp in enumerate(self.hyper_list):
                with tf.device(devices[k % len(devices)]):
                    self.zs.append(self._create_z(hyp))

                    with tf.name_scope('Z_dynamics'):
                        self.zs_dynamics.append(
                            optimizer.jac_z(self.zs[k]) + self.d_dynamics_d_hypers[k])
                        self._zs_assigns.append(self.zs[k].assign(self.zs_dynamics[k]))

                    self.grad_wrt_hypers.append(
                        dot(self._gve_inv_dict[hyp], self.zs[k], name='hyper_grad_wrt_h'))

                    with tf.name_scope('hyper_gradients'):
                        self.hyper_gradient_vars.append(
                            tf.Variable(tf.zeros_like(hyp), name=simple_name(hyp)))
                        self._hyper_assign_ops.append(
                            self.hyper_gradient_vars[k].assign(self.grad_wrt_hypers[k]))

            # final operations
            self.hyper_gradients_dict = {hyp: hgv for hyp, hgv  # redundant.. just for comfort ..
                                         in zip(self.hyper_list, self.hyper_gradient_vars)}