Example #1
    def __init__(self,
                 optimizer,
                 hyper_dict,
                 method,
                 hyper_grad_kwargs=None,
                 hyper_optimizer_class=AdamOptimizer,
                 **optimizers_kwargs):
        """
        Interface instance of gradient-based hyperparameter optimization methods.

        :param optimizer: parameter optimization dynamics (obtained from `Optimizer.create` methods)
        :param hyper_dict: dictionary of validation errors and list of hyperparameters to be optimized
        :param method:  method used to compute the hyper-gradients: `ForwardHG`
                        or `ReverseHG`
        :param hyper_grad_kwargs: dictionary of keyword arguments for `HyperGradient` classes (usually None)
        :param hyper_optimizer_class: (default Adam) Optimizer class for optimization of the hyperparameters
        :param optimizers_kwargs: keyword arguments for hyperparameter optimizers (like hyper-learning rate)
        """
        assert method in [ReverseHG, ForwardHG]
        assert hyper_optimizer_class is None or issubclass(
            hyper_optimizer_class, Optimizer)
        assert isinstance(hyper_dict, dict)
        assert isinstance(optimizer, Optimizer)

        if not hyper_grad_kwargs: hyper_grad_kwargs = {}
        self.hyper_iteration_step = GlobalStep(name='hyper_iteration_step')
        self._report_hyper_it_init = tf.report_uninitialized_variables(
            [self.hyper_iteration_step.var])
        # self.hyper_batch_step = GlobalStep(name='hyper_batch_step')
        self.hyper_batch_step = GlobalStep(name='batch_step')

        # automatically links the optimizer's global step (if any, like in Adam) to the HyperGradient global step
        hyper_grad_kwargs['global_step'] = hyper_grad_kwargs.get(
            'global_step', optimizer.global_step if hasattr(
                optimizer, 'global_step') else GlobalStep())

        # automatically links the hyper-optimizer's global step (like in Adam) to the hyper batch step
        if hyper_optimizer_class == AdamOptimizer:
            optimizers_kwargs['global_step'] = self.hyper_batch_step
            optimizers_kwargs.setdefault('eps', 1.e-14)

        self.hyper_gradients = method(optimizer, hyper_dict,
                                      **hyper_grad_kwargs)

        if hyper_optimizer_class:
            # noinspection PyTypeChecker
            self.hyper_optimizers = create_hyperparameter_optimizers(
                self.hyper_gradients,
                optimizer_class=hyper_optimizer_class,
                **optimizers_kwargs)
        else:
            self.hyper_optimizers = None
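The `hyper_dict` argument appears in all of the examples below. The following minimal sketch (TensorFlow 1.x) shows one way it might be built; the variables, the stand-in validation error and their names are assumptions for illustration only, not part of the snippet above.

# Hypothetical sketch of the `hyper_dict` argument: keys are scalar
# validation-error tensors, values are the hyperparameter variables
# (single or list) to be tuned against that error.
import tensorflow as tf

w = tf.Variable(tf.zeros([5]))
lr = tf.Variable(1e-2, name='lr')            # hyperparameter
rho = tf.Variable(1e-4, name='rho')          # hyperparameter
validation_error = tf.reduce_sum(w ** 2)     # stand-in scalar validation error

hyper_dict = {validation_error: [lr, rho]}   # form accepted by ReverseHG (Example #6)
# For ForwardHG (Example #5) the values are (hyperparameter, d_dynamics_d_hyper) pairs,
# unless the derivative can be derived automatically as in Example #8.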
Example #2
    def __init__(self,
                 forward_hyper_grad,
                 hyperparameter_optimizers,
                 hyper_projections=None,
                 hyper_step=None):
        """
        Helper class to perform Real Time Hyperparameter optimization.
        See section 3.3 of Forward and Reverse Gradient-Based Hyperparameter Optimization
        (https://arxiv.org/abs/1703.01785)

        :param forward_hyper_grad:          instance of `ForwardHyperGradient`. Used to compute hyper-gradients
        :param hyperparameter_optimizers:   single or list of Optimizer for the hyper-parameter descent procedure
        :param hyper_projections:           (optional) list of assign ops that perform projections
                                            onto a convex subset of the hyperparameter space.
        :param hyper_step:                  (optional) instance of `GlobalStep` class that keeps track of the number
                                            of hyper-batches performed so far.
        """
        assert isinstance(forward_hyper_grad, ForwardHyperGradient)
        self.direct_doh = forward_hyper_grad

        assert isinstance(hyperparameter_optimizers, (list, Optimizer)), "hyper_opt_dicts should be a single " \
                                                                         "Optimizer or a list of Optimizer. Instead " \
                                                                         "is %s" % hyperparameter_optimizers
        self.hyper_opt_dicts = as_list(hyperparameter_optimizers)

        self.hyper_projections = hyper_projections or []

        self.hyper_step = hyper_step or GlobalStep()
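To make the real-time idea of section 3.3 concrete, here is a small self-contained NumPy sketch (independent of the class above): the forward-mode quantity z is kept up to date along the parameter dynamics, and the hyperparameter is adjusted every few steps from the partial hyper-gradient, with a projection keeping it feasible. The 1-D quadratic problem, the step sizes and the projection are illustrative assumptions.

# NumPy sketch of Real-Time Hyperparameter Optimization (RTHO, sec. 3.3 of
# arXiv:1703.01785): SGD with learning rate `lr` as the hyperparameter on the
# toy loss 0.5*a*w^2, which is also used as the (stand-in) validation error.
import numpy as np

a, w, lr, z = 2.0, 5.0, 0.05, 0.0      # z_0 = dw_0/dlr = 0
hyper_lr, delta = 1e-3, 10             # hyper-learning rate, hyper-batch size

for t in range(200):
    g = a * w                          # gradient of the training loss
    z = (1.0 - lr * a) * z - g         # forward recursion: z_{t+1} = (dPhi/dw) z_t + dPhi/dlr
    w = w - lr * g                     # parameter dynamics Phi(w, lr)
    if (t + 1) % delta == 0:           # hyper-batch: real-time hyperparameter update
        hyper_grad = (a * w) * z       # partial hyper-gradient dE/dw * z_t
        lr = max(lr - hyper_lr * hyper_grad, 0.0)   # projection onto lr >= 0

print('final lr', lr, 'final w', w)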
Example #3
    def create(w,
               lr=1.e-3,
               beta1=.9,
               beta2=.999,
               eps=1.e-8,
               global_step=None,
               loss=None,
               grad=None,
               w_is_state=True,
               name='Adam'):  # FIXME rewrite this
        """
        Adam optimizer.
        :param w: full weight vector (or a `MergedVariable` that also contains the state m, v)
        :param lr: learning rate
        :param beta1: first momentum factor
        :param beta2: second momentum factor
        :param eps: term for numerical stability
        :param global_step: (optional) instance of `GlobalStep`, used for the bias correction
        :param loss: scalar loss tensor (needed if `grad` is not given)
        :param grad: gradient tensor (computed from `loss` if not given)
        :param w_is_state: whether `w` is a `MergedVariable` that includes the moment estimates m and v
        :param name: name scope for the optimizer operations
        :return: an `AdamOptimizer` instance
        """
        # beta1_pow = tf.Variable(beta1)  # for the moment skip the implementation of this optimization.
        assert grad is not None or loss is not None, "Either grad or loss must be given"
        with tf.name_scope(name):
            if w_is_state:

                assert isinstance(
                    w,
                    MergedVariable), "%s is not an instance of MergedVariable" % w
                assert len(w.var_list(Vl_Mode.TENSOR)) == 3, "%s is not augmented correctly, len of w.var_list(" \
                                                             "Vl_Mode.TENSOR) should be 3, but is " \
                                                             "%d" % (w, len(w.var_list(Vl_Mode.TENSOR)))

                w_base, m, v = w.var_list(Vl_Mode.TENSOR)
            else:
                w_base = w
                m = tf.Variable(tf.zeros(w.get_shape()))
                v = tf.Variable(tf.zeros(w.get_shape()))
            if grad is None:
                grad = tf.gradients(loss, w_base)[0]
            if global_step is None:
                global_step = GlobalStep()

            m_k = beta1 * m + (1. - beta1) * grad
            v_k = beta2 * v + (1. - beta2) * grad**2

            lr_k = lr * tf.sqrt(
                1. - tf.pow(beta2, tf.to_float(global_step.var + 1))) / (
                    1. - tf.pow(beta1, tf.to_float(global_step.var + 1)))
            w_base_k = w_base - lr_k * (
                beta1 * m +
                (1. - beta1) * grad) / tf.sqrt(beta2 * v +
                                               (1. - beta2) * grad**2 + eps)

            jac_z = None  # TODO!!!!!

            # noinspection PyUnresolvedReferences
            dynamics = tf.concat([w_base_k, m_k, v_k], 0) if w_base_k.get_shape().ndims != 0 \
                else tf.stack([w_base_k, m_k, v_k], 0)  # scalar case

            if w_is_state:
                w_base_mv, m_mv, v_mv = w.var_list(Vl_Mode.RAW)
            else:
                w_base_mv, m_mv, v_mv = w_base, m, v

            return AdamOptimizer(w=w_base,
                                 m=m,
                                 v=v,
                                 global_step=global_step,
                                 assign_ops=[
                                     w_base_mv.assign(w_base_k),
                                     m_mv.assign(m_k),
                                     v_mv.assign(v_k)
                                 ],
                                 dynamics=dynamics,
                                 jac_z=jac_z,
                                 gradient=grad,
                                 learning_rate=lr,
                                 momentum_factor=beta1,
                                 second_momentum_factor=beta2,
                                 raw_w=w)
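The transition implemented above is a single Adam step on the state triple (w, m, v), with the bias correction folded into lr_k and eps kept inside the square root. The following self-contained NumPy sketch reproduces that step element-wise; the values and the toy gradient are illustrative only.

# NumPy sketch of the Adam transition built by `create` above (eps inside
# the sqrt, bias correction through lr_k). Illustrative values.
import numpy as np

lr, beta1, beta2, eps = 1e-3, 0.9, 0.999, 1e-8
w = np.array([1.0, -2.0])
m = np.zeros_like(w)
v = np.zeros_like(w)
t = 0                                      # global_step.var

grad = 2.0 * w                             # e.g. gradient of sum(w**2)

m_k = beta1 * m + (1.0 - beta1) * grad
v_k = beta2 * v + (1.0 - beta2) * grad ** 2
lr_k = lr * np.sqrt(1.0 - beta2 ** (t + 1)) / (1.0 - beta1 ** (t + 1))
w_k = w - lr_k * m_k / np.sqrt(v_k + eps)  # eps inside the sqrt, as in the code above

print(w_k, m_k, v_k)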
Example #4
    def create(w,
               lr=1.e-3,
               beta1=.9,
               beta2=.999,
               eps=1.e-6,
               global_step=None,
               loss=None,
               grad=None,
               w_is_state=True,
               name='Adam',
               _debug_jac_z=False):  # FIXME rewrite this
        """
        Adam optimizer.
        
        :param w: full weight vector (or a `MergedVariable` that also contains the state m, v)
        :param lr: learning rate
        :param beta1: first momentum factor
        :param beta2: second momentum factor
        :param eps: term for numerical stability (higher than the proposed default)
        :param global_step: (optional) instance of `GlobalStep`, used for the bias correction
        :param loss: scalar loss tensor (needed if `grad` is not given)
        :param grad: gradient tensor (computed from `loss` if not given)
        :param w_is_state: whether `w` is a `MergedVariable` that includes the moment estimates m and v
        :param name: name scope for the optimizer operations
        :param _debug_jac_z: if `True`, build the full Jacobian of the dynamics explicitly (slow; debugging only)
        :return: an `AdamOptimizer` instance
        """
        # beta1_pow = tf.Variable(beta1)  # for the moment skip the implementation of this optimization.
        assert grad is not None or loss is not None, "Either grad or loss must be given"
        with tf.name_scope(name):
            if w_is_state:

                assert isinstance(
                    w,
                    MergedVariable), "%s is not an instance of MergedVariable" % w
                assert len(w.var_list(VlMode.TENSOR)) == 3, "%s is not augmented correctly, len of w.var_list(" \
                                                            "VlMode.TENSOR) should be 3, but is " \
                                                            "%d" % (w, len(w.var_list(VlMode.TENSOR)))

                w_base, m, v = w.var_list(VlMode.TENSOR)
            else:
                w_base = w
                m = tf.Variable(tf.zeros(w.get_shape()))
                v = tf.Variable(tf.zeros(w.get_shape()))
            if grad is None:
                grad = tf.gradients(loss, w_base)[0]
            if global_step is None:
                global_step = GlobalStep()

            m_k = tf.multiply(beta1, m) + (1. - beta1) * grad
            v_k = tf.multiply(beta2, v) + (1. - beta2) * grad**2

            bias_correction = tf.sqrt(
                1. - tf.pow(beta2, tf.to_float(global_step.var + 1))) / (
                    1. - tf.pow(beta1, tf.to_float(global_step.var + 1)))
            lr_k = lr * bias_correction

            v_epsilon_k = tf.multiply(beta2, v) + (1. - beta2) * grad**2 + eps
            v_tilde_k = tf.sqrt(v_epsilon_k)  # + eps
            """
            to make it the same as tensorflow adam optimizer the eps should go after the square root... this
            brings however some problems in the computation of the hypergradient, therefore we put it inside!
            SHOULD BETTER INVESTIGATE THE ISSUE. (maybe the jacobian computation should be done again)
            """

            # TODO THESE QUANTITIES ARE NEEDED FOR FORWARD-HG IN VARIOUS PLACES... FIND A BETTER WAY TO COMPUTE THEM
            # ONLY IF NEEDED
            v_k_eps_32 = tf.pow(v_epsilon_k, 1.5)
            pre_j_11_out = -lr_k * ((1. - beta1) / v_tilde_k -
                                    ((1. - beta2) * grad * m_k) / v_k_eps_32)
            pre_j_31_out = 2. * (1. - beta2) * grad

            w_base_k = w_base - lr_k * (tf.multiply(beta1, m) +
                                        (1. - beta1) * grad) / v_tilde_k

            # noinspection PyUnresolvedReferences
            def _jac_z(z):
                if _debug_jac_z:  # I guess this would take an incredibly long time to compile for large systems
                    d = dynamics.get_shape().as_list()[0] // 3
                    r, u, s = z.var_list(VlMode.TENSOR)

                    j11 = tf.stack([
                        tf.gradients(w_base_k[i], w_base)[0] for i in range(d)
                    ])
                    j12 = tf.stack(
                        [tf.gradients(w_base_k[i], m)[0] for i in range(d)])
                    j13 = tf.stack(
                        [tf.gradients(w_base_k[i], v)[0] for i in range(d)])
                    j1 = tf.concat([j11, j12, j13], axis=1)
                    jz1 = tf.matmul(j11, r) + tf.matmul(j12, u) + tf.matmul(
                        j13, s)

                    # second block
                    j21 = tf.stack(
                        [tf.gradients(m_k[i], w_base)[0] for i in range(d)])
                    j22 = tf.stack(
                        [tf.gradients(m_k[i], m)[0] for i in range(d)])
                    j23 = tf.stack(
                        [tf.gradients(m_k[i], v)[0] for i in range(d)])
                    j2 = tf.concat([j21, j22, j23], axis=1)
                    jz2 = tf.matmul(j21, r) + tf.matmul(j22, u) + tf.matmul(
                        j23, s)

                    # third block
                    j31 = tf.stack(
                        [tf.gradients(v_k[i], w_base)[0] for i in range(d)])
                    j32 = tf.stack(
                        [tf.gradients(v_k[i], m)[0] for i in range(d)])
                    j33 = tf.stack(
                        [tf.gradients(v_k[i], v)[0] for i in range(d)])
                    j3 = tf.concat([j31, j32, j33], axis=1)
                    jz3 = tf.matmul(j31, r) + tf.matmul(j32, u) + tf.matmul(
                        j33, s)

                    tf.concat([j1, j2, j3], axis=0, name='Jacobian')

                    return ZMergedMatrix([jz1, jz2, jz3])

                else:
                    assert loss is not None, 'Should specify loss to use jac_z'

                    r, u, s = z.var_list(VlMode.TENSOR)

                    with tf.name_scope('Jac_Z'):

                        hessian_r_product = hvp(loss=loss,
                                                w=w_base,
                                                v=r,
                                                name='hessian_r_product')
                        # hessian_r_product = hvp(loss=loss, w=w.tensor, v=z.tensor, name='hessian_r_product')[:d, :d]

                        j_11_r_tilde = l_diag_mul(pre_j_11_out,
                                                  hessian_r_product,
                                                  name='j_11_r_tilde')
                        j_11_r = tf.identity(j_11_r_tilde + r, 'j_11_r')

                        j_12_u_hat = tf.identity(-lr_k * beta1 / v_tilde_k,
                                                 name='j_12_u_hat')
                        j_12_u = l_diag_mul(j_12_u_hat, u, name='j_12_u')

                        j_13_s_hat = tf.identity(lr_k * beta2 * m_k /
                                                 (2 * v_k_eps_32),
                                                 name='j_13_s_hat')
                        j_13_s = l_diag_mul(j_13_s_hat, s, name='j_13_s')

                        jac_z_1 = tf.identity(j_11_r + j_12_u + j_13_s,
                                              name='jac_z_1')
                        # end first block

                        j_21_r = tf.identity((1. - beta1) * hessian_r_product,
                                             name='j_21_r')
                        j_22_u = tf.identity(beta1 * u, name='j_22_u')
                        # j_23_s = tf.zeros_like(s)  # would be...

                        jac_z_2 = tf.identity(j_21_r + j_22_u, name='jac_z_2')
                        # end second block

                        j_31_r = l_diag_mul(pre_j_31_out,
                                            hessian_r_product,
                                            name='j_31_r')
                        # j_32_u = tf.zeros_like(u)  # would be
                        j_33_s = tf.identity(beta2 * s, name='j_33_s')
                        jac_z_3 = tf.identity(j_31_r + j_33_s, name='jac_z_3')

                        res = [jac_z_1, jac_z_2, jac_z_3]
                        # print('res', res)

                        return ZMergedMatrix(res)

            # algorithmic partial derivatives (as functions, so that we do not create unnecessary nodes)
            def _d_dyn_d_lr(_name):
                res = [
                    -bias_correction * m_k / v_tilde_k,
                    tf.zeros_like(m_k),
                    tf.zeros_like(v_k)  # just aesthetics
                ]
                return ZMergedMatrix(res, name=_name)

            def _d_dyn_d_hyp_gl(cross_der_l, _name):
                dwt_dl_hat = pre_j_11_out
                dwt_dl = l_diag_mul(dwt_dl_hat, cross_der_l)

                dmt_dl = (1 - beta1) * cross_der_l

                dvt_dl = l_diag_mul(pre_j_31_out, cross_der_l)
                return ZMergedMatrix([dwt_dl, dmt_dl, dvt_dl], name=_name)

            # noinspection PyUnresolvedReferences
            dynamics = tf.concat([w_base_k, m_k, v_k], 0) if w_base_k.get_shape().ndims != 0 \
                else tf.stack([w_base_k, m_k, v_k], 0)  # scalar case

            if w_is_state:
                w_base_mv, m_mv, v_mv = w.var_list(VlMode.RAW)
            else:
                w_base_mv, m_mv, v_mv = w_base, m, v

            return AdamOptimizer(w=w_base,
                                 m=m,
                                 v=v,
                                 global_step=global_step,
                                 assign_ops=[
                                     w_base_mv.assign(w_base_k),
                                     m_mv.assign(m_k),
                                     v_mv.assign(v_k)
                                 ],
                                 dynamics=dynamics,
                                 jac_z=_jac_z,
                                 gradient=grad,
                                 learning_rate=lr,
                                 momentum_factor=beta1,
                                 second_momentum_factor=beta2,
                                 raw_w=w,
                                 loss=loss,
                                 d_dyn_d_lr=_d_dyn_d_lr,
                                 d_dyn_d_hyper=_d_dyn_d_hyp_gl)
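The `_jac_z` function above relies on closed-form entries of the Jacobian of the Adam transition map (w, m, v) -> (w_k, m_k, v_k). The following self-contained NumPy sketch checks three of those entries in the scalar case against finite differences; the quadratic loss, the state values and the step count are illustrative assumptions, and the blocks involving the loss Hessian (the hvp terms) are omitted for brevity.

# NumPy sketch: finite-difference check of the analytic blocks used by _jac_z:
#   d w_k / d m = -lr_k * beta1 / sqrt(v_eps)   (j_12_u_hat)
#   d m_k / d m = beta1                          (j_22_u)
#   d v_k / d v = beta2                          (j_33_s)
import numpy as np

lr, beta1, beta2, eps, t = 1e-3, 0.9, 0.999, 1e-6, 3
a = 2.0                                    # loss 0.5*a*w^2, so grad = a*w
w, m, v = 0.7, 0.1, 0.05

def step(w, m, v):
    g = a * w
    m_k = beta1 * m + (1.0 - beta1) * g
    v_k = beta2 * v + (1.0 - beta2) * g ** 2
    v_eps = v_k + eps
    lr_k = lr * np.sqrt(1.0 - beta2 ** (t + 1)) / (1.0 - beta1 ** (t + 1))
    w_k = w - lr_k * m_k / np.sqrt(v_eps)
    return w_k, m_k, v_k, lr_k, v_eps

w_k, m_k, v_k, lr_k, v_eps = step(w, m, v)

h = 1e-7
dwk_dm_fd = (step(w, m + h, v)[0] - step(w, m - h, v)[0]) / (2 * h)
dmk_dm_fd = (step(w, m + h, v)[1] - step(w, m - h, v)[1]) / (2 * h)
dvk_dv_fd = (step(w, m, v + h)[2] - step(w, m, v - h)[2]) / (2 * h)

print(dwk_dm_fd, -lr_k * beta1 / np.sqrt(v_eps))   # ~ j_12_u_hat
print(dmk_dm_fd, beta1)                            # ~ j_22_u
print(dvk_dv_fd, beta2)                            # ~ j_33_s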
Example #5
    def __init__(self, optimizer, hyper_dict, global_step=None):
        """
        Creates a new object that computes the hyper-gradient of validation errors in forward mode.
        See section 3.2 of Forward and Reverse Gradient-Based Hyperparameter Optimization
        (https://arxiv.org/abs/1703.01785)
        Note that this class only computes the hyper-gradient and does not perform hyperparameter optimization.

        :param optimizer: instance of Optimizer class, which represents the dynamics with which the model parameters are
                            updated
        :param hyper_dict: A dictionary of `{validation_error: hyper_pairs_list}` where
                            `validation_error` is a scalar tensor and `hyper_pairs_list` is a single pair or a list of
                            pairs (hyperparameter, derivative_of_dynamics_w.r.t hyperparameter)
                            (matrix B_t in the paper). Unfortunately TensorFlow does not compute Jacobians
                            efficiently yet (suggestions or pointers are welcome)
        :param global_step: (optional) instance of `GlobalStep` to keep track of the optimization step
        """
        assert isinstance(optimizer, Optimizer)

        self.w = optimizer.raw_w  # might be variable or MergedVariable (never tested on Variables actually) ...
        self.w_t = MergedVariable.get_tensor(self.w)  # this is always a tensor

        self.tr_dynamics = optimizer.dynamics

        assert isinstance(hyper_dict, dict), '%s not allowed type. Should be a dict of (tf.Tensor, ' \
                                             'list[(hyper-parameter, d_dynamics_d_hyper-parameter)])' % hyper_dict

        self.hyper_list = []  # more comfortable to use
        self.d_dynamics_d_hypers = []
        self.hyper_dict = {}  # standardizes hyper_dict parameter
        for k, v in hyper_dict.items():
            list_v = as_list(v)
            assert isinstance(list_v[0], tuple), "Something's wrong in hyper_dict %s, at least in entry %s. Check!"\
                                                 % (hyper_dict, list_v[0])
            self.hyper_dict[k] = list_v  # be sure values are lists!
            self.hyper_list += [pair[0] for pair in list_v]
            self.d_dynamics_d_hypers += [pair[1] for pair in list_v]

        self.val_errors = []  # will follow the same order as hyper_list
        for hyp in self.hyper_list:  # find the right validation error for hyp!
            for k, v in hyper_dict.items():
                all_hypers = [pair[0] for pair in as_list(v)]
                if hyp in all_hypers:
                    self.val_errors.append(k)
                    break

        for i, der in enumerate(
                self.d_dynamics_d_hypers
        ):  # this automatic casting at the moment works only for SGD
            if not isinstance(der, ZMergedMatrix):
                print('Try casting d_dynamics_d_hyper to ZMergedMatrix')
                self.d_dynamics_d_hypers[i] = ZMergedMatrix(der)
                print('Successful')

        with self.w_t.graph.as_default():
            # global step
            self.global_step = global_step or GlobalStep()

            self.fw_ops = self.w.assign(
                self.tr_dynamics)  # TODO add here when hypers are sequence

            with tf.name_scope('direct_HO'):
                '''
                Creates one z per hyper-parameter and assumes that each hyper-parameter is a vector
                '''
                self.zs = [self._create_z(hyp) for hyp in self.hyper_list]

                self.zs_dynamics = [
                    optimizer.jac_z(z) + dd_dh
                    for z, dd_dh in zip(self.zs, self.d_dynamics_d_hypers)
                ]

                print('z dynamics', self.zs_dynamics[0])
                print('z', self.zs[0])

                self.zs_assigns = [
                    z.assign(z_dyn)
                    for z, z_dyn in zip(self.zs, self.zs_dynamics)
                ]

                self.grad_val_err = [
                    tf.gradients(v_e, self.w_t)[0] for v_e in self.val_errors
                ]
                assert all([
                    g is not None for g in self.grad_val_err
                ]), 'Some gradient of the validation error is None!'

                self.grad_wrt_hypers = [
                    dot(gve, z.tensor)
                    for z, gve in zip(self.zs, self.grad_val_err)
                ]

                with tf.name_scope(
                        'hyper_gradients'
                ):  # ADDED 28/3/17 keeps track of hyper-gradients as tf.Variable
                    self.hyper_gradient_vars = [
                        tf.Variable(tf.zeros_like(hyp), name=simple_name(hyp))
                        for hyp in self.hyper_list
                    ]
                    self.hyper_gradients_dict = {
                        hyp: hgv
                        for hyp, hgv  # redundant.. just for comfort ..
                        in zip(self.hyper_list, self.hyper_gradient_vars)
                    }
                    self._hyper_assign_ops = [
                        v.assign(ght) for v, ght in zip(
                            self.hyper_gradient_vars, self.grad_wrt_hypers)
                    ]
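The z variables created above implement the forward-mode recursion of section 3.2: z_{t+1} = A_t z_t + B_t, followed by the dot product with the gradient of the validation error. The following self-contained NumPy sketch shows the same computation for SGD on a 1-D ridge problem, with the regularization coefficient rho as the hyperparameter, and checks the result against finite differences; the problem, the step size and the horizon are illustrative assumptions.

# NumPy sketch of the forward-mode hyper-gradient (sec. 3.2 of arXiv:1703.01785):
# z_t plays the role of the Z variables, B_t = dPhi/drho is d_dynamics_d_hypers.
import numpy as np

a, b, lr, rho, T = 3.0, 1.0, 0.1, 0.5, 50

def run(rho, return_z=False):
    w, z = 0.0, 0.0
    for _ in range(T):
        g = a * (w - b) + rho * w              # gradient of the training loss
        A = 1.0 - lr * (a + rho)               # dPhi/dw
        B = -lr * w                            # dPhi/drho (matrix B_t in the paper)
        z = A * z + B                          # forward recursion
        w = w - lr * g                         # SGD dynamics Phi(w, rho)
    return (w, z) if return_z else w

w_T, z_T = run(rho, return_z=True)
val_grad = a * (w_T - b)                       # dE/dw with validation error E = 0.5*a*(w-b)^2
hyper_grad = val_grad * z_T                    # dE/drho = dE/dw . z_T

eps = 1e-6
fd = (0.5 * a * (run(rho + eps) - b) ** 2 - 0.5 * a * (run(rho - eps) - b) ** 2) / (2 * eps)
print(hyper_grad, fd)                          # should match closely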
Example #6
    def __init__(self,
                 optimizer,
                 hyper_dict,
                 state_history=None,
                 global_step=None):
        """
        Creates a new object that computes the hyper-gradient of validation errors in reverse mode.
        See section 3.1 of Forward and Reverse Gradient-Based Hyperparameter Optimization
        (https://arxiv.org/abs/1703.01785)
        Note that this class only computes the hyper-gradient and does not perform hyperparameter optimization.

        :param optimizer: instance of Optimizer class, which contains the dynamics with which the model parameters are
                            updated
        :param hyper_dict: A dictionary of `{validation_error: hyperparameter or list_of_hyperparameters}` where
                            `validation_error` is a scalar tensor and `list_of_hyperparameters` is a list
                            of tensorflow variables that represents the hyperparameters
        :param state_history: (default: empty list) state history manager:
                                should implement methods `clear`, `append`, `__getitem__`
        :param global_step: optional instance of GlobalStep class
        """
        assert isinstance(optimizer, Optimizer)

        self.w = optimizer.raw_w  # might be variable or MergedVariable
        #  TODO check if it works also with w as simple Variable
        self.w_t = MergedVariable.get_tensor(self.w)  # this is always a tensor

        self.tr_dynamics = optimizer.dynamics
        assert isinstance(hyper_dict, dict), '%s not allowed type. Should be a dict of ' \
                                             '(tf.Tensor, hyperparameters)' % hyper_dict
        self.val_error_dict = hyper_dict

        self.hyper_list = []
        for k, v in hyper_dict.items():
            self.hyper_list += as_list(v)
            self.val_error_dict[k] = as_list(v)  # be sure that are all lists

        self.w_hist = state_history or []

        with self.w_t.graph.as_default():
            # global step
            self.global_step = global_step or GlobalStep()

            self._fw_ops = optimizer.assign_ops  # TODO add here when hyper-parameters are sequence

            # backward assign ops
            with tf.name_scope('backward'):
                # equation (9)
                p_T = {
                    ve: tf.gradients(ve, self.w_t)[0]
                    for ve, hyp_list in self.val_error_dict.items()
                }  # deltaE(s_t)

                self.p_dict = {
                    ve: tf.Variable(pt, name='p')
                    for ve, pt in p_T.items()
                }

                # for nullity check
                self._abs_sum_p = tf.reduce_sum(
                    tf.stack([
                        tf.reduce_sum(tf.abs(p), name='l1_p')
                        for p in self.p_dict.values()
                    ]))

                # build Lagrangian function
                with tf.name_scope('lagrangian'):
                    self.lagrangians_dict = {
                        ve: dot(p, self.tr_dynamics)
                        for ve, p in self.p_dict.items()
                    }

                # TODO read below
                '''
                In the following {if else} block there are two ways of computing the dynamics of the update
                of the Lagrangian multipliers. The procedures SHOULD produce the same result;
                however, for some strange reason, if w is indeed a state variable that contains auxiliary components
                (e.g. velocity in the Momentum algorithm, ...) there is a difference between the two methods and
                the right one is the first one. This is possibly due to the order in which the derivatives are
                taken by tensorflow, but further investigation is necessary.
                '''
                # detects if some auxiliary variables are used.
                if isinstance(self.w, MergedVariable) and \
                        any([isinstance(v, MergedVariable) for v in self.w.var_list(Vl_Mode.RAW)]):
                    state_components = self.w.var_list(Vl_Mode.TENSOR)

                    # equation (8)
                    self.p_dynamics = {
                        ve:
                        tf.concat(tf.gradients(lagrangian, state_components),
                                  0)
                        for ve, lagrangian in self.lagrangians_dict.items()
                    }
                else:
                    # equation (8)
                    self.p_dynamics = {
                        ve: tf.gradients(lagrangian, self.w_t)[0]
                        for ve, lagrangian in self.lagrangians_dict.items()
                    }  # equation (7)

                self._bk_ops = [
                    self.p_dict[ve].assign(self.p_dynamics[ve])
                    for ve in self.val_error_dict
                ]  # TODO add here when hyper-parameters are a sequence

            with tf.name_scope('w_history_ops'):
                self._w_placeholder = tf.placeholder(self.w_t.dtype)

                self._back_hist_op = self.w.assign(self._w_placeholder)

            with tf.name_scope('hyper_derivatives'):
                # equation (10) without summation.
                self.hyper_derivatives = [
                    (self.val_error_dict[ve],
                     tf.gradients(lagrangian, self.val_error_dict[ve]))
                    for ve, lagrangian in self.lagrangians_dict.items()
                ]  # list of couples (hyper_list, list of symbolic hyper_gradients)  (lists are unhashable!)

            with tf.name_scope(
                    'hyper_gradients'
            ):  # ADDED 28/3/17 keeps track of hyper-gradients as tf.Variable
                self._grad_wrt_hypers_placeholder = tf.placeholder(
                    tf.float32, name='placeholder')
                # TODO this placeholder is not really necessary... just added to minimize the changes needed
                # (merge with RICCARDO)

                self.hyper_gradient_vars = [
                    tf.Variable(tf.zeros_like(hyp), name=simple_name(hyp))
                    for hyp in self.hyper_list
                ]
                self.hyper_gradients_dict = {
                    hyp: hgv
                    for hyp, hgv  # redundant.. just for comfort ..
                    in zip(self.hyper_list, self.hyper_gradient_vars)
                }

                self._hyper_assign_ops = {
                    h: v.assign(self._grad_wrt_hypers_placeholder)
                    for h, v in self.hyper_gradients_dict.items()
                }
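The backward ops above propagate the Lagrange multipliers p along the stored trajectory and accumulate the hyper-gradient contributions. The following self-contained NumPy sketch does the same on the 1-D ridge problem used for the forward sketch in Example #5; the equation numbers follow the comments in the code above, and the problem itself is an illustrative assumption.

# NumPy sketch of the reverse-mode (Lagrangian) hyper-gradient
# (sec. 3.1 of arXiv:1703.01785).
import numpy as np

a, b, lr, rho, T = 3.0, 1.0, 0.1, 0.5, 50

# forward pass: store the trajectory (the role of `state_history` / w_hist)
w, history = 0.0, []
for _ in range(T):
    history.append(w)
    g = a * (w - b) + rho * w
    w = w - lr * g

# backward pass
p = a * (w - b)                    # p_T = dE/dw at w_T, with E = 0.5*a*(w-b)^2  (eq. 9)
hyper_grad = 0.0
for w_t in reversed(history):
    hyper_grad += p * (-lr * w_t)  # p_{t+1} * dPhi/drho(w_t)   (eq. 10)
    p = p * (1.0 - lr * (a + rho)) # p_t = p_{t+1} * dPhi/dw(w_t)  (eq. 8)

print(hyper_grad)                  # matches the forward-mode value from the Example #5 sketch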
Example #7
class HyperOptimizer:
    """
    Interface class for gradient-based hyperparameter optimization methods.
    """
    def __init__(self,
                 optimizer,
                 hyper_dict,
                 method,
                 hyper_grad_kwargs=None,
                 hyper_optimizer_class=AdamOptimizer,
                 **optimizers_kwargs):
        """
        Interface instance of gradient-based hyperparameter optimization methods.

        :param optimizer: parameter optimization dynamics (obtained from `Optimizer.create` methods)
        :param hyper_dict: dictionary of validation errors and list of hyperparameters to be optimized
        :param method:  method used to compute the hyper-gradients: `ForwardHG`
                        or `ReverseHG`
        :param hyper_grad_kwargs: dictionary of keyword arguments for `HyperGradient` classes (usually None)
        :param hyper_optimizer_class: (default Adam) Optimizer class for optimization of the hyperparameters
        :param optimizers_kwargs: keyword arguments for hyperparameter optimizers (like hyper-learning rate)
        """
        assert method in [ReverseHG, ForwardHG]
        assert hyper_optimizer_class is None or issubclass(
            hyper_optimizer_class, Optimizer)
        assert isinstance(hyper_dict, dict)
        assert isinstance(optimizer, Optimizer)

        if not hyper_grad_kwargs: hyper_grad_kwargs = {}
        self.hyper_iteration_step = GlobalStep(name='hyper_iteration_step')
        self._report_hyper_it_init = tf.report_uninitialized_variables(
            [self.hyper_iteration_step.var])
        # self.hyper_batch_step = GlobalStep(name='hyper_batch_step')
        self.hyper_batch_step = GlobalStep(name='batch_step')

        # automatically links the optimizer's global step (if any, like in Adam) to the HyperGradient global step
        hyper_grad_kwargs['global_step'] = hyper_grad_kwargs.get(
            'global_step', optimizer.global_step if hasattr(
                optimizer, 'global_step') else GlobalStep())

        # automatically links the hyper-optimizer's global step (like in Adam) to the hyper batch step
        if hyper_optimizer_class == AdamOptimizer:
            optimizers_kwargs['global_step'] = self.hyper_batch_step
            optimizers_kwargs.setdefault('eps', 1.e-14)

        self.hyper_gradients = method(optimizer, hyper_dict,
                                      **hyper_grad_kwargs)

        if hyper_optimizer_class:
            # noinspection PyTypeChecker
            self.hyper_optimizers = create_hyperparameter_optimizers(
                self.hyper_gradients,
                optimizer_class=hyper_optimizer_class,
                **optimizers_kwargs)
        else:
            self.hyper_optimizers = None

    @property
    def hyper_list(self):
        """

        :return: list of hyperparameters that are/will be optimized
        """
        return self.hyper_gradients.hyper_list

    def initialize(self, session=None, complete_reinitialize=False):
        """
        Initialize all tensorflow variables. This method has two behaviours:

        - the first time it is called (after entering a Session run block), or when the flag `complete_reinitialize`
            is `True`, it initializes all the relevant variables;
        - on subsequent calls, it reinitializes only the model variables (next hyper-iteration).

        :param complete_reinitialize: (default `False`) if `True`, reinitialize the hyper-step counters and the
                                        hyperparameter optimizers regardless of their current state
        :param session: optional tensorflow session (if None the default session is used)

        :return: True if this is the first initialization
        """
        ss = tf.get_default_session()
        assert ss, 'No default session.'

        never_initialized = bool(self._report_hyper_it_init.eval())

        if complete_reinitialize or never_initialized:  # never initialized or subsequent run of a
            # Session run block (for instance in an IPython notebook)
            tf.variables_initializer(self.hyper_gradients.hyper_list).run()
            if self.hyper_optimizers:
                [
                    opt.support_variables_initializer().run()
                    for opt in self.hyper_optimizers
                ]
            tf.variables_initializer(
                [self.hyper_iteration_step.var,
                 self.hyper_batch_step.var]).run()
        else:
            self.hyper_iteration_step.increase.eval()

        self.hyper_gradients.initialize(session=session)

        return never_initialized

    def run(self,
            T,
            train_feed_dict_supplier=None,
            val_feed_dict_suppliers=None,
            hyper_constraints_ops=None,
            _debug_no_hyper_update=False):  # TODO add session parameter
        """

        Runs `T` steps of the parameter optimization dynamics and then performs one hyperparameter update.

        :param T: number of parameter optimization steps
        :param train_feed_dict_supplier: (optional) callable that returns the feed dictionary for the training steps
        :param val_feed_dict_suppliers: (optional) supplier(s) of the feed dictionaries for the validation errors
        :param hyper_constraints_ops: (list of) either callable (no parameters) or tensorflow ops
        :param _debug_no_hyper_update: if `True`, skip the hyperparameter update (debugging only)
        :return:
        """
        # idea: if steps == T then do full reverse, or forward, otherwise do trho and rtho
        # after all the main difference is that if we go with the full version, after the gradient has been
        # computed, the method `initialize()` is called.

        self.hyper_gradients.run_all(
            T,
            train_feed_dict_supplier=train_feed_dict_supplier,
            val_feed_dict_suppliers=val_feed_dict_suppliers,
            hyper_batch_step=self.hyper_batch_step.eval())
        if not _debug_no_hyper_update:
            [
                tf.get_default_session().run(hod.assign_ops)
                for hod in self.hyper_optimizers
            ]
            if hyper_constraints_ops:
                [
                    op() if callable(op) else op.eval()
                    for op in as_list(hyper_constraints_ops)
                ]

            self.hyper_batch_step.increase.eval()
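The `initialize` method above decides between a full initialization and a per-hyper-iteration reinitialization by checking whether the hyper-iteration counter has ever been initialized. The following self-contained TensorFlow 1.x sketch isolates that check; variable names are illustrative, and the boolean test is written with an explicit length check rather than the `bool(...)` cast used above.

# Sketch of the first-initialization check used by initialize():
# tf.report_uninitialized_variables returns the names of the variables in the
# list that are still uninitialized, so a non-empty result means "first time".
import tensorflow as tf

step = tf.Variable(0, name='hyper_iteration_step', trainable=False)
report = tf.report_uninitialized_variables([step])

with tf.Session() as ss:
    print(len(ss.run(report)) > 0)              # True: never initialized -> full init
    ss.run(tf.variables_initializer([step]))
    print(len(ss.run(report)) > 0)              # False: already initialized -> only re-init model vars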
Example #8
    def __init__(self, optimizer, hyper_dict, global_step=None, devices=None):
        """
        Creates a new object that computes the hyper-gradient of validation errors in forward mode.
        See section 3.2 of Forward and Reverse Gradient-Based Hyperparameter Optimization
        (https://arxiv.org/abs/1703.01785)
        Note that this class only computes the hyper-gradient and does not perform hyperparameter optimization.

        :param optimizer: instance of Optimizer class, which represents the dynamics with which the model parameters are
                            updated
        :param hyper_dict: A dictionary of `{validation_error: hyper_pairs_list}` where
                            `validation_error` is a scalar tensor and `hyper_pairs_list` is a single or a list of
                            pairs (hyperparameter, derivative_of_dynamics_w.r.t hyperparameter)
                            (matrix B_t in the paper), or of plain hyperparameters, in which case the derivative is
                            computed automatically via `optimizer.auto_d_dynamics_d_hyper`. Unfortunately TensorFlow
                            does not compute Jacobians efficiently yet (suggestions or pointers are welcome)
        :param global_step: (optional) instance of `GlobalStep` to keep track of the optimization step
        :param devices: (optional) device or list of devices over which the Z variables are distributed (round-robin)
        """
        assert isinstance(optimizer, Optimizer)

        self.w = optimizer.raw_w  # might be variable or MergedVariable (never tested on Variables actually) ...
        self.w_t = self.w  # MergedVariable.get_tensor(self.w)  # this is always a tensor

        self.tr_dynamics = optimizer.dynamics

        assert isinstance(hyper_dict, dict), '%s not allowed type. Should be a dict of (tf.Tensor, ' \
                                             'list[(hyper-parameter, d_dynamics_d_hyper-parameter)])' % hyper_dict

        self.hyper_list = []  # more comfortable to use
        self.d_dynamics_d_hypers = []
        self.hyper_dict = {}  # standardizes hyper_dict parameter
        self._inverse_hyper_dict = {}  # hyperparameter-validation error pairs
        for k, v in hyper_dict.items():
            list_v = as_list(v)
            # assert isinstance(list_v[0], tuple), "Something's wrong in hyper_dict %s, at least in entry%s. Check!"\
            #                                      % (hyper_dict, list_v[0])
            self.hyper_dict[k] = list_v  # be sure values are lists!
            self._inverse_hyper_dict = {
                **self._inverse_hyper_dict,
                **{hyp: k
                   for hyp in list_v}
            }
            self.hyper_list += [
                pair[0] if isinstance(pair, (tuple, list)) else pair
                for pair in list_v
            ]
            self.d_dynamics_d_hypers += [
                pair[1] if isinstance(pair, (tuple, list)) else
                optimizer.auto_d_dynamics_d_hyper(
                    pair)  # try to compute it automatically
                for pair in list_v
            ]

        self.val_errors = []  # will follow the same order as hyper_list
        for hyp in self.hyper_list:  # find the right validation error for hyp!
            for k, v in hyper_dict.items():
                all_hypers = [
                    pair[0] if isinstance(pair, (list, tuple)) else pair
                    for pair in as_list(v)
                ]
                if hyp in all_hypers:
                    self.val_errors.append(k)
                    break

        for i, der in enumerate(
                self.d_dynamics_d_hypers
        ):  # this automatic casting at the moment works only for SGD
            if not isinstance(der, ZMergedMatrix):
                print('Try casting d_dynamics_d_hyper to ZMergedMatrix')
                self.d_dynamics_d_hypers[i] = ZMergedMatrix(der)
                print('Successful')

        devices = as_list(devices)  # at most will be [None]

        with self.w_t.graph.as_default():
            # global step
            self.global_step = global_step or GlobalStep()

            self.fw_ops = optimizer.assign_ops  # add here when hypers are sequence (...)

            with tf.name_scope('ForwardHG'):
                '''
                Creates one z per hyper-parameter and assumes that each hyper-parameter is a vector
                '''
                self.grad_wrt_hypers, self.zs, self.zs_dynamics, self._zs_assigns = [], [], [], []
                self.hyper_gradient_vars, self._hyper_assign_ops = [], []

                self.grad_val_err = {
                    ve:
                    tf.identity(tf.gradients(ve, self.w_t)[0],
                                name='grad_val_err_%s' % simple_name(ve.name))
                    for ve in self.hyper_dict.keys()
                }
                self._gve_inv_dict = {
                    hyp: self.grad_val_err[ve]
                    for hyp, ve in self._inverse_hyper_dict.items()
                }

                for k, hyp in enumerate(self.hyper_list):
                    with tf.device(devices[k % len(devices)]):
                        self.zs.append(self._create_z(hyp))

                        with tf.name_scope('Z_dynamics'):
                            self.zs_dynamics.append(
                                optimizer.jac_z(self.zs[k]) +
                                self.d_dynamics_d_hypers[k])
                            self._zs_assigns.append(self.zs[k].assign(
                                self.zs_dynamics[k]))

                        self.grad_wrt_hypers.append(
                            dot(self._gve_inv_dict[hyp],
                                self.zs[k],
                                name='hyper_grad_wrt_h'))

                        with tf.name_scope('hyper_gradients'):
                            self.hyper_gradient_vars.append(
                                tf.Variable(tf.zeros_like(hyp),
                                            name=simple_name(hyp)))
                            self._hyper_assign_ops.append(
                                self.hyper_gradient_vars[k].assign(
                                    self.grad_wrt_hypers[k]))

                # final operations
                self.hyper_gradients_dict = {
                    hyp: hgv
                    for hyp, hgv  # redundant.. just for comfort ..
                    in zip(self.hyper_list, self.hyper_gradient_vars)
                }
                # hyper-gradient check
                assert all([g is not None for g in self.grad_val_err.values()]), 'Some gradient ' \
                                                                                 'of the validation error is None!'