Example #1
def _d_dyn_d_lr(_name):
    res = [
        -bias_correction * m_k / v_tilde_k,
        tf.zeros_like(m_k),
        tf.zeros_like(v_k)  # just aesthetics
    ]
    return ZMergedMatrix(res, name=_name)
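This is the learning-rate block of an Adam-like update: only the weight component is non-zero, since m and v do not depend on the learning rate. Assuming, as the names suggest, a step of the form w_{k+1} = w_k - lr * bias_correction * m_k / v_tilde_k, the three returned blocks are

    \frac{\partial w_{k+1}}{\partial \eta} = -\,\mathrm{bias\_correction}\cdot\frac{m_k}{\tilde v_k}, \qquad
    \frac{\partial m_{k+1}}{\partial \eta} = 0, \qquad
    \frac{\partial v_{k+1}}{\partial \eta} = 0.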
Example #2
def _jac_z(z):
    return ZMergedMatrix(
        hvp(
            integral,
            w,
            # MergedVariable.get_tensor(w),
            z.tensor))
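`hvp` stands for Hessian-vector product, which avoids ever materializing the Hessian of `integral`. A minimal sketch of the standard double-backprop construction in TensorFlow 1.x (the library's actual `hvp`, seen later with keyword arguments `loss`, `w`, `v`, may differ in details):

    import tensorflow as tf

    def hvp_sketch(loss, w, v):
        # Hessian-vector product: (d^2 loss / dw^2) @ v, for v of the same shape as w.
        grad = tf.gradients(loss, w)[0]                     # first derivative d loss / dw
        return tf.gradients(tf.reduce_sum(grad * v), w)[0]  # differentiate <grad, v> again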
Example #3
    def _create_z(self, hyper):
        """
        Initializer for Z-variables. Used internally.

        :param hyper:
        :return:
        """
        shape_h = hyper.get_shape().as_list()
        assert len(
            shape_h
        ) < 2, 'only scalar or vector hyper-parameters are accepted: %s shape: %s' % (
            hyper, shape_h)
        dim_h = shape_h[0] if shape_h else 1

        components = self.w.var_list(Vl_Mode.TENSOR) if isinstance(
            self.w, MergedVariable) else [self.w_t]

        print('components', components)

        with tf.name_scope('z'):

            z_components = [
                tf.Variable(tf.zeros([c.get_shape().as_list()[0], dim_h]),
                            name=hyper.name.split(':')[0]) for c in components
            ]
            mvz = ZMergedMatrix(z_components)
            print(mvz.tensor)
            return mvz
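Each Z-variable is the forward-mode tangent Z_t = ∂w_t/∂λ, with one column per component of the (at most vector) hyperparameter; hence the shape [dim(c), dim_h] of each block. Following section 3.2 of the paper cited in the constructors below (https://arxiv.org/abs/1703.01785), these variables are propagated with the linearized dynamics

    Z_{t+1} = A_t Z_t + B_t, \qquad
    A_t = \frac{\partial \Phi_t}{\partial w_t}, \qquad
    B_t = \frac{\partial \Phi_t}{\partial \lambda},

where Φ_t is the training update: the product A_t Z_t is computed by the `jac_z` functions and B_t by the various `d_dynamics_d_*` helpers.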
Example #4
            def _d_dyn_d_hyp_gl(cross_der_l, _name):
                dwt_dl_hat = pre_j_11_out
                dwt_dl = l_diag_mul(dwt_dl_hat, cross_der_l)

                dmt_dl = (1 - beta1) * cross_der_l

                dvt_dl = l_diag_mul(pre_j_31_out, cross_der_l)
                return ZMergedMatrix([dwt_dl, dmt_dl, dvt_dl], name=_name)
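Here `cross_der_l` plays the role of the cross derivative ∂g_k/∂λ of the gradient w.r.t. the hyperparameter. The momentum block then follows directly from m_{k+1} = β₁ m_k + (1 − β₁) g_k,

    \frac{\partial m_{k+1}}{\partial \lambda} = (1 - \beta_1)\,\frac{\partial g_k}{\partial \lambda},

while the w and v blocks reuse the precomputed diagonal factors `pre_j_11_out` and `pre_j_31_out` (defined outside this snippet) via `l_diag_mul`.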
Example #5
            def _jac_z(z):
                if _debug_jac_z:  # I guess this would take an incredibly long time to compile for large systems
                    d = dynamics.get_shape().as_list()[0]
                    d2 = d // 2
                    jac_1_1 = tf.stack([
                        tf.gradients(w_base_k[i], w_base)[0] for i in range(d2)
                    ])
                    jac_2_1 = tf.stack(
                        [tf.gradients(m_k[i], w_base)[0] for i in range(d2)])
                    # jac_1 = tf.concat([jac_1_1, jac_2_1], axis=0)

                    jac_1_2 = tf.stack(
                        [tf.gradients(w_base_k[i], m)[0] for i in range(d2)])
                    jac_2_2 = tf.stack(
                        [tf.gradients(m_k[i], m)[0] for i in range(d2)])
                    # jac_2 = tf.concat([jac_1_2, jac_2_2], axis=0)

                    # jac = tf.concat([jac_1, jac_2], axis=1, name='Jacobian')

                    # mul = tf.matmul(jac, z.tensor)
                    #
                    # return ZMergedMatrix([
                    #     mul[:d2, :],
                    #     mul[d2:, :]
                    # ])
                    r, u = z.var_list(VlMode.TENSOR)
                    return ZMergedMatrix([
                        tf.matmul(jac_1_1, r) + tf.matmul(jac_1_2, u),
                        tf.matmul(jac_2_1, r) + tf.matmul(jac_2_2, u)
                    ])
                else:
                    r, u = z.var_list(VlMode.TENSOR)

                    assert loss is not None, 'Should specify loss to use jac_z'

                    hessian_r_product = hvp(loss=loss, w=w_base, v=r)

                    # print('hessian_r_product', hessian_r_product)

                    res = [
                        r - lr * mu * u - lr * hessian_r_product,
                        hessian_r_product + mu * u
                    ]

                    return ZMergedMatrix(res)
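The else branch is consistent with heavy-ball (momentum) dynamics w_{k+1} = w_k − η(μ m_k + ∇L(w_k)), m_{k+1} = μ m_k + ∇L(w_k): applied to z = (r, u), the Jacobian-vector product is

    J \begin{pmatrix} r \\ u \end{pmatrix}
    = \begin{pmatrix} I - \eta H & -\eta\mu I \\ H & \mu I \end{pmatrix}
      \begin{pmatrix} r \\ u \end{pmatrix}
    = \begin{pmatrix} r - \eta\mu u - \eta H r \\ H r + \mu u \end{pmatrix},
    \qquad H = \nabla^2 L(w_k),

where H r comes from `hvp`, so the Hessian itself is never formed.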
Example #6
    def d_dynamics_d_linear_loss_term(self, grad_loss_term):
        """
        Helper function for building the partial derivative of the dynamics w.r.t. an hyperparameter that
        multiplies a loss term that concur in an additive way in forming the training error function.
        E.g.: L + gamma R

        :param grad_loss_term: should be \nabla R
        :return: Partial derivative of dynamics w.r.t. weighting hyperparameter (e.g. gamma)
        """
        return ZMergedMatrix(-self.learning_rate * grad_loss_term)
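For plain gradient descent on E(w) = L(w) + γ R(w), the update is w_{t+1} = w_t − η(∇L + γ∇R), so differentiating w.r.t. γ yields exactly the single block returned above:

    \frac{\partial w_{t+1}}{\partial \gamma} = -\eta\, \nabla R(w_t).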
Example #7
    def d_dynamics_d_hyper_loss(self, grad_loss_term, name):
        """
        Helper function for building the partial derivative of the dynamics w.r.t. an hyperparameter
        inside the loss function, given the gradient or Jacobian of loss w.r.t.

        :param name: name of the resulting MergedMatrix
        :param grad_loss_term: should be \nabla R
        :return: Partial derivative of dynamics w.r.t. weighting hyperparameter (e.g. gamma)
        """
        return ZMergedMatrix(-self.learning_rate * grad_loss_term, name=name)
Example #8
            def jac_z(z):
                r, u = z.var_list(Vl_Mode.TENSOR)

                assert loss is not None, 'Should specify loss to use jac_z'

                hessian_r_product = hvp(loss=loss, w=w_base, v=r)

                print('hessian_r_product', hessian_r_product)

                res = [
                    r - lr * mu * u - lr * hessian_r_product,
                    hessian_r_product + mu * u
                ]

                print('res', res)

                return ZMergedMatrix(res)
Example #9
    def d_dynamics_d_learning_rate(self):
        """

        :return: Partial derivative of dynamics w.r.t. learning rate
        """
        return ZMergedMatrix(-self.gradient)
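This is the plain gradient-descent case: with w_{t+1} = w_t − η ∇L(w_t),

    \frac{\partial w_{t+1}}{\partial \eta} = -\nabla L(w_t),

which is the single block returned here.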
Example #10
    def d_dynamics_d_linear_loss_term(self, grad_loss_term):
        return ZMergedMatrix(
            [-self.learning_rate * grad_loss_term, grad_loss_term])
Example #11
    def d_dynamics_d_momentum_factor(self):
        return ZMergedMatrix([-(self.learning_rate * self.m), self.m])
Example #12
    def d_dynamics_d_learning_rate(self):
        return ZMergedMatrix([
            -self.momentum_factor * self.m - self.gradient,
            tf.zeros(self.m.get_shape())
        ])
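Examples #10–#12 are the momentum counterparts. With w_{t+1} = w_t − η(μ m_t + g_t) and m_{t+1} = μ m_t + g_t (the same dynamics assumed for the Jacobian in Examples #5 and #8), and with g_t = ∇L + γ∇R for a weighted loss term, the three partials are

    \frac{\partial (w_{t+1}, m_{t+1})}{\partial \gamma} = \left(-\eta \nabla R,\; \nabla R\right), \qquad
    \frac{\partial (w_{t+1}, m_{t+1})}{\partial \mu} = \left(-\eta\, m_t,\; m_t\right), \qquad
    \frac{\partial (w_{t+1}, m_{t+1})}{\partial \eta} = \left(-(\mu m_t + g_t),\; 0\right).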
Example #13
            def _jac_z(z):
                if _debug_jac_z:  # I guess this would take an incredibly long time to compile for large systems
                    d = dynamics.get_shape().as_list()[0] // 3
                    r, u, s = z.var_list(VlMode.TENSOR)

                    j11 = tf.stack([
                        tf.gradients(w_base_k[i], w_base)[0] for i in range(d)
                    ])
                    j12 = tf.stack(
                        [tf.gradients(w_base_k[i], m)[0] for i in range(d)])
                    j13 = tf.stack(
                        [tf.gradients(w_base_k[i], v)[0] for i in range(d)])
                    j1 = tf.concat([j11, j12, j13], axis=1)
                    jz1 = tf.matmul(j11, r) + tf.matmul(j12, u) + tf.matmul(
                        j13, s)

                    # second block
                    j21 = tf.stack(
                        [tf.gradients(m_k[i], w_base)[0] for i in range(d)])
                    j22 = tf.stack(
                        [tf.gradients(m_k[i], m)[0] for i in range(d)])
                    j23 = tf.stack(
                        [tf.gradients(m_k[i], v)[0] for i in range(d)])
                    j2 = tf.concat([j21, j22, j23], axis=1)
                    jz2 = tf.matmul(j21, r) + tf.matmul(j22, u) + tf.matmul(
                        j23, s)

                    # third block
                    j31 = tf.stack(
                        [tf.gradients(v_k[i], w_base)[0] for i in range(d)])
                    j32 = tf.stack(
                        [tf.gradients(v_k[i], m)[0] for i in range(d)])
                    j33 = tf.stack(
                        [tf.gradients(v_k[i], v)[0] for i in range(d)])
                    j3 = tf.concat([j31, j32, j33], axis=1)
                    jz3 = tf.matmul(j31, r) + tf.matmul(j32, u) + tf.matmul(
                        j33, s)

                    tf.concat([j1, j2, j3], axis=0, name='Jacobian')

                    return ZMergedMatrix([jz1, jz2, jz3])

                else:
                    assert loss is not None, 'Should specify loss to use jac_z'

                    r, u, s = z.var_list(VlMode.TENSOR)

                    with tf.name_scope('Jac_Z'):

                        hessian_r_product = hvp(loss=loss,
                                                w=w_base,
                                                v=r,
                                                name='hessian_r_product')
                        # hessian_r_product = hvp(loss=loss, w=w.tensor, v=z.tensor, name='hessian_r_product')[:d, :d]

                        j_11_r_tilde = l_diag_mul(pre_j_11_out,
                                                  hessian_r_product,
                                                  name='j_11_r_tilde')
                        j_11_r = tf.identity(j_11_r_tilde + r, 'j_11_r')

                        j_12_u_hat = tf.identity(-lr_k * beta1 / v_tilde_k,
                                                 name='j_12_u_hat')
                        j_12_u = l_diag_mul(j_12_u_hat, u, name='j_12_u')

                        j_13_s_hat = tf.identity(lr_k * beta2 * m_k /
                                                 (2 * v_k_eps_32),
                                                 name='j_13_s_hat')
                        j_13_s = l_diag_mul(j_13_s_hat, s, name='j_13_s')

                        jac_z_1 = tf.identity(j_11_r + j_12_u + j_13_s,
                                              name='jac_z_1')
                        # end first block

                        j_21_r = tf.identity((1. - beta1) * hessian_r_product,
                                             name='j_21_r')
                        j_22_u = tf.identity(beta1 * u, name='j_22_u')
                        # j_23_s = tf.zeros_like(s)  # would be...

                        jac_z_2 = tf.identity(j_21_r + j_22_u, name='jac_z_2')
                        # end second block

                        j_31_r = l_diag_mul(pre_j_31_out,
                                            hessian_r_product,
                                            name='j_31_r')
                        # j_32_u = tf.zeros_like(u)  # would be
                        j_33_s = tf.identity(beta2 * s, name='j_33_s')
                        jac_z_3 = tf.identity(j_31_r + j_33_s, name='jac_z_3')

                        res = [jac_z_1, jac_z_2, jac_z_3]
                        # print('res', res)

                        return ZMergedMatrix(res)
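The three blocks are consistent with differentiating the Adam update (Kingma & Ba, 2015),

    m_{k+1} = \beta_1 m_k + (1-\beta_1)\, g_k, \qquad
    v_{k+1} = \beta_2 v_k + (1-\beta_2)\, g_k^2, \qquad
    w_{k+1} = w_k - \eta_k\, \frac{m_{k+1}}{\tilde v_{k+1}},

with g_k = ∇L(w_k): jac_z_1, jac_z_2 and jac_z_3 are the rows of ∂(w, m, v)_{k+1}/∂(w, m, v)_k applied to z = (r, u, s). The exact definitions of `pre_j_11_out`, `pre_j_31_out`, `v_tilde_k` and `v_k_eps_32` (the bias-correction and √v̂ + ε terms) are not shown in this snippet.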
Example #14
    def d_dynamics_d_hyper_loss(self, grad_loss_term, name):
        return ZMergedMatrix(
            [-self.learning_rate * grad_loss_term, grad_loss_term], name=name)
Example #15
    def __init__(self, optimizer, hyper_dict, global_step=None):
        """
        Creates a new object that computes the hyper-gradient of validation errors in forward mode.
        See section 3.2 of Forward and Reverse Gradient-Based Hyperparameter Optimization
        (https://arxiv.org/abs/1703.01785)
        Note that this class only computes the hyper-gradient and does not perform hyperparameter optimization.

        :param optimizer: instance of Optimizer class, which represents the dynamics with which the model
                            parameters are updated
        :param hyper_dict: a dictionary `{validation_error: hyper_pairs_list}` where
                            `validation_error` is a scalar tensor and `hyper_pairs_list` is a single pair or a
                            list of pairs (hyperparameter, derivative_of_dynamics_w.r.t._hyperparameter)
                            (matrix B_t in the paper). Unfortunately TensorFlow does not compute Jacobians
                            efficiently yet (suggestions or pointers are welcome)
        :param global_step: (optional) instance of `GlobalStep` to keep track of the optimization step
        """
        assert isinstance(optimizer, Optimizer)

        self.w = optimizer.raw_w  # might be variable or MergedVariable (never tested on Variables actually) ...
        self.w_t = MergedVariable.get_tensor(self.w)  # this is always a tensor

        self.tr_dynamics = optimizer.dynamics

        assert isinstance(hyper_dict, dict), '%s is not an allowed type. Should be a dict of {tf.Tensor: ' \
                                             'list[(hyperparameter, d_dynamics_d_hyperparameter)]}' % hyper_dict

        self.hyper_list = []  # more comfortable to use
        self.d_dynamics_d_hypers = []
        self.hyper_dict = {}  # standardizes hyper_dict parameter
        for k, v in hyper_dict.items():
            list_v = as_list(v)
            assert isinstance(list_v[0], tuple), "Something's wrong in hyper_dict %s, at least in entry %s. Check!"\
                                                 % (hyper_dict, list_v[0])
            self.hyper_dict[k] = list_v  # be sure values are lists!
            self.hyper_list += [pair[0] for pair in list_v]
            self.d_dynamics_d_hypers += [pair[1] for pair in list_v]

        self.val_errors = []  # will follow the same order as hyper_list
        for hyp in self.hyper_list:  # find the right validation error for hyp!
            for k, v in hyper_dict.items():
                all_hypers = [pair[0] for pair in as_list(v)]
                if hyp in all_hypers:
                    self.val_errors.append(k)
                    break

        for i, der in enumerate(
                self.d_dynamics_d_hypers
        ):  # this automatic casting at the moment works only for SGD
            if not isinstance(der, ZMergedMatrix):
                print('Try casting d_dynamics_d_hyper to ZMergedMatrix')
                self.d_dynamics_d_hypers[i] = ZMergedMatrix(der)
                print('Successful')

        with self.w_t.graph.as_default():
            # global step
            self.global_step = global_step or GlobalStep()

            self.fw_ops = self.w.assign(
                self.tr_dynamics)  # TODO add here when hypers are sequence

            with tf.name_scope('direct_HO'):
                '''
                Creates one z per hyper-parameter and assumes that each hyper-parameter is a vector
                '''
                self.zs = [self._create_z(hyp) for hyp in self.hyper_list]

                self.zs_dynamics = [
                    optimizer.jac_z(z) + dd_dh
                    for z, dd_dh in zip(self.zs, self.d_dynamics_d_hypers)
                ]

                print('z dynamics', self.zs_dynamics[0])
                print('z', self.zs[0])

                self.zs_assigns = [
                    z.assign(z_dyn)
                    for z, z_dyn in zip(self.zs, self.zs_dynamics)
                ]

                self.grad_val_err = [
                    tf.gradients(v_e, self.w_t)[0] for v_e in self.val_errors
                ]
                assert all([
                    g is not None for g in self.grad_val_err
                ]), 'Some gradient of the validation error is None!'

                self.grad_wrt_hypers = [
                    dot(gve, z.tensor)
                    for z, gve in zip(self.zs, self.grad_val_err)
                ]

                with tf.name_scope(
                        'hyper_gradients'
                ):  # ADDED 28/3/17 keeps track of hyper-gradients as tf.Variable
                    self.hyper_gradient_vars = [
                        tf.Variable(tf.zeros_like(hyp), name=simple_name(hyp))
                        for hyp in self.hyper_list
                    ]
                    self.hyper_gradients_dict = {
                        hyp: hgv
                        for hyp, hgv  # redundant.. just for comfort ..
                        in zip(self.hyper_list, self.hyper_gradient_vars)
                    }
                    self._hyper_assign_ops = [
                        v.assign(ght) for v, ght in zip(
                            self.hyper_gradient_vars, self.grad_wrt_hypers)
                    ]
Example #16
    def __init__(self, optimizer, hyper_dict, global_step=None, devices=None):
        """
        Creates a new object that computes the hyper-gradient of validation errors in forward mode.
        See section 3.2 of Forward and Reverse Gradient-Based Hyperparameter Optimization
        (https://arxiv.org/abs/1703.01785)
        Note that this class only computes the hyper-gradient and does not perform hyperparameter optimization.

        :param optimizer: instance of Optimizer class, which represents the dynamics with which the model
                            parameters are updated
        :param hyper_dict: a dictionary `{validation_error: hyper_pairs_list}` where
                            `validation_error` is a scalar tensor and `hyper_pairs_list` is a single pair or a
                            list of pairs (hyperparameter, derivative_of_dynamics_w.r.t._hyperparameter)
                            (matrix B_t in the paper). Unfortunately TensorFlow does not compute Jacobians
                            efficiently yet (suggestions or pointers are welcome)
        :param global_step: (optional) instance of `GlobalStep` to keep track of the optimization step
        :param devices: (optional) list of devices over which the per-hyperparameter ops are distributed
        """
        assert isinstance(optimizer, Optimizer)

        self.w = optimizer.raw_w  # might be variable or MergedVariable (never tested on Variables actually) ...
        self.w_t = self.w  # MergedVariable.get_tensor(self.w)  # this is always a tensor

        self.tr_dynamics = optimizer.dynamics

        assert isinstance(hyper_dict, dict), '%s is not an allowed type. Should be a dict of {tf.Tensor: ' \
                                             'list[(hyperparameter, d_dynamics_d_hyperparameter)]}' % hyper_dict

        self.hyper_list = []  # more comfortable to use
        self.d_dynamics_d_hypers = []
        self.hyper_dict = {}  # standardizes hyper_dict parameter
        self._inverse_hyper_dict = {}  # hyperparameter-validation error pairs
        for k, v in hyper_dict.items():
            list_v = as_list(v)
            # assert isinstance(list_v[0], tuple), "Something's wrong in hyper_dict %s, at least in entry%s. Check!"\
            #                                      % (hyper_dict, list_v[0])
            self.hyper_dict[k] = list_v  # be sure values are lists!
            self._inverse_hyper_dict.update({
                # key by the hyperparameter itself, also when entries are (hyper, d_dyn_d_hyper) pairs
                (pair[0] if isinstance(pair, (tuple, list)) else pair): k
                for pair in list_v
            })
            self.hyper_list += [
                pair[0] if isinstance(pair, (tuple, list)) else pair
                for pair in list_v
            ]
            self.d_dynamics_d_hypers += [
                pair[1] if isinstance(pair, (tuple, list)) else
                optimizer.auto_d_dynamics_d_hyper(
                    pair)  # try to compute it automatically
                for pair in list_v
            ]

        self.val_errors = []  # will follow the same order as hyper_list
        for hyp in self.hyper_list:  # find the right validation error for hyp!
            for k, v in hyper_dict.items():
                all_hypers = [
                    pair[0] if isinstance(pair, (list, tuple)) else pair
                    for pair in as_list(v)
                ]
                if hyp in all_hypers:
                    self.val_errors.append(k)
                    break

        for i, der in enumerate(
                self.d_dynamics_d_hypers
        ):  # this automatic casting at the moment works only for SGD
            if not isinstance(der, ZMergedMatrix):
                print('Try casting d_dynamics_d_hyper to ZMergedMatrix')
                self.d_dynamics_d_hypers[i] = ZMergedMatrix(der)
                print('Successful')

        devices = as_list(devices)  # at most will be [None]

        with self.w_t.graph.as_default():
            # global step
            self.global_step = global_step or GlobalStep()

            self.fw_ops = optimizer.assign_ops  # add here when hypers are sequence (...)

            with tf.name_scope('ForwardHG'):
                '''
                Creates one z per hyper-parameter and assumes that each hyper-parameter is a vector
                '''
                self.grad_wrt_hypers, self.zs, self.zs_dynamics, self._zs_assigns = [], [], [], []
                self.hyper_gradient_vars, self._hyper_assign_ops = [], []

                self.grad_val_err = {
                    ve:
                    tf.identity(tf.gradients(ve, self.w_t)[0],
                                name='grad_val_err_%s' % simple_name(ve.name))
                    for ve in self.hyper_dict.keys()
                }
                self._gve_inv_dict = {
                    hyp: self.grad_val_err[ve]
                    for hyp, ve in self._inverse_hyper_dict.items()
                }

                for k, hyp in enumerate(self.hyper_list):
                    with tf.device(devices[k % len(devices)]):
                        self.zs.append(self._create_z(hyp))

                        with tf.name_scope('Z_dynamics'):
                            self.zs_dynamics.append(
                                optimizer.jac_z(self.zs[k]) +
                                self.d_dynamics_d_hypers[k])
                            self._zs_assigns.append(self.zs[k].assign(
                                self.zs_dynamics[k]))

                        self.grad_wrt_hypers.append(
                            dot(self._gve_inv_dict[hyp],
                                self.zs[k],
                                name='hyper_grad_wrt_h'))

                        with tf.name_scope('hyper_gradients'):
                            self.hyper_gradient_vars.append(
                                tf.Variable(tf.zeros_like(hyp),
                                            name=simple_name(hyp)))
                            self._hyper_assign_ops.append(
                                self.hyper_gradient_vars[k].assign(
                                    self.grad_wrt_hypers[k]))

                # final operations
                self.hyper_gradients_dict = {
                    hyp: hgv
                    for hyp, hgv  # redundant.. just for comfort ..
                    in zip(self.hyper_list, self.hyper_gradient_vars)
                }
                # hyper-gradient check
                assert all([g is not None for g in self.grad_val_err.values()]), 'Some gradient ' \
                                                                                 'of the validation error is None!'
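A hypothetical end-to-end sketch of how these pieces appear to fit together. None of the names `opt`, `val_err`, `gamma`, `grad_R` or `num_steps` come from the snippets above, and the exact run order of the assign ops is a guess:

    # opt: an already-built Optimizer; val_err: scalar validation-error tensor;
    # gamma: hyperparameter weighting a loss term R, with grad_R = tf.gradients(R, w)[0].
    hg = ForwardHG(opt, {val_err: [(gamma, opt.d_dynamics_d_linear_loss_term(grad_R))]})

    with tf.Session() as ss:
        ss.run(tf.global_variables_initializer())
        for _ in range(num_steps):
            ss.run(hg._zs_assigns)    # Z_{t+1} = A_t Z_t + B_t
            ss.run(hg.fw_ops)         # w_{t+1} = Phi_t(w_t)
        ss.run(hg._hyper_assign_ops)  # stores dot(grad val_err, Z_T) for each hyperparameter
        hyper_grad = ss.run(hg.hyper_gradients_dict[gamma])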