예제 #1
0
    def _apply_dense_on_obilique_with_noise(self, grad, var, seed):
        """Apply a dense update on the oblique manifold with added noise.

        Projects ``grad`` onto the tangent space at ``var``, mixes it with
        truncated-normal noise using time-decayed weights, optionally clips
        the step by norm, and retracts back onto the manifold.

        Args:
            grad: Euclidean gradient for ``var``.
            var: Variable lying on the oblique manifold.
            seed: Seed for the truncated-normal noise (reproducibility).

        Returns:
            The updated variable value after retraction.
        """
        g = gutils.obilique_project(var, grad)
        g_norm = gutils.norm(g)
        # Down-weight the gradient when it is already small relative to
        # 1/t; the noise weight b always decays as 1/t^2.
        if g_norm >= 1 / (self._times):
            a = 1 - 1 / (tf.square(self._times) * tf.square(g_norm))
        else:
            a = 1 / tf.square(self._times)
        b = 1 / tf.square(self._times)

        dim = grad.get_shape()[0]
        noise = tf.truncated_normal([dim, dim],
                                    mean=0.0,
                                    stddev=1.0,
                                    dtype=tf.float32,
                                    seed=seed,
                                    name="random_noise")

        # Hoisted out of the original if/else: both branches computed the
        # same step, only the clipping differed.
        h = -self._learning_rate_t * (a * g + b * noise)
        # Fix: compare to None with `is` (PEP 8), not `==`.
        if self._grad_clip is not None:
            h = gutils.clip_by_norm(h, self._grad_clip_t)

        var_new = gutils.grassmann_retrction(var, h)

        return var_new
예제 #2
0
def _apply_dense_on_oblique_with_noise(grad_clip, grad, var, seed,
                                       learning_rate, times):
    """Dense update on the oblique manifold with small exploration noise.

    Args:
        grad_clip: Maximum step norm, or None to disable clipping.
        grad: Euclidean gradient for ``var``.
        var: Variable lying on the oblique manifold.
        seed: Seed for the truncated-normal noise.
        learning_rate: Step size.
        times: Iteration counter; the noise weight decays as 1/(times+1)^2.

    Returns:
        The updated variable value after retraction.
    """
    g = gutils.oblique_project(var, grad)
    # The gradient keeps full weight; only the noise term is annealed.
    # (An adaptive weight based on the gradient norm was tried and
    # abandoned; the unused g_norm computation was removed with it.)
    a = 1.0
    b = 1 / (tf.square(times + 1))

    dim = tf.convert_to_tensor(grad.get_shape()[0], dtype=tf.int32)
    noise = tf.truncated_normal([dim, 1],
                                mean=0.0,
                                stddev=0.0001,
                                dtype=tf.float32,
                                seed=seed,
                                name="random_noise")

    # Hoisted out of the original if/else: only the clipping differed.
    h = -1 * learning_rate * (a * g + b * noise)
    # Fix: compare to None with `is` (PEP 8), not `==`.
    if grad_clip is not None:
        h = gutils.clip_by_norm(h, grad_clip)

    var_new = gutils.grassmann_retrction(var, h)

    return var_new
예제 #3
0
def apply_dense_on_grasssmann_g(grad_clip, grad_on_grassmann, var,
                                learning_rate, times, delta):
    """Dense Grassmann update using only the Grassmann-projected gradient.

    Args:
        grad_clip: Maximum step norm, or None to disable clipping.
        grad_on_grassmann: Gradient already projected onto the Grassmann
            tangent space at ``var``.
        var: Variable on the Grassmann manifold.
        learning_rate: Step size.
        times: Iteration counter (currently unused; the time-decayed weight
            is disabled, kept for signature compatibility).
        delta: Lower bound for the gradient weight ``a``.

    Returns:
        The updated variable value after retraction.
    """
    # Weight floored at delta; the log(times)-based decay is disabled.
    a = tf.maximum(delta, 1)

    # Fix: compare to None with `is not` (PEP 8), not `!=`.
    if grad_clip is not None:
        h = -1 * gutils.clip_by_norm(
            learning_rate * (a * grad_on_grassmann), grad_clip)
    else:
        h = -1 * learning_rate * a * grad_on_grassmann

    var_update = gutils.grassmann_retrction(var, h)
    return var_update
예제 #4
0
def apply_dense_on_grasssmann(grad_clip, grad_on_grassmann, grad_on_oblique,
                              var, learning_rate, times, delta):
    """Dense Grassmann update mixing in the oblique gradient's projection.

    The step combines the Grassmann gradient (weight ``a``) with the unit
    Grassmann projection of the oblique gradient rescaled to the Grassmann
    gradient's norm (weight ``b``).

    Args:
        grad_clip: Maximum step norm, or None to disable clipping.
        grad_on_grassmann: Gradient projected on the Grassmann tangent space.
        grad_on_oblique: Gradient from the oblique copy of the variable.
        var: Variable on the Grassmann manifold.
        learning_rate: Step size.
        times: Iteration counter; drives the slow log(log(t)) decay of ``a``.
        delta: Lower bound for the gradient weight ``a``.

    Returns:
        The updated variable value after retraction.
    """
    a = tf.maximum(delta, 1 / tf.log((tf.log((times + 2)))))
    # Direction of the oblique gradient in the Grassmann tangent space,
    # rescaled to the Grassmann gradient's magnitude.
    n = gutils.unit(gutils.grassmann_project(
        var, grad_on_oblique)) * gutils.norm(grad_on_grassmann)
    b_1 = 2 * (1 - a) * gutils.xTy(grad_on_grassmann, n)
    b_2 = gutils.norm(grad_on_grassmann)
    b = b_1 / (b_2 + 1e-5)  # epsilon guards against a zero-norm gradient

    # Fix: compare to None with `is not` (PEP 8), not `!=`.
    if grad_clip is not None:
        h = learning_rate * (a * grad_on_grassmann + b * n)
        h = -1 * gutils.clip_by_norm(h, grad_clip)
    else:
        h = -1 * learning_rate * (a * grad_on_grassmann + b * n)

    var_update = gutils.grassmann_retrction(var, h)
    return var_update
예제 #5
0
def _apply_dense_on_oblique_with_noise(grad_clip, grad, var, learning_rate,
                                       times, variance):
    """Dense update on the oblique manifold with projected Gaussian noise.

    PyTorch variant: the noise is a standard-normal vector scaled by
    ``variance`` and projected onto the tangent space at ``var``.

    Args:
        grad_clip: Maximum step norm, or None to disable clipping.
        grad: Euclidean gradient for ``var``.
        var: Variable lying on the oblique manifold.
        learning_rate: Step size.
        times: Iteration counter; the noise weight decays as 1/(times+1)^2.
        variance: Scale applied to the noise before projection.

    Returns:
        The updated variable value after retraction.
    """
    g = gutils.oblique_project(var, grad)
    # The gradient keeps full weight; only the noise term is annealed.
    # (The norm-adaptive weight was tried and abandoned upstream.)
    a = 1.0
    b = 1 / torch.square(times + 1)

    noise = variance * gutils.oblique_project(var, torch.randn(var.size()[0]))

    # Hoisted out of the original if/else: only the clipping differed.
    h = -1 * learning_rate * (a * g + b * noise)
    # Fix: compare to None with `is` (PEP 8), not `==`.
    if grad_clip is not None:
        h = gutils.clip_by_norm(h, grad_clip)

    var_new = gutils.grassmann_retrction(var, h)

    return var_new
예제 #6
0
    def _apply_dense_on_grasssmann(self, grad_on_grassmann, grad_on_obilique,
                                   var):
        """Dense Grassmann update mixing in the oblique gradient's projection.

        Args:
            grad_on_grassmann: Gradient projected on the Grassmann tangent
                space at ``var``.
            grad_on_obilique: Gradient from the oblique copy of the variable.
            var: Variable on the Grassmann manifold.

        Returns:
            The updated variable value after retraction.
        """
        # Hoisted: the projection of the oblique gradient is used three
        # times below.
        proj = gutils.grassmann_project(var, grad_on_obilique)

        a = tf.maximum(self._delta_t, 1 / (tf.square(self._times)))
        b_1 = 2 * (1 - a) * tf.matmul(tf.transpose(grad_on_grassmann), proj)
        # Bug fix: the original called gutils.grassmann_project with a single
        # argument here, unlike every other call site which passes
        # (var, grad); normalize by the norm of the same projection used in
        # the numerator.
        b_2 = gutils.norm(proj)
        b = b_1 / b_2

        scaled = self._learning_rate_t * (a * grad_on_grassmann + b * proj)
        # Fix: compare to None with `is not` (PEP 8), not `!=`.
        if self._grad_clip is not None:
            h = -gutils.clip_by_norm(scaled, self._grad_clip_t)
        else:
            h = -scaled

        var_update = gutils.grassmann_retrction(var, h)
        return var_update
예제 #7
0
    def step(self, closure=None):
        """Performs a single optimization step.

        Parameters in a manifold group are expected to be laid out as the
        Grassmann copies in the first half of ``group['params']`` and the
        paired oblique copies in the second half -- TODO confirm this
        layout against the code that builds the param groups.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            manifold = group['manifold']

            if manifold != "None":
                grad_clip = group['grad_clip']

                # Bug fix: the original iterated range(len(params)) and
                # indexed `i + length / 2`, which is a float index
                # (TypeError on Python 3) and would overrun the list for
                # the second half of the range.
                half = len(group['params']) // 2

                for i in range(half):
                    p_grassmann = group['params'][i]
                    p_oblique = group['params'][i + half]

                    # Bug fix: the original condition
                    # `if p_grassmann.grad and p_oblique is None` could
                    # never skip a missing gradient; skip the pair when
                    # either gradient is absent.
                    if p_grassmann.grad is None or p_oblique.grad is None:
                        continue

                    unity_grassmann, _ = gutils.unit(
                        p_grassmann.data.view(p_grassmann.size()[0], -1))
                    # NOTE(review): leading dim taken from p_grassmann, as
                    # in the original -- presumably paired tensors share
                    # shapes; confirm.
                    unity_oblique, _ = gutils.unit(
                        p_oblique.data.view(p_grassmann.size()[0], -1))

                    grad_grassmann = p_grassmann.grad.data.view(
                        p_grassmann.size()[0], -1)
                    # Bug fix: the oblique gradient was read from
                    # p_grassmann.grad, so the oblique copy never saw its
                    # own gradient.
                    grad_oblique = p_oblique.grad.data.view(
                        p_oblique.size()[0], -1)

                    h_grassmann = gutils.grassmann_project(
                        unity_grassmann, grad_grassmann)
                    h_oblique = gutils.oblique_project(unity_oblique,
                                                       grad_oblique)

                    if grad_clip is not None:
                        h_hat_grassmann = gutils.clip_by_norm(
                            h_grassmann, grad_clip)
                        h_hat_oblique = gutils.clip_by_norm(
                            h_oblique, grad_clip)
                    else:
                        h_hat_grassmann = h_grassmann
                        h_hat_oblique = h_oblique

                    # Bug fix: step along -lr * h (descent), matching the
                    # Euclidean branch below and the standalone
                    # apply_dense_* helpers, all of which negate the step
                    # before retraction.
                    p_grassmann.data.copy_(
                        gutils.grassmann_retrction(
                            unity_grassmann, -group['lr'] *
                            h_hat_grassmann).view(p_grassmann.size()))
                    p_oblique.data.copy_(
                        gutils.oblique_retrction(
                            unity_oblique, -group['lr'] *
                            h_hat_oblique).view(p_oblique.size()))

            else:
                # Plain (Euclidean) SGD without momentum, adapted from
                # https://github.com/pytorch/pytorch/blob/master/torch/optim/sgd.py
                weight_decay = group['weight_decay']
                for p in group['params']:
                    if p.grad is None:
                        continue
                    d_p = p.grad.data
                    if weight_decay != 0:
                        d_p.add_(weight_decay, p.data)
                    p.data.add_(-group['lr'], d_p)
                # Bug fix: the original attached `else: raise ValueError(...)`
                # to this for-loop, so the "no such manifold" error fired
                # after every successful Euclidean pass.  The two branches
                # (manifold != "None" / manifold == "None") are exhaustive,
                # so the raise was removed.

        return loss