Example #1
    def __call__(self, x):
        # x is an (input, label, running Lipschitz bound) tuple
        x, t, l = x
        if chainer.config.train:
            # invalidate the cached spectral norm while weights are updated
            self.lipschitz = None
        if getattr(chainer.config, 'lmt', False):
            if getattr(chainer.config, 'exact', False):
                if self.lipschitz is None:
                    self.lipschitz = spectral_norm_exact(self.W.data)
                l = l * self.lipschitz
                x = super(Linear, self).__call__(x)
            else:
                if self.u is None:
                    # persistent vector for power iteration
                    # (used to estimate the Lipschitz constant)
                    u = np.random.normal(
                        size=(1, x.shape[1])).astype(np.float32)
                    with self.init_scope():
                        self.u = chainer.Parameter(u)
                        register_power_iter(self.u)
                    if self._device_id is not None and self._device_id >= 0:
                        with chainer.cuda._get_device(self._device_id):
                            self.u.to_gpu()
                x = super(Linear, self).__call__(x)
                normalize(self.u.array)
                u = F.linear(self.u, self.W)
                # power-iteration estimate: ||W u|| with ||u|| = 1
                l = l * l2_norm(u)
        else:
            x = super(Linear, self).__call__(x)
        return x, t, l
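
The (x, t, l) tuple seen above threads a running Lipschitz bound l through the
network: each layer multiplies l by an upper bound on its own Lipschitz
constant, either the exact spectral norm of W or a power-iteration estimate.
Stacking layers this way is sound because operator norms are sub-multiplicative.
A minimal NumPy check of that fact (illustrative only, not part of the
repository):

import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(5, 8)
B = rng.randn(8, 6)

def spectral_norm(M):
    # largest singular value = Lipschitz constant of x -> M x in the L2 norm
    return np.linalg.svd(M, compute_uv=False)[0]

# Lip(A o B) <= Lip(A) * Lip(B): the bound for the composition is at most the
# product of per-layer bounds, which is why each layer multiplies l by its norm
print(spectral_norm(A.dot(B)), spectral_norm(A) * spectral_norm(B))
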
Example #2
    def forward(self, inputs):
        W = inputs[0].reshape((inputs[0].shape[0], -1))

        # power iteration
        self.v = self.u.dot(W)
        normalize(self.v, eps=1e-20)

        self.u[:] = self.v.dot(W.T)
        nu = normalize(self.u, eps=1e-20)

        # the spectral norm is approximated by the norm of u before normalization
        return nu.reshape((1,)),
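
Example 2 runs a single step of power iteration per forward pass; the
persistent vector u is kept across calls (and also updated through gradients
via register_power_iter elsewhere in the repository). Iterated to convergence,
the same update approximates the largest singular value of W. A self-contained
NumPy sketch, assuming W has shape (out_features, in_features) and u lives in
the output space:

import numpy as np

rng = np.random.RandomState(0)
W = rng.randn(20, 30).astype(np.float32)   # (out_features, in_features)

u = rng.randn(1, 20).astype(np.float32)    # persistent vector in output space
for _ in range(100):                       # the layer above does one such step
    v = u.dot(W)                           # v <- W^T u  (row-vector convention)
    v /= np.linalg.norm(v)
    u = v.dot(W.T)                         # u <- W v
    nu = np.linalg.norm(u)
    u /= nu

exact = np.linalg.svd(W, compute_uv=False)[0]
print(nu, exact)   # nu converges to the spectral norm of W
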
Example #3
    def __call__(self, x):
        x_in, t, l = x
        if chainer.config.train:
            self.lipschitz = None
        if self.parseval_factor is None:
            k_h, k_w = (self.ksize if isinstance(self.ksize, tuple)
                        else (self.ksize, self.ksize))
            # rescaling factor of Parseval networks
            # According to the author, this factor is not essential
            self.parseval_factor = 1 / math.sqrt(k_h * k_w)
        if self.u is None:
            # for calculation of Lipschitz constant
            u = np.random.normal(size=(1,) + x_in.shape[1:]).astype(np.float32)
            with self.init_scope():
                self.u = chainer.Parameter(u)
                register_power_iter(self.u)
            if self._device_id is not None and self._device_id >= 0:
                with chainer.cuda._get_device(self._device_id):
                    self.u.to_gpu()

        if getattr(chainer.config, 'lmt', False):
            if getattr(chainer.config, 'exact', False):
                # inference with exact calculation of the Lipschitz constant
                # (applies to all configurations)
                x = super(Convolution2D, self).__call__(x_in)
                if self.lipschitz is None:
                    self.lipschitz = conv_spectral_norm_exact(
                        self.W.array, self.u.shape, self.stride, self.pad)
                if getattr(chainer.config, 'parseval', False):
                    # in Parseval networks, output is rescaled
                    x = x * self.parseval_factor
                    l = l * self.parseval_factor
                l = l * self.lipschitz
                return x, t, l

        if getattr(chainer.config, 'lmt', False):
            # lmt training and non-exact inference
            normalize(self.u.array)
            # this is practically faster than concatenation
            x = super(Convolution2D, self).__call__(x_in)
            u = F.convolution_2d(self.u, self.W, stride=self.stride,
                                 pad=self.pad)
            l = l * l2_norm(u)
            return x, t, l

        # training and inference for other settings
        x = super(Convolution2D, self).__call__(x_in)
        if getattr(chainer.config, 'parseval', False):
            # in Parseval networks, output is rescaled
            x = x * self.parseval_factor

        # we do not have to calculate l (since it will not be used)
        return x, t, l
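
In the non-exact branch above, the spectral norm of the convolution is
estimated by pushing the normalized u through F.convolution_2d and taking the
norm of the result, while conv_spectral_norm_exact computes the true operator
norm. The NumPy/SciPy sketch below (single channel, stride 1, no padding,
hypothetical sizes; not the repository's implementation) shows that power
iteration through a convolution converges to the largest singular value of the
convolution viewed as a matrix:

import numpy as np
from scipy.signal import correlate2d, convolve2d

rng = np.random.RandomState(0)
K = rng.randn(3, 3)   # 3x3 kernel, one input and one output channel
n = 8                 # n x n input, stride 1, no padding

def conv(x):
    # framework "convolution" is cross-correlation with the kernel
    return correlate2d(x, K, mode='valid')

# explicit matrix of the convolution as a linear map R^(n*n) -> R^((n-2)^2)
M = np.stack([conv(e.reshape(n, n)).ravel() for e in np.eye(n * n)], axis=1)
exact = np.linalg.svd(M, compute_uv=False)[0]

# power iteration through the convolution itself
u = rng.randn(n, n)
for _ in range(200):
    v = conv(u)
    v /= np.linalg.norm(v)
    u = convolve2d(v, K, mode='full')   # adjoint of the 'valid' correlation
    u /= np.linalg.norm(u)

print(np.linalg.norm(conv(u)), exact)   # both approximate the spectral norm
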
Example #4
    def __call__(self, x):
        x, t, l = x

        reshape = (1, x.shape[1]) + (1,) * (x.ndim - 2)

        if chainer.config.train:
            # batch norm
            mean = F.mean(x, axis=(0,) + tuple(range(2, x.ndim)))
            x = x - F.broadcast_to(
                F.reshape(mean, reshape),
                x.shape)
            var = F.mean(x ** 2, axis=(0,) + tuple(range(2, x.ndim)))
            m = x.size // self.gamma.size
            adjust = m / max(m - 1., 1.)  # unbiased estimation
            self.avg_mean *= self.decay
            self.avg_mean += (1 - self.decay) * mean.array
            self.avg_var *= self.decay
            self.avg_var += (1 - self.decay) * adjust * var.array
        else:
            mean = self.avg_mean
            var = self.avg_var
            x = x - F.broadcast_to(F.reshape(mean, reshape), x.shape)

        z0 = F.identity(self.gamma) / F.sqrt(var + self.eps)
        z = F.reshape(z0, reshape)
        x = x * F.broadcast_to(z, x.shape) + F.broadcast_to(
            F.reshape(self.beta, reshape), x.shape)

        # calculate Lipschitz constant
        if getattr(chainer.config, 'lmt', False):
            if getattr(chainer.config, 'exact', False):
                l = l * F.reshape(F.max(F.absolute(z0)), (1,))
            else:
                normalize(self.u.array)
                perturb(self.u.array, 1e-2, self.xp)
                u = self.u * z0
                l = l * l2_norm(u)

        return x, t, l
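
The affine part of this batch normalization scales channel c by
z0_c = gamma_c / sqrt(var_c + eps). As a linear map it is diagonal per channel,
so its Lipschitz constant is max_c |z0_c|, which is exactly what the exact
branch multiplies into l; the non-exact branch estimates the same quantity with
the perturbed power-iteration vector u. A tiny NumPy check that the operator
norm of a diagonal scaling equals its largest absolute entry:

import numpy as np

rng = np.random.RandomState(0)
gamma = rng.randn(16).astype(np.float32)
var = rng.rand(16).astype(np.float32)
eps = 1e-5

z0 = gamma / np.sqrt(var + eps)   # per-channel scaling applied by batch norm
D = np.diag(z0)

# the spectral norm of a diagonal matrix is its largest absolute entry
print(np.linalg.svd(D, compute_uv=False)[0], np.abs(z0).max())
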
Example #5
    def forward(self, inputs):
        #
        # preprocessing
        #
        xp = cuda.get_array_module(*inputs)
        x, gamma, beta = inputs[:3]
        if configuration.config.train:
            if self.running_mean is None:
                self.running_mean = xp.zeros_like(beta, dtype=xp.float32)
                self.running_var = xp.zeros_like(gamma, dtype=xp.float32)
            else:
                self.running_mean = xp.array(self.running_mean)
                self.running_var = xp.array(self.running_var)
        elif len(inputs) == 6:
            self.fixed_mean = inputs[4]
            self.fixed_var = inputs[5]

        head_ndim = beta.ndim + 1
        expander = (None, Ellipsis) + (None, ) * (x.ndim - head_ndim)

        #
        # start of forward path
        #
        if configuration.config.train:
            axis = (0, ) + tuple(range(head_ndim, x.ndim))
            mean = x.mean(axis=axis)
            var = cuda.reduce('S x, T mean, T alpha', 'T out',
                              '(x - mean) * (x - mean)', 'a + b',
                              'out = alpha * a', '0',
                              'conv_bn_var')(x,
                                             mean[expander],
                                             x.shape[1] / x.size,
                                             axis=axis,
                                             keepdims=False)
        else:
            mean = self.fixed_mean
            var = self.fixed_var
        if xp is numpy:
            raise NotImplementedError()
        else:
            self.std_inv = cuda.elementwise(
                'T var, T eps', 'T std_inv', '''
                std_inv = 1 / sqrt(var + eps);
                ''', 'conv_bn_std_inv')(var, self.eps)
            self.x_hat, y = cuda.elementwise(
                'T x, T mean, T std_inv, T gamma, T beta', 'T x_hat, T y', '''
                x_hat = (x - mean) * std_inv;
                y = gamma * x_hat + beta;
                ''', 'conv_bn_fwd')(x, mean[expander], self.std_inv[expander],
                                    gamma[expander], beta[expander])
        #
        # end of forward path
        #

        #
        # calculation of the Lipschitz constant
        #
        if chainer.config.train and getattr(chainer.config, 'lmt', False):
            #
            # power iteration for a matrix Diag(\gamma_i/\sigma_i)W
            #
            # u <= Diag(\gamma_i/\sigma_i) u
            # v <= W u
            # u_mid <= W^T v
            # u <= Diag(\gamma_i/\sigma_i)^T u_mid
            #
            W = inputs[3].reshape((inputs[3].shape[0], -1))

            tmp_l = gamma * self.std_inv
            self.u *= tmp_l
            self.v = self.u.dot(W)

            # normalize for back propagation
            normalize(self.v, eps=1e-20)

            # do not normalize u_mid
            self.u_mid = self.v.dot(W.T)

            self.u[:] = self.u_mid * tmp_l

            # normalize for back propagation
            nu = normalize(self.u, eps=1e-20)

            # spectral norm is approximated by the norm of a vector u
            l = nu.reshape((1, ))
        else:
            # not used
            l = xp.ones((1, ), dtype=xp.float32)

        #
        # calculate running average of statistics
        #
        if configuration.config.train:
            self.running_mean *= self.decay
            self.running_mean += mean * (1 - self.decay)
            self.running_var *= self.decay
            self.running_var += var * (1 - self.decay)
        return y, l
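
Example 5 fuses batch normalization with the preceding convolution and runs one
power-iteration step for the combined operator Diag(gamma_i/sigma_i) W, exactly
as the inline comment describes. Iterating that update to convergence recovers
the spectral norm of the composed matrix. A NumPy sketch with hypothetical
shapes (W stands for the flattened convolution weight and tmp_l for
gamma / sqrt(var + eps)):

import numpy as np

rng = np.random.RandomState(0)
W = rng.randn(32, 64).astype(np.float32)   # flattened weight, (out, in*k*k)
tmp_l = rng.rand(32).astype(np.float32)    # gamma / sqrt(var + eps) per channel

u = rng.randn(1, 32).astype(np.float32)
for _ in range(100):
    u = u * tmp_l                # u <- Diag(gamma/sigma) u
    v = u.dot(W)                 # v <- W^T u  (row-vector convention)
    v /= np.linalg.norm(v)
    u_mid = v.dot(W.T)           # u_mid <- W v
    u = u_mid * tmp_l            # u <- Diag(gamma/sigma) u_mid
    nu = np.linalg.norm(u)
    u /= nu

exact = np.linalg.svd(np.diag(tmp_l).dot(W), compute_uv=False)[0]
print(nu, exact)   # nu approximates the spectral norm of Diag(gamma/sigma) W
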
Example #6
    def calculate_local_lipschitz(self):
        print('\rlocal Lipschitz start', flush=True)
        iterator = self.iterator
        preprocess = self.preprocess
        target = self.target
        eval_func = self.eval_func or (lambda x: target(preprocess(x)))
        device = self.device or chainer.cuda.cupy.cuda.get_device_id()
        assert device >= 0

        if self.eval_hook:
            self.eval_hook(self)

        # compute grads and sample the gradient norms
        if hasattr(iterator, 'reset'):
            iterator.reset()
            it = iterator
        else:
            it = copy.copy(iterator)

        self.global_grad = chainer.cuda.cupy.zeros(
            (self.n_class, self.n_class), dtype=chainer.cuda.cupy.float32)

        margin_list = []
        size = 0
        total = len(it.dataset)
        for batch in it:
            size += len(batch)
            sys.stdout.write('\r{0}/{1}'.format(size, total))
            sys.stdout.flush()
            x, t = self.converter(batch, device)
            xp = chainer.cuda.get_array_module(x)
            c = xp.ones((1, ), dtype=np.float32)
            local_grad = xp.zeros((self.n_class, self.n_class),
                                  dtype=xp.float32)
            with chainer.force_backprop_mode():
                for _ in range(100):
                    noise = xp.random.normal(size=x.shape).astype(xp.float32)
                    normalize(noise)
                    x2 = chainer.Parameter(x + noise)
                    y, t, _ = eval_func((x2, t, c))
                    for i in range(self.n_class):
                        for j in range(i + 1, self.n_class):
                            if i == j:
                                continue
                            target.cleargrads()
                            x2.grad = None
                            F.sum(y[:, i] - y[:, j]).backward()
                            norm = xp.max(
                                xp.sqrt((x2.grad**2).sum(
                                    axis=tuple(range(1, x2.ndim)))))
                            local_grad[i, j] = max(local_grad[i, j], norm)
            for i in range(self.n_class):
                for j in range(i + 1, self.n_class):
                    local_grad[j, i] = local_grad[i, j]
                    self.global_grad[:] = xp.maximum(self.global_grad,
                                                     local_grad)
            with chainer.no_backprop_mode():
                y, t, _ = eval_func((x, t, c))
                y = y.array
            grad = local_grad[t]
            margins = self.get_margin(
                y, y[list(range(t.size)), t].reshape(t.size, 1), grad)
            margins = xp.min(margins, axis=1)
            margin_list.extend(list(margins.get()))

        return margin_list
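
Example 6 estimates a local Lipschitz constant empirically: it samples
unit-norm perturbations of each input batch, back-propagates every pairwise
logit difference y_i - y_j, and records the largest per-sample gradient norm,
which then feeds a gradient-aware margin. A minimal, self-contained sketch of
the same sampling loop on a toy 3-class Chainer model (the model, sizes, and
sample counts here are placeholders, not the repository's):

import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L

n_class = 3
model = chainer.Sequential(L.Linear(10, 32), F.relu, L.Linear(32, n_class))
x = np.random.normal(size=(4, 10)).astype(np.float32)

local_grad = np.zeros((n_class, n_class), dtype=np.float32)
with chainer.force_backprop_mode():
    for _ in range(20):                               # sample points around x
        noise = np.random.normal(size=x.shape).astype(np.float32)
        noise /= np.sqrt((noise ** 2).sum())
        x2 = chainer.Parameter(x + noise)
        y = model(x2)
        for i in range(n_class):
            for j in range(i + 1, n_class):
                model.cleargrads()
                x2.grad = None
                F.sum(y[:, i] - y[:, j]).backward()
                # largest per-sample gradient norm of the margin y_i - y_j
                norm = np.sqrt((x2.grad ** 2).sum(axis=1)).max()
                local_grad[i, j] = max(local_grad[i, j], norm)
                local_grad[j, i] = local_grad[i, j]

# local_grad[i, j] estimates how fast y_i - y_j can change near x
print(local_grad)
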