Example #1
    def get_jacobian_beta(self, alpha, beta, C, eps, output_layer=None):
        """Compute the Jacobian of the scale dual variable g relative to beta.
        """
        n_features = beta.shape

        alpha = check_tensor(alpha, device=self.device)
        beta = check_tensor(beta, device=self.device, requires_grad=True)
        C = check_tensor(C, device=self.device)

        # Construct the matrix used to probe the Jacobian
        beta = beta.squeeze()
        beta = beta.repeat(n_features, 1)
        f, g, _ = self(alpha, beta, C, eps, output_layer=output_layer)
        return get_np(torch.autograd.grad(
            g, beta, grad_outputs=torch.eye(n_features, device=self.device))[0])
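A minimal usage sketch for get_jacobian_beta. The import path and the synthetic problem below are assumptions; only constructor arguments that appear in Examples #7 and #8 are used.

import numpy as np
from sinkhorn import Sinkhorn   # assumed module path

rng = np.random.RandomState(0)
n_alpha, n_beta = 100, 50
alpha = np.full(n_alpha, 1 / n_alpha)   # uniform source distribution
beta = np.full(n_beta, 1 / n_beta)      # uniform target distribution
C = rng.rand(n_alpha, n_beta)           # arbitrary cost matrix
eps = 1.0

model = Sinkhorn(n_layers=50, log_domain=False,
                 gradient_computation='analytic', verbose=0)

# Jacobian of the dual potential g with respect to beta, assembled through
# vector-Jacobian products against the rows of the identity matrix.
J = model.get_jacobian_beta(alpha, beta, C, eps)
print(J.shape)   # one row per coordinate of beta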
Example #2
    def gradient_beta(self, alpha, beta, C, eps, output_layer=None,
                      return_loss=False, computation=None):
        """Compute the gradient of Sinkhorn relative to beta with autodiff."""
        if computation is None:
            computation = self.gradient_computation

        alpha, beta, C = check_tensor(alpha, beta, C, device=self.device)
        if computation == 'autodiff':
            beta = check_tensor(beta, device=self.device, requires_grad=True)
        f, g, _ = self(alpha, beta, C, eps, output_layer=output_layer)
        res = self._get_grad_beta(f, g, alpha, beta, C, eps,
                                  return_loss=return_loss,
                                  computation=computation)
        if return_loss:
            return get_np(res[0]), get_np(res[1])
        return get_np(res)
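A sketch of how gradient_beta might be called, requesting both the gradient and the dual loss in one call. The synthetic data and the import path are assumptions; the constructor arguments follow Examples #7 and #8.

import numpy as np
from sinkhorn import Sinkhorn   # assumed module path

rng = np.random.RandomState(0)
n_alpha, n_beta = 100, 50
alpha = np.full(n_alpha, 1 / n_alpha)
beta = np.full(n_beta, 1 / n_beta)
C = rng.rand(n_alpha, n_beta)
eps = 1.0

model = Sinkhorn(n_layers=100, log_domain=False,
                 gradient_computation='autodiff', verbose=0)

# Gradient of the dual Sinkhorn objective w.r.t. beta, plus the loss value.
grad, loss = model.gradient_beta(alpha, beta, C, eps, return_loss=True)
print(grad.shape, loss)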
Example #3
    def transform(self, alpha, beta, C, eps, output_layer=None, log_iters=None,
                  log_callbacks=DEFAULT_CALLBACKS, requires_grad=False):
        """Compute the dual variables associate to the transport plan.

        The transport plan can be recovered using the formula:
            P = exp(f / eps)[:, None] * exp(-C / eps) * exp (g / eps)[None]
        """
        # Convert potential numpy inputs to tensors on the correct device
        alpha, beta, C = check_tensor(alpha, beta, C, device=self.device)
        beta = check_tensor(beta, device=self.device,
                            requires_grad=requires_grad)

        with nullcontext() if requires_grad else torch.no_grad():
            f, g, log = self(alpha, beta, C, eps, output_layer=output_layer,
                             log_iters=log_iters, log_callbacks=log_callbacks)

        return (get_np(f), get_np(g)), log
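The docstring formula above can be applied directly to the returned potentials to rebuild the transport plan. A sketch, using the same hypothetical setup and assumed import path as the previous snippets:

import numpy as np
from sinkhorn import Sinkhorn   # assumed module path

rng = np.random.RandomState(0)
n_alpha, n_beta = 100, 50
alpha = np.full(n_alpha, 1 / n_alpha)
beta = np.full(n_beta, 1 / n_beta)
C = rng.rand(n_alpha, n_beta)
eps = 0.1

model = Sinkhorn(n_layers=500, log_domain=False,
                 gradient_computation='analytic', verbose=0)
(f, g), _ = model.transform(alpha, beta, C, eps)

# Transport plan from the dual potentials, as documented in the docstring.
P = np.exp(f / eps)[:, None] * np.exp(-C / eps) * np.exp(g / eps)[None]

# With enough layers, the marginals of P should be close to alpha and beta.
print(np.abs(P.sum(axis=1) - alpha).max(), np.abs(P.sum(axis=0) - beta).max())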
Example #4
    def compute_loss(self, alpha, beta, C, eps, primal=False):
        """Compute the loss  along the network's layers

        Parameters
        ----------
        alpha : ndarray, shape (n_alpha,)
            First input distribution.
        beta : ndarray, shape (n_beta,)
            Second input distribution.
        C : ndarray, shape (n_alpha, n_beta)
            Cost matrix between the samples of each distribution.
        eps : float
            Entropic regularization parameter.
        primal : boolean (default: False)
            If set to True, output the primal loss function. Else, output the
            dual loss.
        """
        alpha, beta, C = check_tensor(alpha, beta, C, device=self.device)
        loss = []
        with torch.no_grad():
            for output_layer in range(self.n_layers):
                f, g, _ = self(alpha, beta, C, eps,
                               output_layer=output_layer + 1)
                loss.append(get_np(self._loss_fn(f, g, alpha, beta, C, eps,
                                                 primal=primal)))
        return np.array(loss)
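A sketch showing how the per-layer losses might be used to monitor convergence across depth. The data and import path are the same assumptions as in the earlier snippets.

import numpy as np
from sinkhorn import Sinkhorn   # assumed module path

rng = np.random.RandomState(0)
alpha = np.full(100, 1 / 100)
beta = np.full(50, 1 / 50)
C = rng.rand(100, 50)

model = Sinkhorn(n_layers=50, log_domain=False,
                 gradient_computation='analytic', verbose=0)

# Dual loss after each of the 50 layers; the values should stabilise once
# the inner Sinkhorn iterations have converged.
losses = model.compute_loss(alpha, beta, C, eps=1.0, primal=False)
print(losses.shape)        # (n_layers,)
print(losses[-1])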
Example #5
    def score(self, alpha, beta, C, eps, primal=False, output_layer=None):
        """Compute the loss for the network's output

        Parameters
        ----------
        alpha : ndarray, shape (n_samples, n_alpha)
            First input distribution.
        beta : ndarray, shape (n_beta,)
            Second input distribution.
        C : ndarray, shape (n_alpha, n_beta)
            Cost matrix between the samples of each distribution.
        eps : float
            Entropic regularization parameter.
        primal : boolean (default: False)
            If set to True, output the primal loss function. Else, output the
            dual loss.
        output_layer : int (default: None)
            Layer to output from. It should be smaller than the number of
            layers of the network. If set to None, output the network's last
            layer.

        Returns
        -------
        loss : float
            Sinkhorn loss (primal or dual, depending on primal) between alpha
            and beta for the cost matrix C and regularization eps.
        """
        alpha, beta, C = check_tensor(alpha, beta, C, device=self.device)
        with torch.no_grad():
            f, g, _ = self(alpha, beta, C, eps, output_layer=output_layer)
            return get_np(self._loss_fn(f, g, alpha, beta, C, eps,
                                        primal=primal))
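A sketch comparing the dual and primal scores at the network output, and at an intermediate layer via output_layer (same hypothetical setup and assumed import path as above):

import numpy as np
from sinkhorn import Sinkhorn   # assumed module path

rng = np.random.RandomState(0)
alpha = np.full(100, 1 / 100)
beta = np.full(50, 1 / 50)
C = rng.rand(100, 50)
eps = 1.0

model = Sinkhorn(n_layers=100, log_domain=False,
                 gradient_computation='analytic', verbose=0)

dual_last = model.score(alpha, beta, C, eps, primal=False)
primal_last = model.score(alpha, beta, C, eps, primal=True)
dual_early = model.score(alpha, beta, C, eps, primal=False, output_layer=10)
print(dual_last, primal_last, dual_early)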
Example #6
    def _get_grad_beta(self, f, g, alpha, beta, C, eps, return_loss=False,
                       retain_graph=False, computation=None):
        """Gradient of the dual Sinkhorn loss w.r.t. beta, computed with
        autodiff, the analytic formula, or implicit differentiation."""

        if computation is None:
            computation = self.gradient_computation

        if computation == 'autodiff':
            beta = check_tensor(beta, device=self.device, requires_grad=True)
            beta.grad = None

            loss = self._loss_fn(f, g, alpha, beta, C, eps, primal=False)
            grad = torch.autograd.grad(
                loss, beta, retain_graph=retain_graph)[0]

        elif computation == 'analytic':
            with torch.no_grad():
                grad = g
                if return_loss:
                    loss = self._loss_fn(f, grad, alpha, beta, C, eps,
                                         primal=False)
        elif computation == 'implicit':
            with torch.no_grad():
                n_samples, _ = alpha.shape
                n, m = C.shape
                z = torch.zeros((n_samples, n + m), device=alpha.device)
                H = torch.zeros((n_samples, n+m, n+m), device=alpha.device)

                dx = g
                u, v = torch.exp(f / eps), torch.exp(g / eps)
                K = torch.exp(-C / eps)
                P = u[:, :, None] * K[None] * v[:, None]
                z[:, :n] = alpha - u * torch.matmul(v, K.t())
                z[:, n:] = beta - v * torch.matmul(u, K)
                bias = z.sum(dim=-1, keepdim=True)
                z -= bias / (n + m)

                # Compute the Hessian zz and solve h_inv . z
                H[:, :n, :n] = torch.diag_embed(-P.sum(axis=-1)/eps)
                H[:, n:, n:] = torch.diag_embed(-P.sum(axis=-2)/eps)
                H[:, :n, n:] = -P / eps
                H[:, n:, :n] = -P.transpose(-2, -1) / eps
                # torch.symeig has been removed from recent PyTorch releases;
                # torch.linalg.eigh is the equivalent replacement.
                e, v = torch.linalg.eigh(H)
                e_inv = 1 / e
                e_inv[e > -1e-12] = 0
                H_inv = torch.matmul(v, e_inv[..., None] * v.transpose(-1, -2))
                dz = (H_inv * z[:, None, :]).sum(axis=-1)[:, n:]
                grad = dx - dz
                if return_loss:
                    loss = self._loss_fn(f, g, alpha, beta, C, eps,
                                         primal=False)
        if grad.shape != beta.shape:
            assert beta.dim() == 1
            grad = grad.sum(axis=0)

        if return_loss:
            return grad, loss
        return grad
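The three branches above ('autodiff', 'analytic', 'implicit') can be cross-checked through the public gradient_beta wrapper from Example #2, which accepts a computation argument. A sketch of such a consistency check under the same assumptions as before (synthetic data, assumed import path); with enough layers the three estimates should agree up to the residual error of the inner iterations.

import numpy as np
from sinkhorn import Sinkhorn   # assumed module path

rng = np.random.RandomState(0)
n_samples, n_alpha, n_beta = 10, 100, 50
# Batch of source distributions, one uniform target, random costs.
alphas = rng.rand(n_samples, n_alpha)
alphas /= alphas.sum(axis=1, keepdims=True)
beta = np.full(n_beta, 1 / n_beta)
C = rng.rand(n_alpha, n_beta)
eps = 1.0

model = Sinkhorn(n_layers=200, log_domain=False,
                 gradient_computation='autodiff', verbose=0)

grads = {
    name: model.gradient_beta(alphas, beta, C, eps, computation=name)
    for name in ['autodiff', 'analytic', 'implicit']
}
ref = grads['autodiff']
for name, g in grads.items():
    # Relative deviation from the autodiff gradient.
    print(name, np.linalg.norm(g - ref) / np.linalg.norm(ref))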
Example #7
def wasserstein_barycenter(alphas, C, eps, n_outer, n_inner, gradient,
                           step_size=.1, device=None, meta=None):
    meta = {} if meta is None else meta
    n_samples, n_alpha = alphas.shape
    # Generate the initial barycenter as the uniform distribution
    beta = np.ones(n_alpha) / n_alpha

    alphas, beta, C = check_tensor(alphas, beta, C, device=device)

    sinkhorn = Sinkhorn(n_layers=n_inner, log_domain=False,
                        gradient_computation=gradient, device=device,
                        verbose=0)

    sinkhorn_full = Sinkhorn(
        n_layers=N_INNER_FULL, log_domain=False,
        gradient_computation='analytic', device=device,
        verbose=0)

    # Warm start the GPU computation
    G_star, loss = sinkhorn_full.gradient_beta(
                    alphas, beta, C, eps, return_loss=True,
                    output_layer=2)

    results = []
    it_loss = np.logspace(0, np.log10(n_outer), num=int(4*np.log10(n_outer)),
                          dtype=int)
    t_start = time()
    for it in range(n_outer):
        print(f"{it/n_outer:.1%}".rjust(7, '.') + '\b' * 7,
              end='', flush=True)
        f, g, _ = sinkhorn(alphas, beta, C, eps)
        G = sinkhorn._get_grad_beta(f, g, alphas, beta, C, eps)
        with torch.no_grad():
            beta *= torch.exp(-step_size * G)
            beta /= beta.sum()

        if it in it_loss:
            delta_t = time() - t_start
            with torch.no_grad():
                G_star, loss = sinkhorn_full.gradient_beta(
                    alphas, beta, C, eps, return_loss=True)
            assert not np.isnan(loss)
            results.append(dict(
                n_inner=n_inner, gradient=gradient, step_size=step_size,
                iteration=it, time=delta_t, loss=loss,
                norm_gstar=np.linalg.norm(G_star.ravel()),
                g_diff=np.linalg.norm(G_star.ravel()-G.cpu().numpy().ravel()),
                best=n_inner == N_INNER_FULL, **meta
            ))
            t_start = time()

    print("done".rjust(7, '.'))
    return beta, results
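A usage sketch for the barycenter solver above. N_INNER_FULL is a module-level constant of this script; the data-generating lines below are hypothetical (random histograms on a shared 1D grid with a squared-distance cost).

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
n_samples, n_alpha = 20, 100
alphas = rng.rand(n_samples, n_alpha)
alphas /= alphas.sum(axis=1, keepdims=True)
grid = np.linspace(0, 1, n_alpha)
C = (grid[:, None] - grid[None, :]) ** 2   # barycenter lives on the same grid

beta, results = wasserstein_barycenter(
    alphas, C, eps=0.01, n_outer=100, n_inner=50,
    gradient='analytic', step_size=.1)

df = pd.DataFrame(results)
print(df[['iteration', 'loss', 'g_diff']].tail())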
Example #8
def run_benchmark(n_rep=50, max_layers=100, n_probe_layers=20,
                  gpu=None):
    """Benchmark for the gradient computation time (analytic vs autodiff)

    Parameters
    ----------
    n_rep : int (default: 50)
        Number of repetitions for the benchmark. For each repetition, a new
        problem is created and the gradients are computed for different
        numbers of layers.
    max_layers : int (default: 100)
        Maximal number of layers. The benchmark is run for different values of
        n_layers, chosen on a log scale between 1 and max_layers.
    n_probe_layers : int (default: 20)
        Number of n_layers values probed on the log scale.
    gpu : int (default: None)
        If not None, use GPU number `gpu` to run the gradient computation.
    """
    eps = 1
    dimensions = dict(n_alpha=1000, n_beta=500, point_dim=2, n_samples=100)

    device = f'cuda:{gpu}' if gpu is not None else None

    layers = np.unique(np.logspace(0, np.log10(max_layers), n_probe_layers,
                                   dtype=int))
    n_probe_layers = len(layers)

    layers = np.minimum(max_layers, layers)
    results = []
    for j in range(n_rep):
        alpha, beta, C, *_ = make_ot(**dimensions, random_state=None)
        args = check_tensor(alpha, beta, C, device=device)
        for i, nl in enumerate(layers):
            progress = (j*n_probe_layers + i) / (n_rep * n_probe_layers)
            print(f"\rBenchmark gradient computation on {device}: "
                  f"{progress:.1%}", end='', flush=True)
            for gradient in ['analytic', 'autodiff', 'implicit']:
                model = Sinkhorn(
                    n_layers=nl, gradient_computation=gradient,
                    device=device, log_domain=False)
                t_start = time()
                model.gradient_beta(*args, eps=eps)
                delta_t = time() - t_start
                results.append(dict(
                    gradient=gradient, n_layers=nl, time=delta_t, **dimensions
                ))

    df = pd.DataFrame(results)
    tag = f"{datetime.now().strftime('%Y-%m-%d_%Hh%M')}"
    df.to_pickle(os.path.join(OUTPUT_DIR, f"{BENCH_NAME}_{tag}.pkl"))
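A sketch of how the benchmark above might be launched and its output reloaded. OUTPUT_DIR and BENCH_NAME are module-level constants of this script; the small parameter values are only for a quick CPU run.

import os
import pandas as pd

# Small run on CPU (gpu=None); a full run uses the defaults.
run_benchmark(n_rep=2, max_layers=20, n_probe_layers=5, gpu=None)

# The results are pickled in OUTPUT_DIR as "{BENCH_NAME}_<timestamp>.pkl";
# reload the most recent file to inspect the timings.
files = sorted(f for f in os.listdir(OUTPUT_DIR) if f.startswith(BENCH_NAME))
df = pd.read_pickle(os.path.join(OUTPUT_DIR, files[-1]))
print(df.groupby(['gradient', 'n_layers'])['time'].median())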
def wasserstein_barycenter_sgd(alphas,
                               C,
                               eps,
                               n_epochs,
                               n_inner,
                               gradient,
                               step_size=.1,
                               device=None):
    n_samples, n_alpha = alphas.shape
    # Generate the initial barycenter as the uniform distribution
    beta = np.ones(n_alpha) / n_alpha

    alphas, beta, C = check_tensor(alphas, beta, C, device=device)

    sinkhorn = Sinkhorn(n_layers=n_inner,
                        log_domain=False,
                        gradient_computation=gradient,
                        device=device,
                        verbose=0)

    sinkhorn_full = Sinkhorn(n_layers=N_INNER_FULL,
                             log_domain=False,
                             gradient_computation='analytic',
                             device=device,
                             verbose=0)

    results = []
    log_max = np.log10(n_epochs * n_samples)
    it_loss = np.logspace(0, log_max, num=int(5 * log_max), dtype=int)
    t_start = time()
    idx_sample = np.random.randint(n_samples, size=n_epochs * n_samples)
    for id_epoch in range(n_epochs):
        print(f"{id_epoch/n_epochs:.1%}".rjust(7, '.') + '\b' * 7,
              end='',
              flush=True)
        for i in range(n_samples):
            it = id_epoch * n_samples + i
            id_sample = idx_sample[it]
            # Run the inner Sinkhorn and its gradient on the sampled
            # distribution only (stochastic update).
            f, g, _ = sinkhorn(alphas[id_sample], beta, C, eps)
            G = sinkhorn._get_grad_beta(f, g, alphas[id_sample], beta, C, eps)
            with torch.no_grad():
                beta *= torch.exp(-step_size * G)
                beta /= beta.sum()

            if it in it_loss:
                delta_t = time() - t_start
                with torch.no_grad():
                    G_star, loss = sinkhorn_full.gradient_beta(
                        alphas, beta, C, eps, return_loss=True)
                assert not np.isnan(loss)
                results.append(
                    dict(n_inner=n_inner,
                         gradient=gradient,
                         step_size=step_size,
                         iteration=it,
                         time=delta_t,
                         loss=loss,
                         norm_gstar=np.linalg.norm(G_star.ravel()),
                         g_diff=np.linalg.norm(G_star.ravel() -
                                               G.cpu().numpy().ravel()),
                         best=n_inner == N_INNER_FULL))
                t_start = time()

    print("done".rjust(7, '.'))
    return beta, results
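A usage sketch for the stochastic variant, mirroring the full-batch call above. The data-generating lines are hypothetical (random histograms on a shared 1D grid).

import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
n_samples, n_alpha = 20, 100
alphas = rng.rand(n_samples, n_alpha)
alphas /= alphas.sum(axis=1, keepdims=True)
grid = np.linspace(0, 1, n_alpha)
C = (grid[:, None] - grid[None, :]) ** 2

# One epoch performs n_samples stochastic mirror-descent steps on beta.
beta, results = wasserstein_barycenter_sgd(
    alphas, C, eps=0.01, n_epochs=5, n_inner=50,
    gradient='analytic', step_size=.1)

print(pd.DataFrame(results)[['iteration', 'loss']].tail())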