def get_jacobian_beta(self, alpha, beta, C, eps, output_layer=None):
    """Compute the Jacobian of the scale dual variable g relative to beta."""
    n_features = beta.shape[-1]
    alpha = check_tensor(alpha, device=self.device)
    beta = check_tensor(beta, device=self.device, requires_grad=True)
    C = check_tensor(C, device=self.device)

    # Construct the matrix to probe the Jacobian
    beta = beta.squeeze()
    beta = beta.repeat(n_features, 1)
    f, g, _ = self(alpha, beta, C, eps, output_layer=output_layer)
    return get_np(torch.autograd.grad(
        g, beta, grad_outputs=torch.eye(n_features))[0])
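# The method above extracts a full Jacobian from a single backward call: beta
# is repeated along a batch dimension and an identity matrix is passed as
# `grad_outputs`, so that each copy back-propagates exactly one output
# coordinate. Below is a minimal self-contained sketch of the same trick on a
# toy map softmax(W @ x); the names `W`, `x` and the helper itself are
# illustrative only and not part of the original code.
def _toy_jacobian_probe():
    torch.manual_seed(0)
    W = torch.randn(3, 4)
    x = torch.randn(4, requires_grad=True)
    # One copy of x per output coordinate of the toy map.
    x_rep = x.repeat(3, 1)                      # shape (3, 4)
    y = torch.softmax(x_rep @ W.t(), dim=-1)    # shape (3, 3), identical rows
    # Row i of the identity selects output coordinate i of copy i, so the
    # result stacks the rows of the Jacobian d softmax(W @ x) / d x.
    return torch.autograd.grad(y, x_rep, grad_outputs=torch.eye(3))[0]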
def gradient_beta(self, alpha, beta, C, eps, output_layer=None,
                  return_loss=False, computation=None):
    """Compute the gradient of the Sinkhorn loss relative to beta."""
    if computation is None:
        computation = self.gradient_computation
    alpha, beta, C = check_tensor(alpha, beta, C, device=self.device)
    if computation == 'autodiff':
        beta = check_tensor(beta, device=self.device, requires_grad=True)
    f, g, _ = self(alpha, beta, C, eps, output_layer=output_layer)
    res = self._get_grad_beta(f, g, alpha, beta, C, eps,
                              computation=computation,
                              return_loss=return_loss)
    if return_loss:
        return get_np(res[0]), get_np(res[1])
    return get_np(res)
def transform(self, alpha, beta, C, eps, output_layer=None, log_iters=None,
              log_callbacks=DEFAULT_CALLBACKS, requires_grad=False):
    """Compute the dual variables associated with the transport plan.

    The transport plan can be recovered using the formula:

        P = exp(f / eps)[:, None] * exp(-C / eps) * exp(g / eps)[None]
    """
    # Compat numpy
    alpha, beta, C = check_tensor(alpha, beta, C, device=self.device)
    beta = check_tensor(beta, requires_grad=True)

    with nullcontext() if requires_grad else torch.no_grad():
        f, g, log = self(alpha, beta, C, eps, output_layer=output_layer,
                         log_iters=log_iters, log_callbacks=log_callbacks)

    return (get_np(f), get_np(g)), log
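# Usage sketch (hypothetical helper, not part of the class): recover the
# primal transport plan from the dual potentials returned by `transform`,
# using the formula quoted in the docstring above. It assumes a single,
# unbatched pair of distributions, so that f and g are 1D arrays.
def _recover_transport_plan(sinkhorn, alpha, beta, C, eps):
    (f, g), _ = sinkhorn.transform(alpha, beta, C, eps)
    P = np.exp(f / eps)[:, None] * np.exp(-C / eps) * np.exp(g / eps)[None]
    # The marginals of P approximate the input distributions:
    # P.sum(axis=1) ~ alpha and P.sum(axis=0) ~ beta.
    return P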
def compute_loss(self, alpha, beta, C, eps, primal=False):
    """Compute the loss along the network's layers

    Parameters
    ----------
    alpha : ndarray, shape (n_alpha,)
        First input distribution.
    beta : ndarray, shape (n_beta,)
        Second input distribution.
    C : ndarray, shape (n_alpha, n_beta)
        Cost matrix between the samples of each distribution.
    eps : float
        Entropic regularization parameter.
    primal : boolean (default: False)
        If set to True, output the primal loss function. Else, output the
        dual loss.
    """
    alpha, beta, C = check_tensor(alpha, beta, C, device=self.device)
    loss = []
    with torch.no_grad():
        for output_layer in range(self.n_layers):
            f, g, _ = self(alpha, beta, C, eps,
                           output_layer=output_layer + 1)
            loss.append(get_np(self._loss_fn(f, g, alpha, beta, C, eps,
                                             primal=primal)))
    return np.array(loss)
def score(self, alpha, beta, C, eps, primal=False, output_layer=None):
    """Compute the loss for the network's output

    Parameters
    ----------
    alpha : ndarray, shape (n_samples, n_alpha)
        First input distribution.
    beta : ndarray, shape (n_beta,)
        Second input distribution.
    C : ndarray, shape (n_alpha, n_beta)
        Cost matrix between the samples of each distribution.
    eps : float
        Entropic regularization parameter.
    primal : boolean (default: False)
        If set to True, output the primal loss function. Else, output the
        dual loss.
    output_layer : int (default: None)
        Layer to output from. It should be smaller than the number of layers
        of the network. If set to None, output the network's last layer.

    Returns
    -------
    loss : float
        Sinkhorn loss (primal or dual) between alpha and beta at the selected
        output layer.
    """
    alpha, beta, C = check_tensor(alpha, beta, C, device=self.device)
    with torch.no_grad():
        f, g, _ = self(alpha, beta, C, eps, output_layer=output_layer)
        return get_np(self._loss_fn(f, g, alpha, beta, C, eps,
                                    primal=primal))
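# Usage sketch (hypothetical helper): combine `compute_loss` and `score` to
# measure, for each layer of a small unrolled network, the gap to the value
# reached by a much deeper (essentially converged) network `sinkhorn_full`.
def _layerwise_gap(sinkhorn, sinkhorn_full, alpha, beta, C, eps):
    losses = sinkhorn.compute_loss(alpha, beta, C, eps, primal=False)
    target = sinkhorn_full.score(alpha, beta, C, eps, primal=False)
    # losses[k] is the dual objective after k + 1 layers.
    return np.abs(losses - target)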
def _get_grad_beta(self, f, g, alpha, beta, C, eps, return_loss=False,
                   retain_graph=False, computation=None):
    if computation is None:
        computation = self.gradient_computation

    if computation == 'autodiff':
        beta = check_tensor(beta, device=self.device, requires_grad=True)
        beta.grad = None
        loss = self._loss_fn(f, g, alpha, beta, C, eps, primal=False)
        grad = torch.autograd.grad(loss, beta,
                                   retain_graph=retain_graph)[0]
    elif computation == 'analytic':
        with torch.no_grad():
            grad = g
            if return_loss:
                loss = self._loss_fn(f, grad, alpha, beta, C, eps,
                                     primal=False)
    elif computation == 'implicit':
        with torch.no_grad():
            n_samples, _ = alpha.shape
            n, m = C.shape
            z = torch.zeros((n_samples, n + m), device=alpha.device)
            H = torch.zeros((n_samples, n + m, n + m), device=alpha.device)

            dx = g
            u, v = torch.exp(f / eps), torch.exp(g / eps)
            K = torch.exp(-C / eps)
            P = u[:, :, None] * K[None] * v[:, None]

            # Marginal residuals (gradient of the dual objective with
            # respect to f and g), with their mean removed.
            z[:, :n] = alpha - u * torch.matmul(v, K.t())
            z[:, n:] = beta - v * torch.matmul(u, K)
            bias = z.sum(axis=-1, keepdims=True)
            z -= bias / (n + m)

            # Compute the Hessian H of the dual objective and apply its
            # pseudo-inverse to z through an eigendecomposition.
            H[:, :n, :n] = torch.diag_embed(-P.sum(axis=-1) / eps)
            H[:, n:, n:] = torch.diag_embed(-P.sum(axis=-2) / eps)
            H[:, :n, n:] = -P / eps
            H[:, n:, :n] = -P.transpose(-2, -1) / eps
            e, v = torch.symeig(H, eigenvectors=True)
            e_inv = 1 / e
            e_inv[e > -1e-12] = 0
            H_inv = torch.matmul(v, e_inv[..., None] * v.transpose(-1, -2))
            dz = (H_inv * z[:, None, :]).sum(axis=-1)[:, n:]
            grad = dx - dz
            if return_loss:
                loss = self._loss_fn(f, g, alpha, beta, C, eps, primal=False)

    if grad.shape != beta.shape:
        assert beta.dim() == 1
        grad = grad.sum(axis=0)

    if return_loss:
        return grad, loss
    return grad
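# A minimal self-contained check of the rule used by the 'analytic'
# computation above (a sketch, assuming the dual objective written below
# matches `_loss_fn` up to constants): at a Sinkhorn fixed point, the
# gradient of the dual objective with respect to beta obtained by
# back-propagating through the iterations coincides with the last dual
# potential g. The helper and all names inside it are illustrative only.
def _check_analytic_gradient(n=5, m=4, eps=.5, n_iters=500, seed=0):
    torch.manual_seed(seed)
    C = torch.rand(n, m)
    alpha = torch.softmax(torch.randn(n), dim=0)
    beta = torch.softmax(torch.randn(m), dim=0).requires_grad_(True)

    f, g = torch.zeros(n), torch.zeros(m)
    for _ in range(n_iters):
        # Log-domain Sinkhorn updates of the dual potentials.
        f = eps * torch.log(alpha) \
            - eps * torch.logsumexp((g[None, :] - C) / eps, dim=1)
        g = eps * torch.log(beta) \
            - eps * torch.logsumexp((f[:, None] - C) / eps, dim=0)

    # Dual objective: <f, alpha> + <g, beta> - eps * sum exp((f + g - C)/eps)
    loss = f @ alpha + g @ beta \
        - eps * torch.exp((f[:, None] + g[None, :] - C) / eps).sum()
    grad_autodiff = torch.autograd.grad(loss, beta)[0]
    # Near convergence the two gradients agree, which is the 'analytic'
    # rule grad = g; return the discrepancy for inspection.
    return (grad_autodiff - g.detach()).abs().max().item()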
def wasserstein_barycenter(alphas, C, eps, n_outer, n_inner, gradient,
                           step_size=.1, device=None, meta={}):
    n_samples, n_alpha = alphas.shape

    # Generate the initial barycenter as the uniform distribution
    beta = np.ones(n_alpha) / n_alpha
    alphas, beta, C = check_tensor(alphas, beta, C, device=device)

    sinkhorn = Sinkhorn(n_layers=n_inner, log_domain=False,
                        gradient_computation=gradient, device=device,
                        verbose=0)
    sinkhorn_full = Sinkhorn(
        n_layers=N_INNER_FULL, log_domain=False,
        gradient_computation='analytic', device=device, verbose=0)

    # Warm start the GPU computation
    G_star, loss = sinkhorn_full.gradient_beta(
        alphas, beta, C, eps, return_loss=True, output_layer=2)

    results = []
    it_loss = np.logspace(0, np.log10(n_outer),
                          num=int(4 * np.log10(n_outer)), dtype=int)
    t_start = time()
    for it in range(n_outer):
        print(f"{it/n_outer:.1%}".rjust(7, '.') + '\b' * 7,
              end='', flush=True)

        f, g, _ = sinkhorn(alphas, beta, C, eps)
        G = sinkhorn._get_grad_beta(f, g, alphas, beta, C, eps)
        with torch.no_grad():
            beta *= torch.exp(-step_size * G)
            beta /= beta.sum()

        if it in it_loss:
            delta_t = time() - t_start
            with torch.no_grad():
                G_star, loss = sinkhorn_full.gradient_beta(
                    alphas, beta, C, eps, return_loss=True)
                assert not np.isnan(loss)
                results.append(dict(
                    n_inner=n_inner, gradient=gradient, step_size=step_size,
                    iteration=it, time=delta_t, loss=loss,
                    norm_gstar=np.linalg.norm(G_star.ravel()),
                    g_diff=np.linalg.norm(G_star.ravel()
                                          - G.cpu().numpy().ravel()),
                    best=n_inner == N_INNER_FULL, **meta
                ))
            t_start = time()
    print("done".rjust(7, '.'))

    return beta, results
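# A minimal usage sketch for `wasserstein_barycenter` (hypothetical helper,
# not part of the original benchmark): random histograms on a shared 1D grid
# with a squared-distance cost matrix. It assumes `Sinkhorn`, `check_tensor`
# and `N_INNER_FULL` are available in this module, as above.
def _demo_wasserstein_barycenter(n_samples=10, n_alpha=50, eps=.1, seed=0):
    rng = np.random.RandomState(seed)
    grid = np.linspace(0, 1, n_alpha)
    C = (grid[:, None] - grid[None, :]) ** 2        # cost on the shared grid
    alphas = rng.rand(n_samples, n_alpha)
    alphas /= alphas.sum(axis=1, keepdims=True)     # normalize histograms
    return wasserstein_barycenter(alphas, C, eps, n_outer=100, n_inner=50,
                                  gradient='analytic', step_size=.1)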
def run_benchmark(n_rep=50, max_layers=100, n_probe_layers=20, gpu=None):
    """Benchmark for the gradient computation time (analytic vs autodiff)

    Parameters
    ----------
    n_rep : int (default: 50)
        Number of repetitions for the benchmark. For each repetition, a new
        problem is created and the gradients are computed for different
        numbers of layers.
    max_layers : int (default: 100)
        Maximal number of layers. The benchmark is run for different values
        of n_layers chosen on a log-scale between 1 and max_layers.
    n_probe_layers : int (default: 20)
        Number of values of n_layers chosen on the log-scale.
    gpu : int (default: None)
        If not None, use GPU number `gpu` to run the gradient computation.
    """
    eps = 1
    dimensions = dict(n_alpha=1000, n_beta=500, point_dim=2, n_samples=100)

    device = f'cuda:{gpu}' if gpu is not None else None

    layers = np.unique(np.logspace(0, np.log10(max_layers), n_probe_layers,
                                   dtype=int))
    n_probe_layers = len(layers)
    layers = np.minimum(max_layers, layers)

    results = []
    for j in range(n_rep):
        alpha, beta, C, *_ = make_ot(**dimensions, random_state=None)
        args = check_tensor(alpha, beta, C, device=device)
        for i, nl in enumerate(layers):
            progress = (j * n_probe_layers + i) / (n_rep * n_probe_layers)
            print(f"\rBenchmark gradient computation on {device}: "
                  f"{progress:.1%}", end='', flush=True)
            for gradient in ['analytic', 'autodiff', 'implicit']:
                model = Sinkhorn(
                    n_layers=nl, gradient_computation=gradient,
                    device=device, log_domain=False)
                t_start = time()
                model.gradient_beta(*args, eps=eps)
                delta_t = time() - t_start
                results.append(dict(
                    gradient=gradient, n_layers=nl, time=delta_t,
                    **dimensions
                ))

    df = pd.DataFrame(results)
    tag = f"{datetime.now().strftime('%Y-%m-%d_%Hh%M')}"
    df.to_pickle(os.path.join(OUTPUT_DIR, f"{BENCH_NAME}_{tag}.pkl"))
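# Example invocation (a sketch, assuming `OUTPUT_DIR` and `BENCH_NAME` are
# defined in this module as used above): a small CPU run keeps the benchmark
# short while still exercising all three gradient computations.
#
#     run_benchmark(n_rep=5, max_layers=50, n_probe_layers=10, gpu=None)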
def wasserstein_barycenter_sgd(alphas, C, eps, n_epochs, n_inner, gradient,
                               step_size=.1, device=None):
    n_samples, n_alpha = alphas.shape

    # Generate the initial barycenter as the uniform distribution
    beta = np.ones(n_alpha) / n_alpha
    alphas, beta, C = check_tensor(alphas, beta, C, device=device)

    sinkhorn = Sinkhorn(n_layers=n_inner, log_domain=False,
                        gradient_computation=gradient, device=device,
                        verbose=0)
    sinkhorn_full = Sinkhorn(n_layers=N_INNER_FULL, log_domain=False,
                             gradient_computation='analytic', device=device,
                             verbose=0)

    results = []
    log_max = np.log10(n_epochs * n_samples)
    it_loss = np.logspace(0, log_max, num=int(5 * log_max), dtype=int)
    t_start = time()
    idx_sample = np.random.randint(n_samples, size=n_epochs * n_samples)
    for id_epoch in range(n_epochs):
        print(f"{id_epoch/n_epochs:.1%}".rjust(7, '.') + '\b' * 7,
              end='', flush=True)
        for i in range(n_samples):
            it = id_epoch * n_samples + i
            id_sample = idx_sample[it]

            # Stochastic step: use only the sampled input distribution.
            f, g, _ = sinkhorn(alphas[id_sample], beta, C, eps)
            G = sinkhorn._get_grad_beta(f, g, alphas[id_sample], beta, C,
                                        eps)
            with torch.no_grad():
                beta *= torch.exp(-step_size * G)
                beta /= beta.sum()

            if it in it_loss:
                delta_t = time() - t_start
                with torch.no_grad():
                    G_star, loss = sinkhorn_full.gradient_beta(
                        alphas, beta, C, eps, return_loss=True)
                    assert not np.isnan(loss)
                    results.append(dict(
                        n_inner=n_inner, gradient=gradient,
                        step_size=step_size, iteration=it, time=delta_t,
                        loss=loss,
                        norm_gstar=np.linalg.norm(G_star.ravel()),
                        g_diff=np.linalg.norm(G_star.ravel()
                                              - G.cpu().numpy().ravel()),
                        best=n_inner == N_INNER_FULL))
                t_start = time()
    print("done".rjust(7, '.'))

    return beta, results
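# The stochastic variant is driven the same way as the full-batch one, for
# instance with random histograms built as in the sketch following
# `wasserstein_barycenter` above (illustrative call only):
#
#     beta_bar, history = wasserstein_barycenter_sgd(
#         alphas, C, eps=.1, n_epochs=10, n_inner=50, gradient='analytic')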