def test_fit_spectral(device):
    """Compare a fitted quadratic MDE against the spectral embedding.

    A standardized MDE problem with unit-weight quadratic penalties on all
    edges is fitted, and its average distortion is checked against the
    average distortion of the embedding returned by ``quadratic.spectral``.

    The test is currently skipped unconditionally (see the TODO below), so
    nothing after the ``pytest.skip`` call runs.
    """
    # TODO deflake this test
    pytest.skip("This test is flaky on macOS.")

    np.random.seed(0)
    torch.random.manual_seed(0)

    n_items, embedding_dim = 200, 3
    iteration_budget = 1000

    complete_edges = util.all_edges(n_items)
    unit_weights = torch.ones(complete_edges.shape[0])
    quadratic_penalty = penalties.Quadratic(unit_weights)
    mde = problem.MDE(
        n_items,
        embedding_dim,
        edges=complete_edges,
        distortion_function=quadratic_penalty,
        constraint=Standardized(),
        device=device,
    )

    fitted = mde.embed(max_iter=iteration_budget, eps=1e-10, memory_size=10)
    # embed() is expected to hand back the very tensor stored on the problem.
    assert fitted is mde.X

    spectral_embedding = quadratic.spectral(
        n_items,
        embedding_dim,
        edges=complete_edges,
        weights=unit_weights,
        device=device,
    )

    def distortion_of(embedding):
        # Bring the scalar distortion to a NumPy value for comparison.
        return mde.average_distortion(embedding).detach().cpu().numpy()

    testing.assert_allclose(
        distortion_of(fitted),
        distortion_of(spectral_embedding),
        atol=1e-4,
    )
def SpectralMDE(data, edges=None, weights=None, embedding_dim=2, cg=False,
                max_iter=40, device='cpu'):
    """
    Performs spectral embedding (very useful for initializations).

    Parameters
    ----------
    data: np.ndarray, torch.tensor, sp.csr_matrix or pymde.graph
        Input data or graph. When it is not a graph, only its first
        dimension (the number of items) is read here.
    edges: torch.tensor, optional
        Tensor of edges. Optional if `data` is a pymde.graph, in which case
        the graph's own edges are used and this argument is ignored.
    weights: torch.tensor, optional
        Tensor of weights. Optional if `data` is a pymde.graph, in which case
        the graph's own weights are used and this argument is ignored.
    embedding_dim: int, optional, default 2
        Output dimension space to reduce the graph to.
    cg: bool
        If True, uses a preconditioned CG method to find the embedding,
        which requires that the Laplacian matrix plus the identity is
        positive definite; otherwise, a Lanczos method is used. Use True when
        the Lanczos method is too slow (which might happen when the number of
        edges is very large).
    max_iter: int
        max iteration count for the CG method
    device: str, optional, default 'cpu'

    Returns
    -------
    The output of an appropriately fit pymde.quadratic.spectral problem,
    with shape (n_items, embedding_dim). n_items is the number of samples
    from the input data or graph.

    Raises
    ------
    ValueError
        If `data` is not a graph and `edges` or `weights` is not provided.
    """
    if isinstance(data, preprocess.graph.Graph):
        # A graph carries its own edges and weights; move them to the
        # requested device before solving.
        n_items = data.n_items
        edges = data.edges.to(device)
        weights = data.weights.to(device)
    else:
        if edges is None or weights is None:
            raise ValueError(
                "`edges` and `weights` must be provided when `data` is not "
                "a graph."
            )
        n_items = data.shape[0]

    return quadratic.spectral(
        n_items,
        embedding_dim,
        edges,
        weights,
        cg=cg,
        max_iter=max_iter,
        device=device,
    )
def test_spectral():
    """Check ``quadratic.spectral`` against SciPy's sparse eigensolver.

    A small random symmetric matrix with non-positive off-diagonal entries
    and zero row sums is built, and its strictly-upper-triangular entries
    are turned into an edge list with positive weights. The resulting
    embedding must satisfy ``X.T @ X / n == I`` and, after rescaling by
    ``1 / sqrt(n)``, match the eigenvectors returned by ``eigsh`` (all but
    the first) up to sign.
    """
    np.random.seed(0)
    torch.random.manual_seed(0)
    n_items, dim = 5, 3

    # Symmetric matrix with non-positive off-diagonals; the diagonal is then
    # set so that every row sums to zero.
    laplacian = -np.abs(np.random.randn(n_items, n_items).astype(np.float32))
    laplacian += laplacian.T
    np.fill_diagonal(laplacian, 0.0)
    np.fill_diagonal(laplacian, -laplacian.sum(axis=1))

    # One edge per strictly-upper-triangular entry, weighted by its negation.
    upper = np.triu_indices(n_items, 1)
    edge_list = np.column_stack(upper)
    edge_weights = -laplacian[upper]

    X = quadratic.spectral(
        n_items, dim, edge_list, torch.tensor(edge_weights)
    )
    testing.assert_allclose(1.0 / n_items * X.T @ X, np.eye(dim))

    # Rescale in place so the columns are comparable with eigsh's output.
    X *= 1.0 / np.sqrt(n_items)
    _, reference = scipy.sparse.linalg.eigsh(
        laplacian, k=dim + 1, which="SM", return_eigenvectors=True
    )
    # Drop the first returned eigenvector (presumably the trivial one).
    reference = reference[:, 1:]
    for col in range(dim):
        testing.assert_allclose(
            reference[:, col], X[:, col], up_to_sign=True
        )
def preserve_neighbors(
    data,
    embedding_dim=2,
    attractive_penalty=penalties.Log1p,
    repulsive_penalty=penalties.Log,
    constraint=None,
    n_neighbors=None,
    repulsive_fraction=None,
    max_distance=None,
    init="quadratic",
    device="cpu",
    verbose=False,
) -> problem.MDE:
    """Construct an MDE problem designed to preserve local structure.

    This function constructs an MDE problem for preserving the
    local structure of original data. This MDE problem is well-suited for
    visualization (using ``embedding_dim`` 2 or 3), but can also be used to
    generate features for machine learning tasks (with ``embedding_dim`` = 10,
    50, or 100, for example). It yields embeddings in which similar items
    are near each other, and dissimilar items are not near each other.

    The original data can either be a data matrix, or a graph.
    Data matrices should be torch Tensors, NumPy arrays, or scipy sparse
    matrices; graphs should be instances of ``pymde.Graph``.

    The MDE problem uses distortion functions derived from weights (i.e.,
    penalties).

    To obtain an embedding, call the ``embed`` method on the returned ``MDE``
    object. To plot it, use ``pymde.plot``.

    .. code:: python3

        embedding = pymde.preserve_neighbors(data).embed()
        pymde.plot(embedding)

    Arguments
    ---------
    data: {torch.Tensor, numpy.ndarray, scipy.sparse matrix}(
            shape=(n_items, n_features)) or pymde.Graph
        The original data, a data matrix or a graph. Neighbors are
        computed using Euclidean distance if the data is a matrix,
        or the shortest-path metric if the data is a graph.
    embedding_dim: int
        The embedding dimension. Use 2 or 3 for visualization.
    attractive_penalty: pymde.Function class (or factory)
        Callable that constructs a distortion function, given positive
        weights. Typically one of the classes from ``pymde.penalties``,
        such as ``pymde.penalties.Log1p``, ``pymde.penalties.Huber``, or
        ``pymde.penalties.Quadratic``.
    repulsive_penalty: pymde.Function class (or factory)
        Callable that constructs a distortion function, given negative
        weights. (If ``None``, only positive weights are used.) For example,
        ``pymde.penalties.Log`` or ``pymde.penalties.InversePower``.
    constraint: pymde.constraints.Constraint (optional)
        Embedding constraint, like ``pymde.Standardized()`` or
        ``pymde.Anchored(anchors, values)`` (or ``None``). Defaults to a
        ``Centered`` constraint when a repulsive penalty is provided,
        otherwise defaults to ``pymde.Standardized()``.
    n_neighbors: int (optional)
        The number of nearest neighbors to compute for each row (item) of
        ``data``. A sensible value is chosen by default, depending on the
        number of items. Values of ``n_items`` or more are reduced to
        ``n_items - 1`` with a warning.
    repulsive_fraction: float (optional)
        How many repulsive edges to include, relative to the number
        of attractive edges. ``1`` means as many repulsive edges as attractive
        edges. The higher this number, the more uniformly spread out the
        embedding will be. Defaults to ``0.5`` for standardized embeddings, and
        ``1`` otherwise. (If ``repulsive_penalty`` is ``None``, this argument
        is ignored.)
    max_distance: float (optional)
        If not None, neighborhoods are restricted to have a radius
        no greater than ``max_distance``.
    init: str
        Initialization strategy; 'quadratic' or 'random'.
    device: str (optional)
        Device for the embedding (eg, 'cpu', 'cuda').
    verbose: bool
        If ``True``, print verbose output.

    Returns
    -------
    pymde.MDE
        A ``pymde.MDE`` object, based on the original data.

    Raises
    ------
    ValueError
        If ``data`` is a matrix with fewer than two rows, or if ``init`` is
        not one of the supported strategies.
    """
    if isinstance(data, preprocess.graph.Graph):
        n = data.n_items
    elif data.shape[0] <= 1:
        raise ValueError("The data matrix must have at least two rows.")
    else:
        n = data.shape[0]

    # Reject an unsupported `init` up front, before the (potentially
    # expensive) nearest-neighbor computation below.
    if init not in ("quadratic", "random"):
        raise ValueError(
            f"Unsupported value '{init}' for keyword argument `init`; "
            "the supported values are 'quadratic' and 'random'."
        )

    if n_neighbors is None:
        # target included edges to be ~1% of total number of edges
        n_choose_2 = n * (n - 1) / 2
        n_neighbors = int(max(min(15, n_choose_2 * 0.01 / n), 5))

    # An item has at most n - 1 neighbors, so n_neighbors == n must be
    # clamped as well (the default heuristic yields 5 when n == 5).
    if n_neighbors >= n:
        problem.LOGGER.warning(
            (
                "Requested n_neighbors {0} >= number of items {1}."
                " Setting n_neighbors to {2}"
            ).format(n_neighbors, n, n - 1)
        )
        n_neighbors = n - 1

    if constraint is None:
        # Centered when repulsion is present, standardized otherwise.
        if repulsive_penalty is not None:
            constraint = constraints.Centered()
        else:
            constraint = constraints.Standardized()

    if isinstance(data, preprocess.graph.Graph):
        # enforce a max distance, otherwise may very well run out of memory
        # when n_items is large
        if max_distance is None:
            max_distance = (3 * torch.quantile(data.distances, 0.75)).item()

    if verbose:
        problem.LOGGER.info(
            f"Computing {n_neighbors}-nearest neighbors, with "
            f"max_distance={max_distance}"
        )

    knn_graph = preprocess.generic.k_nearest_neighbors(
        data,
        k=n_neighbors,
        max_distance=max_distance,
        verbose=verbose,
    )
    edges = knn_graph.edges.to(device)
    weights = knn_graph.weights.to(device)

    if init == "quadratic":
        if verbose:
            problem.LOGGER.info("Computing quadratic initialization.")
        X_init = quadratic.spectral(
            n, embedding_dim, edges, weights, device=device
        )
    else:
        # init == "random"; any other value was rejected above.
        X_init = constraint.initialization(n, embedding_dim, device)

    if repulsive_penalty is not None:
        if repulsive_fraction is None:
            if isinstance(constraint, constraints._Standardized):
                repulsive_fraction = 0.5
            else:
                repulsive_fraction = 1

        # Sample repulsive pairs that are not already attractive edges and
        # give each of them weight -1.
        n_repulsive = int(repulsive_fraction * edges.shape[0])
        negative_edges = preprocess.sample_edges(
            n, n_repulsive, exclude=edges
        ).to(device)
        edges = torch.cat([edges, negative_edges])

        negative_weights = -torch.ones(
            negative_edges.shape[0], dtype=X_init.dtype, device=device
        )
        weights = torch.cat([weights, negative_weights])

        f = penalties.PushAndPull(
            weights,
            attractive_penalty=attractive_penalty,
            repulsive_penalty=repulsive_penalty,
        )
    else:
        f = attractive_penalty(weights)

    mde = problem.MDE(
        n_items=n,
        embedding_dim=embedding_dim,
        edges=edges,
        distortion_function=f,
        constraint=constraint,
        device=device,
    )
    mde._X_init = X_init

    # TODO cache the graph for subsequent calls / constructor for MDE from graph
    distances = mde.distances(mde._X_init)
    if (distances == 0).any():
        # pathological scenario in which at least two points overlap can yield
        # non-differentiable average distortion. perturb the initialization to
        # mitigate.
        mde._X_init += 1e-4 * torch.randn(
            mde._X_init.shape,
            device=mde._X_init.device,
            dtype=mde._X_init.dtype,
        )
    return mde