예제 #1
0
def test_fit_spectral(device):
    # TODO deflake this test
    pytest.skip("This test is flaky on macOS.")
    np.random.seed(0)
    torch.random.manual_seed(0)
    n = 200
    m = 3
    max_iter = 1000

    edges = util.all_edges(n)
    weights = torch.ones(edges.shape[0])
    f = penalties.Quadratic(weights)

    mde = problem.MDE(
        n,
        m,
        edges=edges,
        distortion_function=f,
        constraint=Standardized(),
        device=device,
    )
    X = mde.embed(max_iter=max_iter, eps=1e-10, memory_size=10)

    assert id(X) == id(mde.X)
    X_spectral = quadratic.spectral(n,
                                    m,
                                    edges=edges,
                                    weights=weights,
                                    device=device)

    testing.assert_allclose(
        mde.average_distortion(X).detach().cpu().numpy(),
        mde.average_distortion(X_spectral).detach().cpu().numpy(),
        atol=1e-4,
    )
예제 #2
0
def SpectralMDE(data,
                edges,
                weights,
                embedding_dim=2,
                cg=False,
                max_iter=40,
                device='cpu'):
    """
    Performs spectral embedding (very useful for initializations).
    Parameters
    ----------
    data: np.ndarray, torch.tensor, sp.csr_matrix or pymde.graph
        Input data or graph
    edges: torch.tensor, optional
        Tensor of edges. Optional if `data` is a pymde.graph
    weights: torch.tensor, optional
        Tensor of weights. Optional if `data` is a pymde.graph
    embedding_dim: int, optional, default 2
        Output dimension space to reduce the graph to.
    cg: bool
        If True, uses a preconditioned CG method to find the embedding,
        which requires that the Laplacian matrix plus the identity is
        positive definite; otherwise, a Lanczos method is used. Use True when
        the Lanczos method is too slow (which might happen when the number of
        edges is very large).
    max_iter: int
        max iteration count for the CG method
    device: str, optional, default 'cpu'

    Returns
    -------

    The output of an appropriately fit pymde.quadratic.spectral problem, with shape (n_items, embedding_dim).
    n_items is the number of samples from the input data or graph.

    """
    if isinstance(data, preprocess.graph.Graph):
        n_items = data.n_items
    else:
        n_items = data.shape[0]
    if isinstance(data, preprocess.graph.Graph):
        edges = data.edges.to(device)
        weights = data.weights.to(device)

    emb = quadratic.spectral(n_items,
                             embedding_dim,
                             edges,
                             weights,
                             cg=cg,
                             max_iter=max_iter,
                             device=device)
    return emb
예제 #3
0
def test_spectral():
    np.random.seed(0)
    torch.random.manual_seed(0)
    n = 5
    m = 3
    L = -np.abs(np.random.randn(n, n).astype(np.float32))
    L += L.T
    np.fill_diagonal(L, 0.0)
    np.fill_diagonal(L, -L.sum(axis=1))
    offdiag = np.triu_indices(n, 1)
    edges = np.column_stack(offdiag)
    weights = -L[offdiag]
    X = quadratic.spectral(n, m, edges, torch.tensor(weights))
    testing.assert_allclose(1.0 / n * X.T @ X, np.eye(m))
    X *= 1.0 / np.sqrt(n)

    eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(
        L, k=m + 1, which="SM", return_eigenvectors=True)
    eigenvectors = eigenvectors[:, 1:]
    for col in range(m):
        testing.assert_allclose(eigenvectors[:, col],
                                X[:, col],
                                up_to_sign=True)
예제 #4
0
def preserve_neighbors(
    data,
    embedding_dim=2,
    attractive_penalty=penalties.Log1p,
    repulsive_penalty=penalties.Log,
    constraint=None,
    n_neighbors=None,
    repulsive_fraction=None,
    max_distance=None,
    init="quadratic",
    device="cpu",
    verbose=False,
) -> problem.MDE:
    """Construct an MDE problem designed to preserve local structure.

    This function constructs an MDE problem for preserving the
    local structure of original data. This MDE problem is well-suited for
    visualization (using ``embedding_dim`` 2 or 3), but can also be used to
    generate features for machine learning tasks (with ``embedding_dim`` = 10,
    50, or 100, for example). It yields embeddings in which similar items
    are near each other, and dissimilar items are not near each other.

    The original data can either be a data matrix, or a graph.
    Data matrices should be torch Tensors, NumPy arrays, or scipy sparse
    matrices; graphs should be instances of ``pymde.Graph``.

    The MDE problem uses distortion functions derived from weights (i.e.,
    penalties).

    To obtain an embedding, call the ``embed`` method on the returned ``MDE``
    object. To plot it, use ``pymde.plot``.

    .. code:: python3

        embedding = pymde.preserve_neighbors(data).embed()
        pymde.plot(embedding)

    Arguments
    ---------
    data: {torch.Tensor, numpy.ndarray, scipy.sparse matrix}(
            shape=(n_items, n_features)) or pymde.Graph
        The original data, a data matrix or a graph. Neighbors are
        computed using Euclidean distance if the data is a matrix,
        or the shortest-path metric if the data is a graph.
    embedding_dim: int
        The embedding dimension. Use 2 or 3 for visualization.
    attractive_penalty: pymde.Function class (or factory)
        Callable that constructs a distortion function, given positive
        weights. Typically one of the classes from ``pymde.penalties``,
        such as ``pymde.penalties.log1p``, ``pymde.penalties.Huber``, or
        ``pymde.penalties.Quadratic``.
    repulsive_penalty: pymde.Function class (or factory)
        Callable that constructs a distortion function, given negative
        weights. (If ``None``, only positive weights are used.) For example,
        ``pymde.penalties.Log`` or ``pymde.penalties.InversePower``.
    constraint: pymde.constraints.Constraint (optional)
        Embedding constraint, like ``pymde.Standardized()`` or
        ``pymde.Anchored(anchors, values)`` (or ``None``). Defaults to no
        constraint when a repulsive penalty is provided, otherwise defaults to
        ``pymde.Standardized()``.
    n_neighbors: int (optional)
        The number of nearest neighbors to compute for each row (item) of
        ``data``. A sensible value is chosen by default, depending on the
        number of items.
    repulsive_fraction: float (optional)
        How many repulsive edges to include, relative to the number
        of attractive edges. ``1`` means as many repulsive edges as attractive
        edges. The higher this number, the more uniformly spread out the
        embedding will be. Defaults to ``0.5`` for standardized embeddings, and
        ``1`` otherwise. (If ``repulsive_penalty`` is ``None``, this argument
        is ignored.)
    max_distance: float (optional)
        If not None, neighborhoods are restricted to have a radius
        no greater than ``max_distance``.
    init: str
        Initialization strategy; 'quadratic' or 'random'.
    device: str (optional)
        Device for the embedding (eg, 'cpu', 'cuda').
    verbose: bool
        If ``True``, print verbose output.

    Returns
    -------
    pymde.MDE
        A ``pymde.MDE`` object, based on the original data.
    """
    if isinstance(data, preprocess.graph.Graph):
        n = data.n_items
    elif data.shape[0] <= 1:
        raise ValueError("The data matrix must have at least two rows.")
    else:
        n = data.shape[0]

    if n_neighbors is None:
        # target included edges to be ~1% of total number of edges
        n_choose_2 = n * (n - 1) / 2
        n_neighbors = int(max(min(15, n_choose_2 * 0.01 / n), 5))
    if n_neighbors > n:
        problem.LOGGER.warning(
            (
                "Requested n_neighbors {0} > number of items {1}."
                " Setting n_neighbors to {2}"
            ).format(n_neighbors, n, n - 1)
        )
        n_neighbors = n - 1

    if constraint is None and repulsive_penalty is not None:
        constraint = constraints.Centered()
    elif constraint is None and repulsive_penalty is None:
        constraint = constraints.Standardized()

    if isinstance(data, preprocess.graph.Graph):
        # enforce a max distance, otherwise may very well run out of memory
        # when n_items is large
        if max_distance is None:
            max_distance = (3 * torch.quantile(data.distances, 0.75)).item()

    if verbose:
        problem.LOGGER.info(
            f"Computing {n_neighbors}-nearest neighbors, with "
            f"max_distance={max_distance}"
        )

    knn_graph = preprocess.generic.k_nearest_neighbors(
        data,
        k=n_neighbors,
        max_distance=max_distance,
        verbose=verbose,
    )
    edges = knn_graph.edges.to(device)
    weights = knn_graph.weights.to(device)

    if init == "quadratic":
        if verbose:
            problem.LOGGER.info("Computing quadratic initialization.")
        X_init = quadratic.spectral(
            n, embedding_dim, edges, weights, device=device
        )
    elif init == "random":
        X_init = constraint.initialization(n, embedding_dim, device)
    else:
        raise ValueError(
            f"Unsupported value '{init}' for keyword argument `init`; "
            "the supported values are 'quadratic' and 'random'."
        )

    if repulsive_penalty is not None:
        if repulsive_fraction is None:
            if isinstance(constraint, constraints._Standardized):
                repulsive_fraction = 0.5
            else:
                repulsive_fraction = 1

        n_repulsive = int(repulsive_fraction * edges.shape[0])
        negative_edges = preprocess.sample_edges(
            n, n_repulsive, exclude=edges
        ).to(device)
        edges = torch.cat([edges, negative_edges])

        negative_weights = -torch.ones(
            negative_edges.shape[0], dtype=X_init.dtype, device=device
        )
        weights = torch.cat([weights, negative_weights])

        f = penalties.PushAndPull(
            weights,
            attractive_penalty=attractive_penalty,
            repulsive_penalty=repulsive_penalty,
        )
    else:
        f = attractive_penalty(weights)

    mde = problem.MDE(
        n_items=n,
        embedding_dim=embedding_dim,
        edges=edges,
        distortion_function=f,
        constraint=constraint,
        device=device,
    )
    mde._X_init = X_init

    # TODO cache the graph for subsequent calls / constructor for MDE from graph

    distances = mde.distances(mde._X_init)
    if (distances == 0).any():
        # pathological scenario in which at least two points overlap can yield
        # non-differentiable average distortion. perturb the initialization to
        # mitigate.
        mde._X_init += 1e-4 * torch.randn(
            mde._X_init.shape,
            device=mde._X_init.device,
            dtype=mde._X_init.dtype,
        )
    return mde