예제 #1
0
def create_graph(data, k=30, dview=None, metric='euclidean'):
    """
    Build a neighbor graph weighted by the Jaccard kernel.

    The neighbor search is delegated to ``find_neighbors`` and the graph
    assembly to ``neighbor_graph``; this function only wires them together.
    No symmetrization step is applied to the result here.

    :param data: Input handed unchanged to ``find_neighbors``
    :param k: Number of nearest neighbors requested from ``find_neighbors``
    :param dview: Forwarded unchanged to ``find_neighbors``
    :param metric: Distance metric name forwarded to ``find_neighbors``
    :return: The object returned by ``neighbor_graph`` for ``jaccard_kernel``
    """
    # Only the second return value (neighbor indices) feeds the kernel;
    # the first one is discarded.
    _unused, neighbor_indices = find_neighbors(
        data, k=k, dview=dview, metric=metric)
    kernel_kwargs = {'idx': neighbor_indices}
    return neighbor_graph(jaccard_kernel, kernel_kwargs)
예제 #2
0
def create_graph(data, k=30, metric='euclidean', n_jobs=-1):
    """
    Build a neighbor graph weighted by the Jaccard kernel.

    The neighbor search is delegated to ``find_neighbors`` and the graph
    assembly to ``neighbor_graph``; this function only wires them together.
    No symmetrization step is applied to the result here.

    :param data: Input handed unchanged to ``find_neighbors``
    :param k: Number of nearest neighbors requested from ``find_neighbors``
    :param metric: Distance metric name forwarded to ``find_neighbors``
    :param n_jobs: Forwarded unchanged to ``find_neighbors``
    :return: The object returned by ``neighbor_graph`` for ``jaccard_kernel``
    """
    # Only the second return value (neighbor indices) feeds the kernel;
    # the first one is discarded.
    _unused, neighbor_indices = find_neighbors(
        data, k=k, metric=metric, n_jobs=n_jobs)
    kernel_kwargs = {'idx': neighbor_indices}
    return neighbor_graph(jaccard_kernel, kernel_kwargs)
예제 #3
0
def cluster(data,
            k=30,
            directed=False,
            prune=False,
            min_cluster_size=10,
            jaccard=True,
            primary_metric='euclidean',
            n_jobs=-1,
            q_tol=1e-3,
            louvain_time_limit=2000,
            nn_method='kdtree',
            verbosity=2):
    """
    PhenoGraph clustering

    :param data: Numpy ndarray of data to cluster, or sparse matrix of k-nearest neighbor graph
        If ndarray, n-by-d array of n cells in d dimensions
        If sparse matrix, n-by-n adjacency matrix. Every row must store the same number of
        entries, because the rows are stacked into a dense n-by-k array.
    :param k: Number of nearest neighbors to use in first step of graph construction.
        Not used when ``data`` is a square sparse matrix (neighbors are read from the matrix).
    :param directed: Whether to use a symmetric (default) or asymmetric ("directed") graph
        The graph construction process produces a directed graph, which is symmetrized by one of two methods (see below)
    :param prune: Whether to symmetrize by taking the average (prune=False) or product (prune=True) between the graph
        and its transpose. prune=True forces directed=False.
    :param min_cluster_size: Cells that end up in a cluster smaller than min_cluster_size are considered outliers
        and are assigned to -1 in the cluster labels
    :param jaccard: If True, use Jaccard metric between k-neighborhoods to build graph.
        If False, use a Gaussian kernel.
    :param primary_metric: Distance metric to define nearest neighbors.
        Options include: {'euclidean', 'manhattan', 'correlation', 'cosine'}
        Note that performance will be slower for correlation and cosine.
    :param n_jobs: Nearest Neighbors and Jaccard coefficients will be computed in parallel using n_jobs. If n_jobs=-1,
        the number of jobs is determined automatically. n_jobs=1 selects the non-parallel Jaccard kernel.
    :param q_tol: Tolerance (i.e., precision) for monitoring modularity optimization
    :param louvain_time_limit: Maximum number of seconds to run modularity optimization. If exceeded
        the best result so far is returned
    :param nn_method: Whether to use brute force or kdtree for nearest neighbor search. For very large high-dimensional
        data sets, brute force (with parallel computation) performs faster than kdtree.
    :param verbosity: How much text output to produce. Higher values produce more output. Zero
        should silence all output including warnings, so use with caution.
        The logger level is set to ``logging.ERROR - 10 * verbosity``, never lower than ``logging.DEBUG``.

    :return communities: numpy integer array of community assignments for each row in data
    :return graph: numpy sparse array of the graph that was used for clustering
    :return Q: the modularity score for communities on graph
    """

    logger.setLevel(max([logging.ERROR - verbosity * 10, logging.DEBUG]))

    # NB if prune=True, graph must be undirected, and the prune setting takes precedence
    if prune and directed:
        logger.warning("Setting directed=False because prune=True")
        directed = False

    # Start timer
    tic = time.time()
    # Go!
    if isinstance(data, sp.spmatrix) and data.shape[0] == data.shape[1]:
        logger.info(
            "Using neighbor information from provided graph, rather than computing "
            + "neighbors directly")
        lilmatrix = data.tolil()
        d = np.vstack(lilmatrix.data).astype('float32')  # distances
        idx = np.vstack(lilmatrix.rows).astype(
            'int32')  # neighbor indices by row
        del lilmatrix
        assert idx.shape[0] == data.shape[0]
    else:
        d, idx = find_neighbors(data,
                                k=k,
                                metric=primary_metric,
                                method=nn_method,
                                n_jobs=n_jobs)
        logger.info("Neighbors computed in {} seconds".format(time.time() -
                                                              tic))

    subtic = time.time()
    kernelargs = {'idx': idx}
    if jaccard:
        # distances are not needed by the Jaccard kernels; release them early
        del d
        kernel = jaccard_kernel if n_jobs == 1 else parallel_jaccard_kernel
        graph = neighbor_graph(kernel, kernelargs)
        logger.info(
            "Jaccard graph constructed in {} seconds".format(time.time() -
                                                             subtic))
    else:
        # if not using jaccard kernel, use gaussian
        kernelargs['d'] = d
        kernelargs['sigma'] = 1.
        graph = neighbor_graph(gaussian_kernel, kernelargs)
        logger.info("Gaussian kernel graph constructed in {} seconds".format(
            time.time() - subtic))

    if not directed:
        if prune:
            # symmetrize graph by multiplying with transpose
            sg = graph.multiply(graph.transpose())
        else:
            # symmetrize graph by averaging with transpose
            sg = (graph + graph.transpose()).multiply(.5)
        # retain lower triangle (for efficiency)
        graph = sp.tril(sg, -1)

    # write to file with unique id
    uid = uuid.uuid1().hex
    try:
        graph2binary(uid, graph)
        communities, Q = runlouvain(uid,
                                    tol=q_tol,
                                    time_limit=louvain_time_limit)
        logger.info(
            "PhenoGraph complete in {} seconds".format(time.time() - tic))
        communities = sort_by_size(communities, min_cluster_size)
    finally:
        # clean up: remove every file in the current working directory whose
        # name contains this run's id, even if one of the steps above failed
        for f in os.listdir():
            if re.search(uid, f):
                os.remove(f)

    return communities, graph, Q
예제 #4
0
def cluster(data, k=30, directed=False, prune=False, min_cluster_size=10, jaccard=True,
            primary_metric='euclidean', n_jobs=-1, q_tol=1e-3, louvain_time_limit=2000,
            nn_method='kdtree'):
    """
    PhenoGraph clustering

    :param data: Numpy ndarray of data to cluster, or sparse matrix of k-nearest neighbor graph
        If ndarray, n-by-d array of n cells in d dimensions
        If sparse matrix, n-by-n adjacency matrix. Every row must store the same number of
        entries, because the rows are stacked into a dense n-by-k array.
    :param k: Number of nearest neighbors to use in first step of graph construction.
        Not used when ``data`` is a square sparse matrix (neighbors are read from the matrix).
    :param directed: Whether to use a symmetric (default) or asymmetric ("directed") graph
        The graph construction process produces a directed graph, which is symmetrized by one of two methods (see below)
    :param prune: Whether to symmetrize by taking the average (prune=False) or product (prune=True) between the graph
        and its transpose. prune=True forces directed=False.
    :param min_cluster_size: Cells that end up in a cluster smaller than min_cluster_size are considered outliers
        and are assigned to -1 in the cluster labels
    :param jaccard: If True, use Jaccard metric between k-neighborhoods to build graph.
        If False, use a Gaussian kernel.
    :param primary_metric: Distance metric to define nearest neighbors.
        Options include: {'euclidean', 'manhattan', 'correlation', 'cosine'}
        Note that performance will be slower for correlation and cosine.
    :param n_jobs: Nearest Neighbors and Jaccard coefficients will be computed in parallel using n_jobs. If n_jobs=-1,
        the number of jobs is determined automatically. n_jobs=1 selects the non-parallel Jaccard kernel.
    :param q_tol: Tolerance (i.e., precision) for monitoring modularity optimization
    :param louvain_time_limit: Maximum number of seconds to run modularity optimization. If exceeded
        the best result so far is returned
    :param nn_method: Whether to use brute force or kdtree for nearest neighbor search. For very large high-dimensional
        data sets, brute force (with parallel computation) performs faster than kdtree.

    :return communities: numpy integer array of community assignments for each row in data
    :return graph: numpy sparse array of the graph that was used for clustering
    :return Q: the modularity score for communities on graph
    """

    # NB if prune=True, graph must be undirected, and the prune setting takes precedence.
    # Only announce the override when it actually changes the caller's choice.
    if prune and directed:
        print("Setting directed=False because prune=True")
        directed = False

    # Start timer
    tic = time.time()
    # Go!
    if isinstance(data, sp.spmatrix) and data.shape[0] == data.shape[1]:
        print("Using neighbor information from provided graph, rather than computing neighbors directly", flush=True)
        lilmatrix = data.tolil()
        d = np.vstack(lilmatrix.data).astype('float32')  # distances
        idx = np.vstack(lilmatrix.rows).astype('int32')  # neighbor indices by row
        del lilmatrix
        assert idx.shape[0] == data.shape[0]
    else:
        d, idx = find_neighbors(data, k=k, metric=primary_metric, method=nn_method, n_jobs=n_jobs)
        print("Neighbors computed in {} seconds".format(time.time() - tic), flush=True)

    subtic = time.time()
    kernelargs = {'idx': idx}
    if jaccard:
        # distances are not needed by the Jaccard kernels; release them early
        del d
        kernel = jaccard_kernel if n_jobs == 1 else parallel_jaccard_kernel
        graph = neighbor_graph(kernel, kernelargs)
        print("Jaccard graph constructed in {} seconds".format(time.time() - subtic), flush=True)
    else:
        # if not using jaccard kernel, use gaussian
        kernelargs['d'] = d
        kernelargs['sigma'] = 1.
        graph = neighbor_graph(gaussian_kernel, kernelargs)
        print("Gaussian kernel graph constructed in {} seconds".format(time.time() - subtic), flush=True)

    if not directed:
        if prune:
            # symmetrize graph by multiplying with transpose
            sg = graph.multiply(graph.transpose())
        else:
            # symmetrize graph by averaging with transpose
            sg = (graph + graph.transpose()).multiply(.5)
        # retain lower triangle (for efficiency)
        graph = sp.tril(sg, -1)

    # write to file with unique id
    uid = uuid.uuid1().hex
    try:
        graph2binary(uid, graph)
        communities, Q = runlouvain(uid, tol=q_tol, time_limit=louvain_time_limit)
        print("PhenoGraph complete in {} seconds".format(time.time() - tic), flush=True)
        communities = sort_by_size(communities, min_cluster_size)
    finally:
        # clean up: remove every file in the current working directory whose
        # name contains this run's id, even if one of the steps above failed
        for f in os.listdir():
            if re.search(uid, f):
                os.remove(f)

    return communities, graph, Q
예제 #5
0
def cluster(data,
            k=30,
            directed=False,
            prune=False,
            min_cluster_size=10,
            jaccard=True,
            primary_metric='euclidean',
            n_jobs=-1,
            q_tol=1e-3):
    """
    PhenoGraph clustering

    :param data: Numpy ndarray of data to cluster
    :param k: Number of nearest neighbors to use in first step of graph construction
    :param directed: Whether to use a symmetric (default) or asymmetric ("directed") graph
        The graph construction process produces a directed graph, which is symmetrized by one of two methods (see below)
    :param prune: Whether to symmetrize by taking the average (prune=False) or product (prune=True) between the graph
        and its transpose. prune=True forces directed=False.
    :param min_cluster_size: Cells that end up in a cluster smaller than min_cluster_size are considered outliers
        and are assigned to -1 in the cluster labels
    :param jaccard: If True, use Jaccard metric between k-neighborhoods to build graph.
        If False, use a Gaussian kernel.
    :param primary_metric: Distance metric to define nearest neighbors.
        Options include: {'euclidean', 'manhattan', 'correlation', 'cosine'}
        Note that performance will be slower for correlation and cosine.
    :param n_jobs: Nearest Neighbors and Jaccard coefficients will be computed in parallel using n_jobs. If n_jobs=-1,
        the number of jobs is determined automatically. n_jobs=1 selects the non-parallel Jaccard kernel.
    :param q_tol: Tolerance (i.e., precision) for monitoring modularity optimization

    :return communities: numpy integer array of community assignments for each row in data
    :return graph: numpy sparse array of the graph that was used for clustering
    :return Q: the modularity score for communities on graph
    """

    # NB if prune=True, graph must be undirected, and the prune setting takes precedence.
    # Only announce the override when it actually changes the caller's choice.
    if prune and directed:
        print("Setting directed=False because prune=True")
        directed = False

    # Start timer
    tic = time.time()
    # Go!
    d, idx = find_neighbors(data, k=k, metric=primary_metric, n_jobs=n_jobs)
    print("Neighbors computed in {} seconds".format(time.time() - tic),
          flush=True)

    subtic = time.time()
    kernelargs = {'idx': idx}
    if jaccard:
        # distances are not needed by the Jaccard kernels; release them early
        del d
        kernel = jaccard_kernel if n_jobs == 1 else parallel_jaccard_kernel
        graph = neighbor_graph(kernel, kernelargs)
        print("Jaccard graph constructed in {} seconds".format(time.time() -
                                                               subtic),
              flush=True)
    else:
        # if not using jaccard kernel, use gaussian
        kernelargs['d'] = d
        kernelargs['sigma'] = 1.
        graph = neighbor_graph(gaussian_kernel, kernelargs)
        print("Gaussian kernel graph constructed in {} seconds".format(
            time.time() - subtic),
              flush=True)

    if not directed:
        if prune:
            # symmetrize graph by multiplying with transpose
            sg = graph.multiply(graph.transpose())
        else:
            # symmetrize graph by averaging with transpose
            sg = (graph + graph.transpose()).multiply(.5)
        # retain lower triangle (for efficiency)
        graph = sp.tril(sg, -1)

    # write to file with unique id
    uid = uuid.uuid1().hex
    try:
        graph2binary(uid, graph)
        communities, Q = runlouvain(uid, tol=q_tol)
        print("PhenoGraph complete in {} seconds".format(time.time() - tic),
              flush=True)
        communities = sort_by_size(communities, min_cluster_size)
    finally:
        # clean up: remove every file in the current working directory whose
        # name contains this run's id, even if one of the steps above failed
        for f in os.listdir():
            if re.search(uid, f):
                os.remove(f)

    return communities, graph, Q
예제 #6
0
def cluster(
    data: Union[np.ndarray, spmatrix],
    clustering_algo: Union["louvain", "leiden"] = "louvain",
    k: int = 30,
    directed: bool = False,
    prune: bool = False,
    min_cluster_size: int = 10,
    jaccard: bool = True,
    primary_metric: Union["euclidean", "manhattan", "correlation",
                          "cosine"] = "euclidean",
    n_jobs: int = -1,
    q_tol: float = 1e-3,
    louvain_time_limit: int = 2000,
    nn_method: Union["kdtree", "brute"] = "kdtree",
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    resolution_parameter: float = 1,
    n_iterations: int = -1,
    use_weights: bool = True,
    seed: Optional[int] = None,
    **kargs,
) -> Tuple[np.array, spmatrix, float]:
    """\
    PhenoGraph clustering

    Parameters
    ----------
    data
        Numpy ndarray of data to cluster, or sparse matrix of k-nearest neighbor graph.
        If ndarray, n-by-d array of n cells in d dimensions.
        If sparse matrix, n-by-n adjacency matrix. Every row must store the same
        number of entries, because the rows are stacked into a dense n-by-k array.
    clustering_algo
        Choose `'louvain'` or `'leiden'`. Any other value will return only graph object
        (``communities`` and ``Q`` are then returned as empty strings).
    k
        Number of nearest neighbors to use in first step of graph construction.
        Not used when `data` is a square sparse matrix.
    directed
        Whether to use a symmetric (default) or asymmetric ("directed") graph.
        The graph construction process produces a directed graph, which is symmetrized
        by one of two methods (see below).
    prune
        Whether to symmetrize by taking the average (prune = False) or product
        (prune = True) between the graph and its transpose. prune = True forces
        directed = False.
    min_cluster_size
        Cells that end up in a cluster smaller than min_cluster_size are considered
        outliers and are assigned to -1 in the cluster labels.
    jaccard
        If True, use Jaccard metric between k-neighborhoods to build graph.
        If False, use a Gaussian kernel.
    primary_metric
        Distance metric to define nearest neighbors. Options include: {'euclidean',
        'manhattan', 'correlation', 'cosine'}. Note that performance will be slower for
        `correlation` and `cosine`.
    n_jobs
        Nearest Neighbors and Jaccard coefficients will be computed in parallel using
        n_jobs. If 1 is given, no parallelism is used. If set to -1, all CPUs are used.
        For n_jobs below -1, `n_cpus + 1 + n_jobs` are used.
    q_tol
        Tolerance (i.e., precision) for monitoring modularity optimization
    louvain_time_limit
        Maximum number of seconds to run modularity optimization. If exceeded the best
        result so far is returned.
    nn_method
        Whether to use brute force or kdtree for nearest neighbor search. For very large
        high-dimensional data sets, brute force (with parallel computation) performs
        faster than kdtree.
    partition_type
        Defaults to :class:`~leidenalg.RBConfigurationVertexPartition`. For the
        available options, consult the documentation for
        :func:`~leidenalg.find_partition`.
    resolution_parameter
        A parameter value controlling the coarseness of the clustering in Leiden. Higher
        values lead to more clusters. Set to `None` if overriding `partition_type` to
        one that doesn’t accept a `resolution_parameter`. Any value other than `None`
        (including 0) is forwarded.
    n_iterations
        Number of iterations to run the Leiden algorithm. If the number of iterations is
        negative, the Leiden algorithm is run until an iteration in which there was no
        improvement.
    use_weights
        Use the graph's edge weights in the Leiden computation.
    seed
        Seed forwarded to :func:`~leidenalg.find_partition`.
    kargs
        Additional arguments passed to :func:`~leidenalg.find_partition` and the
        constructor of the `partition_type`.

    Returns
    -------
    communities
        numpy integer array of community assignments for each row in data.
    graph
        numpy sparse array of the graph that was used for clustering.
    Q
        the modularity score for communities on graph.

    Example
    -------
    >>> import phenograph
    >>> import scipy.sparse
    >>> import numpy as np

    >>> N = 5000
    >>> K = 30
    >>> RowInd = np.repeat(np.arange(N), K)
    >>> ColInd = np.tile(np.arange(N), K)
    >>> Mat = scipy.sparse.csr_matrix(
    ...     (np.ones(ColInd.shape), (RowInd, ColInd)), shape=(N, N)
    ... )

    >>> communities, graph, Q = phenograph.cluster(Mat, clustering_algo = 'leiden')
    """

    # NB if prune=True, graph must be undirected, and the prune setting takes precedence.
    # Only announce the override when it actually changes the caller's choice.
    if prune and directed:
        print("Setting directed=False because prune=True")
        directed = False

    # Start timer
    tic = time.time()
    # Go!
    if isinstance(data, sp.spmatrix) and data.shape[0] == data.shape[1]:
        print(
            "Using neighbor information from provided graph, "
            "rather than computing neighbors directly",
            flush=True,
        )
        lilmatrix = data.tolil()
        d = np.vstack(lilmatrix.data).astype("float32")  # distances
        idx = np.vstack(lilmatrix.rows).astype(
            "int32")  # neighbor indices by row
        del lilmatrix
        assert idx.shape[0] == data.shape[0]
    else:
        d, idx = find_neighbors(data,
                                k=k,
                                metric=primary_metric,
                                method=nn_method,
                                n_jobs=n_jobs)
        print("Neighbors computed in {} seconds".format(time.time() - tic),
              flush=True)

    subtic = time.time()
    kernelargs = {"idx": idx}
    if jaccard:
        # distances are not needed by the Jaccard kernels; release them early
        del d
        kernel = jaccard_kernel if n_jobs == 1 else parallel_jaccard_kernel
        graph = neighbor_graph(kernel, kernelargs)
        print(
            "Jaccard graph constructed in {} seconds".format(time.time() -
                                                             subtic),
            flush=True,
        )
    else:
        # if not using jaccard kernel, use gaussian
        kernelargs["d"] = d
        kernelargs["sigma"] = 1.0
        graph = neighbor_graph(gaussian_kernel, kernelargs)
        print(
            "Gaussian kernel graph constructed in {} seconds".format(
                time.time() - subtic),
            flush=True,
        )

    if not directed:
        if prune:
            # symmetrize graph by multiplying with transpose
            sg = graph.multiply(graph.transpose())
        else:
            # symmetrize graph by averaging with transpose
            sg = (graph + graph.transpose()).multiply(0.5)
        # retain lower triangle (for efficiency)
        graph = sp.tril(sg, -1)

    # Placeholders returned unchanged when clustering_algo is neither
    # "louvain" nor "leiden" (only the graph is meaningful in that case).
    communities, Q = "", ""

    # choose between Louvain or Leiden algorithm
    if clustering_algo == "louvain":
        # write to file with unique id
        uid = uuid.uuid1().hex
        try:
            graph2binary(uid, graph)
            communities, Q = runlouvain(uid,
                                        tol=q_tol,
                                        time_limit=louvain_time_limit)

            print("Sorting communities by size, please wait ...", flush=True)
            communities = sort_by_size(communities, min_cluster_size)

            print("PhenoGraph complete in {} seconds".format(time.time() -
                                                             tic),
                  flush=True)
        finally:
            # clean up: remove every file in the current working directory
            # whose name contains this run's id, even if a step above failed
            for f in os.listdir():
                if re.search(uid, f):
                    os.remove(f)

    elif clustering_algo == "leiden":
        # Convert the scipy sparse graph to an igraph Graph. Edges and weights
        # are taken from the same mask so they stay aligned: nonzero() would
        # drop explicitly stored zeros while .data keeps them, which would
        # leave the weight vector longer than the edge list.
        coo = graph.tocoo()
        keep = coo.data != 0
        edgelist = np.vstack((coo.row[keep], coo.col[keep])).T.tolist()
        g = ig.Graph(max(graph.shape), edgelist, directed=directed)
        # attach the kernel values as edge weights
        g.es["weights"] = coo.data[keep]

        if not partition_type:
            partition_type = leidenalg.RBConfigurationVertexPartition
        # None (not merely a falsy value such as 0) is the documented way to
        # omit the resolution parameter
        if resolution_parameter is not None:
            kargs["resolution_parameter"] = resolution_parameter
        if use_weights:
            kargs["weights"] = np.array(g.es["weights"]).astype("float64")
        kargs["n_iterations"] = n_iterations
        kargs["seed"] = seed

        print("Running Leiden optimization", flush=True)
        tic_ = time.time()
        communities = leidenalg.find_partition(
            g,
            partition_type=partition_type,
            **kargs,
        )
        Q = communities.q
        print(
            "Leiden completed in {} seconds".format(time.time() - tic_),
            flush=True,
        )
        communities = np.asarray(communities.membership)

        print("Sorting communities by size, please wait ...", flush=True)
        communities = sort_by_size(communities, min_cluster_size)

        print("PhenoGraph completed in {} seconds".format(time.time() - tic),
              flush=True)

    return communities, graph, Q
예제 #7
0
def cluster(data, k=30, d=None, idx=None, directed=False, prune=False, min_cluster_size=10, jaccard=True,
            primary_metric='euclidean', n_jobs=-1, q_tol=1e-3, louvain_time_limit=2000,
            nn_method='kdtree', use_gpu=False):
    """
    PhenoGraph clustering

    :param data: Numpy ndarray of data to cluster, or sparse matrix of k-nearest neighbor graph
        If ndarray, n-by-d array of n cells in d dimensions; it is copied to a contiguous float32 array
        If sparse matrix, n-by-n adjacency matrix. Every row must store the same number of
        entries, because the rows are stacked into a dense n-by-k array.
    :param d: None or a Numpy ndarray with shape (data.shape[0], k-1), each data's (k-1) nearest neighbors' distance.
        If None, it would be calculated.
    :param idx: None or a Numpy ndarray with shape (data.shape[0], k-1), each data's (k-1) nearest neighbors' index.
        If None, it would be calculated.
        ``d`` and ``idx`` are only used when both are given; otherwise both are recomputed.
    :param k: Number of nearest neighbors to use in first step of graph construction
    :param directed: Whether to use a symmetric (default) or asymmetric ("directed") graph
        The graph construction process produces a directed graph, which is symmetrized by one of two methods (see below)
    :param prune: Whether to symmetrize by taking the average (prune=False) or product (prune=True) between the graph
        and its transpose. prune=True forces directed=False.
    :param min_cluster_size: Cells that end up in a cluster smaller than min_cluster_size are considered outliers
        and are assigned to -1 in the cluster labels
    :param jaccard: If True, use Jaccard metric between k-neighborhoods to build graph.
        If False, use a Gaussian kernel.
    :param primary_metric: Distance metric to define nearest neighbors.
        Options include: {'euclidean', 'manhattan', 'correlation', 'cosine'}
        Note that performance will be slower for correlation and cosine.
    :param n_jobs: Nearest Neighbors and Jaccard coefficients will be computed in parallel using n_jobs. If n_jobs=-1,
        the number of jobs is determined automatically. n_jobs=1 selects the non-parallel Jaccard kernel.
    :param q_tol: Tolerance (i.e., precision) for monitoring modularity optimization
    :param louvain_time_limit: Maximum number of seconds to run modularity optimization. If exceeded
        the best result so far is returned
    :param nn_method: Whether to use brute force or kdtree for nearest neighbor search. For very large high-dimensional
        data sets, brute force (with parallel computation) performs faster than kdtree.
    :param use_gpu: Whether to use GPU to calculate distance. Now only support euclidean and inner product metrics.

    :return communities: numpy integer array of community assignments for each row in data
    :return graph: numpy sparse array of the graph that was used for clustering
    :return Q: the modularity score for communities on graph
    :raises AssertionError: if precomputed ``d``/``idx`` disagree in shape, do not have one row per
        row of ``data``, or do not have exactly k-1 columns
    """

    # NB if prune=True, graph must be undirected, and the prune setting takes precedence.
    # Only announce the override when it actually changes the caller's choice.
    if prune and directed:
        print("Setting directed=False because prune=True")
        directed = False

    # Dense input only: a sparse matrix has neither a dense astype() result nor
    # a .flags attribute, and converting it here would make the sparse-graph
    # branch below unreachable.
    if not isinstance(data, sp.spmatrix):
        data = data.astype(np.float32)
        if not data.flags.contiguous:
            data = np.ascontiguousarray(data)  # faiss must use contiguous array and float32!

    # Start timer
    tic = time.time()
    # Go!
    if (d is not None) and (idx is not None):
        assert d.shape == idx.shape, "d and idx has different shapes!"
        assert idx.shape[0] == data.shape[0], "the number of rows of d is different with that of data!"
        # the precomputed neighbors must have exactly k-1 columns
        assert d.shape[1] == k - 1, "not k-1 nearest neighbors!"
    elif isinstance(data, sp.spmatrix) and data.shape[0] == data.shape[1]:
        print("Using neighbor information from provided graph, rather than computing neighbors directly", flush=True)
        lilmatrix = data.tolil()
        d = np.vstack(lilmatrix.data).astype('float32')  # distances
        idx = np.vstack(lilmatrix.rows).astype(
            'int32')  # neighbor indices by row
        del lilmatrix
        assert idx.shape[0] == data.shape[0]
    else:
        d, idx = find_neighbors(
            data, k=k, use_gpu=use_gpu, metric=primary_metric, method=nn_method, n_jobs=n_jobs)
        print("Neighbors computed in {} seconds".format(
            time.time() - tic), flush=True)

    subtic = time.time()
    kernelargs = {'idx': idx}
    if jaccard:
        # distances are not needed by the Jaccard kernels; release them early
        del d
        kernel = jaccard_kernel if n_jobs == 1 else parallel_jaccard_kernel
        graph = neighbor_graph(kernel, kernelargs)
        print("Jaccard graph constructed in {} seconds".format(
            time.time() - subtic), flush=True)
    else:
        # if not using jaccard kernel, use gaussian
        kernelargs['d'] = d
        kernelargs['sigma'] = 1.
        graph = neighbor_graph(gaussian_kernel, kernelargs)
        print("Gaussian kernel graph constructed in {} seconds".format(
            time.time() - subtic), flush=True)

    if not directed:
        if prune:
            # symmetrize graph by multiplying with transpose
            sg = graph.multiply(graph.transpose())
        else:
            # symmetrize graph by averaging with transpose
            sg = (graph + graph.transpose()).multiply(.5)
        # retain lower triangle (for efficiency)
        graph = sp.tril(sg, -1)

    # write to file with unique id
    uid = uuid.uuid1().hex
    try:
        graph2binary(uid, graph)
        communities, Q = runlouvain(uid, tol=q_tol, time_limit=louvain_time_limit)
        print("PhenoGraph complete in {} seconds".format(
            time.time() - tic), flush=True)
        communities = sort_by_size(communities, min_cluster_size)
    finally:
        # clean up: remove every file in the current working directory whose
        # name contains this run's id, even if one of the steps above failed
        for f in os.listdir():
            if re.search(uid, f):
                os.remove(f)

    return communities, graph, Q