Пример #1
0
def consensus_clustering(datapath, stepsize, out_dir):
    """Sweep the consensus threshold and store memberships and modularities.

    The graph and the stored partitions are loaded with ``read_data``. For
    every threshold in ``np.arange(0, 1, stepsize)`` a consensus membership
    is requested from ``nwtools.consensus.consensus_partition`` and scored
    with a weighted ``leidenalg.ModularityVertexPartition``.

    Two CSV files are written into ``datapath``:

    * ``consensus_thresholds.csv``: one row per vertex (indexed by the
      vertex ``name`` attribute), one column per threshold.
    * ``thresholds_modularity.csv``: threshold / modularity pairs.

    Args:
        datapath: directory containing the input files; also the directory
            the results are written to.
        stepsize: spacing between consecutive thresholds in [0, 1).
        out_dir: currently unused.

    NOTE(review): ``out_dir`` is accepted but never read -- results go to
    ``datapath``. Confirm with callers whether that is intended.
    """
    graph, initial_partition = read_data(datapath)
    thresholds = np.arange(0, 1, stepsize)

    memberships = []
    modularities = []
    for threshold in thresholds:
        print('threshold: {}'.format(threshold))
        # The first element of the returned pair is not used here.
        _, membership = nwtools.consensus.consensus_partition(
            graph,
            weights='weight',
            initial_partition=initial_partition,
            nr_partitions=len(initial_partition),
            threshold=threshold,
            verbose=True)
        memberships.append(membership)

        scored = leidenalg.ModularityVertexPartition(
            graph,
            initial_membership=membership,
            weights='weight')
        modularities.append(scored.quality())

    # Rows are vertices, columns are thresholds.
    membership_table = pd.DataFrame(memberships).transpose()
    membership_table.columns = [
        '{:.3f}'.format(threshold) for threshold in thresholds]
    membership_table.index = graph.vs['name']
    membership_path = os.path.join(datapath, 'consensus_thresholds.csv')
    membership_table.to_csv(membership_path)

    modularity_table = pd.DataFrame({'threshold': thresholds,
                                     'modularity': modularities})
    modularity_path = os.path.join(datapath, 'thresholds_modularity.csv')
    modularity_table.to_csv(modularity_path, index=False)
Пример #2
0
    def fit(self):
        '''Cluster the graph with Leiden while keeping annotated nodes fixed.

        The first ``len(self.annotations)`` vertices of ``self.graph`` are
        treated as annotated: they start in one community per unique
        annotation and are passed to the optimiser as fixed nodes. Every
        other vertex starts as a singleton community.

        Returns:
            None, but the membership attribute is set to a numpy array of
            str with one entry per non-fixed vertex. Communities whose index
            belongs to an annotation are reported by annotation name, all
            other communities by their numeric id as a string.

        Raises:
            ValueError: if ``self.metric`` is neither 'cpm' nor 'modularity'.
        '''
        self._parse_graph()

        annotations = self.annotations
        n_fixed = len(annotations)
        g = self.graph
        n_nodes = g.vcount()

        # NOTE: initial membership is singletons except for atlas nodes, which
        # get the membership they have.
        unique_types = list(np.unique(annotations))
        n_types = len(unique_types)
        type_index = {ct: i for i, ct in enumerate(unique_types)}
        initial_membership = [
            type_index[annotations[j]] if j < n_fixed
            else n_types + (j - n_fixed)
            for j in range(n_nodes)
        ]

        if self.metric == 'cpm':
            partition_class = leidenalg.CPMVertexPartition
        elif self.metric == 'modularity':
            # ModularityVertexPartition does not take a resolution_parameter;
            # RBConfigurationVertexPartition is the modularity quality
            # function that does.
            partition_class = leidenalg.RBConfigurationVertexPartition
        else:
            raise ValueError('clustering_metric not understood: {:}'.format(
                self.metric))

        partition = partition_class(
            g,
            resolution_parameter=self.resolution_parameter,
            initial_membership=initial_membership,
        )

        # Run modified Leiden here (needs a leidenalg build whose
        # optimise_partition accepts fixed_nodes)
        opt = leidenalg.Optimiser()
        fixed_nodes = [int(i < n_fixed) for i in range(n_nodes)]
        opt.optimise_partition(partition, fixed_nodes=fixed_nodes)

        # Extract result for the non-fixed vertices only
        membership = partition.membership[n_fixed:]

        # Convert the known cell types. The translation is done on the
        # integer ids in a single pass: a fixed-width string dtype sized on
        # the type names would truncate longer numeric ids, and replacing
        # strings one type at a time could remap a type name that happens to
        # look like a number.
        self.membership = np.array(
            [unique_types[x] if x < n_types else str(x) for x in membership],
            dtype=str)
Пример #3
0
def read_data(datapath):
    """Load the graph and the stored partitions found in ``datapath``.

    Reads ``partitions.csv`` (one partition per row, empty fields are not
    converted to NaN) and the pickled igraph graph ``graph.pkl``.

    Args:
        datapath: directory holding ``partitions.csv`` and ``graph.pkl``.

    Returns:
        (graph, initial_partition): the loaded graph and a list with one
        weighted ``leidenalg.ModularityVertexPartition`` per CSV row.
    """
    csv_path = os.path.join(datapath, 'partitions.csv')
    pickle_path = os.path.join(datapath, 'graph.pkl')

    partitions_df = pd.read_csv(csv_path, na_filter=False)
    # NOTE: unpickling executes arbitrary code; only use trusted graph files.
    graph = ig.Graph.Read_Pickle(pickle_path)

    initial_partition = []
    for _, row in partitions_df.iterrows():
        partition = leidenalg.ModularityVertexPartition(
            graph,
            initial_membership=row.values,
            weights='weight')
        initial_partition.append(partition)

    return graph, initial_partition
    def community_consensus_iterative(self, C):
        """Find the consensus of a set of partitions.

        Refer to the paper 'Robust detection of dynamic community structure
        in networks', Danielle S. Bassett, Mason A. Porter, Nicholas F.
        Wymbs, Scott T. Grafton, Jean M. Carlson et al.

        Args:
            C: 2D array of shape (npart, m); row i holds the community label
                of each of the m nodes in partition i. Labels are compared
                after conversion to int.

        Returns:
            The leidenalg ModularityVertexPartition obtained by optimising
            modularity on the thresholded nodal association graph.
        """
        npart, m = C.shape
        # Nodal association matrix: entry (i, j) counts the partitions in
        # which nodes i and j share a community.
        X = np.zeros((m, m))
        # Same counts for randomly permuted partitions. This must be its own
        # array: binding it to X would accumulate real and random counts in
        # one matrix and make the threshold below meaningless.
        X_rand3 = np.zeros((m, m))

        for i in range(npart):
            labels = np.asarray(C[i, :]).ravel().astype(int)
            # randomly permute the labels of this partition
            shuffled = labels[np.random.permutation(m)]
            X += labels[:, None] == labels[None, :]
            X_rand3 += shuffled[:, None] == shuffled[None, :]

        # thresholding
        # keep only associated assignments that occur more often than half of
        # the largest off-diagonal count in the random data
        threshold = np.max(np.triu(X_rand3, 1)) / 2
        X_new3 = np.where(X > threshold, X, 0.0)

        # turn thresholded nodal association matrix into igraph
        # NOTE(review): every non-zero entry becomes an edge, so each node
        # pair appears twice ((i, j) and (j, i)) and the diagonal yields
        # self-loops. Kept as in the original; confirm this is intended.
        rows, cols = np.nonzero(X_new3)
        edge_list = list(zip(rows.tolist(), cols.tolist()))
        weight_list = X_new3[rows, cols].tolist()

        G = ig.Graph()
        G.add_vertices(m)
        G.add_edges(edge_list)
        G.es['weight'] = weight_list
        G.vs['id'] = list(range(m))

        optimiser = la.Optimiser()
        partition = la.ModularityVertexPartition(G, weights='weight')
        # n_iterations=-1: iterate until no further improvement is found
        optimiser.optimise_partition(partition, n_iterations=-1)

        return partition
Пример #5
0
    def compute_communities(self):
        '''Compute communities from a neighbor graph with fixed atlas nodes

        An undirected graph is built from ``self.neighbors`` (one list of
        neighbor indices per column of ``self.matrix``). The first
        ``self.n_fixed`` nodes start in one community per unique entry of
        ``self.cell_types`` and are kept fixed; all other nodes start as
        singletons and are clustered with Leiden.

        Returns:
            None, but SemiAnnotate.membership is set as a numpy array of str
            with size N - n_fixed with the community/cluster membership of
            all columns except the first n_fixed ones. Communities that
            correspond to a known cell type carry its name, the others their
            numeric id as a string.

        Raises:
            ImportError: if the installed leidenalg does not accept the
                ``fixed_nodes`` argument.
            ValueError: if ``self.clustering_metric`` is neither 'cpm' nor
                'modularity'.
        '''
        import inspect
        import igraph as ig
        import leidenalg

        # Check whether this version of Leiden has fixed nodes support
        opt = leidenalg.Optimiser()
        sig = inspect.getfullargspec(opt.optimise_partition)
        if 'fixed_nodes' not in sig.args:
            raise ImportError(
                'This version of the leidenalg module does not support fixed nodes. Please update to a later (development) version'
            )

        aa = self.cell_types
        n_fixed = self.n_fixed
        clustering_metric = self.clustering_metric
        resolution_parameter = self.resolution_parameter
        neighbors = self.neighbors

        _, N = self.matrix.shape

        # Construct graph from the lists of neighbors. frozenset makes
        # (i, n) and (n, i) the same edge; a node listed as its own neighbor
        # is skipped because it would collapse to a one-element "edge".
        edges_d = set()
        for i, neis in enumerate(neighbors):
            for n in neis:
                if n != i:
                    edges_d.add(frozenset((i, n)))

        edges = [tuple(e) for e in edges_d]
        g = ig.Graph(n=N, edges=edges, directed=False)

        # NOTE: initial membership is singletons except for atlas nodes, which
        # get the membership they have. np.unique returns an array (no
        # .index method), so positions are looked up through a dict.
        aau = list(np.unique(aa))
        aaun = len(aau)
        type_index = {ct: i for i, ct in enumerate(aau)}
        initial_membership = [
            type_index[aa[j]] if j < n_fixed else aaun + (j - n_fixed)
            for j in range(N)
        ]

        # Compute communities with semi-supervised Leiden
        if clustering_metric == 'cpm':
            partition_class = leidenalg.CPMVertexPartition
        elif clustering_metric == 'modularity':
            # ModularityVertexPartition does not take a resolution_parameter;
            # RBConfigurationVertexPartition is the modularity quality
            # function that does.
            partition_class = leidenalg.RBConfigurationVertexPartition
        else:
            raise ValueError('clustering_metric not understood: {:}'.format(
                clustering_metric))

        partition = partition_class(
            g,
            resolution_parameter=resolution_parameter,
            initial_membership=initial_membership,
        )

        fixed_nodes = [int(i < n_fixed) for i in range(N)]
        opt.optimise_partition(partition, fixed_nodes=fixed_nodes)
        membership = partition.membership[n_fixed:]

        # Convert the known cell types. Community ids were assigned from the
        # sorted unique list, so the same list must translate them back.
        # Translating the integers directly also avoids truncating long
        # numeric ids to the width of the longest cell type name.
        self.membership = np.array(
            [aau[x] if x < aaun else str(x) for x in membership],
            dtype=str)
Пример #6
0
    def leiden(
        self,
        axis,
        edges,
        edge_weights=None,
        metric='cpm',
        resolution_parameter=0.001,
        initial_membership=None,
        fixed_nodes=None,
    ):
        '''Graph-based Leiden clustering

        Args:
            axis (string): It must be 'samples' or 'features'.
                The Dataset.counts matrix is used and
                either samples or features are clustered.
            edges (list of pairs): list of edges to make a graph used to
                cluster. Each member of a pair is an int referring to the
                index of the sample or feature in the sample/featuresheet.
            edge_weights (list of float or None): edge weights to use for
                clustering. If None, all edge weights are 1.
            metric (str): What metric to optimize. Can be 'modularity' or
                'cpm'.
            resolution_parameter (float): a number between 0 and 1 that sets
                how easy it is to call new clusters.
            initial_membership (str or None): name of a metadata column
                containing the initial membership vector for the clustering.
                If None (default), each sample starts as a singleton.
            fixed_nodes (str or None): name of a metadata column containing
                a boolean vector for which nodes are not allowed to change
                cluster membership during the Leiden algorithm. Your version
                of leidenalg must support fixed nodes for this feature to
                work.

        Returns:
            pd.Series with the labels of the clusters.

        Raises:
            ValueError: if axis or metric is not one of the supported values.
        '''
        import igraph as ig
        import leidenalg

        if axis == 'samples':
            n_nodes = self.dataset.n_samples
            index = self.dataset.samplenames
        elif axis == 'features':
            n_nodes = self.dataset.n_features
            index = self.dataset.featurenames
        else:
            # Fail early instead of hitting an unbound local further down
            raise ValueError(
                "axis must be 'samples' or 'features', got: {!r}".format(axis))

        def _metadata_column(column):
            # Integer list taken from the metadata sheet matching the axis
            if axis == 'samples':
                sheet = self.dataset.samplesheet
            else:
                sheet = self.dataset.featuresheet
            return list(sheet[column].values.astype(int))

        g = ig.Graph(n=n_nodes, edges=edges, directed=False)
        if edge_weights is not None:
            g.es['weight'] = edge_weights
            # The partition only uses weights it is told about; storing them
            # on the graph alone would leave them ignored.
            weights = 'weight'
        else:
            weights = None

        if initial_membership is not None:
            im = _metadata_column(initial_membership)
        else:
            im = list(np.arange(n_nodes))

        if metric == 'cpm':
            partition_class = leidenalg.CPMVertexPartition
        elif metric == 'modularity':
            # ModularityVertexPartition does not take a resolution_parameter;
            # RBConfigurationVertexPartition is the modularity quality
            # function that does.
            partition_class = leidenalg.RBConfigurationVertexPartition
        else:
            raise ValueError(
                'clustering_metric not understood: {:}'.format(metric))

        partition = partition_class(
            g,
            resolution_parameter=resolution_parameter,
            initial_membership=im,
            weights=weights,
        )

        opt = leidenalg.Optimiser()

        if fixed_nodes is not None:
            fxn = _metadata_column(fixed_nodes)
            opt.optimise_partition(partition, fixed_nodes=fxn)
        else:
            opt.optimise_partition(partition)

        communities = partition.membership

        labels = pd.Series(communities, index=index)

        return labels
Пример #7
0
    def cluster_graph(self):
        '''Compute communities from a graph with fixed atlas nodes

        Atlas column ``i`` (for i < n_fixed) is represented by
        ``int(self.sizes[i])`` graph nodes, all starting in community ``i``
        and kept fixed. Every other column contributes one node that starts
        in its own community and is clustered with Leiden.

        Returns:
            None, but Averages.membership is set as a numpy array of str with
            one entry per non-fixed graph node: the atlas cell type when the
            node ended up in an atlas community, otherwise the numeric
            community id as a string.

        Raises:
            ImportError: if the installed leidenalg does not accept the
                ``fixed_nodes`` argument.
            ValueError: if the initial membership does not match the total of
                ``self.sizes``, or if ``self.clustering_metric`` is neither
                'cpm' nor 'modularity'.
        '''
        import inspect
        import leidenalg

        # Check whether this version of Leiden has fixed nodes support
        opt = leidenalg.Optimiser()
        sig = inspect.getfullargspec(opt.optimise_partition)
        if 'fixed_nodes' not in sig.args:
            raise ImportError('This version of the leidenalg module does not support fixed nodes. Please update to a later (development) version')

        sizes = self.sizes
        n_fixed = self.n_fixed
        clustering_metric = self.clustering_metric
        resolution_parameter = self.resolution_parameter
        g = self.graph

        _, N = self.matrix.shape
        # Number of graph nodes: fixed ones, and in total
        n_fixede = int(np.sum(sizes[:n_fixed]))
        Ne = int(np.sum(sizes))

        # NOTE: initial membership is singletons except for atlas nodes, which
        # get the membership they have.
        initial_membership = []
        for isi in range(N):
            if isi < n_fixed:
                initial_membership.extend([isi] * int(sizes[isi]))
            else:
                initial_membership.append(isi)

        if len(initial_membership) != Ne:
            raise ValueError('initial_membership list has wrong length!')

        # Compute communities with semi-supervised Leiden
        if clustering_metric == 'cpm':
            partition_class = leidenalg.CPMVertexPartition
        elif clustering_metric == 'modularity':
            # ModularityVertexPartition does not take a resolution_parameter;
            # RBConfigurationVertexPartition is the modularity quality
            # function that does.
            partition_class = leidenalg.RBConfigurationVertexPartition
        else:
            raise ValueError(
                'clustering_metric not understood: {:}'.format(clustering_metric))

        partition = partition_class(
            g,
            resolution_parameter=resolution_parameter,
            initial_membership=initial_membership,
        )

        fixed_nodes = [int(i < n_fixede) for i in range(Ne)]
        opt.optimise_partition(partition, fixed_nodes=fixed_nodes)
        membership = partition.membership[n_fixede:]

        # Convert the known cell types. The integer ids are translated in a
        # single pass: a fixed-width string dtype sized on the type names
        # would truncate longer numeric ids, and replacing strings one type
        # at a time could remap a type name that looks like a number.
        cell_types = list(self.cell_types)
        n_types = len(cell_types)
        self.membership = np.array(
            [cell_types[x] if x < n_types else str(x) for x in membership],
            dtype=str)
Пример #8
0
def leiden(
    adata: AnnData,
    resolution: float = 1,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_state: Optional[Union[int, RandomState]] = 0,
    key_added: str = 'leiden',
    adjacency: Optional[sparse.spmatrix] = None,
    directed: bool = True,
    use_weights: bool = True,
    n_iterations: int = -1,
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    copy: bool = False,
    **partition_kwargs,
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups with the Leiden algorithm [Traag18]_.

    The Leiden algorithm [Traag18]_ is an improved version of the Louvain
    algorithm [Blondel08]_, which was proposed for single-cell analysis by
    [Levine15]_.

    Unless `adjacency` is given, :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` must have been run beforehand.

    Parameters
    ----------
    adata
        The annotated data matrix.
    resolution
        Controls the coarseness of the clustering; higher values lead to
        more clusters. Pass `None` when `partition_type` is overridden with
        a type that does not accept a `resolution_parameter`.
    random_state
        Seed handed to the optimisation; changes its initialisation.
    restrict_to
        Tuple `(obs_key, list_of_categories)` restricting the clustering to
        the given categories of a sample annotation.
    key_added
        `adata.obs` key under which the cluster labels are stored. When
        `restrict_to` is used and the key is the default `'leiden'`,
        `'_R'` is appended.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']`.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, the edge weights of the graph are used in the computation
        (placing more emphasis on stronger edges).
    n_iterations
        How many iterations of the Leiden clustering algorithm to perform.
        Positive values above 2 define the total number of iterations to
        perform, -1 has the algorithm run until it reaches its optimal
        clustering.
    partition_type
        Type of partition to use.
        Defaults to :class:`~leidenalg.RBConfigurationVertexPartition`.
        For the available options, consult the documentation for
        :func:`~leidenalg.find_partition`.
    copy
        Whether to copy `adata` or modify it inplace.
    **partition_kwargs
        Any further arguments to pass to `~leidenalg.find_partition`
        (which in turn passes arguments to the `partition_type`).

    Returns
    -------
    `adata.obs[key_added]`
        Categorical of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.uns[key_added]['params']`
        A dict with the values for the parameters `resolution`,
        `random_state`, `n_iterations`, `use_weights`, `directed` and the
        name of the `partition_type`.
    `adata.uns[key_added]['modularity']`
        Quality of a `leidenalg.ModularityVertexPartition` built from the
        resulting membership.
    """
    try:
        import leidenalg
    except ImportError:
        raise ImportError(
            'Please install the leiden algorithm: `pip3 install leidenalg`.'
        )
    # Work on a private copy so the entries added below stay local.
    find_kwargs = dict(partition_kwargs)

    start = logg.info('running Leiden clustering')
    if copy:
        adata = adata.copy()
    restricted = restrict_to is not None

    # Pick the graph: a user-provided one or the default AnnData one.
    if adjacency is None:
        if 'neighbors' not in adata.uns:
            raise ValueError(
                'You need to run `pp.neighbors` first '
                'to compute a neighborhood graph.'
            )
        adjacency = adata.uns['neighbors']['connectivities']
    if restricted:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    g = _utils.get_igraph_from_adjacency(adjacency, directed=directed)

    # Fall back to the default partition type unless the user chose one.
    if partition_type is None:
        partition_type = leidenalg.RBConfigurationVertexPartition

    # Assemble the find_partition arguments on top of the user-provided
    # ones. The resolution is only forwarded when it is not None, so that
    # partition types without that argument remain usable.
    if use_weights:
        edge_weights = np.array(g.es['weight'])
        find_kwargs['weights'] = edge_weights.astype(np.float64)
    find_kwargs.update(n_iterations=n_iterations, seed=random_state)
    if resolution is not None:
        find_kwargs['resolution_parameter'] = resolution

    # Clustering proper.
    part = leidenalg.find_partition(g, partition_type, **find_kwargs)

    # Store the labels in adata.obs.
    groups = np.array(part.membership)
    if restricted:
        if key_added == 'leiden':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )
    categories = natsorted(np.unique(groups).astype('U'))
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=categories,
    )

    # Record the clustering parameters.
    adata.uns[key_added] = {}
    adata.uns[key_added]['params'] = {
        'resolution': resolution,
        'random_state': random_state,
        'n_iterations': n_iterations,
        'use_weights': use_weights,
        'directed': directed,
        'partition_type': partition_type.__name__,
    }

    # Modularity of the resulting membership.
    # NOTE(review): no weights are passed here even when use_weights is
    # True, so this is the unweighted modularity -- confirm that is intended.
    q = leidenalg.ModularityVertexPartition(
        g,
        initial_membership=part.membership,
    ).quality()
    adata.uns[key_added]['modularity'] = q

    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {len(np.unique(groups))} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)\n'
            f'    modularity: {q:.3f}, resolution: {resolution}\n'
            f'    added "modularity" key to adata.uns["{key_added}"]'
        ),
    )
    if copy:
        return adata
    return None