Пример #1
0
    def multiflip_mcmc_sweep(self,
                             n_steps=1000,
                             beta=np.inf,
                             niter=10,
                             verbose=True):
        '''
        Fit the SBM to the word-document network by repeatedly calling the
        nested state's ``multiflip_mcmc_sweep``.

        If ``self.state`` already exists it is copied (with four extra
        ``np.zeros(1)`` levels appended and ``sampling=True``); otherwise a
        fresh ``gt.NestedBlockState`` is built on ``self.g``.

        - n_steps, int (default:1000): number of sweep calls to perform.
        - beta, float (default:np.inf): forwarded to every sweep call.
        - niter, int (default:10): forwarded to every sweep call.
        - verbose, bool (default:True): print the step index before each call.

        On success this sets ``self.state``, ``self.mdl`` (the state entropy),
        ``self.L`` and ``self.groups``. When no graph has been loaded a message
        is printed and the method returns without touching any attribute.
        '''
        g = self.g
        if g is None:
            print('No data to fit the SBM. Load some data first (make_graph)')
            # Without a graph there is nothing to build a state on; stop here
            # instead of failing later inside graph-tool.
            return

        clabel = g.vp['kind']

        # NOTE(review): these arguments are assembled but are not handed to the
        # state created below -- confirm whether a freshly built
        # NestedBlockState is meant to receive them.
        state_args = {'clabel': clabel, 'pclabel': clabel}
        if "count" in g.ep:
            state_args["eweight"] = g.ep.count

        state = self.state
        if state is not None:
            state = state.copy(bs=state.get_bs() + [np.zeros(1)] * 4,
                               sampling=True)
        else:
            state = gt.NestedBlockState(g)

        for step in range(n_steps):  # this should be sufficiently large
            if verbose:
                print(f"step: {step}")
            state.multiflip_mcmc_sweep(beta=beta, niter=niter)

        self.state = state
        ## minimum description length
        self.mdl = self.state.entropy()

        ## collect group membership for each level in the hierarchy
        L = len(state.levels)
        ## L == 2: only trivial bipartite structure, keep the single level 0;
        ## otherwise omit trivial levels: l=L-1 (single group), l=L-2 (bipartite)
        self.L = 1 if L == 2 else L - 2
        self.groups = {l: self.get_groups(l=l) for l in range(self.L)}
Пример #2
0
def state_from_blocks(
    adata: AnnData,
    state_key: Optional[str] = 'nsbm',
    neighbors_key: Optional[str] = 'neighbors',
    adjacency: Optional[spmatrix] = None,
    directed: bool = False,
    use_weights: bool = False,
    deg_corr: bool = True,
):
    """
    Returns a gt state object given an AnnData

    The block assignments and model parameters are read from
    ``adata.uns['schist'][state_key]`` and a graph-tool state of the recorded
    model type is rebuilt on the neighborhood graph.

    Parameters
    ----------
    adata
        The annotated data matrix.
    state_key
        The key under which the state has been saved
    neighbors_key
        The key passed to `sc.pp.neighbors`
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times. Overridden by `recs`/`rec_types` stored
        in the saved parameters, when present.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real world networks this is the case, although this doesn't seem
        the case for KNN graphs used in scanpy. Overridden by the value
        stored in the saved parameters, when present.

    Returns
    -------

    The graph-tool state (`gt.BlockState`, `gt.PPBlockState` or
    `gt.NestedBlockState`, depending on the stored model) built from the
    saved blocks.

    Raises
    ------
    ValueError
        If no adjacency is given and `neighbors_key` is not in `adata.uns`.
    """
    bl_d = adata.uns['schist'][f'{state_key}']['blocks']
    params = adata.uns['schist'][f'{state_key}']['params']
    model = params['model']
    if model in ('nested', 'multiome_nested'):
        # one entry per hierarchy level, stored under string keys '0', '1', ...
        blocks = [bl_d[str(nl)] for nl in range(len(bl_d))]
    else:
        blocks = bl_d['0']

    if 'deg_corr' in params:
        deg_corr = params['deg_corr']

    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError(
                'You need to run `pp.neighbors` first '
                'to compute a neighborhood graph.'
            )
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']

    # The graph has to exist before the edge covariates can be read from it.
    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g = g.to_graph_tool()
    gt.remove_parallel_edges(g)

    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']

    # values recorded with the saved model take precedence
    if 'recs' in params:
        recs = params['recs']
    if 'rec_types' in params:
        rec_types = params['rec_types']

    # NOTE(review): the same `state_args` wrapper is handed to all three
    # constructors -- confirm that BlockState / PPBlockState honour it.
    state_args = dict(deg_corr=deg_corr,
                      recs=recs,
                      rec_types=rec_types)
    if model == 'flat':
        state = gt.BlockState(g, b=blocks, state_args=state_args)
    elif model == 'ppbm':
        state = gt.PPBlockState(g, b=blocks, state_args=state_args)
    else:
        state = gt.NestedBlockState(g, bs=blocks, state_args=state_args)
    return state
    
Пример #3
0
def codon_circos(cmap='tab20', filetype="pdf", reverse=False):
    """Draw the codon substitution graph on a radial hierarchy layout.

    One vertex is created per codon of every amino acid in ``aa_order``
    (skipping ``"X"``). Codons whose ``distance_str`` is 1 are connected:
    non-synonymous pairs get an edge coloured with a gradient between the two
    amino-acid colours (swapped when ``reverse`` is true), synonymous pairs
    get an opaque black edge. Summary statistics are printed and the figure is
    written to ``gt-codon-<cmap>.<filetype>``.
    """
    colour_map = plt.cm.get_cmap(cmap)
    colour_scale = ScalarMappable(norm=Normalize(vmin=0, vmax=20), cmap=colour_map)

    g_codons = gt.Graph(directed=False)
    # internal property maps, registered in the same order as before
    for prop_name, prop_type in (("codon", "string"),
                                 ("aa", "string"),
                                 ("aa_index", "int"),
                                 ("aa_color", "vector<float>"),
                                 ("codon_index", "int")):
        g_codons.vp[prop_name] = g_codons.new_vertex_property(prop_type)
    for prop_name, prop_type in (("syn", "bool"), ("grad", "vector<float>")):
        g_codons.ep[prop_name] = g_codons.new_edge_property(prop_type)

    # vertices: codons grouped by amino acid, alphabetically within a group
    for aa_index, aa in enumerate(aa_order):
        if aa != "X":
            codons_of_aa = sorted(k for k, v in codontable.items() if v == aa)
            for codon_index, codon in enumerate(codons_of_aa):
                node = g_codons.add_vertex()
                g_codons.vp.codon[node] = codon
                g_codons.vp.aa[node] = aa
                g_codons.vp.codon_index[node] = codon_index
                g_codons.vp.aa_index[node] = aa_index
                g_codons.vp.aa_color[node] = colour_scale.to_rgba(aa_index)

    # non-synonymous edges first (ref < alt), coloured by an amino-acid gradient
    for ref in g_codons.vertices():
        for alt in g_codons.vertices():
            if alt <= ref:
                continue
            codon_ref = g_codons.vp.codon[ref]
            codon_alt = g_codons.vp.codon[alt]
            if (distance_str(codon_ref, codon_alt) == 1
                    and codontable[codon_ref] != codontable[codon_alt]):
                edge = g_codons.add_edge(ref, alt)
                g_codons.ep.syn[edge] = False
                start_rgb = colour_scale.to_rgba(g_codons.vp.aa_index[ref])[:3]
                stop_rgb = colour_scale.to_rgba(g_codons.vp.aa_index[alt])[:3]
                if reverse:
                    start_rgb, stop_rgb = stop_rgb, start_rgb
                g_codons.ep.grad[edge] = [0.0, *start_rgb, 0.75, 1.0, *stop_rgb, 0.75]

    # synonymous edges afterwards (ref > alt), drawn in opaque black
    syn_color = 0.0, 0.0, 0.0, 1.0
    for ref in g_codons.vertices():
        for alt in g_codons.vertices():
            if alt >= ref:
                continue
            codon_ref = g_codons.vp.codon[ref]
            codon_alt = g_codons.vp.codon[alt]
            if (distance_str(codon_ref, codon_alt) == 1
                    and codontable[codon_ref] == codontable[codon_alt]):
                edge = g_codons.add_edge(ref, alt)
                g_codons.ep.syn[edge] = True
                g_codons.ep.grad[edge] = [0.0, *syn_color, 1.0, *syn_color]

    assert g_codons.num_vertices() == 61
    dist = gt.shortest_distance(g_codons)
    # largest pairwise shortest-path distance over all vertices
    r = max(max(dist[g_codons.vertex(i)].a) for i in g_codons.vertices())
    print('Codons graph radius : {0}'.format(r))
    print('Codons : {0} transitions out of {1} possibles.'.format(g_codons.num_edges(), 61 * 60 / 2))
    syn_array = g_codons.ep.syn.get_array()
    n_syn = sum(syn_array)
    print('Codons : {0} are synonymous and {1} are non-synonymous.'.format(n_syn,
                                                                           len(syn_array) - n_syn))

    # two-level hierarchy (amino acid, then codon index) drives the layout
    state = gt.NestedBlockState(g_codons,
                                bs=[g_codons.vp.aa_index, g_codons.vp.codon_index],
                                sampling=False)
    t = gt.get_hierarchy_tree(state)[0]
    root = t.vertex(t.num_vertices() - 1)
    tpos = gt.radial_tree_layout(t, root, weighted=True)
    cts = gt.get_hierarchy_control_points(g_codons, t, tpos)
    pos = g_codons.own_property(tpos)
    gt.graph_draw(g_codons,
                  pos=pos,
                  edge_control_points=cts,
                  edge_gradient=g_codons.ep.grad,
                  edge_pen_width=2.5,
                  vertex_text=g_codons.vp.codon,
                  vertex_anchor=0,
                  vertex_font_size=9,
                  vertex_pen_width=1.6,
                  vertex_color=(0.65, 0.65, 0.65, 1),
                  vertex_fill_color=g_codons.vp.aa_color,
                  vertex_size=25.0,
                  output="gt-codon-{0}.{1}".format(cmap, filetype))
Пример #4
0
        ecolor[e] = 'green'
    i += 1

g.edge_properties["weight"] = eweight
g.edge_properties["color"] = ecolor
'''

#pos = gt.planar_layout(g)
#pos = gt.radial_tree_layout(g, g.vertex(0))

# Options shared by the PDF and PNG renderings of each block-model fit.
block_draw_opts = dict(vertex_font_size=20,
                       vertex_size=20,
                       edge_pen_width=2,
                       vertex_text_position=0,
                       output_size=(1500, 1000),
                       fit_view=1.1)

# Three independent layout + block-model runs, each written as .pdf and .png.
for i in range(3):
    pos = gt.arf_layout(g, d=1, a=5, max_iter=0)  # good
    #pos = gt.fruchterman_reingold_layout(g, n_iter=1000)
    #pos = gt.sfdp_layout(g, C = 1)
    #pos = gt.circle_layout(g)
    #gt.graph_draw(g, pos = pos, vertex_text=g.vertex_properties["name"], vertex_font_size=20, vertex_size=10, vertex_color = 'white', vertex_fill_color = 'blue', vertex_text_position=0, output_size=(2000, 1000), output="imgs/small_graph_top_" + str(i) + ".pdf")
    #gt.graph_draw(g, pos = pos, vertex_text=g.vertex_properties["name"], vertex_font_size=20, vertex_size=10, vertex_color = 'white', vertex_fill_color = 'blue', vertex_text_position=0, output_size=(2000, 1000), output="imgs/small_graph_top_" + str(i) + ".png")

    state = gt.minimize_blockmodel_dl(g)  # , deg_corr=True, B_min = 10
    for suffix in (".pdf", ".png"):
        state.draw(pos=pos,
                   vertex_shape=state.get_blocks(),
                   vertex_text=g.vertex_properties["name"],
                   output="small_graph_top/small_graph_top_blocks_mdl_" + str(i) + suffix,
                   **block_draw_opts)
    print(i)
    #gt.draw_hierarchy(state, layout="sfdp", vertex_text=g.vertex_properties["name"], vertex_font_size=24, vertex_text_position="centered", edge_color=g.edge_properties["color"], output_size=(2000, 1000), output="small_graph_mdl.pdf", fit_view = 0.8, hide = 2)

print(vchrom)
print(np.array(vchrom))


# Fixed two-level hierarchy: the vchrom labels at the bottom, then 22 groups
# labelled 0..21 above them.
state = gt.NestedBlockState(g, [np.array(vchrom), np.arange(0, 22)])
gt.draw_hierarchy(state,
                  vertex_text=g.vertex_properties["name"],
                  vertex_font_size=24,
                  vertex_text_position="centered",
                  output_size=(2000, 1000),
                  output="small_graph_top/small_graph_top_mdl.pdf",
                  fit_view=0.8,
                  hide=2)
Пример #5
0
def nested_model(
    adata: AnnData,
    max_iterations: int = 1000000,
    epsilon: float = 0,
    equilibrate: bool = False,
    wait: int = 1000,
    nbreaks: int = 2,
    collect_marginals: bool = False,
    niter_collect: int = 10000,
    hierarchy_length: int = 10,
    deg_corr: bool = True,
    multiflip: bool = True,
    fast_model: bool = False,
    fast_tol: float = 1e-6,
    n_sweep: int = 10,
    beta: float = np.inf,
    n_init: int = 1,
    beta_range: Tuple[float] = (1., 1000.),
    steps_anneal: int = 3,
    resume: bool = False,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_seed: Optional[int] = None,
    key_added: str = 'nsbm',
    adjacency: Optional[sparse.spmatrix] = None,
    neighbors_key: Optional[str] = 'neighbors',
    directed: bool = False,
    use_weights: bool = False,
    prune: bool = False,
    return_low: bool = False,
    copy: bool = False,
    minimize_args: Optional[Dict] = None,
    equilibrate_args: Optional[Dict] = None,
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Peixoto14]_.

    Cluster cells using the nested Stochastic Block Model [Peixoto14]_,
    a hierarchical version of Stochastic Block Model [Holland83]_, performing
    Bayesian inference on node groups. NSBM should circumvent classical
    limitations of SBM in detecting small groups in large graphs
    replacing the noninformative priors used by a hierarchy of priors
    and hyperpriors.

    This requires having ran :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    max_iterations
        Maximal number of iterations to be performed by the equilibrate step.
    epsilon
        Relative changes in entropy smaller than epsilon will
        not be considered as record-breaking.
    equilibrate
        Whether or not perform the mcmc_equilibrate step.
        Equilibration should always be performed. Note, also, that without
        equilibration it won't be possible to collect marginals.
    collect_marginals
        Whether or not collect node probability of belonging
        to a specific partition.
    niter_collect
        Number of iterations to force when collecting marginals. This will
        increase the precision when calculating probabilities
    wait
        Number of iterations to wait for a record-breaking event.
        Higher values result in longer computations. Set it to small values
        when performing quick tests.
    nbreaks
        Number of iteration intervals (of size `wait`) without
        record-breaking events necessary to stop the algorithm.
    hierarchy_length
        Initial length of the hierarchy. When large values are
        passed, the top-most levels will be uninformative as they
        will likely contain the very same groups. Increase this value
        if a very large number of cells is analyzed (>100.000).
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real world networks this is the case, although this doesn't seem
        the case for KNN graphs used in scanpy.
    multiflip
        Whether to perform MCMC sweep with multiple simultaneous moves to sample
        network partitions. It may result in slightly longer runtimes, but under
        the hood it allows for a more efficient space exploration.
    fast_model
        Whether to skip initial minimization step and let the MCMC find a solution.
        This approach tend to be faster and consume less memory, but may be
        less accurate.
    fast_tol
        Tolerance for fast model convergence.
    n_sweep
        Number of iterations to be performed in the fast model MCMC greedy approach
    beta
        Inverse temperature for MCMC greedy approach
    n_init
        Number of initial minimizations to be performed. The one with smaller
        entropy is chosen. Values below 1 are treated as 1.
    beta_range
        Inverse temperature at the beginning and the end of the equilibration
    steps_anneal
        Number of steps in which the simulated annealing is performed
    resume
        Start from a previously created model, if any, without initializing a novel
        model. Resuming always enables the equilibration step.
    restrict_to
        A `(obs key, categories)` pair; when given, the adjacency is passed
        through `restrict_adjacency` so that only the matching cells are
        clustered.
    key_added
        Prefix of the `adata.obs` columns holding the cluster labels
        (`{key_added}_level_{n}`).
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6
    neighbors_key
        The key passed to `sc.pp.neighbors`
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times
    prune
        Some high levels in hierarchy may contain the same information in terms of
        cell assignments, even if they apparently have different group names. When this
        option is set to `True`, the function only returns informative levels.
        Note, however, that cell affinities are still reported for all levels. Pruning
        does not rename group levels
    return_low
        Whether or not return nsbm_level_0 in adata.obs. This level usually contains
        so many groups that it cannot be plot anyway, but it may be useful for particular
        analysis. By default it is not returned
    copy
        Whether to copy `adata` or modify it inplace.
    random_seed
        Random number to be used as seed for numpy and graph-tool. Any integer,
        including 0, is honoured; `None` leaves the generators untouched.
    minimize_args
        Extra keyword arguments forwarded to
        `gt.minimize_nested_blockmodel_dl`. The dict is not modified.
    equilibrate_args
        Extra entries for the `mcmc_equilibrate_args` passed to
        `gt.mcmc_anneal`. The keys `wait`, `nbreaks`, `max_niter`,
        `multiflip` and `mcmc_args` are always overwritten. The dict passed by
        the caller is not modified.

    Returns
    -------
    `adata.obs['{key_added}_level_{n}']`
        One categorical column per hierarchy level storing the subgroup id
        (`'0'`, `'1'`, ...) for each cell. Level 0 is only included when
        `return_low` is `True`.
    `adata.uns['nsbm']['params']`
        A dict with the values for the parameters `epsilon`, `wait`,
        `nbreaks`, `equilibrate`, `fast_model`, `collect_marginals`,
        `hierarchy_length`, `random_seed` and `prune`.
    `adata.uns['nsbm']['stats']`
        A dict with per-level entropy and modularity and, when equilibration
        is run, the values returned by `gt.mcmc_anneal`
    `adata.uns['nsbm']['cell_affinity']`
        A dict of `np.ndarray` with cell probability of belonging to a
        specific group, one entry per level
    `adata.uns['nsbm']['group_marginals']`
        Only when `collect_marginals` is `True`: per-level counts collected
        during the extra equilibration run
    `adata.uns['nsbm']['state']`
        The NestedBlockModel state object

    Returns the modified copy of `adata` when `copy` is `True`, `None`
    otherwise.

    Raises
    ------
    ValueError
        If `collect_marginals` is requested without equilibration, or if no
        adjacency is given and `neighbors_key` is not in `adata.uns`.
    """

    # Work on private copies: the equilibration options are filled in below and
    # must neither leak between calls nor alter a dict owned by the caller.
    minimize_args = dict(minimize_args) if minimize_args else {}
    equilibrate_args = dict(equilibrate_args) if equilibrate_args else {}

    if resume:
        # if the fast_model is chosen perform equilibration anyway
        # also if a model has previously created
        equilibrate = True

    if resume and ('nsbm' not in adata.uns
                   or 'state' not in adata.uns['nsbm']):
        # let the model proceed as default
        logg.warning('Resuming has been specified but a state was not found\n'
                     'Will continue with default minimization step')

        resume = False

    # `is not None` so that a seed of 0 is honoured as well
    if random_seed is not None:
        np.random.seed(random_seed)
        gt.seed_rng(random_seed)

    if collect_marginals:
        logg.warning('Collecting marginals has a large impact on running time')
        if not equilibrate:
            raise ValueError(
                "You can't collect marginals without MCMC equilibrate "
                "step. Either set `equlibrate` to `True` or "
                "`collect_marginals` to `False`")

    start = logg.info('minimizing the nested Stochastic Block Model')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError('You need to run `pp.neighbors` first '
                             'to compute a neighborhood graph.')
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to a graph-tool graph
    g = get_graph_tool_from_adjacency(adjacency, directed=directed)

    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']

    if n_init < 1:
        n_init = 1

    if fast_model:
        # do not minimize, start with a dummy state and perform only equilibrate

        states = [
            gt.NestedBlockState(g=g,
                                state_args=dict(deg_corr=deg_corr,
                                                recs=recs,
                                                rec_types=rec_types))
            for _ in range(n_init)
        ]
        for candidate in states:
            dS = 1
            while np.abs(dS) > fast_tol:
                # perform sweep until a tolerance is reached
                dS, _, _ = candidate.multiflip_mcmc_sweep(beta=beta,
                                                          niter=n_sweep)

        # keep the candidate with the smallest entropy
        state = states[np.argmin([s.entropy() for s in states])]
        bs = state.get_bs()
        logg.info('    done', time=start)

    elif resume:
        # create the state and make sure sampling is performed
        state = adata.uns['nsbm']['state'].copy(sampling=True)
        bs = state.get_bs()
        # get the graph from state
        g = state.g
    else:

        states = [
            gt.minimize_nested_blockmodel_dl(
                g,
                deg_corr=deg_corr,
                state_args=dict(recs=recs, rec_types=rec_types),
                **minimize_args) for _ in range(n_init)
        ]

        # keep the candidate with the smallest entropy
        state = states[np.argmin([s.entropy() for s in states])]
        logg.info('    done', time=start)
        bs = state.get_bs()
        if len(bs) <= hierarchy_length:
            # increase hierarchy length up to the specified value
            # according to Tiago Peixoto 10 is reasonably large as number of
            # groups decays exponentially
            bs += [np.zeros(1)] * (hierarchy_length - len(bs))
        else:
            logg.warning(
                f'A hierarchy length of {hierarchy_length} has been specified\n'
                f'but the minimized model contains {len(bs)} levels')
        # create a new state with inferred blocks; degree correction is
        # forwarded so the sampling state uses the same setting as the
        # minimization above (and as the fast-model path)
        state = gt.NestedBlockState(g,
                                    bs,
                                    state_args=dict(deg_corr=deg_corr,
                                                    recs=recs,
                                                    rec_types=rec_types),
                                    sampling=True)

    # equilibrate the Markov chain
    if equilibrate:
        logg.info('running MCMC equilibration step')
        # equlibration done by simulated annealing

        equilibrate_args['wait'] = wait
        equilibrate_args['nbreaks'] = nbreaks
        equilibrate_args['max_niter'] = max_iterations
        equilibrate_args['multiflip'] = multiflip
        equilibrate_args['mcmc_args'] = {'niter': 10}

        dS, nattempts, nmoves = gt.mcmc_anneal(
            state,
            mcmc_equilibrate_args=equilibrate_args,
            niter=steps_anneal,
            beta_range=beta_range)
    if collect_marginals and equilibrate:
        # we here only retain level_0 counts, until I can't figure out
        # how to propagate correctly counts to higher levels
        # I wonder if this should be placed after group definition or not
        logg.info('    collecting marginals')
        group_marginals = [
            np.zeros(g.num_vertices() + 1) for s in state.get_levels()
        ]

        def _collect_marginals(s):
            # callback: count, per level, how often each number of
            # non-empty groups is observed
            levels = s.get_levels()
            for l, sl in enumerate(levels):
                group_marginals[l][sl.get_nonempty_B()] += 1

        gt.mcmc_equilibrate(state,
                            wait=wait,
                            nbreaks=nbreaks,
                            epsilon=epsilon,
                            max_niter=max_iterations,
                            multiflip=True,
                            force_niter=niter_collect,
                            mcmc_args=dict(niter=10),
                            callback=_collect_marginals)
        logg.info('    done', time=start)

    # everything is in place, we need to fill all slots
    # first build an array with
    groups = np.zeros((g.num_vertices(), len(bs)), dtype=int)

    for x in range(len(bs)):
        # for each level, project labels to the vertex level
        # so that every cell has a name. Note that at this level
        # the labels are not necessarily consecutive
        groups[:, x] = state.project_partition(x, 0).get_array()

    groups = pd.DataFrame(groups).astype('category')

    # rename categories from 0 to n
    for c in groups.columns:
        new_cat_names = {
            cx: u'%s' % cn
            for cn, cx in enumerate(groups[c].cat.categories)
        }
        # assign the renamed column back: an in-place rename on a selection
        # is not guaranteed to reach the frame (and `inplace` is gone in
        # recent pandas)
        groups[c] = groups[c].cat.rename_categories(new_cat_names)

    if restrict_to is not None:
        # only the cells kept by the restriction have a row in `groups`
        groups.index = adata.obs_names[restrict_indices]
    else:
        groups.index = adata.obs_names

    # add column names
    groups.columns = [
        "%s_level_%d" % (key_added, level) for level in range(len(bs))
    ]

    # remove any column with the same key
    keep_columns = [
        x for x in adata.obs.columns
        if not x.startswith('%s_level_' % key_added)
    ]
    adata.obs = adata.obs.loc[:, keep_columns]
    # concatenate obs with new data, skipping level_0 which is usually
    # crap. In the future it may be useful to reintegrate it
    # we need it in this function anyway, to match groups with node marginals
    if return_low:
        adata.obs = pd.concat([adata.obs, groups], axis=1)
    else:
        adata.obs = pd.concat([adata.obs, groups.iloc[:, 1:]], axis=1)

    # add some unstructured info

    n_levels = len(state.levels)
    adata.uns['nsbm'] = {}
    adata.uns['nsbm']['stats'] = dict(
        level_entropy=np.array(
            [state.level_entropy(x) for x in range(n_levels)]),
        modularity=np.array([
            gt.modularity(g, state.project_partition(x, 0))
            for x in range(n_levels)
        ]))
    if equilibrate:
        adata.uns['nsbm']['stats']['dS'] = dS
        adata.uns['nsbm']['stats']['nattempts'] = nattempts
        adata.uns['nsbm']['stats']['nmoves'] = nmoves

    adata.uns['nsbm']['state'] = state

    # now add marginal probabilities.

    if collect_marginals:
        # refrain group marginals. We collected data in vector as long as
        # the number of cells, cut them into appropriate length data
        adata.uns['nsbm']['group_marginals'] = {}
        for nl, level_marginals in enumerate(group_marginals):
            idx = np.where(level_marginals > 0)[0] + 1
            adata.uns['nsbm']['group_marginals'][nl] = np.array(
                level_marginals[:np.max(idx)])

    # prune uninformative levels, if any
    if prune:
        to_remove = prune_groups(groups)
        logg.info(f'    Removing levels {to_remove}')
        adata.obs.drop(to_remove, axis='columns', inplace=True)

    # calculate log-likelihood of cell moves over the remaining levels
    # we have to calculate events at level 0 and propagate to upper levels
    logg.info('    calculating cell affinity to groups')
    levels = [
        int(x.split('_')[-1]) for x in adata.obs.columns
        if x.startswith(f'{key_added}_level')
    ]
    adata.uns['nsbm']['cell_affinity'] = dict.fromkeys(
        [str(x) for x in levels])
    p0 = get_cell_loglikelihood(state, level=0, as_prob=True)

    # level 0 is stored under the integer key 0; higher levels use string keys
    adata.uns['nsbm']['cell_affinity'][0] = p0
    l0 = "%s_level_0" % key_added
    for nl, level in enumerate(groups.columns[1:]):
        cross_tab = pd.crosstab(groups.loc[:, l0], groups.loc[:, level])
        cl = np.zeros((p0.shape[0], cross_tab.shape[1]), dtype=p0.dtype)
        for x in range(cl.shape[1]):
            # sum counts of level_0 groups corresponding to
            # this group at current level
            cl[:, x] = p0[:, np.where(cross_tab.iloc[:, x] > 0)[0]].sum(axis=1)
        adata.uns['nsbm']['cell_affinity'][str(nl + 1)] = cl / np.sum(
            cl, axis=1)[:, None]

    # last step is recording some parameters used in this analysis
    adata.uns['nsbm']['params'] = dict(
        epsilon=epsilon,
        wait=wait,
        nbreaks=nbreaks,
        equilibrate=equilibrate,
        fast_model=fast_model,
        collect_marginals=collect_marginals,
        hierarchy_length=hierarchy_length,
        random_seed=random_seed,
        prune=prune,
    )

    logg.info(
        '    finished',
        time=start,
        deep=
        (f'found {state.get_levels()[1].get_nonempty_B()} clusters at level_1, and added\n'
         f'    {key_added!r}, the cluster labels (adata.obs, categorical)'),
    )
    return adata if copy else None
Пример #6
0
def nested_model_multi(
    adatas: List[AnnData],
    deg_corr: bool = True,
    tolerance: float = 1e-6,
    n_sweep: int = 10,
    beta: float = np.inf,
    samples: int = 100,
    collect_marginals: bool = True,
    n_jobs: int = -1,
    *,
    random_seed: Optional[int] = None,
    key_added: str = 'multi_nsbm',
    adjacency: Optional[List[sparse.spmatrix]] = None,
    neighbors_key: Optional[List[str]] = ['neighbors'],
    directed: bool = False,
    use_weights: bool = False,
    save_model: Union[str, None] = None,
    copy: bool = False,
    #    minimize_args: Optional[Dict] = {},
    dispatch_backend: Optional[str] = 'processes',
    #    equilibrate_args: Optional[Dict] = {},
) -> Optional[List[AnnData]]:
    """\
    Cluster cells into subgroups using multiple modalities.

    Cluster cells using the nested Stochastic Block Model [Peixoto14]_,
    performing Bayesian inference on node groups. This function takes multiple
    experiments, possibly across different modalities, and performs joint
    clustering: every dataset contributes one layer of edges to a shared
    graph whose vertices are the union of all cell names.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first. It also requires cells having the
    same names if coming from paired experiments.

    Parameters
    ----------
    adatas
        A list of processed AnnData. Neighbors must have been already
        calculated.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real world networks this is the case, although this doesn't seem
        the case for KNN graphs used in scanpy.
    tolerance
        Tolerance for fast model convergence: sweeps are repeated until the
        absolute entropy change of a sweep is not larger than this value.
    n_sweep
        Number of iterations to be performed in the fast model MCMC greedy
        approach.
    beta
        Inverse temperature for MCMC greedy approach.
    samples
        Number of initial minimizations to be performed. Values smaller
        than 1 are treated as 1. The resulting partitions are combined
        into a consensus through a `PartitionModeState`.
    collect_marginals
        Whether to store marginal group probabilities in `adata.obsm`.
    n_jobs
        Number of parallel computations used during model initialization.
    random_seed
        Seed for numpy's random generator, from which the per-sample
        graph-tool seeds are drawn. `0` is a valid seed.
    key_added
        `adata.obs` key prefix under which to add the cluster labels.
    adjacency
        List of sparse adjacency matrices, one per dataset. Defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[connectivities_key]` for scanpy>1.4.6.
    neighbors_key
        The key passed to `sc.pp.neighbors`. If all AnnData share the same
        key, only one has to be specified (a bare string is accepted as
        well), otherwise one key per dataset must be provided.
    directed
        Passed to the conversion of each adjacency matrix into a graph. The
        shared multilayer graph itself is always undirected.
    use_weights
        Currently not used in the computation (edge weights are skipped);
        the value is only recorded among the stored parameters.
    save_model
        If provided, this will be the filename for the PartitionModeState to
        be saved (pickled; `.pkl` is appended when missing).
    copy
        Whether to copy the `adatas` or modify them inplace.
    dispatch_backend
        Soft hint passed as `prefer` to `joblib.Parallel`.

    Returns
    -------
    A list of modified copies if `copy` is `True`, otherwise `None`. Each
    AnnData is updated with the following fields.

    `adata.obs[f'{key_added}_level_{n}']`
        Categorical group label (`'0'`, `'1'`, ...) of each cell at
        hierarchy level `n`.
    `adata.obsm[f'CM_{key_added}_level_{n}']`
        Marginal probability of each cell to belong to the groups of level
        `n` (only if `collect_marginals` is `True`).
    `adata.uns['schist'][key_added]['stats']`
        A dict with `level_entropy` and `modularity`, one value per level.
    `adata.uns['schist'][key_added]['blocks']`
        A dict mapping each level (as string) to its block assignment.
    `adata.uns['schist'][key_added]['params']`
        A dict with the parameters used in this analysis.

    Raises
    ------
    ValueError
        If no dataset is given, if the number of neighbors keys is
        incompatible with the number of datasets, or if a neighborhood
        graph is missing.
    """

    # the default list of `neighbors_key` is never mutated below (names are
    # only rebound), so sharing it across calls is harmless
    if neighbors_key is None:
        neighbors_key = ['neighbors']
    elif isinstance(neighbors_key, str):
        neighbors_key = [neighbors_key]
    else:
        neighbors_key = list(neighbors_key)

    # clamp before drawing the seeds, otherwise fewer seeds than states
    # would be available
    if samples < 1:
        samples = 1

    # `is not None`: 0 is a legitimate seed
    if random_seed is not None:
        np.random.seed(random_seed)

    seeds = np.random.choice(range(samples**2), size=samples, replace=False)

    if collect_marginals and samples < 100:
        logg.warning(
            'Collecting marginals requires sufficient number of samples\n'
            f'It is now set to {samples} and should be at least 100')

    start = logg.info('minimizing the nested Stochastic Block Model')

    if copy:
        adatas = [x.copy() for x in adatas]

    n_keys = len(neighbors_key)
    n_data = len(adatas)
    if n_data == 0:
        raise ValueError('At least one AnnData object must be provided')

    # One key per dataset is needed both to fetch the graphs and to record
    # the parameters at the end, hence this is done regardless of whether a
    # custom adjacency is given.
    if n_keys == 1:
        neighbors_key = neighbors_key * n_data
    elif n_keys < n_data:
        raise ValueError(
            'The number of neighbors keys does not match '
            'the number of data matrices. Either fix this '
            'or pass a neighbor key that is shared across all modalities')

    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        adjacency = []
        for x, adata in enumerate(adatas):
            logg.info(f'getting adjacency for data {x}', time=start)
            if neighbors_key[x] not in adata.uns:
                raise ValueError('You need to run `pp.neighbors` first '
                                 'to compute a neighborhood graph for '
                                 f'data entry {x}')
            neighbors = adata.uns[neighbors_key[x]]
            if 'connectivities_key' in neighbors:
                # scanpy>1.4.6 has matrix in another slot
                adjacency.append(adata.obsp[neighbors['connectivities_key']])
            else:
                # scanpy<=1.4.6 has sparse matrix here
                adjacency.append(neighbors['connectivities'])

    # convert it to igraph and graph-tool
    graph_list = []
    for x, adata in enumerate(adatas):
        g = get_igraph_from_adjacency(adjacency[x], directed=directed)
        g = g.to_graph_tool()
        gt.remove_parallel_edges(g)
        # add cell names to graph, this will be used to create
        # layered graph
        g_names = g.new_vertex_property('string')
        for xn, name in enumerate(adata.obs_names):
            g_names[xn] = name
        g.vp['cell'] = g_names
        graph_list.append(g)

    # skip weights for now
    #    recs=[]
    #    rec_types=[]
    #    if use_weights:
    # this is not ideal to me, possibly we may need to transform
    # weights. More tests needed.
    #        recs=[g.ep.weight]
    #        rec_types=['real-normal']

    # Non-redundant list of all cell names across all modalities, in order
    # of first appearance. A plain set would give an arbitrary vertex order
    # that changes between interpreter runs, defeating `random_seed`.
    all_names = list(
        dict.fromkeys(name for adata in adatas for name in adata.obs_names))

    # create the shared graph
    union_g = gt.Graph(directed=False)
    union_g.add_vertex(len(all_names))
    u_names = union_g.new_vertex_property('string')
    for xn, name in enumerate(all_names):
        u_names[xn] = name
    union_g.vp['cell'] = u_names

    # mapping from cell name to vertex index in the unified graph
    u_cell_index = {name: xn for xn, name in enumerate(all_names)}

    # now create layers: every edge of every modality is copied into the
    # unified graph and tagged with the (1-based) index of its modality
    layer = union_g.new_edge_property('int')
    for ng, graph in enumerate(graph_list):
        cell_names = graph.vp['cell']
        for e in graph.edges():
            s_idx = u_cell_index[cell_names[e.source()]]
            t_idx = u_cell_index[cell_names[e.target()]]
            new_edge = union_g.add_edge(s_idx, t_idx)
            layer[new_edge] = ng + 1  # this is the layer label

    union_g.ep['layer'] = layer
    # DONE! now proceed with standard minimization, ish

    states = [
        gt.NestedBlockState(g=union_g,
                            base_type=gt.LayeredBlockState,
                            state_args=dict(deg_corr=deg_corr,
                                            ec=union_g.ep.layer,
                                            layers=True))
        for n in range(samples)
    ]

    def fast_min(state, beta, n_sweep, fast_tol, seed=None):
        # sweep until the entropy change of a sweep falls within tolerance
        if seed is not None:
            # plain int: seeds are drawn as numpy integers
            gt.seed_rng(int(seed))
        dS = 1
        while np.abs(dS) > fast_tol:
            dS, _, _ = state.multiflip_mcmc_sweep(beta=beta,
                                                  niter=n_sweep,
                                                  c=0.5)
        return state

    states = Parallel(n_jobs=n_jobs, prefer=dispatch_backend)(
        delayed(fast_min)(states[x], beta, n_sweep, tolerance, seeds[x])
        for x in range(samples))
    logg.info('        minimization step done', time=start)
    pmode = gt.PartitionModeState([x.get_bs() for x in states],
                                  converge=True,
                                  nested=True)
    bs = pmode.get_max_nested()
    logg.info('        consensus step done', time=start)

    if save_model:
        import pickle
        fname = save_model
        if not fname.endswith('pkl'):
            fname = f'{fname}.pkl'
        logg.info(f'Saving model into {fname}')
        with open(fname, 'wb') as fout:
            pickle.dump(pmode, fout, 2)

    # prune redundant levels at the top
    bs = [x for x in bs if len(np.unique(x)) > 1]
    bs.append(np.array([0],
                       dtype=np.int32))  #in case of type changes, check this
    state = gt.NestedBlockState(union_g,
                                bs=bs,
                                base_type=gt.LayeredBlockState,
                                state_args=dict(deg_corr=deg_corr,
                                                ec=union_g.ep.layer,
                                                layers=True))

    logg.info('    done', time=start)
    u_groups = np.unique(bs[0])
    last_group = np.max(u_groups) + 1

    if collect_marginals:
        # note that the size of this will be equal to the number of the groups in Mode
        # but some entries won't sum to 1 as in the collection there may be differently
        # sized partitions
        pv_array = pmode.get_marginal(union_g).get_2d_array(
            range(last_group)).T[:, u_groups] / samples

    groups = np.zeros((union_g.num_vertices(), len(bs)), dtype=int)

    for x in range(len(bs)):
        # for each level, project labels to the vertex level
        # so that every cell has a name. Note that at this level
        # the labels are not necessarily consecutive
        groups[:, x] = state.project_partition(x, 0).get_array()

    groups = pd.DataFrame(groups).astype('category')

    # rename categories from 0 to n
    # (reassign the column: `inplace` is gone from recent pandas)
    for c in groups.columns:
        ncat = len(groups[c].cat.categories)
        new_cat = [u'%s' % x for x in range(ncat)]
        groups[c] = groups[c].cat.rename_categories(new_cat)

    # recode block names to have consistency with group names
    i_groups = groups.astype(int)
    bs = [i_groups.iloc[:, 0].values]
    for x in range(1, groups.shape[1]):
        bs.append(
            np.where(
                pd.crosstab(i_groups.iloc[:, x - 1], i_groups.iloc[:,
                                                                   x]) > 0)[1])
    # NOTE(review): the state is rebuilt here without `base_type` and
    # `state_args`, so the statistics below are computed on this state and
    # not on a layered one -- confirm this is intended.
    state = gt.NestedBlockState(union_g, bs)
    del (i_groups)

    groups.index = all_names

    # add column names
    groups.columns = [f"{key_added}_level_{level}" for level in range(len(bs))]

    # statistics and blocks do not depend on the single dataset:
    # compute them once, every AnnData gets its own copy below
    n_levels = len(state.levels)
    level_entropy = np.array(
        [state.level_entropy(x) for x in range(n_levels)])
    modularity = np.array([
        gt.modularity(union_g, state.project_partition(x, 0))
        for x in range(n_levels)
    ])
    blocks = {
        str(nl): np.array(level.get_blocks().a)
        for nl, level in enumerate(state.get_levels())
    }

    for xn, adata in enumerate(adatas):
        # remove any column with the same key
        drop_columns = groups.columns.intersection(adata.obs.columns)
        adata.obs.drop(drop_columns, axis='columns', inplace=True)
        adata.obs = pd.concat([adata.obs, groups.loc[adata.obs_names]],
                              axis=1)

        # now add marginal probabilities.

        if collect_marginals:
            # add marginals for level 0, then sum up according to the hierarchy
            _groups = groups.loc[adata.obs_names]
            _pv_array = pd.DataFrame(
                pv_array, index=all_names).loc[adata.obs_names].values
            adata.obsm[f"CM_{key_added}_level_0"] = _pv_array
            for group in groups.columns[1:]:
                ct = pd.crosstab(_groups[_groups.columns[0]],
                                 _groups[group],
                                 normalize='index',
                                 dropna=False)
                adata.obsm[f'CM_{group}'] = _pv_array @ ct.values

        # add some unstructured info
        if 'schist' not in adata.uns:
            adata.uns['schist'] = {}

        adata.uns['schist'][f'{key_added}'] = {}
        adata.uns['schist'][f'{key_added}']['stats'] = dict(
            level_entropy=level_entropy.copy(),
            modularity=modularity.copy())

        adata.uns['schist'][f'{key_added}']['blocks'] = {
            nl: bl.copy()
            for nl, bl in blocks.items()
        }

        # last step is recording some parameters used in this analysis
        adata.uns['schist'][f'{key_added}']['params'] = dict(
            model='multiome_nested',
            use_weights=use_weights,
            neighbors_key=neighbors_key[xn],
            key_added=key_added,
            samples=samples,
            collect_marginals=collect_marginals,
            random_seed=random_seed,
            deg_corr=deg_corr,
            #            recs=recs,
            #            rec_types=rec_types
        )

    logg.info(
        '    finished',
        time=start,
        deep=(
            f'and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'),
    )
    return adatas if copy else None