Example #1
File: map.py  Project: Mr-Milk/SpatialTis
def neighbors_map(
    data: AnnData,
    roi: str,
    cell_type_key: Optional[str] = None,
    centroid_key: Optional[str] = None,
    roi_key: Optional[str] = None,
    **plot_options,
):
    """Visualize neighbors network built in a ROI

    Args:
        data: {adata_plotting}
        roi: {roi}
        cell_type_key: {cell_type_key}
        centroid_key: {centroid_key}
        roi_key: {roi_key}
        **plot_options: Pass to the plotting function

    Returns:
        The plot created by :func:`point_map`

    """
    # cell_type_key = Config.cell_type_key if cell_type_key is None else cell_type_key
    # centroid_key = Config.centroid_key if centroid_key is None else centroid_key
    # roi_key = Config.roi_key if roi_key is None else roi_key
    ab = AnalysisBase(data,
                      cell_type_key=cell_type_key,
                      centroid_key=centroid_key,
                      roi_key=roi_key)

    iter_data = data.obs.copy()
    points = ab.get_points()
    if len(points[0]) == 3:
        raise NotImplementedError("Does not support 3D neighbor map")
    iter_data['__spatial_centroid'] = points
    roi_info = iter_data[iter_data[ab.roi_key] == roi]

    if len(roi_info) == 0:
        raise ValueError(f"ROI not exist, roi = {roi}")
    cell_types = roi_info[ab.cell_type_key] if ab.has_cell_type else None

    internal_kwargs = dict(legend_title="Cell type", **plot_options)

    cells = np.array(roi_info['__spatial_centroid'].to_list())
    x, y = cells[:, 0], cells[:, 1]
    neighbors = read_neighbors(roi_info, "cell_neighbors")
    labels = roi_info["cell_id"].astype(int)
    nmin = labels.min()
    links = []
    for lab, neigh in zip(labels, neighbors):
        for n in neigh:
            if n > lab:
                links.append((n - nmin, lab - nmin))

    return point_map(x, y, types=cell_types, links=links, **internal_kwargs)
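
A minimal usage sketch for the example above. It assumes neighbors_map is importable from spatialtis' plotting module and that the AnnData object has already been processed with spatialtis (centroids and cell neighbors computed); every key, ROI name, and file name below is hypothetical:

# Hypothetical usage; adjust key names to your own AnnData layout
import anndata as ad
from spatialtis.plotting import neighbors_map  # assumed import path

adata = ad.read_h5ad("sample.h5ad")  # assumed: centroids and neighbors already computed
neighbors_map(
    adata,
    roi="ROI_1",                # must match a value in the ROI column
    cell_type_key="cell_type",  # hypothetical column names
    centroid_key="centroid",
    roi_key="ROI",
)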
Example #2
def NCD_marker(
    data: AnnData,
    selected_markers: Optional[Array] = None,
    importance_cutoff: Number = 0.5,
    layer_key: Optional[str] = None,
    tree_kwargs: Optional[Dict] = None,
    test_method: str = "mannwhitneyu",
    pval: Number = 0.01,
    export_key: str = "ncd_marker",
    **kwargs,
):
    """Identify neighbor cells dependent marker

    This method tells you the dependency between markers and its neighbor cell type.
    The dependency is calculated by building a gradiant boosting tree (in here lightgbm) to determine
    the feature importance. A statistic test and fold change will be calculated for importance markers and its
    neighbor cells, the fold change is between marker with cell type at / not at the neighborhood.

    Args:
        data: {adata}
        selected_markers: {selected_markers}
        importance_cutoff: Threshold to determine the feature markers
        layer_key: {layer_key}
        tree_kwargs: {tree_kwargs}
        test_method: Which test method to use, any test from :code:`scipy.stats`
        pval: {pval}
        export_key: {export_key}
        **kwargs: {analysis_kwargs}

    """

    try:
        from lightgbm import LGBMRegressor
    except ImportError:
        raise ImportError(
            "lightgbm is not installed, please try `pip install lightgbm`.")
    ab = AnalysisBase(data,
                      display_name="NCD Markers",
                      export_key=export_key,
                      **kwargs)
    ab.check_neighbors()
    ab.check_cell_type()

    tree_kwargs_ = {"n_jobs": -1, "random_state": 0, "importance_type": "gain"}
    if tree_kwargs is not None:
        for k, v in tree_kwargs.items():
            tree_kwargs_[k] = v

    markers = ab.selected_markers(selected_markers)
    markers_mask = ab.markers_col.isin(markers)

    neighbors = read_neighbors(data.obs, ab.neighbors_key)
    labels = data.obs[ab.cell_id_key]
    cell_types = data.obs[ab.cell_type_key]
    col, comps = neighbor_components(neighbors, labels.tolist(),
                                     cell_types.tolist())
    neigh_comp = pd.DataFrame(
        data=comps,
        columns=col,
        index=pd.MultiIndex.from_frame(
            data.obs[[ab.cell_type_key, ab.cell_id_key]],
            names=["type", "id"],
        ),
    )
    results_data = []
    # For markers in different cell types
    with np.errstate(divide="ignore"):
        for t, x in neigh_comp.groupby(level=["type"]):
            exp_ix = x.index.to_frame()["id"]
            exp = read_exp(data[exp_ix, markers_mask], layer_key)
            for i, y in enumerate(exp):
                # copy it to prevent memory peak according to lightgbm
                reg = LGBMRegressor(**tree_kwargs_).fit(x, y.copy())
                weights = np.asarray(reg.feature_importances_)
                weights = weights / weights.sum()
                max_ix = np.argmax(weights)
                max_weight = weights[max_ix]
                max_type = col[max_ix]
                if max_weight > importance_cutoff:
                    nx = x.copy()
                    # add expression data to the dataframe so it can be subset afterwards
                    nx["exp"] = y
                    # cells with max_type at neighbors
                    at_neighbor = (nx.iloc[:, max_ix] != 0)
                    at_neighbor_exp = nx[at_neighbor]["exp"].to_numpy()
                    non_at_neighbor_exp = nx[~at_neighbor]["exp"].to_numpy()
                    at_sum = at_neighbor_exp.sum()
                    non_at_sum = non_at_neighbor_exp.sum()
                    if (at_sum > 0) & (non_at_sum > 0):
                        test_result = getattr(scipy.stats, test_method)(
                            at_neighbor_exp, non_at_neighbor_exp)
                        pvalue = test_result.pvalue
                        if pvalue < pval:
                            at_mean = at_neighbor_exp.mean()
                            non_at_mean = non_at_neighbor_exp.mean()
                            log2_fc = np.log2(at_mean / non_at_mean)
                            results_data.append([
                                t,
                                markers[i],
                                max_type,
                                max_weight,
                                log2_fc,
                                pvalue,
                            ])
    ab.result = pd.DataFrame(
        data=results_data,
        columns=[
            "cell_type",
            "marker",
            "neighbor_type",
            "dependency",
            "log2_FC",
            "pval",
        ],
    )
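
A short call sketch, assuming NCD_marker is exposed at the spatialtis top level and that cell types and neighbors have already been computed on the AnnData object; the export location shown is inferred from export_key and may differ:

# Hypothetical usage of NCD_marker
import anndata as ad
import spatialtis as st  # assumed import path

adata = ad.read_h5ad("sample.h5ad")  # assumed: neighbors and cell types already present
st.NCD_marker(
    adata,
    importance_cutoff=0.6,       # keep only markers with a strong neighbor dependency
    test_method="mannwhitneyu",
    pval=0.01,
    export_key="ncd_marker",
)
# the result table (cell_type, marker, neighbor_type, dependency, log2_FC, pval)
# is assumed to be written back under the export key, e.g. adata.uns["ncd_marker"]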
Example #3
def spatial_autocorr(
    data: AnnData,
    method: str = "moran_i",
    pval: float = 0.05,
    two_tailed: bool = True,
    layer_key: Optional[str] = None,
    export_key: str = "spatial_autocorr",
    **kwargs,
):
    """Spatial auto-correlation for every markers

    This is used measure the correlation of marker expression with spatial locations.

    Moran's I is more for global spatial autocorrelation,
    Geary's C is more for local spatial autocorrelation

    Args:
        data: {data}
        method: "moran_i" or "geary_c" (Default: "moran_i")
        pval: {pval}
        two_tailed: Whether to use a two-tailed test for the p-value
        layer_key: {layer_key}
        export_key: {export_key}
        **kwargs: {analysis_kwargs}

    .. seealso:: :class:`spatialtis.somde`

    """
    method = options_guard(method, ['moran_i', 'geary_c'])
    ab = AnalysisBase(data,
                      method=method,
                      display_name="Spatial auto-correlation",
                      export_key=export_key,
                      **kwargs)
    track_ix = []
    results_data = []
    for roi_name, roi_data, markers, exp in ab.roi_exp_iter(
            layer_key=layer_key, desc=ab.display_name):
        neighbors = read_neighbors(roi_data, ab.neighbors_key)
        labels = roi_data[ab.cell_id_key]
        results = autocorr(
            exp.astype(np.float64),
            neighbors,
            labels=labels,
            two_tailed=two_tailed,
            pval=pval,
            method=method,
        )
        markers = markers.to_numpy()
        results = np.hstack([markers.reshape(-1, 1), results])
        track_ix += [roi_name for _ in range(len(markers))]
        results_data.append(results)

    ab.result = pd.concat(
        [
            pd.DataFrame(data=track_ix, columns=ab.exp_obs),
            pd.DataFrame(
                data=np.concatenate(results_data),
                columns=["marker", "pattern", "index_value", "pval"],
            ),
        ],
        axis=1,
    )
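
A short call sketch, assuming spatial_autocorr is exposed at the spatialtis top level and that neighbors have already been computed; parameter values are illustrative:

# Hypothetical usage of spatial_autocorr
import anndata as ad
import spatialtis as st  # assumed import path

adata = ad.read_h5ad("sample.h5ad")  # assumed: neighbors already computed
st.spatial_autocorr(adata, method="moran_i", pval=0.05, two_tailed=True)
st.spatial_autocorr(adata, method="geary_c")  # switch the statistic via `method`
# per-ROI results (marker, pattern, index_value, pval) are assumed to be stored
# under the export key, e.g. adata.uns["spatial_autocorr"]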
Example #4
def NMD_marker(
    data: AnnData,
    pval: float = 0.01,
    selected_markers: Optional[Array] = None,
    importance_cutoff: Number = 0.5,
    layer_key: Optional[str] = None,
    tree_kwargs: Optional[Dict] = None,
    export_key: str = "nmd_marker",
    **kwargs,
):
    """Identify neighbor markers dependent marker

    The neighborhood is treated as a single cell.

    Args:
        data: {adata}
        pval: {pval}
        selected_markers: {selected_markers}
        importance_cutoff: Threshold to determine the feature markers
        layer_key: {layer_key}
        tree_kwargs: {tree_kwargs}
        export_key: {export_key}
        **kwargs: {analysis_kwargs}

    """
    try:
        from lightgbm import LGBMRegressor
    except ImportError:
        raise ImportError(
            "lightgbm is not installed, please try `pip install lightgbm`.")
    ab = AnalysisBase(data,
                      display_name="NMD marker",
                      export_key=export_key,
                      **kwargs)
    ab.check_neighbors()

    tree_kwargs_ = {"n_jobs": -1, "random_state": 0, "importance_type": "gain"}
    if tree_kwargs is not None:
        for k, v in tree_kwargs.items():
            tree_kwargs_[k] = v

    markers = ab.selected_markers(selected_markers)
    markers_mask = ab.markers_col.isin(markers)
    neighbors = read_neighbors(data.obs, ab.neighbors_key)
    cent_exp = read_exp(data[:, markers_mask], layer_key)
    # treat the neighbors as single cell
    # sum the expression
    neigh_exp = np.asarray(
        [read_exp(data[n, markers_mask], layer_key).sum(1) for n in neighbors])
    results_data = []
    for i, y in enumerate(
            pbar_iter(
                cent_exp,
                desc="Neighbor-dependent markers",
            )):
        reg = LGBMRegressor(**tree_kwargs_).fit(neigh_exp, y)
        weights = np.asarray(reg.feature_importances_)
        ws = weights.sum()
        if ws != 0:
            weights = weights / weights.sum()
            max_ix = np.argmax(weights)
            max_weight = weights[max_ix]
            if max_weight > importance_cutoff:
                r, pvalue = spearmanr(y, neigh_exp[:, max_ix])
                if pvalue < pval:
                    results_data.append(
                        [markers[i], markers[max_ix], max_weight, r, pvalue])

    ab.result = pd.DataFrame(
        data=results_data,
        columns=["marker", "neighbor_marker", "dependency", "corr", "pval"],
    )
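
A short call sketch, assuming NMD_marker is exposed at the spatialtis top level; lightgbm must be installed and neighbors must already be computed:

# Hypothetical usage of NMD_marker
import anndata as ad
import spatialtis as st  # assumed import path

adata = ad.read_h5ad("sample.h5ad")  # assumed: neighbors already computed
st.NMD_marker(
    adata,
    importance_cutoff=0.5,              # minimum feature importance to keep a marker pair
    pval=0.01,
    tree_kwargs={"n_estimators": 200},  # forwarded to LGBMRegressor
)
# result columns: marker, neighbor_marker, dependency, corr, pval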
Example #5
    def __init__(
        self,
        data: AnnData,
        known_pairs: Optional[pd.DataFrame] = None,
        predict_pairs: Optional[List[Tuple]] = None,
        train_partition: float = 0.9,
        gpus: Optional[int] = None,
        max_epochs: int = 10,
        lr: float = 1e-4,
        batch_size: int = 32,
        random_seed: int = 42,
        load_model: bool = False,
        **kwargs,
    ):
        try:
            import torch
            import torch.nn.functional as F
            from torch.nn import Flatten, Linear
            from torch_geometric.nn import GCNConv, global_max_pool
            import pytorch_lightning as pl
            from pytorch_lightning.core.lightning import LightningModule
        except ImportError:
            raise ImportError(
                "To run GCNG, please install pytorch, pytorch-lightning, "
                "torch-geometric, torch_sparse and torch_scatter.")
        if known_pairs is None:
            raise NotImplementedError(
                "Currently, you need to supply the training pairs yourself")
        if predict_pairs is None:
            raise ValueError(
                "To run the model, you must specify the `predict_pairs` "
                "and tell spatialtis the ligand-receptor pairs you want to predict."
            )
        else:
            if len(predict_pairs) < batch_size:
                raise ValueError(
                    "The predict_pairs must be at least as long as the batch size")
        super().__init__(data, display_name="GCNG", **kwargs)
        device = "cpu"
        if gpus is None:
            cuda_count = torch.cuda.device_count()
            gpus = cuda_count
            if gpus > 0:
                device = "cuda"

        # To make pytorch an optional dependency,
        # we can only define the model inside this function
        class GCNGModel(LightningModule):
            def __init__(self, node_size, output_features, lr=lr):
                super().__init__()
                self.conv1 = GCNConv(2, 32)
                self.conv2 = GCNConv(32, 32)
                self.dense1 = Linear(output_features * node_size * 32, 512)
                self.dense2 = Linear(512, output_features)
                self.flatten = Flatten()

                self.lr = lr
                self.correct = 0
                self.test_data_len = 0
                self.acc = 0
                self.pred = []

            def forward(self, x, edge_index):
                # x, edge_index = data.x, data.edge_index
                x = self.conv1(x, edge_index)
                x = F.elu(x)
                x = self.conv2(x, edge_index)
                x = F.elu(x)
                x = torch.flatten(x)
                x = self.dense1(x)
                x = F.elu(x)
                x = self.dense2(x)

                return torch.sigmoid(x)

            def configure_optimizers(self):
                return torch.optim.Adam(self.parameters(), lr=self.lr)

            def training_step(self, train_data, batch_idx):
                x, edge_index, batch = train_data.x, train_data.edge_index, train_data.batch
                x = self(x, edge_index)
                loss_in = x.flatten()
                loss_out = train_data.y
                loss = F.binary_cross_entropy(loss_in, loss_out)
                return loss

            def test_step(self, test_data, batch_idx):
                x, edge_index, batch = test_data.x, test_data.edge_index, test_data.batch
                x = self(x, edge_index)
                pred = x.detach().cpu().numpy().flatten().round()
                truth_y = test_data.y.cpu().numpy()
                self.correct += (pred == truth_y).sum()
                self.test_data_len += len(test_data.y)
                self.acc = self.correct / self.test_data_len
                return self.acc

            def predict_step(self, predict_data, batch_idx):
                x, edge_index, batch = predict_data.x, predict_data.edge_index, predict_data.batch
                x = self(x, edge_index)
                self.pred = x.detach().cpu().numpy().flatten().round().tolist()
                return self.pred

            def release_gpu_mem(self):
                try:
                    torch.cuda.empty_cache()
                except Exception:
                    pass

            def on_train_end(self, *args, **kwargs):
                self.release_gpu_mem()

            def on_predict_batch_end(self, *args, **kwargs):
                self.release_gpu_mem()

        # init model and trainer first
        gc = GCNGModel(data.n_obs, batch_size, lr=lr)
        pl.seed_everything(random_seed, workers=True)
        trainer = pl.Trainer(gpus=gpus,
                             max_epochs=max_epochs,
                             deterministic=True,
                             progress_bar_refresh_rate=0,
                             weights_summary=None,
                             precision=16)
        # create neighbors pairs
        npairs = neighbors_pairs(data.obs[self.cell_id_key],
                                 read_neighbors(data.obs, self.neighbors_key))
        # get exp info and create markers mapper
        # markers' name will all be lowercase
        exp = data.X.T
        markers = data.var.markers
        markers = pd.Series([i.lower() for i in markers], index=markers.index)
        markers_mapper = dict(zip(markers.tolist(), range(len(markers))))

        if load_model:  # load pre-trained model
            try:
                state = self.data.uns[MODEL_SAVE_KEY]
            except KeyError:
                raise ValueError(
                    "Pre-trained model not found, please retrain the model")
            gc.load_state_dict(state)
        else:
            # find overlap genes
            lr_genes = known_pairs.iloc[:, [0, 1]].to_numpy().flatten()
            overlap_sets = overlap_genes(self.markers, lr_genes)
            filtered_pairs = known_pairs[
                known_pairs.iloc[:, 0].isin(overlap_sets)
                & known_pairs.iloc[:, 1].isin(overlap_sets)].iloc[:, [0, 1, 2]]
            if len(filtered_pairs) == 0:
                raise ValueError(
                    "The gene in `known_pairs` has no overlap with genes in data"
                )
            # train the model
            train, test = train_test_split(filtered_pairs, train_partition)
            log_print(f"Training set: {len(train)}, Test set: {len(test)}")

            train_loader = graph_data_loader(train,
                                             exp,
                                             markers_mapper,
                                             npairs,
                                             device,
                                             batch_size,
                                             shuffle=True)
            test_loader = graph_data_loader(test,
                                            exp,
                                            markers_mapper,
                                            npairs,
                                            device,
                                            batch_size,
                                            shuffle=False)
            log_print("Finish loading data, start training")
            trainer.fit(gc, train_loader)
            trainer.test(dataloaders=test_loader, verbose=False)
            log_print(f"Model accuracy {gc.acc}")
            self.data.uns[MODEL_SAVE_KEY] = gc.state_dict()  # save model
            self.model = gc  # allow user to access model
            self.trainer = trainer

        # the model output size is dynamically adjusted according to the batch size,
        # so the predict step must iterate through all pairs in batch-size chunks
        predict_size = len(predict_pairs)
        append_amount = batch_size - predict_size % batch_size
        predict_pairs += predict_pairs[:append_amount]
        predict = pd.DataFrame(predict_pairs)
        # print(f"predict size {len(predict)}")
        # predict_loader = predict_data_loader(predict, exp, markers_mapper, npairs, device, batch_size)
        # # init the model and train
        # trainer.predict(dataloaders=predict_loader)
        # predict['relationship'] = gc.pred

        pred = []
        for i in pbar_iter(range(0, predict_size, batch_size),
                           desc="Fetching predict result"):
            predict_tmp = pd.DataFrame(predict_pairs[i:i + batch_size])
            predict_loader = predict_data_loader(predict_tmp, exp,
                                                 markers_mapper, npairs,
                                                 device, batch_size)
            # run prediction on this batch
            trainer.predict(dataloaders=predict_loader)
            pred += gc.pred
        #     release_gpu_mem()
        # completely release mem when exit
        gc.release_gpu_mem()
        predict['relationship'] = pred
        predict.columns = ['Gene1', 'Gene2', 'relationship']
        self.result = predict.iloc[:predict_size, :].copy()
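
A short construction sketch for this __init__. The enclosing class name is not shown in the snippet; GCNG is assumed from the display name, and the import path, column layout of known_pairs, and gene names are all hypothetical:

# Hypothetical usage of the GCNG analysis class
import pandas as pd
import anndata as ad
from spatialtis import GCNG  # assumed class name and import path

adata = ad.read_h5ad("sample.h5ad")  # assumed: cell ids and neighbors already computed
known = pd.DataFrame(
    [["GeneA", "GeneB", 1.0], ["GeneC", "GeneD", 0.0]],  # gene1, gene2, label
    columns=["gene1", "gene2", "relationship"],
)
predict = [("GeneE", "GeneF"), ("GeneG", "GeneH")] * 20  # must not be shorter than batch_size
gcng = GCNG(adata, known_pairs=known, predict_pairs=predict,
            max_epochs=10, batch_size=32, random_seed=42)
print(gcng.result.head())  # columns: Gene1, Gene2, relationship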
Example #6
def cell_community(
    data: AnnData,
    resolution: float = 0.05,
    partition_type: Optional[Any] = None,
    partition_kwargs: Optional[Dict] = None,
    export_key: str = "community_id",
    **kwargs,
):
    """Spatial communities detection

    Here we use Leiden graph cluster algorithm

    Args:
        data: {adata}
        resolution: The resolution parameter for the Leiden partition
        partition_type: The leidenalg partition type
        partition_kwargs: Passed to :code:`leidenalg.find_partition`
        export_key: {export_key}
        **kwargs: {analysis_kwargs}

    """

    ab = AnalysisBase(data,
                      display_name="Cell community",
                      export_key=export_key,
                      **kwargs)

    # import leidenalg
    # import igraph as ig
    leidenalg = try_import("leidenalg")
    ig = try_import("igraph", install_name="python-igraph")

    ab.check_neighbors()

    if partition_type is None:
        partition_type = leidenalg.CPMVertexPartition
    if partition_kwargs is None:
        partition_kwargs = {"resolution_parameter": resolution}
    else:
        partition_kwargs = {"resolution_parameter": 0.05, **partition_kwargs}

    graphs = []
    track_ix = []
    sub_comm = []
    for roi_name, roi_data, points in ab.roi_iter_with_points():
        labels = roi_data[ab.cell_id_key]
        neighbors = read_neighbors(roi_data, ab.neighbors_key)
        vertices = []
        edge_mapper = {}
        for i, (x, y) in zip(labels, points):
            vertices.append({"name": i, "x": x, "y": y})
            edge_mapper[i] = (x, y)

        graph_edges = []
        for k, vs in zip(labels, neighbors):
            if len(vs) > 0:
                for v in vs:
                    if k < v:
                        distance = euclidean(edge_mapper[k], edge_mapper[v])
                        graph_edges.append({
                            "source": k,
                            "target": v,
                            "weight": distance
                        })
        graph = ig.Graph.DictList(vertices, graph_edges)
        part = leidenalg.find_partition(graph, partition_type,
                                        **partition_kwargs)
        sub_comm += part.membership
        graphs.append(graph)
        track_ix.append(roi_name)

    sub_comm = pd.Series(sub_comm, index=data.obs.index)
    col2adata_obs(sub_comm, data, ab.export_key)
    ab.stop_timer()
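
A short call sketch, assuming cell_community is exposed at the spatialtis top level; leidenalg and python-igraph must be installed, and neighbors must already be computed:

# Hypothetical usage of cell_community
import anndata as ad
import spatialtis as st  # assumed import path

adata = ad.read_h5ad("sample.h5ad")  # assumed: find_neighbors already run
st.cell_community(adata, resolution=0.05)
# community labels are written back per cell, e.g. adata.obs["community_id"]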