def neighbors_map(
    data: AnnData,
    roi: str,
    cell_type_key: Optional[str] = None,
    centroid_key: Optional[str] = None,
    roi_key: Optional[str] = None,
    **plot_options,
):
    """Visualize the neighbors network built in an ROI

    Args:
        data: {adata_plotting}
        roi: {roi}
        cell_type_key: {cell_type_key}
        centroid_key: {centroid_key}
        roi_key: {roi_key}
        **plot_options: Pass to :func:`point_map`

    Returns:
        The plot returned by :func:`point_map`

    """
    # cell_type_key = Config.cell_type_key if cell_type_key is None else cell_type_key
    # centroid_key = Config.centroid_key if centroid_key is None else centroid_key
    # roi_key = Config.roi_key if roi_key is None else roi_key
    ab = AnalysisBase(data,
                      cell_type_key=cell_type_key,
                      centroid_key=centroid_key,
                      roi_key=roi_key)
    iter_data = data.obs.copy()
    points = ab.get_points()
    if len(points[0]) == 3:
        raise NotImplementedError("Does not support 3D neighbor map")
    iter_data['__spatial_centroid'] = points
    roi_info = iter_data[iter_data[ab.roi_key] == roi]
    if len(roi_info) == 0:
        raise ValueError(f"ROI does not exist, roi = {roi}")
    cell_types = roi_info[ab.cell_type_key] if ab.has_cell_type else None
    internal_kwargs = dict(legend_title="Cell type", **plot_options)

    cells = np.array(roi_info['__spatial_centroid'].to_list())
    x, y = cells[:, 0], cells[:, 1]
    neighbors = read_neighbors(roi_info, "cell_neighbors")
    labels = roi_info["cell_id"].astype(int)
    nmin = labels.min()
    # keep each undirected edge once, re-indexed to start from 0
    links = []
    for l, neigh in zip(labels, neighbors):
        for n in neigh:
            if n > l:
                links.append((n - nmin, l - nmin))

    return point_map(x, y, types=cell_types, links=links, **internal_kwargs)
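# Usage sketch (illustrative, not part of this module): assumes a neighbor search has
# already been run (e.g. spatialtis' find_neighbors) so that "cell_neighbors" and
# "cell_id" exist in `obs`; the file name, ROI label, and column keys are hypothetical.
#
#   import anndata as ad
#   import spatialtis as st
#
#   data = ad.read_h5ad("cells.h5ad")
#   st.find_neighbors(data)                      # builds the neighbors network
#   neighbors_map(data, roi="ROI_1",
#                 cell_type_key="cell_type",
#                 centroid_key="centroid",
#                 roi_key="ROI")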
def NCD_marker(
    data: AnnData,
    selected_markers: Optional[Array] = None,
    importance_cutoff: Number = 0.5,
    layer_key: Optional[str] = None,
    tree_kwargs: Optional[Dict] = None,
    test_method: str = "mannwhitneyu",
    pval: Number = 0.01,
    export_key: str = "ncd_marker",
    **kwargs,
):
    """Identify neighbor-cell-dependent markers

    This method tells you the dependency between a marker and the cell types
    in its neighborhood. The dependency is calculated by fitting a gradient
    boosting regressor (here, LightGBM) and reading the feature importance.
    For important markers, a statistical test and a fold change are computed;
    the fold change compares marker expression when the cell type is / is not
    present in the neighborhood.

    Args:
        data: {adata}
        selected_markers: {selected_markers}
        importance_cutoff: Threshold to determine the feature markers
        layer_key: {layer_key}
        tree_kwargs: {tree_kwargs}
        test_method: Which test method to use, anything from :code:`scipy.stats`
        pval: {pval}
        export_key: {export_key}
        **kwargs: {analysis_kwargs}

    """
    try:
        from lightgbm import LGBMRegressor
    except ImportError:
        raise ImportError(
            "lightgbm is not installed, please try `pip install lightgbm`.")
    ab = AnalysisBase(data,
                      display_name="NCD Markers",
                      export_key=export_key,
                      **kwargs)
    ab.check_neighbors()
    ab.check_cell_type()

    tree_kwargs_ = {"n_jobs": -1, "random_state": 0, "importance_type": "gain"}
    if tree_kwargs is not None:
        for k, v in tree_kwargs.items():
            tree_kwargs_[k] = v

    markers = ab.selected_markers(selected_markers)
    markers_mask = ab.markers_col.isin(markers)

    neighbors = read_neighbors(data.obs, ab.neighbors_key)
    labels = data.obs[ab.cell_id_key]
    cell_types = data.obs[ab.cell_type_key]
    col, comps = neighbor_components(neighbors, labels.tolist(),
                                     cell_types.tolist())
    neigh_comp = pd.DataFrame(
        data=comps,
        columns=col,
        index=pd.MultiIndex.from_frame(
            data.obs[[ab.cell_type_key, ab.cell_id_key]],
            names=["type", "id"],
        ),
    )
    results_data = []
    # For markers in different cell types
    with np.errstate(divide="ignore"):
        for t, x in neigh_comp.groupby(level=["type"]):
            exp_ix = x.index.to_frame()["id"]
            exp = read_exp(data[exp_ix, markers_mask], layer_key)
            for i, y in enumerate(exp):
                # copy it to prevent memory peak according to lightgbm
                reg = LGBMRegressor(**tree_kwargs_).fit(x, y.copy())
                weights = np.asarray(reg.feature_importances_)
                weights = weights / weights.sum()
                max_ix = np.argmax(weights)
                max_weight = weights[max_ix]
                max_type = col[max_ix]
                if max_weight > importance_cutoff:
                    nx = x.copy()
                    # add expression data to the dataframe to allow slicing afterwards
                    nx["exp"] = y
                    # cells with max_type in their neighborhood
                    at_neighbor = (nx.iloc[:, max_ix] != 0)
                    at_neighbor_exp = nx[at_neighbor]["exp"].to_numpy()
                    non_at_neighbor_exp = nx[~at_neighbor]["exp"].to_numpy()
                    at_sum = at_neighbor_exp.sum()
                    non_at_sum = non_at_neighbor_exp.sum()
                    if (at_sum > 0) & (non_at_sum > 0):
                        test_result = getattr(scipy.stats, test_method)(
                            at_neighbor_exp, non_at_neighbor_exp)
                        pvalue = test_result.pvalue
                        if pvalue < pval:
                            at_mean = at_neighbor_exp.mean()
                            non_at_mean = non_at_neighbor_exp.mean()
                            log2_fc = np.log2(at_mean / non_at_mean)
                            results_data.append([
                                t,
                                markers[i],
                                max_type,
                                max_weight,
                                log2_fc,
                                pvalue,
                            ])
    ab.result = pd.DataFrame(
        data=results_data,
        columns=[
            "cell_type",
            "marker",
            "neighbor_type",
            "dependency",
            "log2_FC",
            "pval",
        ],
    )
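# Usage sketch (illustrative): requires lightgbm, cell type annotations, and a prior
# neighbor search; `find_neighbors`, the marker names, and the file name are assumptions.
#
#   import anndata as ad
#   import spatialtis as st
#
#   data = ad.read_h5ad("cells.h5ad")
#   st.find_neighbors(data)
#   NCD_marker(data, selected_markers=["CD3", "CD20"],
#              importance_cutoff=0.6, test_method="mannwhitneyu", pval=0.01)
#   # the result table (cell_type, marker, neighbor_type, dependency, log2_FC, pval)
#   # is stored under the export_key, by default "ncd_marker"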
def spatial_autocorr(
    data: AnnData,
    method: str = "moran_i",
    pval: float = 0.05,
    two_tailed: bool = True,
    layer_key: Optional[str] = None,
    export_key: str = "spatial_autocorr",
    **kwargs,
):
    """Spatial autocorrelation for every marker

    This measures how marker expression correlates with spatial location.
    Moran's I captures global spatial autocorrelation, while Geary's C is
    more sensitive to local spatial autocorrelation.

    Args:
        data: {data}
        method: "moran_i" or "geary_c" (Default: "moran_i")
        pval: {pval}
        two_tailed: Whether to use a two-tailed test for the p-value
        layer_key: {layer_key}
        export_key: {export_key}
        **kwargs: {analysis_kwargs}

    .. seealso:: :class:`spatialtis.somde`

    """
    method = options_guard(method, ['moran_i', 'geary_c'])
    ab = AnalysisBase(data,
                      method=method,
                      display_name="Spatial auto-correlation",
                      export_key=export_key,
                      **kwargs)
    track_ix = []
    results_data = []
    for roi_name, roi_data, markers, exp in ab.roi_exp_iter(
            layer_key=layer_key, desc=ab.display_name):
        neighbors = read_neighbors(roi_data, ab.neighbors_key)
        labels = roi_data[ab.cell_id_key]
        results = autocorr(
            exp.astype(np.float64),
            neighbors,
            labels=labels,
            two_tailed=two_tailed,
            pval=pval,
            method=method,
        )
        markers = markers.to_numpy()
        results = np.hstack([markers.reshape(-1, 1), results])
        track_ix += [roi_name for _ in range(len(markers))]
        results_data.append(results)
    ab.result = pd.concat(
        [
            pd.DataFrame(data=track_ix, columns=ab.exp_obs),
            pd.DataFrame(
                data=np.concatenate(results_data),
                columns=["marker", "pattern", "index_value", "pval"],
            ),
        ],
        axis=1,
    )
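# Usage sketch (illustrative): a neighbor search is needed before running this;
# `find_neighbors` and the file name are assumptions.
#
#   import anndata as ad
#   import spatialtis as st
#
#   data = ad.read_h5ad("cells.h5ad")
#   st.find_neighbors(data)
#   spatial_autocorr(data, method="moran_i", pval=0.05)
#   # result: one row per (ROI, marker) with columns marker, pattern, index_value, pval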
def NMD_marker(
    data: AnnData,
    pval: float = 0.01,
    selected_markers: Optional[Array] = None,
    importance_cutoff: Number = 0.5,
    layer_key: Optional[str] = None,
    tree_kwargs: Optional[Dict] = None,
    export_key: str = "nmd_marker",
    **kwargs,
):
    """Identify neighbor-marker-dependent markers

    The neighborhood is treated as a single cell: neighbor expression is
    summed and used to predict the expression of the center cell.

    Args:
        data: {adata}
        pval: {pval}
        selected_markers: {selected_markers}
        importance_cutoff: Threshold to determine the feature markers
        layer_key: {layer_key}
        tree_kwargs: {tree_kwargs}
        export_key: {export_key}
        **kwargs: {analysis_kwargs}

    """
    try:
        from lightgbm import LGBMRegressor
    except ImportError:
        raise ImportError(
            "lightgbm is not installed, please try `pip install lightgbm`.")
    ab = AnalysisBase(data,
                      display_name="NMD marker",
                      export_key=export_key,
                      **kwargs)
    ab.check_neighbors()

    tree_kwargs_ = {"n_jobs": -1, "random_state": 0, "importance_type": "gain"}
    if tree_kwargs is not None:
        for k, v in tree_kwargs.items():
            tree_kwargs_[k] = v

    markers = ab.selected_markers(selected_markers)
    markers_mask = ab.markers_col.isin(markers)

    neighbors = read_neighbors(data.obs, ab.neighbors_key)
    cent_exp = read_exp(data[:, markers_mask], layer_key)
    # treat the neighbors as a single cell
    # sum the expression
    neigh_exp = np.asarray(
        [read_exp(data[n, markers_mask], layer_key).sum(1) for n in neighbors])
    results_data = []
    for i, y in enumerate(
            pbar_iter(
                cent_exp,
                desc="Neighbor-dependent markers",
            )):
        reg = LGBMRegressor(**tree_kwargs_).fit(neigh_exp, y)
        weights = np.asarray(reg.feature_importances_)
        ws = weights.sum()
        if ws != 0:
            weights = weights / ws
            max_ix = np.argmax(weights)
            max_weight = weights[max_ix]
            if max_weight > importance_cutoff:
                r, pvalue = spearmanr(y, neigh_exp[:, max_ix])
                if pvalue < pval:
                    results_data.append(
                        [markers[i], markers[max_ix], max_weight, r, pvalue])

    ab.result = pd.DataFrame(
        data=results_data,
        columns=["marker", "neighbor_marker", "dependency", "corr", "pval"],
    )
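# Usage sketch (illustrative): requires lightgbm and a prior neighbor search;
# `find_neighbors`, the marker names, and the file name are assumptions.
#
#   import anndata as ad
#   import spatialtis as st
#
#   data = ad.read_h5ad("cells.h5ad")
#   st.find_neighbors(data)
#   NMD_marker(data, selected_markers=["CD3", "CD20"], importance_cutoff=0.6)
#   # result columns: marker, neighbor_marker, dependency, corr, pval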
def __init__(
    self,
    data: AnnData,
    known_pairs: Optional[pd.DataFrame] = None,
    predict_pairs: Optional[List[Tuple]] = None,
    train_partition: float = 0.9,
    gpus: Optional[int] = None,
    max_epochs: int = 10,
    lr: float = 1e-4,
    batch_size: int = 32,
    random_seed: int = 42,
    load_model: bool = False,
    **kwargs,
):
    try:
        import torch
        import torch.nn.functional as F
        from torch.nn import Flatten, Linear
        from torch_geometric.nn import GCNConv, global_max_pool
        import pytorch_lightning as pl
        from pytorch_lightning.core.lightning import LightningModule
    except ImportError:
        raise ImportError(
            "To run GCNG, please install pytorch, pytorch-lightning, "
            "torch-geometric, torch_sparse and torch_scatter.")
    if known_pairs is None:
        raise NotImplementedError(
            "Currently, you need to supply the training pairs yourself")
    if predict_pairs is None:
        raise ValueError(
            "To run the model, you must specify the `predict_pairs` "
            "and tell spatialtis the ligand-receptor pairs you want to predict."
        )
    else:
        if len(predict_pairs) < batch_size:
            raise ValueError(
                "The predict_pairs must be longer than the batch size")
    super().__init__(data, display_name="GCNG", **kwargs)

    device = "cpu"
    if gpus is None:
        cuda_count = torch.cuda.device_count()
        gpus = cuda_count
    if gpus > 0:
        device = "cuda"

    # To make pytorch an optional dependency,
    # we can only init the model from within the class
    class GCNGModel(LightningModule):

        def __init__(self, node_size, output_features, lr=lr):
            super().__init__()
            self.conv1 = GCNConv(2, 32)
            self.conv2 = GCNConv(32, 32)
            self.dense1 = Linear(output_features * node_size * 32, 512)
            self.dense2 = Linear(512, output_features)
            self.flatten = Flatten()
            self.lr = lr

            self.correct = 0
            self.test_data_len = 0
            self.acc = 0
            self.pred = []

        def forward(self, x, edge_index):
            # x, edge_index = data.x, data.edge_index
            x = self.conv1(x, edge_index)
            x = F.elu(x)
            x = self.conv2(x, edge_index)
            x = F.elu(x)
            x = torch.flatten(x)
            x = self.dense1(x)
            x = F.elu(x)
            x = self.dense2(x)
            return torch.sigmoid(x)

        def configure_optimizers(self):
            return torch.optim.Adam(self.parameters(), lr=self.lr)

        def training_step(self, train_data, batch_idx):
            x, edge_index, batch = train_data.x, train_data.edge_index, train_data.batch
            x = self(x, edge_index)
            loss_in = x.flatten()
            loss_out = train_data.y
            loss = F.binary_cross_entropy(loss_in, loss_out)
            return loss

        def test_step(self, test_data, batch_idx):
            x, edge_index, batch = test_data.x, test_data.edge_index, test_data.batch
            x = self(x, edge_index)
            pred = x.detach().cpu().numpy().flatten().round()
            truth_y = test_data.y.cpu().numpy()
            self.correct += (pred == truth_y).sum()
            self.test_data_len += len(test_data.y)
            self.acc = self.correct / self.test_data_len
            return self.acc

        def predict_step(self, predict_data, batch_idx):
            x, edge_index, batch = predict_data.x, predict_data.edge_index, predict_data.batch
            x = self(x, edge_index)
            self.pred = x.detach().cpu().numpy().flatten().round().tolist()
            return self.pred

        def release_gpu_mem(self):
            try:
                torch.cuda.empty_cache()
            except Exception:
                pass

        def on_train_end(self, *args, **kwargs):
            self.release_gpu_mem()

        def on_predict_batch_end(self, *args, **kwargs):
            self.release_gpu_mem()

    # init model and trainer first
    gc = GCNGModel(data.n_obs, batch_size, lr=lr)
    pl.seed_everything(random_seed, workers=True)
    trainer = pl.Trainer(gpus=gpus,
                         max_epochs=max_epochs,
                         deterministic=True,
                         progress_bar_refresh_rate=0,
                         weights_summary=None,
                         precision=16)

    # create neighbors pairs
    npairs = neighbors_pairs(data.obs[self.cell_id_key],
                             read_neighbors(data.obs, self.neighbors_key))
    # get exp info and create markers mapper
    # marker names will all be lowercase
    exp = data.X.T
    markers = data.var.markers
    markers = pd.Series([i.lower() for i in markers], index=markers.index)
    markers_mapper = dict(zip(markers.tolist(), range(len(markers))))

    if load_model:
        # load pre-trained model
        try:
            state = self.data.uns[MODEL_SAVE_KEY]
        except KeyError:
            raise ValueError(
                "Pre-trained model not found, please retrain the model")
        gc.load_state_dict(state)
    else:
        # find overlap genes
        lr_genes = known_pairs.iloc[:, [0, 1]].to_numpy().flatten()
        overlap_sets = overlap_genes(self.markers, lr_genes)
        filtered_pairs = known_pairs[
            known_pairs.iloc[:, 0].isin(overlap_sets)
            & known_pairs.iloc[:, 1].isin(overlap_sets)].iloc[:, [0, 1, 2]]
        if len(filtered_pairs) == 0:
            raise ValueError(
                "The genes in `known_pairs` have no overlap with genes in data"
            )
        # train the model
        train, test = train_test_split(filtered_pairs, train_partition)
        log_print(f"Training set: {len(train)}, Test set: {len(test)}")
        train_loader = graph_data_loader(train, exp, markers_mapper, npairs,
                                         device, batch_size, shuffle=True)
        test_loader = graph_data_loader(test, exp, markers_mapper, npairs,
                                        device, batch_size, shuffle=False)
        log_print("Finish loading data, start training")
        trainer.fit(gc, train_loader)
        trainer.test(dataloaders=test_loader, verbose=False)
        log_print(f"Model accuracy {gc.acc}")
        self.data.uns[MODEL_SAVE_KEY] = gc.state_dict()  # save model

    self.model = gc  # allow user to access model
    self.trainer = trainer

    # the model output is dynamically adjusted according to batch size
    # the predict step should be able to iter through all pairs
    predict_size = len(predict_pairs)
    append_amount = batch_size - predict_size % batch_size
    predict_pairs += predict_pairs[:append_amount]
    predict = pd.DataFrame(predict_pairs)
    # print(f"predict size {len(predict)}")
    # predict_loader = predict_data_loader(predict, exp, markers_mapper, npairs, device, batch_size)
    # # init the model and train
    # trainer.predict(dataloaders=predict_loader)
    # predict['relationship'] = gc.pred
    pred = []
    for i in pbar_iter(range(0, predict_size, batch_size),
                       desc="Fetching predict result"):
        predict_tmp = pd.DataFrame(predict_pairs[i:i + batch_size])
        predict_loader = predict_data_loader(predict_tmp, exp, markers_mapper,
                                             npairs, device, batch_size)
        # init the model and train
        trainer.predict(dataloaders=predict_loader)
        pred += gc.pred
        # release_gpu_mem()
    # completely release mem when exit
    gc.release_gpu_mem()
    predict['relationship'] = pred

    predict.columns = ['Gene1', 'Gene2', 'relationship']
    self.result = predict.iloc[:predict_size, :].copy()
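# Usage sketch (illustrative): the class name `GCNG`, the input file, and the pair
# lists below are assumptions; `known_pairs` is expected to carry gene1/gene2/label
# columns, and a neighbor search must be run beforehand.
#
#   import anndata as ad
#   import pandas as pd
#   import spatialtis as st
#
#   data = ad.read_h5ad("cells.h5ad")
#   st.find_neighbors(data)
#   known = pd.DataFrame({"gene1": [...], "gene2": [...], "label": [...]})
#   gcng = GCNG(data, known_pairs=known,
#               predict_pairs=[("gene_a", "gene_b"), ...],
#               max_epochs=10, batch_size=32)
#   gcng.result  # DataFrame with Gene1, Gene2, relationship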
def cell_community(
    data: AnnData,
    resolution: float = 0.05,
    partition_type: Optional[Any] = None,
    partition_kwargs: Optional[Dict] = None,
    export_key: str = "community_id",
    **kwargs,
):
    """Spatial communities detection

    Here we use the Leiden graph clustering algorithm.

    Args:
        data: {adata}
        resolution: The resolution parameter for the partition (Default: 0.05)
        partition_type: The leidenalg partition type
        partition_kwargs: Pass to :func:`leidenalg.find_partition`
        export_key: {export_key}
        **kwargs: {analysis_kwargs}

    """
    ab = AnalysisBase(data,
                      display_name="Cell community",
                      export_key=export_key,
                      **kwargs)
    # import leidenalg
    # import igraph as ig
    leidenalg = try_import("leidenalg")
    ig = try_import("igraph", install_name="python-igraph")
    ab.check_neighbors()

    if partition_type is None:
        partition_type = leidenalg.CPMVertexPartition
    if partition_kwargs is None:
        partition_kwargs = {"resolution_parameter": resolution}
    else:
        partition_kwargs = {"resolution_parameter": resolution, **partition_kwargs}

    graphs = []
    track_ix = []
    sub_comm = []
    for roi_name, roi_data, points in ab.roi_iter_with_points():
        labels = roi_data[ab.cell_id_key]
        neighbors = read_neighbors(roi_data, ab.neighbors_key)
        vertices = []
        edge_mapper = {}
        for i, (x, y) in zip(labels, points):
            vertices.append({"name": i, "x": x, "y": y})
            edge_mapper[i] = (x, y)

        # keep each undirected edge once, weighted by the distance between cells
        graph_edges = []
        for k, vs in zip(labels, neighbors):
            if len(vs) > 0:
                for v in vs:
                    if k < v:
                        distance = euclidean(edge_mapper[k], edge_mapper[v])
                        graph_edges.append({
                            "source": k,
                            "target": v,
                            "weight": distance
                        })
        graph = ig.Graph.DictList(vertices, graph_edges)
        part = leidenalg.find_partition(graph, partition_type,
                                        **partition_kwargs)
        sub_comm += part.membership
        graphs.append(graph)
        track_ix.append(roi_name)

    sub_comm = pd.Series(sub_comm, index=data.obs.index)
    col2adata_obs(sub_comm, data, ab.export_key)
    ab.stop_timer()
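# Usage sketch (illustrative): requires leidenalg and python-igraph plus a prior
# neighbor search; `find_neighbors` and the file name are assumptions.
#
#   import anndata as ad
#   import spatialtis as st
#
#   data = ad.read_h5ad("cells.h5ad")
#   st.find_neighbors(data)
#   cell_community(data, resolution=0.05)
#   # community labels are written to data.obs["community_id"]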