def download(self, fpath: Optional[PathLike] = None, **kwargs: Any) -> Any:
    """
    Download the dataset into ``fpath``.

    Parameters
    ----------
    fpath
        Destination file. When `None`, ``self.path`` is used. ``self._extension``
        is appended when the path does not already end with it.
    kwargs
        Forwarded to ``self._download``.

    Returns
    -------
    The object returned by ``self._download``.

    Raises
    ------
    ValueError
        If ``self.shape`` is not `None` and differs from the shape of the data.
    """
    target = str(fpath if fpath is not None else self.path)
    if not target.endswith(self._extension):
        target = target + self._extension

    if not os.path.isfile(target):
        logg.debug(
            f"Downloading dataset `{self.name}` from `{self.url}` as `{target}`"
        )
    else:
        logg.debug(f"Loading dataset `{self.name}` from `{target}`")

    parent = Path(target).parent
    try:
        if not parent.is_dir():
            logg.info(f"Creating directory `{parent}`")
            parent.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        # best effort: the failure is only logged, the download is still attempted
        logg.error(f"Unable to create directory `{parent}`. Reason `{exc}`")

    data = self._download(fpath=target, backup_url=self.url, **kwargs)

    expected_shape = self.shape
    if expected_shape is not None and data.shape != expected_shape:
        raise ValueError(
            f"Expected the data to have shape `{expected_shape}`, found `{data.shape}`."
        )

    return data
def compute_distances(self) -> sp.csr_matrix:
    """Compute the distances between clonotypes.

    `prepare` must have been run previously.

    Returns
    -------
    A clonotype x clonotype sparse distance matrix, assembled by stacking the
    rows produced by ``self._dist_for_clonotype`` and dropping explicit zeros.
    """
    start = logging.info(
        "Computing clonotype x clonotype distances.")  # type: ignore
    n_clonotypes = self.clonotypes.shape[0]

    # only use multiprocessing for sufficiently large datasets
    # for small datasets the overhead is too large for a benefit
    if self.n_jobs == 1 or n_clonotypes <= 2 * self.chunksize:
        dist_rows = tqdm(
            (self._dist_for_clonotype(i) for i in range(n_clonotypes)),
            total=n_clonotypes,
        )
    else:
        logging.info(
            "NB: Computation happens in chunks. The progressbar only advances "
            "when a chunk has finished. ")  # type: ignore

        dist_rows = process_map(
            self._dist_for_clonotype,
            range(n_clonotypes),
            max_workers=self.n_jobs if self.n_jobs is not None else cpu_count(),
            # use the configured chunk size, i.e. the same value that decides
            # above whether multiprocessing is worthwhile at all
            chunksize=self.chunksize,
            tqdm_class=tqdm,
        )

    dist = sp.vstack(dist_rows)
    dist.eliminate_zeros()
    logging.hint("Done computing clonotype x clonotype distances. ", time=start)
    return dist  # type: ignore
def add_image(self, layer: str) -> bool:
    """
    Show one layer of the image container as a :mod:`napari` image layer.

    Parameters
    ----------
    layer
        Layer in the underlying's :class:`ImageContainer` which contains the image.

    Returns
    -------
    `True` if the layer has been added, otherwise `False`. `False` is returned
    when a layer of that name is already shown or when the last axis of the
    image has more than 4 entries.
    """
    if layer in self.view.layernames:
        self._handle_already_present(layer)
        return False

    data_array = self.model.container.data[layer]
    # bring the array into (y, x, <remaining dims>) order before extracting values
    img: np.ndarray = data_array.transpose("y", "x", ...).values

    n_last = img.shape[-1]
    if n_last > 4:
        logg.warning(f"Unable to show image of shape `{img.shape}`")
        return False

    logg.info(f"Creating image `{layer}` layer")
    self.view.viewer.add_image(
        img_as_float(img),
        name=layer,
        rgb=True,
        colormap=self.model.cmap,
        blending=self.model.blending,
    )

    return True
def compute_distances(
    self,
    n_jobs: Union[int, None] = None,
):
    """Computes the distances between CDR3 sequences

    Stores a ``dist_mat`` entry for every receptor arm in ``self.index_dict``
    and writes the resulting cell x cell distance matrix (CSR format) to
    ``self._dist_mat``.

    Parameters
    ----------
    n_jobs
        Number of CPUs to use for alignment and levenshtein distance.
        Default: use all CPUS.
    """
    for arm, arm_dict in self.index_dict.items():
        arm_dict["dist_mat"] = tcr_dist(
            arm_dict["unique_seqs"],
            metric=self.metric,
            cutoff=self.cutoff,
            n_jobs=n_jobs,
        )
        logging.info("Finished computing {} pairwise distances.".format(arm))

    n_cells = self.adata.n_obs
    entries = list(self._cell_dist_mat_reduce())
    if entries:
        coords, values = zip(*entries)
        rows, cols = zip(*coords)
        dist_mat = coo_matrix((values, (rows, cols)), shape=(n_cells, n_cells))
    else:
        # Nothing was yielded by the reduction: `zip(*[])` cannot be unpacked
        # into two names, so build an empty matrix of the right shape instead.
        dist_mat = coo_matrix((n_cells, n_cells))

    logging.info("Finished constructing cell x cell distance matrix. ")
    dist_mat.eliminate_zeros()
    self._dist_mat = dist_mat.tocsr()
def __init__(
    self,
    adata: AnnData,
    *,
    metric: Union[Literal["alignment", "identity", "levenshtein"],
                  DistanceCalculator] = "identity",
    cutoff: float = 10,
    receptor_arms: Literal["TRA", "TRB", "all", "any"] = "all",
    dual_tcr: Literal["primary_only", "all", "any"] = "primary_only",
    sequence: Literal["aa", "nt"] = "aa",
):
    """Class to compute Neighborhood graphs of CDR3 sequences.

    For documentation of the parameters, see :func:`tcr_neighbors`.

    Raises
    ------
    ValueError
        If `metric` is ``"identity"`` and `cutoff` is not 0, or if nucleotide
        sequences are combined with the alignment metric.
    """
    start = logging.info("Initializing TcrNeighbors object...")
    # NOTE: with the signature defaults (metric="identity", cutoff=10) this
    # check fires, so callers have to pass a matching cutoff explicitly.
    if metric == "identity" and cutoff != 0:
        raise ValueError("Identity metric only works with cutoff == 0")
    if metric != "identity" and cutoff == 0:
        # `warning` instead of the deprecated `warn` alias
        logging.warning(f"Running with {metric} metric, but cutoff == 0. ")
    if sequence == "nt" and metric == "alignment":
        raise ValueError(
            "Using nucleotide sequences with alignment metric is not supported. "
        )
    self.adata = adata
    self.metric = metric
    self.cutoff = cutoff
    self.receptor_arms = receptor_arms
    self.dual_tcr = dual_tcr
    self.sequence = sequence
    # the index dict is built from the attributes assigned above
    self._build_index_dict()
    self._dist_mat = None
    logging.info("Finished initalizing TcrNeighbors object. ", time=start)
def filter_low(self, value):
    """
    Filter low counts from ``self.data_df`` when `value` is exactly `True`.

    The filtered frame replaces ``self.data_df`` and is also stored in
    ``adata.uns['palantir_norm_data']``. Any other `value` is a no-op.
    """
    if value is not True:
        return

    filtered = self.palantir.preprocess.filter_counts_data(self.data_df)
    self.data_df = filtered
    # `adata` is not a parameter here; it is resolved from the enclosing scope
    adata.uns['palantir_norm_data'] = self.data_df
    logg.info(
        'data filtered for low counts:\n\t'
        'cell_min_molecules=1000\n\tgenes_min_cells=10',
        r=True,
    )
def test_timing(monkeypatch, capsys, logging_state):
    """Check the elapsed-time suffix using a clock that advances on every call."""
    s.logfile = sys.stderr
    ticks = 0

    class IncTime:
        """Stand-in for ``datetime`` whose ``now`` advances by 1s + 1us per call."""

        @staticmethod
        def now(tz):
            nonlocal ticks
            ticks += 1
            return datetime(2000, 1, 1, second=ticks, microsecond=ticks, tzinfo=tz)

    monkeypatch.setattr(l, 'datetime', IncTime)
    s.verbosity = Verbosity.debug

    l.hint('1')
    assert ticks == 1
    assert capsys.readouterr().err == '--> 1\n'

    start = l.info('2')
    assert ticks == 2
    assert capsys.readouterr().err == '2\n'

    l.hint('3')
    assert ticks == 3
    assert capsys.readouterr().err == '--> 3\n'

    # elapsed time is appended in parentheses when `time` is given
    l.info('4', time=start)
    assert ticks == 4
    assert capsys.readouterr().err == '4 (0:00:02)\n'

    # ... or substituted into the `{time_passed}` placeholder
    l.info('5 {time_passed}', time=start)
    assert ticks == 5
    assert capsys.readouterr().err == '5 0:00:03\n'
def compute_eigen(
    self,
    n_comps: int = 15,
    sym: Optional[bool] = None,
    sort: Literal['decrease', 'increase'] = 'decrease',
):
    """\
    Compute eigen decomposition of transition matrix.

    Parameters
    ----------
    n_comps
        Number of eigenvalues/vectors to be computed, set `n_comps = 0` if
        you need all eigenvectors.
    sym
        Instead of computing the eigendecomposition of the assymetric
        transition matrix, computed the eigendecomposition of the symmetric
        Ktilde matrix. (Not read by this method.)
    sort
        `'decrease'` requests the largest-magnitude eigenvalues and returns
        them in reversed order; `'increase'` requests the smallest-magnitude
        ones and keeps the solver's order.

    Returns
    -------
    Writes the following attributes.

    eigen_values : numpy.ndarray
        Eigenvalues of transition matrix.
    eigen_basis : numpy.ndarray
        Matrix of eigenvectors (stored in columns).

    Raises
    ------
    ValueError
        If the symmetric transition matrix has not been computed yet.
    """
    np.set_printoptions(precision=10)

    sym_transitions = self._transitions_sym
    if sym_transitions is None:
        raise ValueError('Run `.compute_transitions` first.')

    descending = sort == 'decrease'
    if n_comps != 0:
        k = min(sym_transitions.shape[0] - 1, n_comps)
        # it pays off to increase the stability with a bit more precision
        high_precision = sym_transitions.astype(np.float64)
        evals, evecs = scipy.sparse.linalg.eigsh(
            high_precision,
            k=k,
            which='LM' if descending else 'SM',
            ncv=None,
        )
        evals = evals.astype(np.float32)
        evecs = evecs.astype(np.float32)
    else:
        # full spectrum requested
        evals, evecs = scipy.linalg.eigh(sym_transitions)

    if descending:
        evals = evals[::-1]
        evecs = evecs[:, ::-1]

    logg.info(' eigenvalues of transition matrix\n'
              ' {}'.format(str(evals).replace('\n', '\n ')))
    if self._number_connected_components > len(evals) / 2:
        logg.warning('Transition matrix has many disconnected components!')

    self._eigen_values = evals
    self._eigen_basis = evecs
def define_clonotypes(
    adata: AnnData,
    *,
    key_added: str = "clone_id",
    distance_key: Union[str, None] = None,
    **kwargs,
) -> Optional[Tuple[pd.Series, pd.Series, dict]]:
    """
    Define :term:`clonotypes <Clonotype>` based on :term:`CDR3` nucleic acid
    sequence identity.

    As opposed to :func:`~scirpy.tl.define_clonotype_clusters` which employs
    a more flexible definition of :term:`clonotype clusters <Clonotype cluster>`,
    this function stringently defines clonotypes based on nucleic acid sequence
    identity. Technically, this function is an alias to
    :func:`~scirpy.tl.define_clonotype_clusters` with different default parameters.

    {clonotype_definition}

    Parameters
    ----------
    adata
        Annotated data matrix
    {common_doc}
    {within_group}
    key_added
        The column name under which the clonotype clusters and cluster sizes
        will be stored in `adata.obs` and under which the clonotype network will be
        stored in `adata.uns`
    distance_key
        Key of precomputed sequence distances. If given, it is forwarded to
        :func:`~scirpy.tl.define_clonotype_clusters`. If `None` and
        `ir_dist_nt_identity` is missing from `adata.uns`, the distances are
        computed first.
    inplace
        If `True`, adds the results to anndata, otherwise return them.
    {paralellism}

    {return_values}

    """
    if distance_key is None and "ir_dist_nt_identity" not in adata.uns:
        # For the case of "clonotypes" we want to compute the distance automatically
        # if it doesn't exist yet. Since it's just a sparse ID matrix, this
        # should be instant.
        logging.info(
            "ir_dist for sequence='nt' and metric='identity' not found. "
            "Computing with default parameters.")  # type: ignore
        ir_dist(adata, metric="identity", sequence="nt", key_added=distance_key)

    if distance_key is not None:
        # Previously a user-supplied key was silently dropped. Only forward it
        # when set so the default call is unchanged.
        kwargs["distance_key"] = distance_key

    return define_clonotype_clusters(
        adata,
        key_added=key_added,
        sequence="nt",
        metric="identity",
        partitions="connected",
        **kwargs,
    )
def add_img(
    self,
    img: Input_t,
    layer: Optional[str] = None,
    channel_dim: str = "channels",
    lazy: bool = True,
    chunks: Optional[int] = None,
    **kwargs: Any,
) -> None:
    """
    Add a new image to the container.

    Parameters
    ----------
    img
        In memory array or path to on-disk *TIFF*/*JPEG* image.
    %(img_layer)s
    channel_dim
        Name of the channel dimension.
    lazy
        Whether to use :mod:`rasterio` or :mod:`dask` to lazily load image.
    chunks
        Chunk size for :mod:`dask`, used in call to :func:`xarray.open_rasterio` for *TIFF* images.

    Returns
    -------
    Nothing, just adds a new ``layer`` to :attr:`data`.

    Raises
    ------
    ValueError
        If loading from a file/store with an unknown format.
    NotImplementedError
        If loading a specific data type has not been implemented.

    Notes
    -----
    Lazy loading via :mod:`dask` is not supported for on-disk *JPEG* files, they will be loaded in memory.
    Multi-page *TIFFs* will be loaded in one :class:`xarray.DataArray`, with concatenated channel dimensions.
    """
    if layer is None:
        layer = self._get_next_image_id("image")

    loaded = self._load_img(img, chunks=chunks, layer=layer, **kwargs)

    if loaded is not None:  # not reading a .nc file
        if TYPE_CHECKING:
            assert isinstance(loaded, xr.DataArray)
        # the last dimension is renamed to the requested channel dimension name
        last_dim = loaded.dims[-1]
        loaded = loaded.rename({last_dim: channel_dim})

        logg.info(
            f"{'Overwriting' if layer in self else 'Adding'} image layer `{layer}`"
        )
        self.data[layer] = loaded

    if not lazy:
        # load in memory
        self.data.load()
def _cell_dist_mat_reduce(self):
    """Compute the distance matrix by using custom reduction functions.

    More flexible than `_build_cell_dist_mat_min`, but requires more memory.
    Reduce dual is called before reduce arms.

    For every receptor arm in ``self.index_dict`` the stored entries of that
    arm's ``dist_mat`` are expanded into a nested dictionary of the form
    ``{(cell_row, cell_col): {arm: {(c1, c2): value}}}``. The dictionary is
    then handed to ``self._reduce_coord_dict`` and whatever that generator
    produces is yielded unchanged.
    """
    coord_dict = dict()

    def _add_to_dict(d, c1, c2, cell_row, cell_col, value):
        """Add a value to the nested coord dict"""
        # `arm` is not a parameter: it is read from the enclosing loop below
        # at the time of the call.
        try:
            tmp_dict = d[(cell_row, cell_col)]
            try:
                tmp_dict2 = tmp_dict[arm]
                # NOTE(review): nothing inside this innermost `try` appears to
                # raise KeyError, so its handler looks unreachable - confirm.
                try:
                    if (c1, c2) in tmp_dict2:
                        # can be in arbitrary order apprarently
                        assert (c2, c1) not in tmp_dict2
                        tmp_dict2[(c2, c1)] = value
                    tmp_dict2[(c1, c2)] = value
                except KeyError:
                    tmp_dict2 = {(c1, c2): value}
            except KeyError:
                # first value for this arm at an already known cell pair
                tmp_dict[arm] = {(c1, c2): value}
        except KeyError:
            # first value for this cell pair
            d[(cell_row, cell_col)] = {arm: {(c1, c2): value}}

    for arm, arm_info in self.index_dict.items():
        dist_mat, seq_to_cell, chain_inds = (
            arm_info["dist_mat"],
            arm_info["seq_to_cell"],
            arm_info["chain_inds"],
        )

        start = logging.info(
            f"Started comstructing {arm} coord-dictionary...")

        # iterate over the stored (row, col, value) triples of the arm's matrix
        for row, col, value in tqdm(zip(dist_mat.row, dist_mat.col,
                                        dist_mat.data),
                                    total=dist_mat.nnz):
            for c1, c2 in itertools.product(chain_inds, repeat=2):
                for cell_row, cell_col in itertools.product(
                        seq_to_cell[c1][row], seq_to_cell[c2][col]):
                    # fill upper diagonal. Important: these are dist-mat row,cols
                    # not cell-mat row cols. This is required, because the
                    # itertools.product returns all combinations for the diagonal
                    # but not for the other values.
                    _add_to_dict(coord_dict, c1, c2, cell_row, cell_col, value)
                    if row != col:
                        # mirrored entry for off-diagonal dist-mat positions
                        _add_to_dict(coord_dict, c1, c2, cell_col, cell_row,
                                     value)

        logging.info(f"Finished constructing {arm} coord-dictionary",
                     time=start)

    yield from self._reduce_coord_dict(coord_dict)
def test_formats(capsys, logging_state):
    """Each logging level writes its message with the expected prefix."""
    s.logfile = sys.stderr
    s.verbosity = Verbosity.debug

    # (logging function, message, expected stderr output)
    cases = (
        (l.error, '0', 'ERROR: 0\n'),
        (l.warning, '1', 'WARNING: 1\n'),
        (l.info, '2', '2\n'),
        (l.hint, '3', '--> 3\n'),
        (l.debug, '4', ' 4\n'),
    )
    for emit, message, expected in cases:
        emit(message)
        assert capsys.readouterr().err == expected
def add_points(self, vec: Union[np.ndarray, pd.Series], layer_name: str,
               key: Optional[str] = None) -> bool:
    """
    Show ``vec`` as a new :mod:`napari` points layer.

    Parameters
    ----------
    vec
        Values to plot. If :class:`pandas.Series`, it is expected to be categorical.
    layer_name
        Name of the layer to add.
    key
        Key from :attr:`anndata.AnnData.obs` from where the data was taken from.
        Only used when ``vec`` is :class:`pandas.Series`.

    Returns
    -------
    `True` if the layer has been added, otherwise `False` (a layer with this
    name is already present).
    """
    name_taken = layer_name in self.view.layernames
    if name_taken:
        self._handle_already_present(layer_name)
        return False

    logg.info(f"Creating point `{layer_name}` layer")
    point_props = self._get_points_properties(vec, key=key)
    points_layer: Points = self.view.viewer.add_points(
        self.model.coordinates,
        name=layer_name,
        size=self.model.spot_diameter,
        opacity=1,
        edge_width=1,
        blending=self.model.blending,
        face_colormap=self.model.cmap,
        edge_colormap=self.model.cmap,
        symbol=self.model.symbol.v,
        **point_props,
    )
    # https://github.com/napari/napari/issues/2019
    # TODO: set contrasting text colors once a solution is found; the selected
    # points where the cluster labels are positioned could serve as a black BG

    categorical = is_categorical_dtype(vec)
    self._hide_points_controls(points_layer, is_categorical=categorical)

    points_layer.editable = False
    points_layer.events.select.connect(self._move_layer_to_front)

    return True
def _self_loops(self_transitions, velo_graph): # set the diagonal elements. if self_transitions is not None: logg.info(f"Self transitions using {self_transitions!r}") if self_transitions == "scvelo": confidence = velo_graph.max(1).A.flatten() ub = np.percentile(confidence, 98) self_prob = np.clip(ub - confidence, 0, 1) velo_graph.setdiag(self_prob) if self_transitions == "velocyto": self_prob = velo_graph.max(1).A.flatten() velo_graph.setdiag(self_prob) return velo_graph
def clone_degree(self: Dandelion,
                 weight: Union[None, str] = None,
                 verbose: bool = True) -> Dandelion:
    """
    Calculates node degree in BCR network.

    Parameters
    ----------
    self : Dandelion
        `Dandelion` object after `tl.generate_network` has been run.
    weight : str, optional
        Atribute name for retrieving edge weight in graph. None defaults to
        ignoring this. See `networkx.Graph.degree`.
    verbose : bool
        Whether or not to show logging information.

    Returns
    -------
    Nothing is returned explicitly; ``self.metadata['clone_degree']`` is
    updated in place.

    Raises
    ------
    TypeError
        If `self` is not exactly a `Dandelion` instance.
    AttributeError
        If the graph has no nodes.
    """
    if verbose:
        start = logg.info('Calculating node degree')
    if self.__class__ == Dandelion:
        try:
            G = self.graph[0]
        except Exception:
            # Fall back to the stored distance matrices when no graph is
            # available. `Exception` instead of a bare `except`, so that
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            dist = np.sum([
                self.distance[x].toarray()
                for x in self.distance if type(self.distance[x]) is csr_matrix
            ], axis=0)
            A = csr_matrix(dist)
            G = nx.Graph()
            G.add_weighted_edges_from(
                zip(list(self.metadata.index), list(self.metadata.index),
                    A.data))

        if len(G) == 0:
            raise AttributeError(
                'Graph not found. Plase run tl.generate_network.')
        else:
            # column 0 holds the node, column 1 its degree
            cd = pd.DataFrame.from_dict(G.degree(weight=weight))
            cd.set_index(0, inplace=True)
            self.metadata['clone_degree'] = pd.Series(cd[1])
            if verbose:
                logg.info(' finished',
                          time=start,
                          deep=('Updated Dandelion metadata\n'))
    else:
        raise TypeError('Input object must be of {}'.format(Dandelion))
def clone_centrality(self: Dandelion, verbose: bool = True) -> Dandelion:
    """
    Calculates node closeness centrality in BCR network.

    Parameters
    ----------
    self : Dandelion
        `Dandelion` object after `tl.generate_network` has been run.
    verbose : bool
        Whether or not to show logging information.

    Returns
    -------
    Nothing is returned explicitly; ``self.metadata['clone_centrality']`` is
    updated in place.

    Raises
    ------
    TypeError
        If `self` is not exactly a `Dandelion` instance.
    AttributeError
        If the graph has no nodes.
    """
    if verbose:
        start = logg.info('Calculating node closeness centrality')
    if self.__class__ == Dandelion:
        try:
            G = self.graph[0]
        except Exception:
            # Fall back to the stored distance matrices when no graph is
            # available. `Exception` instead of a bare `except`, so that
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            dist = np.sum([
                self.distance[x].toarray()
                for x in self.distance if type(self.distance[x]) is csr_matrix
            ], axis=0)
            A = csr_matrix(dist)
            G = nx.Graph()
            G.add_weighted_edges_from(
                zip(list(self.metadata.index), list(self.metadata.index),
                    A.data))

        if len(G) == 0:
            raise AttributeError(
                'Graph not found. Plase run tl.generate_network.')
        else:
            cc = nx.closeness_centrality(G)
            cc = pd.DataFrame.from_dict(cc,
                                        orient='index',
                                        columns=['clone_centrality'])
            self.metadata['clone_centrality'] = pd.Series(
                cc['clone_centrality'])
            if verbose:
                logg.info(' finished',
                          time=start,
                          deep=('Updated Dandelion metadata\n'))
    else:
        raise TypeError('Input object must be of {}'.format(Dandelion))
def export(self, _: Viewer) -> None:
    """
    Export shapes into :class:`AnnData` object.

    Only selected ``Shapes`` layers are considered; layers without any data
    are skipped with a warning.
    """
    for layer in self.view.layers:
        is_selected_shape = isinstance(layer, Shapes) and layer.selected
        if not is_selected_shape:
            continue

        if len(layer.data) == 0:
            logg.warning(
                f"Shape layer `{layer.name}` has no visible shapes")
            continue

        key = f"{layer.name}_{self.model.key_added}"
        logg.info(
            f"Adding `adata.obs[{key!r}]`\n `adata.uns[{key!r}]['meshes']`"
        )
        self._save_shapes(layer, key=key)
        self._update_obs_items(key)
def _prepare(self, adata: AnnData):
    """Set up the DoubleLookupNeighborFinder and all lookup tables it requires."""
    start = logging.info("Initializing lookup tables. ")

    # steps are kept in their original order
    self._make_clonotype_table(adata)
    self._make_chain_count()

    finder = DoubleLookupNeighborFinder(self.clonotypes)
    self.neighbor_finder = finder

    self._add_distance_matrices(adata)
    self._add_lookup_tables()

    logging.hint("Done initializing lookup tables.", time=start)
def lsi(data: Union[AnnData, MuData], scale_embeddings=True, n_comps=50):
    """
    Run Latent Semantic Indexing

    PARAMETERS
    ----------
    data:
        AnnData object or MuData object with 'atac' modality
    scale_embeddings: bool (default: True)
        Scale embeddings to zero mean and unit variance
    n_comps: int (default: 50)
        Number of components to calculate with SVD. Capped at
        ``min(adata.X.shape) - 1``, the largest value accepted by ``svds``.

    RETURNS
    -------
    None. Writes ``adata.obsm["X_lsi"]`` (cell embeddings),
    ``adata.uns["lsi"]["stdev"]`` and ``adata.varm["LSI"]`` (loadings).

    RAISES
    ------
    TypeError
        If `data` is neither an AnnData nor a MuData object with 'atac' modality.
    """
    if isinstance(data, AnnData):
        adata = data
    elif isinstance(data, MuData) and "atac" in data.mod:
        adata = data.mod["atac"]
    else:
        raise TypeError(
            "Expected AnnData or MuData object with 'atac' modality")

    # `svds` requires k to be strictly smaller than both matrix dimensions, so
    # cap at min(shape) - 1 rather than at the number of features.
    n_comps = min(n_comps, min(adata.X.shape) - 1)

    logging.info("Performing SVD")
    cell_embeddings, svalues, peaks_loadings = svds(adata.X, k=n_comps)

    # Re-order components in the descending order
    cell_embeddings = cell_embeddings[:, ::-1]
    svalues = svalues[::-1]
    peaks_loadings = peaks_loadings[::-1, :]

    if scale_embeddings:
        cell_embeddings = (cell_embeddings - cell_embeddings.mean(axis=0)
                           ) / cell_embeddings.std(axis=0)

    stdev = svalues / np.sqrt(adata.X.shape[0] - 1)

    adata.obsm["X_lsi"] = cell_embeddings
    adata.uns["lsi"] = {"stdev": stdev}
    adata.varm["LSI"] = peaks_loadings.T

    return None
def process(self):
    """
    Run the `palantir` pipeline on the input Data Frame.

    Runs PCA, diffusion maps, tSNE and MAGIC imputation in turn. Each result
    is kept on the instance and stored in ``adata.uns``.
    """
    utils = self.palantir.utils

    # Principal component analysis
    logg.info('PCA in progress ...')
    self.pca_projections, self.var_r = utils.run_pca(self.data_df)
    adata.uns['palantir_pca_results'] = {}
    adata.uns['palantir_pca_results']['pca_projections'] = self.pca_projections
    adata.uns['palantir_pca_results']['variance_ratio'] = self.var_r

    # Diffusion maps
    logg.info('Diffusion maps in progress ...')
    self.dm_res = utils.run_diffusion_maps(self.pca_projections)
    self.ms_data = utils.determine_multiscale_space(self.dm_res)
    adata.uns['palantir_diff_maps'] = self.dm_res
    adata.uns['palantir_ms_data'] = self.ms_data

    # tSNE visualization
    logg.info('tSNE in progress ...')
    self.tsne = utils.run_tsne(self.ms_data)
    adata.uns['palantir_tsne'] = self.tsne

    # MAGIC imputation
    logg.info('imputation in progress ...')
    self.imp_df = utils.run_magic_imputation(self.data_df, self.dm_res)
    adata.uns['palantir_imp_df'] = self.imp_df

    logg.info('End of processing, start plotting.')
def scale_array(
    X,
    *,
    zero_center: bool = True,
    max_value: Optional[float] = None,
    copy: bool = False,
    return_mean_std: bool = False,
):
    """
    Scale the columns of `X` by their standard deviation.

    Parameters
    ----------
    X
        Dense or sparse matrix. Integer input is cast to float first.
    zero_center
        Subtract the column means before scaling (dense input only).
    max_value
        If given, values above it are set to `max_value` after scaling.
    copy
        Work on a copy of `X` instead of modifying it.
    return_mean_std
        Return ``(X, mean, std)`` instead of only `X`.

    Raises
    ------
    ValueError
        If `zero_center` is requested for a sparse matrix.
    """
    if copy:
        X = X.copy()

    clipping = max_value is not None
    if clipping and not zero_center:
        logg.info(  # Be careful of what? This should be more specific
            "... be careful when using `max_value` "
            "without `zero_center`."
        )

    if np.issubdtype(X.dtype, np.integer):
        logg.info(
            '... as scaling leads to float results, integer '
            'input is cast to float, returning copy.'
        )
        X = X.astype(float)

    mean, var = _get_mean_var(X)
    std = np.sqrt(var)
    # columns with zero deviation are left unscaled
    std[std == 0] = 1

    if not issparse(X):
        if zero_center:
            X -= mean
        X /= std
    elif zero_center:
        raise ValueError("Cannot zero-center sparse matrix.")
    else:
        sparsefuncs.inplace_column_scale(X, 1 / std)

    # do the clipping
    if clipping:
        logg.debug(f"... clipping at max_value {max_value}")
        X[X > max_value] = max_value

    return (X, mean, std) if return_mean_std else X
def _reduce_coord_dict(self, coord_dict):
    """Apply the reduction functions to the coord dict.

    Yields ``((cell_row, cell_col), value)`` pairs. The dual reduction is
    selected by ``self.dual_ir``, the arm reduction by ``self.receptor_arms``.
    """
    start = logging.info("Constructing cell x cell distance matrix...")

    if self.dual_ir == "all":
        reduce_dual = self._reduce_dual_all
    else:
        reduce_dual = self._reduce_dual_any

    if self.receptor_arms == "all":
        reduce_arms = self._reduce_arms_all
    else:
        reduce_arms = self._reduce_arms_any

    progress = tqdm(coord_dict.items(), total=len(coord_dict))
    for (cell_row, cell_col), per_chain in progress:
        # lazily reduce each chain entry; consumed by `reduce_arms`
        reduced_dual = (
            reduce_dual(value_dict, chain, cell_row, cell_col)
            for chain, value_dict in per_chain.items()
        )
        reduced = reduce_arms(reduced_dual, cell_row, cell_col)
        yield (cell_row, cell_col), reduced

    logging.info("Finished constructing cell x cell distance matrix. ",
                 time=start)
def compute_partition(self) -> None:
    """
    Compute communication classes for the Markov chain.

    Returns
    -------
    None
        Nothing, but updates the following fields:

            - :paramref:`recurrent_classes`
            - :paramref:`transient_classes`
            - :paramref:`irreducible`
    """
    start = logg.info("Computing communication classes")

    rec_classes, trans_classes = partition(self._T)
    # irreducible: exactly one recurrent class and no transient classes
    self._is_irreducible = len(rec_classes) == 1 and len(trans_classes) == 0

    if self._is_irreducible:
        logg.warning(
            "The transition matrix is irreducible - cannot further partition it\n Finish",
            time=start,
        )
        return

    obs_names = self._adata.obs_names
    self._trans_classes = _make_cat(trans_classes, self._n_states, obs_names)
    self._rec_classes = _make_cat(rec_classes, self._n_states, obs_names)
    self._adata.obs[f"{self._rc_key}_rec_classes"] = self._rec_classes
    self._adata.obs[f"{self._rc_key}_trans_classes"] = self._trans_classes
    logg.info(
        f"Found `{(len(rec_classes))}` recurrent and `{len(trans_classes)}` transient classes\n"
        f"Adding `.recurrent_classes`\n"
        f" `.transient_classes`\n"
        f" `.irreducible`\n"
        f" Finish",
        time=start,
    )
def __init__(
    self,
    adata: AnnData,
    *,
    metric: Union[Literal["alignment", "identity", "levenshtein", "hamming"],
                  DistanceCalculator, ] = "identity",
    cutoff: Union[int, None] = None,
    receptor_arms: Literal["VJ", "VDJ", "all", "any"] = "all",
    dual_ir: Literal["primary_only", "all", "any"] = "primary_only",
    sequence: Literal["aa", "nt"] = "aa",
):
    """Class to compute Neighborhood graphs of CDR3 sequences.

    For documentation of the parameters, see :func:`ir_neighbors`.

    Raises
    ------
    ValueError
        For an inconsistent metric/cutoff/sequence combination or an invalid
        value of `receptor_arms`, `dual_ir` or `sequence`.
    """
    start = logging.info("Initializing IrNeighbors object...")

    is_identity = metric == "identity"
    # NOTE: the default cutoff (`None`) also differs from 0, so with the
    # default identity metric the cutoff has to be passed explicitly.
    if is_identity and cutoff != 0:
        raise ValueError("Identity metric only works with cutoff == 0")
    if not is_identity and cutoff == 0:
        logging.warning(f"Running with {metric} metric, but cutoff == 0. ")

    if sequence == "nt" and metric == "alignment":
        raise ValueError(
            "Using nucleotide sequences with alignment metric is not supported. "
        )

    if receptor_arms not in ("VJ", "VDJ", "all", "any"):
        raise ValueError(
            "Invalid value for `receptor_arms`. Note that starting with v0.5 "
            "`TRA` and `TRB` are not longer valid values.")

    if dual_ir not in ("primary_only", "all", "any"):
        raise ValueError("Invalid value for `dual_ir")

    if sequence not in ("aa", "nt"):
        raise ValueError("Invalid value for `sequence`")

    self.adata = adata
    self.metric = metric
    self.cutoff = cutoff
    self.receptor_arms = receptor_arms
    self.dual_ir = dual_ir
    self.sequence = sequence

    # the index dict is built from the attributes assigned above
    self._build_index_dict()
    self._dist_mat = None

    logging.info("Finished initalizing IrNeighbors object. ", time=start)
def __init__(self,
             adata,
             func=None,
             normalize=False,
             log_transform=False,
             filter_low=False):
    """
    Parameters
    ----------
    adata : AnnData, or Dataframe of cells X genes
    func : function wrapper to import palantir (not to be used)
    normalize : `bool` (default: `False`)
        property setter passed to palantir to normalize using palantir method
        `palantir.preprocess.normalize_counts`.
    log_transform : `bool` (default: `False`)
        property setter passed to palantir. Some datasets show better signal in
        the log scale. Applied using `palantir.preprocess.log_transform`
    filter_low : `bool` (default: `False`)
        property setter passed to palantir to remove low molecule count cells
        and low detection genes
    """
    # instantiate variables
    self.func = func
    self.adata = adata
    self._normalize = normalize
    self._log_transform = log_transform
    self._filter_low = filter_low

    try:
        # for AnnData
        self.data_df = self.adata.to_df()
    except AttributeError:
        # assume the data is a cell X genes Dataframe
        logg.info('Assuming the data is a cell X genes Dataframe', r=True)
        # Use the input itself as the data frame; previously `data_df` was
        # left unset on this path.
        self.data_df = self.adata

    # load palantir
    self.__call__()
    logg.info('palantir loaded ...', r=True)
def _root_final(
    adata: AnnData,
    final: bool = True,
    cluster_key: Optional[str] = None,
    weight_connectivities: float = None,
    percentile: int = 98,
    n_matches_min: Optional[int] = 1,
    n_start_end: Optional[int] = None,
    show_plots: bool = False,
    copy: bool = False,
) -> Optional[AnnData]:
    """
    Build a ``MarkovChain`` on a transition matrix and compute its approximate
    recurrent classes.

    `final` selects the forward (`True`) or backward (`False`) direction.
    With `copy`, the computation runs on a copy of `adata`, which is returned;
    otherwise `None` is returned.
    """
    if final:
        key = RcKey.FORWARD
    else:
        key = RcKey.BACKWARD
    logg.info(f"Computing `{key}`")

    if copy:
        adata = adata.copy()

    # compute kernel object
    kernel = transition_matrix(
        adata,
        backward=not final,
        weight_connectivities=weight_connectivities,
    )

    # create MarkovChain object and run the computation
    mc = MarkovChain(kernel)
    mc.compute_eig()
    mc.compute_approx_rcs(
        percentile=percentile,
        n_matches_min=n_matches_min,
        use=n_start_end,
        n_clusters_kmeans=n_start_end,
        cluster_key=cluster_key,
    )

    if show_plots:
        mc.plot_real_spectrum()
        mc.plot_eig_embedding(abs_value=True, perc=[0, 98], use=n_start_end)
        mc.plot_eig_embedding(left=False, use=n_start_end)

    if copy:
        return adata
    return None
def compute_transition_matrix(
    self, density_normalize: bool = True, **kwargs
) -> "ConnectivityKernel":
    """
    Compute transition matrix based on transcriptomic similarity.

    Uses symmetric, weighted KNN graph to compute symmetric transition matrix. The connectivities are computed using
    :func:`scanpy.pp.neighbors`. Depending on the parameters used there, they can be UMAP connectivities or
    gaussian-kernel-based connectivities with adaptive kernel width.

    Params
    ------
    density_normalize
        Whether or not to use the underlying KNN graph for density normalization.

    Returns
    -------
    Self. Makes :paramref:`transition_matrix` available. If the parameters
    match those of the previous call, the cached matrix is kept.
    """
    start = logg.info("Computing transition matrix based on connectivities")

    params = {"dnorm": density_normalize}
    if params == self._params:
        # same parameters as last time: reuse the cached transition matrix
        assert self.transition_matrix is not None, _ERROR_EMPTY_CACHE_MSG
        logg.debug(_LOG_USING_CACHE)
        logg.info(" Finish", time=start)
        return self

    self._params = params

    connectivities = self._conn.copy()
    if density_normalize:
        connectivities = self.density_normalize(connectivities)
    logg.info(" Finish", time=start)

    self.transition_matrix = csr_matrix(connectivities)
    return self
def cell_similarity(adata: AnnData,
                    key_added: Optional[str] = 'cell_similarity',
                    sim_type: Optional[str] = 'hub-promoted',
                    use_weights: Optional[bool] = True,
                    copy: bool = False,
                    **neighbors_kwds) -> Optional[AnnData]:
    """\
    Calculate cell similarity score based on the kNN graph. Higher scores
    are associated to cells mostly close to similar cells.

    Parameters
    ----------
    adata
        Annotated data matrix.
    key_added
        The name of the entry in adata.obs with calculated values.
    sim_type:
        Similarity function. Can be one in 'dice', 'salton', 'hub-promoted',
        'hub-suppressed', 'jaccard', 'inv-log-weight', 'resource-allocation',
        'leight-holme-newman'. For more information check here
        https://graph-tool.skewed.de/static/doc/topology.html?highlight=distance#graph_tool.topology.vertex_similarity
    use_weights
        Forwarded to `get_graph_tool_from_adata` when building the graph.
    copy
        Return a copy instead of writing to adata.
    neighbors_kwds
        Additional keyword arguments forwarded to `get_graph_tool_from_adata`.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with similarity values
    in `adata.obs[key_added]`.
    """
    from .._utils import get_graph_tool_from_adata

    logg.info("Adding cell similarity scores")
    # Honour `copy`: previously the input was modified even with `copy=True`.
    adata = adata.copy() if copy else adata

    g = get_graph_tool_from_adata(adata,
                                  use_weights=use_weights,
                                  **neighbors_kwds)
    n_cells = g.num_vertices()
    S = gt.vertex_similarity(g, sim_type=sim_type).get_2d_array(range(n_cells))
    D = np.dot(S, S)
    D = np.diag(D / np.max(D))  # take the scaled diagonal
    adata.obs[f'{key_added}'] = D
    return adata if copy else None
def compute_transitions(self, density_normalize: bool = True):
    """\
    Compute transition matrix.

    Parameters
    ----------
    density_normalize
        The density rescaling of Coifman and Lafon (2006): Then only the
        geometry of the data matters, not the sampled density.

    Returns
    -------
    Nothing. Sets `.Z` and `._transitions_sym`.
    """
    start = logg.info('computing transitions')

    def _inverse_diagonal(values, reference):
        # diagonal matrix of 1 / values, dense or sparse like `reference`
        if issparse(reference):
            n = reference.shape[0]
            return scipy.sparse.spdiags(1.0 / values, 0, n, n)
        return np.diag(1.0 / values)

    W = self._connectivities
    # density normalization as of Coifman et al. (2005)
    # ensures that kernel matrix is independent of sampling density
    if density_normalize:
        # q[i] is an estimate for the sampling density at point i
        # it's also the degree of the underlying graph
        q = np.asarray(W.sum(axis=0))
        Q = _inverse_diagonal(q, W)
        K = Q @ W @ Q
    else:
        K = W

    # z[i] is the square root of the sum of K along axis 0
    z = np.sqrt(np.asarray(K.sum(axis=0)))
    self.Z = _inverse_diagonal(z, K)
    self._transitions_sym = self.Z @ K @ self.Z
    logg.info(' finished', time=start)
def scale_sparse(
    X,
    *,
    zero_center: bool = True,
    max_value: Optional[float] = None,
    copy: bool = False,
    return_mean_std: bool = False,
):
    """
    Scale a sparse matrix by delegating to ``scale_array``.

    With `zero_center`, the input is densified first (which creates a new
    array, so `copy` is reset to `False`). All other arguments are passed on
    unchanged.
    """
    # need to add the following here to make inplace logic work
    if zero_center:
        logg.info(
            "... as `zero_center=True`, sparse input is "
            "densified and may lead to large memory consumption"
        )
        # Since the data has been copied, no further copy is needed
        X, copy = X.toarray(), False

    return scale_array(
        X,
        zero_center=zero_center,
        copy=copy,
        max_value=max_value,
        return_mean_std=return_mean_std,
    )