def get_z_latent(self, adata, encoder_labels):
    """Project ``adata`` into the latent space of the network.

    Densifies ``adata``, feeds it with the condition labels through the
    encoder, and wraps the resulting latent coordinates in a fresh AnnData
    that carries a deep copy of the original ``obs``.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix to be mapped to latent space. ``adata.X``
        has to be in shape [n_obs, x_dimension].
    encoder_labels: :class:`~numpy.ndarray`
        Labels fed as the encoder's condition array.

    Returns
    -------
    adata_latent: :class:`~anndata.AnnData`
        Annotated data containing the latent space encoding of ``adata``.
    """
    dense = remove_sparsity(adata)
    z = self.encoder_model.predict([dense.X, encoder_labels])[2]
    # guard against NaN/inf values coming out of the network
    z = np.nan_to_num(z, nan=0.0, posinf=0.0, neginf=0.0)
    adata_latent = anndata.AnnData(X=z)
    adata_latent.obs = dense.obs.copy(deep=True)
    return adata_latent
def ari(adata, label_key):
    """Computes Adjusted Rand Index (ARI) for ``adata`` given a label column.

    KMeans (with as many clusters as there are distinct labels) is fit on
    ``adata.X`` and its assignment is compared against the encoded labels.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated dataset.
    label_key: str
        Name of the ``adata.obs`` column holding the study/batch labels.

    Returns
    -------
    score: float
        ARI score. A float between 0 and 1.
    """
    adata = remove_sparsity(adata)
    n_clusters = adata.obs[label_key].nunique()
    predicted = KMeans(n_clusters, n_init=200).fit_predict(adata.X)
    encoded_truth = LabelEncoder().fit_transform(adata.obs[label_key].values)
    return adjusted_rand_score(encoded_truth, predicted)
def knn_purity(adata, label_key, n_neighbors=30):
    """Computes the KNN purity metric for ``adata`` given a label column.

    For every cell, purity is the fraction of its ``n_neighbors`` nearest
    neighbours sharing the cell's label; per-cell purities are averaged
    within each label class and then across classes.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated dataset.
    label_key: str
        Name of the ``adata.obs`` column holding the study/batch labels.
    n_neighbors: int
        Number of nearest neighbors.

    Returns
    -------
    score: float
        KNN purity score. A float between 0 and 1.
    """
    adata = remove_sparsity(adata)
    labels = LabelEncoder().fit_transform(adata.obs[label_key].to_numpy())
    knn = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(adata.X)
    # column 0 is each cell itself, so it is dropped
    neighbor_idx = knn.kneighbors(adata.X, return_distance=False)[:, 1:]
    neighbor_labels = np.vectorize(lambda i: labels[i])(neighbor_idx)
    # per-cell purity: share of neighbours with the same label
    per_cell = ((neighbor_labels - labels.reshape(-1, 1)) == 0).mean(axis=1)
    # average within each label class, then across classes
    per_class = [np.mean(per_cell[labels == cls]) for cls in np.unique(labels)]
    return np.mean(per_class)
def predict(self, adata, encoder_labels, decoder_labels):
    """Feeds ``adata`` to scNet and produces the reconstructed data.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix whether in primary space.
    encoder_labels: :class:`~numpy.ndarray`
        :class:`~numpy.ndarray` of labels to be fed as scNet's encoder condition array.
    decoder_labels: :class:`~numpy.ndarray`
        :class:`~numpy.ndarray` of labels to be fed as scNet's decoder condition array.

    Returns
    -------
    adata_pred: `~anndata.AnnData`
        Annotated data of predicted cells in primary space.
    """
    adata = remove_sparsity(adata)
    encoder_labels = to_categorical(encoder_labels, num_classes=self.n_conditions)
    decoder_labels = to_categorical(decoder_labels, num_classes=self.n_conditions)
    x_hat = self.cvae_model.predict([adata.X, encoder_labels, decoder_labels])[0]
    adata_pred = anndata.AnnData(X=x_hat)
    # deep-copy obs so mutating the prediction cannot alter the caller's adata
    # (matches the copy(deep=True) convention used by the latent-space mappers)
    adata_pred.obs = adata.obs.copy(deep=True)
    adata_pred.var_names = adata.var_names
    return adata_pred
def evaluate(self, adata, batch_key):
    """Classify the cells in ``adata`` and score the predictions.

    Feeds ``adata`` through the CVAE, decodes the argmax of the third
    output via the inverse cell-type encoder, prints a classification
    report, and returns accuracy plus the confusion matrix against the
    true values in ``adata.obs[batch_key]``.
    """
    adata = remove_sparsity(adata)
    enc_codes, _ = label_encoder(adata, self.condition_encoder, batch_key)
    dec_codes, _ = label_encoder(adata, self.condition_encoder, batch_key)
    enc_onehot = to_categorical(enc_codes, num_classes=self.n_conditions)
    dec_onehot = to_categorical(dec_codes, num_classes=self.n_conditions)
    encoded_preds = self.cvae_model.predict([adata.X, enc_onehot, dec_onehot])[2].argmax(axis=1)
    self._reverse_cell_type_encoder()
    labels = np.array([self.inv_cell_type_encoder[code] for code in encoded_preds])
    true_labels = adata.obs[batch_key].values
    accuracy = np.mean(labels == true_labels)
    print(classification_report(true_labels, labels))
    return accuracy, confusion_matrix(true_labels, labels)
def silhouette(adata, group_key, metric='euclidean', scale=True):
    """Wrapper for sklearn's silhouette score.

    Raw values range over [-1, 1]: 1 is an ideal fit, 0 indicates
    overlapping clusters, and -1 indicates misclassified cells.
    With ``scale=True`` the score is rescaled to [0, 1].
    """
    adata = remove_sparsity(adata)
    encoded = LabelEncoder().fit_transform(adata.obs[group_key].values)
    score = silhouette_score(adata.X, encoded, metric=metric)
    return (score + 1) / 2 if scale else score
def entropy_batch_mixing(adata, label_key='batch', n_neighbors=50, n_pools=50, n_samples_per_pool=100):
    """Computes the Entropy of Batch Mixing (EBM) metric for ``adata``.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated dataset.
    label_key: str
        Name of the ``adata.obs`` column holding the study/batch labels.
    n_neighbors: int
        Number of nearest neighbors.
    n_pools: int
        Number of EBM computations which will be averaged.
    n_samples_per_pool: int
        Number of samples to be used in each pool of execution.

    Returns
    -------
    score: float
        EBM score. A float between zero and one.
    """
    adata = remove_sparsity(adata)
    knn = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(adata.X)
    # column 0 is each cell itself, so it is dropped
    neighbor_idx = knn.kneighbors(adata.X, return_distance=False)[:, 1:]
    batch_values = adata.obs[label_key].values
    batch_per_neighbor = np.vectorize(lambda i: batch_values[i])(neighbor_idx)
    entropies = np.apply_along_axis(__entropy_from_indices, axis=1, arr=batch_per_neighbor)
    # average n_pools entropy results, each over n_samples_per_pool random samples
    if n_pools == 1:
        return np.mean(entropies)
    pool_means = [
        np.mean(entropies[np.random.choice(len(entropies), size=n_samples_per_pool)])
        for _ in range(n_pools)
    ]
    return np.mean(pool_means)
def silhouette_batch(adata, batch_key, group_key, metric='euclidean', verbose=True, scale=True):
    """Silhouette score of batch labels subsetted for each group.

    params:
        batch_key: batches to be compared against
        group_key: group labels to be subsetted by e.g. cell type
        metric: see sklearn silhouette score
    returns:
        all scores: absolute silhouette scores per group label
        group means: mean silhouette score per group
    """
    adata = remove_sparsity(adata)
    glob_batches = adata.obs[batch_key].values
    batch_enc = LabelEncoder()
    batch_enc.fit(glob_batches)
    # collect one frame per group and concatenate once at the end;
    # DataFrame.append was removed in pandas 2.0
    per_group_frames = []
    for group in adata.obs[group_key].unique():
        adata_group = adata[adata.obs[group_key] == group]
        if adata_group.obs[batch_key].nunique() == 1:
            # silhouette is undefined for a single batch; skip this group
            continue
        batches = batch_enc.transform(adata_group.obs[batch_key])
        sil_per_group = silhouette_samples(adata_group.X, batches, metric=metric)
        # take only absolute value
        sil_per_group = [abs(i) for i in sil_per_group]
        if scale:
            # scale s.t. highest number is optimal
            sil_per_group = [1 - i for i in sil_per_group]
        per_group_frames.append(pd.DataFrame({
            'group': [group] * len(sil_per_group),
            'silhouette_score': sil_per_group
        }))
    if per_group_frames:
        sil_all = pd.concat(per_group_frames, ignore_index=True)
    else:
        # no group had more than one batch; return an empty result
        sil_all = pd.DataFrame(columns=['group', 'silhouette_score'])
    sil_all = sil_all.reset_index(drop=True)
    sil_means = sil_all.groupby('group').mean()
    if verbose:
        print(f'mean silhouette per cell: {sil_means}')
    return sil_all, sil_means
def __init__(self,
             filename: str,
             adata: anndata.AnnData,
             batch_key: str,
             cell_type_key: str,
             encoder_model: Model,
             n_per_epoch: int = 5,
             n_batch_labels: int = 0,
             n_celltype_labels: int = 0,
             clustering_scores: list_or_str = 'all'):
    """Callback that computes clustering scores during training.

    Encodes batch and cell-type labels (and their one-hot forms) from
    ``adata`` up front, keeps a densified copy of the data for scoring,
    and prepares KMeans estimators plus a name->scorer dispatch table.
    """
    super().__init__()
    # plain configuration attributes
    self.filename = filename
    self.encoder_model = encoder_model
    self.n_per_epoch = n_per_epoch
    self.n_batch_labels = n_batch_labels
    self.n_celltype_labels = n_celltype_labels
    self.clustering_scores = clustering_scores
    # densified copy used for scoring; labels are encoded from the raw input
    self.adata = remove_sparsity(adata)
    batch_codes, _ = label_encoder(adata, le=None, condition_key=batch_key)
    self.batch_labels = np.reshape(batch_codes, (-1, ))
    self.batch_labels_onehot = to_categorical(self.batch_labels, num_classes=n_batch_labels)
    celltype_codes, _ = label_encoder(adata, le=None, condition_key=cell_type_key)
    self.celltype_labels = np.reshape(celltype_codes, (-1, ))
    self.celltype_labels_onehot = to_categorical(self.celltype_labels, num_classes=n_celltype_labels)
    # dispatch table mapping score names to their computation methods
    self.score_computers = {
        "asw": self.asw,
        "ari": self.ari,
        "nmi": self.nmi,
        "ebm": self.entropy_of_batch_mixing,
        "knn": self.knn_purity,
    }
    self.kmeans_batch = KMeans(self.n_batch_labels, n_init=200)
    self.kmeans_celltype = KMeans(self.n_celltype_labels, n_init=200)
def annotate(self, adata, batch_key, cell_type_key):
    """Predict a label for every cell and store it in ``adata.obs``.

    The decoded predictions are written to ``adata.obs[f'pred_{cell_type_key}']``.
    NOTE(review): the column is written on the object returned by
    ``remove_sparsity``; confirm the caller's adata receives it when the
    input matrix is sparse.
    """
    adata = remove_sparsity(adata)
    enc_codes, _ = label_encoder(adata, self.condition_encoder, batch_key)
    dec_codes, _ = label_encoder(adata, self.condition_encoder, batch_key)
    enc_onehot = to_categorical(enc_codes, num_classes=self.n_conditions)
    dec_onehot = to_categorical(dec_codes, num_classes=self.n_conditions)
    encoded_preds = self.cvae_model.predict([adata.X, enc_onehot, dec_onehot])[2].argmax(axis=1)
    self._reverse_cell_type_encoder()
    decoded = [self.inv_cell_type_encoder[code] for code in encoded_preds]
    adata.obs[f'pred_{cell_type_key}'] = np.array(decoded)
def nmi_helper(adata, group1, group2, method="arithmetic"):
    """Normalized mutual information (NMI) between two cluster assignments.

    params:
        adata: Anndata object
        group1: column name of `adata.obs` or an explicit group assignment
            (list or `pd.Series`)
        group2: column name of `adata.obs`; its values are label-encoded
        method: averaging method passed to scikit-learn, one of
            'max', 'min', 'geometric' or 'arithmetic'

    return:
        normalized mutual information (NMI)

    raises:
        ValueError if the two assignments differ in length or ``method``
        is not one of the supported averaging methods.
    """
    adata = remove_sparsity(adata)

    if isinstance(group1, str):
        group1 = adata.obs[group1].tolist()
    elif isinstance(group1, pd.Series):
        group1 = group1.tolist()

    group2 = LabelEncoder().fit_transform(adata.obs[group2].values)

    if len(group1) != len(group2):
        raise ValueError(
            f'different lengths in group1 ({len(group1)}) and group2 ({len(group2)})'
        )

    if method not in ('max', 'min', 'geometric', 'arithmetic'):
        raise ValueError(f"Method {method} not valid")
    return normalized_mutual_info_score(group1, group2, average_method=method)
def to_mmd_layer(self, adata, batch_key):
    """Map ``adata`` into the MMD space of scArches.

    Builds encoder/decoder condition arrays from ``batch_key``, runs the
    CVAE, and returns the MMD-layer coordinates for each sample wrapped in
    a new AnnData with a deep copy of the original ``obs``.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix to be mapped to MMD latent space. ``adata.X``
        has to be in shape [n_obs, x_dimension].
    batch_key: str
        Name of the ``adata.obs`` column used to derive the condition arrays.

    Returns
    -------
    adata_mmd: :class:`~anndata.AnnData`
        Annotated data containing the MMD latent space encoding of ``adata``.
    """
    dense = remove_sparsity(adata)
    enc_codes, _ = label_encoder(dense, self.condition_encoder, batch_key)
    dec_codes, _ = label_encoder(dense, self.condition_encoder, batch_key)
    enc_onehot = to_categorical(enc_codes, num_classes=self.n_conditions)
    dec_onehot = to_categorical(dec_codes, num_classes=self.n_conditions)
    mmd_coords = self.cvae_model.predict([dense.X, enc_onehot, dec_onehot])[1]
    # guard against NaN/inf values coming out of the network
    mmd_coords = np.nan_to_num(mmd_coords, nan=0.0, posinf=0.0, neginf=0.0)
    adata_mmd = anndata.AnnData(X=mmd_coords)
    adata_mmd.obs = dense.obs.copy(deep=True)
    return adata_mmd
def asw(adata, label_key):
    """Computes the Average Silhouette Width (ASW) for ``adata``.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated dataset.
    label_key: str
        Name of the ``adata.obs`` column holding the study/batch labels.

    Returns
    -------
    score: float
        ASW score. A float between -1 and 1.
    """
    adata = remove_sparsity(adata)
    encoded = LabelEncoder().fit_transform(adata.obs[label_key].values)
    return silhouette_score(adata.X, encoded)
def predict(self, adata, encoder_labels, decoder_labels):
    """Feeds ``adata`` to scNet and produces the reconstructed data.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix whether in primary space.
    encoder_labels: :class:`~numpy.ndarray`
        :class:`~numpy.ndarray` of labels to be fed as class' encoder condition array.
    decoder_labels: :class:`~numpy.ndarray`
        :class:`~numpy.ndarray` of labels to be fed as class' decoder condition array.

    Returns
    -------
    adata_pred: `~anndata.AnnData`
        Annotated data of predicted cells in primary space.
    """
    adata = remove_sparsity(adata)
    encoder_labels = to_categorical(encoder_labels, num_classes=self.n_conditions)
    decoder_labels = to_categorical(decoder_labels, num_classes=self.n_conditions)
    if self.loss_fn in ['nb', 'zinb']:
        # NOTE(review): size factors are taken from self.adata, not the passed
        # adata — verify the two are row-aligned when predicting external data
        inputs = [
            adata.X, encoder_labels, decoder_labels,
            self.adata.obs[self.size_factor_key]
        ]
    else:
        inputs = [adata.X, encoder_labels, decoder_labels]
    x_hat = self.cvae_model.predict(inputs)
    adata_pred = anndata.AnnData(X=x_hat)
    # deep-copy obs so mutating the prediction cannot alter the caller's adata
    # (matches the copy(deep=True) convention used by the latent-space mappers)
    adata_pred.obs = adata.obs.copy(deep=True)
    adata_pred.var_names = adata.var_names
    return adata_pred
def opt_louvain(adata, label_key, cluster_key, function=None, resolutions=None,
                inplace=True, plot=False, verbose=True, **kwargs):
    """Optimise louvain clustering resolution against a scoring function.

    params:
        label_key: name of column in adata.obs containing biological labels to be
            optimised against
        cluster_key: name of column to be added to adata.obs during clustering.
            Will be overwritten if exists and `force=True`
        function: function that computes the cost to be optimised over. Must take as
            arguments (adata, group1, group2, **kwargs) and returns a number for maximising
        resolutions: list of resolutions to be optimised over. If `resolutions=None`,
            default resolutions of 20 values ranging between 0.1 and 2 will be used
        plot: if `plot=True` plot the score profile over resolution
    returns:
        res_max: resolution of maximum score
        score_max: maximum score
        score_all: `pd.DataFrame` containing all scores at resolutions. Can be used to plot the score profile.
        clustering: only if `inplace=False`, return cluster assignment as `pd.Series`
    """
    adata = remove_sparsity(adata)

    if resolutions is None:
        n = 20
        resolutions = [2 * x / n for x in range(1, n + 1)]

    # start below any achievable score so the first resolution always registers
    # a clustering; the previous init of 0 could leave `clustering` as None
    # (and write None into adata.obs) when every score was <= 0
    score_max = -np.inf
    res_max = resolutions[0]
    clustering = None
    score_all = []

    # maren's edit - recompute neighbors if not existing
    try:
        adata.uns['neighbors']
    except KeyError:
        if verbose:
            print('computing neigbours for opt_cluster')
        sc.pp.neighbors(adata)

    for res in resolutions:
        sc.tl.louvain(adata, resolution=res, key_added=cluster_key)
        score = function(adata, label_key, cluster_key, **kwargs)
        score_all.append(score)
        if score_max < score:
            score_max = score
            res_max = res
            clustering = adata.obs[cluster_key]
        # remove the temporary column before the next resolution
        del adata.obs[cluster_key]

    if verbose:
        print(f'optimised clustering against {label_key}')
        print(f'optimal cluster resolution: {res_max}')
        print(f'optimal score: {score_max}')

    score_all = pd.DataFrame(zip(resolutions, score_all), columns=('resolution', 'score'))
    if plot:
        # score vs. resolution profile
        sns.lineplot(data=score_all, x='resolution', y='score').set_title('Optimal cluster resolution profile')
        plt.show()

    if inplace:
        adata.obs[cluster_key] = clustering
        return res_max, score_max, score_all
    else:
        return res_max, score_max, score_all, clustering