def _subsample_seurat_ari(X, X_dimred, genes, name, N, cluster_labels_full,
                          sample_fn, subdir, log_name, ari_name):
    """Cluster a subsample with Seurat and score it against the full clustering.

    Draws N cells with ``sample_fn``, writes them under ``name/subdir+N`` for
    Seurat, propagates the subsample's cluster labels back to every cell via
    ``label_approx``, and logs the adjusted Rand index (ARI) against
    ``cluster_labels_full``.

    ``subdir``/``log_name``/``ari_name`` differ only in capitalization across
    callers ('gs'/'GS'/'GS' vs. 'uni'/'uniform'/'Uniform') so the log output
    stays identical to the historical messages.
    """
    samp_idx = sample_fn(X_dimred, N)
    samp_name = name + '/{}{}'.format(subdir, N)
    save_mtx(samp_name, csr_matrix(X[samp_idx, :]), genes)

    log('Seurat clustering {} N = {}...'.format(log_name, N))
    seurat_labels = seurat_cluster(samp_name)
    log('Seurat clustering {} N = {} done.'.format(log_name, N))

    # Assign every cell the label of its nearest subsampled neighbor.
    cluster_labels = label_approx(X_dimred, X_dimred[samp_idx], seurat_labels)
    log('N = {}, {} ARI = {}'.format(
        N, ari_name,
        adjusted_rand_score(cluster_labels_full, cluster_labels)))


def experiment_seurat_ari(data_names, namespace):
    """Compare geometric-sketch vs. uniform subsampling by Seurat ARI.

    Loads and merges ``data_names``, clusters the full dataset once with
    Seurat as the reference labeling, then for several subsample sizes N
    clusters a geometric sketch (``gs``) and a uniform subsample and logs
    each one's ARI against the reference. Results are written/read under
    ``data/<namespace>``.
    """
    datasets, genes_list, n_cells = load_names(data_names, norm=False)
    datasets, genes = merge_datasets(datasets, genes_list)
    X = vstack(datasets)
    X_dimred = reduce_dimensionality(normalize(X))

    name = 'data/{}'.format(namespace)
    Ns = [500, 1000, 2000, 5000, 10000]

    # Reference: Seurat clustering of the full dataset (matrix written once).
    if not os.path.isfile('{}/matrix.mtx'.format(name)):
        save_mtx(name, csr_matrix(X), genes)
    log('Seurat clustering full dataset...')
    cluster_labels_full = seurat_cluster(name)
    log('Seurat clustering done.')

    for N in Ns:
        _subsample_seurat_ari(X, X_dimred, genes, name, N,
                              cluster_labels_full, gs,
                              'gs', 'GS', 'GS')
        _subsample_seurat_ari(X, X_dimred, genes, name, N,
                              cluster_labels_full, uniform,
                              'uni', 'uniform', 'Uniform')
def load_data():
    """Build an AnnData object for the 'data/norman2019_k562' dataset.

    Cells are restricted to the QC rows reported by ``load_meta``,
    library-size normalized to 1e5 counts per cell, log1p-transformed,
    and annotated per cell with their perturbation label; the top 5000
    highly variable genes are then flagged.
    """
    names = [
        'data/norman2019_k562',
    ]
    [counts], [gene_names], _ = load_names(names, norm=False)

    # Metadata supplies both the QC row selection and per-cell perturbations.
    qc_idx, perturbs = load_meta(names[0])
    counts = counts[qc_idx]

    # Normalize each cell to 1e5 total counts, then log-transform.
    counts = normalize(counts, norm='l1') * 1e5
    counts = counts.log1p()

    adata = AnnData(counts)
    adata.var_names = gene_names
    adata.var_names_make_unique()
    adata.obs['perturb'] = perturbs

    sc.pp.highly_variable_genes(adata, n_top_genes=5000)

    return adata
def kl_divergence(cell_labels, samp_idx, expected):
    """KL divergence between a subsample's cluster histogram and ``expected``.

    ``cell_labels`` holds an integer cluster id per cell; ``samp_idx``
    selects the subsampled cells. The subsample's label counts are binned
    over 0..max(cell_labels), normalized to a distribution, and compared
    to ``expected`` with ``scipy.stats.entropy`` (its two-argument KL form).
    """
    sampled = cell_labels[samp_idx]
    present = set(cell_labels)
    n_bins = max(present) + 1

    hist = np.zeros(n_bins)
    for label in range(n_bins):
        # Bins for ids absent from the full labeling stay at zero.
        if label in present:
            hist[label] = np.sum(sampled == label)
    hist /= np.sum(hist)

    return scipy.stats.entropy(hist, expected)

if __name__ == '__main__':
    # NOTE(review): this driver references names (data_names, load_names,
    # pca, DIMRED, ...) defined elsewhere in the project, and the loop body
    # may be truncated in this chunk -- kept behavior-identical.
    datasets, genes_list, n_cells = load_names(data_names, norm=False)
    datasets, genes = merge_datasets(datasets, genes_list)
    X = vstack(datasets)

    # Reduce to the top DIMRED principal components, scaled by singular values.
    k = DIMRED
    U, s, Vt = pca(normalize(X), k=k)
    X_dimred = U[:, :k] * s[:k]

    Xs = []
    labels = []
    translate = X_dimred.max(0)
    for i in range(3):
        # Downsample by 10x each round and shift each copy apart in space.
        rand_idx = np.random.choice(
            X.shape[0], size=int(X.shape[0] / (10 ** i)), replace=False)
        Xs.append(X_dimred[rand_idx, :] + (translate * 2 * i))
# Input panels: one single-nucleus dataset plus nine DropViz regional datasets.
data_names = [
    'data/mouse_brain/nuclei',
    'data/mouse_brain/dropviz/Cerebellum_ALT',
    'data/mouse_brain/dropviz/Cortex_noRep5_FRONTALonly',
    'data/mouse_brain/dropviz/Cortex_noRep5_POSTERIORonly',
    'data/mouse_brain/dropviz/EntoPeduncular',
    'data/mouse_brain/dropviz/GlobusPallidus',
    'data/mouse_brain/dropviz/Hippocampus',
    'data/mouse_brain/dropviz/Striatum',
    'data/mouse_brain/dropviz/SubstantiaNigra',
    'data/mouse_brain/dropviz/Thalamus',
]

if __name__ == '__main__':
    # Load all panels, merge them on their shared genes, and preprocess
    # to a dimension-reduced representation (helpers defined elsewhere).
    datasets, genes_list, n_cells = load_names(data_names)
    datasets, genes = merge_datasets(datasets, genes_list,
                                     ds_names=data_names)
    datasets_dimred, genes = process_data(datasets, genes, verbose=True)

    # Time panorama assembly; BATCH_SIZE is presumably a module-level
    # constant -- not visible in this chunk.
    t0 = time()
    datasets_dimred = assemble(
        datasets_dimred, batch_size=BATCH_SIZE,
    )
    print('Integrated panoramas in {:.3f}s'.format(time() - t0))

    t0 = time()
    # NOTE(review): the source appears truncated mid-call here -- the
    # remaining arguments to correct() are outside this chunk.
    datasets_dimred, datasets, genes = correct(datasets, genes_list,
from scipy.sparse import vstack
from sklearn.preprocessing import normalize, LabelEncoder
import sys

from process import load_names

NAMESPACE = 'polarized'

# GM-CSF vs. M-CSF polarized macrophage samples, two replicates each.
data_names = [
    'data/macrophage/gmcsf_day6_1',
    'data/macrophage/gmcsf_day6_2',
    'data/macrophage/mcsf_day6_1',
    'data/macrophage/mcsf_day6_2',
]

if __name__ == '__main__':
    # Load, merge on shared genes, and reduce dimensionality.
    datasets, genes_list, n_cells = load_names(data_names, log1p=True)
    datasets, genes = merge_datasets(datasets, genes_list)
    datasets_dimred, genes = process_data(datasets, genes)

    # One integer label per cell recording its dataset of origin.
    labels = []
    names = []
    curr_label = 0
    for idx, ds in enumerate(datasets):
        labels.extend([curr_label] * ds.shape[0])
        names.append(data_names[idx])
        curr_label += 1
    labels = np.array(labels, dtype=int)

    polarized_genes = ['PRDX1']

    # NOTE(review): trailing commented-out call is truncated in this chunk.
    #embedding = visualize(datasets_dimred,
import numpy as np
from process import load_names, merge_datasets
from utils import *

NAMESPACE = 'human_cordblood_ica'
DIMRED = 100
DR_METHOD = 'svd'

data_names = [
    'data/ica/ica_cord_blood_h5',
]
namespaces = [
    'ica_cord_blood',
]

[X], [genes], _ = load_names(data_names)

# Cell QC on two criteria: total UMI count and ribosomal-gene fraction.
umi_sum = np.sum(X, axis=1)

# Cells with at least 500 total UMIs.
gt_idx = [i for i, total in enumerate(umi_sum) if total >= 500]

# Columns belonging to ribosomal protein genes (RPS*/RPL*).
low_idx = [j for j, gene in enumerate(genes)
           if gene.startswith(('RPS', 'RPL'))]

# Cells whose ribosomal counts make up at most half of their UMIs.
lt_idx = [i for i, frac in
          enumerate(np.sum(X[:, low_idx], axis=1) / umi_sum)
          if frac <= 0.5]

# Keep only cells passing both filters, in ascending row order.
qc_idx = sorted(set(gt_idx) & set(lt_idx))
X = csr_matrix(X[qc_idx])
from process import load_names
from scanorama import *

NAMESPACE = 'hsc'

data_names = [
    'data/hsc/hsc_mars',
    'data/hsc/hsc_ss2',
]

# Computes the probability that the corrected SS2 dataset
# comes from the original SS2 distribution or from the same
# distribution as the corrected MARS-Seq dataset.
if __name__ == '__main__':
    # Load both HSC datasets, merge on shared genes, preprocess, and
    # L2-normalize each cell.
    datasets, genes_list, n_cells = load_names(data_names, verbose=False)
    datasets, genes = merge_datasets(datasets, genes_list, verbose=False)
    datasets, genes = process_data(datasets, genes)
    datasets = [normalize(mat, axis=1) for mat in datasets]

    # Mixture model fit to the SS2 dataset before correction.
    gm_ss2 = GaussianMixture(n_components=3, n_init=3).fit(datasets[1])

    # Batch-correct the panels, then re-normalize each cell.
    datasets = assemble(datasets, verbose=False, knn=KNN,
                        sigma=SIGMA, approx=APPROX)
    datasets = [normalize(mat, axis=1) for mat in datasets]
from pancreas_tests import *
from process import load_names

NAMESPACE = 'pancreas'

# Five pancreatic islet datasets from different platforms.
data_names = [
    'data/pancreas/pancreas_inDrop',
    'data/pancreas/pancreas_multi_celseq2_expression_matrix',
    'data/pancreas/pancreas_multi_celseq_expression_matrix',
    'data/pancreas/pancreas_multi_fluidigmc1_expression_matrix',
    'data/pancreas/pancreas_multi_smartseq2_expression_matrix',
]

if __name__ == '__main__':
    datasets, genes_list, n_cells = load_names(data_names)

    # One integer label per cell recording its dataset of origin.
    labels = []
    names = []
    curr_label = 0
    for idx, ds in enumerate(datasets):
        labels.extend([curr_label] * ds.shape[0])
        names.append(data_names[idx])
        curr_label += 1
    labels = np.array(labels, dtype=int)

    # Batch-correct; return_dimred also yields the integrated embedding.
    datasets_dimred, datasets, genes = correct(datasets, genes_list,
                                               ds_names=data_names,
                                               return_dimred=True)