def test_fish_rna(save_path):
    """Smoke-test the joint smFISH/scRNA-seq benchmark on Cortex + smFISH data."""
    fish = SmfishDataset(save_path)
    # Keep the fish genes plus 50 extra genes in the sequencing dataset.
    seq = CortexDataset(
        save_path=save_path,
        genes_to_keep=fish.gene_names,
        total_genes=fish.nb_genes + 50,
    )
    benchmark_fish_scrna(seq, fish)
def test_iwae(save_path):
    """Compare training wall-clock time of the IWAE objective with
    single-backward vs. backward-through-all-elements.

    The duplicated train-and-time code of the original is factored into a
    local helper; the seed is set once before both runs, exactly as before,
    so the two trainings consume the same RNG stream in the same order.
    """
    import time

    dataset = CortexDataset(save_path=save_path)
    torch.manual_seed(42)

    def _timed_train(single_backward):
        # Fresh model for each timing run (matches the original flow).
        vae = VAE(n_input=dataset.nb_genes, n_batch=dataset.n_batches).cuda()
        start = time.time()
        trainer = UnsupervisedTrainer(
            vae,
            gene_dataset=dataset,
            ratio_loss=True,
            k_importance_weighted=5,
            single_backward=single_backward,
        )
        trainer.train(n_epochs=10)
        return time.time() - start

    stop1 = _timed_train(True)
    stop2 = _timed_train(False)
    print('Time single backward : ', stop1)
    print('Time all elements : ', stop2)
def cortex_benchmark(n_epochs=250, use_cuda=True, save_path='data/', show_plot=True):
    """Train a VAE on Cortex, run DE and held-out log-likelihood, then an
    imputation benchmark on a corrupted retraining; returns the last trainer."""
    dataset = CortexDataset(save_path=save_path)
    model = VAE(dataset.nb_genes)
    trainer = UnsupervisedTrainer(model, dataset, use_cuda=use_cuda)
    trainer.train(n_epochs=n_epochs)
    trainer.train_set.differential_expression_score(
        'oligodendrocytes', 'pyramidal CA1', genes=["THY1", "MBP"]
    )
    trainer.test_set.ll()  # assert ~ 1200

    # Retrain from scratch on corrupted posteriors for the imputation benchmark.
    model = VAE(dataset.nb_genes)
    trainer = UnsupervisedTrainer(model, dataset, use_cuda=use_cuda)
    trainer.corrupt_posteriors()
    trainer.train(n_epochs=n_epochs)
    trainer.uncorrupt_posteriors()
    trainer.train_set.imputation_benchmark(
        verbose=(n_epochs > 1), save_path=save_path, show_plot=show_plot
    )

    n_samples = 10 if n_epochs == 1 else None  # n_epochs == 1 is unit tests
    trainer.train_set.show_t_sne(n_samples=n_samples)
    return trainer
def load_datasets(dataset_name, save_path="data/", url=None):
    """Instantiate the gene-expression dataset identified by ``dataset_name``.

    ``dataset_name`` is either a known dataset key, or a ``.loom`` /
    ``.h5ad`` / ``.csv`` filename loaded from ``save_path`` (optionally
    fetched from ``url``).  Raises ``Exception`` for unrecognized names.
    """
    # Lazy constructors: nothing is built until the matching key is found.
    builders = {
        "synthetic": lambda: SyntheticDataset(),
        "cortex": lambda: CortexDataset(),
        "brain_large": lambda: BrainLargeDataset(save_path=save_path),
        "retina": lambda: RetinaDataset(save_path=save_path),
        "cbmc": lambda: CbmcDataset(save_path=save_path),
        "brain_small": lambda: BrainSmallDataset(save_path=save_path),
        # Hemato ships its own sub-directory, independent of ``save_path``.
        "hemato": lambda: HematoDataset(save_path="data/HEMATO/"),
        "pbmc": lambda: PbmcDataset(save_path=save_path),
    }
    if dataset_name in builders:
        return builders[dataset_name]()
    if dataset_name[-5:] == ".loom":
        return LoomDataset(filename=dataset_name, save_path=save_path, url=url)
    if dataset_name[-5:] == ".h5ad":
        return AnnDataset(dataset_name, save_path=save_path, url=url)
    if ".csv" in dataset_name:
        return CsvDataset(dataset_name, save_path=save_path)
    raise Exception("No such dataset available")
def cortex_benchmark(n_epochs=250, use_cuda=True, save_path="data/", show_plot=True):
    """Train a VAE on Cortex, run DE between two cell types and the held-out
    reconstruction error, then an imputation benchmark on a corrupted retraining."""
    dataset = CortexDataset(save_path=save_path)
    model = VAE(dataset.nb_genes)
    trainer = UnsupervisedTrainer(model, dataset, use_cuda=use_cuda)
    trainer.train(n_epochs=n_epochs)

    couple_celltypes = (4, 5)  # the couple types on which to study DE
    cell_idx1 = dataset.labels.ravel() == couple_celltypes[0]
    cell_idx2 = dataset.labels.ravel() == couple_celltypes[1]
    trainer.train_set.differential_expression_score(
        cell_idx1, cell_idx2, genes=["THY1", "MBP"]
    )
    trainer.test_set.reconstruction_error()  # assert ~ 1200

    # Retrain from scratch on corrupted posteriors for the imputation benchmark.
    model = VAE(dataset.nb_genes)
    trainer = UnsupervisedTrainer(model, dataset, use_cuda=use_cuda)
    trainer.corrupt_posteriors()
    trainer.train(n_epochs=n_epochs)
    trainer.uncorrupt_posteriors()
    trainer.train_set.imputation_benchmark(save_path=save_path, show_plot=show_plot)

    n_samples = 10 if n_epochs == 1 else None  # n_epochs == 1 is unit tests
    trainer.train_set.show_t_sne(n_samples=n_samples)
    return trainer
def test_fish_rna(save_path):
    """Smoke-test the smFISH/scRNA benchmark with explicit fish-gene selection."""
    fish = SmfishDataset(save_path)
    seq = CortexDataset(
        save_path=save_path,
        genes_fish=fish.gene_names,
        genes_to_keep=[],
        additional_genes=50,
    )
    benchmark_fish_scrna(seq, fish)
def test_gamma_de():
    """Run both standard and gamma-based differential expression on a full posterior."""
    dataset = CortexDataset()
    model = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(model, dataset, train_size=0.5, use_cuda=use_cuda)
    trainer.train(n_epochs=2)
    full = trainer.create_posterior(
        trainer.model, dataset, indices=np.arange(len(dataset))
    )

    n_samples = 10
    M_permutation = 100
    # DE between cells labelled 0 and cells labelled 1.
    cell_idx1 = dataset.labels.ravel() == 0
    cell_idx2 = dataset.labels.ravel() == 1
    full.differential_expression_score(
        cell_idx1, cell_idx2, n_samples=n_samples, M_permutation=M_permutation
    )
    full.differential_expression_gamma(
        cell_idx1, cell_idx2, n_samples=n_samples, M_permutation=M_permutation
    )
def test_annealing_procedures(save_path):
    """Check KL-warmup annealing in both epoch-based and iteration-based modes."""
    cortex_dataset = CortexDataset(save_path=save_path)
    cortex_vae = VAE(cortex_dataset.nb_genes, cortex_dataset.n_batches)

    # Epoch warmup shorter than training: weight should have reached ~1.
    trainer = UnsupervisedTrainer(
        cortex_vae,
        cortex_dataset,
        train_size=0.5,
        use_cuda=use_cuda,
        n_epochs_kl_warmup=1,
    )
    trainer.train(n_epochs=2)
    assert trainer.kl_weight >= 0.99, "Annealing should be over"

    # Epoch warmup longer than training: weight should still be below 1.
    trainer = UnsupervisedTrainer(
        cortex_vae,
        cortex_dataset,
        train_size=0.5,
        use_cuda=use_cuda,
        n_epochs_kl_warmup=5,
    )
    trainer.train(n_epochs=2)
    assert trainer.kl_weight <= 0.99, "Annealing should be proceeding"

    # iter
    trainer = UnsupervisedTrainer(
        cortex_vae,
        cortex_dataset,
        train_size=0.5,
        use_cuda=use_cuda,
        n_iter_kl_warmup=1,
        n_epochs_kl_warmup=None,
    )
    trainer.train(n_epochs=2)
    assert trainer.kl_weight >= 0.99, "Annealing should be over"
def load_datasets(dataset_name, save_path='data/', url=None):
    """Instantiate the gene-expression dataset identified by ``dataset_name``.

    ``dataset_name`` is either a known dataset key, or a ``.loom`` /
    ``.h5ad`` / ``.csv`` filename loaded from ``save_path`` (optionally
    downloaded from ``url``).

    Raises
    ------
    ValueError
        If the name matches no known dataset.  (The original ``raise "..."``
        raised a bare string, which is itself a ``TypeError`` in Python 3 —
        exceptions must derive from ``BaseException``.)
    """
    if dataset_name == 'synthetic':
        gene_dataset = SyntheticDataset()
    elif dataset_name == 'cortex':
        gene_dataset = CortexDataset()
    elif dataset_name == 'brain_large':
        gene_dataset = BrainLargeDataset(save_path=save_path)
    elif dataset_name == 'retina':
        gene_dataset = RetinaDataset(save_path=save_path)
    elif dataset_name == 'cbmc':
        gene_dataset = CbmcDataset(save_path=save_path)
    elif dataset_name == 'brain_small':
        gene_dataset = BrainSmallDataset(save_path=save_path)
    elif dataset_name == 'hemato':
        # Hemato ships its own sub-directory, independent of ``save_path``.
        gene_dataset = HematoDataset(save_path='data/HEMATO/')
    elif dataset_name == 'pbmc':
        gene_dataset = PbmcDataset(save_path=save_path)
    elif dataset_name[-5:] == ".loom":
        gene_dataset = LoomDataset(filename=dataset_name, save_path=save_path, url=url)
    elif dataset_name[-5:] == ".h5ad":
        gene_dataset = AnnDataset(dataset_name, save_path=save_path, url=url)
    elif ".csv" in dataset_name:
        gene_dataset = CsvDataset(dataset_name, save_path=save_path)
    else:
        raise ValueError("No such dataset available")
    return gene_dataset
def test_variance_and_order_and_size(self):
    """Explicitly-kept genes come first, then the remaining genes sorted by
    decreasing variance, truncated to ``total_genes``."""
    kept = ["THY1", "sst", "Tomem2", "Crhbp"]
    total_genes = 10
    dataset_full = CortexDataset(save_path="tests/data", total_genes=None)
    dataset_small = CortexDataset(
        save_path="tests/data", genes_to_keep=kept, total_genes=total_genes
    )
    self.assertListEqual(dataset_small.gene_names[:4].tolist(), kept)

    # Genes after the kept ones must already be ordered by decreasing variance.
    small_variance = np.std(dataset_small.X[:, 4:], axis=0).argsort()[::-1]
    self.assertListEqual(small_variance.tolist(), list(range(6)))

    # Rebuild the expected gene list from the full dataset's variances.
    full_variance = np.std(dataset_full.X, axis=0).argsort()[::-1]
    variable_genes_all = dataset_full.gene_names[full_variance]
    genes_truth = (kept + [g for g in variable_genes_all if g not in kept])[
        :total_genes
    ]
    self.assertListEqual(dataset_small.gene_names.tolist(), genes_truth)
def cortex_benchmark(n_epochs=250, use_cuda=True, unit_test=False):
    """Train a VAE on Cortex and run test-set LL, DE, imputation and t-SNE."""
    dataset = CortexDataset()
    model = VAE(dataset.nb_genes)
    infer = VariationalInference(model, dataset, use_cuda=use_cuda)
    infer.train(n_epochs=n_epochs)
    infer.ll('test')  # assert ~ 1200
    infer.differential_expression('test')
    infer.imputation('test', rate=0.1)  # assert ~ 2.3
    # Small t-SNE sample in unit tests to keep them fast.
    n_samples = 10 if unit_test else 1000
    infer.show_t_sne('test', n_samples=n_samples)
    return infer
def test_iaf2(save_path):
    """Train an IAF log-normal-Poisson VAE (with the h-step) and fetch CPU latents."""
    dataset = CortexDataset(save_path=save_path)
    model = IALogNormalPoissonVAE(
        n_input=dataset.nb_genes, n_batch=dataset.n_batches, do_h=True
    ).cuda()
    trainer = UnsupervisedTrainer(model, dataset, train_size=0.5, ratio_loss=True)
    trainer.train(n_epochs=1000)
    print(trainer.train_losses)
    latents, libraries = trainer.test_set.get_latents(n_samples=5, device='cpu')
def test_classifier_accuracy(save_path):
    """Train a gene-level classifier with accuracy-based monitoring/early stopping."""
    dataset = CortexDataset(save_path=save_path)
    classifier = Classifier(dataset.nb_genes, n_labels=dataset.n_labels)
    stopping = {
        'early_stopping_metric': 'accuracy',
        'save_best_state_metric': 'accuracy',
    }
    trainer = ClassifierTrainer(
        classifier,
        dataset,
        metrics_to_monitor=['accuracy'],
        frequency=1,
        early_stopping_kwargs=stopping,
    )
    trainer.train(n_epochs=2)
    trainer.train_set.accuracy()
def test_differential_expression(save_path):
    """Exercise scale sampling plus vanilla/change DE on a trained Cortex VAE."""
    dataset = CortexDataset(save_path=save_path)
    all_indices = np.arange(len(dataset))
    vae = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.5, use_cuda=use_cuda)
    trainer.train(n_epochs=2)
    post = trainer.create_posterior(vae, dataset, shuffle=False, indices=all_indices)

    # Sample scale example
    px_scales = post.scale_sampler(
        n_samples_per_cell=4, n_samples=None, selection=all_indices
    )["scale"]
    assert (
        px_scales.shape[1] == dataset.nb_genes
    ), "posterior scales should have shape (n_samples, n_genes)"

    # Differential expression different models
    idx_1 = [1, 2, 3]
    idx_2 = [4, 5, 6, 7]
    common = dict(
        idx1=idx_1, idx2=idx_2, n_samples=10, use_permutation=True, M_permutation=100
    )
    de_dataframe = post.differential_expression_score(mode="vanilla", **common)
    de_dataframe = post.differential_expression_score(mode="change", **common)
    print(de_dataframe.keys())
    for lvl in ("0.5", "0.95"):
        assert (
            de_dataframe["confidence_interval_" + lvl + "_min"]
            <= de_dataframe["confidence_interval_" + lvl + "_max"]
        ).all()

    # DE estimation example
    de_probabilities = de_dataframe.loc[:, "proba_de"]
    assert ((0.0 <= de_probabilities) & (de_probabilities <= 1.0)).all()
def test_sampling_zl(save_path):
    """Train a classifier on samples drawn from a trained VAE with sampling_zl."""
    dataset = CortexDataset(save_path=save_path)
    vae = VAE(dataset.nb_genes, dataset.n_batches)
    vae_trainer = UnsupervisedTrainer(vae, dataset, train_size=0.5, use_cuda=use_cuda)
    vae_trainer.train(n_epochs=2)

    # Classifier input is n_latent + 1 — presumably z concatenated with the
    # library latent (sampling_zl=True); confirm against ClassifierTrainer.
    clf = Classifier(vae.n_latent + 1, n_labels=dataset.n_labels)
    clf_trainer = ClassifierTrainer(
        clf, dataset, sampling_model=vae, sampling_zl=True
    )
    clf_trainer.train(n_epochs=2)
    clf_trainer.test_set.accuracy()
def test_cortex():
    """Smoke-test VAE, SVAEC (joint and alternate semi-supervised inference)
    and a plain classifier on the Cortex dataset, one epoch each."""
    cortex_dataset = CortexDataset()

    # Unsupervised VAE: likelihood, differential expression and imputation.
    vae = VAE(cortex_dataset.nb_genes, cortex_dataset.n_batches)
    infer_cortex_vae = VariationalInference(vae, cortex_dataset, train_size=0.1, use_cuda=use_cuda)
    infer_cortex_vae.train(n_epochs=1)
    infer_cortex_vae.ll('train')
    infer_cortex_vae.differential_expression_stats('train')
    infer_cortex_vae.differential_expression('test')
    infer_cortex_vae.imputation('train', corruption='uniform')
    infer_cortex_vae.imputation('test', n_samples=2, corruption='binomial')

    # Joint semi-supervised training with 50 labelled samples per class.
    svaec = SVAEC(cortex_dataset.nb_genes, cortex_dataset.n_batches, cortex_dataset.n_labels)
    infer_cortex_svaec = JointSemiSupervisedVariationalInference(
        svaec, cortex_dataset, n_labelled_samples_per_class=50, use_cuda=use_cuda)
    infer_cortex_svaec.train(n_epochs=1)
    infer_cortex_svaec.accuracy('labelled')
    infer_cortex_svaec.ll('all')

    # Alternate semi-supervised training with a logistic-regression classifier.
    svaec = SVAEC(cortex_dataset.nb_genes, cortex_dataset.n_batches, cortex_dataset.n_labels,
                  logreg_classifier=True)
    infer_cortex_svaec = AlternateSemiSupervisedVariationalInference(
        svaec, cortex_dataset, n_labelled_samples_per_class=50, use_cuda=use_cuda)
    infer_cortex_svaec.train(n_epochs=1, lr=1e-2)
    infer_cortex_svaec.accuracy('unlabelled')
    infer_cortex_svaec.svc_rf(unit_test=True)

    # Stand-alone classifier on raw gene expression.
    cls = Classifier(cortex_dataset.nb_genes, n_labels=cortex_dataset.n_labels)
    infer_cls = ClassifierInference(cls, cortex_dataset)
    infer_cls.train(n_epochs=1)
    infer_cls.accuracy('train')
def test_full_cov():
    """Train a 2-latent ZINB VAE with full covariance and ELBO early stopping;
    the recorded test-set history must stay finite."""
    dataset = CortexDataset()
    model = VAE(
        n_input=dataset.nb_genes,
        n_batch=dataset.n_batches,
        reconstruction_loss='zinb',
        n_latent=2,
        full_cov=True,
    )
    stopping = {
        'early_stopping_metric': 'elbo',
        'save_best_state_metric': 'elbo',
        'patience': 15,
        'threshold': 3,
    }
    trainer = UnsupervisedTrainer(
        model=model,
        gene_dataset=dataset,
        use_cuda=True,
        train_size=0.7,
        frequency=1,
        early_stopping_kwargs=stopping,
    )
    trainer.train(n_epochs=20, lr=1e-3)
    assert not np.isnan(trainer.history['ll_test_set']).any()
def test_cortex(save_path):
    """Smoke-test VAE, SCANVI (joint and alternate trainers), SVC/RF baselines
    and a plain classifier on the Cortex dataset, one epoch each."""
    cortex_dataset = CortexDataset(save_path=save_path)

    # Unsupervised VAE: likelihood and DE statistics.
    vae = VAE(cortex_dataset.nb_genes, cortex_dataset.n_batches)
    trainer_cortex_vae = UnsupervisedTrainer(vae, cortex_dataset, train_size=0.5, use_cuda=use_cuda)
    trainer_cortex_vae.train(n_epochs=1)
    trainer_cortex_vae.train_set.ll()
    trainer_cortex_vae.train_set.differential_expression_stats()

    # Train on corrupted entries, then benchmark imputation of held-out values.
    trainer_cortex_vae.corrupt_posteriors(corruption='binomial')
    trainer_cortex_vae.corrupt_posteriors()
    trainer_cortex_vae.train(n_epochs=1)
    trainer_cortex_vae.uncorrupt_posteriors()
    trainer_cortex_vae.train_set.imputation_benchmark(n_samples=1, show_plot=False,
                                                      title_plot='imputation', save_path=save_path)

    # Joint semi-supervised SCANVI with 3 labelled samples per class.
    svaec = SCANVI(cortex_dataset.nb_genes, cortex_dataset.n_batches, cortex_dataset.n_labels)
    trainer_cortex_svaec = JointSemiSupervisedTrainer(svaec, cortex_dataset,
                                                      n_labelled_samples_per_class=3,
                                                      use_cuda=use_cuda)
    trainer_cortex_svaec.train(n_epochs=1)
    trainer_cortex_svaec.labelled_set.accuracy()
    trainer_cortex_svaec.full_dataset.ll()

    # Alternate semi-supervised SCANVI.
    svaec = SCANVI(cortex_dataset.nb_genes, cortex_dataset.n_batches, cortex_dataset.n_labels)
    trainer_cortex_svaec = AlternateSemiSupervisedTrainer(svaec, cortex_dataset,
                                                          n_labelled_samples_per_class=3,
                                                          use_cuda=use_cuda)
    trainer_cortex_svaec.train(n_epochs=1, lr=1e-2)
    trainer_cortex_svaec.unlabelled_set.accuracy()

    # Classical baselines (SVC, random forest) on the raw labelled/unlabelled split.
    data_train, labels_train = trainer_cortex_svaec.labelled_set.raw_data()
    data_test, labels_test = trainer_cortex_svaec.unlabelled_set.raw_data()
    compute_accuracy_svc(data_train, labels_train, data_test, labels_test,
                         param_grid=[{'C': [1], 'kernel': ['linear']}])
    compute_accuracy_rf(data_train, labels_train, data_test, labels_test,
                        param_grid=[{'max_depth': [3], 'n_estimators': [10]}])

    # Stand-alone classifier on raw gene expression.
    cls = Classifier(cortex_dataset.nb_genes, n_labels=cortex_dataset.n_labels)
    cls_trainer = ClassifierTrainer(cls, cortex_dataset)
    cls_trainer.train(n_epochs=1)
    cls_trainer.train_set.accuracy()
def test_vae_ratio_loss(save_path):
    """Train with the ratio loss on Cortex and on a synthetic log-Poisson dataset."""
    cortex = CortexDataset(save_path=save_path)
    cortex_model = VAE(cortex.nb_genes, cortex.n_batches)
    cortex_trainer = UnsupervisedTrainer(
        cortex_model, cortex, train_size=0.5, use_cuda=use_cuda, ratio_loss=True
    )
    cortex_trainer.train(n_epochs=2)

    synthetic = LatentLogPoissonDataset(n_genes=5, n_latent=2, n_cells=300, n_comps=1)
    synthetic_model = LogNormalPoissonVAE(
        synthetic.nb_genes, synthetic.n_batches, full_cov=True
    )
    synthetic_trainer = UnsupervisedTrainer(
        synthetic_model, synthetic, train_size=0.5, use_cuda=use_cuda, ratio_loss=True
    )
    synthetic_trainer.train(n_epochs=2)
def test_iaf(save_path):
    """Smoke-test EncoderIAF and the IAF-based VAEs on the Cortex dataset."""
    encoder = EncoderIAF(
        n_in=5, n_latent=2, n_cat_list=None, n_hidden=12, n_layers=2, t=3
    ).cuda()
    x = torch.rand(64, 5, device='cuda')
    z1, _ = encoder(x)
    assert z1.shape == (64, 2)

    dataset = CortexDataset(save_path=save_path)
    model = IAVAE(n_input=dataset.nb_genes, n_batch=dataset.n_batches).cuda()
    trainer = UnsupervisedTrainer(model, dataset, train_size=0.5, ratio_loss=True)
    trainer.train(n_epochs=2)
    z, labels = trainer.train_set.get_latents(n_samples=10, device='cuda')

    model = IALogNormalPoissonVAE(
        n_input=dataset.nb_genes, n_batch=dataset.n_batches
    ).cuda()
    trainer = UnsupervisedTrainer(model, dataset, train_size=0.5, ratio_loss=True)
    trainer.train(n_epochs=2)
    # Inference on random counts, without tracking gradients.
    with torch.no_grad():
        outputs = model.inference(
            x=torch.randint(
                low=1,
                high=10,
                size=(128, dataset.nb_genes),
                device='cuda',
                dtype=torch.float,
            ),
            n_samples=3,
        )
    latents, libraries = trainer.test_set.get_latents(n_samples=5, device='cpu')
def test_cortex(save_path):
    """Smoke-test the full Cortex pipeline: VAE training, posterior utilities
    (correlation, imputation, generated parameters and samples), SCANVI in
    joint and alternate modes, SVC/RF baselines and a plain classifier."""
    cortex_dataset = CortexDataset(save_path=save_path)

    # Unsupervised VAE and posterior statistics.
    vae = VAE(cortex_dataset.nb_genes, cortex_dataset.n_batches)
    trainer_cortex_vae = UnsupervisedTrainer(
        vae, cortex_dataset, train_size=0.5, use_cuda=use_cuda
    )
    trainer_cortex_vae.train(n_epochs=1)
    trainer_cortex_vae.train_set.reconstruction_error()
    trainer_cortex_vae.train_set.differential_expression_stats()
    trainer_cortex_vae.train_set.generate_feature_correlation_matrix(
        n_samples=2, correlation_type="pearson"
    )
    trainer_cortex_vae.train_set.generate_feature_correlation_matrix(
        n_samples=2, correlation_type="spearman"
    )
    trainer_cortex_vae.train_set.imputation(n_samples=1)
    trainer_cortex_vae.test_set.imputation(n_samples=5)

    # Train on corrupted entries, then benchmark imputation of held-out values.
    trainer_cortex_vae.corrupt_posteriors(corruption="binomial")
    trainer_cortex_vae.corrupt_posteriors()
    trainer_cortex_vae.train(n_epochs=1)
    trainer_cortex_vae.uncorrupt_posteriors()
    trainer_cortex_vae.train_set.imputation_benchmark(
        n_samples=1, show_plot=False, title_plot="imputation", save_path=save_path
    )

    # Generated-parameter shapes: (cells, genes) by default, an extra leading
    # sample axis with n_samples > 1, collapsed again with give_mean=True.
    trainer_cortex_vae.train_set.generate_parameters()
    n_cells, n_genes = (
        len(trainer_cortex_vae.train_set.indices),
        cortex_dataset.nb_genes,
    )
    n_samples = 3
    (dropout, means, dispersions,) = trainer_cortex_vae.train_set.generate_parameters()
    assert dropout.shape == (n_cells, n_genes) and means.shape == (n_cells, n_genes)
    assert dispersions.shape == (n_cells, n_genes)
    (dropout, means, dispersions,) = trainer_cortex_vae.train_set.generate_parameters(
        n_samples=n_samples
    )
    assert dropout.shape == (n_samples, n_cells, n_genes)
    assert means.shape == (n_samples, n_cells, n_genes,)
    (dropout, means, dispersions,) = trainer_cortex_vae.train_set.generate_parameters(
        n_samples=n_samples, give_mean=True
    )
    assert dropout.shape == (n_cells, n_genes) and means.shape == (n_cells, n_genes)

    # Posterior over the whole dataset: generated samples carry a trailing
    # sample axis, the "old" data keeps the (cells, genes) layout.
    full = trainer_cortex_vae.create_posterior(
        vae, cortex_dataset, indices=np.arange(len(cortex_dataset))
    )
    x_new, x_old = full.generate(n_samples=10)
    assert x_new.shape == (cortex_dataset.nb_cells, cortex_dataset.nb_genes, 10)
    assert x_old.shape == (cortex_dataset.nb_cells, cortex_dataset.nb_genes)
    trainer_cortex_vae.train_set.imputation_benchmark(
        n_samples=1, show_plot=False, title_plot="imputation", save_path=save_path
    )

    # Joint semi-supervised SCANVI with 3 labelled samples per class.
    svaec = SCANVI(
        cortex_dataset.nb_genes, cortex_dataset.n_batches, cortex_dataset.n_labels
    )
    trainer_cortex_svaec = JointSemiSupervisedTrainer(
        svaec, cortex_dataset, n_labelled_samples_per_class=3, use_cuda=use_cuda
    )
    trainer_cortex_svaec.train(n_epochs=1)
    trainer_cortex_svaec.labelled_set.accuracy()
    trainer_cortex_svaec.full_dataset.reconstruction_error()

    # Alternate semi-supervised SCANVI.
    svaec = SCANVI(
        cortex_dataset.nb_genes, cortex_dataset.n_batches, cortex_dataset.n_labels
    )
    trainer_cortex_svaec = AlternateSemiSupervisedTrainer(
        svaec, cortex_dataset, n_labelled_samples_per_class=3, use_cuda=use_cuda
    )
    trainer_cortex_svaec.train(n_epochs=1, lr=1e-2)
    trainer_cortex_svaec.unlabelled_set.accuracy()

    # Classical baselines (SVC, random forest) on the raw labelled/unlabelled split.
    data_train, labels_train = trainer_cortex_svaec.labelled_set.raw_data()
    data_test, labels_test = trainer_cortex_svaec.unlabelled_set.raw_data()
    compute_accuracy_svc(
        data_train,
        labels_train,
        data_test,
        labels_test,
        param_grid=[{"C": [1], "kernel": ["linear"]}],
    )
    compute_accuracy_rf(
        data_train,
        labels_train,
        data_test,
        labels_test,
        param_grid=[{"max_depth": [3], "n_estimators": [10]}],
    )

    # Stand-alone classifier on raw gene expression.
    cls = Classifier(cortex_dataset.nb_genes, n_labels=cortex_dataset.n_labels)
    cls_trainer = ClassifierTrainer(cls, cortex_dataset)
    cls_trainer.train(n_epochs=1)
    cls_trainer.train_set.accuracy()
def test_fish_rna():
    """Smoke-test the joint smFISH/scRNA-seq benchmark.

    Fix: the benchmark helper was misspelled ``benchamrk_fish_scrna``; the
    sibling tests in this suite call ``benchmark_fish_scrna``, so the typo
    would raise ``NameError`` at runtime.
    """
    gene_dataset_fish = SmfishDataset()
    gene_dataset_seq = CortexDataset(
        genes_fish=gene_dataset_fish.gene_names,
        genes_to_keep=[],
        additional_genes=50,
    )
    benchmark_fish_scrna(gene_dataset_seq, gene_dataset_fish)
def test_filter_and_concat_datasets():
    """Gene subsetting, cell-type filtering/mapping and dataset concatenation.

    Fix: ``np.str`` (a deprecated alias of the builtin, removed in
    NumPy >= 1.20) is replaced by plain ``str`` in the ``astype`` call;
    behavior is unchanged.
    """
    cortex_dataset_1 = CortexDataset(save_path='tests/data/')
    cortex_dataset_1.subsample_genes(subset_genes=np.arange(0, 3))
    cortex_dataset_1.filter_cell_types(["microglia", "oligodendrocytes"])

    cortex_dataset_2 = CortexDataset(save_path='tests/data/')
    cortex_dataset_2.subsample_genes(subset_genes=np.arange(1, 4))
    cortex_dataset_2.filter_cell_types(
        ["endothelial-mural", "interneurons", "microglia", "oligodendrocytes"]
    )
    cortex_dataset_2.filter_cell_types([2, 0])

    # Concatenation keeps only shared genes: overlap of [0, 3) and [1, 4) is 2.
    cortex_dataset_merged = GeneExpressionDataset.concat_datasets(
        cortex_dataset_1, cortex_dataset_2
    )
    assert cortex_dataset_merged.nb_genes == 2

    synthetic_dataset_1 = SyntheticDataset(n_batches=2, n_labels=5)
    synthetic_dataset_2 = SyntheticDataset(n_batches=3, n_labels=3)
    synthetic_merged_1 = GeneExpressionDataset.concat_datasets(
        synthetic_dataset_1, synthetic_dataset_2
    )
    assert synthetic_merged_1.n_batches == 5
    assert synthetic_merged_1.n_labels == 5

    synthetic_merged_2 = GeneExpressionDataset.concat_datasets(
        synthetic_dataset_1, synthetic_dataset_2, shared_labels=False
    )
    assert synthetic_merged_2.n_batches == 5
    assert synthetic_merged_2.n_labels == 8

    synthetic_dataset_1.filter_cell_types([0, 1, 2, 3])
    assert synthetic_dataset_1.n_labels == 4
    synthetic_dataset_1.subsample_cells(50)
    assert len(synthetic_dataset_1) == 50

    synthetic_dataset_3 = SyntheticDataset(n_labels=6)
    synthetic_dataset_3.cell_types = np.arange(6).astype(str)
    synthetic_dataset_3.map_cell_types({"2": "9", ("4", "3"): "8"})
def test_differential_expression(save_path):
    """Exercise posterior save/load, scale sampling and differential expression
    (vanilla and change modes) for both a Cortex VAE and a 10X totalVI model."""
    dataset = CortexDataset(save_path=save_path)
    n_cells = len(dataset)
    all_indices = np.arange(n_cells)
    vae = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.5, use_cuda=use_cuda)
    trainer.train(n_epochs=2)
    post = trainer.create_posterior(vae, dataset, shuffle=False, indices=all_indices)

    # Round-trip the posterior through disk and check it reloads intact.
    with tempfile.TemporaryDirectory() as temp_dir:
        posterior_save_path = os.path.join(temp_dir, "posterior_data")
        post.save_posterior(posterior_save_path)
        new_vae = VAE(dataset.nb_genes, dataset.n_batches)
        new_post = load_posterior(posterior_save_path, model=new_vae, use_cuda=False)
        assert np.array_equal(new_post.indices, post.indices)
        assert np.array_equal(new_post.gene_dataset.X, post.gene_dataset.X)

    # Sample scale example
    px_scales = post.scale_sampler(
        n_samples_per_cell=4, n_samples=None, selection=all_indices
    )["scale"]
    assert (
        px_scales.shape[1] == dataset.nb_genes
    ), "posterior scales should have shape (n_samples, n_genes)"

    # Differential expression different models
    idx_1 = [1, 2, 3]
    idx_2 = [4, 5, 6, 7]
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="vanilla",
        use_permutation=True,
        M_permutation=100,
    )
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="change",
        use_permutation=True,
        M_permutation=100,
        cred_interval_lvls=[0.5, 0.95],
    )
    print(de_dataframe.keys())
    # Credible-interval bounds must be ordered for every gene.
    assert (
        de_dataframe["confidence_interval_0.5_min"]
        <= de_dataframe["confidence_interval_0.5_max"]
    ).all()
    assert (
        de_dataframe["confidence_interval_0.95_min"]
        <= de_dataframe["confidence_interval_0.95_max"]
    ).all()

    # DE estimation example
    de_probabilities = de_dataframe.loc[:, "proba_de"]
    assert ((0.0 <= de_probabilities) & (de_probabilities <= 1.0)).all()

    # Test totalVI DE
    sp = os.path.join(save_path, "10X")
    dataset = Dataset10X(dataset_name="pbmc_10k_protein_v3", save_path=sp)
    n_cells = len(dataset)
    all_indices = np.arange(n_cells)
    vae = TOTALVI(
        dataset.nb_genes, len(dataset.protein_names), n_batch=dataset.n_batches
    )
    trainer = TotalTrainer(
        vae, dataset, train_size=0.5, use_cuda=use_cuda, early_stopping_kwargs=None
    )
    trainer.train(n_epochs=2)
    post = trainer.create_posterior(
        vae, dataset, shuffle=False, indices=all_indices, type_class=TotalPosterior
    )

    # Differential expression different models
    idx_1 = [1, 2, 3]
    idx_2 = [4, 5, 6, 7]
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="vanilla",
        use_permutation=True,
        M_permutation=100,
    )
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="change",
        use_permutation=True,
        M_permutation=100,
    )
def test_filter_and_concat_datasets():
    """Gene subsetting, cell-type filtering and dataset concatenation semantics."""
    cortex_1 = CortexDataset()
    cortex_1.subsample_genes(subset_genes=np.arange(0, 300))
    cortex_1.filter_cell_types(["microglia", "oligodendrocytes"])

    cortex_2 = CortexDataset()
    cortex_2.subsample_genes(subset_genes=np.arange(100, 400))
    cortex_2.filter_cell_types(
        ["endothelial-mural", "interneurons", "microglia", "oligodendrocytes"]
    )
    cortex_2.filter_cell_types([2, 0])

    # Concatenation keeps only shared genes: overlap of [0, 300) and [100, 400).
    merged = GeneExpressionDataset.concat_datasets(cortex_1, cortex_2)
    assert merged.nb_genes == 200

    synthetic_1 = SyntheticDataset(n_batches=2, n_labels=5)
    synthetic_2 = SyntheticDataset(n_batches=3, n_labels=3)

    merged_shared = GeneExpressionDataset.concat_datasets(synthetic_1, synthetic_2)
    assert merged_shared.n_batches == 5
    assert merged_shared.n_labels == 5

    merged_unshared = GeneExpressionDataset.concat_datasets(
        synthetic_1, synthetic_2, shared_labels=False
    )
    assert merged_unshared.n_batches == 5
    assert merged_unshared.n_labels == 8

    synthetic_1.filter_cell_types([0, 1, 2, 3])
    assert synthetic_1.n_labels == 4
    synthetic_1.subsample_cells(50)
    assert len(synthetic_1) == 50
def test_populate_from_datasets_cortex(self):
    """populate_from_datasets keeps only the genes shared by both Cortex subsets."""
    first = CortexDataset(save_path="tests/data")
    first.subsample_genes(subset_genes=np.arange(0, 3), mode="variance")
    first.filter_cell_types(["microglia", "oligodendrocytes"])

    second = CortexDataset(save_path="tests/data")
    second.subsample_genes(subset_genes=np.arange(1, 4), mode="variance")
    second.filter_cell_types([
        "endothelial-mural", "interneurons", "microglia", "oligodendrocytes"
    ])
    second.filter_cell_types([2, 0])

    merged = GeneExpressionDataset()
    merged.populate_from_datasets([first, second])
    # Overlap of gene ranges [0, 3) and [1, 4) is two genes.
    self.assertEqual(2, merged.nb_genes)
def to_tensor(x):
    """ numpy array to pytorch tensor """
    # Cast to float32 before moving onto the globally-configured device.
    return torch.from_numpy(x.astype('float32')).to(torch_device)


def to_array(x):
    """ pytorch tensor to numpy array """
    # Sparse matrices expose .todense(); densify before converting.
    if hasattr(x, 'todense'):
        return np.array(x.todense())
    # Torch tensors expose .cpu(); pull the data back to host memory first.
    if hasattr(x, 'cpu'):
        return x.data.cpu().numpy()
    return x


# Load dataset
cortex = CortexDataset(save_path=SAVE_DATA_PATH)
X = cortex.X
labels = cortex.cell_types
n_labels = len(labels)
# One-hot label matrix built from the integer label column.
Y = one_hot(cortex.labels.ravel(), n_labels)

# ===========================================================================
# scVI
# ===========================================================================
# NOTE(review): this constructor call is truncated in the visible chunk —
# its remaining keyword arguments continue beyond this excerpt.
scvi = VAE(n_input=cortex.nb_genes, n_batch=0, n_labels=0, n_hidden=n_hidden, n_latent=n_latent, n_layers=n_layer, dispersion=dispersion,
def test_populate(self):
    """One epoch of unsupervised training runs on the loaded Cortex dataset."""
    unsupervised_training_one_epoch(CortexDataset(save_path="tests/data"))
# Demo-script settings.
show_plot = True

import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from scvi.dataset import CortexDataset, RetinaDataset
from scvi.models import *
from scvi.inference import UnsupervisedTrainer
import torch
import ssl

# Disable TLS certificate verification so the dataset download succeeds on
# machines without a usable CA bundle.
ssl._create_default_https_context = ssl._create_unverified_context

# Load the Cortex dataset and train an unsupervised VAE on 75% of the cells.
# NOTE(review): `save_path` and `n_epochs_all` are expected to be defined by
# the surrounding notebook/script — they are not visible in this chunk.
gene_dataset = CortexDataset(save_path=save_path)
n_epochs = 400 if n_epochs_all is None else n_epochs_all
lr = 1e-3
use_batches = False
use_cuda = True

# With use_batches False the batch count multiplies to 0 (batch-agnostic model).
vae = VAE(gene_dataset.nb_genes, n_batch=gene_dataset.n_batches * use_batches)
trainer = UnsupervisedTrainer(vae, gene_dataset, train_size=0.75,
                              use_cuda=use_cuda, frequency=5, verbose=True)
trainer.train(n_epochs=n_epochs, lr=lr)