def correct_scvi(Xs, genes):
    """Embed several expression matrices with a pre-trained scVI VAE.

    Each matrix in ``Xs`` is wrapped in an ``AnnData`` (with ``genes`` as
    ``var``), converted to a scVI dataset, and all of them are merged into a
    single ``GeneExpressionDataset``. A VAE sized for the merged dataset is
    built, its weights are loaded from ``data/harmonization.vae.pkl`` (no
    training happens in this function), and the latent representation of
    every cell in the merged dataset is returned.

    Args:
        Xs: Iterable of expression matrices, one per dataset. Each one is
            passed as the data argument of ``AnnData``.
        genes: Gene annotation passed as ``var`` to every ``AnnData``.

    Returns:
        The latent array produced by ``get_latent()`` on a sequential
        posterior covering all cells of the merged dataset.

    Note:
        ``AnnData`` and ``np`` are expected to be available at module scope.
    """
    import torch

    # Seed the RNG and ask cuDNN for deterministic behaviour (benchmark
    # autotuning off) so repeated calls are reproducible.
    torch.manual_seed(0)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    from scvi.dataset import AnnDatasetFromAnnData
    from scvi.dataset.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE

    # One scVI dataset per input matrix, merged into a single dataset.
    all_ann = [AnnDatasetFromAnnData(AnnData(X, var=genes)) for X in Xs]
    all_dataset = GeneExpressionDataset()
    all_dataset.populate_from_datasets(all_ann)

    vae = VAE(
        all_dataset.nb_genes,
        n_batch=all_dataset.n_batches,
        n_labels=all_dataset.n_labels,
        n_hidden=128,
        n_latent=30,
        n_layers=2,
        dispersion='gene',
    )
    trainer = UnsupervisedTrainer(
        vae,
        all_dataset,
        train_size=1.,
        use_cuda=True,
    )

    # The checkpoint is not regenerated here. It was apparently produced
    # offline with:
    #     trainer.train(n_epochs=100)
    #     torch.save(trainer.model.state_dict(),
    #                'data/harmonization.vae.pkl')
    # Deserialize onto the CPU so a checkpoint saved from CUDA tensors can
    # still be read on a host without a GPU; load_state_dict then copies the
    # values into the model's existing parameters.
    state_dict = torch.load('data/harmonization.vae.pkl', map_location='cpu')
    trainer.model.load_state_dict(state_dict)
    trainer.model.eval()

    # Posterior over every cell, iterated in order.
    full = trainer.create_posterior(
        trainer.model,
        all_dataset,
        indices=np.arange(len(all_dataset)),
    )
    latent, _batch_indices, _labels = full.sequential().get_latent()
    return latent
from umap import UMAP
import scanpy as sc

# scVI expects raw counts, and it is unclear whether the Tabula Muris Senis
# data qualify. As a quick-and-dirty stopgap, the `.data` array of each
# matrix is cast to int64 on a copy, and the copy is wrapped in a scVI
# dataset.
# TODO: import the datasets into scVI objects properly.

# Work on copies so the int64 cast does not touch the source objects.
asubr_scvi = asubr.copy()
asub2_scvi = asub2.copy()
asubr_scvi.X.data = asubr_scvi.X.data.astype(np.int64)
asub2_scvi.X.data = asub2_scvi.X.data.astype(np.int64)

# Wrap each copy, then merge both into a single scVI dataset.
ds_atlas = AnnDatasetFromAnnData(asubr_scvi)
ds_new = AnnDatasetFromAnnData(asub2_scvi)

all_dataset = GeneExpressionDataset()
all_dataset.populate_from_datasets([ds_atlas, ds_new])

##############################################################
t0 = time.time()  # wall-clock reference taken before the model is built

print('Prepare some data structures')
vae = VAE(
    all_dataset.nb_genes,
    n_batch=all_dataset.n_batches,
    n_labels=all_dataset.n_labels,
    n_hidden=128,
    n_latent=30,
    n_layers=2,
    dispersion='gene',
)

print('Prepare the trainer')