示例#1
0
def correct_scvi(Xs, genes):
    """Return the latent embedding of the merged datasets from a saved scVI VAE.

    Each matrix in ``Xs`` is wrapped in an ``AnnData`` (with ``genes`` as its
    ``var``), the resulting scVI datasets are merged into one
    ``GeneExpressionDataset``, and a VAE whose weights are loaded from
    ``data/harmonization.vae.pkl`` is used to compute the latent
    representation of every entry of the merged dataset.

    No training happens here: the checkpoint file must already exist.

    Args:
        Xs: Iterable of expression matrices, one per dataset. Each is passed
            as the first argument of ``AnnData``.
            NOTE(review): presumably cells x genes raw counts -- confirm.
        genes: Passed as ``var`` to every ``AnnData``, so it has to be
            compatible with each matrix in ``Xs``.

    Returns:
        The first element returned by ``Posterior.get_latent()`` for the
        full merged dataset (the latent coordinates).
    """
    import torch

    # Fix the seed and force deterministic cuDNN kernels for reproducibility.
    torch.manual_seed(0)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    from scvi.dataset import AnnDatasetFromAnnData
    from scvi.dataset.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE

    all_ann = [AnnDatasetFromAnnData(AnnData(X, var=genes)) for X in Xs]

    all_dataset = GeneExpressionDataset()
    all_dataset.populate_from_datasets(all_ann)

    vae = VAE(
        all_dataset.nb_genes,
        n_batch=all_dataset.n_batches,
        n_labels=all_dataset.n_labels,
        n_hidden=128,
        n_latent=30,
        n_layers=2,
        dispersion='gene',
    )
    trainer = UnsupervisedTrainer(
        vae,
        all_dataset,
        train_size=1.,
        use_cuda=True,
    )

    # The model is not trained here; weights come from a saved checkpoint.
    # To regenerate it, run `trainer.train(n_epochs=100)` and then
    # `torch.save(trainer.model.state_dict(), 'data/harmonization.vae.pkl')`.
    #
    # map_location='cpu' lets a checkpoint written from a GPU model be
    # deserialized on a CPU-only host; load_state_dict then copies each
    # tensor onto whatever device the model parameters live on.
    state_dict = torch.load('data/harmonization.vae.pkl', map_location='cpu')
    trainer.model.load_state_dict(state_dict)
    trainer.model.eval()

    # Build a posterior over every entry of the merged dataset.
    full = trainer.create_posterior(
        trainer.model,
        all_dataset,
        indices=np.arange(len(all_dataset)),
    )
    # get_latent() also returns batch indices and labels; only the latent
    # coordinates are needed by callers.
    latent, _batch_indices, _labels = full.sequential().get_latent()

    return latent
示例#2
0
                from umap import UMAP
                import scanpy as sc

                # TODO: import the datasets into SCVI objects (sigh!)
                # scVI wants raw counts, but who knows about those TabulaMurisSenis data
                # quick and dirty solution for now
                asubr_scvi = asubr.copy()
                asubr_scvi.X.data = asubr_scvi.X.data.astype(np.int64)
                ds_atlas = AnnDatasetFromAnnData(asubr_scvi)

                asub2_scvi = asub2.copy()
                asub2_scvi.X.data = asub2_scvi.X.data.astype(np.int64)
                ds_new = AnnDatasetFromAnnData(asub2_scvi)

                all_dataset = GeneExpressionDataset()
                all_dataset.populate_from_datasets([ds_atlas, ds_new])

                ##############################################################
                t0 = time.time()
                print('Prepare some data structures')
                vae = VAE(
                    all_dataset.nb_genes,
                    n_batch=all_dataset.n_batches,
                    n_labels=all_dataset.n_labels,
                    n_hidden=128,
                    n_latent=30,
                    n_layers=2,
                    dispersion='gene',
                )

                print('Prepare the trainer')