Пример #1
0
def _per_class_accuracy(predicted, truth, classes):
    """Return one accuracy per entry of `classes`.

    For each class ``c`` the value is the mean of ``predicted == c`` over the
    positions where ``truth == c``. A class with no such position yields
    ``nan`` (mean of an empty selection).
    """
    return [np.mean(predicted[truth == c] == c) for c in classes]


def _format_row(name, values, cell_format):
    """Build one tab-separated line: `name`, then every value rendered with `cell_format`."""
    return name + "\t" + cell_format * len(values) % tuple(values) + "\n"


def SCANVI_acc(gene_dataset:GeneExpressionDataset, plotname: str,pred1,pred2,coral1,coral2, rep='0'):
    """Write per-cell-type accuracies of several label-transfer methods to a text file.

    The table is written to ``../<plotname>/scanvi_acc.txt``. The header row
    lists ``gene_dataset.cell_types``; every following row holds one method
    name and one accuracy per value of ``np.unique(labels)``.

    Rows written, in order:

    * ``scanvi``  - ``SemiSupervisedTrainer`` with its own labelled/unlabelled
      split.
    * ``scanvi1`` - ``AlternateSemiSupervisedTrainer`` with batch 0 as the
      labelled set and batch 1 as the unlabelled set.
    * ``scanvi2`` - same trainer with the two batches swapped.
    * ``scmap1`` / ``coral1`` - `pred1` / `coral1` scored against the labels
      of the cells with ``batch_indices == 1``.
    * ``scmap2`` / ``coral2`` - `pred2` / `coral2` scored against the labels
      of the cells with ``batch_indices == 0``.

    For ``scanvi1`` and ``scanvi2`` a cell type that does not occur in both
    the labelled and the unlabelled set is reported as ``-1``.

    Args:
        gene_dataset: Dataset providing ``cell_types``, ``labels``,
            ``batch_indices``, ``nb_genes``, ``n_batches`` and ``n_labels``.
        plotname: Name of the output directory under ``..``; also forwarded
            to ``trainVAE``.
        pred1: Predicted labels, indexable by a boolean mask over the
            batch-1 cells.
        pred2: Predicted labels, indexable by a boolean mask over the
            batch-0 cells.
        coral1: Same layout as `pred1`.
        coral2: Same layout as `pred2`.
        rep: Forwarded unchanged to ``trainVAE``.
    """
    fname = '../%s/scanvi_acc.txt' % (plotname)
    methods = ['scanvi', 'scanvi1', 'scanvi2']

    # The context manager guarantees the file is flushed and closed even when
    # training or scoring raises part-way through.
    with open(fname, "w+") as f:
        f.write(_format_row('method', gene_dataset.cell_types, "%s\t"))

        for method in methods:
            # A VAE is obtained for every method, exactly as before, so each
            # scANVI model starts from its own call to trainVAE.
            vae_posterior = trainVAE(gene_dataset, plotname, rep)
            scanvi = SCANVI(gene_dataset.nb_genes, gene_dataset.n_batches,
                            gene_dataset.n_labels, n_layers=2)
            scanvi.load_state_dict(vae_posterior.model.state_dict(), strict=False)

            if method == 'scanvi':
                trainer_scanvi = SemiSupervisedTrainer(
                    scanvi, gene_dataset, classification_ratio=50,
                    n_epochs_classifier=1, lr_classification=5 * 1e-3)
            else:
                trainer_scanvi = AlternateSemiSupervisedTrainer(
                    scanvi, gene_dataset, classification_ratio=10,
                    n_epochs_classifier=50, lr_classification=5 * 1e-3)
                # scanvi1 learns from batch 0 and predicts batch 1;
                # scanvi2 is the mirror image.
                labelled_batch, unlabelled_batch = (0, 1) if method == 'scanvi1' else (1, 0)
                trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(
                    indices=(gene_dataset.batch_indices == labelled_batch))
                trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(
                    indices=(gene_dataset.batch_indices == unlabelled_batch))

            trainer_scanvi.train(n_epochs=5)

            labelled_idx = trainer_scanvi.labelled_set.indices
            unlabelled_idx = trainer_scanvi.unlabelled_set.indices
            full = trainer_scanvi.create_posterior(
                trainer_scanvi.model, gene_dataset, indices=np.arange(len(gene_dataset)))
            labels, labels_pred = full.sequential().compute_predictions()

            classes = np.unique(labels)
            shared = set(labels[labelled_idx]).intersection(set(labels[unlabelled_idx]))
            acc = _per_class_accuracy(labels_pred[unlabelled_idx], labels[unlabelled_idx], classes)

            if method != 'scanvi':
                # `acc` is ordered like `classes`, so mask by position rather
                # than by label value: the two differ as soon as the label ids
                # are not exactly 0..n-1.
                for pos, cell_type in enumerate(classes):
                    if cell_type not in shared:
                        acc[pos] = -1
            f.write(_format_row(method, acc, "%.4f\t"))

        labels = gene_dataset.labels.ravel()
        batch = gene_dataset.batch_indices.ravel()
        classes = np.unique(labels)

        # Each external prediction vector is scored against the true labels of
        # the batch it was predicted for.
        baselines = (
            ('scmap1', pred1, 1),
            ('scmap2', pred2, 0),
            ('coral1', coral1, 1),
            ('coral2', coral2, 0),
        )
        for name, predicted, target_batch in baselines:
            acc = _per_class_accuracy(predicted, labels[batch == target_batch], classes)
            f.write(_format_row(name, acc, "%.4f\t"))
Пример #2
0
def runScanvi(adata, batch, labels):
    """Compute a scANVI embedding and store it in ``adata.obsm['X_emb']``.

    A scVI ``VAE`` is trained first on the raw counts and its weights are used
    to initialise a ``SCANVI`` model, which is then trained with every cell in
    the labelled set. The latent representation of all cells is written back
    onto the *input* object.

    Args:
        adata: Annotated data object with un-normalised counts in
            ``adata.layers['counts']``.
        batch: Name of the ``adata.obs`` column holding the batch covariate.
        labels: Name of the ``adata.obs`` column holding the cell-type labels.

    Returns:
        The same `adata` object, with ``obsm['X_emb']`` set.

    Raises:
        TypeError: If ``adata.layers`` has no ``'counts'`` entry.
    """
    # scANVI must see un-normalised counts, so bail out early without them.
    if 'counts' not in adata.layers:
        raise TypeError(
            'Adata does not contain a `counts` layer in `adata.layers[`counts`]`'
        )

    from scvi.models import VAE, SCANVI
    from scvi.inference import UnsupervisedTrainer, SemiSupervisedTrainer
    from sklearn.preprocessing import LabelEncoder
    from scvi.dataset import AnnDatasetFromAnnData
    import numpy as np

    # ---- 1. Build the scVI dataset from a copy whose X holds the counts ----
    counts_adata = adata.copy()
    counts_adata.X = adata.layers['counts']
    del counts_adata.layers['counts']
    # Drop .raw so it cannot be picked up by accident
    # (deleting .raw requires anndata >= 0.7).
    del counts_adata.raw

    # Integer-encode the batch and label columns under the names scVI expects.
    counts_adata.obs['batch_indices'] = LabelEncoder().fit_transform(
        counts_adata.obs[batch].values)
    counts_adata.obs['labels'] = LabelEncoder().fit_transform(
        counts_adata.obs[labels].values)

    dataset = AnnDatasetFromAnnData(counts_adata)
    # Only the scVI dataset is used from here on; release the AnnData copy.
    del counts_adata

    print("scANVI dataset object with {} batches and {} cell types".format(
        dataset.n_batches, dataset.n_labels))

    # ---- 2. Training schedule and shared architecture ----
    # Epoch heuristic follows the scVI tutorials: scale with 20000 / n_obs,
    # capped at 400 for scVI; scANVI gets a third of that, clamped to [2, 10].
    scaled_epochs = round((20000 / adata.n_obs) * 400)
    n_epochs_scVI = np.min([scaled_epochs, 400])
    one_third = round(n_epochs_scVI / 3.)
    n_epochs_scANVI = int(np.min([10, np.max([2, one_third])]))

    architecture = dict(n_latent=30, n_hidden=128, n_layers=2)

    # ---- 3. Pre-train scVI ----
    vae = VAE(
        dataset.nb_genes,
        reconstruction_loss='nb',
        n_batch=dataset.n_batches,
        **architecture
    )
    trainer = UnsupervisedTrainer(vae, dataset, train_size=1.0, use_cuda=False)
    trainer.train(n_epochs=n_epochs_scVI, lr=1e-3)

    # ---- 4. Train scANVI, starting from the scVI weights ----
    scanvi = SCANVI(
        dataset.nb_genes,
        dataset.n_batches,
        dataset.n_labels,
        dispersion='gene',
        reconstruction_loss='nb',
        **architecture
    )
    # strict=False: only parameters present in both models are copied over.
    scanvi.load_state_dict(trainer.model.state_dict(), strict=False)

    # Trainer is left on its default settings.
    trainer_scanvi = SemiSupervisedTrainer(scanvi, dataset)
    n_cells = len(dataset)
    # Every cell is labelled ...
    trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(
        trainer_scanvi.model, dataset, indices=np.arange(n_cells))
    # ... and a single cell is placed in the unlabelled set.
    trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(
        indices=[0])
    trainer_scanvi.train(n_epochs=n_epochs_scANVI)

    # ---- 5. Latent space of all cells, in dataset order ----
    scanvi_full = trainer_scanvi.create_posterior(
        trainer_scanvi.model, dataset, indices=np.arange(n_cells))
    latent, _, _ = scanvi_full.sequential().get_latent()

    adata.obsm['X_emb'] = latent

    return adata