# Example #1
0
def run_scVI(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run scVI
    Wrapper script to run scVI (SCANVI) on a benchmark dataset with cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.

    A fresh SCANVI model and trainer are built for every fold, so that no fold
    is evaluated with weights that were already fitted on its own test cells
    during an earlier fold.

    Side effects: the intermediate files 'Labels_scvi.csv' and 'Data_scvi.csv'
    are written to the current working directory, and the working directory
    is changed to OutputDir before the results are written.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    It must define n_folds, Cells_to_Keep, col_Index, Test_Idx and Train_Idx.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is "" (only read when NumGenes > 0).
    NumGenes : Number of genes used in case of feature selection (integer), default is 0
    (use all genes).

    Raises
    ------
    ValueError : if NumGenes is negative.
    '''
    if NumGenes < 0:
        # Previously a negative value skipped both set-up branches and failed
        # much later with a NameError; reject it before any file is read.
        raise ValueError("NumGenes must be >= 0, got " + str(NumGenes))
    use_feature_selection = NumGenes > 0

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = int(np.squeeze(np.array(robjects.r['n_folds'], dtype = 'int')))
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # R indices are 1-based, pandas positions are 0-based
    col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols = col)

    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # save labels as csv file with header and index column; the labels are the
    # same for every fold, so the file is written only once
    labels.to_csv('Labels_scvi.csv')

    if use_feature_selection:
        # read the feature file; the dataset itself is rebuilt per fold below
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')
        train = None
    else:
        data.to_csv('Data_scvi.csv')
        train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False)

    n_epochs = 200

    truelab = []
    pred = []
    tr_time = []
    ts_time = []

    for i in range(nfolds):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        if use_feature_selection:
            # column i of the feature file holds the gene order of fold i
            feat_to_use = features.iloc[0:NumGenes, i]
            data.iloc[:, feat_to_use].to_csv('Data_scvi.csv')

            train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False, new_n_genes = False)

        # New model and trainer for every fold (both with and without feature
        # selection), so training never continues from a previous fold.
        scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels)
        trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5)

        # the fold's training cells are the labelled set, its test cells the unlabelled set
        trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=train_ind_i.ravel(), shuffle = False)
        trainer_scanvi.labelled_set.to_monitor = ['ll', 'accuracy']
        trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=test_ind_i.ravel(), shuffle = False)
        trainer_scanvi.unlabelled_set.to_monitor = ['ll', 'accuracy']

        start = tm.time()
        trainer_scanvi.train(n_epochs)
        tr_time.append(tm.time() - start)

        ## labels of test set are in y_pred
        ## labels are returned in numbers, should be mapped back to the real labels
        ## indices are permutated
        start = tm.time()
        y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions()
        ts_time.append(tm.time() - start)

        truelab.extend(y_true)
        pred.extend(y_pred)

    # write results
    os.chdir(OutputDir)

    # file names carry the gene count only when feature selection was used
    prefix = "scVI_" + str(NumGenes) + "_" if use_feature_selection else "scVI_"

    pd.DataFrame(truelab).to_csv(prefix + "True_Labels.csv", index = False)
    pd.DataFrame(pred).to_csv(prefix + "Pred_Labels.csv", index = False)
    pd.DataFrame(tr_time).to_csv(prefix + "Training_Time.csv", index = False)
    pd.DataFrame(ts_time).to_csv(prefix + "Testing_Time.csv", index = False)
def custom_objective_hyperopt(space,
                              is_best_training=False,
                              dataset=None,
                              n_epochs=None):
    """Custom objective function for the advanced autotune tutorial.

    Builds a SCANVI model and a SemiSupervisedTrainer from the sampled
    hyperparameters in ``space``. Cells with ``batch_indices == 1`` form the
    unlabelled set and cells with ``batch_indices == 0`` the labelled pool.

    Parameters
    ----------
    space : mapping that may hold the keys "model_tunable_kwargs",
        "trainer_tunable_kwargs" and "train_func_tunable_kwargs"; missing
        keys are treated as empty dicts. The nested dicts are updated in
        place with the fixed parameters.
    is_best_training : when False, run five random 80/20 splits of the
        labelled pool and return a hyperopt result dict; when True, train on
        the whole labelled pool and return the trainer.
    dataset : dataset exposing nb_genes, n_batches, n_labels, batch_indices.
    n_epochs : number of epochs forwarded to ``trainer.train``.

    Returns
    -------
    dict with "loss" (negative mean of the collected accuracies), "space"
    and "status", or the trained SemiSupervisedTrainer.
    """
    space = defaultdict(dict, space)
    model_kwargs = space["model_tunable_kwargs"]
    trainer_kwargs = space["trainer_tunable_kwargs"]
    train_kwargs = space["train_func_tunable_kwargs"]

    # hardcoded parameters; the scVI progress bar is disabled
    fixed_trainer_kwargs = {
        "use_cuda": bool(torch.cuda.device_count()),
        "show_progbar": False,
        "frequency": 1,
    }
    fixed_model_kwargs = {}
    fixed_train_kwargs = {"n_epochs": n_epochs}

    # fixed parameters win over sampled ones
    model_kwargs.update(fixed_model_kwargs)
    trainer_kwargs.update(fixed_trainer_kwargs)
    train_kwargs.update(fixed_train_kwargs)

    model = SCANVI(dataset.nb_genes, dataset.n_batches, dataset.n_labels,
                   **model_kwargs)
    trainer = SemiSupervisedTrainer(model, dataset, **trainer_kwargs)

    unlabelled_mask = np.squeeze(dataset.batch_indices == 1)
    trainer.unlabelled_set = trainer.create_posterior(indices=unlabelled_mask)
    trainer.unlabelled_set.to_monitor = ["reconstruction_error", "accuracy"]

    labelled_mask = np.squeeze(dataset.batch_indices == 0)

    if is_best_training:
        # final run: use every labelled cell and hand back the trainer
        trainer.labelled_set = trainer.create_posterior(indices=labelled_mask)
        trainer.labelled_set.to_monitor = ["reconstruction_error", "accuracy"]
        trainer.train(**train_kwargs)
        return trainer

    # search run: repeat a random 80/20 split of the labelled cells
    n_splits = 5
    split_accuracies = np.zeros(n_splits)
    labelled_positions = labelled_mask.nonzero()[0]
    for split in range(n_splits):
        train_positions, val_positions = train_test_split(
            labelled_positions, test_size=0.2)

        trainer.labelled_set = trainer.create_posterior(
            indices=train_positions)
        trainer.labelled_set.to_monitor = ["reconstruction_error", "accuracy"]

        trainer.validation_set = trainer.create_posterior(
            indices=val_positions)
        trainer.validation_set.to_monitor = ["accuracy"]

        trainer.train(**train_kwargs)
        # the score is the last recorded accuracy of the unlabelled set
        unlabelled_history = trainer.history["accuracy_unlabelled_set"]
        split_accuracies[split] = unlabelled_history[-1]

    result = {
        "loss": -split_accuracies.mean(),
        "space": space,
        "status": STATUS_OK,
    }
    return result
# Example #3
0
def run_scVI(input_dir, output_dir, datafile, labfile, Rfile):
    '''
    Run scVI (SCANVI) with cross validation and write the results as csv files.

    A fresh SCANVI model and trainer are built for every fold, so that no fold
    is evaluated with weights that were already fitted on its own test cells
    during an earlier fold.

    Side effects: the working directory is changed to input_dir while reading
    (the intermediate files 'Labels_scvi.csv' and 'Data_scvi.csv' are written
    there) and to output_dir before the results are written.

    Parameters
    ----------
    input_dir : directory of the input files
    output_dir : directory of the output files
    datafile : name of the data file
    labfile : name of the label file
    Rfile : file to read the cross validation indices from; it must define
    n_folds, Cells_to_Keep, col_Index, Test_Idx and Train_Idx
    '''
    # every input file below is resolved relative to input_dir
    os.chdir(input_dir)

    # read the Rdata file
    robjects.r['load'](Rfile)

    nfolds = int(np.squeeze(np.array(robjects.r['n_folds'], dtype='int')))
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    # R indices are 1-based, pandas positions are 0-based
    col = np.array(robjects.r['col_Index'], dtype='int') - 1
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data
    data = pd.read_csv(datafile, index_col=0, sep=',')
    labels = pd.read_csv(labfile,
                         header=0,
                         index_col=None,
                         sep=',',
                         usecols=col)

    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # save labels as csv file with header and index column
    labels.to_csv('Labels_scvi.csv')
    data.to_csv('Data_scvi.csv')

    train = CsvDataset('Data_scvi.csv',
                       save_path=input_dir,
                       sep=",",
                       labels_file="Labels_scvi.csv",
                       gene_by_cell=False)

    n_epochs = 200

    truelab = []
    pred = []
    tr_time = []
    ts_time = []

    for i in range(nfolds):
        test_ind_i = np.array(test_ind[i], dtype='int') - 1
        train_ind_i = np.array(train_ind[i], dtype='int') - 1

        # New model and trainer for every fold, so training never continues
        # from the weights of a previous fold.
        scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels)
        trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5)

        # the fold's training cells are the labelled set, its test cells the unlabelled set
        trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(
            indices=train_ind_i.ravel(), shuffle=False)
        trainer_scanvi.labelled_set.to_monitor = ['ll', 'accuracy']
        trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(
            indices=test_ind_i.ravel(), shuffle=False)
        trainer_scanvi.unlabelled_set.to_monitor = ['ll', 'accuracy']

        start = tm.time()
        trainer_scanvi.train(n_epochs)
        tr_time.append(tm.time() - start)

        ## labels of test set are in y_pred
        ## labels are returned in numbers, should be mapped back to the real labels
        ## indices are permutated
        start = tm.time()
        y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions()
        ts_time.append(tm.time() - start)

        truelab.extend(y_true)
        pred.extend(y_pred)

    # write results
    os.chdir(output_dir)

    # NOTE: col is a numpy array, so the tag is its array repr (e.g. "[0]");
    # kept unchanged so that existing consumers still find the files.
    tag = str(col)

    pd.DataFrame(truelab).to_csv("scVI_" + tag + "_true.csv", index=False)
    pd.DataFrame(pred).to_csv("scVI_" + tag + "_pred.csv", index=False)

    pd.DataFrame(tr_time).to_csv("scVI_" + tag + "_training_time.csv", index=False)
    pd.DataFrame(ts_time).to_csv("scVI_" + tag + "_test_time.csv", index=False)
# Example #4
0
def run_scVI(trainname, testname, n,
             data_dir='/albona/nobackup/biostat/datasets/singlecell/tabulaMuris_benchmark/',
             work_dir="/dora/nobackup/yuec/scclassify/benchmark/scVI/vary_test"):
    """Train SCANVI on one dataset, predict a second one, and profile both steps.

    The train and test matrices are concatenated column-wise (they are read
    by CsvDataset with ``gene_by_cell=True``), the train cells become the
    labelled set and the test cells the unlabelled set.

    Side effects: the working directory is changed to ``work_dir``; the
    intermediate files 'data_scvi.csv' / 'labels_scvi.csv' and the result
    files '<n>_scVI_True.csv' / '<n>_scVI_Pred.csv' are written there.

    Parameters
    ----------
    trainname : base name of the training dataset; '<trainname>.csv' and
        '<trainname>_label.csv' are read from ``data_dir``.
    testname : base name of the test dataset, same file layout.
    n : prefix for the result file names (converted with ``str``).
    data_dir : directory holding the input csv files.
    work_dir : directory for intermediate and result files.

    Returns
    -------
    (mem_train, time_train, mem_test, time_test) : the values returned by
    ``display_top`` for the training and the prediction phase, and the
    elapsed whole seconds of each phase.
    """
    train = pd.read_csv(os.path.join(data_dir, trainname + '.csv'),
                        index_col=0,
                        sep=',')
    test = pd.read_csv(os.path.join(data_dir, testname + '.csv'),
                       index_col=0,
                       sep=',')
    trainlabel = pd.read_csv(os.path.join(data_dir, trainname + '_label.csv'),
                             header=0,
                             index_col=0,
                             sep=',')
    testlabel = pd.read_csv(os.path.join(data_dir, testname + '_label.csv'),
                            header=0,
                            index_col=0,
                            sep=',')

    # train cells first, test cells second: the index ranges below rely on it
    newdata = pd.concat([train, test], axis=1)
    newlabel = pd.concat([trainlabel, testlabel], axis=0)

    n_train = trainlabel.shape[0]
    n_test = testlabel.shape[0]

    os.chdir(work_dir)

    newdata.to_csv('data_scvi.csv')
    newlabel.to_csv('labels_scvi.csv')
    data = CsvDataset('data_scvi.csv',
                      save_path="",
                      sep=",",
                      labels_file="labels_scvi.csv",
                      gene_by_cell=True)

    n_epochs = 100

    # ---- training phase -------------------------------------------------
    now = time.time()
    tracemalloc.start()
    try:
        scanvi = SCANVI(data.nb_genes, data.n_batches, data.n_labels)
        trainer_scanvi = SemiSupervisedTrainer(scanvi, data, frequency=5)

        trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(
            indices=list(range(0, n_train)), shuffle=False)
        trainer_scanvi.labelled_set.to_monitor = ['ll', 'accuracy']
        trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(
            indices=list(range(n_train, n_train + n_test)), shuffle=False)
        trainer_scanvi.unlabelled_set.to_monitor = ['ll', 'accuracy']

        trainer_scanvi.train(n_epochs)

        # stop the clock before the snapshot so profiling overhead is not
        # reported as training time
        time_train = int(time.time() - now)
        mem_train = display_top(tracemalloc.take_snapshot())
    finally:
        # Without stop(), the second start() below is a no-op and the test
        # snapshot would still contain every allocation made during training.
        tracemalloc.stop()

    # ---- prediction phase -----------------------------------------------
    ## labels of test set are in y_pred
    ## labels are returned in numbers, should be mapped back to the real labels
    ## indices are permutated
    now = time.time()
    tracemalloc.start()
    try:
        y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions()

        time_test = int(time.time() - now)
        mem_test = display_top(tracemalloc.take_snapshot())
    finally:
        # do not leave tracing (and its overhead) enabled for the caller
        tracemalloc.stop()

    truelab = pd.DataFrame(list(y_true))
    pred = pd.DataFrame(list(y_pred))

    # still inside work_dir; str() lets callers pass a non-string prefix
    truelab.to_csv(str(n) + "_scVI_True.csv", index=False)
    pred.to_csv(str(n) + "_scVI_Pred.csv", index=False)

    return mem_train, time_train, mem_test, time_test
# Example #5
0
def runScanvi(adata, batch, labels):
    """Compute a scANVI embedding and store it in ``adata.obsm['X_emb']``.

    scANVI works on non-normalized counts, so the data is taken from
    ``adata.layers['counts']``. A VAE (scVI) is trained first and its weights
    are used to initialise the SCANVI model.

    Parameters
    ----------
    adata : AnnData object with a 'counts' layer.
    batch : name of the ``adata.obs`` column holding the batch assignment.
    labels : name of the ``adata.obs`` column holding the cell type labels.

    Returns
    -------
    The input ``adata``, with the latent representation added as
    ``adata.obsm['X_emb']``.

    Raises
    ------
    TypeError : if ``adata.layers`` has no 'counts' entry.
    """
    # Guard: without raw counts there is nothing to train on
    if 'counts' not in adata.layers:
        raise TypeError(
            'Adata does not contain a `counts` layer in `adata.layers[`counts`]`'
        )

    from scvi.models import VAE, SCANVI
    from scvi.inference import UnsupervisedTrainer, SemiSupervisedTrainer
    from sklearn.preprocessing import LabelEncoder
    from scvi.dataset import AnnDatasetFromAnnData
    import numpy as np

    # --- 1. data preparation ---------------------------------------------
    counts_adata = adata.copy()
    counts_adata.X = adata.layers['counts']
    del counts_adata.layers['counts']
    # Drop .raw so it cannot be picked up by mistake
    # (deleting .raw requires anndata >= 0.7)
    del counts_adata.raw

    # Integer-encode the batch and label columns under the obs names used here
    for target_key, source_key in (('batch_indices', batch),
                                   ('labels', labels)):
        encoder = LabelEncoder()
        counts_adata.obs[target_key] = encoder.fit_transform(
            counts_adata.obs[source_key].values)

    scvi_dataset = AnnDatasetFromAnnData(counts_adata)

    print("scANVI dataset object with {} batches and {} cell types".format(
        scvi_dataset.n_batches, scvi_dataset.n_labels))

    # Epoch defaults follow the scVI tutorials (scanpy_pbmc3k, harmonization):
    # at most 400 scVI epochs, and between 2 and 10 scANVI epochs.
    n_epochs_scVI = np.min([round((20000 / adata.n_obs) * 400), 400])
    n_epochs_scANVI = int(np.min([10, np.max([2, round(n_epochs_scVI / 3.)])]))

    # Architecture shared by both models so the weights can be transferred
    architecture = dict(n_latent=30, n_hidden=128, n_layers=2)

    # --- 2. scVI pre-training --------------------------------------------
    vae = VAE(scvi_dataset.nb_genes,
              reconstruction_loss='nb',
              n_batch=scvi_dataset.n_batches,
              **architecture)
    vae_trainer = UnsupervisedTrainer(vae,
                                      scvi_dataset,
                                      train_size=1.0,
                                      use_cuda=False)
    vae_trainer.train(n_epochs=n_epochs_scVI, lr=1e-3)

    # --- 3. scANVI ---------------------------------------------------------
    scanvi_model = SCANVI(scvi_dataset.nb_genes,
                          scvi_dataset.n_batches,
                          scvi_dataset.n_labels,
                          dispersion='gene',
                          reconstruction_loss='nb',
                          **architecture)
    # strict=False: only the parameters present in both models are copied
    scanvi_model.load_state_dict(vae_trainer.model.state_dict(), strict=False)

    # SemiSupervisedTrainer is used with its default parameters
    scanvi_trainer = SemiSupervisedTrainer(scanvi_model, scvi_dataset)

    # every cell goes into the labelled set ...
    scanvi_trainer.labelled_set = scanvi_trainer.create_posterior(
        scanvi_trainer.model,
        scvi_dataset,
        indices=np.arange(len(scvi_dataset)))
    # ... and a single cell into the unlabelled set
    scanvi_trainer.unlabelled_set = scanvi_trainer.create_posterior(
        indices=[0])
    scanvi_trainer.train(n_epochs=n_epochs_scANVI)

    # --- 4. latent representation of all cells -----------------------------
    full_posterior = scanvi_trainer.create_posterior(
        scanvi_trainer.model,
        scvi_dataset,
        indices=np.arange(len(scvi_dataset)))
    embedding, _, _ = full_posterior.sequential().get_latent()

    adata.obsm['X_emb'] = embedding

    return adata
# Script fragment: train scANVI on `gene_dataset`, save the predicted labels
# next to the reference labels, and build a posterior over all cells.
# `scanvi`, `gene_dataset`, `scVI_labels`, `save_path`, `rep`, `misprop` and
# `trainer` are expected to be defined earlier in the script (not visible here).
trainer_scanvi = SemiSupervisedTrainer(scanvi,
                                       gene_dataset,
                                       classification_ratio=50,
                                       n_epochs_classifier=1,
                                       lr_classification=5 * 1e-3)
# Alternative trainer, kept for reference:
# trainer_scanvi = AlternateSemiSupervisedTrainer(scanvi, gene_dataset,
#                                                 n_epochs_classifier=5, lr_classification=5 * 1e-3, kl=1)

# Cells of batch 0 form the labelled set, cells of batch 1 the unlabelled set.
# Both index arrays are shuffled in place with the global numpy RNG, so the
# order of these calls affects reproducibility under a fixed seed.
labelled = np.where(gene_dataset.batch_indices == 0)[0]
np.random.shuffle(labelled)
unlabelled = np.where(gene_dataset.batch_indices == 1)[0]
np.random.shuffle(unlabelled)
trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=labelled)
trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(
    indices=unlabelled)

trainer_scanvi.train(n_epochs=5)

# compute_predictions() is indexed with [1]; in the functions of this listing
# it is unpacked as (y_true, y_pred), so this keeps the predicted labels.
scanvi_labels = trainer_scanvi.full_dataset.sequential().compute_predictions(
)[1]

# predicted_labels = pd.DataFrame([scVI_labels,scanvi_labels],index=['scVI','scANVI'])
# One row per label source; transposed on export so that each source is a column.
predicted_labels = pd.DataFrame(
    [gene_dataset.labels.ravel(), scVI_labels, scanvi_labels],
    index=['labels', 'scVI', 'scANVI'])
# `%` binds tighter than `+`: only the file-name template is formatted,
# then it is appended to save_path.
predicted_labels.T.to_csv(save_path + 'SIM.pred_labels.%i.mis%.2f.csv' %
                          (rep, misprop))

# get latent space
# NOTE(review): the posterior is created through `trainer`, not
# `trainer_scanvi`, although it wraps the scANVI model - confirm this is intended.
full_scanvi = trainer.create_posterior(trainer_scanvi.model,
                                       gene_dataset,
                                       indices=np.arange(len(gene_dataset)))