예제 #1
0
def test_filter_and_concat_datasets():
    cortex_dataset_1 = CortexDataset()
    cortex_dataset_1.subsample_genes(subset_genes=np.arange(0, 300))
    cortex_dataset_1.filter_cell_types(["microglia", "oligodendrocytes"])
    cortex_dataset_2 = CortexDataset()
    cortex_dataset_2.subsample_genes(subset_genes=np.arange(100, 400))
    cortex_dataset_2.filter_cell_types(
        ["endothelial-mural", "interneurons", "microglia", "oligodendrocytes"])
    cortex_dataset_2.filter_cell_types([2, 0])
    cortex_dataset_merged = GeneExpressionDataset.concat_datasets(
        cortex_dataset_1, cortex_dataset_2)
    assert cortex_dataset_merged.nb_genes == 200

    synthetic_dataset_1 = SyntheticDataset(n_batches=2, n_labels=5)
    synthetic_dataset_2 = SyntheticDataset(n_batches=3, n_labels=3)
    synthetic_merged_1 = GeneExpressionDataset.concat_datasets(
        synthetic_dataset_1, synthetic_dataset_2)
    assert synthetic_merged_1.n_batches == 5
    assert synthetic_merged_1.n_labels == 5

    synthetic_merged_2 = GeneExpressionDataset.concat_datasets(
        synthetic_dataset_1, synthetic_dataset_2, shared_labels=False)
    assert synthetic_merged_2.n_batches == 5
    assert synthetic_merged_2.n_labels == 8

    synthetic_dataset_1.filter_cell_types([0, 1, 2, 3])
    assert synthetic_dataset_1.n_labels == 4

    synthetic_dataset_1.subsample_cells(50)
    assert len(synthetic_dataset_1) == 50
예제 #2
0
def test_filter_and_concat_datasets():
    cortex_dataset_1 = CortexDataset(save_path='tests/data/')
    cortex_dataset_1.subsample_genes(subset_genes=np.arange(0, 3))
    cortex_dataset_1.filter_cell_types(["microglia", "oligodendrocytes"])
    cortex_dataset_2 = CortexDataset(save_path='tests/data/')
    cortex_dataset_2.subsample_genes(subset_genes=np.arange(1, 4))
    cortex_dataset_2.filter_cell_types(["endothelial-mural", "interneurons", "microglia", "oligodendrocytes"])
    cortex_dataset_2.filter_cell_types([2, 0])
    cortex_dataset_merged = GeneExpressionDataset.concat_datasets(cortex_dataset_1, cortex_dataset_2)
    assert cortex_dataset_merged.nb_genes == 2

    synthetic_dataset_1 = SyntheticDataset(n_batches=2, n_labels=5)
    synthetic_dataset_2 = SyntheticDataset(n_batches=3, n_labels=3)
    synthetic_merged_1 = GeneExpressionDataset.concat_datasets(synthetic_dataset_1, synthetic_dataset_2)
    assert synthetic_merged_1.n_batches == 5
    assert synthetic_merged_1.n_labels == 5

    synthetic_merged_2 = GeneExpressionDataset.concat_datasets(synthetic_dataset_1, synthetic_dataset_2,
                                                               shared_labels=False)
    assert synthetic_merged_2.n_batches == 5
    assert synthetic_merged_2.n_labels == 8

    synthetic_dataset_1.filter_cell_types([0, 1, 2, 3])
    assert synthetic_dataset_1.n_labels == 4

    synthetic_dataset_1.subsample_cells(50)
    assert len(synthetic_dataset_1) == 50

    synthetic_dataset_3 = SyntheticDataset(n_labels=6)
    synthetic_dataset_3.cell_types = np.arange(6).astype(np.str)
    synthetic_dataset_3.map_cell_types({"2": "9", ("4", "3"): "8"})
예제 #3
0
dataset1.update_cells(batch_array.ravel() == 0)

count_matrix = pd.read_csv(os.path.join(save_path, "DE.obsv.4.csv"),
                           sep=",",
                           index_col=0).T

dataset2 = GeneExpressionDataset(
    *GeneExpressionDataset.get_attributes_from_matrix(
        count_matrix.values, labels=label_array, batch_indices=batch_array),
    gene_names=gene_names,
    cell_types=np.unique(label_array))

dataset2.update_cells(batch_array.ravel() == 1)

gene_dataset = GeneExpressionDataset.concat_datasets(dataset1, dataset2)
# gene_dataset.subsample_genes(500)
labels = [
    int(gene_dataset.cell_types[i]) - 1 for i in gene_dataset.labels.ravel()
]
gene_dataset.labels = np.asarray(labels).reshape(len(labels), 1)
gene_dataset.cell_types = dataset2.cell_types
# from scipy import sparse

# gene_dataset.X = sparse.csr_matrix(gene_dataset.X )
gene_dataset.gene_names = gene_dataset.gene_names.astype('int')
dataset1.gene_names = dataset1.gene_names.astype('int')
dataset2.gene_names = dataset2.gene_names.astype('int')
# dataset1, dataset2, gene_dataset = SubsetGenes(dataset1, dataset2, gene_dataset, plotname)

CompareModels(gene_dataset, dataset1, dataset2, plotname, models)
예제 #4
0
    return auc_1, auc_2, spear, kend


save_path = "../symsim_scVI/symsim_result/DE/"

pbmc = PbmcDataset()
de_data  = pbmc.de_metadata
pbmc.update_cells(pbmc.batch_indices.ravel()==0)

donor = Dataset10X('fresh_68k_pbmc_donor_a')
donor.gene_names = donor.gene_symbols
donor.labels = np.repeat(0,len(donor)).reshape(len(donor),1)
donor.cell_types = ['unlabelled']
donor.subsample_genes(donor.nb_genes)

gene_dataset = GeneExpressionDataset.concat_datasets(pbmc, donor)


################## Generate Mis-labels
######################################################################################
labels = np.asarray(gene_dataset.labels.ravel())
# pop1 = np.where(gene_dataset.cell_types=='B cells')[0][0]
# pop2 = np.where(gene_dataset.cell_types=='Dendritic Cells')[0][0]
pop1 = np.where(gene_dataset.cell_types=='CD4 T cells')[0][0]
pop2 = np.where(gene_dataset.cell_types=='CD8 T cells')[0][0]
mislabels = deepcopy(labels)
mises = np.random.choice([0,1],len(mislabels),p=[1-misprop, misprop])
pop1cells = (labels==pop1)
pop2cells = (labels==pop2)
# flip the DE
mislabels[np.logical_and(mises, pop1cells)] = pop2