def __init__(self, filename, save_path='data/', type='filtered', dense=False, remote=True):
    """Load a 10X Genomics expression dataset.

    :param filename: 10X sample name; when remote it must be a key of ``to_groups``.
    :param save_path: root directory under which data is stored / downloaded.
    :param type: matrix flavor ('filtered' or 'raw'); NOTE: shadows the builtin,
        kept for caller compatibility.
    :param dense: whether downstream preprocessing should densify the matrix.
    :param remote: download from the 10X servers when True, else read a local folder.
    """
    self.remote = remote
    self.save_path = save_path
    self.dense = dense
    if remote:
        group = to_groups[filename]
        self.url = (
            "http://cf.10xgenomics.com/samples/cell-exp/%s/%s/%s_%s_gene_bc_matrices.tar.gz"
            % (group, filename, filename, type))
        self.save_path = os.path.join(save_path, '10X/%s/' % filename)
        self.save_name = '%s_gene_bc_matrices' % type
        self.download_name = self.save_name + '.tar.gz'
    else:
        local_dir = os.path.join(self.save_path, filename)
        try:
            assert os.path.isdir(local_dir)
        except AssertionError:
            print("The file %s was not found in the location you gave" % filename)
            raise
        self.save_path = local_dir
    expression_data, gene_names = self.download_and_preprocess()
    attributes = GeneExpressionDataset.get_attributes_from_matrix(expression_data)
    super(Dataset10X, self).__init__(*attributes, gene_names=gene_names)
def __init__(self, filename, save_path='data/', type='filtered', dense=False, remote=True, genecol=0):
    """Load a 10X Genomics expression dataset.

    :param filename: 10X sample name; when remote it must be a key of ``to_groups``.
    :param save_path: root directory under which data is stored / downloaded.
    :param type: matrix flavor ('filtered' or 'raw'); NOTE: shadows the builtin,
        kept for caller compatibility.
    :param dense: whether downstream preprocessing should densify the matrix.
    :param remote: download from the 10X servers when True, else read a local folder.
    :param genecol: column of the genes file to use as gene identifier.
    """
    self.remote = remote
    self.save_path = save_path
    self.genecol = genecol
    self.dense = dense
    if remote:
        group = to_groups[filename]
        # each release group has its own URL template
        self.url = group_to_url_skeleton[group].format(group, filename, filename, type)
        self.save_path = os.path.join(save_path, '10X/%s/' % filename)
        self.save_name = '%s_gene_bc_matrices' % type
        self.download_name = self.save_name + '.tar.gz'
    else:
        local_dir = os.path.join(self.save_path, filename)
        try:
            assert os.path.isdir(local_dir)
        except AssertionError:
            print("The file %s was not found in the location you gave" % filename)
            raise
        self.save_path = local_dir
    expression_data, gene_names = self.download_and_preprocess()
    attributes = GeneExpressionDataset.get_attributes_from_matrix(expression_data)
    super().__init__(*attributes, gene_names=gene_names)
def create_dataset(self, path):
    """Build a GeneExpressionDataset from an R SingleCellExperiment .rds file.

    Reads the file through rpy2, converts log2(count + 1) values back to raw
    integer counts, maps the R `cell_type1` factor onto integer labels, and
    drops cells with total count <= 10.

    :param path: filesystem path to the .rds file.
    :return: a ``GeneExpressionDataset`` with ``gene_symbols`` attached.
    """
    print("Reading rds")
    ro.r("sce<-readRDS('%s')" % path)
    print("Extracting log counts")
    log_counts = ro.r("logcounts(sce)")
    print("Transforming log count to counts")
    # Invert log2(x + 1): exp(y * ln 2) - 1 == 2**y - 1.
    # `int` replaces the deprecated `np.int` alias (removed in NumPy 1.24);
    # they are the same type, so behavior is unchanged.
    counts = (np.exp(log_counts * np.log(2)) - 1).T.astype(int)
    gene_symbols = ro.r("rowData(sce)$feature_symbol")
    labels = ro.r("colData(sce)$cell_type1")
    labels_levels = ro.r("levels(colData(sce)$cell_type1)")
    if labels_levels is not rpy2.rinterface.NULL:
        # R factors are 1-based indices into their levels.
        labels = np.array([labels_levels[int(l) - 1] for l in labels])
    cell_types = list(np.unique(labels))
    labels = np.array([cell_types.index(l) for l in labels])
    valid_idx = (counts.sum(axis=1) > 10).ravel()  # Filter bad quality cells
    counts = counts[valid_idx]
    labels = labels[valid_idx]
    gene_expression_dataset = GeneExpressionDataset(
        *GeneExpressionDataset.get_attributes_from_matrix(counts, labels=labels),
        cell_types=cell_types)
    gene_expression_dataset.gene_symbols = gene_symbols
    return gene_expression_dataset
def __init__(self, n_proteins=7):
    """FACS protein dataset loader.

    :param n_proteins: size of the protein panel; must be 2, 5, or 7.
    """
    assert n_proteins in (2, 5, 7), "Only support: 2, 5 or 7 protein FACS dataset"
    self.n_proteins = int(n_proteins)
    matrix = self.download_and_preprocess()
    attributes = GeneExpressionDataset.get_attributes_from_matrix(matrix)
    super().__init__(*attributes)
def training_score_scvi(train, **kwargs):
    """Fit scVI on ``train`` and score the fit on the training data itself.

    Returns the Poisson log-likelihood of ``train`` under the imputed rates.
    """
    from scvi.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE
    dataset = GeneExpressionDataset(
        *GeneExpressionDataset.get_attributes_from_matrix(train))
    model = VAE(n_input=train.shape[1])
    trainer = UnsupervisedTrainer(model, dataset, verbose=False)
    trainer.train(n_epochs=100)
    # Training permuted the data for minibatching. Unpermute (iterate
    # sequentially) before "imputing", i.e. estimating lambda.
    halves = [
        trainer.train_set.sequential().imputation(),
        trainer.test_set.sequential().imputation(),
    ]
    lam = np.vstack(halves)
    return st.poisson(mu=lam).logpmf(train).sum()
def generalization_score_scvi(train, test, **kwargs):
    """Fit scVI on ``train`` and score generalization against ``test``.

    Returns ``pois_llik`` evaluated on the imputed rates.
    """
    from scvi.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE
    dataset = GeneExpressionDataset(
        *GeneExpressionDataset.get_attributes_from_matrix(train))
    model = VAE(n_input=train.shape[1])
    trainer = UnsupervisedTrainer(model, dataset, verbose=False)
    trainer.train(n_epochs=100)
    # Training permuted the data for minibatching. Unpermute (iterate
    # sequentially) before "imputing", i.e. estimating lambda. No gradients
    # are needed at evaluation time.
    with torch.no_grad():
        halves = [
            trainer.train_set.sequential().imputation(),
            trainer.test_set.sequential().imputation(),
        ]
        lam = np.vstack(halves)
    return pois_llik(lam, train, test)
def assign_label(cellid, geneid, labels_map, count, cell_type, seurat):
    """Build a GeneExpressionDataset from a Seurat annotation table.

    Column 4 of ``seurat`` (header row skipped) carries integer cluster ids,
    remapped through ``labels_map``; column 5 carries cell barcodes used to
    reorder the count rows.
    """
    raw_labels = np.int64(np.asarray(seurat[1:, 4]))
    remapped = deepcopy(raw_labels)
    for src, dst in enumerate(labels_map):
        remapped[raw_labels == src] = dst
    # Reorder count rows to match the Seurat barcode order.
    counts_by_cell = dict(zip(cellid, count))
    reordered = [counts_by_cell[barcode] for barcode in seurat[1:, 5]]
    matrix = sparse.vstack(reordered)
    return GeneExpressionDataset(
        *GeneExpressionDataset.get_attributes_from_matrix(matrix, labels=remapped),
        gene_names=geneid, cell_types=cell_type)
def __init__(self, filename, save_path='data/', type='filtered', dense=False):
    """Download-and-load a 10X Genomics expression dataset.

    :param filename: 10X sample name; must be a key of ``to_groups``.
    :param save_path: root directory under which data is downloaded.
    :param type: matrix flavor ('filtered' or 'raw'); NOTE: shadows the builtin,
        kept for caller compatibility.
    :param dense: whether downstream preprocessing should densify the matrix.
    """
    group = to_groups[filename]
    self.url = (
        "http://cf.10xgenomics.com/samples/cell-exp/%s/%s/%s_%s_gene_bc_matrices.tar.gz"
        % (group, filename, filename, type))
    # os.path.join instead of string concatenation: 'data' + '10X/...'
    # would silently become 'data10X/...' when save_path lacks a trailing
    # separator (also matches the sibling __init__ variants).
    self.save_path = os.path.join(save_path, '10X/%s/' % filename)
    self.save_name = '%s_gene_bc_matrices' % type
    self.dense = dense
    self.download_name = self.save_name + '.tar.gz'
    expression_data, gene_names = self.download_and_preprocess()
    super(Dataset10X, self).__init__(
        *GeneExpressionDataset.get_attributes_from_matrix(expression_data),
        gene_names=gene_names)
sep=",", index_col=0)["pop"].values batch_array = pd.read_csv(os.path.join(save_path, "DE.batchid.csv"), sep=",", index_col=0)["x"].values batch_array -= 1 batch_array = batch_array[:, np.newaxis] count_matrix = pd.read_csv(os.path.join(save_path, "DE.obsv.2.csv"), sep=",", index_col=0).T gene_names = np.array(count_matrix.columns, dtype=str) dataset1 = GeneExpressionDataset( *GeneExpressionDataset.get_attributes_from_matrix( count_matrix.values, labels=label_array, batch_indices=batch_array), gene_names=gene_names, cell_types=np.unique(label_array)) dataset1.update_cells(batch_array.ravel() == 0) count_matrix = pd.read_csv(os.path.join(save_path, "DE.obsv.4.csv"), sep=",", index_col=0).T dataset2 = GeneExpressionDataset( *GeneExpressionDataset.get_attributes_from_matrix( count_matrix.values, labels=label_array, batch_indices=batch_array), gene_names=gene_names, cell_types=np.unique(label_array))
def __init__(self, filename, save_path='/data/scanorama/'):
    """Load a SCANORAMA benchmark dataset from disk.

    :param filename: dataset name relative to ``save_path``.
    :param save_path: directory holding the SCANORAMA data.
    """
    # os.path.join instead of bare concatenation so a save_path without a
    # trailing separator still produces a valid path.
    self.save_path = os.path.join(save_path, filename)
    count, gene_names = self.preprocess()
    # Gene names are upper-cased to ease matching across datasets.
    super(DatasetSCANORAMA, self).__init__(
        *GeneExpressionDataset.get_attributes_from_matrix(count),
        gene_names=np.char.upper(gene_names))
def __init__(self):
    """Fetch/preprocess the expression matrix and initialize the parent dataset."""
    counts = self.download_and_preprocess()
    attributes = GeneExpressionDataset.get_attributes_from_matrix(counts)
    super().__init__(*attributes)
use_labels=False use_cuda=False reconstruction_loss="nb" rawcounts = feather.read_dataframe(input_rawcounts) meta = feather.read_dataframe(input_meta) meta.index = meta.loc[:,"cell_name"].values.astype(str) var = feather.read_dataframe(input_var) var.index = var.loc[:,"symbol"].values.astype(str) annobj = anndata.AnnData(X=rawcounts) annobj.obs = meta annobj.var = var X, local_mean, local_var, batch_indices, labels = GeneExpressionDataset.get_attributes_from_matrix(annobj.X) geneExp = GeneExpressionDataset(X, local_mean, local_var, batch_indices, labels, gene_names=annobj.var.index) if bool(batch_id) is not False: use_batches=True plates, plates_ids = pd.factorize(annobj.obs[batch_id]) geneExp.batch_indices = plates.reshape(-1, 1) geneExp.n_batches = np.unique(plates.reshape(-1, 1)).size else: use_batches = False ldvae = LDVAE(geneExp.nb_genes, n_batch=geneExp.n_batches * use_batches, n_latent=latent, n_layers=layer,
def imputation(infer, name, rate=0.1, n_samples=1, n_epochs=1, corruption="uniform"):
    """Benchmark imputation: corrupt a fraction of entries, retrain, and compare.

    A copy of the dataset is corrupted (a fraction ``rate`` of entries is
    down-weighted), the model is trained for ``n_epochs`` on the corrupted
    data through parallel "corrupted_*" data loaders, and the sample rate is
    then evaluated at the corrupted positions.

    :param infer: trainer-like object owning ``gene_dataset``, ``data_loaders``
        and ``model`` (project type — structure inferred from usage here).
    :param name: key of the data loader split to evaluate (e.g. train/test).
    :param rate: fraction of (selected) entries to corrupt.
    :param n_samples: number of posterior samples of the rate per cell.
    :param n_epochs: retraining epochs on the corrupted data.
    :param corruption: "uniform" or "binomial" corruption scheme.
    :return: pair ``(original_list, imputed_list)`` — per-cell arrays of the
        true values and the model's rates at the corrupted positions.
    """
    # Work on a deep copy so the caller's dataset matrix is never mutated.
    corrupted_data = copy.deepcopy(infer.gene_dataset.X)
    if corruption == "uniform":
        # Multiply each selected nonzero entry by a Ber(0.9) draw
        # (i.e. zero it out with probability 0.1).
        i, j = np.nonzero(corrupted_data)
        ix = np.random.choice(range(len(i)), int(np.floor(rate * len(i))), replace=False)
        i, j = i[ix], j[ix]
        corrupted_data[i, j] *= np.random.binomial(n=np.ones(len(ix), dtype=np.int64), p=0.9)
    elif corruption == "binomial":
        # Replace each selected entry n by a Bin(n, 0.2) draw.
        # NOTE(review): an earlier comment said Bin(n, 0.9); the code uses p=0.2.
        i, j = (k.ravel() for k in np.indices(corrupted_data.shape))
        ix = np.random.choice(range(len(i)), int(np.floor(rate * len(i))), replace=False)
        i, j = i[ix], j[ix]
        corrupted_data[i, j] = np.random.binomial(
            n=corrupted_data[i, j].astype(np.int64), p=0.2)
    # Swap in the corrupted dataset (also kept in a local for loader wiring).
    infer.gene_dataset = gene_dataset = GeneExpressionDataset(
        *GeneExpressionDataset.get_attributes_from_matrix(
            corrupted_data, batch_indices=infer.gene_dataset.batch_indices,
            labels=infer.gene_dataset.labels))
    # Temporarily point the training loop at 'corrupted_*' loaders, cloning
    # each existing loader's sampler so iteration order is preserved.
    original_data_loaders_loop = infer.data_loaders.loop
    infer.data_loaders.loop = [
        'corrupted_%s' % s for s in infer.data_loaders.loop
    ]
    original_keys = list(infer.data_loaders.dict.keys())
    for key in original_keys:
        kwargs = copy.copy(infer.data_loaders.kwargs)
        kwargs['collate_fn'] = gene_dataset.collate_fn
        kwargs['sampler'] = copy.copy(infer.data_loaders[key].sampler)
        infer.data_loaders['corrupted_%s' % key] = DataLoaderWrapper(
            gene_dataset, use_cuda=infer.use_cuda, **kwargs)
    infer.train(n_epochs=n_epochs)
    infer.data_loaders.loop = original_data_loaders_loop
    original_list = []
    imputed_list = []
    # Shrink the batch so n_samples copies of it still fit the configured size.
    batch_size = infer.data_loaders.kwargs["batch_size"] // n_samples
    # Sequential iteration keeps the clean and corrupted loaders aligned
    # row-for-row.
    for tensors, corrupted_tensors in \
            zip(infer.data_loaders[name].sequential(batch_size=batch_size),
                infer.data_loaders['corrupted_%s' % name].sequential(batch_size=batch_size)):
        batch = tensors[0]
        actual_batch_size = batch.size(0)
        dropout_batch, _, _, batch_index, labels = corrupted_tensors
        px_rate = infer.model.get_sample_rate(
            dropout_batch, batch_index=batch_index, y=labels, n_samples=n_samples)
        # Positions where corruption actually changed the value.
        indices_dropout = torch.nonzero(batch - dropout_batch)
        i = indices_dropout[:, 0]
        j = indices_dropout[:, 1]
        # Broadcast the clean batch across the n_samples dimension of px_rate.
        batch = batch.unsqueeze(0).expand(
            (n_samples, batch.size(0), batch.size(1)))
        original = np.array(batch[:, i, j].view(-1).cpu())
        imputed = np.array(px_rate[:, i, j].view(-1).cpu())
        # Repeat the cell indices per sample so values can be grouped by cell.
        cells_index = np.tile(np.array(i.cpu()), n_samples)
        original_list += [
            original[cells_index == i] for i in range(actual_batch_size)
        ]
        imputed_list += [
            imputed[cells_index == i] for i in range(actual_batch_size)
        ]
    return original_list, imputed_list