def test_anndata_loader():
    x = np.random.randint(low=0, high=100, size=(15, 4))
    batch_ids = np.random.randint(low=0, high=2, size=(15,))
    n_batches = 2
    adata = AnnData(X=x, obs=dict(batch=batch_ids))
    _ = AnnDatasetFromAnnData(adata, batch_label="batch")
    dataset = AnnDatasetFromAnnData(adata, batch_label="batch")
    assert (
        dataset.n_batches == n_batches
    ), "AnnDatasetFromAnnData should not modify the anndata object"

def test_sparse_data(self):
    data = np.random.poisson(0.2, size=(25, 10))

    sparse_mat = sp_sparse.csr_matrix(data)
    ad = anndata.AnnData(sparse_mat)
    AnnDatasetFromAnnData(ad)

    sparse_mat = sp_sparse.csc_matrix(data)
    ad = anndata.AnnData(sparse_mat)
    AnnDatasetFromAnnData(ad)

def train(self,
          adata,
          condition_key,
          cell_type_key,
          n_epochs=300,
          patience=30,
          lr_reducer=20):
    le = LabelEncoder()
    adata.obs['labels'] = le.fit_transform(adata.obs[cell_type_key].values)
    adata.obs['batch_indices'] = le.fit_transform(adata.obs[condition_key].values)

    net_adata = AnnDatasetFromAnnData(adata)

    early_stopping_kwargs = {
        "early_stopping_metric": "elbo",
        "save_best_state_metric": "elbo",
        "patience": patience,
        "threshold": 0,
        "reduce_lr_on_plateau": True,
        "lr_patience": lr_reducer,
        "lr_factor": 0.1,
    }

    self.trainer = UnsupervisedTrainer(
        self.model,
        net_adata,
        train_size=0.8,
        use_cuda=True,
        frequency=1,
        early_stopping_kwargs=early_stopping_kwargs,
    )
    self.trainer.train(n_epochs=n_epochs, lr=0.001)

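# Illustrative usage sketch for the `train` method above (not from the source):
# it assumes a hypothetical wrapper class, here called `Wrapper`, that stores a
# ready scVI model on `self.model`; the column names are also illustrative.
#
#     wrapper = Wrapper(...)  # builds self.model = VAE(...)
#     wrapper.train(adata,
#                   condition_key='condition',
#                   cell_type_key='cell_type',
#                   n_epochs=300)
#     # wrapper.trainer now holds the fitted UnsupervisedTrainer
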
def run(self):
    n_epochs = 100
    n_latent = 10
    n_hidden = 128
    n_layers = 2

    net_data = self.data.copy()
    net_data.X = self.data.layers['counts']
    del net_data.layers['counts']
    net_data.raw = None  # Ensure that the raw counts are not accidentally used

    # Define batch indices
    le = LabelEncoder()
    net_data.obs['batch_indices'] = le.fit_transform(net_data.obs[self.batch].values)

    net_data = AnnDatasetFromAnnData(net_data)

    vae = VAE(net_data.nb_genes,
              reconstruction_loss='nb',
              n_batch=net_data.n_batches,
              n_layers=n_layers,
              n_latent=n_latent,
              n_hidden=n_hidden)
    trainer = UnsupervisedTrainer(vae, net_data, train_size=1, use_cuda=False)
    trainer.train(n_epochs=n_epochs, lr=1e-3)

    full = trainer.create_posterior(trainer.model,
                                    net_data,
                                    indices=np.arange(len(net_data)))
    latent, _, _ = full.sequential().get_latent()

    self.data.obsm['X_emb'] = latent
    self.dump_to_h5ad("scvi")

def predict(self,
            adata,
            cell_type_to_predict,
            condition_key,
            cell_type_key,
            target_condition,
            source_condition,
            n_generated_samples=50):
    cell_type_adata = adata.copy()[adata.obs[cell_type_key] == cell_type_to_predict]
    real_adata = cell_type_adata[cell_type_adata.obs[condition_key] == target_condition]
    ctrl_adata = cell_type_adata[cell_type_adata.obs[condition_key] == source_condition]

    le = LabelEncoder()
    le.fit([source_condition, target_condition])

    real_adata.obs['batch_indices'] = le.transform(real_adata.obs[condition_key].values)
    # Encode the control cells as the *target* condition so the decoder
    # generates counterfactual expression under that condition
    ctrl_adata.obs['batch_indices'] = le.transform([target_condition] * ctrl_adata.shape[0])

    # Keep the AnnData around: the scVI dataset object has no `.obs`
    ctrl_dataset = AnnDatasetFromAnnData(ctrl_adata)
    posterior = self.trainer.create_posterior(self.trainer.model,
                                              ctrl_dataset,
                                              indices=np.arange(len(ctrl_dataset)))
    generated_samples, _ = posterior.sequential().generate(n_generated_samples)
    reconstructed = generated_samples.mean(axis=2)

    reconstructed_adata = sc.AnnData(X=reconstructed)
    reconstructed_adata.obs = ctrl_adata.obs.copy(deep=True)
    reconstructed_adata.obs[condition_key].replace(
        source_condition,
        f'{cell_type_to_predict}_pred_{target_condition}',
        inplace=True)
    reconstructed_adata.var_names = cell_type_adata.var_names

    # Copy so normalization does not mutate a view
    pred_adata = reconstructed_adata[
        reconstructed_adata.obs[condition_key] ==
        f'{cell_type_to_predict}_pred_{target_condition}'].copy()
    sc.pp.normalize_per_cell(pred_adata)
    sc.pp.log1p(pred_adata)
    return pred_adata

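# Hedged call sketch for `predict` above (assumes `train` was run first so
# that `self.trainer` exists; the cell type and condition values are
# illustrative, not from the source):
#
#     pred = wrapper.predict(adata,
#                            cell_type_to_predict='CD4T',
#                            condition_key='condition',
#                            cell_type_key='cell_type',
#                            target_condition='stimulated',
#                            source_condition='control')
#     # pred: normalized, log1p-transformed counterfactual expression
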
def test_data_loader(self):
    data = np.ones((25, 10)) * 100
    paired = np.ones((25, 4)) * np.arange(0, 4)
    pair_names = ["gabou", "achille", "pedro", "oclivio"]
    y = CellMeasurement(name="dev",
                        data=paired,
                        columns_attr_name="dev_names",
                        columns=pair_names)
    dataset = GeneExpressionDataset()
    dataset.populate_from_data(data, Ys=[y])

    ad = dataset.to_anndata()
    dataset_ad = AnnDatasetFromAnnData(
        ad, cell_measurements_col_mappings={"dev": "dev_names"})

    self.assertTrue((paired == dataset_ad.dev).all())
    self.assertTrue((dataset.X == dataset_ad.X).all())
    self.assertTrue((dataset.cell_types == dataset_ad.cell_types).all())

def to_mmd_layer(self, adata, condition_key, cell_type_key):
    le = LabelEncoder()
    adata.obs['labels'] = le.fit_transform(adata.obs[cell_type_key].values)
    adata.obs['batch_indices'] = le.fit_transform(adata.obs[condition_key].values)

    net_adata = AnnDatasetFromAnnData(adata)

    posterior = self.trainer.create_posterior(self.trainer.model,
                                              net_adata,
                                              indices=np.arange(len(net_adata)))
    latent, _, _ = posterior.sequential().get_latent()

    latent_adata = sc.AnnData(X=latent)
    latent_adata.obs = adata.obs.copy(deep=True)
    return latent_adata

def correct_scvi(Xs, genes):
    import torch
    torch.manual_seed(0)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    from scvi.dataset import AnnDatasetFromAnnData
    from scvi.dataset.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE

    all_ann = [AnnDatasetFromAnnData(AnnData(X, var=genes)) for X in Xs]
    all_dataset = GeneExpressionDataset()
    all_dataset.populate_from_datasets(all_ann)

    vae = VAE(all_dataset.nb_genes,
              n_batch=all_dataset.n_batches,
              n_labels=all_dataset.n_labels,
              n_hidden=128,
              n_latent=30,
              n_layers=2,
              dispersion='gene')
    trainer = UnsupervisedTrainer(
        vae,
        all_dataset,
        train_size=1.,
        use_cuda=True,
    )
    n_epochs = 100
    #trainer.train(n_epochs=n_epochs)
    #torch.save(trainer.model.state_dict(),
    #           'data/harmonization.vae.pkl')
    trainer.model.load_state_dict(torch.load('data/harmonization.vae.pkl'))
    trainer.model.eval()

    full = trainer.create_posterior(trainer.model,
                                    all_dataset,
                                    indices=np.arange(len(all_dataset)))
    latent, batch_indices, labels = full.sequential().get_latent()
    return latent

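# Hedged usage sketch for `correct_scvi` above: `Xs` is a list of count
# matrices over the same genes and `genes` a matching var DataFrame; the
# pretrained weights file 'data/harmonization.vae.pkl' referenced inside the
# function must already exist on disk.
#
#     latent = correct_scvi([X_batch1, X_batch2], genes)
#     # latent: (total cells) x 30 embedding with batch effects harmonized
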
def test_use_raw_flag(self):
    raw_data = np.random.randint(1, 5, size=(4, 7))
    ad = anndata.AnnData(raw_data)
    ad.raw = ad.copy()
    dataset = AnnDatasetFromAnnData(ad, use_raw=True)
    np.testing.assert_array_equal(dataset.X, raw_data)

def test_train_one(self):
    data = np.random.randint(1, 5, size=(4, 7))
    ad = anndata.AnnData(data)
    dataset = AnnDatasetFromAnnData(ad)
    unsupervised_training_one_epoch(dataset)

def test_init(self):
    data = np.random.randint(1, 5, size=(3, 7))
    ad = anndata.AnnData(data)
    dataset = AnnDatasetFromAnnData(ad)
    self.assertEqual(3, dataset.nb_cells)
    self.assertEqual(7, dataset.nb_genes)

adatas = []
for b in np.unique(anndataset_111.obs["batch_indices"]):
    adatas.append(anndataset_111[anndataset_111.obs["batch_indices"] == b, :].copy())
    adatas[-1].obs["batch_indices"] *= 0
for b in np.unique(anndataset_206.obs["batch_indices"]):
    adatas.append(anndataset_206[anndataset_206.obs["batch_indices"] == b, :].copy())
    adatas[-1].obs["batch_indices"] *= 0

names = ["111_d1", "111_d2", "206_d1", "206_d2"]

# Iterate over datasets
for n, adata in zip(names, adatas):
    hvg = adata.var["hvg_encode"]
    dataset = AnnDatasetFromAnnData(ad=adata[:, hvg])
    protein_data = CellMeasurement(
        name="protein_expression",
        data=adata.obsm["protein_expression"].astype(np.float32),
        columns_attr_name="protein_names",
        columns=adata.uns["protein_names"],
    )
    dataset.initialize_cell_measurement(protein_data)
    dataset.gene_names = adata[:, hvg].var_names.values

    set_seed(0)
    model = TOTALVI(dataset.nb_genes,
                    dataset.protein_expression.shape[1],
                    n_latent=20)
    use_cuda = True
    lr = 4e-3
    early_stopping_kwargs = {

def main():
    usage = 'solo'
    parser = ArgumentParser(usage, formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument(dest='model_json_file',
                        help='json file to pass VAE parameters')
    parser.add_argument(dest='data_path',
                        help='path to h5ad, loom or 10x directory containing '
                             'cell by genes counts')
    parser.add_argument('-d', dest='doublet_depth',
                        default=2., type=float,
                        help='Depth multiplier for a doublet relative to the '
                             'average of its constituents')
    parser.add_argument('-g', dest='gpu',
                        default=True, action='store_true',
                        help='Run on GPU')
    parser.add_argument('-a', dest='anndata_output',
                        default=False, action='store_true',
                        help='output modified anndata object with solo scores. '
                             'Only works for anndata')
    parser.add_argument('-o', dest='out_dir', default='solo_out')
    parser.add_argument('-r', dest='doublet_ratio',
                        default=2., type=float,
                        help='Ratio of doublets to true cells')
    parser.add_argument('-s', dest='seed',
                        default=None,
                        help='Path to previous solo output directory. '
                             'Seed VAE models with previously trained solo model. '
                             'Directory structure is assumed to be the same as solo '
                             'output directory structure. Should at least have a '
                             'vae.pt, a pickled object of vae weights, and a '
                             'latent.npy, an np.ndarray of the latents of your cells.')
    parser.add_argument('-k', dest='known_doublets',
                        type=str,
                        help='Experimentally defined doublets tsv file. '
                             'Should be a single column of True/False. '
                             'True indicates the cell is a doublet. No header.')
    parser.add_argument('-t', dest='doublet_type',
                        default='multinomial',
                        choices=['multinomial', 'average', 'sum'],
                        help='Please enter multinomial, average, or sum')
    parser.add_argument('-e', dest='expected_number_of_doublets',
                        type=int, default=None,
                        help='Experimentally expected number of doublets')
    parser.add_argument('-p', dest='plot',
                        default=False, action='store_true',
                        help='Plot outputs for solo')
    parser.add_argument('-l', dest='normal_logging',
                        default=False, action='store_true',
                        help='Logging level set to normal (aka not debug)')
    parser.add_argument('--random_size', dest='randomize_doublet_size',
                        default=False, action='store_true',
                        help='Sample depth multipliers from Unif(1, DoubletDepth) '
                             'to provide a diversity of possible doublet depths.')
    args = parser.parse_args()

    if not args.normal_logging:
        scvi._settings.set_verbosity(10)

    model_json_file = args.model_json_file
    data_path = args.data_path
    if args.gpu and not torch.cuda.is_available():
        args.gpu = torch.cuda.is_available()
        print('Cuda is not available, switching to cpu running!')

    if not os.path.isdir(args.out_dir):
        os.mkdir(args.out_dir)

    ##################################################
    # data

    # read loom/anndata
    data_ext = os.path.splitext(data_path)[-1]
    if data_ext == '.loom':
        scvi_data = LoomDataset(data_path)
    elif data_ext == '.h5ad':
        adata = anndata.read(data_path)
        if issparse(adata.X):
            adata.X = adata.X.todense()
        scvi_data = AnnDatasetFromAnnData(adata)
    elif os.path.isdir(data_path):
        scvi_data = Dataset10X(save_path=data_path,
                               measurement_names_column=1,
                               dense=True)
        cell_umi_depth = scvi_data.X.sum(axis=1)
        fifth, ninetyfifth = np.percentile(cell_umi_depth, [5, 95])
        min_cell_umi_depth = np.min(cell_umi_depth)
        max_cell_umi_depth = np.max(cell_umi_depth)
        if fifth * 10 < ninetyfifth:
            print("""WARNING YOUR DATA HAS A WIDE RANGE OF CELL DEPTHS.
PLEASE MANUALLY REVIEW YOUR DATA""")
            print(f"Min cell depth: {min_cell_umi_depth}, "
                  f"Max cell depth: {max_cell_umi_depth}")
    else:
        msg = f'{data_path} is not a recognized format.\n'
        msg += 'must be one of {h5ad, loom, 10x directory}'
        raise TypeError(msg)

    num_cells, num_genes = scvi_data.X.shape

    if args.known_doublets is not None:
        print('Removing known doublets for in silico doublet generation')
        print('Make sure known doublets are in the same order as your data')
        known_doublets = np.loadtxt(args.known_doublets, dtype=str) == 'True'
        assert len(known_doublets) == scvi_data.X.shape[0]
        known_doublet_data = make_gene_expression_dataset(
            scvi_data.X[known_doublets], scvi_data.gene_names)
        known_doublet_data.labels = np.ones(known_doublet_data.X.shape[0])
        singlet_scvi_data = make_gene_expression_dataset(
            scvi_data.X[~known_doublets], scvi_data.gene_names)
        singlet_num_cells, _ = singlet_scvi_data.X.shape
    else:
        known_doublet_data = None
        singlet_num_cells = num_cells
        known_doublets = np.zeros(num_cells, dtype=bool)
        singlet_scvi_data = scvi_data
    singlet_scvi_data.labels = np.zeros(singlet_scvi_data.X.shape[0])
    scvi_data.labels = known_doublets.astype(int)

    ##################################################
    # parameters

    # check for parameters
    if not os.path.exists(model_json_file):
        raise FileNotFoundError(f'{model_json_file} does not exist.')
    # read parameters
    with open(model_json_file, 'r') as model_json_open:
        params = json.load(model_json_open)

    # set VAE params
    vae_params = {}
    for par in ['n_hidden', 'n_latent', 'n_layers',
                'dropout_rate', 'ignore_batch']:
        if par in params:
            vae_params[par] = params[par]
    vae_params['n_batch'] = 0 if params.get('ignore_batch', False) \
        else scvi_data.n_batches

    # training parameters
    batch_size = params.get('batch_size', 128)
    valid_pct = params.get('valid_pct', 0.1)
    learning_rate = params.get('learning_rate', 1e-3)
    stopping_params = {'patience': params.get('patience', 10), 'threshold': 0}

    # protect against single example batch
    while num_cells % batch_size == 1:
        batch_size = int(np.round(1.25 * batch_size))
        print('Increasing batch_size to %d to avoid single example batch.'
              % batch_size)

    ##################################################
    # VAE

    vae = VAE(n_input=singlet_scvi_data.nb_genes,
              n_labels=2,
              reconstruction_loss='nb',
              log_variational=True,
              **vae_params)

    if args.seed:
        if args.gpu:
            device = torch.device('cuda')
            vae.load_state_dict(torch.load(os.path.join(args.seed, 'vae.pt')))
            vae.to(device)
        else:
            map_loc = 'cpu'
            vae.load_state_dict(torch.load(os.path.join(args.seed, 'vae.pt'),
                                           map_location=map_loc))

        # save latent representation
        utrainer = \
            UnsupervisedTrainer(vae, singlet_scvi_data,
                                train_size=(1. - valid_pct),
                                frequency=2,
                                metrics_to_monitor=['reconstruction_error'],
                                use_cuda=args.gpu,
                                early_stopping_kwargs=stopping_params,
                                batch_size=batch_size)

        full_posterior = utrainer.create_posterior(
            utrainer.model,
            singlet_scvi_data,
            indices=np.arange(len(singlet_scvi_data)))
        latent, _, _ = full_posterior.sequential(batch_size).get_latent()
        np.save(os.path.join(args.out_dir, 'latent.npy'),
                latent.astype('float32'))

    else:
        stopping_params['early_stopping_metric'] = 'reconstruction_error'
        stopping_params['save_best_state_metric'] = 'reconstruction_error'

        # initialize unsupervised trainer
        utrainer = \
            UnsupervisedTrainer(vae, singlet_scvi_data,
                                train_size=(1. - valid_pct),
                                frequency=2,
                                metrics_to_monitor=['reconstruction_error'],
                                use_cuda=args.gpu,
                                early_stopping_kwargs=stopping_params,
                                batch_size=batch_size)
        utrainer.history['reconstruction_error_test_set'].append(0)
        # initial epoch
        utrainer.train(n_epochs=2000, lr=learning_rate)

        # drop learning rate and continue
        utrainer.early_stopping.wait = 0
        utrainer.train(n_epochs=500, lr=0.5 * learning_rate)

        # save VAE
        torch.save(vae.state_dict(), os.path.join(args.out_dir, 'vae.pt'))

        # save latent representation
        full_posterior = utrainer.create_posterior(
            utrainer.model,
            singlet_scvi_data,
            indices=np.arange(len(singlet_scvi_data)))
        latent, _, _ = full_posterior.sequential(batch_size).get_latent()
        np.save(os.path.join(args.out_dir, 'latent.npy'),
                latent.astype('float32'))

    ##################################################
    # simulate doublets

    non_zero_indexes = np.where(singlet_scvi_data.X > 0)
    cells = non_zero_indexes[0]
    genes = non_zero_indexes[1]
    cells_ids = defaultdict(list)
    for cell_id, gene in zip(cells, genes):
        cells_ids[cell_id].append(gene)

    # choose doublets function type
    if args.doublet_type == 'average':
        doublet_function = create_average_doublet
    elif args.doublet_type == 'sum':
        doublet_function = create_summed_doublet
    else:
        doublet_function = create_multinomial_doublet

    cell_depths = singlet_scvi_data.X.sum(axis=1)
    num_doublets = int(args.doublet_ratio * singlet_num_cells)
    if known_doublet_data is not None:
        num_doublets -= known_doublet_data.X.shape[0]
        # make sure we are making a non negative amount of doublets
        assert num_doublets >= 0

    in_silico_doublets = np.zeros((num_doublets, num_genes), dtype='float32')
    # for desired # doublets
    for di in range(num_doublets):
        # sample two cells
        i, j = np.random.choice(singlet_num_cells, size=2)

        # generate doublets
        in_silico_doublets[di, :] = \
            doublet_function(singlet_scvi_data.X, i, j,
                             doublet_depth=args.doublet_depth,
                             cell_depths=cell_depths, cells_ids=cells_ids,
                             randomize_doublet_size=args.randomize_doublet_size)

    # merge datasets
    # we can maybe up sample the known doublets
    # concatenate
    classifier_data = GeneExpressionDataset()
    classifier_data.populate_from_data(
        X=np.vstack([scvi_data.X, in_silico_doublets]),
        labels=np.hstack([np.ravel(scvi_data.labels),
                          np.ones(in_silico_doublets.shape[0])]),
        remap_attributes=False)

    assert len(np.unique(classifier_data.labels.flatten())) == 2

    ##################################################
    # classifier

    # model
    classifier = Classifier(n_input=(vae.n_latent + 1),
                            n_hidden=params['cl_hidden'],
                            n_layers=params['cl_layers'], n_labels=2,
                            dropout_rate=params['dropout_rate'])

    # trainer
    stopping_params['early_stopping_metric'] = 'accuracy'
    stopping_params['save_best_state_metric'] = 'accuracy'
    strainer = ClassifierTrainer(classifier, classifier_data,
                                 train_size=(1. - valid_pct),
                                 frequency=2,
                                 metrics_to_monitor=['accuracy'],
                                 use_cuda=args.gpu,
                                 sampling_model=vae, sampling_zl=True,
                                 early_stopping_kwargs=stopping_params,
                                 batch_size=batch_size)

    # initial
    strainer.train(n_epochs=1000, lr=learning_rate)

    # drop learning rate and continue
    strainer.early_stopping.wait = 0
    strainer.train(n_epochs=300, lr=0.1 * learning_rate)
    torch.save(classifier.state_dict(),
               os.path.join(args.out_dir, 'classifier.pt'))

    ##################################################
    # post-processing
    # use logits for predictions for better results
    logits_classifier = Classifier(n_input=(vae.n_latent + 1),
                                   n_hidden=params['cl_hidden'],
                                   n_layers=params['cl_layers'], n_labels=2,
                                   dropout_rate=params['dropout_rate'],
                                   logits=True)
    logits_classifier.load_state_dict(classifier.state_dict())

    # using logits leads to better performance for ranking
    logits_strainer = ClassifierTrainer(logits_classifier, classifier_data,
                                        train_size=(1. - valid_pct),
                                        frequency=2,
                                        metrics_to_monitor=['accuracy'],
                                        use_cuda=args.gpu,
                                        sampling_model=vae, sampling_zl=True,
                                        early_stopping_kwargs=stopping_params,
                                        batch_size=batch_size)

    # models evaluation mode
    vae.eval()
    classifier.eval()
    logits_classifier.eval()
    print('Train accuracy: %.4f' % strainer.train_set.accuracy())
    print('Test accuracy: %.4f' % strainer.test_set.accuracy())

    # compute predictions manually
    # output logits
    train_y, train_score = strainer.train_set.compute_predictions(soft=True)
    test_y, test_score = strainer.test_set.compute_predictions(soft=True)
    # train_y == true label
    # train_score[:, 0] == singlet score; train_score[:, 1] == doublet score
    train_score = train_score[:, 1]
    train_y = train_y.astype('bool')
    test_score = test_score[:, 1]
    test_y = test_y.astype('bool')

    train_auroc = roc_auc_score(train_y, train_score)
    test_auroc = roc_auc_score(test_y, test_score)

    print('Train AUROC: %.4f' % train_auroc)
    print('Test AUROC: %.4f' % test_auroc)

    train_fpr, train_tpr, train_t = roc_curve(train_y, train_score)
    test_fpr, test_tpr, test_t = roc_curve(test_y, test_score)
    train_t = np.minimum(train_t, 1 + 1e-9)
    test_t = np.minimum(test_t, 1 + 1e-9)

    train_acc = np.zeros(len(train_t))
    for i in range(len(train_t)):
        train_acc[i] = np.mean(train_y == (train_score > train_t[i]))
    test_acc = np.zeros(len(test_t))
    for i in range(len(test_t)):
        test_acc[i] = np.mean(test_y == (test_score > test_t[i]))

    # write predictions
    # softmax predictions
    order_y, order_score = strainer.compute_predictions(soft=True)
    _, order_pred = strainer.compute_predictions()
    doublet_score = order_score[:, 1]
    np.save(os.path.join(args.out_dir, 'no_updates_softmax_scores.npy'),
            doublet_score[:num_cells])
    np.save(os.path.join(args.out_dir, 'no_updates_softmax_scores_sim.npy'),
            doublet_score[num_cells:])

    # logit predictions
    logit_y, logit_score = logits_strainer.compute_predictions(soft=True)
    logit_doublet_score = logit_score[:, 1]
    np.save(os.path.join(args.out_dir, 'logit_scores.npy'),
            logit_doublet_score[:num_cells])
    np.save(os.path.join(args.out_dir, 'logit_scores_sim.npy'),
            logit_doublet_score[num_cells:])

    # update threshold as a function of Solo's estimate of the number of
    # doublets, essentially a log odds update
    # TODO put in a function
    diff = np.inf
    counter_update = 0
    solo_scores = doublet_score[:num_cells]
    logit_scores = logit_doublet_score[:num_cells]
    d_s = args.doublet_ratio / (args.doublet_ratio + 1)
    while (diff > .01) | (counter_update < 5):
        # calculate log odds calibration for logits
        d_o = np.mean(solo_scores)
        c = np.log(d_o / (1 - d_o)) - np.log(d_s / (1 - d_s))

        # update solo scores
        solo_scores = 1 / (1 + np.exp(-(logit_scores + c)))

        # update while conditions
        diff = np.abs(d_o - np.mean(solo_scores))
        counter_update += 1

    np.save(os.path.join(args.out_dir, 'softmax_scores.npy'), solo_scores)

    if args.expected_number_of_doublets is not None:
        k = len(solo_scores) - args.expected_number_of_doublets
        if args.expected_number_of_doublets / len(solo_scores) > .5:
            print('''Make sure you actually expect more than half your cells
                     to be doublets. If not, change your -e parameter value''')
        assert k > 0
        idx = np.argpartition(solo_scores, k)
        threshold = np.max(solo_scores[idx[:k]])
        is_solo_doublet = solo_scores > threshold
    else:
        is_solo_doublet = solo_scores > .5

    is_doublet = known_doublets
    new_doublets_idx = np.where(~is_doublet & is_solo_doublet[:num_cells])[0]
    is_doublet[new_doublets_idx] = True

    np.save(os.path.join(args.out_dir, 'is_doublet.npy'),
            is_doublet[:num_cells])
    np.save(os.path.join(args.out_dir, 'is_doublet_sim.npy'),
            is_doublet[num_cells:])

    np.save(os.path.join(args.out_dir, 'preds.npy'), order_pred[:num_cells])
    np.save(os.path.join(args.out_dir, 'preds_sim.npy'), order_pred[num_cells:])

    smoothed_preds = knn_smooth_pred_class(X=latent,
                                           pred_class=is_doublet[:num_cells])
    np.save(os.path.join(args.out_dir, 'smoothed_preds.npy'), smoothed_preds)

    if args.anndata_output and data_ext == '.h5ad':
        adata.obs['is_doublet'] = is_doublet[:num_cells]
        adata.obs['logit_scores'] = logit_doublet_score[:num_cells]
        adata.obs['softmax_scores'] = doublet_score[:num_cells]
        adata.write(os.path.join(args.out_dir, "soloed.h5ad"))

    if args.plot:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        import seaborn as sns

        # plot ROC
        plt.figure()
        plt.plot(train_fpr, train_tpr, label='Train')
        plt.plot(test_fpr, test_tpr, label='Test')
        plt.gca().set_xlabel('False positive rate')
        plt.gca().set_ylabel('True positive rate')
        plt.legend()
        plt.savefig(os.path.join(args.out_dir, 'roc.pdf'))
        plt.close()

        # plot accuracy
        plt.figure()
        plt.plot(train_t, train_acc, label='Train')
        plt.plot(test_t, test_acc, label='Test')
        plt.axvline(0.5, color='black', linestyle='--')
        plt.gca().set_xlabel('Threshold')
        plt.gca().set_ylabel('Accuracy')
        plt.legend()
        plt.savefig(os.path.join(args.out_dir, 'accuracy.pdf'))
        plt.close()

        # plot distributions
        plt.figure()
        sns.distplot(test_score[test_y], label='Simulated')
        sns.distplot(test_score[~test_y], label='Observed')
        plt.legend()
        plt.savefig(os.path.join(args.out_dir, 'train_v_test_dist.pdf'))
        plt.close()

        plt.figure()
        sns.distplot(doublet_score[:num_cells], label='Observed')
        plt.legend()
        plt.savefig(os.path.join(args.out_dir, 'real_cells_dist.pdf'))
        plt.close()

        scvi_umap = umap.UMAP(n_neighbors=16).fit_transform(latent)
        fig, ax = plt.subplots(1, 1, figsize=(10, 10))
        ax.scatter(scvi_umap[:, 0], scvi_umap[:, 1],
                   c=doublet_score[:num_cells], s=8, cmap="GnBu")
        ax.set_xlabel("UMAP 1")
        ax.set_ylabel("UMAP 2")
        ax.set_xticks([], [])
        ax.set_yticks([], [])
        fig.savefig(os.path.join(args.out_dir, 'umap_solo_scores.pdf'))

def load_posterior(dir_path: str,
                   model: nn.Module,
                   use_cuda: Optional[Union[bool, str]] = "auto",
                   **posterior_kwargs):
    """Retrieve a posterior that was saved using the ``save_posterior`` method.

    Because of the way PyTorch loads models, this function needs an scVI model
    object initialized with exactly the same parameters as during training.
    Because saved posteriors correspond to already trained models, data is
    loaded sequentially using a ``SequentialSampler``.

    Parameters
    ----------
    dir_path
        Directory containing the posterior properties to be retrieved.
    model
        scVI initialized model.
    use_cuda
        Specifies if the computations should be performed with a GPU.
        Default: ``"auto"``.
        If ``"auto"``, then CUDA availability is inferred, with a preference
        for loading on GPU.
        If ``False``, the model will be loaded on the CPU, even if it was
        trained using a GPU.
    **posterior_kwargs
        Additional parameters to feed to the posterior constructor.

    Examples
    --------
    >>> model = VAE(nb_genes, n_batches, n_hidden=128, n_latent=10)
    >>> trainer = UnsupervisedTrainer(vae, dataset, train_size=0.5, use_cuda=use_cuda)
    >>> trainer.train(n_epochs=200)
    >>> trainer.train_set.save_posterior("./my_run_train_posterior")
    >>> model = VAE(nb_genes, n_batches, n_hidden=128, n_latent=10)
    >>> post = load_posterior("./my_run_train_posterior", model=model)
    """
    # Avoid circular imports
    from scvi.inference.total_inference import TotalPosterior
    from scvi.inference.jvae_trainer import JPosterior
    from scvi.inference.posterior import Posterior
    from scvi.inference.annotation import AnnotationPosterior

    post_type_path = os.path.join(dir_path, "posterior_type.txt")
    dataset_path = os.path.join(dir_path, "anndata_dataset.h5ad")
    model_path = os.path.join(dir_path, "model_params.pt")
    indices_path = os.path.join(dir_path, "indices.npy")
    data_loader_kwargs_path = os.path.join(dir_path, "data_loader_kwargs.h5")

    # Inferring posterior type
    with open(post_type_path, "r") as post_file:
        post_class_str = post_file.readline()
    str_to_classes = dict(
        TotalPosterior=TotalPosterior,
        JPosterior=JPosterior,
        Posterior=Posterior,
        AnnotationPosterior=AnnotationPosterior,
    )
    if post_class_str not in str_to_classes:
        raise ValueError(
            "Posterior type {} not eligible for loading".format(post_class_str))
    post_class = str_to_classes[post_class_str]

    # Loading dataset and associated measurements
    ad = anndata.read_h5ad(filename=dataset_path)
    key = "cell_measurements_col_mappings"
    if key in ad.uns:
        cell_measurements_col_mappings = ad.uns[key]
    else:
        cell_measurements_col_mappings = dict()

    dataset = AnnDatasetFromAnnData(
        ad=ad, cell_measurements_col_mappings=cell_measurements_col_mappings)

    # Loading scVI model
    if use_cuda == "auto":
        use_cuda = torch.cuda.is_available()
    use_cuda = use_cuda and torch.cuda.is_available()
    if use_cuda:
        model.load_state_dict(torch.load(model_path))
        model.cuda()
    else:
        device = torch.device("cpu")
        model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Loading data loader options and posterior
    indices = np.load(file=indices_path)
    data_loader_kwargs = pd.read_hdf(data_loader_kwargs_path,
                                     key="data_loader").to_dict()
    my_post = post_class(model=model,
                         gene_dataset=dataset,
                         shuffle=False,
                         indices=indices,
                         use_cuda=use_cuda,
                         data_loader_kwargs=data_loader_kwargs,
                         **posterior_kwargs)
    return my_post

def test_protected_X(self):
    data = np.random.poisson(0.2, size=(25, 10))
    ad = anndata.AnnData(data)
    ad.obs["_X"] = np.zeros(25)
    AnnDatasetFromAnnData(ad)

def runScanvi(adata, batch, labels):
    # Use non-normalized (count) data for scANVI!
    # Check for counts data layer
    if 'counts' not in adata.layers:
        raise TypeError(
            'Adata does not contain a `counts` layer in adata.layers["counts"]')

    from scvi.models import VAE, SCANVI
    from scvi.inference import UnsupervisedTrainer, SemiSupervisedTrainer
    from sklearn.preprocessing import LabelEncoder
    from scvi.dataset import AnnDatasetFromAnnData
    import numpy as np

    # STEP 1: prepare the data
    net_adata = adata.copy()
    net_adata.X = adata.layers['counts']
    del net_adata.layers['counts']
    # Ensure that the raw counts are not accidentally used
    del net_adata.raw  # Note that this only works from anndata 0.7

    # Define batch indices
    le = LabelEncoder()
    net_adata.obs['batch_indices'] = le.fit_transform(net_adata.obs[batch].values)
    net_adata.obs['labels'] = le.fit_transform(net_adata.obs[labels].values)

    net_adata = AnnDatasetFromAnnData(net_adata)

    print("scANVI dataset object with {} batches and {} cell types".format(
        net_adata.n_batches, net_adata.n_labels))

    #if hvg is True:
    #    # this also corrects for different batches by default
    #    net_adata.subsample_genes(2000, mode="seurat_v3")

    # Defaults from SCVI github tutorials scanpy_pbmc3k and harmonization
    n_epochs_scVI = np.min([round((20000 / adata.n_obs) * 400), 400])  # 400
    n_epochs_scANVI = int(np.min([10, np.max([2, round(n_epochs_scVI / 3.)])]))
    n_latent = 30
    n_hidden = 128
    n_layers = 2

    # STEP 2: RUN scVI to initialize scANVI
    vae = VAE(
        net_adata.nb_genes,
        reconstruction_loss='nb',
        n_batch=net_adata.n_batches,
        n_latent=n_latent,
        n_hidden=n_hidden,
        n_layers=n_layers,
    )

    trainer = UnsupervisedTrainer(
        vae,
        net_adata,
        train_size=1.0,
        use_cuda=False,
    )
    trainer.train(n_epochs=n_epochs_scVI, lr=1e-3)

    # STEP 3: RUN scANVI
    scanvi = SCANVI(net_adata.nb_genes, net_adata.n_batches, net_adata.n_labels,
                    n_hidden=n_hidden, n_latent=n_latent, n_layers=n_layers,
                    dispersion='gene', reconstruction_loss='nb')
    scanvi.load_state_dict(trainer.model.state_dict(), strict=False)

    # use default parameters from the semi-supervised trainer class
    trainer_scanvi = SemiSupervisedTrainer(scanvi, net_adata)
    # use all cells as the labelled set
    trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(
        trainer_scanvi.model, net_adata, indices=np.arange(len(net_adata)))
    # put one cell in the unlabelled set
    trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=[0])
    trainer_scanvi.train(n_epochs=n_epochs_scANVI)

    # extract info from posterior
    scanvi_full = trainer_scanvi.create_posterior(trainer_scanvi.model,
                                                  net_adata,
                                                  indices=np.arange(len(net_adata)))
    latent, _, _ = scanvi_full.sequential().get_latent()

    adata.obsm['X_emb'] = latent

    return adata

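# Illustrative smoke test for runScanvi above (not from the source): synthetic
# Poisson counts with made-up 'batch'/'cell_type' columns, small enough to run
# quickly on CPU.
if __name__ == '__main__':
    import numpy as np
    import anndata

    counts = np.random.poisson(1.0, size=(60, 30)).astype(np.float32)
    toy = anndata.AnnData(counts.copy())
    toy.layers['counts'] = counts  # runScanvi reads raw counts from this layer
    toy.obs['batch'] = np.random.choice(['a', 'b'], size=60)
    toy.obs['cell_type'] = np.random.choice(['t1', 't2'], size=60)

    toy = runScanvi(toy, batch='batch', labels='cell_type')
    print(toy.obsm['X_emb'].shape)  # (60, 30) scANVI latent embedding
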
def scvi(
    adata: AnnData,
    n_hidden: int = 128,
    n_latent: int = 10,
    n_layers: int = 1,
    dispersion: str = "gene",
    n_epochs: int = 400,
    lr: float = 1e-3,
    train_size: float = 1.0,
    batch_key: Optional[str] = None,
    use_highly_variable_genes: bool = True,
    subset_genes: Optional[Sequence[Union[int, str]]] = None,
    linear_decoder: bool = False,
    copy: bool = False,
    use_cuda: bool = True,
    return_posterior: bool = True,
    trainer_kwargs: dict = {},
    model_kwargs: dict = {},
) -> Optional[AnnData]:
    """\
    SCVI [Lopez18]_.

    Fits an scVI model onto raw count data given an anndata object.

    scVI uses stochastic optimization and deep neural networks to aggregate
    information across similar cells and genes and to approximate the
    distributions that underlie observed expression values, while accounting
    for batch effects and limited sensitivity.

    To use a linear-decoded Variational AutoEncoder model (implementation of
    [Svensson20]_.), set ``linear_decoder = True``. Compared to the standard
    VAE, this model is less powerful, but can be used to inspect which genes
    contribute to variation in the dataset. It may also be used for all scVI
    tasks, like differential expression, batch correction, imputation, etc.
    However, batch correction may be less powerful as it assumes a linear model.

    .. note::
        More information and bug reports `here <https://github.com/YosefLab/scVI>`__.

    Parameters
    ----------
    adata
        An anndata file with `X` attribute of unnormalized count data
    n_hidden
        Number of nodes per hidden layer
    n_latent
        Dimensionality of the latent space
    n_layers
        Number of hidden layers used for encoder and decoder NNs
    dispersion
        One of the following

        * `'gene'` - dispersion parameter of NB is constant per gene across cells
        * `'gene-batch'` - dispersion can differ between different batches
        * `'gene-label'` - dispersion can differ between different labels
        * `'gene-cell'` - dispersion can differ for every gene in every cell
    n_epochs
        Number of epochs to train
    lr
        Learning rate
    train_size
        The train size, either a float between 0 and 1 or an integer for the
        number of training samples to use
    batch_key
        Column name in anndata.obs for batches.
        If None, no batch correction is performed.
        If not None, batch correction is performed per batch category.
    use_highly_variable_genes
        If true, uses only the genes in anndata.var["highly_variable"]
    subset_genes
        Optional list of indices or gene names to subset anndata.
        If not None, use_highly_variable_genes is ignored.
    linear_decoder
        If true, uses the LDVAE model, which is an implementation of [Svensson20]_.
    copy
        If true, a copy of anndata is returned
    return_posterior
        If true, the posterior object is returned
    use_cuda
        If true, uses cuda
    trainer_kwargs
        Extra arguments for UnsupervisedTrainer
    model_kwargs
        Extra arguments for the VAE or LDVAE model

    Returns
    -------
    If `copy` is true, anndata is returned.
    If `return_posterior` is true, the posterior object is returned.
    If both `copy` and `return_posterior` are true, a tuple of anndata and the
    posterior are returned in that order.

    `adata.obsm['X_scvi']` stores the latent representations.
    `adata.obsm['X_scvi_denoised']` stores the normalized mean of the negative binomial.
    `adata.obsm['X_scvi_sample_rate']` stores the mean of the negative binomial.

    If linear_decoder is true:
    `adata.uns['ldvae_loadings']` stores the per-gene weights in the linear
    decoder as a genes by n_latent matrix.
    """
    warnings.warn(
        "scvi via scanpy external API is no longer supported. "
        + "Please use the new scvi-tools package from `scvi-tools.org`",
        FutureWarning,
    )
    try:
        from scvi.models import VAE, LDVAE
        from scvi.inference import UnsupervisedTrainer
        from scvi.dataset import AnnDatasetFromAnnData
    except ImportError:
        raise ImportError(
            "Please install scvi package from https://github.com/YosefLab/scVI")

    # check if observations are unnormalized using first 10
    # code from: https://github.com/theislab/dca/blob/89eee4ed01dd969b3d46e0c815382806fbfc2526/dca/io.py#L63-L69
    if len(adata) > 10:
        X_subset = adata.X[:10]
    else:
        X_subset = adata.X
    norm_error = 'Make sure that the dataset (adata.X) contains unnormalized count data.'
    if sp.sparse.issparse(X_subset):
        assert (X_subset.astype(int) != X_subset).nnz == 0, norm_error
    else:
        assert np.all(X_subset.astype(int) == X_subset), norm_error

    if subset_genes is not None:
        adata_subset = adata[:, subset_genes]
    elif use_highly_variable_genes and "highly_variable" in adata.var:
        adata_subset = adata[:, adata.var["highly_variable"]]
    else:
        adata_subset = adata

    if batch_key is not None:
        codes, uniques = pd.factorize(adata_subset.obs[batch_key])
        adata_subset.obs['_tmp_scvi_batch'] = codes
        n_batches = len(uniques)
    else:
        n_batches = 0

    dataset = AnnDatasetFromAnnData(adata_subset.copy(),
                                    batch_label='_tmp_scvi_batch')

    if linear_decoder:
        vae = LDVAE(
            n_input=dataset.nb_genes,
            n_batch=n_batches,
            n_labels=dataset.n_labels,
            n_hidden=n_hidden,
            n_latent=n_latent,
            n_layers_encoder=n_layers,
            dispersion=dispersion,
            **model_kwargs,
        )
    else:
        vae = VAE(
            dataset.nb_genes,
            n_batch=n_batches,
            n_labels=dataset.n_labels,
            n_hidden=n_hidden,
            n_latent=n_latent,
            n_layers=n_layers,
            dispersion=dispersion,
            **model_kwargs,
        )

    trainer = UnsupervisedTrainer(
        model=vae,
        gene_dataset=dataset,
        use_cuda=use_cuda,
        train_size=train_size,
        **trainer_kwargs,
    )
    trainer.train(n_epochs=n_epochs, lr=lr)

    full = trainer.create_posterior(trainer.model, dataset,
                                    indices=np.arange(len(dataset)))
    latent, batch_indices, labels = full.sequential().get_latent()

    if copy:
        adata = adata.copy()

    adata.obsm['X_scvi'] = latent
    adata.obsm['X_scvi_denoised'] = full.sequential().get_sample_scale()
    adata.obsm['X_scvi_sample_rate'] = full.sequential().imputation()

    if linear_decoder:
        loadings = vae.get_loadings()
        df = pd.DataFrame(loadings, index=adata_subset.var_names)
        adata.uns['ldvae_loadings'] = df

    if copy and return_posterior:
        return adata, full
    elif copy:
        return adata
    elif return_posterior:
        return full

def runScvi(adata, batch, hvg=None):
    # Use non-normalized (count) data for scVI!
    # Expects data only on HVGs
    checkSanity(adata, batch, hvg)

    # Check for counts data layer
    if 'counts' not in adata.layers:
        raise TypeError(
            'Adata does not contain a `counts` layer in adata.layers["counts"]')

    from scvi.models import VAE
    from scvi.inference import UnsupervisedTrainer
    from sklearn.preprocessing import LabelEncoder
    from scvi.dataset import AnnDatasetFromAnnData

    # Defaults from SCVI github tutorials scanpy_pbmc3k and harmonization
    n_epochs = np.min([round((20000 / adata.n_obs) * 400), 400])
    n_latent = 30
    n_hidden = 128
    n_layers = 2

    net_adata = adata.copy()
    net_adata.X = adata.layers['counts']
    del net_adata.layers['counts']
    # Ensure that the raw counts are not accidentally used
    del net_adata.raw  # Note that this only works from anndata 0.7

    # Define batch indices
    le = LabelEncoder()
    net_adata.obs['batch_indices'] = le.fit_transform(net_adata.obs[batch].values)

    net_adata = AnnDatasetFromAnnData(net_adata)

    vae = VAE(
        net_adata.nb_genes,
        reconstruction_loss='nb',
        n_batch=net_adata.n_batches,
        n_layers=n_layers,
        n_latent=n_latent,
        n_hidden=n_hidden,
    )

    trainer = UnsupervisedTrainer(
        vae,
        net_adata,
        train_size=1.0,
        use_cuda=False,
    )
    trainer.train(n_epochs=n_epochs, lr=1e-3)

    full = trainer.create_posterior(trainer.model,
                                    net_adata,
                                    indices=np.arange(len(net_adata)))
    latent, _, _ = full.sequential().get_latent()

    adata.obsm['X_emb'] = latent

    return adata

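# Hedged usage sketch for runScvi above (assumes `checkSanity`, defined
# elsewhere in this module, accepts the input; data and column names are
# illustrative, not from the source):
#
#     counts = np.random.poisson(1.0, size=(60, 30)).astype(np.float32)
#     toy = anndata.AnnData(counts.copy())
#     toy.layers['counts'] = counts
#     toy.obs['batch'] = np.random.choice(['a', 'b'], size=60)
#     toy = runScvi(toy, batch='batch')
#     print(toy.obsm['X_emb'].shape)  # (60, 30)
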
# SCVI
##############################################################
import time
from scvi.dataset import AnnDatasetFromAnnData
from scvi.dataset.dataset import GeneExpressionDataset
from scvi.inference import UnsupervisedTrainer
from scvi.models import SCANVI, VAE
from umap import UMAP
import scanpy as sc

# TODO: import the datasets into SCVI objects (sigh!)
# scVI wants raw counts, but who knows about those TabulaMurisSenis data
# quick and dirty solution for now
asubr_scvi = asubr.copy()
asubr_scvi.X.data = asubr_scvi.X.data.astype(np.int64)
ds_atlas = AnnDatasetFromAnnData(asubr_scvi)

asub2_scvi = asub2.copy()
asub2_scvi.X.data = asub2_scvi.X.data.astype(np.int64)
ds_new = AnnDatasetFromAnnData(asub2_scvi)

all_dataset = GeneExpressionDataset()
all_dataset.populate_from_datasets([ds_atlas, ds_new])

##############################################################
t0 = time.time()
print('Prepare some data structures')
vae = VAE(
    all_dataset.nb_genes,
    n_batch=all_dataset.n_batches,
    n_labels=all_dataset.n_labels,