def fit_transform(self, data):
    """Train a scScope model on ``data`` and return its latent embedding.

    The trained model is kept on ``self.model`` for later reuse; only the
    embedding (first element of ``scscope.predict``) is returned.
    """
    # Hyper-parameters are fixed here rather than exposed; ``self.k`` sets
    # the latent dimensionality.
    trained_model = scscope.train(
        data,
        self.k,
        use_mask=True,
        batch_size=128,
        max_epoch=500,
        epoch_per_check=10,
        T=2,
        exp_batch_idx_input=[],
        encoder_layers=[1000, 500, 200, 500],
        decoder_layers=[200, 500, 1000],
        learning_rate=0.001,
        beta1=0.5,
        num_gpus=1,
    )
    self.model = trained_model
    embedding, _imputed, _batch = scscope.predict(
        data, self.model, batch_effect=[])
    return embedding
def main(cmd_args):
    """Run the DeepImpute/scScope pipeline end to end.

    Loads and normalizes the dataset, optionally cleans it and subsets
    genes, trains the model, and writes the elapsed wall-clock time and
    the latent embedding to the hybrid-path output.
    """
    dataset = cb.data.ExprDataSet.read_dataset(
        cmd_args.input, sparsify=True
    ).normalize()
    # Optional dataset cleaning and gene subsetting, driven by CLI flags.
    if cmd_args.clean is not None:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    if cmd_args.genes is not None:
        dataset = dataset[:, dataset.uns[cmd_args.genes]]
    # Densify the log1p-transformed expression matrix for training.
    expr_matrix = dataset.exprs.log1p().toarray()

    start_time = time.time()
    model = DeepImpute.train(
        expr_matrix,
        cmd_args.n_latent,
        max_epoch=cmd_args.n_epochs,
        random_seed=cmd_args.seed,
    )
    latent, _imputed_val, _batch_effect = DeepImpute.predict(expr_matrix, model)
    elapsed = time.time() - start_time

    cb.data.write_hybrid_path(elapsed, "//".join([cmd_args.output, "time"]))
    cb.data.write_hybrid_path(latent, "//".join([cmd_args.output, "latent"]))
def RUN_MAIN():
    """End-to-end scScope demo on simulated dropout data.

    Loads counts and ground-truth group assignments, normalizes per cell
    with scanpy, learns a latent representation with scScope, clusters it,
    reports the adjusted Rand index against the ground truth, and shows a
    t-SNE visualization of the clusters.
    """
    # 1. Load gene expression matrix of simulated data
    # gene expression with simulated dropouts
    counts_drop = pd.read_csv('counts_1.csv', header=0, index_col=0)
    # ground truth subpopulation assignment; "GroupN" -> integer N
    cellinfo = pd.read_csv('cellinfo_1.csv', header=0, index_col=0)
    label_ground_truth = [int(g.split('Group')[1]) for g in cellinfo.Group]

    # 2. Normalize gene expression based on scanpy
    # (normalize each cell to have the same library size)
    gene_expression = sc.AnnData(counts_drop.values)  # matrix of cells x genes
    sc.pp.normalize_per_cell(gene_expression)
    # update datastructure to use normalized data
    gene_expression = gene_expression.X

    latent_dim = 50

    # 3. scScope learning — bigger batches / fewer epochs for large datasets
    if gene_expression.shape[0] >= 100000:
        DI_model = DeepImpute.train(
            gene_expression, latent_dim, T=2,
            batch_size=512, max_epoch=10, num_gpus=4)
    else:
        DI_model = DeepImpute.train(
            gene_expression, latent_dim, T=2,
            batch_size=64, max_epoch=300, num_gpus=4)

    # 4. latent representations and imputed expressions
    latent_code, imputed_val, _ = DeepImpute.predict(gene_expression, DI_model)

    # 5. graph clustering (scalable variant for very large datasets)
    if latent_code.shape[0] <= 10000:
        label, _, _ = phenograph.cluster(latent_code)
    else:
        label = DeepImpute.scalable_cluster(latent_code)
    label = np.asarray(label)

    # evaluate (ARI is symmetric in its two arguments)
    ARI = adjusted_rand_score(label, label_ground_truth)
    print(ARI)

    X_embedded = TSNE(n_components=2).fit_transform(latent_code)

    # visualization of the subpopulations using tSNE
    # FIX: iterate over the cluster ids actually found instead of a
    # hard-coded range(5) — the old loop silently dropped clusters with
    # id >= 5 and drew empty scatters for ids that did not occur.
    plt.figure()
    for cluster_id in np.unique(label):
        idx = np.nonzero(label == cluster_id)[0]
        plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1])
    plt.show()
# Capture raw per-axis totals before normalization; used below to rescale
# imputed expression values back toward the original scale.
# NOTE(review): ``input_bc_gene_mat`` looks like barcodes x genes, so a
# per-cell library size would be ``.sum(1)``; ``.sum(0)`` sums over cells
# for each gene — confirm the intended axis against the caller.
raw_library_size = input_bc_gene_mat.sum(0)
filt_adata = sc.AnnData(input_bc_gene_mat)
# 2. Normalize gene expression based on scanpy (normalize each cell to have same library size)
# normalize each cell to have same count number
sc.pp.normalize_per_cell(filt_adata)
# update datastructure to use normalized data
gene_expression_norm = filt_adata.X
latent_dim = 50
# 3. scScope learning; GPU count and epoch budget come from FLAGS
DI_model = DeepImpute.train(gene_expression_norm, latent_dim,
                            num_gpus=FLAGS["num_gpus"],
                            max_epoch=FLAGS["epoch"])
# 4. latent representations and imputed expressions
latent_code, imputed_val, predicted_batch_effect = DeepImpute.predict(
    gene_expression_norm, DI_model)
# Persist the per-cell latent features to HDF5; cell ids are stored as
# variable-length strings, features chunked one row (cell) at a time.
with h5py.File("{}/{}".format(output_dir, output_feature_h5), "w") as f:
    f["cell_id"] = cell_id.astype(h5py.special_dtype(vlen=str))
    ff_dset_feature = f.create_dataset(
        "feature",
        shape=(cell_id.size, latent_code.shape[1]),
        chunks=(1, latent_code.shape[1]),
        dtype=np.float32)
    ff_dset_feature[...] = latent_code
# Rescale imputed values: multiply by the raw totals and divide by each
# cell's normalized total (sum over axis 1, kept 2-D for broadcasting).
imputed_val_resume = imputed_val * raw_library_size / filt_adata.X.sum(
    1, keepdims=True)
# Second HDF5 file: cell ids plus the names of the genes that survived
# ``gene_filter``.
with h5py.File("{}/{}".format(output_dir, output_raw_h5), "w") as f:
    f["cell_id"] = cell_id.astype(h5py.special_dtype(vlen=str))
    f["gene_name"] = gene_name[gene_filter].astype(
        h5py.special_dtype(vlen=str))
import time
import argparse

# Command line: a counts CSV in, a latent-representation text file out.
parser = argparse.ArgumentParser()
parser.add_argument('--counts_file', metavar='counts_file', default=None,
                    required=True, help='counts_file file')
parser.add_argument('--num_pc', metavar='N', type=int, default=None,
                    help='num_pc file')
parser.add_argument('--out', metavar='out', required=True, help='output file')
args = parser.parse_args()

# 1. Load gene expression matrix of simulated data
counts_drop = pd.read_csv(args.counts_file, header=0, index_col=0)

# 2. Normalize gene expression based on scanpy
# (normalize each cell to have the same library size); matrix of cells x genes
start_time = time.time()
adata = sc.AnnData(counts_drop.values)
# normalize each cell to have same count number
sc.pp.normalize_per_cell(adata)
# update datastructure to use normalized data
gene_expression = adata.X

# 3. scScope learning — choose the training budget by dataset size:
# large datasets get bigger batches and fewer epochs.
if gene_expression.shape[0] >= 100000:
    batch_size, max_epoch = 512, 10
else:
    batch_size, max_epoch = 64, 300
DI_model = DeepImpute.train(gene_expression, args.num_pc, T=2,
                            batch_size=batch_size, max_epoch=max_epoch,
                            num_gpus=4)

# 4. latent representations and imputed expressions
latent_code, imputed_val, _ = DeepImpute.predict(gene_expression, DI_model)
elapsed_time = time.time() - start_time

np.savetxt(args.out, latent_code, fmt='%.4f')
# Load the expression CSV and transpose so rows follow scScope's expected
# orientation.
expr_df = pd.read_csv(args.input, index_col=0).transpose()

import scscope as DeepImpute

model = DeepImpute.train(
    expr_df.values,
    args.latent_code_dim,
    use_mask=not args.no_mask,
    batch_size=args.batch_size,
    max_epoch=args.n_epochs,
    # one past the last epoch, so no intermediate checks are triggered
    epoch_per_check=args.n_epochs + 1,
    T=args.T,
    exp_batch_idx_input=[],
    encoder_layers=[],
    decoder_layers=[],
    learning_rate=args.lr,
    beta1=args.beta1,
    num_gpus=1,
)
latent_code, imputed_val, _ = DeepImpute.predict(expr_df.values, model)

# Write results: outputs are transposed back so columns are the original
# row labels of the (transposed) input.
make_sure_dir_exists(args.outputdir)
latent_path = os.path.join(args.outputdir, "latent.csv")
imputed_path = os.path.join(args.outputdir, "imputed_values.csv")
pd.DataFrame(latent_code.T, columns=expr_df.index.values).to_csv(latent_path)
pd.DataFrame(imputed_val.T, columns=expr_df.index.values,
             index=expr_df.columns.values).to_csv(imputed_path)
print("Done!")