def compute_diffexp_ttest(self, maskA, maskB, top_n=None, lfc_cutoff=None):
    """Run the generic t-test differential expression on two masks.

    Args:
        maskA: selection for the first group, forwarded unchanged.
        maskB: selection for the second group, forwarded unchanged.
        top_n: value forwarded as the ``top_n`` argument; when ``None`` the
            ``diffexp__top_n`` setting of ``self.dataset_config`` is used.
        lfc_cutoff: value forwarded as the ``lfc_cutoff`` argument; when
            ``None`` the ``diffexp__lfc_cutoff`` setting of
            ``self.dataset_config`` is used.

    Returns:
        Whatever ``diffexp_generic.diffexp_ttest`` returns.
    """
    # The config is only consulted for arguments the caller left unset.
    effective_top_n = (
        self.dataset_config.diffexp__top_n if top_n is None else top_n
    )
    effective_cutoff = (
        self.dataset_config.diffexp__lfc_cutoff
        if lfc_cutoff is None
        else lfc_cutoff
    )
    return diffexp_generic.diffexp_ttest(
        self, maskA, maskB, effective_top_n, effective_cutoff
    )
def test_cxg_generic(self):
    """Exercise the generic diffexp implementation on a cxg-backed adaptor."""
    cxg_adaptor = self.load_dataset(f"{FIXTURES_ROOT}/pbmc3k.cxg")
    first_mask = self.get_mask(cxg_adaptor, 1, 10)
    second_mask = self.get_mask(cxg_adaptor, 2, 10)

    # Call the generic module function directly rather than going through
    # an adaptor method, then validate with the shared checker.
    ttest_output = diffexp_generic.diffexp_ttest(
        cxg_adaptor, first_mask, second_mask, 10
    )
    self.check_1_10_2_10(ttest_output)
def DEG(data):
    """Run a two-group differential expression test and draw a volcano plot.

    The request dict ``data`` is read for the keys ``genes``, ``DEmethod``,
    ``grp``, ``comGrp``, ``figOpt`` (``scale``, ``img``, ``fontsize``,
    ``dpi``), ``logFC`` and ``topN``; depending on the branch taken also
    ``combine``, ``cellN``, ``cells``, ``url_dataroot`` and ``dataset``.

    Side effects on ``data``: ``data['genes']`` is reset to ``[]`` and
    ``data['figOpt']['scale']`` may be set to ``'false'`` (the per-group
    copies made below are shallow, so they share ``figOpt`` with ``data``).

    Args:
        data: request dictionary as described above.

    Returns:
        A JSON string encoding a two-element list: the result table as CSV
        text, and the decoded standard output of the ``volcano.R`` script.

    Raises:
        ValueError: if the default method is requested and either group has
            fewer than 10 selected cells, or if a non-default method is
            requested and no AnnData object could be built.
    """
    adata = None
    # Keep the requested genes for the plot; the data extraction below is
    # done with an empty gene list.
    genes = data['genes']
    data['genes'] = []
    comGrp = 'cellGrp'

    if 'combine' in data:
        if data['DEmethod'] == 'default':
            combUpdate, obs = getObs(data)
            if combUpdate and len(data['grp']) > 1:
                # Build a combined label "<grp0>:<grp1>:..." per cell.
                obs[comGrp] = obs[data['grp'][0]]
                for i in data['grp']:
                    if i != data['grp'][0]:
                        obs[comGrp] += ":" + obs[i]
            # NOTE(review): when no combined column is built here, this
            # relies on getObs() having provided obs['cellGrp'] -- confirm.
            mask = [obs[comGrp].isin([data['comGrp'][i]]) for i in [0, 1]]
        else:
            data['figOpt']['scale'] = 'false'
            adata = createData(data)
            comGrp = data['grp'][0]
            adata = adata[adata.obs[comGrp].isin(data['comGrp'])]
    else:
        mask = [
            pd.Series(range(data['cellN'])).isin(data['cells'][one].values())
            for one in data['comGrp']
        ]
        for one in data['comGrp']:
            # Shallow copy: nested objects such as 'figOpt' stay shared
            # with the caller's dict.
            oneD = data.copy()
            oneD['cells'] = data['cells'][one]
            oneD['genes'] = []
            oneD['grp'] = []
            oneD['figOpt']['scale'] = 'false'
            D = createData(oneD)
            D.obs[comGrp] = one
            if adata is None:
                adata = D
            else:
                adata = adata.concatenate(D)

    if data['DEmethod'] == 'default':
        if mask[0].sum() < 10 or mask[1].sum() < 10:
            raise ValueError('Less than 10 cells in a group!')
        with app.get_data_adaptor(url_dataroot=data['url_dataroot'],
                                  dataset=data['dataset']) as scD:
            res = diffDefault.diffexp_ttest(
                scD, mask[0].to_numpy(), mask[1].to_numpy(), scD.data.shape[0]
            )
            gNames = list(scD.data.var["name_0"])
        deg = pd.DataFrame(res, columns=['gID', 'log2fc', 'pval', 'qval'])
        # Translate the integer gene ids of the t-test result into names.
        gName = pd.Series([gNames[i] for i in deg['gID']], name='gene')
        deg = pd.concat([deg, gName], axis=1).loc[
            :, ['gene', 'log2fc', 'pval', 'qval']
        ]
    else:
        if 'AnnData' not in str(type(adata)):
            raise ValueError('No data extracted by user selection')
        # NOTE(review): astype() returns a new frame and the result is
        # discarded, so adata.obs is left unchanged. Kept as in the
        # original to avoid altering the input of the test below.
        adata.obs.astype('category')
        nm = None
        if data['DEmethod'] == 'wald':
            nm = 'nb'
        res = de.test.two_sample(
            adata, comGrp, test=data['DEmethod'], noise_model=nm
        )
        deg = res.summary()
        deg = deg.sort_values(by=['qval']).loc[
            :, ['gene', 'log2fc', 'pval', 'qval']
        ]
        # Sign is flipped here; presumably to match the group direction
        # used by the default method -- TODO confirm.
        deg['log2fc'] = -1 * deg['log2fc']

    # Plot in R: the table is handed to volcano.R through a temporary CSV.
    strF = '/tmp/DEG%f.csv' % time.time()
    try:
        deg.to_csv(strF, index=False)
        res = subprocess.run(
            [
                strExePath + '/volcano.R',
                strF,
                ';'.join(genes),
                data['figOpt']['img'],
                str(data['figOpt']['fontsize']),
                str(data['figOpt']['dpi']),
                str(data['logFC']),
                data['comGrp'][1],
                data['comGrp'][0],
            ],
            capture_output=True,
        )
        img = res.stdout.decode('utf-8')
    finally:
        # Always clean up the temporary CSV, even when writing it or
        # running the R script fails.
        try:
            os.remove(strF)
        except FileNotFoundError:
            pass

    # Attach the per-gene annotation returned by getVar(), aligned on gene.
    gInfo = getVar(data)
    deg.index = deg['gene']
    deg = pd.concat([deg, gInfo], axis=1, sort=False)
    if not data['topN'] == 'All':
        deg = deg.iloc[range(int(data['topN'])), ]
    return json.dumps([deg.to_csv(), img])
def sparse_diffexp(self, apply_col_shift):
    """Compare diffexp across anndata, sparse-cxg and dense-cxg adaptors.

    A test h5ad file is generated in a temporary directory, then written
    out twice as cxg containers (``sparse_threshold=11`` and
    ``sparse_threshold=0``). The t-test results of both cxg adaptors are
    compared with the anndata results, and the X columns of the top
    results are checked element by element after a flatbuffer
    encode/decode round trip.

    Args:
        apply_col_shift: forwarded to ``create_test_h5ad``; the sparse
            container is asserted to have an ``X_col_shift`` array exactly
            when this is set.
    """
    with tempfile.TemporaryDirectory() as workdir:
        # Source data: an h5ad file loaded through the anndata adaptor.
        h5ad_path = os.path.join(workdir, "sparse.h5ad")
        create_test_h5ad(h5ad_path, 2000, 2000, 10, apply_col_shift)
        anndata_adaptor = self.load_dataset(
            h5ad_path, extra_dataset_config={"embeddings__names": []}
        )
        source_adata = anndata_adaptor.data

        # Sparse cxg variant.
        sparse_path = os.path.join(workdir, "sparse.cxg")
        write_cxg(
            adata=source_adata,
            container=sparse_path,
            title="sparse",
            sparse_threshold=11,
        )
        sparse_adaptor = self.load_dataset(sparse_path)
        assert sparse_adaptor.open_array("X").schema.sparse
        assert sparse_adaptor.has_array("X_col_shift") == apply_col_shift

        # Dense cxg variant.
        dense_path = os.path.join(workdir, "dense.cxg")
        write_cxg(
            adata=source_adata,
            container=dense_path,
            title="dense",
            sparse_threshold=0,
        )
        dense_adaptor = self.load_dataset(dense_path)
        assert not dense_adaptor.open_array("X").schema.sparse
        assert not dense_adaptor.has_array("X_col_shift")

        group_a = self.get_mask(anndata_adaptor, 1, 10)
        group_b = self.get_mask(anndata_adaptor, 2, 10)

        expected = diffexp_generic.diffexp_ttest(
            anndata_adaptor, group_a, group_b, 10
        )
        from_sparse = diffexp_cxg.diffexp_ttest(
            sparse_adaptor, group_a, group_b, 10
        )
        from_dense = diffexp_cxg.diffexp_ttest(
            dense_adaptor, group_a, group_b, 10
        )

        self.compare_diffexp_results(expected, from_sparse)
        self.compare_diffexp_results(expected, from_dense)

        # First element of each result entry is used as the column index.
        topcols = np.array([entry[0] for entry in expected])
        x_anndata = self.get_X_col(anndata_adaptor, topcols)
        x_sparse = self.get_X_col(sparse_adaptor, topcols)
        x_dense = self.get_X_col(dense_adaptor, topcols)
        assert x_anndata.shape[0] == sparse_adaptor.get_shape()[0]
        assert x_anndata.shape[1] == len(expected)

        def roundtrip(mat, cols):
            # Encode to flatbuffers and back, yielding a numpy array.
            encoded = encode_matrix_fbs(mat, col_idx=cols)
            return decode_matrix_fbs(encoded).to_numpy()

        x_anndata = roundtrip(x_anndata, topcols)
        x_sparse = roundtrip(x_sparse, topcols)
        x_dense = roundtrip(x_dense, topcols)

        full_x = sparse_adaptor.get_X_array()
        assert full_x.shape == sparse_adaptor.get_shape()

        n_rows, n_cols = x_anndata.shape[0], x_anndata.shape[1]
        for r in range(n_rows):
            for c in range(n_cols):
                reference = x_anndata[r][c]
                self.assertTrue(
                    np.isclose(reference, x_sparse[r][c], rtol=1e-6, atol=1e-6)
                )
                self.assertTrue(
                    np.isclose(reference, x_dense[r][c], rtol=1e-6, atol=1e-6)
                )
def main():
    """Command-line driver that times the t-test diffexp implementations.

    Loads the dataset named on the command line, builds two row
    selections (random samples of ``--numA`` / ``--numB`` rows, or rows
    chosen by an ``--varA`` / ``--varB`` ``variable:value`` filter), and
    runs the selected algorithm ``--trials`` times, printing the elapsed
    time of each trial. With ``--show`` the results of the last trial are
    printed afterwards.

    Both ``random`` and ``numpy.random`` are seeded from ``--seed``.
    Exits with status 1 when a group is not specified, when a
    ``variable:value`` filter has no colon, or when ``--alg cxg`` is used
    with an adaptor that is not a ``CxgAdaptor``.
    """
    parser = argparse.ArgumentParser(description="A command to test diffexp")
    parser.add_argument("dataset", help="name of a dataset to load")
    parser.add_argument("-na", "--numA", type=int,
                        help="number of rows in group A")
    parser.add_argument("-nb", "--numB", type=int,
                        help="number of rows in group B")
    parser.add_argument("-va", "--varA",
                        help="obs variable:value to use for group A")
    parser.add_argument("-vb", "--varB",
                        help="obs variable:value to use for group B")
    parser.add_argument("-t", "--trials", default=1, type=int,
                        help="number of trials")
    parser.add_argument("-a", "--alg", choices=("default", "generic", "cxg"),
                        default="default", help="algorithm to use")
    parser.add_argument("-s", "--show", default=False, action="store_true",
                        help="show the results")
    parser.add_argument("-n", "--new-selection", default=False,
                        action="store_true",
                        help="change the selection between each trial")
    parser.add_argument("--seed", default=1, type=int,
                        help="set the random seed")
    args = parser.parse_args()

    app_config = AppConfig()
    app_config.update_server_config(single_dataset__datapath=args.dataset)
    app_config.update_server_config(app__verbose=True)
    app_config.complete_config()

    loader = MatrixDataLoader(args.dataset)
    adaptor = loader.open(app_config)

    if args.show:
        if isinstance(adaptor, CxgAdaptor):
            adaptor.open_array("X").schema.dump()

    random.seed(args.seed)
    np.random.seed(args.seed)
    rows = adaptor.get_shape()[0]

    def initial_filter(num, spec, suffix):
        # Build the starting row selection for one group; a row count
        # takes precedence over a variable:value filter.
        if num:
            return random.sample(range(rows), num)
        if spec:
            # Split on the FIRST colon only, so values that themselves
            # contain ':' are passed through intact.
            vname, sep, vval = spec.partition(":")
            if not sep:
                print(f"var{suffix} must have the form variable:value")
                sys.exit(1)
            return get_filter_from_obs(adaptor, vname, vval)
        print(f"must supply num{suffix} or var{suffix}")
        sys.exit(1)

    filterA = initial_filter(args.numA, args.varA, "A")
    filterB = initial_filter(args.numB, args.varB, "B")

    # Stays None when no trial runs (e.g. --trials 0).
    results = None
    for _ in range(args.trials):
        if args.new_selection:
            # Only randomly sampled groups are re-drawn between trials.
            if args.numA:
                filterA = random.sample(range(rows), args.numA)
            if args.numB:
                filterB = random.sample(range(rows), args.numB)

        maskA = np.zeros(rows, dtype=bool)
        maskA[filterA] = True
        maskB = np.zeros(rows, dtype=bool)
        maskB[filterB] = True

        # perf_counter is monotonic, unlike the wall clock.
        t1 = time.perf_counter()
        if args.alg == "default":
            results = adaptor.compute_diffexp_ttest(maskA, maskB)
        elif args.alg == "generic":
            results = diffexp_generic.diffexp_ttest(adaptor, maskA, maskB)
        elif args.alg == "cxg":
            if not isinstance(adaptor, CxgAdaptor):
                print("cxg only works with CxgAdaptor")
                sys.exit(1)
            results = diffexp_cxg.diffexp_ttest(adaptor, maskA, maskB)
        t2 = time.perf_counter()
        print("TIME=", t2 - t1)

    if args.show and results is not None:
        for res in results:
            print(res)
def main():
    """Command-line driver that times diffexp on two random row groups.

    ``--numA`` and ``--numB`` (both required) give the group sizes. The
    two groups are cut from a single ``random.sample`` of
    ``numA + numB`` row indices. With ``--new-selection`` the sample is
    re-drawn before every trial; otherwise it is drawn once up front.
    Each trial prints its elapsed time, and ``--show`` prints the results
    held after the loop.

    Only the ``random`` module is seeded (from ``--seed``). Exits with
    status 1 when ``--alg cxg`` is used with a non-``CxgAdaptor``.
    """
    # NOTE(review): the string is passed positionally, which argparse
    # interprets as ``prog`` rather than ``description``.
    cli = argparse.ArgumentParser("A command to test diffexp")
    cli.add_argument("dataset", help="name of a dataset to load")
    cli.add_argument(
        "-na", "--numA", type=int, required=True,
        help="number of rows in group A",
    )
    cli.add_argument(
        "-nb", "--numB", type=int, required=True,
        help="number of rows in group B",
    )
    cli.add_argument(
        "-t", "--trials", default=1, type=int, help="number of trials"
    )
    cli.add_argument(
        "-a", "--alg", choices=("default", "generic", "cxg"),
        default="default", help="algorithm to use",
    )
    cli.add_argument(
        "-s", "--show", default=False, action="store_true",
        help="show the results",
    )
    cli.add_argument(
        "-n", "--new-selection", default=False, action="store_true",
        help="change the selection between each trial",
    )
    cli.add_argument(
        "--seed", default=1, type=int, help="set the random seed"
    )
    args = cli.parse_args()

    app_config = AppConfig()
    app_config.single_dataset__datapath = args.dataset
    app_config.server__verbose = True
    app_config.complete_config()

    adaptor = MatrixDataLoader(args.dataset).open(app_config)

    if args.show and isinstance(adaptor, CxgAdaptor):
        adaptor.open_array("X").schema.dump()

    numA, numB = args.numA, args.numB
    rows = adaptor.get_shape()[0]
    random.seed(args.seed)

    def draw_groups():
        # One sample without replacement, split into the two groups.
        picked = random.sample(range(rows), numA + numB)
        return picked[:numA], picked[numA:]

    if not args.new_selection:
        filterA, filterB = draw_groups()

    for _ in range(args.trials):
        if args.new_selection:
            filterA, filterB = draw_groups()

        maskA = np.zeros(rows, dtype=bool)
        maskB = np.zeros(rows, dtype=bool)
        maskA[filterA] = True
        maskB[filterB] = True

        started = time.time()
        if args.alg == "cxg":
            if not isinstance(adaptor, CxgAdaptor):
                print("cxg only works with CxgAdaptor")
                sys.exit(1)
            results = diffexp_cxg.diffexp_ttest(adaptor, maskA, maskB)
        elif args.alg == "generic":
            results = diffexp_generic.diffexp_ttest(adaptor, maskA, maskB)
        elif args.alg == "default":
            results = adaptor.compute_diffexp_ttest(maskA, maskB)
        finished = time.time()
        print("TIME=", finished - started)

    if args.show:
        for entry in results:
            print(entry)