def run(args):
    """Prune modules into regulons as described by a configuration file.

    Reads the configuration file named by ``args.config_filename``, loads the
    modules (via ``load_from_yaml`` when the file name ends in ``.yaml``,
    via ``pickle`` otherwise), drops every module whose ``len()`` is below
    ``parameters.min_genes``, creates one ``RankingDatabase`` per file name
    produced by globbing the ``;``-separated patterns in ``data.databases``,
    calls ``prune2df`` and writes the returned dataframe with ``to_csv``.

    :param args: Parsed command-line arguments. The attributes read here are
        ``config_filename``, ``input`` (used instead of ``data.modules`` when
        truthy), ``output`` (used instead of ``parameters.output`` when
        truthy) and ``num_workers``.
    """
    # Set logging level.
    logging_debug_opt = False
    LOGGER.addHandler(create_logging_handler(logging_debug_opt))
    LOGGER.setLevel(logging.DEBUG)

    LOGGER.info("Using configuration {}.".format(args.config_filename))
    cfg = ConfigParser()
    cfg.read(args.config_filename)

    in_fname = args.input if args.input else cfg['data']['modules']
    LOGGER.info("Loading modules from {}.".format(in_fname))
    # Loading from YAML is extremely slow. Therefore this is a potential performance improvement.
    # Potential improvements are switching to JSON or to use a CLoader:
    # https://stackoverflow.com/questions/27743711/can-i-speedup-yaml
    if in_fname.endswith('.yaml'):
        modules = load_from_yaml(in_fname)
    else:
        # NOTE: unpickling can execute arbitrary code; only load module files
        # that come from a trusted source.
        with open(in_fname, 'rb') as f:
            modules = pickle.load(f)

    # Filter out modules with too few genes.
    min_genes = int(cfg['parameters']['min_genes'])
    modules = [module for module in modules if len(module) >= min_genes]

    LOGGER.info("Loading databases.")

    def name(fname):
        # Database name = file name without directory and last extension.
        return os.path.splitext(os.path.basename(fname))[0]

    db_fnames = list(mapcat(glob.glob, cfg['data']['databases'].split(";")))
    dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]

    LOGGER.info("Calculating regulons.")
    motif_annotations_fname = cfg['data']['motif_annotations']
    mode = cfg['parameters']['mode']
    # A progress bar is only shown for the "dask_multiprocessing" mode.
    progress = ProgressBar() if mode == "dask_multiprocessing" else NoProgressBar()
    with progress:
        df = prune2df(
            dbs,
            modules,
            motif_annotations_fname,
            rank_threshold=int(cfg['parameters']['rank_threshold']),
            auc_threshold=float(cfg['parameters']['auc_threshold']),
            nes_threshold=float(cfg['parameters']['nes_threshold']),
            client_or_address=mode,
            # ConfigParser values are always strings; convert like the other
            # numeric parameters so the chunk size is passed as a number.
            module_chunksize=int(cfg['parameters']['chunk_size']),
            num_workers=args.num_workers,
        )

    LOGGER.info("Writing results to file.")
    df.to_csv(args.output if args.output else cfg['parameters']['output'])
# Analysis settings.
# NOTE(review): cor_p_thr and n_cells are not used within this block; their
# meaning is inferred from their names only -- confirm against the code that
# consumes them.
cor_p_thr = 0.001
memory_limit = 50e9  # forwarded to LocalCluster(memory_limit=...)
n_cells = 600
grouping_variable = 'celltype'  # column that must be present in metadata

if __name__ == '__main__':
    # Abort early when the grouping column is absent from the metadata.
    # SystemExit is raised directly: the ``exit`` helper is injected by the
    # ``site`` module for interactive sessions and is not meant for programs.
    if sum(metadata.columns == grouping_variable) < 1:
        raise SystemExit('Grouping variable not found in metadata.')

    ## Load ranking databases
    db_fnames = glob.glob(db_folder)

    def name(fname):
        # Database name = base file name up to its first dot.
        return os.path.basename(fname).split(".")[0]

    dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]

    ## Initialize cluster
    local_cluster = LocalCluster(
        n_workers=n_cores,
        threads_per_worker=1,
        processes=False,
        memory_limit=memory_limit,
    )
    custom_client = Client(local_cluster)

    ## Load TFs
    tf_names = load_tf_names(TFs_file)

    ## Collect here regulons passing correlation filter
    cortest_passed_regulons = []
def db():
    """Return a RankingDatabase built from TEST_DATABASE_FNAME and TEST_DATABASE_NAME."""
    ranking_db = RankingDatabase(TEST_DATABASE_FNAME, TEST_DATABASE_NAME)
    return ranking_db
def db():
    """Return a RankingDatabase built from the test file name, test name and NOMENCLATURE."""
    ranking_db = RankingDatabase(
        TEST_DATABASE_FNAME,
        TEST_DATABASE_NAME,
        NOMENCLATURE,
    )
    return ranking_db
# --------------- functions ---------------
def name(fname):
    """Return the base file name of ``fname`` with its last extension removed."""
    base = os.path.basename(fname)
    stem, _extension = os.path.splitext(base)
    return stem


#################################################################
# 1. Expression matrix: not loaded here (original note: already loaded in
#    setup). The disabled loader was:
# df_cnt = pd.read_csv(f_in, index_col=0)

# 2. TF genes.
tf_name = load_tf_names(f_tf)

# 3. Ranking databases: one RankingDatabase per *.feather file found in fd_db
#    (original note: only 2 mm10 dbs).
l_fname = [*Path(fd_db).glob('*.feather')]
l_db = [
    RankingDatabase(fname=feather, name=name(feather))
    for feather in l_fname
]

# 4. Run.
if __name__ == '__main__':
    # Step 1 (inference of co-expression modules) is disabled; the adjacencies
    # it would have written are read back from adj.csv below.
    # print('Inference...')
    # df_adj = grnboost2(df_cnt, tf_names=tf_name, verbose=True)
    # df_adj.to_csv(f'{fd_out}/adj.csv', index=False)

    # Step 2: prune.
    # Original note on the next line: "if missing, always stuck at 98%".
    df_adj = pd.read_csv(f'{fd_out}/adj.csv')
    print('Prune...')
    l_mod = [*modules_from_adjacencies(df_adj, df_cnt)]
    with ProgressBar():
        df_prune = prune2df(l_db, l_mod, f_motif)
auc_mtx = aucell(expr, regulons, num_workers=ppn) tfs = [tf.strip('(+)') for tf in auc_mtx.columns] auc_mtx.to_csv('{}/{}_auc_mtx.csv'.format(out_path, prefix)) print('finished calculation for %s' % (prefix)) if __name__ == "__main__": wkdir = '/bgfs/alee/chelsea/projects/10X/AL1/codes' os.chdir(wkdir) db = [ RankingDatabase( fname= '../data/pySCENIC/ref/hg38__refseq-r80__10kb_up_and_down_tss.mc9nr.feather', name='hg38__refseq-r80__10kb_up_and_down_tss.mc9nr') ] tf_names = load_tf_names('../data/pySCENIC/ref/hs_hgnc_curated_tfs.txt') CellTypes = [ 'MCF7', 'T47D WT', 'T47D KO', 'MM134', 'SUM44', 'BCK4', 'MCF10A', 'HEK293' ] for cell in CellTypes: tmp = adata_raw[adata_raw.obs['CellType'] == cell] RawSplicedCts = pd.DataFrame(tmp.layers['spliced'].todense(), index=tmp.obs.index, columns=tmp.var.index) # cell X gene