def run(args):
    # Set logging level.
    logging_debug_opt = False
    LOGGER.addHandler(create_logging_handler(logging_debug_opt))
    LOGGER.setLevel(logging.DEBUG)

    LOGGER.info("Using configuration {}.".format(args.config_filename))
    cfg = ConfigParser()
    cfg.read(args.config_filename)

    in_fname = cfg['data']['modules'] if not args.input else args.input
    LOGGER.info("Loading modules from {}.".format(in_fname))
    # Loading from YAML is extremely slow, so this is a potential performance bottleneck.
    # Possible improvements: switch to JSON or use a CLoader:
    # https://stackoverflow.com/questions/27743711/can-i-speedup-yaml
    if in_fname.endswith('.yaml'):
        modules = load_from_yaml(in_fname)
    else:
        with open(in_fname, 'rb') as f:
            modules = pickle.load(f)

    # Filter out modules with too few genes.
    min_genes = int(cfg['parameters']['min_genes'])
    modules = list(filter(lambda m: len(m) >= min_genes, modules))

    LOGGER.info("Loading databases.")

    def name(fname):
        return os.path.splitext(os.path.basename(fname))[0]

    db_fnames = list(mapcat(glob.glob, cfg['data']['databases'].split(";")))
    dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]

    LOGGER.info("Calculating regulons.")
    motif_annotations_fname = cfg['data']['motif_annotations']
    mode = cfg['parameters']['mode']
    with ProgressBar() if mode == "dask_multiprocessing" else NoProgressBar():
        df = prune2df(dbs, modules, motif_annotations_fname,
                      rank_threshold=int(cfg['parameters']['rank_threshold']),
                      auc_threshold=float(cfg['parameters']['auc_threshold']),
                      nes_threshold=float(cfg['parameters']['nes_threshold']),
                      client_or_address=mode,
                      module_chunksize=cfg['parameters']['chunk_size'],
                      num_workers=args.num_workers)

    LOGGER.info("Writing results to file.")
    df.to_csv(cfg['parameters']['output'] if not args.output else args.output)
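# The helpers below are referenced by run() but not defined in this snippet.
# A minimal sketch of what they might look like; the names exist in the snippet,
# but these bodies are assumptions, not the original implementations.
import logging
from contextlib import contextmanager

@contextmanager
def NoProgressBar():
    # No-op stand-in used when the dask ProgressBar is not wanted.
    yield

def create_logging_handler(debug: bool) -> logging.Handler:
    # Console handler; DEBUG verbosity only when requested.
    handler = logging.StreamHandler()
    handler.setLevel(logging.DEBUG if debug else logging.INFO)
    handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
    return handler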
def run_regression(self):
    data_df = self.data.to_df()
    utils.Debug.vprint("Calculating {m} adjacencies".format(m=self.adjacency_method), level=0)

    # Get adjacencies
    adj_method = ADJ_METHODS[self.adjacency_method]

    if MPControl.is_dask:
        client_or_address = MPControl.client.client
        MPControl.client.check_cluster_state()
    else:
        client_or_address = 'local'

    adjacencies = adj_method(data_df, tf_names=self.tf_names, verbose=True,
                             client_or_address=client_or_address, seed=self.random_seed)

    if self.do_scenic:
        # Convert adjacencies to modules
        modules = list(modules_from_adjacencies(adjacencies, data_df))

        # Load feather (rank) databases
        dbs = [RankingDatabase(fname=self._feather_rank_file, name="RANKING_PRIOR")]

        utils.Debug.vprint("Pruning adjacencies with SCENIC", level=0)

        # Prune to df
        df = prune2df(dbs, modules, self._motif_link_table_file,
                      client_or_address=client_or_address)

        return self.reprocess_scenic_output_to_inferelator_results(df, self.priors_data)
    else:
        return self.reprocess_adj_to_inferelator_results(adjacencies)
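# ADJ_METHODS is referenced above but not defined in this snippet. A plausible
# sketch, assuming the arboreto inference functions are the available backends;
# the exact keys are an assumption.
from arboreto.algo import grnboost2, genie3

ADJ_METHODS = {
    "grnboost2": grnboost2,  # gradient-boosting variant of GENIE3
    "genie3": genie3,        # random-forest based inference
}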
def calcTFs(expr,
            tf_names,
            db,
            prefix,
            motif_path='../data/pySCENIC/ref/motifs-v9-nr.hgnc-m0.001-o0.0.tbl',
            out_path='../data/pySCENIC',
            ppn=8):
    """Computes motifs, regulons and transcription factor activation using pySCENIC.

    Arguments
    ---------
    expr: `pandas DataFrame`
        cell X gene raw counts; FPKM; not TPM, as co-expression will be calculated
    tf_names: `list` (`str`)
        curated human transcription factors downloaded from GitHub:
        pySCENIC/ref/hs_hgnc_curated_tfs.txt
    db: `list` (`FeatherRankingDatabase()`)
        feather files of genome rankings, e.g.
        [FeatherRankingDatabase(name="hg38__refseq-r80__10kb_up_and_down_tss")]
    prefix: `str`
        name used for the output files (e.g. a cell line name)

    Returns
    -------
    Nothing; writes results to files (the calculation takes too long to redo in memory).
    """
    # Inference of co-expression modules
    adjacencies = grnboost2(expr, tf_names=tf_names, verbose=True)
    modules = list(modules_from_adjacencies(adjacencies, expr))

    # Calculate a list of enriched motifs and the corresponding target genes for all modules.
    with ProgressBar():
        df = prune2df(db, modules, motif_path, num_workers=ppn)

    # Create regulons from this table of enriched motifs.
    regulons = df2regulons(df)

    # Save the enriched motifs (CSV) and the discovered regulons (pickle) to disk.
    df.to_csv('{}/{}_motifs.csv'.format(out_path, prefix))
    with open('{}/{}_regulons.pkl'.format(out_path, prefix), "wb") as f:
        pickle.dump(regulons, f)

    auc_mtx = aucell(expr, regulons, num_workers=ppn)
    # str.strip removes any of the characters '(', '+', ')' from both ends,
    # turning regulon names like "SOX2(+)" back into plain TF names.
    tfs = [tf.strip('(+)') for tf in auc_mtx.columns]
    auc_mtx.to_csv('{}/{}_auc_mtx.csv'.format(out_path, prefix))
    print('finished calculation for %s' % prefix)
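# A hypothetical invocation of calcTFs; the file paths, the CSV-based expression
# loading, and the import location of the ranking database class are assumptions
# for illustration only.
import glob
import os
import pandas as pd
from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase

expr = pd.read_csv('../data/expr_counts.csv', index_col=0)  # cells x genes
tf_names = [line.strip() for line in open('../data/pySCENIC/ref/hs_hgnc_curated_tfs.txt')]
db_fnames = glob.glob('../data/pySCENIC/ref/hg38__*.feather')
dbs = [RankingDatabase(fname=f, name=os.path.splitext(os.path.basename(f))[0])
       for f in db_fnames]
calcTFs(expr, tf_names, dbs, prefix='HeLa')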
## Derive potential regulons from co-expression modules
if not os.path.isfile(modules_fname):
    modules = list(modules_from_adjacencies(adjacencies, data_train, keep_only_activating=False))
    pickle.dump(modules, open(modules_fname, 'wb'))
else:
    modules = pickle.load(open(modules_fname, 'rb'))
del adjacencies

## Prune modules for targets with cis-regulatory footprints (aka RcisTarget)
### Calculate a list of enriched motifs and the corresponding target genes for all modules.
if not os.path.isfile(motifs_fname):
    df = prune2df(dbs, modules, motif_annotations, num_workers=n_cores)
    df.to_csv(motifs_fname)
else:
    # load_motifs restores the multi-index columns that a plain pd.read_csv would lose
    df = load_motifs(motifs_fname)
del modules

### Create regulons from this table of enriched motifs.
if not os.path.isfile(regulons_fname):
    regulons = df2regulons(df)
    pickle.dump(regulons, open(regulons_fname, 'wb'))
else:
    regulons = pickle.load(open(regulons_fname, 'rb'))
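# The compute-or-load pattern above repeats three times; a small helper like the
# sketch below (not part of the original script) can factor it out.
def cached(fname, compute, save, load):
    """Return the cached result from fname if present; else compute, save, and return it."""
    if os.path.isfile(fname):
        return load(fname)
    result = compute()
    save(result, fname)
    return result

# e.g.:
# regulons = cached(regulons_fname,
#                   lambda: df2regulons(df),
#                   lambda r, p: pickle.dump(r, open(p, 'wb')),
#                   lambda p: pickle.load(open(p, 'rb')))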
# Run GRNBoost2, an improved GRNBoost instance of GENIE3
adjacencies = grnboost2(ex_matrix, tf_names=tf_names, verbose=True)

# Identify modules from the GRNBoost2 adjacencies
modules = list(modules_from_adjacencies(adjacencies, ex_matrix))

# Save the GRNBoost2 output so it does not have to be recomputed
adjacencies.to_csv("grnboost_output.csv")
# Reload the saved output in case something goes wrong downstream
adjacencies = pd.read_csv("grnboost_output.csv", index_col=0)

# cisTarget step: identify cis-regulatory footprints from motifs around the TSS.
# Calculate a list of enriched motifs and the corresponding target genes for all modules.
with ProgressBar():
    df = prune2df(dbs, modules, "motifs-v9-nr-mgi.txt")

# Create regulons from this table of enriched motifs
regulons = df2regulons(df)

# Save the discovered motifs and regulons
df.to_csv(motifs_filename)
with open(regulons_filename, "wb") as f:
    pickle.dump(regulons, f)

# Load the discovered motifs and regulons if saved previously
df = load_motifs(motifs_filename)
with open(regulons_filename, "rb") as f:
    regulons = pickle.load(f)

# AUCell step: score the enrichment of each discovered regulon per cell
auc_matrix = aucell(ex_matrix, regulons, num_workers=4)
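# A possible follow-up (not in the original snippet): binarize the AUCell matrix
# into on/off regulon activity per cell; assumes pySCENIC's binarization module
# is available and that its default thresholds are acceptable.
from pyscenic.binarization import binarize

binary_mtx, thresholds = binarize(auc_matrix, num_workers=4)
binary_mtx.to_csv("auc_binary.csv")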
adjacencies.to_csv(out_file, sep='\t', index=False, header=False)
print("grnboost done")

modules = list(modules_from_adjacencies(adjacencies, ex_matrix, rho_mask_dropouts=True))

#print("writing modules")
#with open(MODULES_FNAME, 'wb') as f:
#    pickle.dump(modules, f)

print("Finding Enriched modules")
# Calculate a list of enriched motifs and the corresponding target genes for all modules.
with ProgressBar():
    df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME)
print(df.head())  # a bare df.head() would be discarded in a script

# Create regulons from this table of enriched motifs.
print("creating regulons")
regulons = df2regulons(df)
print("writing regulons")

# Save the enriched motifs and the discovered regulons to disk.
#df.to_csv(MOTIFS_FNAME)
#with open(REGULONS_FNAME, "wb") as f:
#    pickle.dump(regulons, f)

print("Finding AUC of cells")
auc_mtx = aucell(ex_matrix, regulons, num_workers=1)
auc_file = os.path.join(RESULT_FOLDER, "AUC_" + inputFilename + ".csv")
auc_mtx.to_csv(auc_file)  # write the per-cell AUC matrix
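# A small add-on sketch (not in the original): also persist the regulons in a
# human-readable form alongside the AUC matrix; the file name is an assumption.
regulon_file = os.path.join(RESULT_FOLDER, "regulons_" + inputFilename + ".tsv")
with open(regulon_file, "w") as f:
    for reg in regulons:
        # each line: regulon name, then its target genes
        f.write(reg.name + "\t" + ",".join(sorted(reg.genes)) + "\n")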
print("STARTING PHASE II")
if runOnCluster:
    if RegulonsViaDask:
        if calcRegulonsWithIntermediateDf:
            # client_or_address can also be "dask_multiprocessing" or "local"
            from dask.diagnostics import ProgressBar
            with ProgressBar():
                df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME,
                              client_or_address=client)  # originally "local"
            print("DEFINED df, type:")
            print(type(df))
            # Note: df2regulons(df, NOMENCLATURE) is the older pySCENIC API;
            # newer releases take only the dataframe.
            regulons = df2regulons(df, NOMENCLATURE)
        else:
            from dask.diagnostics import ProgressBar
            with ProgressBar():
                regulons = prune(dbs, modules, MOTIF_ANNOTATIONS_FNAME,
                                 client_or_address=client)  # originally "local"
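# `client` is used above but never created in this snippet. A minimal sketch of
# how it might be set up with dask.distributed; the worker counts are assumptions.
from dask.distributed import Client, LocalCluster

local_cluster = LocalCluster(n_workers=8, threads_per_worker=1)
client = Client(local_cluster)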
l_db = [RankingDatabase(fname=fname, name=name(fname)) for fname in l_fname]

#3. run
if __name__ == '__main__':
    # #1. Inference of co-expression modules
    # print('Inference...')
    # df_adj = grnboost2(df_cnt, tf_names=tf_name, verbose=True)
    # df_adj.to_csv(f'{fd_out}/adj.csv', index=False)

    #2. prune
    df_adj = pd.read_csv(f'{fd_out}/adj.csv')  # reload from disk; otherwise pruning always gets stuck at 98%
    print('Prune...')
    l_mod = list(modules_from_adjacencies(df_adj, df_cnt))
    with ProgressBar():
        df_prune = prune2df(l_db, l_mod, f_motif)
    df_prune.to_csv(f'{fd_out}/prune.csv')

    #3. create regulon
    print('Regulon...')
    regulon = df2regulons(df_prune)

    #4. Save the enriched motifs and the discovered regulons
    with open(f'{fd_out}/regulon.pkl', "wb") as f:
        pickle.dump(regulon, f)

    #5. auc
    print('AUC...')
    with open(f'{fd_out}/regulon.pkl', "rb") as f:  # reload from disk; otherwise AUCell always gets stuck
        regulon = pickle.load(f)
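    # The snippet ends mid-step; a plausible continuation for step 5 (an
    # assumption, not the original code), using df_cnt as the cells-x-genes
    # expression matrix from above.
    auc_mtx = aucell(df_cnt, regulon, num_workers=8)
    auc_mtx.to_csv(f'{fd_out}/auc.csv')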
DATABASES_GLOB = os.path.join("resources/network_analysis", "mm10_*.mc9nr.feather")
MOTIF_ANNOTATIONS_FNAME = os.path.join("resources/network_analysis",
                                       "motifs-v9-nr.mgi-m0.001-o0.0.tbl")

db_fnames = glob.glob(DATABASES_GLOB)

def name(fname):
    return os.path.splitext(os.path.basename(fname))[0]

dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]
print(dbs)

modules = list(modules_from_adjacencies(adjacencies, ex_matrix))

df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME, num_workers=4)
print("prune2df done, now saving")
with open(snakemake.output[0], "wb") as f:
    pickle.dump(df, f)

print("df2regulons carrying out")
regulons = df2regulons(df)
print("df2regulons done, now saving")
with open(snakemake.output[1], "wb") as f:
    pickle.dump(regulons, f)
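# A downstream sketch (not part of the original rule): load the pickled regulons
# and score cells with AUCell; the path is a placeholder for snakemake.output[1].
import pickle
from pyscenic.aucell import aucell

with open("results/network_analysis/regulons.pkl", "rb") as f:
    regulons = pickle.load(f)
auc_mtx = aucell(ex_matrix, regulons, num_workers=4)
auc_mtx.to_csv("results/network_analysis/auc_mtx.csv")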