Example #1
def run(args):
    # Set logging level.
    logging_debug_opt = False
    LOGGER.addHandler(create_logging_handler(logging_debug_opt))
    LOGGER.setLevel(logging.DEBUG)

    LOGGER.info("Using configuration {}.".format(args.config_filename))
    cfg = ConfigParser()
    cfg.read(args.config_filename)

    in_fname = cfg['data']['modules'] if not args.input else args.input
    LOGGER.info("Loading modules from {}.".format(in_fname))
    # Loading from YAML is extremely slow, so this is a potential performance
    # bottleneck. Possible improvements: switch to JSON or use a CLoader
    # (see the sketch after this example):
    # https://stackoverflow.com/questions/27743711/can-i-speedup-yaml
    if in_fname.endswith('.yaml'):
        modules = load_from_yaml(in_fname)
    else:
        with open(in_fname, 'rb') as f:
            modules = pickle.load(f)
    # Filter out modules with too few genes.
    min_genes = int(cfg['parameters']['min_genes'])
    modules = list(filter(lambda m: len(m) >= min_genes, modules))

    LOGGER.info("Loading databases.")

    def name(fname):
        return os.path.splitext(os.path.basename(fname))[0]

    db_fnames = list(mapcat(glob.glob, cfg['data']['databases'].split(";")))
    dbs = [
        RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames
    ]

    LOGGER.info("Calculating regulons.")
    motif_annotations_fname = cfg['data']['motif_annotations']
    mode = cfg['parameters']['mode']
    with ProgressBar() if mode == "dask_multiprocessing" else NoProgressBar():
        df = prune2df(dbs,
                      modules,
                      motif_annotations_fname,
                      rank_threshold=int(cfg['parameters']['rank_threshold']),
                      auc_threshold=float(cfg['parameters']['auc_threshold']),
                      nes_threshold=float(cfg['parameters']['nes_threshold']),
                      client_or_address=mode,
                      module_chunksize=int(cfg['parameters']['chunk_size']),
                      num_workers=args.num_workers)

    LOGGER.info("Writing results to file.")
    df.to_csv(cfg['parameters']['output'] if not args.output else args.output)
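The CLoader speedup mentioned in the comment above could look like the following; a minimal sketch, assuming PyYAML was installed with libyaml bindings (load_from_yaml_fast is a hypothetical helper, not part of the original code):

import yaml

# Prefer the C-accelerated loader; fall back to pure Python when the
# libyaml bindings are not available.
try:
    from yaml import CSafeLoader as SafeLoader
except ImportError:
    from yaml import SafeLoader


def load_from_yaml_fast(fname):
    # An explicit Loader avoids the slow default; CSafeLoader is typically
    # several times faster on large module files.
    with open(fname, 'r') as f:
        return yaml.load(f, Loader=SafeLoader)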
Example #2
cor_p_thr = 0.001
memory_limit = 50e9
n_cells = 600
grouping_variable = 'celltype'


if __name__ == '__main__':

	if grouping_variable not in metadata.columns:
		exit('Grouping variable not found in metadata.')

	## Load ranking databases
	db_fnames = glob.glob(db_folder)
	def name(fname):
		return os.path.basename(fname).split(".")[0]
	dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]


	## Initialize cluster
	local_cluster = LocalCluster(n_workers=n_cores, threads_per_worker=1, processes=False, memory_limit=memory_limit)
	custom_client = Client(local_cluster)


	## Load TFs
	tf_names = load_tf_names(TFs_file)


	## Collect here regulons passing correlation filter
	cortest_passed_regulons = []
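Example #2 ends before the pruning step. A minimal sketch of how the dask client created above would typically be handed to pySCENIC's pruning, assuming modules and a motif annotations file are already available (the variables modules and f_motif_annotations are hypothetical):

from pyscenic.prune import prune2df, df2regulons

# prune2df accepts a running dask Client via client_or_address.
df_motifs = prune2df(dbs, modules, f_motif_annotations,
                     client_or_address=custom_client)
regulons = df2regulons(df_motifs)

# Shut the cluster down once pruning is done.
custom_client.close()
local_cluster.close()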
Example #3
@pytest.fixture
def db():
    # Fixture providing the test ranking database.
    return RankingDatabase(TEST_DATABASE_FNAME, TEST_DATABASE_NAME)
Example #4
@pytest.fixture
def db():
    # Fixture providing the test ranking database with an explicit
    # nomenclature (older pySCENIC API).
    return RankingDatabase(TEST_DATABASE_FNAME, TEST_DATABASE_NAME,
                           NOMENCLATURE)
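Examples #3 and #4 look like pytest fixtures (the decorator was restored above). A hedged sketch of a test consuming the fixture, using only attributes the RankingDatabase base class exposes:

import pytest


def test_gene_universe(db):
    # .genes is the tuple of gene names in the database and
    # .total_genes is its size.
    assert db.total_genes == len(db.genes)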
Example #5
#---------------functions-----------------
def name(fname):
    return os.path.splitext(os.path.basename(fname))[0]


#################################################################
##1. load df (already loaded in setup)
#df_cnt=pd.read_csv(f_in, index_col=0)

#2. tf genes
tf_name=load_tf_names(f_tf)

#3. ranking databases (only 2 mm10 dbs)
l_fname=list(Path(fd_db).glob('*.feather'))
l_db=[RankingDatabase(fname=fname, name=name(fname)) for fname in l_fname]

#4. run
if __name__ == '__main__':
#	#1. Inference of co-expression modules
#	print('Inference...')
#	df_adj=grnboost2(df_cnt, tf_names=tf_name, verbose=True)
#	df_adj.to_csv(f'{fd_out}/adj.csv', index=False)
	
	#2. prune
	df_adj=pd.read_csv(f'{fd_out}/adj.csv')  # load precomputed adjacencies; running grnboost2 inline tends to stall at 98%
	print('Prune...')
	l_mod=list(modules_from_adjacencies(df_adj, df_cnt))

	with ProgressBar():
		df_prune = prune2df(l_db, l_mod, f_motif)
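The pruning table would typically be converted into regulons and scored with AUCell next. A minimal sketch of the remaining steps, assuming df_cnt is the cells-by-genes expression matrix already loaded in the setup:

from pyscenic.prune import df2regulons
from pyscenic.aucell import aucell

# Turn the enriched-motif table into regulons, then score each cell.
regulons = df2regulons(df_prune)
auc_mtx = aucell(df_cnt, regulons, num_workers=4)
auc_mtx.to_csv(f'{fd_out}/auc.csv')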
Example #6
    auc_mtx = aucell(expr, regulons, num_workers=ppn)
    tfs = [tf.replace('(+)', '') for tf in auc_mtx.columns]  # regulon names without the '(+)' suffix (strip() would remove individual characters)
    auc_mtx.to_csv('{}/{}_auc_mtx.csv'.format(out_path, prefix))

    print('finished calculation for %s' % (prefix))


if __name__ == "__main__":

    wkdir = '/bgfs/alee/chelsea/projects/10X/AL1/codes'
    os.chdir(wkdir)

    db = [
        RankingDatabase(
            fname='../data/pySCENIC/ref/hg38__refseq-r80__10kb_up_and_down_tss.mc9nr.feather',
            name='hg38__refseq-r80__10kb_up_and_down_tss.mc9nr')
    ]
    tf_names = load_tf_names('../data/pySCENIC/ref/hs_hgnc_curated_tfs.txt')

    CellTypes = [
        'MCF7', 'T47D WT', 'T47D KO', 'MM134', 'SUM44', 'BCK4', 'MCF10A',
        'HEK293'
    ]

    for cell in CellTypes:

        tmp = adata_raw[adata_raw.obs['CellType'] == cell]
        RawSplicedCts = pd.DataFrame(tmp.layers['spliced'].todense(),
                                     index=tmp.obs.index,
                                     columns=tmp.var.index)  # cells x genes
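Example #6 is cut off inside the per-cell-type loop. Under the assumption that RawSplicedCts feeds a full SCENIC pass ending in the aucell call shown at the top of this example, the continuation could look roughly like this (run_scenic_for and f_motif are hypothetical, not the author's code):

from arboreto.algo import grnboost2
from pyscenic.utils import modules_from_adjacencies
from pyscenic.prune import prune2df, df2regulons


def run_scenic_for(expr, prefix, out_path='.', ppn=4):
    # Hypothetical wrapper: infer adjacencies, build modules, prune against
    # the ranking database, then score with AUCell as in the fragment above.
    adj = grnboost2(expr, tf_names=tf_names, verbose=True)
    modules = list(modules_from_adjacencies(adj, expr))
    df = prune2df(db, modules, f_motif, num_workers=ppn)  # f_motif: motif annotations table
    regulons = df2regulons(df)
    auc_mtx = aucell(expr, regulons, num_workers=ppn)
    auc_mtx.to_csv('{}/{}_auc_mtx.csv'.format(out_path, prefix))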