示例#1
0
def calc_modules(adjacencies,
                 exp_mtx,
                 name,
                 rho_dichotomize,
                 rho_threshold=None,
                 mask_dropouts=None):
    """Derive modules from adjacencies and cache them as a pickle file.

    The output file is created inside ``RESOURCES_FOLDER``; its name encodes
    the settings that were used:

    * ``<name>.<rho_threshold>.<MODULES_EXT>`` when ``rho_dichotomize`` and
      ``mask_dropouts`` are both truthy,
    * ``<name>.<rho_threshold>.no_mask.<MODULES_EXT>`` when ``rho_dichotomize``
      is truthy and ``mask_dropouts`` is falsy,
    * ``<name>.all.<MODULES_EXT>`` when ``rho_dichotomize`` is falsy.

    When that file already exists nothing is computed and the function
    returns right away.

    :param adjacencies: Forwarded to ``modules_from_adjacencies``.
    :param exp_mtx: Forwarded to ``modules_from_adjacencies``.
    :param name: Label used in the progress message and the file name.
    :param rho_dichotomize: Selects the naming scheme; also forwarded.
    :param rho_threshold: Part of the file name; also forwarded.
    :param mask_dropouts: Selects the ``no_mask`` file name when falsy;
        forwarded as ``rho_mask_dropouts``.
    :return: ``None``. The number of modules is printed and the module list
        is pickled to the output file.
    """
    # Report the configuration first, then work out the cache file name.
    if not rho_dichotomize:
        print('{} - all'.format(name))
        basename = "{}.all.{}".format(name, MODULES_EXT)
    else:
        masking = "with" if mask_dropouts else "without"
        print('{} - {} masking - rho threshold {}'.format(
            name, masking, rho_threshold))
        if mask_dropouts:
            basename = "{}.{}.{}".format(name, rho_threshold, MODULES_EXT)
        else:
            basename = "{}.{}.no_mask.{}".format(name, rho_threshold,
                                                 MODULES_EXT)
    out_fname = os.path.join(RESOURCES_FOLDER, basename)

    # A previous run already produced this file: skip the computation.
    if os.path.isfile(out_fname):
        return

    module_iter = modules_from_adjacencies(adjacencies,
                                           exp_mtx,
                                           rho_dichotomize=rho_dichotomize,
                                           rho_threshold=rho_threshold,
                                           rho_mask_dropouts=mask_dropouts)
    modules = list(module_iter)
    print(len(modules))

    with open(out_fname, 'wb') as f:
        pickle.dump(modules, f)
示例#2
0
def adjacencies2modules(args):
    """Load adjacencies and an expression matrix, then derive modules.

    A ``ValueError`` raised by either loader is logged through ``LOGGER`` and
    the process exits with status 1.

    :param args: Object providing ``module_fname`` and
        ``expression_mtx_fname`` (each with a ``name`` attribute),
        ``transpose``, ``cell_id_attribute``, ``gene_attribute``,
        ``thresholds``, ``top_n_targets``, ``top_n_regulators``,
        ``min_genes``, ``mask_dropouts`` and ``all_modules``.
    :return: The result of ``modules_from_adjacencies``.
    """
    try:
        adjacencies = load_adjacencies(args.module_fname.name)
    except ValueError as err:
        LOGGER.error(err)
        sys.exit(1)

    LOGGER.info("Loading expression matrix.")
    try:
        ex_mtx = load_exp_matrix(
            args.expression_mtx_fname.name,
            args.transpose == 'yes',
            False,  # sparse loading is disabled here for now
            args.cell_id_attribute,
            args.gene_attribute,
        )
    except ValueError as err:
        LOGGER.error(err)
        sys.exit(1)

    # Only activating modules are kept unless all modules were requested.
    module_options = dict(
        thresholds=args.thresholds,
        top_n_targets=args.top_n_targets,
        top_n_regulators=args.top_n_regulators,
        min_genes=args.min_genes,
        rho_mask_dropouts=args.mask_dropouts,
        keep_only_activating=(args.all_modules != "yes"),
    )
    return modules_from_adjacencies(adjacencies, ex_mtx, **module_options)
示例#3
0
def _df2modules(args):
    """Read an adjacencies table and turn it into modules.

    The column separator is looked up in ``FILE_EXTENSION2SEPARATOR`` using
    the extension of ``args.module_fname.name``; the expression matrix comes
    from ``_load_expression_matrix(args)``.

    :param args: Object providing ``module_fname``, ``thresholds``,
        ``top_n_targets``, ``top_n_regulators``, ``min_genes`` and
        ``all_modules``.
    :return: The result of ``modules_from_adjacencies``.
    """
    adjacencies_path = args.module_fname.name
    _, extension = os.path.splitext(adjacencies_path)
    separator = FILE_EXTENSION2SEPARATOR[extension]

    adjacencies = pd.read_csv(adjacencies_path, sep=separator)
    ex_mtx = _load_expression_matrix(args)

    # Only activating modules are kept unless all modules were requested.
    activating_only = args.all_modules != "yes"
    return modules_from_adjacencies(
        adjacencies,
        ex_mtx,
        thresholds=args.thresholds,
        top_n_targets=args.top_n_targets,
        top_n_regulators=args.top_n_regulators,
        min_genes=args.min_genes,
        keep_only_activating=activating_only,
    )
示例#4
0
def _df2modules(args):
    """Read adjacencies and a tab-separated expression matrix; build modules.

    The adjacencies separator is looked up in ``FILE_EXTENSION2SEPARATOR``
    using the extension of ``args.module_fname.name``. The expression matrix
    is read from ``args.expression_mtx_fname`` with the first row as header
    and the first column as index.

    :param args: Object providing ``module_fname``, ``expression_mtx_fname``,
        ``nomenclature``, ``thresholds``, ``top_n_targets``,
        ``top_n_regulators`` and ``min_genes``.
    :return: The result of ``modules_from_adjacencies``.
    """
    adjacencies_path = args.module_fname.name
    extension = os.path.splitext(adjacencies_path)[1]
    adjacencies = pd.read_csv(adjacencies_path,
                              sep=FILE_EXTENSION2SEPARATOR[extension])

    expression_matrix = pd.read_csv(args.expression_mtx_fname,
                                    sep='\t',
                                    header=0,
                                    index_col=0)

    module_options = {
        'nomenclature': args.nomenclature,
        'thresholds': args.thresholds,
        'top_n_targets': args.top_n_targets,
        'top_n_regulators': args.top_n_regulators,
        'min_genes': args.min_genes,
    }
    return modules_from_adjacencies(adjacencies, expression_matrix,
                                    **module_options)
示例#5
0
    def run_regression(self):
        """
        Compute adjacencies and, when ``self.do_scenic`` is set, prune them.

        The adjacency function is chosen from ``ADJ_METHODS`` by
        ``self.adjacency_method``. It receives the dask client when
        ``MPControl.is_dask`` is true, otherwise the string ``'local'``.

        :return: The output of
            ``reprocess_scenic_output_to_inferelator_results`` when
            ``self.do_scenic`` is truthy, otherwise the output of
            ``reprocess_adj_to_inferelator_results``.
        """
        expression_df = self.data.to_df()

        utils.Debug.vprint(
            "Calculating {m} adjacencies".format(m=self.adjacency_method),
            level=0)

        # Pick the adjacency implementation by its configured name
        compute_adjacencies = ADJ_METHODS[self.adjacency_method]

        # Default to a local run; switch to the dask client when available
        client_or_address = 'local'
        if MPControl.is_dask:
            client_or_address = MPControl.client.client
            MPControl.client.check_cluster_state()

        adjacencies = compute_adjacencies(expression_df,
                                          tf_names=self.tf_names,
                                          verbose=True,
                                          client_or_address=client_or_address,
                                          seed=self.random_seed)

        # Without SCENIC the raw adjacencies are the final result
        if not self.do_scenic:
            return self.reprocess_adj_to_inferelator_results(adjacencies)

        # Convert adjacencies to modules
        modules = list(modules_from_adjacencies(adjacencies, expression_df))

        # Load the feather (rank) database
        ranking_db = RankingDatabase(fname=self._feather_rank_file,
                                     name="RANKING_PRIOR")

        utils.Debug.vprint("Pruning adjacencies with SCENIC", level=0)

        # Prune to a data frame
        pruned_df = prune2df([ranking_db],
                             modules,
                             self._motif_link_table_file,
                             client_or_address=client_or_address)

        return self.reprocess_scenic_output_to_inferelator_results(
            pruned_df, self.priors_data)
示例#6
0
def calcTFs(
        expr,
        tf_names,
        db,
        prefix,
        motif_path='../data/pySCENIC/ref/motifs-v9-nr.hgnc-m0.001-o0.0.tbl',
        out_path='../data/pySCENIC',
        ppn=8):
    """Compute regulons and transcription factor activity using pySCENIC.

    Arguments
    ---------
    expr: `pandas DataFrame`
        cell X gene raw counts; FPKM; not TPM as coexpression will be calculated
    tf_names: `list` (`str`)
        curated human transcription factors downloaded from github:
        pySCENIC/ref/hs_hgnc_curated_tfs.txt
    db: `list` (`FeatherRankingDatabase()`)
        feather files, ranking genome
        [FeatherRankingDatabase(name="hg38__refseq-r80__10kb_up_and_down_tss")]
    prefix: `str`
        Name used for the output files (eg, cell line names)
    motif_path: `str`
        Motif annotation table handed to `prune2df`
    out_path: `str`
        Directory the output files are written to
    ppn: `int` (default: `8`)
        Passed as `num_workers` to `prune2df` and `aucell`

    Returns
    -------
    Nothing; the results are written to disk (the calculation takes long):
    `<out_path>/<prefix>_motifs.csv` holds the pickled regulons and
    `<out_path>/<prefix>_auc_mtx.csv` holds the AUC matrix as CSV.
    """

    # Inference of co-expression modules
    adjacencies = grnboost2(expr, tf_names=tf_names, verbose=True)
    modules = list(modules_from_adjacencies(adjacencies, expr))

    # Calculate a list of enriched motifs and the corresponding target genes for all modules.
    with ProgressBar():
        df = prune2df(db, modules, motif_path, num_workers=ppn)

    # Create regulons from this table of enriched motifs.
    regulons = df2regulons(df)

    # Save the discovered regulons to disk.
    # NOTE: despite the ".csv" extension this file is a pickle of the
    # regulons; the motif table `df` itself is not written. The file name is
    # kept as-is so existing readers of this output continue to work.
    with open('{}/{}_motifs.csv'.format(out_path, prefix), "wb") as f:
        pickle.dump(regulons, f)

    # Score regulon activity per cell and store the matrix as CSV.
    auc_mtx = aucell(expr, regulons, num_workers=ppn)
    auc_mtx.to_csv('{}/{}_auc_mtx.csv'.format(out_path, prefix))

    print('finished calculation for %s' % (prefix))
		os.chdir(data_folder_iter)



		## Run GRNBoost2 (faster equivalent of GENIE3) from arboreto to infer co-expression modules
		## The network is cached on disk as a gzip-compressed CSV and reused when present.
		if not os.path.isfile(network_fname):
			adjacencies = grnboost2(data_train, tf_names=tf_names, verbose=True, client_or_address=custom_client, seed=i)
			adjacencies.to_csv(network_fname, sep=',', header=True, index=False, compression='gzip')
		else:
			# The cache is always written with gzip compression above, so request
			# gzip explicitly instead of relying on the file extension.
			adjacencies = pd.read_csv(network_fname, compression='gzip')



		## Derive potential regulons from co-expression modules
		## The module list is cached as a pickle; files are closed via `with`.
		if not os.path.isfile(modules_fname):
			modules = list(modules_from_adjacencies(adjacencies, data_train, keep_only_activating=False))
			with open(modules_fname, 'wb') as modules_fh:
				pickle.dump(modules, modules_fh)
		else:
			# NOTE: unpickling is only safe for cache files produced by this pipeline.
			with open(modules_fname, 'rb') as modules_fh:
				modules = pickle.load(modules_fh)
		
		# The adjacencies are not needed once the modules exist.
		del adjacencies



		## Prune modules for targets with cis regulatory footprints (aka RcisTarget)

		### Calculate a list of enriched motifs and the corresponding target genes for all modules.
		if not os.path.isfile(motifs_fname):
			df = prune2df(dbs, modules, motif_annotations, num_workers=n_cores)
			df.to_csv(motifs_fname)
		else:
示例#8
0
    # Glob pattern for the ranking database files. os.path.join is called
    # with a single argument, so the pattern is returned unchanged and files
    # are matched relative to the current working directory.
    databases_glob = os.path.join(
        "mm10__*.feather")  # pattern for the cisTarget ranking databases
    db_fnames = glob.glob(databases_glob)

    def name(fname):
        # Database name: the file's base name up to its first ".".
        return os.path.basename(fname).split(".")[0]

    # One RankingDatabase per matched file.
    dbs = [
        RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames
    ]

    # GENIE3 process: returns co-expression modules
    adjacencies = grnboost2(
        ex_matrix, tf_names=tf_names,
        verbose=True)  # runs improved GRNBoost instance of GENIE3
    modules = list(modules_from_adjacencies(
        adjacencies, ex_matrix))  # identifies modules from the adjacencies

    # save GRNBoost2 product so we don't have to repeat again
    adjacencies.to_csv("grnboost_output.csv")

    # load product in case something goes wrong
    # NOTE(review): `modules` was already built from the in-memory
    # adjacencies above; the frame re-read here is not used by the code
    # visible below.
    adjacencies = pd.read_csv("grnboost_output.csv", index_col=0)

    # cisTarget process: IDs cis-regulatory footprints from motifs around the TSS
    with ProgressBar(
    ):  # calculate a list of enriched motifs and the corresponding target genes for all modules
        df = prune2df(dbs, modules, "motifs-v9-nr-mgi.txt")
    regulons = df2regulons(
        df)  # create regulons from this table of enriched motifs

    # save the discovered motifs and regulons
示例#9
0
        RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames
    ]
    # Progress/debug output before the adjacency inference starts.
    print(dbs)
    print("running grnboost")
    print("tf_names head")
    # NOTE(review): the slice starts at index 1, so the first entry of
    # tf_names is not shown.
    print(tf_names[1:5])
    #print("gene names head")
    #print(ex_matrix.iloc[1:5,1:5])
    adjacencies = grnboost2(ex_matrix, tf_names=tf_names, verbose=True)
    # NOTE(review): the result of head() is discarded, so nothing is shown here.
    adjacencies.head()
    print("identify modules")
    # Write the adjacencies tab-separated, without header row or index column.
    adjacencies.to_csv(out_file, sep='\t', index=False, header=False)
    print("grnboost done")
    # Build the module list from the in-memory adjacencies, passing
    # rho_mask_dropouts=True.
    modules = list(
        modules_from_adjacencies(adjacencies,
                                 ex_matrix,
                                 rho_mask_dropouts=True))

    #print("writing modules")
    #with open(MODULES_FNAME, 'wb') as f:
    #	pickle.dump(modules, f)

    print("Finding Enriched modules")
    # Calculate a list of enriched motifs and the corresponding target genes for all modules.

    with ProgressBar():
        df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME)
        # NOTE(review): the result of head() is discarded here as well.
        df.head()

    # Create regulons from this table of enriched motifs.
    print("creating regulons")
示例#10
0
                            client_or_address=client)
    print("DEFINED adjacencies, type and head:")

    # Persist the adjacencies as a tab-separated file (index included).
    adjacencies.to_csv(ADJACENCIES_FNAME, sep='\t')

    # Load the adjacencies back from disk; the first column is the index.
    adjacencies = pd.read_csv(ADJACENCIES_FNAME,
                              sep='\t',
                              header=0,
                              index_col=0)
    print("READ IN adjacencies, type and  head:")

    print(type(adjacencies))
    print(adjacencies.head())

    modules = list(modules_from_adjacencies(adjacencies, ex_matrix))
    print("DEFINED modules, type:")

    # Write the modules to a file, both as a binary pickle and as text.
    with open(MODULES_BIN_FNAME, "wb") as f:
        pickle.dump(modules, f)

    # The context manager closes the text file even if the write fails.
    with open(MODULES_FNAME, "w") as modules_txt:
        modules_txt.write(str(modules))

    # Read the modules back from the pickle that was just written.
    with open(MODULES_BIN_FNAME, "rb") as f:
        modules = pickle.load(f)
    print("LOADED modules, type:")
示例#11
0
#3. ranking databases (only 2 mm10 dbs)
# Every '*.feather' file directly inside fd_db becomes one RankingDatabase.
l_fname=list(Path(fd_db).glob('*.feather'))
l_db=[RankingDatabase(fname=fname, name=name(fname)) for fname in l_fname]

#3. run
if __name__ =='__main__':
#	#1. Inference of co-expression modules
#	print('Inference...')
#	df_adj=grnboost2(df_cnt, tf_names=tf_name, verbose=True)
#	df_adj.to_csv(f'{fd_out}/adj.csv', index=False)
	
	#2. prune
	# Step 1 above is commented out, so the adjacencies are read from the
	# adj.csv file in fd_out instead of being recomputed.
	# NOTE(review): the trailing remark below is the original author's; it is
	# unclear from this excerpt what "missing" refers to.
	df_adj=pd.read_csv(f'{fd_out}/adj.csv')  #if missing, always stuck at 98%
	print('Prune...')
	l_mod=list(modules_from_adjacencies(df_adj, df_cnt))

	# Prune the modules and write the resulting table to prune.csv.
	with ProgressBar():
		df_prune = prune2df(l_db, l_mod, f_motif)
	df_prune.to_csv(f'{fd_out}/prune.csv')
	
	#3. create regulon
	print('Regulon...')
	regulon=df2regulons(df_prune)

	#4. Save the discovered regulons (pickled to regulon.pkl)
	with open(f'{fd_out}/regulon.pkl', "wb") as f:
		pickle.dump(regulon, f)
	
	#5. auc
	print('AUC...')