def test_filter_genes_only():
    """Check how many statements ``ac.filter_genes_only`` lets through.

    Each case lists the input statements, the keyword arguments passed to
    the filter, and the number of statements expected in the output.
    """
    cases = (
        ([st1, st5], {}, 2),
        ([st6, st7], {}, 0),
        ([st4], {}, 0),
        ([st3], {'specific_only': True}, 0),
    )
    for stmts_in, filter_kwargs, n_expected in cases:
        st_out = ac.filter_genes_only(stmts_in, **filter_kwargs)
        assert len(st_out) == n_expected
def main(args):
    """Filter a raw INDRA statement dump and write its binary edges as a SIF.

    The statements are loaded, family-expanded, grounding-mapped and
    preassembled, then restricted to grounded, specific genes. Optional
    filters are applied on request, and the agent names of every remaining
    two-agent statement are written as one tab-separated row each.

    Parameters
    ----------
    args : object
        Must expose the attributes ``infile``, ``outfile``, ``human_only``
        and ``filter_direct``. Falsy ``infile``/``outfile`` values are
        replaced in place by the default paths below.
    """
    # The default input file takes about 32 GB to load
    if not args.infile:
        args.infile = './Data/indra_raw/bioexp_all_raw.pkl'
    if not args.outfile:
        args.outfile = './filtered_indra_network.sif'

    # Load statements from file
    stmts_raw = assemble_corpus.load_statements(args.infile)

    # Expand families, fix grounding errors and run preassembly
    stmts_fixed = assemble_corpus.run_preassembly(
        assemble_corpus.map_grounding(
            assemble_corpus.expand_families(stmts_raw)))

    # Default filtering: specific (unique) genes that are grounded.
    stmts_filtered = assemble_corpus.filter_grounded_only(
        assemble_corpus.filter_genes_only(stmts_fixed, specific_only=True))

    # Custom filters
    if args.human_only:
        stmts_filtered = assemble_corpus.filter_human_only(stmts_filtered)
    if args.filter_direct:
        stmts_filtered = assemble_corpus.filter_direct(stmts_filtered)

    rows = []
    for stmt in stmts_filtered:
        agents = stmt.agent_list()
        # Keep binary statements only. Both agents must be present: a None
        # in either position has no name to write and would raise below.
        if len(agents) == 2 and all(ag is not None for ag in agents):
            rows.append([ag.name for ag in agents])

    # Write rows to .sif file
    with open(args.outfile, 'w', newline='') as csvfile:
        csv.writer(csvfile, delimiter='\t').writerows(rows)
def get_indra_phos_stmts():
    """Build and cache phosphorylation/dephosphorylation statements.

    Statements of both types are fetched with ``by_gene_role_type``, run
    through grounding mapping, family expansion, the grounded-only filter
    and sequence mapping, then dumped to 'sources/indra_phos_sitemap.pkl'.
    They are then preassembled, filtered to human and specific genes, and
    dumped to 'sources/indra_phos_stmts.pkl'.

    Returns
    -------
    The filtered statements, as returned by ``ac.filter_genes_only``.
    """
    stmts = by_gene_role_type(stmt_type='Phosphorylation')
    stmts += by_gene_role_type(stmt_type='Dephosphorylation')

    # Stages run in order; families are expanded before site mapping
    site_mapping_stages = (
        ac.map_grounding,
        ac.expand_families,
        ac.filter_grounded_only,
        ac.map_sequence,
    )
    for stage in site_mapping_stages:
        stmts = stage(stmts)
    ac.dump_statements(stmts, 'sources/indra_phos_sitemap.pkl')

    stmts = ac.run_preassembly(stmts, poolsize=4,
                               save='sources/indra_phos_stmts_pre.pkl')
    stmts = ac.filter_genes_only(ac.filter_human_only(stmts),
                                 specific_only=True)
    ac.dump_statements(stmts, 'sources/indra_phos_stmts.pkl')
    return stmts
def get_indra_expression():
    """Load preassembled amount-regulation statements and filter them.

    Statements are read from 'indra_regulate_amount_pre.pkl', restricted to
    human and gene agents, and those whose first agent is None are dropped.

    Returns
    -------
    list
        The remaining statements.
    """
    # Disabled steps kept for reference; presumably they produced the pickle
    # loaded below (the last one saves to the same file name):
    #inc_stmts = by_gene_role_type(stmt_type='IncreaseAmount')
    #dec_stmts = by_gene_role_type(stmt_type='DecreaseAmount')
    #stmts = inc_stmts + dec_stmts
    #ac.dump_statements(stmts, 'indra_regulate_amount_stmts.pkl')
    #stmts = ac.load_statements('indra_regulate_amount_stmts.pkl')
    #stmts = ac.map_grounding(stmts)
    # Expand families before site mapping
    #stmts = ac.expand_families(stmts)
    #stmts = ac.filter_grounded_only(stmts)
    #stmts = ac.map_sequence(stmts)
    #stmts = ac.run_preassembly(stmts, poolsize=4,
    #                           save='indra_regulate_amount_pre.pkl')
    candidates = ac.filter_genes_only(
        ac.filter_human_only(
            ac.load_statements('indra_regulate_amount_pre.pkl')))
    with_first_agent = []
    for stmt in candidates:
        if stmt.agent_list()[0] is not None:
            with_first_agent.append(stmt)
    return with_first_agent
def regulons_from_stmts(stmts, filename):
    """Write kinase regulons derived from statements to a tab-separated file.

    The statements are filtered to gene and human agents. For every
    remaining statement that has an enzyme, a residue and a position, the
    site ``<substrate>_<residue><position>`` is added to the regulon of the
    enzyme. Each output row is ``kinase, 'Description', site, site, ...``.

    Parameters
    ----------
    stmts : list
        Statements exposing ``enz``, ``sub``, ``residue`` and ``position``.
    filename : str
        Path of the file to write.
    """
    regulons = defaultdict(set)
    stmts = ac.filter_genes_only(stmts)
    stmts = ac.filter_human_only(stmts)
    for stmt in stmts:
        # A statement without an enzyme cannot be assigned to any regulon
        if stmt.enz is None:
            continue
        kinase = stmt.enz.name
        # Blacklist annoying stmts from NCI-PID
        if kinase in ('BRAF', 'RAF1') and \
                stmt.sub.name in ('MAPK1', 'MAPK3'):
            continue
        if stmt.residue and stmt.position:
            site = '%s_%s%s' % (stmt.sub.name, stmt.residue, stmt.position)
            regulons[kinase].add(site)
    # Set order is arbitrary, so sort the sites to make the file reproducible
    rows = [[kinase, 'Description'] + sorted(sites)
            for kinase, sites in regulons.items()]
    # newline='' lets the csv module control line endings itself
    with open(filename, 'wt', newline='') as f:
        csv.writer(f, delimiter='\t').writerows(rows)
def get_indra_reg_act_stmts():
    """Return regulation-of-activity statements, using a pickle cache.

    If 'sources/indra_reg_act_stmts.pkl' can be loaded, its content is
    returned directly. Otherwise 'Activation', 'Inhibition' and 'ActiveForm'
    statements are fetched with ``by_gene_role_type``, grounding-mapped,
    filtered to grounded agents, preassembled, filtered to human and
    specific genes, dumped to the cache file and returned.

    Returns
    -------
    The cached or freshly assembled statements.
    """
    try:
        return ac.load_statements('sources/indra_reg_act_stmts.pkl')
    except Exception:
        # Best effort: any failure to read the cache means "rebuild it".
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        pass

    stmts = []
    for stmt_type in ('Activation', 'Inhibition', 'ActiveForm'):
        print("Getting %s statements from INDRA DB" % stmt_type)
        stmts += by_gene_role_type(stmt_type=stmt_type)
    stmts = ac.map_grounding(stmts, save='sources/indra_reg_act_gmap.pkl')
    stmts = ac.filter_grounded_only(stmts)
    stmts = ac.run_preassembly(stmts, poolsize=4,
                               save='sources/indra_reg_act_pre.pkl')
    stmts = ac.filter_human_only(stmts)
    stmts = ac.filter_genes_only(stmts, specific_only=True)
    ac.dump_statements(stmts, 'sources/indra_reg_act_stmts.pkl')
    return stmts
def load_statements_from_synapse(synapse_id='syn11273504'):
    """Build Phosphorylation statements from a tab-separated Synapse file.

    The first column of each row is split on ':' into a substrate name and
    a site, whose first character is the residue and whose remainder is the
    position. The second column is a comma-separated list of enzyme names.
    One statement is created per enzyme.

    Parameters
    ----------
    synapse_id : str
        Synapse identifier of the file to download.

    Returns
    -------
    The statements after sequence mapping and the human-only and
    specific-genes-only filters.
    """
    client = synapseclient.Synapse()
    client.login()
    # Obtain a pointer and download the data
    entity = client.get(synapse_id)

    stmts = []
    for record in read_unicode_csv(entity.path, delimiter='\t'):
        substrate_name, site = record[0].split(':')
        residue, position = site[0], site[1:]
        for kinase_name in record[1].split(','):
            # Fresh Agent objects are created for every statement
            enz = Agent(kinase_name, db_refs=get_ids(kinase_name))
            sub = Agent(substrate_name, db_refs=get_ids(substrate_name))
            stmts.append(Phosphorylation(enz, sub, residue, position))

    return ac.filter_genes_only(
        ac.filter_human_only(ac.map_sequence(stmts)),
        specific_only=True)
def test_filter_genes_only():
    """Exercise ``ac.filter_genes_only``, including bound-condition handling."""

    def n_kept(stmts, **filter_kwargs):
        # Number of statements that pass the filter
        return len(ac.filter_genes_only(stmts, **filter_kwargs))

    assert n_kept([st1, st5]) == 2
    assert n_kept([st6, st7]) == 0
    assert n_kept([st4]) == 0
    assert n_kept([st3], specific_only=True) == 0

    # Can we remove statements with non-gene bound conditions?
    assert n_kept([st18]) == 0  # remove_bound defaults to False
    assert n_kept([st18], remove_bound=False) == 0

    # Can we remove non-gene bound conditions?
    st18_copy = deepcopy(st18)
    assert len(st18_copy.sub.bound_conditions) == 1
    filtered = ac.filter_genes_only([st18_copy], remove_bound=True)
    assert len(filtered[0].sub.bound_conditions) == 0
# Assemble the statement corpus: load the prior and the reading output, map
# groundings, filter, expand families, map sequences and run preassembly.
# Intermediate results are saved as pickles under `outf`.
#prior_stmts = build_prior(data_genes, pjoin(outf, 'prior.pkl'))
prior_stmts = ac.load_statements(pjoin(outf, 'prior.pkl'))
prior_stmts = ac.map_grounding(prior_stmts,
                               save=pjoin(outf, 'gmapped_prior.pkl'))
reach_stmts = ac.load_statements(pjoin(outf, 'phase3_stmts.pkl'))
reach_stmts = ac.filter_no_hypothesis(reach_stmts)
#extra_stmts = ac.load_statements(pjoin(outf, 'extra_stmts.pkl'))
extra_stmts = read_extra_sources(pjoin(outf, 'extra_stmts.pkl'))
reading_stmts = reach_stmts + extra_stmts
reading_stmts = ac.map_grounding(reading_stmts,
                                 save=pjoin(outf, 'gmapped_reading.pkl'))
# extra_stmts is already part of reading_stmts (merged in before grounding
# mapping), so it is not added a second time here; doing so would feed an
# unmapped duplicate of every extra statement into the steps below.
stmts = prior_stmts + reading_stmts
stmts = ac.filter_grounded_only(stmts)
stmts = ac.filter_genes_only(stmts, specific_only=False)
stmts = ac.filter_human_only(stmts)
stmts = ac.expand_families(stmts)
stmts = ac.filter_gene_list(stmts, data_genes, 'one')
stmts = ac.map_sequence(stmts, save=pjoin(outf, 'smapped.pkl'))
#stmts = ac.load_statements(pjoin(outf, 'smapped.pkl'))
stmts = ac.run_preassembly(stmts, return_toplevel=False,
                           save=pjoin(outf, 'preassembled.pkl'),
                           poolsize=4)

### PySB assembly
if 'pysb' in assemble_models:
    pysb_model = assemble_pysb(stmts, data_genes,
                               pjoin(outf, 'korkut_model_pysb.py'))
### SIF assembly
reassemble = False if not reassemble: stmts = ac.load_statements(pjoin(outf, 'preassembled.pkl')) #stmts = ac.load_statements(pjoin(outf, 'prior.pkl')) else: #prior_stmts = build_prior(data_genes, pjoin(outf, 'prior.pkl')) prior_stmts = ac.load_statements(pjoin(outf, 'prior.pkl')) prior_stmts = ac.map_grounding(prior_stmts, save=pjoin(outf, 'gmapped_prior.pkl')) reading_stmts = ac.load_statements(pjoin(outf, 'phase3_stmts.pkl')) reading_stmts = ac.map_grounding(reading_stmts, save=pjoin(outf, 'gmapped_reading.pkl')) stmts = prior_stmts + reading_stmts stmts = ac.filter_grounded_only(stmts) stmts = ac.filter_genes_only(stmts, specific_only=False) stmts = ac.filter_human_only(stmts) stmts = ac.expand_families(stmts) stmts = ac.filter_gene_list(stmts, data_genes, 'one') stmts = ac.map_sequence(stmts, save=pjoin(outf, 'smapped.pkl')) stmts = ac.run_preassembly(stmts, return_toplevel=False, save=pjoin(outf, 'preassembled.pkl')) assemble_models = [] assemble_models.append('sif') assemble_models.append('pysb') assemble_models.append('cx') ### PySB assembly if 'pysb' in assemble_models: pysb_model = assemble_pysb(stmts, data_genes,
def get_phosphosite_stmts():
    """Load the statements pickled in 'sources/phosphosite_stmts.pkl'.

    Returns
    -------
    The loaded statements after the human-only filter and the
    specific-genes-only filter.
    """
    loaded = ac.load_statements('sources/phosphosite_stmts.pkl')
    human_only = ac.filter_human_only(loaded)
    return ac.filter_genes_only(human_only, specific_only=True)
default_prior_list = [t[0] for t in kin_ctr[0:200]] default_prior_filename = 'priors/indra_nkconf2_combined_default200.txt' save_default_prior(default_prior_list, default_prior_filename) syn_file = synapseclient.File(default_prior_filename, parent='syn11272284') syn.store(syn_file) import sys sys.exit() """ with open('sources/stmt_cache.pkl', 'rb') as f: syn_stmts, omni_stmts, phos_stmts, indra_stmts = pickle.load(f) """ #db_stmts = syn_stmts + omni_stmts + phos_stmts all_stmts = ac.filter_genes_only(all_stmts, specific_only=True) all_prior = to_prior(all_stmts) ext_prior_100 = add_regulators(reg_stmts, all_prior, max_features=100) save_prior(ext_prior_100, 'regulators_prior_100.txt') ext_prior_200 = add_regulators(reg_stmts, all_prior, max_features=200) save_prior(ext_prior_200, 'regulators_prior_200.txt') print("Phosphosite: %d of %d peptides" % (coverage(ov_sites, get_stmt_sites(phos_stmts)), len(ov_sites))) print("Phosphosite + NetworKIN: %d of %d peptides" % (coverage(ov_sites, get_stmt_sites(syn_stmts)), len(ov_sites))) print("Omnipath (incl. PSP, Signor, et al.): %d of %d peptides" % (coverage(ov_sites, get_stmt_sites(omni_stmts)), len(ov_sites))) print("REACH/INDRA: %d of %d peptides" %
return pd.DataFrame(tf_df) wd = __file__ INDRA_SIF = os.path.join(os.pardir, 'input', 'sif.pkl') with open(INDRA_SIF, 'rb') as fh: SIF = pickle.load(fh) n_stmt_type = list(SIF.columns).index('stmt_type') n_stmt_hash = list(SIF.columns).index('stmt_hash') hash_set = set() for r, c in SIF.iterrows(): if c[n_stmt_type] == 'IncreaseAmount' or c[n_stmt_type] == 'DecreaseAmount': hash_set.add(c[n_stmt_hash]) #stmts = download_statements(hash_set) indra_stmts = list(stmts.values()) with open('../output/all_stmts.pkl', 'wb') as fh: pickle.dump(indra_stmts, fh) indra_stmts = filter_human_only(indra_stmts) indra_stmts = filter_genes_only(indra_stmts) indra_stmts = filter_transcription_factor(indra_stmts) indra_stmts_db_only = filter_db_only(indra_stmts) indra_stmts_df = make_dataframe(indra_stmts) indra_stmts_df.to_csv('../output/indra_all_tf.csv') indra_stmts_db_only_df = make_dataframe(indra_stmts_db_only) indra_stmts_db_only_df.to_csv('../output/indra_db_only_tf.csv')
return if __name__ == "__main__": stmts = "../work/phospho_stmts.pkl" prize_outpath = "../work/pybel_prize.tsv" interactome_path = "../work/big_pybel_interactome2.tsv" site_file = "../work/gsea_sites.rnk" # Load the statements linking kinases/regulators to phospho sites # in the data stmts = ac.load_statements(stmts) # Employ filters to reduce network size stmts = ac.filter_grounded_only(stmts) stmts = ac.filter_human_only(stmts) stmts = ac.filter_genes_only(stmts) # In this data, statements of these two types will not act on # a short enough timescale to play a meaningful role stmts = ac.filter_by_type(stmts, DecreaseAmount, invert=True) stmts = ac.filter_by_type(stmts, IncreaseAmount, invert=True) stmts = ac.filter_by_type(stmts, Complex, invert=True) stmts = ac.filter_enzyme_kinase(stmts) # Assemble a pybel graph from statements pba = PybelAssembler(stmts) pb_graph = make_model(pba) signed_graph = to_signed_nodes(pb_graph) gn_dict = get_gene_node_dict(signed_graph) # Next we have to load the data file and assign values to