Пример #1
0
def test_filter_genes_only():
    """Check how many statements survive ``ac.filter_genes_only``.

    Each assertion feeds a small list of module-level fixture statements
    through the filter and compares the number of statements returned with
    the expected count.
    """
    def surviving(stmts, **filter_kwargs):
        # Number of statements left after the genes-only filter.
        return len(ac.filter_genes_only(stmts, **filter_kwargs))

    assert surviving([st1, st5]) == 2
    assert surviving([st6, st7]) == 0
    assert surviving([st4]) == 0
    assert surviving([st3], specific_only=True) == 0
def main(args):
    """Write the agent names of binary INDRA statements to a tab-separated file.

    Statements are loaded from ``args.infile``, run through family
    expansion, grounding mapping and preassembly, filtered down to grounded,
    specific genes (optionally also human-only and/or direct-only), and every
    remaining statement with exactly two non-missing agents is written as one
    row of agent names to ``args.outfile``.

    Parameters
    ----------
    args : object
        Must expose the attributes ``infile``, ``outfile``, ``human_only``
        and ``filter_direct``. Falsy ``infile``/``outfile`` values are
        replaced in place with the default paths below.
    """
    # This file takes about 32 GB to load
    if not args.infile:
        args.infile = './Data/indra_raw/bioexp_all_raw.pkl'
    if not args.outfile:
        args.outfile = './filtered_indra_network.sif'

    # Load statements from file
    stmts_raw = assemble_corpus.load_statements(args.infile)

    # Expand families, fix grounding errors and run preassembly
    stmts_fixed = assemble_corpus.run_preassembly(
                    assemble_corpus.map_grounding(
                        assemble_corpus.expand_families(stmts_raw)))

    # Default filtering: specific (unique) genes that are grounded.
    stmts_filtered = assemble_corpus.filter_grounded_only(
                         assemble_corpus.filter_genes_only(stmts_fixed, specific_only=True))
    # Custom filters
    if args.human_only:
        stmts_filtered = assemble_corpus.filter_human_only(stmts_filtered)
    if args.filter_direct:
        stmts_filtered = assemble_corpus.filter_direct(stmts_filtered)

    rows = []
    for stmt in stmts_filtered:
        # Fetch the agents once per statement instead of once per check.
        agents = stmt.agent_list()
        # Keep only binary statements in which BOTH agents are present;
        # a missing (None) agent has no name and would otherwise raise
        # AttributeError when the row is built.
        if len(agents) != 2 or any(ag is None for ag in agents):
            continue
        rows.append([ag.name for ag in agents])

    # Write rows to .sif file
    with open(args.outfile, 'w', newline='') as csvfile:
        csv.writer(csvfile, delimiter='\t').writerows(rows)
Пример #3
0
def get_indra_phos_stmts():
    """Fetch, process and pickle phosphorylation/dephosphorylation statements.

    Statements of type ``Phosphorylation`` and ``Dephosphorylation`` are
    obtained through ``by_gene_role_type`` and then passed, in order, through
    grounding mapping, family expansion, the grounded-only filter, sequence
    mapping, preassembly, the human-only filter and the specific-genes-only
    filter. The sequence-mapped, preassembled and final statement sets are
    each saved under ``sources/``.

    Returns
    -------
    The statements returned by the final genes-only filter.
    """
    sitemap_pkl = 'sources/indra_phos_sitemap.pkl'
    preassembled_pkl = 'sources/indra_phos_stmts_pre.pkl'
    final_pkl = 'sources/indra_phos_stmts.pkl'

    phos_stmts = by_gene_role_type(stmt_type='Phosphorylation')
    phos_stmts += by_gene_role_type(stmt_type='Dephosphorylation')

    # Grounding is mapped first; families are expanded before site mapping.
    phos_stmts = ac.map_sequence(
        ac.filter_grounded_only(
            ac.expand_families(
                ac.map_grounding(phos_stmts))))
    ac.dump_statements(phos_stmts, sitemap_pkl)

    phos_stmts = ac.run_preassembly(phos_stmts, poolsize=4,
                                    save=preassembled_pkl)

    phos_stmts = ac.filter_genes_only(ac.filter_human_only(phos_stmts),
                                      specific_only=True)
    ac.dump_statements(phos_stmts, final_pkl)
    return phos_stmts
Пример #4
0
def get_indra_expression():
    """Load the preassembled statements pickle and filter it.

    Reads ``indra_regulate_amount_pre.pkl``, applies the human-only and
    genes-only filters, and drops every statement whose first agent is
    ``None``.

    Returns
    -------
    list
        The statements that remain after filtering.
    """
    # NOTE: the disabled pipeline below is kept for reference; it appears to
    # be how the pickle loaded further down was originally produced.
    #inc_stmts = by_gene_role_type(stmt_type='IncreaseAmount')
    #dec_stmts = by_gene_role_type(stmt_type='DecreaseAmount')
    #stmts = inc_stmts + dec_stmts
    #ac.dump_statements(stmts, 'indra_regulate_amount_stmts.pkl')
    #stmts = ac.load_statements('indra_regulate_amount_stmts.pkl')
    #stmts = ac.map_grounding(stmts)
    # Expand families before site mapping
    #stmts = ac.expand_families(stmts)
    #stmts = ac.filter_grounded_only(stmts)
    #stmts = ac.map_sequence(stmts)
    #stmts = ac.run_preassembly(stmts, poolsize=4,
    #                           save='indra_regulate_amount_pre.pkl')
    preassembled = ac.load_statements('indra_regulate_amount_pre.pkl')
    gene_stmts = ac.filter_genes_only(ac.filter_human_only(preassembled))
    # Only statements with a first agent are kept.
    return [stmt for stmt in gene_stmts if stmt.agent_list()[0] is not None]
Пример #5
0
def regulons_from_stmts(stmts, filename):
    """Write kinase regulons derived from statements to a tab-separated file.

    The statements are restricted with the genes-only and human-only
    filters. For every remaining statement that carries both a residue and a
    position, the site ``<substrate>_<residue><position>`` is added to the
    set of sites of the statement's enzyme. One row is written per enzyme:
    the enzyme name, the literal ``Description`` and then its sites.

    Parameters
    ----------
    stmts : list
        Statements exposing ``enz``, ``sub``, ``residue`` and ``position``.
    filename : str
        Path of the output file (overwritten if it exists).
    """
    regulons = defaultdict(set)
    stmts = ac.filter_genes_only(stmts)
    stmts = ac.filter_human_only(stmts)
    for stmt in stmts:
        # A statement without an enzyme cannot be assigned to any regulon;
        # skip it instead of failing on ``None.name``.
        if stmt.enz is None:
            continue
        kinase = stmt.enz.name
        # Blacklist annoying stmts from NCI-PID
        if kinase in ('BRAF', 'RAF1') and \
           stmt.sub.name in ('MAPK1', 'MAPK3'):
            continue
        if stmt.residue and stmt.position:
            site = '%s_%s%s' % (stmt.sub.name, stmt.residue, stmt.position)
            regulons[kinase].add(site)
    rows = [[kinase, 'Description'] + list(sites)
            for kinase, sites in regulons.items()]
    # newline='' lets the csv module control line endings itself, which
    # avoids blank lines between rows on platforms that translate '\n'.
    with open(filename, 'wt', newline='') as f:
        csv.writer(f, delimiter='\t').writerows(rows)
Пример #6
0
def get_indra_reg_act_stmts():
    """Return regulation-of-activity statements, using a pickle cache.

    If ``sources/indra_reg_act_stmts.pkl`` can be loaded, its contents are
    returned directly. Otherwise ``Activation``, ``Inhibition`` and
    ``ActiveForm`` statements are fetched through ``by_gene_role_type``,
    passed through grounding mapping, the grounded-only filter, preassembly,
    the human-only filter and the specific-genes-only filter, saved to the
    cache file, and returned.

    Returns
    -------
    The cached statements, or the freshly built and filtered statements.
    """
    cache_path = 'sources/indra_reg_act_stmts.pkl'
    try:
        return ac.load_statements(cache_path)
    except Exception as err:
        # The cache is best-effort: any ordinary failure to load it falls
        # through to a rebuild. Catching Exception (not a bare except) keeps
        # KeyboardInterrupt/SystemExit working, and the reason is reported
        # instead of being silently discarded.
        print("Could not load %s (%s); rebuilding statements"
              % (cache_path, err))
    stmts = []
    for stmt_type in ('Activation', 'Inhibition', 'ActiveForm'):
        print("Getting %s statements from INDRA DB" % stmt_type)
        stmts += by_gene_role_type(stmt_type=stmt_type)
    stmts = ac.map_grounding(stmts, save='sources/indra_reg_act_gmap.pkl')
    stmts = ac.filter_grounded_only(stmts)
    stmts = ac.run_preassembly(stmts,
                               poolsize=4,
                               save='sources/indra_reg_act_pre.pkl')
    stmts = ac.filter_human_only(stmts)
    stmts = ac.filter_genes_only(stmts, specific_only=True)
    ac.dump_statements(stmts, cache_path)
    return stmts
Пример #7
0
def load_statements_from_synapse(synapse_id='syn11273504'):
    """Build Phosphorylation statements from a tab-separated Synapse file.

    Logs in to Synapse, fetches the entity ``synapse_id`` and reads the file
    at its local path. In each row the first column is split on ``:`` into a
    substrate name and a site string (first character = residue, remainder =
    position); the second column is split on ``,`` into enzyme names. One
    ``Phosphorylation`` statement is created per enzyme name, and the result
    is passed through sequence mapping, the human-only filter and the
    specific-genes-only filter.

    Parameters
    ----------
    synapse_id : str
        Identifier of the Synapse entity to download.

    Returns
    -------
    The statements returned by the final genes-only filter.
    """
    client = synapseclient.Synapse()
    client.login()
    # Obtain a pointer and download the data
    entity = client.get(synapse_id)

    phos_stmts = []
    for record in read_unicode_csv(entity.path, delimiter='\t'):
        substrate_name, site = record[0].split(':')
        residue, position = site[0], site[1:]
        for enzyme_name in record[1].split(','):
            # Fresh Agent objects are created for every statement.
            enzyme = Agent(enzyme_name, db_refs=get_ids(enzyme_name))
            substrate = Agent(substrate_name, db_refs=get_ids(substrate_name))
            phos_stmts.append(
                Phosphorylation(enzyme, substrate, residue, position))

    mapped = ac.map_sequence(phos_stmts)
    human_only = ac.filter_human_only(mapped)
    return ac.filter_genes_only(human_only, specific_only=True)
Пример #8
0
def test_filter_genes_only():
    """Check ``ac.filter_genes_only``, including the ``remove_bound`` option.

    The first group of assertions compares the number of surviving
    statements with the expected count. The second group checks that a
    statement with a bound condition is dropped unless ``remove_bound=True``
    is given, in which case the statement is kept and its substrate ends up
    with no bound conditions.
    """
    def genes_only(stmts, **filter_kwargs):
        # Thin wrapper so that every case reads as a single line.
        return ac.filter_genes_only(stmts, **filter_kwargs)

    assert len(genes_only([st1, st5])) == 2
    assert len(genes_only([st6, st7])) == 0
    assert len(genes_only([st4])) == 0
    assert len(genes_only([st3], specific_only=True)) == 0

    # Can we remove statements with non-gene bound conditions?
    # (remove_bound defaults to False, so both calls should agree.)
    assert len(genes_only([st18])) == 0
    assert len(genes_only([st18], remove_bound=False)) == 0

    # Can we remove non-gene bound conditions?
    # Work on a copy so the shared fixture keeps its bound condition.
    bound_stmt = deepcopy(st18)
    assert len(bound_stmt.sub.bound_conditions) == 1
    kept = genes_only([bound_stmt], remove_bound=True)
    assert len(kept[0].sub.bound_conditions) == 0
Пример #9
0
        #prior_stmts = build_prior(data_genes, pjoin(outf, 'prior.pkl'))
        prior_stmts = ac.load_statements(pjoin(outf, 'prior.pkl'))
        prior_stmts = ac.map_grounding(prior_stmts,
                                       save=pjoin(outf, 'gmapped_prior.pkl'))
        reach_stmts = ac.load_statements(pjoin(outf, 'phase3_stmts.pkl'))
        reach_stmts = ac.filter_no_hypothesis(reach_stmts)
        #extra_stmts = ac.load_statements(pjoin(outf, 'extra_stmts.pkl'))
        extra_stmts = read_extra_sources(pjoin(outf, 'extra_stmts.pkl'))
        reading_stmts = reach_stmts + extra_stmts
        reading_stmts = ac.map_grounding(reading_stmts,
                                         save=pjoin(outf,
                                                    'gmapped_reading.pkl'))
        stmts = prior_stmts + reading_stmts + extra_stmts

        stmts = ac.filter_grounded_only(stmts)
        stmts = ac.filter_genes_only(stmts, specific_only=False)
        stmts = ac.filter_human_only(stmts)
        stmts = ac.expand_families(stmts)
        stmts = ac.filter_gene_list(stmts, data_genes, 'one')
        stmts = ac.map_sequence(stmts, save=pjoin(outf, 'smapped.pkl'))
        #stmts = ac.load_statements(pjoin(outf, 'smapped.pkl'))
        stmts = ac.run_preassembly(stmts,
                                   return_toplevel=False,
                                   save=pjoin(outf, 'preassembled.pkl'),
                                   poolsize=4)

    ### PySB assembly
    if 'pysb' in assemble_models:
        pysb_model = assemble_pysb(stmts, data_genes,
                                   pjoin(outf, 'korkut_model_pysb.py'))
    ### SIF assembly
Пример #10
0
    reassemble = False
    if not reassemble:
        stmts = ac.load_statements(pjoin(outf, 'preassembled.pkl'))
        #stmts = ac.load_statements(pjoin(outf, 'prior.pkl'))
    else:
        #prior_stmts = build_prior(data_genes, pjoin(outf, 'prior.pkl'))
        prior_stmts = ac.load_statements(pjoin(outf, 'prior.pkl'))
        prior_stmts = ac.map_grounding(prior_stmts,
                                       save=pjoin(outf, 'gmapped_prior.pkl'))
        reading_stmts = ac.load_statements(pjoin(outf, 'phase3_stmts.pkl'))
        reading_stmts = ac.map_grounding(reading_stmts,
                                    save=pjoin(outf, 'gmapped_reading.pkl'))
        stmts = prior_stmts + reading_stmts

        stmts = ac.filter_grounded_only(stmts)
        stmts = ac.filter_genes_only(stmts, specific_only=False)
        stmts = ac.filter_human_only(stmts)
        stmts = ac.expand_families(stmts)
        stmts = ac.filter_gene_list(stmts, data_genes, 'one')
        stmts = ac.map_sequence(stmts, save=pjoin(outf, 'smapped.pkl'))
        stmts = ac.run_preassembly(stmts, return_toplevel=False,
                                   save=pjoin(outf, 'preassembled.pkl'))

    assemble_models = []
    assemble_models.append('sif')
    assemble_models.append('pysb')
    assemble_models.append('cx')

    ### PySB assembly
    if 'pysb' in assemble_models:
        pysb_model = assemble_pysb(stmts, data_genes,
Пример #11
0
def get_phosphosite_stmts():
    stmts = ac.load_statements('sources/phosphosite_stmts.pkl')
    stmts = ac.filter_human_only(stmts)
    stmts = ac.filter_genes_only(stmts, specific_only=True)
    return stmts
Пример #12
0
    default_prior_list = [t[0] for t in kin_ctr[0:200]]
    default_prior_filename = 'priors/indra_nkconf2_combined_default200.txt'
    save_default_prior(default_prior_list, default_prior_filename)
    syn_file = synapseclient.File(default_prior_filename, parent='syn11272284')
    syn.store(syn_file)

    import sys
    sys.exit()
    """
    with open('sources/stmt_cache.pkl', 'rb') as f:
        syn_stmts, omni_stmts, phos_stmts, indra_stmts = pickle.load(f)
    """

    #db_stmts = syn_stmts + omni_stmts + phos_stmts
    all_stmts = ac.filter_genes_only(all_stmts, specific_only=True)

    all_prior = to_prior(all_stmts)

    ext_prior_100 = add_regulators(reg_stmts, all_prior, max_features=100)
    save_prior(ext_prior_100, 'regulators_prior_100.txt')
    ext_prior_200 = add_regulators(reg_stmts, all_prior, max_features=200)
    save_prior(ext_prior_200, 'regulators_prior_200.txt')

    print("Phosphosite: %d of %d peptides" %
          (coverage(ov_sites, get_stmt_sites(phos_stmts)), len(ov_sites)))
    print("Phosphosite + NetworKIN: %d of %d peptides" %
          (coverage(ov_sites, get_stmt_sites(syn_stmts)), len(ov_sites)))
    print("Omnipath (incl. PSP, Signor, et al.): %d of %d peptides" %
          (coverage(ov_sites, get_stmt_sites(omni_stmts)), len(ov_sites)))
    print("REACH/INDRA: %d of %d peptides" %
    return pd.DataFrame(tf_df)


# Path of this script file; not referenced again in the visible code.
wd = __file__

# Load the pickled SIF table from ../input/sif.pkl.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# use this with a trusted input.
INDRA_SIF = os.path.join(os.pardir, 'input', 'sif.pkl')
with open(INDRA_SIF, 'rb') as fh:
    SIF = pickle.load(fh)

# Collect the statement hashes of all IncreaseAmount / DecreaseAmount rows.
# Presumably SIF is a pandas DataFrame (it exposes .columns and .iterrows())
# and rows are indexed here by column position -- TODO confirm.
n_stmt_type = list(SIF.columns).index('stmt_type')
n_stmt_hash = list(SIF.columns).index('stmt_hash')
hash_set = set()
for r, c in SIF.iterrows():
    if c[n_stmt_type] == 'IncreaseAmount' or c[n_stmt_type] == 'DecreaseAmount':
        hash_set.add(c[n_stmt_hash])

# NOTE(review): the call that would define `stmts` is commented out, and no
# other assignment to `stmts` is visible here; unless it is defined elsewhere,
# the next statement raises NameError. hash_set is also unused as a result.
#stmts = download_statements(hash_set)
indra_stmts = list(stmts.values())
# Save the unfiltered statements before any filtering is applied.
with open('../output/all_stmts.pkl', 'wb') as fh:
    pickle.dump(indra_stmts, fh)

# Apply the filters in sequence; the db-only set is derived from the
# already-filtered statements.
indra_stmts = filter_human_only(indra_stmts)
indra_stmts = filter_genes_only(indra_stmts)
indra_stmts = filter_transcription_factor(indra_stmts)
indra_stmts_db_only = filter_db_only(indra_stmts)

# Export both statement sets as CSV files under ../output/.
indra_stmts_df = make_dataframe(indra_stmts)
indra_stmts_df.to_csv('../output/indra_all_tf.csv')

indra_stmts_db_only_df = make_dataframe(indra_stmts_db_only)
indra_stmts_db_only_df.to_csv('../output/indra_db_only_tf.csv')
Пример #14
0
    return


if __name__ == "__main__":
    stmts = "../work/phospho_stmts.pkl"
    prize_outpath = "../work/pybel_prize.tsv"
    interactome_path = "../work/big_pybel_interactome2.tsv"
    site_file = "../work/gsea_sites.rnk"
    # Load the statements linking kinases/regulators to phospho sites
    # in the data
    stmts = ac.load_statements(stmts)

    # Employ filters to reduce network size
    stmts = ac.filter_grounded_only(stmts)
    stmts = ac.filter_human_only(stmts)
    stmts = ac.filter_genes_only(stmts)
    # In this data, statements of these two types will not act on
    # a short enough timescale to play a meaningful role
    stmts = ac.filter_by_type(stmts, DecreaseAmount, invert=True)
    stmts = ac.filter_by_type(stmts, IncreaseAmount, invert=True)
    stmts = ac.filter_by_type(stmts, Complex, invert=True)
    stmts = ac.filter_enzyme_kinase(stmts)

    # Assemble a pybel graph from statements
    pba = PybelAssembler(stmts)
    pb_graph = make_model(pba)

    signed_graph = to_signed_nodes(pb_graph)
    gn_dict = get_gene_node_dict(signed_graph)
    # Next we have to load the data file and assign values to