elif opts.slim_out == "all": only_direct = False else: p.print_help() sys.exit(1) # load DAGs go_dag = GODag(obo_file) goslim_dag = GODag(slim_obo_file) # in case a single term is given as input: if opts.term: if opts.term not in go_dag: print(("term %s not found!" % opts.term), file=sys.stderr) sys.exit(1) direct_anc, all_anc = mapslim(opts.term, go_dag, goslim_dag) # output either all or only direct slims, depending on user command if only_direct: slim_terms_str = ";".join(direct_anc) else: slim_terms_str = ";".join(all_anc) print(slim_terms_str) # in case a association file is given as input if opts.ass_file_name: assert os.path.exists(opts.ass_file_name), ("file %s not found!" % opts.ass_file_name) assocs = read_associations(opts.ass_file_name) for protein_product, go_terms in assocs.items(): all_direct_anc = set() all_covered_anc = set()
# Expected mapslim() output for each test term, stored as a pair of sets:
# (direct slim ancestors, all slim ancestors).
expected_results = {
    'GO:0000005': (
        {'GO:0000002', 'GO:0000003'},
        {'GO:0000001', 'GO:0000002', 'GO:0000003'},
    ),
    'GO:0000006': (
        {'GO:0000003'},
        {'GO:0000001', 'GO:0000003'},
    ),
    'GO:0000007': (
        {'GO:0000004'},
        {'GO:0000001', 'GO:0000003', 'GO:0000004'},
    ),
    'GO:0000008': (
        {'GO:0000003'},
        {'GO:0000001', 'GO:0000003'},
    ),
    'GO:0000009': (
        {'GO:0000004'},
        {'GO:0000001', 'GO:0000003', 'GO:0000004'},
    ),
    'GO:0000010': (
        {'GO:0000002', 'GO:0000003'},
        {'GO:0000001', 'GO:0000002', 'GO:0000003'},
    ),
}

# Map every test term and compare both returned sets with the expectation.
# A mismatch only clears the flag, so each term still gets its own report
# line on stderr.
tests_succeed = True
for go_term, (exp_direct, exp_all) in expected_results.items():
    sys.stderr.write("Testing for term '{}' ...\n".format(go_term))
    direct_anc, all_anc = mapslim(go_term, go_dag, goslim_dag)
    if direct_anc == exp_direct and all_anc == exp_all:
        sys.stderr.write("success!\n")
        continue
    tests_succeed = False
    sys.stderr.write("failed.\n")

# The exit status mirrors the overall outcome: 1 if any term failed, else 0.
if not tests_succeed:
    sys.stderr.write("[ERROR] At least one test failed.\n")
    sys.exit(1)
print("All test passed successfully!")
sys.exit(0)
# Per-protein accumulators: the GO accessions annotated to `protein`, every
# slim ancestor reached from them, and the slim ancestors that were reached
# only indirectly (i.e. not a direct slim ancestor of the term being mapped).
all_go_accs_in_a_protein = set()
all_goslim_anc_accs_in_a_protein = set()
all_goslim_covered_anc = set()

# A single 'GO Accession' cell may carry several accessions joined by '|';
# null cells are ignored.
go_accs = set(interpro_go.loc[interpro_go['Protein Accession'] == protein]['GO Accession'])
for go_acc in go_accs:
    if not pd.isnull(go_acc):
        all_go_accs_in_a_protein |= set(go_acc.split('|'))

if all_go_accs_in_a_protein:
    for go_term in all_go_accs_in_a_protein:
        # Accessions that `go` does not contain cannot be looked up; skip them.
        if go_term not in go:
            continue

        if USE_SLIM:
            direct_anc, all_anc = mapslim(go_term, go, goslim)
            all_goslim_anc_accs_in_a_protein |= all_anc
            all_goslim_covered_anc |= (all_anc - direct_anc)

        query_term = go.query_term(go_term)
        # DataFrame.append was removed in pandas 2.0; pd.concat with
        # ignore_index=True is the equivalent row-append.
        output_table = pd.concat(
            [
                output_table,
                pd.DataFrame({'Protein Accession': [protein],
                              'GO Category': [query_term.namespace],
                              'GO Accession': [go_term],
                              'GO Description': [query_term.name],
                              'GO Level': [query_term.level]}),
            ],
            ignore_index=True,
        )

    if USE_SLIM:
        if ONLY_DIRECT:
            # Keep only slim terms that were never merely "covered" by a more
            # specific slim term.
            all_goslim_direct_anc_accs_in_a_protein = all_goslim_anc_accs_in_a_protein - all_goslim_covered_anc
            for goslim_term in all_goslim_direct_anc_accs_in_a_protein:
                query_goslim_term = goslim.query_term(goslim_term)
                output_slim_table = pd.concat(
                    [
                        output_slim_table,
                        pd.DataFrame({'Protein Accession': [protein],
                                      'GO Category': [query_goslim_term.namespace],
                                      'GOSlim Accession': [goslim_term],
                                      'GOSlim Description': [query_goslim_term.name],
                                      'GOSlim Level': [query_goslim_term.level]}),
                    ],
                    ignore_index=True,
                )
        else:
            for goslim_term in all_goslim_anc_accs_in_a_protein:
                query_goslim_term = goslim.query_term(goslim_term)
from goatools.mapslim import mapslim
from collections import Counter

# Ontology files: go.obo plus two slim files (generic and metagenomics,
# judging by the file names).
go_dag = GODag('/home/gstupp/goatools/go.obo')
goslim_dag = GODag('/home/gstupp/goatools3/goslim_generic.obo')
goslim_meta = GODag('/home/gstupp/goatools3/goslim_metagenomics.obo')

# Parse the DTASelect results and run get_domains() on every parsed record.
ana_DTA = '/home/gstupp/01_2015_mass_spec/H1_11082014/1108_Gly1_2014_12_15_15_29205/dtaselect_results_sfp0.01_p2/DTASelect-filter.txt'
parser = blazmass_tools.dta_select_parser(ana_DTA, small=True)
ps = list(map(get_domains, parser))

# Union of the GO terms of every record whose 'set_go' is not None.
set_go = set(chain.from_iterable(
    rec['set_go'] for rec in ps if rec['set_go'] is not None
))

# Attach to each record the direct slim terms (element [0] of mapslim's result)
# of its GO terms, using the metagenomics slim. Records with no GO terms get
# None, and terms missing from go_dag are ignored.
for p in ps:
    if not p['set_go']:
        p['go_slim'] = None
        continue
    p['go_slim'] = set(chain.from_iterable(
        mapslim(go_term, go_dag, goslim_meta)[0]
        for go_term in p['set_go']
        if go_term in go_dag
    ))

# Count in how many records each slim term occurs.
go_slim = Counter(chain.from_iterable(
    rec['go_slim'] for rec in ps if rec['go_slim']
))

# Human-readable name for every counted slim term.
labels = {go_term: go_dag.query_term(go_term).name for go_term in go_slim}

# Bare expression: names of the ten most common slim terms. The value is
# discarded unless run interactively (presumably a notebook cell output).
[labels[term] for term, _ in go_slim.most_common(n=10)]

import plot_tools

# One jet-colormap colour per distinct slim term.
cmap = plt.cm.jet
colors = cmap(np.linspace(0., 1., len(go_slim)))