def test_combine_duplicates():
    raf = Agent('RAF1')
    mek = Agent('MEK1')
    erk = Agent('ERK2')
    p1 = Phosphorylation(raf, mek, evidence=Evidence(text='foo'))
    p2 = Phosphorylation(raf, mek, evidence=Evidence(text='bar'))
    p3 = Phosphorylation(raf, mek, evidence=Evidence(text='baz'))
    p4 = Phosphorylation(raf, mek, evidence=Evidence(text='beep'))
    p5 = Phosphorylation(mek, erk, evidence=Evidence(text='foo'))
    p6 = Dephosphorylation(mek, erk, evidence=Evidence(text='bar'))
    p7 = Dephosphorylation(mek, erk, evidence=Evidence(text='baz'))
    p8 = Dephosphorylation(mek, erk, evidence=Evidence(text='beep'))
    p9 = Dephosphorylation(Agent('SRC'), Agent('KRAS'),
                           evidence=Evidence(text='beep'))
    stmts = [p1, p2, p3, p4, p5, p6, p7, p8, p9]
    pa = Preassembler(hierarchies, stmts=stmts)
    pa.combine_duplicates()
    # The statements come out sorted by their matches_key
    assert len(pa.unique_stmts) == 4
    assert pa.unique_stmts[0].matches(p6)  # MEK dephos ERK
    assert len(pa.unique_stmts[0].evidence) == 3
    assert pa.unique_stmts[1].matches(p9)  # SRC dephos KRAS
    assert len(pa.unique_stmts[1].evidence) == 1
    assert pa.unique_stmts[2].matches(p5)  # MEK phos ERK
    assert len(pa.unique_stmts[2].evidence) == 1
    assert pa.unique_stmts[3].matches(p1)  # RAF phos MEK
    assert len(pa.unique_stmts[3].evidence) == 4
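# A minimal sketch of the grouping criterion that combine_duplicates
# relies on: Statements with equal matches_key() values (same statement
# type and normalized agents, ignoring evidence) are merged into one
# unique Statement. This uses only the public Statement API; the agents
# are illustrative.
s1 = Phosphorylation(Agent('RAF1'), Agent('MEK1'))
s2 = Phosphorylation(Agent('RAF1'), Agent('MEK1'))
s3 = Dephosphorylation(Agent('RAF1'), Agent('MEK1'))
assert s1.matches(s2) and s1.matches_key() == s2.matches_key()
assert not s1.matches(s3)  # different statement type, kept separate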
def test_complex_refinement_order():
    st1 = Complex([Agent('MED23'), Agent('ELK1')])
    st2 = Complex([Agent('ELK1', mods=[ModCondition('phosphorylation')]),
                   Agent('MED23')])
    pa = Preassembler(hierarchies, stmts=[st1, st2])
    pa.combine_duplicates()
    pa.combine_related()
    assert len(pa.related_stmts) == 1
def test_duplicates():
    src = Agent('SRC', db_refs={'HGNC': '11283'})
    ras = Agent('RAS', db_refs={'FA': '03663'})
    st1 = Phosphorylation(src, ras)
    st2 = Phosphorylation(src, ras)
    pa = Preassembler(hierarchies, stmts=[st1, st2])
    pa.combine_duplicates()
    assert len(pa.unique_stmts) == 1
def analyze(filename, plot=False):
    # Load the file
    results = load_file(filename)
    # Put together a list of all statements
    all_stmts = [stmt for paper_stmts in results.values()
                 for stmt in paper_stmts]
    # Map grounding
    logger.info('Mapping grounding...')
    gmap = gm.GroundingMapper(gm.default_grounding_map)
    map_stmts = gmap.map_agents(all_stmts)
    map_stmts = gmap.rename_agents(map_stmts)
    # Combine duplicates
    logger.info('Removing duplicates...')
    pa = Preassembler(hierarchies, map_stmts)
    pa.combine_duplicates()
    # Map GO IDs to genes and associated statements
    logger.info('Building map from GO IDs to stmts')
    go_gene_map = {}
    go_name_map = {}
    for stmt in pa.unique_stmts:
        (bp_name, go, gene) = go_gene_pair(stmt)
        if bp_name is None and go is None and gene is None:
            continue
        go_gene_list = go_gene_map.get(go, [])
        go_gene_list.append((gene, stmt))
        go_gene_map[go] = go_gene_list
        go_name_set = go_name_map.get(go, set([]))
        go_name_set.add(bp_name)
        go_name_map[go] = go_name_set
    # Iterate over all of the GO IDs and compare the annotated genes in GO
    # to the ones from the given statements
    go_stmt_map = {}
    for ix, go_id in enumerate(go_gene_map.keys()):
        logger.info('Getting genes for %s (%s) from GO (%d of %d)' %
                    (go_id, ','.join(list(go_name_map[go_id])),
                     ix + 1, len(go_gene_map.keys())))
        genes_from_go = get_genes_for_go_id(go_id)
        gene_stmt_list = go_gene_map[go_id]
        in_go = []
        not_in_go = []
        for (gene, stmt) in gene_stmt_list:
            if gene in genes_from_go:
                in_go.append(stmt)
            else:
                not_in_go.append(stmt)
        go_stmt_map[go_id] = {'names': list(go_name_map[go_id]),
                              'in_go': in_go, 'not_in_go': not_in_go}
    with open('go_stmt_map.pkl', 'wb') as f:
        pickle.dump(go_stmt_map, f, protocol=2)
    if plot:
        plot_stmt_counts(go_stmt_map, 'go_stmts.pdf')
def test_agent_text_storage():
    A1 = Agent('A', db_refs={'TEXT': 'A'})
    A2 = Agent('A', db_refs={'TEXT': 'alpha'})
    B1 = Agent('B', db_refs={'TEXT': 'bag'})
    B2 = Agent('B', db_refs={'TEXT': 'bug'})
    C = Agent('C')
    D = Agent('D')
    inp = [
        Complex([A1, B1], evidence=Evidence(text='A complex bag.')),
        Complex([B2, A2], evidence=Evidence(text='bug complex alpha once.')),
        Complex([B2, A2], evidence=Evidence(text='bug complex alpha again.')),
        Complex([A1, C, B2], evidence=Evidence(text='A complex C bug.')),
        Phosphorylation(A1, B1, evidence=Evidence(text='A phospo bags.')),
        Phosphorylation(A2, B2, evidence=Evidence(text='alpha phospho bugs.')),
        Conversion(D, [A1, B1], [C, D],
                   evidence=Evidence(text='D: A bag -> C D')),
        Conversion(D, [B1, A2], [C, D],
                   evidence=Evidence(text='D: bag a -> C D')),
        Conversion(D, [B2, A2], [D, C],
                   evidence=Evidence(text='D: bug a -> D C')),
        Conversion(D, [B1, A1], [C, D],
                   evidence=Evidence(text='D: bag A -> C D')),
        Conversion(D, [A1], [A1, C],
                   evidence=Evidence(text='D: A -> A C'))
    ]
    pa = Preassembler(hierarchies, inp)
    unq1 = pa.combine_duplicates()
    assert len(unq1) == 5, len(unq1)
    assert all([len(ev.annotations['prior_uuids']) == 1
                for s in unq1 for ev in s.evidence
                if len(s.evidence) > 1]),\
        'There can only be one prior evidence per uuid at this stage.'
    ev_uuid_dict = {ev.annotations['prior_uuids'][0]: ev.annotations['agents']
                    for s in unq1 for ev in s.evidence}
    for s in inp:
        raw_text = [ag.db_refs.get('TEXT')
                    for ag in s.agent_list(deep_sorted=True)]
        assert raw_text == ev_uuid_dict[s.uuid]['raw_text'],\
            str(raw_text) + '!=' + str(ev_uuid_dict[s.uuid]['raw_text'])

    # Now run pa on the above corpus plus another statement.
    inp2 = unq1 + [
        Complex([A1, C, B1], evidence=Evidence(text='A complex C bag.'))
    ]
    pa2 = Preassembler(hierarchies, inp2)
    unq2 = pa2.combine_duplicates()
    assert len(unq2) == 5, len(unq2)
    old_ev_list = []
    new_ev = None
    for s in unq2:
        for ev in s.evidence:
            if ev.text == inp2[-1].evidence[0].text:
                new_ev = ev
            else:
                old_ev_list.append(ev)
    assert all([len(ev.annotations['prior_uuids']) == 2
                for ev in old_ev_list])
    assert new_ev
    assert len(new_ev.annotations['prior_uuids']) == 1
def test_homodimer_refinement():
    egfr = Agent('EGFR')
    erbb = Agent('ERBB2')
    st1 = Complex([erbb, erbb])
    st2 = Complex([erbb, egfr])
    pa = Preassembler(hierarchies, stmts=[st1, st2])
    pa.combine_duplicates()
    assert len(pa.unique_stmts) == 2
    pa.combine_related()
    assert len(pa.related_stmts) == 2
def test_duplicates_copy():
    src = Agent('SRC', db_refs={'HGNC': '11283'})
    ras = Agent('RAS', db_refs={'FA': '03663'})
    st1 = Phosphorylation(src, ras, evidence=[Evidence(text='Text 1')])
    st2 = Phosphorylation(src, ras, evidence=[Evidence(text='Text 2')])
    stmts = [st1, st2]
    pa = Preassembler(hierarchies, stmts=stmts)
    pa.combine_duplicates()
    assert len(pa.unique_stmts) == 1
    assert len(stmts) == 2
    assert len(stmts[0].evidence) == 1
    assert len(stmts[1].evidence) == 1
def test_flatten_stmts():
    st1 = Phosphorylation(Agent('MAP3K5'), Agent('RAF1'), 'S', '338')
    st2 = Phosphorylation(None, Agent('RAF1'), 'S', '338')
    st3 = Phosphorylation(None, Agent('RAF1'))
    st4 = Phosphorylation(Agent('PAK1'), Agent('RAF1'), 'S', '338')
    st5 = Phosphorylation(None, Agent('RAF1'), evidence=Evidence(text='foo'))
    pa = Preassembler(hierarchies, stmts=[st1, st2, st3, st4, st5])
    pa.combine_duplicates()
    pa.combine_related()
    assert len(pa.related_stmts) == 2
    assert len(flatten_stmts(pa.unique_stmts)) == 4
    assert len(flatten_stmts(pa.related_stmts)) == 4
def test_duplicates_copy():
    src = Agent('SRC', db_refs={'HGNC': '11283'})
    ras = Agent('RAS', db_refs={'FA': '03663'})
    st1 = Phosphorylation(src, ras, evidence=[Evidence(text='Text 1')])
    st2 = Phosphorylation(src, ras, evidence=[Evidence(text='Text 2')])
    stmts = [st1, st2]
    pa = Preassembler(bio_ontology, stmts=stmts)
    pa.combine_duplicates()
    assert len(pa.unique_stmts) == 1
    assert len(stmts) == 2
    assert len(stmts[0].evidence) == 1
    assert len(stmts[1].evidence) == 1
def test_activation_refinement():
    subj = Agent('alcohol', db_refs={'CHEBI': 'CHEBI:16236',
                                     'HMDB': 'HMDB00108',
                                     'PUBCHEM': '702',
                                     'TEXT': 'alcohol'})
    obj = Agent('endotoxin', db_refs={'TEXT': 'endotoxin'})
    st1 = Inhibition(subj, obj)
    st2 = Activation(subj, obj)
    pa = Preassembler(hierarchies, stmts=[st1, st2])
    pa.combine_duplicates()
    assert len(pa.unique_stmts) == 2
    pa.combine_related()
    assert len(pa.related_stmts) == 2
def test_combine_evidence_exact_duplicates():
    raf = Agent('RAF1')
    mek = Agent('MEK1')
    p1 = Phosphorylation(raf, mek, evidence=Evidence(text='foo'))
    p2 = Phosphorylation(raf, mek, evidence=Evidence(text='bar'))
    p3 = Phosphorylation(raf, mek, evidence=Evidence(text='bar'))
    stmts = [p1, p2, p3]
    pa = Preassembler(bio_ontology, stmts=stmts)
    pa.combine_duplicates()
    # The statements come out sorted by their matches_key
    assert len(pa.unique_stmts) == 1
    assert len(pa.unique_stmts[0].evidence) == 2
    assert set(ev.text for ev in pa.unique_stmts[0].evidence) == \
        set(['foo', 'bar'])
def test_combine_evidence_exact_duplicates_different_raw_text():
    raf1 = Agent('RAF1', db_refs={'TEXT': 'Raf'})
    raf2 = Agent('RAF1', db_refs={'TEXT': 'RAF'})
    mek = Agent('MEK1')
    p1 = Phosphorylation(raf1, mek, evidence=Evidence(text='foo'))
    p2 = Phosphorylation(raf1, mek, evidence=Evidence(text='bar'))
    p3 = Phosphorylation(raf2, mek, evidence=Evidence(text='bar'))
    stmts = [p1, p2, p3]
    pa = Preassembler(hierarchies, stmts=stmts)
    pa.combine_duplicates()
    # The statements come out sorted by their matches_key
    assert len(pa.unique_stmts) == 1
    assert len(pa.unique_stmts[0].evidence) == 3
    assert set(ev.text for ev in pa.unique_stmts[0].evidence) == \
        set(['foo', 'bar', 'bar'])
def test_duplicates_sorting():
    mc = ModCondition('phosphorylation')
    map2k1_1 = Agent('MAP2K1', mods=[mc])
    mc1 = ModCondition('phosphorylation', 'serine', '218')
    mc2 = ModCondition('phosphorylation', 'serine', '222')
    mc3 = ModCondition('phosphorylation', 'serine', '298')
    map2k1_2 = Agent('MAP2K1', mods=[mc1, mc2, mc3])
    mapk3 = Agent('MAPK3')
    st1 = Phosphorylation(map2k1_1, mapk3, position='218')
    st2 = Phosphorylation(map2k1_2, mapk3)
    st3 = Phosphorylation(map2k1_1, mapk3, position='218')
    stmts = [st1, st2, st3]
    pa = Preassembler(bio_ontology, stmts=stmts)
    pa.combine_duplicates()
    assert len(pa.unique_stmts) == 2
def test_event_assemble_location():
    rainfall = Concept('rainfall')
    loc1 = RefContext(name='x', db_refs={'GEOID': '1'})
    loc2 = RefContext(name='x', db_refs={'GEOID': '2'})
    ev1 = Event(rainfall, context=WorldContext(geo_location=loc1))
    ev2 = Event(rainfall, context=WorldContext(geo_location=loc2))

    pa = Preassembler(hierarchies=hierarchies, stmts=[ev1, ev2],
                      matches_fun=None)
    unique_stmts = pa.combine_duplicates()
    assert len(unique_stmts) == 1

    pa = Preassembler(hierarchies=hierarchies, stmts=[ev1, ev2],
                      matches_fun=location_matches)
    unique_stmts = pa.combine_duplicates()
    assert len(unique_stmts) == 2
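# location_matches above is assumed to be imported from INDRA's custom
# preassembly helpers. A hypothetical sketch of such a function (the real
# helper may differ) extends the default matches_key with the GEOID of an
# Event's geo_location, so the same concept observed at two locations is
# no longer treated as a duplicate; compare test_matches_key_fun below.
def location_matches_sketch(stmt):
    if isinstance(stmt, Event) and stmt.context and \
            stmt.context.geo_location:
        geoid = stmt.context.geo_location.db_refs.get('GEOID')
        return str((stmt.concept.matches_key(), geoid))
    return stmt.matches_key()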
def test_association_refinement():
    health = 'UN/entities/human/health'
    food = 'UN/entities/human/food'
    food_security = 'UN/entities/human/food/food_security'
    eh = Event(Concept('health', db_refs={'UN': [(health, 1.0)]}))
    ef = Event(Concept('food', db_refs={'UN': [(food, 1.0)]}))
    efs = Event(Concept('food security',
                        db_refs={'UN': [(food_security, 1.0)]}))
    st1 = Association([eh, ef], evidence=[Evidence(source_api='eidos1')])
    st2 = Association([ef, eh], evidence=[Evidence(source_api='eidos2')])
    st3 = Association([eh, efs], evidence=[Evidence(source_api='eidos3')])
    st4 = Association([ef, efs], evidence=[Evidence(source_api='eidos4')])
    eidos_ont = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             '../sources/eidos/eidos_ontology.rdf')
    hm = HierarchyManager(eidos_ont, True, True)
    hierarchies = {'entity': hm}
    pa = Preassembler(hierarchies, [st1, st2, st3, st4])
    unique_stmts = pa.combine_duplicates()  # debugging
    assert len(unique_stmts) == 3
    rel_stmts = pa.combine_related()
    assert len(rel_stmts) == 2
    eh_efs_stmt = [st for st in rel_stmts
                   if (st.members[0].concept.name in
                       {'health', 'food security'} and
                       st.members[1].concept.name in
                       {'health', 'food security'})][0]
    assert len(eh_efs_stmt.supported_by) == 1
    assert (eh_efs_stmt.supported_by[0].members[0].concept.name
            in {'food', 'health'})
    assert (eh_efs_stmt.supported_by[0].members[1].concept.name
            in {'food', 'health'})
def test_activation_refinement():
    subj = Agent('alcohol', db_refs={'CHEBI': 'CHEBI:16236',
                                     'HMDB': 'HMDB00108',
                                     'PUBCHEM': '702',
                                     'TEXT': 'alcohol'})
    obj = Agent('endotoxin', db_refs={'TEXT': 'endotoxin'})
    st1 = Inhibition(subj, obj)
    st2 = Activation(subj, obj)
    pa = Preassembler(bio_ontology, stmts=[st1, st2])
    pa.combine_duplicates()
    assert len(pa.unique_stmts) == 2
    pa.combine_related()
    assert len(pa.related_stmts) == 2
def test_association_refinement():
    unrelated = 'wm/concept/causal_factor/wild_food_sources'
    parent = 'wm/concept/causal_factor/health_and_life'
    child = 'wm/concept/causal_factor/health_and_life/' \
        'living_condition/food_safety'
    parent_event = Event(Concept('parent', db_refs={'WM': [(parent, 1.0)]}))
    unrelated_event = \
        Event(Concept('unrelated', db_refs={'WM': [(unrelated, 1.0)]}))
    child_event = Event(Concept('child', db_refs={'WM': [(child, 1.0)]}))
    st1 = Association([parent_event, unrelated_event],
                      evidence=[Evidence(source_api='eidos1')])
    st2 = Association([unrelated_event, parent_event],
                      evidence=[Evidence(source_api='eidos2')])
    st3 = Association([parent_event, child_event],
                      evidence=[Evidence(source_api='eidos3')])
    st4 = Association([unrelated_event, child_event],
                      evidence=[Evidence(source_api='eidos4')])
    pa = Preassembler(world_ontology, [st1, st2, st3, st4])
    unique_stmts = pa.combine_duplicates()
    assert len(unique_stmts) == 3
    top_level_stmts = pa.combine_related()
    assert len(top_level_stmts) == 2, top_level_stmts
    names = {tuple(sorted(e.concept.name for e in stmt.members)): stmt
             for stmt in top_level_stmts}
    stmt = names[('child', 'unrelated')]
    assert len(stmt.supported_by) == 1
    assert {e.concept.name for e in stmt.supported_by[0].members} == \
        {'parent', 'unrelated'}
def test_uppro_assembly():
    ag1 = Agent('x', db_refs={'UP': 'P01019', 'UPPRO': 'PRO_0000032457'})
    ag2 = Agent('y', db_refs={'UP': 'P01019', 'UPPRO': 'PRO_0000032458'})
    assert ag1.get_grounding() == ('UPPRO', ag1.db_refs['UPPRO'])
    assert ag2.get_grounding() == ('UPPRO', ag2.db_refs['UPPRO'])
    stmt1 = Phosphorylation(None, ag1)
    stmt2 = Phosphorylation(None, ag2)
    assert stmt1.matches_key() != stmt2.matches_key()
    pa = Preassembler(bio_ontology, [stmt1, stmt2])
    unique_stmts = pa.combine_duplicates()
    assert len(unique_stmts) == 2, unique_stmts

    from indra.tools import assemble_corpus as ac
    stmts = ac.map_grounding([stmt1, stmt2])
    pa = Preassembler(bio_ontology, stmts)
    unique_stmts = pa.combine_duplicates()
    assert len(unique_stmts) == 2
def test_combine_evidence_exact_duplicates():
    raf = Agent('RAF1')
    mek = Agent('MEK1')
    p1 = Phosphorylation(raf, mek, evidence=Evidence(text='foo'))
    p2 = Phosphorylation(raf, mek, evidence=Evidence(text='bar'))
    p3 = Phosphorylation(raf, mek, evidence=Evidence(text='bar'))
    stmts = [p1, p2, p3]
    pa = Preassembler(hierarchies, stmts=stmts)
    pa.combine_duplicates()
    # The statements come out sorted by their matches_key
    assert len(pa.unique_stmts) == 1
    assert len(pa.unique_stmts[0].evidence) == 2
    assert set(ev.text for ev in pa.unique_stmts[0].evidence) == \
        set(['foo', 'bar'])
def test_duplicates_sorting():
    mc = ModCondition('phosphorylation')
    map2k1_1 = Agent('MAP2K1', mods=[mc])
    mc1 = ModCondition('phosphorylation', 'serine', '218')
    mc2 = ModCondition('phosphorylation', 'serine', '222')
    mc3 = ModCondition('phosphorylation', 'serine', '298')
    map2k1_2 = Agent('MAP2K1', mods=[mc1, mc2, mc3])
    mapk3 = Agent('MAPK3')
    #ras = Agent('MAPK3', db_refs={'FA': '03663'})
    #nras = Agent('NRAS', db_refs={'FA': '03663'})
    st1 = Phosphorylation(map2k1_1, mapk3, position='218')
    st2 = Phosphorylation(map2k1_2, mapk3)
    st3 = Phosphorylation(map2k1_1, mapk3, position='218')
    stmts = [st1, st2, st3]
    pa = Preassembler(hierarchies, stmts=stmts)
    pa.combine_duplicates()
    assert len(pa.unique_stmts) == 2
def test_agent_coordinates():
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        'reach_coordinates.json')
    stmts = reach.process_json_file(path).statements
    pa = Preassembler(bio_ontology, stmts)
    unique_stmt = pa.combine_duplicates()[0]
    agent_annots = [ev.annotations['agents'] for ev in unique_stmt.evidence]
    assert all(a['raw_text'] == ['MEK1', 'ERK2'] for a in agent_annots)
    assert {tuple(a['coords']) for a in agent_annots} == \
        {((21, 25), (0, 4)), ((0, 4), (15, 19))}
def test_grounding_aggregation():
    braf1 = Agent('BRAF', db_refs={'TEXT': 'braf', 'HGNC': '1097'})
    braf2 = Agent('BRAF', db_refs={'TEXT': 'BRAF'})
    braf3 = Agent('BRAF', db_refs={'TEXT': 'Braf', 'UP': 'P15056'})
    st1 = Phosphorylation(None, braf1)
    st2 = Phosphorylation(None, braf2)
    st3 = Phosphorylation(None, braf3)
    pa = Preassembler(hierarchies, stmts=[st1, st2, st3])
    unique_stmts = pa.combine_duplicates()
    assert len(unique_stmts) == 3
def test_grounding_aggregation_complex():
    mek = Agent('MEK')
    braf1 = Agent('BRAF', db_refs={'TEXT': 'braf', 'HGNC': '1097'})
    braf2 = Agent('BRAF', db_refs={'TEXT': 'BRAF', 'dummy': 'dummy'})
    braf3 = Agent('BRAF', db_refs={'TEXT': 'Braf', 'UP': 'P15056'})
    st1 = Complex([mek, braf1])
    st2 = Complex([braf2, mek])
    st3 = Complex([mek, braf3])
    pa = Preassembler(hierarchies, stmts=[st1, st2, st3])
    unique_stmts = pa.combine_duplicates()
    assert len(unique_stmts) == 3
def test_grounding_aggregation_complex():
    mek = Agent('MEK')
    braf1 = Agent('BRAF', db_refs={'TEXT': 'braf', 'HGNC': '1097'})
    braf2 = Agent('BRAF', db_refs={'TEXT': 'BRAF', 'dummy': 'dummy'})
    braf3 = Agent('BRAF', db_refs={'TEXT': 'Braf', 'UP': 'P15056'})
    st1 = Complex([mek, braf1])
    st2 = Complex([braf2, mek])
    st3 = Complex([mek, braf3])
    pa = Preassembler(bio_ontology, stmts=[st1, st2, st3])
    unique_stmts = pa.combine_duplicates()
    assert len(unique_stmts) == 3, unique_stmts
def test_agent_coordinates():
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        'reach_coordinates.json')
    stmts = reach.process_json_file(path).statements
    pa = Preassembler(hierarchies, stmts)
    unique_stmt = pa.combine_duplicates()[0]
    agent_annots = [ev.annotations['agents'] for ev in unique_stmt.evidence]
    assert all(a['raw_text'] == ['MEK1', 'ERK2'] for a in agent_annots)
    assert {tuple(a['coords']) for a in agent_annots} == \
        {((21, 25), (0, 4)), ((0, 4), (15, 19))}
def test_preassemble_related_complex():
    ras = Agent('RAS', db_refs={'FPLX': 'RAS'})
    kras = Agent('KRAS', db_refs={'HGNC': '6407'})
    hras = Agent('HRAS', db_refs={'HGNC': '5173'})
    st1 = Complex([kras, hras])
    st2 = Complex([kras, ras])
    st3 = Complex([hras, kras])
    st4 = Complex([ras, kras])
    pa = Preassembler(hierarchies, [st1, st2, st3, st4])
    uniq = pa.combine_duplicates()
    assert len(uniq) == 2
    top = pa.combine_related()
    assert len(top) == 1
def test_preassemble_related_complex():
    ras = Agent('RAS', db_refs={'FPLX': 'RAS'})
    kras = Agent('KRAS', db_refs={'HGNC': '6407'})
    hras = Agent('HRAS', db_refs={'HGNC': '5173'})
    st1 = Complex([kras, hras])
    st2 = Complex([kras, ras])
    st3 = Complex([hras, kras])
    st4 = Complex([ras, kras])
    pa = Preassembler(bio_ontology, [st1, st2, st3, st4])
    uniq = pa.combine_duplicates()
    assert len(uniq) == 2
    top = pa.combine_related()
    assert len(top) == 1
def extract_phos():
    with open(stmts_fname, 'rb') as fh:
        model = pickle.load(fh)
    stmts = []
    for pmid, pmid_stmts in model.items():
        for stmt in pmid_stmts:
            if isinstance(stmt, Phosphorylation):
                stmts.append(stmt)
    logger.info('%d phosphorylations in RAS Machine' % len(stmts))

    stmts = [s for s in stmts if s.enz is not None]
    logger.info('%d phosphorylations with enzyme in RAS Machine' %
                len(stmts))

    stmts_grounded = filter_grounded(stmts)
    logger.info('%d grounded phosphorylations in RAS Machine' %
                len(stmts_grounded))

    stmts_enzkinase = filter_enzkinase(stmts_grounded)
    logger.info('%d phosphorylations with kinase enzyme in RAS Machine' %
                len(stmts_enzkinase))

    sm = SiteMapper(default_site_map)
    stmts_valid, _ = sm.map_sites(stmts_enzkinase)
    logger.info('%d valid-sequence phosphorylations in RAS Machine' %
                len(stmts_valid))

    pa = Preassembler(hierarchies, stmts_valid)
    stmts_unique = pa.combine_duplicates()
    logger.info('%d unique phosphorylations in RAS Machine' %
                len(stmts_unique))

    stmts_unique = pa.combine_related()
    logger.info('%d top-level phosphorylations in RAS Machine' %
                len(stmts_unique))

    with open('mapped_unique_phos.pkl', 'wb') as fh:
        pickle.dump(stmts_unique, fh, protocol=2)

    # Filter RAS Machine statements for direct and not hypothesis
    stmts = filter_direct(stmts_unique)
    logger.info('%d direct phosphorylations in RAS Machine' % len(stmts))
    stmts = filter_non_hypothesis(stmts)
    logger.info('%d non-hypothesis phosphorylations in RAS Machine' %
                len(stmts))

    with open('filtered_phos.pkl', 'wb') as fh:
        pickle.dump(stmts, fh, protocol=2)

    return stmts
def test_influence_duplicate():
    gov = 'wm/concept/causal_factor/social_and_political/government'
    agr = 'wm/concept/causal_factor/agriculture/crop_production'
    cgov = Event(Concept('government', db_refs={'WM': [(gov, 1.0)]}))
    cagr = Event(Concept('agriculture', db_refs={'WM': [(agr, 1.0)]}))
    print(cgov.matches_key())
    stmt1 = Influence(cgov, cagr, evidence=[Evidence(source_api='eidos1')])
    stmt2 = Influence(cagr, cgov, evidence=[Evidence(source_api='eidos2')])
    stmt3 = Influence(cgov, cagr, evidence=[Evidence(source_api='eidos3')])
    pa = Preassembler(world_ontology, [stmt1, stmt2, stmt3])
    unique_stmts = pa.combine_duplicates()
    unique_stmts = sorted(unique_stmts, key=lambda x: len(x.evidence))
    assert len(unique_stmts) == 2
    assert len(unique_stmts[0].evidence) == 1
    assert len(unique_stmts[1].evidence) == 2, unique_stmts
    sources = [e.source_api for e in unique_stmts[1].evidence]
    assert set(sources) == {'eidos1', 'eidos3'}
def run_preassembly(statements, hierarchies):
    print('%d total statements' % len(statements))
    # Filter to grounded only
    statements = ac.filter_grounded_only(statements, score_threshold=0.4)
    # Make a Preassembler with the Eidos and TRIPS ontology
    pa = Preassembler(hierarchies, statements)
    # Make a BeliefEngine and run combine duplicates
    be = BeliefEngine()
    unique_stmts = pa.combine_duplicates()
    print('%d unique statements' % len(unique_stmts))
    be.set_prior_probs(unique_stmts)
    # Run combine related
    related_stmts = pa.combine_related(return_toplevel=False)
    be.set_hierarchy_probs(related_stmts)
    # Filter to top-level Statements
    top_stmts = ac.filter_top_level(related_stmts)
    print('%d top-level statements' % len(top_stmts))
    return top_stmts
def run_preassembly(statements, hierarchies):
    print('%d total statements' % len(statements))
    # Filter to grounded only
    statements = map_onto(statements)
    ac.dump_statements(statements, 'pi_mtg_demo_unfiltered.pkl')
    statements = ac.filter_grounded_only(statements, score_threshold=0.7)
    #statements = ac.filter_by_db_refs(statements, 'UN',
    #    ['conflict', 'food_security', 'precipitation'], policy='one',
    #    match_suffix=True)
    statements = ac.filter_by_db_refs(
        statements, 'UN',
        ['conflict', 'food_security', 'flooding', 'food_production',
         'human_migration', 'drought', 'food_availability', 'market',
         'food_insecurity'],
        policy='all', match_suffix=True)
    assume_polarity(statements)
    statements = filter_has_polarity(statements)

    # Make a Preassembler with the Eidos and TRIPS ontology
    pa = Preassembler(hierarchies, statements)
    # Make a BeliefEngine and run combine duplicates
    be = BeliefEngine()
    unique_stmts = pa.combine_duplicates()
    print('%d unique statements' % len(unique_stmts))
    be.set_prior_probs(unique_stmts)
    # Run combine related
    related_stmts = pa.combine_related(return_toplevel=False)
    be.set_hierarchy_probs(related_stmts)
    #related_stmts = ac.filter_belief(related_stmts, 0.8)
    # Filter to top-level Statements
    top_stmts = ac.filter_top_level(related_stmts)
    pa.stmts = top_stmts
    print('%d top-level statements' % len(top_stmts))
    conflicts = pa.find_contradicts()
    top_stmts = remove_contradicts(top_stmts, conflicts)
    ac.dump_statements(top_stmts, 'pi_mtg_demo.pkl')
    return top_stmts
def test_influence_duplicate():
    gov = 'UN/entities/human/government/government_entity'
    agr = 'UN/entities/natural/crop_technology'
    cgov = Event(Concept('government', db_refs={'UN': [(gov, 1.0)]}))
    cagr = Event(Concept('agriculture', db_refs={'UN': [(agr, 1.0)]}))
    stmt1 = Influence(cgov, cagr, evidence=[Evidence(source_api='eidos1')])
    stmt2 = Influence(cagr, cgov, evidence=[Evidence(source_api='eidos2')])
    stmt3 = Influence(cgov, cagr, evidence=[Evidence(source_api='eidos3')])
    eidos_ont = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             '../sources/eidos/eidos_ontology.rdf')
    hm = HierarchyManager(eidos_ont, True, True)
    hierarchies = {'entity': hm}
    pa = Preassembler(hierarchies, [stmt1, stmt2, stmt3])
    unique_stmts = pa.combine_duplicates()
    assert len(unique_stmts) == 2
    assert len(unique_stmts[0].evidence) == 2
    assert len(unique_stmts[1].evidence) == 1
    sources = [e.source_api for e in unique_stmts[0].evidence]
    assert set(sources) == set(['eidos1', 'eidos3'])
def test_association_duplicate():
    ev1 = Event(Concept('a'))
    ev2 = Event(Concept('b'))
    ev3 = Event(Concept('c'))
    # Order of members does not matter
    st1 = Association([ev1, ev2], evidence=[Evidence(source_api='eidos1')])
    st2 = Association([ev1, ev3], evidence=[Evidence(source_api='eidos2')])
    st3 = Association([ev2, ev1], evidence=[Evidence(source_api='eidos3')])
    st4 = Association([ev2, ev3], evidence=[Evidence(source_api='eidos4')])
    st5 = Association([ev2, ev3], evidence=[Evidence(source_api='eidos5')])
    pa = Preassembler(world_ontology, [st1, st2, st3, st4, st5])
    unique_stmts = pa.combine_duplicates()
    assert len(unique_stmts) == 3
    assert len(unique_stmts[0].evidence) == 2
    assert len(unique_stmts[1].evidence) == 1
    assert len(unique_stmts[2].evidence) == 2
    sources = [e.source_api for e in unique_stmts[0].evidence]
    assert set(sources) == {'eidos1', 'eidos3'}
def test_association_duplicate():
    ev1 = Event(Concept('a'))
    ev2 = Event(Concept('b'))
    ev3 = Event(Concept('c'))
    # Order of members does not matter
    st1 = Association([ev1, ev2], evidence=[Evidence(source_api='eidos1')])
    st2 = Association([ev1, ev3], evidence=[Evidence(source_api='eidos2')])
    st3 = Association([ev2, ev1], evidence=[Evidence(source_api='eidos3')])
    st4 = Association([ev2, ev3], evidence=[Evidence(source_api='eidos4')])
    st5 = Association([ev2, ev3], evidence=[Evidence(source_api='eidos5')])
    eidos_ont = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             '../sources/eidos/eidos_ontology.rdf')
    hm = HierarchyManager(eidos_ont, True, True)
    hierarchies = {'entity': hm}
    pa = Preassembler(hierarchies, [st1, st2, st3, st4, st5])
    unique_stmts = pa.combine_duplicates()
    assert len(unique_stmts) == 3
    assert len(unique_stmts[0].evidence) == 2
    assert len(unique_stmts[1].evidence) == 1
    assert len(unique_stmts[2].evidence) == 2
    sources = [e.source_api for e in unique_stmts[0].evidence]
    assert set(sources) == set(['eidos1', 'eidos3'])
def preassemble(self, filters=None):
    """Preassemble the Statements collected in the model.

    Use INDRA's GroundingMapper, Preassembler and BeliefEngine
    on the IncrementalModel and save the unique statements and
    the top level statements in class attributes.

    Currently the following filter options are implemented:
    - grounding: require that all Agents in statements are grounded
    - model_one: require that at least one Agent is in the incremental
      model
    - model_all: require that all Agents are in the incremental model
    - prior_one: require that at least one Agent is in the prior model
    - prior_all: require that all Agents are in the prior model

    Note that model_one -> prior_all are increasingly more restrictive
    options.

    Parameters
    ----------
    filters : Optional[list[str]]
        A list of filter options to apply when choosing the statements.
        See description above for more details. Default: None
    """
    stmts = self.get_statements()
    logger.info("%d raw Statements in total" % len(stmts))

    # Fix grounding
    logger.info("Running grounding map")
    twg = gm.agent_texts_with_grounding(stmts)
    prot_map = gm.protein_map_from_twg(twg)
    gm.default_grounding_map.update(prot_map)
    gmap = gm.GroundingMapper(gm.default_grounding_map)
    stmts = gmap.map_agents(stmts, do_rename=True)
    logger.info("%d Statements after grounding map" % len(stmts))

    # Fix sites
    sm = SiteMapper(default_site_map)
    stmts, _ = sm.map_sites(stmts)
    logger.info("%d Statements with valid sequence" % len(stmts))

    if filters:
        if "grounding" in filters:
            # Filter out ungrounded statements
            logger.info("Running grounding filter")
            stmts = self._relevance_filter(stmts, ["grounding"])
            logger.info("%s Statements after filter" % len(stmts))
        if "human_only" in filters:
            # Filter out non-human proteins
            logger.info("Running non-human protein filter")
            stmts = self._relevance_filter(stmts, ["human_only"])
            logger.info("%s Statements after filter" % len(stmts))
        for rel_key in ("prior_one", "model_one", "prior_all", "model_all"):
            if rel_key in filters:
                logger.info("Running %s relevance filter" % rel_key)
                stmts = self._relevance_filter(stmts, [rel_key])
                logger.info("%s Statements after filter" % len(stmts))

    # Combine duplicates
    logger.info("Preassembling %d Statements" % len(stmts))
    pa = Preassembler(hierarchies, stmts)
    self.unique_stmts = pa.combine_duplicates()
    logger.info("%d unique Statements" % len(self.unique_stmts))

    # Run BeliefEngine on unique statements
    be = BeliefEngine()
    be.set_prior_probs(self.unique_stmts)

    # Build statement hierarchy
    self.unique_stmts = pa.combine_related(return_toplevel=False)
    self.toplevel_stmts = [st for st in self.unique_stmts
                           if not st.supports]
    logger.info("%d top-level Statements" % len(self.toplevel_stmts))

    # Run BeliefEngine on hierarchy
    be.set_hierarchy_probs(self.unique_stmts)
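# A minimal usage sketch for the preassemble method above (hypothetical
# caller; assumes an IncrementalModel-style instance called model that
# already has raw Statements added):
model.preassemble(filters=['grounding', 'model_one'])
unique = model.unique_stmts        # de-duplicated Statements
top_level = model.toplevel_stmts   # top-level Statements (no supports)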
def analyze(filename):
    results = load_file(filename)
    all_stmts = [stmt for paper_stmts in results.values()
                 for stmt in paper_stmts]
    # Map grounding
    logger.info('Mapping grounding...')
    gmap = gm.GroundingMapper(gm.default_grounding_map)
    map_stmts = gmap.map_agents(all_stmts)
    map_stmts = gmap.rename_agents(map_stmts)
    # Combine duplicates
    logger.info('Removing duplicates...')
    pa = Preassembler(hierarchies, map_stmts)
    pa.combine_duplicates()
    # Get complexes
    complexes = [s for s in pa.unique_stmts if isinstance(s, Complex)]
    # Get HGNC grounding
    protein_complexes = [s for s in complexes
                         if all([True if 'HGNC' in ag.db_refs.keys()
                                 else False
                                 for ag in s.agent_list()])]

    logger.info('Mapping gene IDs to gene symbols')
    gene_ids = list(set([ag.db_refs['HGNC'] for stmt in protein_complexes
                         for ag in stmt.members]))
    genes = [hgnc_client.get_hgnc_name(id) for id in gene_ids]

    # Get complexes from BioGrid and combine duplicates
    num_genes_per_query = 50
    start_indices = range(0, len(genes), num_genes_per_query)
    end_indices = [i + num_genes_per_query
                   if i + num_genes_per_query < len(genes) else len(genes)
                   for i in start_indices]
    bg_complexes = []
    for i in range(len(start_indices)):
        logger.info("Querying biogrid for %s" %
                    str(genes[start_indices[i]:end_indices[i]]))
        bg_complexes += (bg.get_statements(
            genes[start_indices[i]:end_indices[i]]))

    # Filter out Biogrid statements not involving genes in the gene list
    # (this will make duplicate removal more efficient)
    bg_filt = []
    for stmt in bg_complexes:
        if stmt.members[0].name in genes and \
                stmt.members[1].name in genes:
            bg_filt.append(stmt)
    # Might as well free up some memory
    del bg_complexes

    logger.info("Combining duplicates with biogrid...")
    pa = Preassembler(hierarchies, bg_filt + protein_complexes)
    pa.combine_duplicates()

    indra_only = []
    bg_only = []
    indra_and_bg = []
    for stmt in pa.unique_stmts:
        evidence_source_list = set([])
        for e in stmt.evidence:
            evidence_source_list.add(e.source_api)
        if 'reach' in evidence_source_list and \
                'biogrid' in evidence_source_list:
            indra_and_bg.append(stmt)
        elif 'reach' in evidence_source_list and \
                'biogrid' not in evidence_source_list:
            indra_only.append(stmt)
        elif 'reach' not in evidence_source_list and \
                'biogrid' in evidence_source_list:
            bg_only.append(stmt)

    rows = []
    for stmt in indra_only:
        rows.append([stmt.members[0].name, stmt.members[1].name,
                     str(len(stmt.evidence))])
    write_unicode_csv('unmatched_complexes.tsv', rows, delimiter='\t')

    return {'indra_only': indra_only,
            'bg_only': bg_only,
            'indra_and_bg': indra_and_bg}
                        help='Basename for output files.', required=True)
    args = parser.parse_args()

    # Load statements and filter to grounded only
    stmts = ac.load_statements(args.input_file)
    stmts = ac.filter_grounded_only(stmts)
    # Sort by TextRefs
    by_tr, no_tr = stmts_by_text_refs(stmts)
    # Combine duplicates in each statement list
    by_tr_pa = {}
    for tr, stmt_list in by_tr.items():
        pa = Preassembler(bio_ontology, stmt_list)
        uniq_stmts = pa.combine_duplicates()
        by_tr_pa[tr] = uniq_stmts
    # Filter to MESH term for "Coronavirus"
    mesh_id = 'D017934'
    mesh_children = get_mesh_children(mesh_id)
    # Include parent term in list
    mesh_children.append(mesh_id)
    mesh_pmids = get_pmids_for_mesh_terms(mesh_children)  # SLOW!
    # Get the subset of statements from these PMIDs
    mesh_stmts = filter_stmts_to_pmids(by_tr_pa, mesh_pmids)
    # Sort text refs by numbers of statements
    all_stmts_sorted = sort_by_uniq_stmts(by_tr_pa)
    mesh_stmts_sorted = sort_by_uniq_stmts(mesh_stmts)
def run_preassembly(self, stmts, print_summary=True):
    """Run complete preassembly procedure on the given statements.

    Results are returned as a dict and stored in the attribute
    :py:attr:`results`. They are also saved in the pickle file
    `<basename>_results.pkl`.

    Parameters
    ----------
    stmts : list of :py:class:`indra.statements.Statement`
        Statements to preassemble.
    print_summary : bool
        If True (default), prints a summary of the preassembly process
        to the console.

    Returns
    -------
    dict
        A dict containing the following entries:

        - `raw`: the starting set of statements before preassembly.
        - `duplicates1`: statements after initial de-duplication.
        - `valid`: statements found to have valid modification sites.
        - `mapped`: mapped statements (list of
          :py:class:`indra.preassembler.sitemapper.MappedStatement`).
        - `mapped_stmts`: combined list of valid statements and
          statements after mapping.
        - `duplicates2`: statements resulting from de-duplication of the
          statements in `mapped_stmts`.
        - `related2`: top-level statements after combining the statements
          in `duplicates2`.
    """
    # First round of preassembly: remove duplicates before sitemapping
    pa1 = Preassembler(hierarchies, stmts)
    logger.info("Combining duplicates")
    pa1.combine_duplicates()
    # Map sites
    logger.info("Mapping sites")
    (valid, mapped) = sm.map_sites(pa1.unique_stmts)
    # Combine valid and successfully mapped statements into single list
    correctly_mapped_stmts = []
    for ms in mapped:
        if all([True if mm[1] is not None else False
                for mm in ms.mapped_mods]):
            correctly_mapped_stmts.append(ms.mapped_stmt)
    mapped_stmts = valid + correctly_mapped_stmts
    # Second round of preassembly: de-duplicate and combine related
    pa2 = Preassembler(hierarchies, mapped_stmts)
    logger.info("Combining duplicates again")
    pa2.combine_duplicates()
    pa2.combine_related()
    # Fill out the results dict
    self.results = {}
    self.results['raw'] = stmts
    self.results['duplicates1'] = pa1.unique_stmts
    self.results['valid'] = valid
    self.results['mapped'] = mapped
    self.results['mapped_stmts'] = mapped_stmts
    self.results['duplicates2'] = pa2.unique_stmts
    self.results['related2'] = pa2.related_stmts
    # Print summary
    if print_summary:
        logger.info("\nStarting number of statements: %d" % len(stmts))
        logger.info("After duplicate removal: %d" % len(pa1.unique_stmts))
        logger.info("Unique statements with valid sites: %d" % len(valid))
        logger.info("Unique statements with invalid sites: %d" %
                    len(mapped))
        logger.info("After post-mapping duplicate removal: %d" %
                    len(pa2.unique_stmts))
        logger.info("After combining related statements: %d" %
                    len(pa2.related_stmts))
    # Save the results if we're caching
    if self.basename is not None:
        results_filename = '%s_results.pkl' % self.basename
        with open(results_filename, 'wb') as f:
            pickle.dump(self.results, f, protocol=2)
    return self.results
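# A minimal usage sketch for the method above (hypothetical driver;
# `manager` stands for an instance of the class defining run_preassembly,
# and `raw_stmts` for a list of INDRA Statements):
results = manager.run_preassembly(raw_stmts, print_summary=True)
top_level = results['related2']    # top-level Statements
deduped = results['duplicates2']   # post-mapping unique Statements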
def run_assembly(stmts, folder, pmcid):
    indexcard_prefix = folder + '/index_cards/' + pmcid
    otherout_prefix = folder + '/other_outputs/' + pmcid

    # Filter for grounding
    grounded_stmts = []
    for st in stmts:
        if all([is_protein_or_chemical(a) for a in st.agent_list()]):
            grounded_stmts.append(st)

    # Instantiate the Preassembler
    pa = Preassembler(eh, mh)
    pa.add_statements(grounded_stmts)
    print('%d statements collected in total.' % len(pa.stmts))
    unique_stmts = pa.combine_duplicates()
    print('%d statements after combining duplicates.' % len(unique_stmts))

    ml = MechLinker(unique_stmts)
    ml.link_statements()
    pa = Preassembler(eh, mh, ml.statements)
    pa.combine_duplicates()
    related_stmts = pa.combine_related()
    print('%d statements after combining related.' % len(related_stmts))

    with open(otherout_prefix + '.pkl', 'wb') as fh:
        pickle.dump(related_stmts, fh)

    flattened_evidence_stmts = flatten_evidence(related_stmts)

    card_counter = 1
    card_lim = float('inf')
    top_stmts = []
    for st in sorted(flattened_evidence_stmts,
                     key=lambda x: len(x.evidence), reverse=True):
        print(len(st.evidence), st)
        if is_background_knowledge(st):
            print('This statement is background knowledge - skipping.')
            continue
        # Assemble IndexCards
        ia = IndexCardAssembler([st])
        ia.make_model()
        if ia.cards:
            ia.save_model(indexcard_prefix + '-%d.json' % card_counter)
            card_counter += 1
            top_stmts.append(st)
            if card_counter > card_lim:
                break

    ea = EnglishAssembler(top_stmts)
    print('=======================')
    print(ea.make_model())
    print('=======================')

    # Print the statement graph
    graph = render_stmt_graph(related_stmts)
    graph.draw(otherout_prefix + '_graph.pdf', prog='dot')
    # Print statement diagnostics
    print_stmts(pa.stmts, otherout_prefix + '_statements.tsv')
    print_stmts(related_stmts, otherout_prefix + '_related_statements.tsv')

    pya = PysbAssembler()
    pya.add_statements(related_stmts)
    model = pya.make_model()
    print('PySB model has %d monomers and %d rules' %
          (len(model.monomers), len(model.rules)))
if __name__ == '__main__':
    # Load the statements
    if len(sys.argv) < 2:
        print("Usage: %s reach_stmts_file" % sys.argv[0])
        sys.exit()
    results = load_file(sys.argv[1])
    all_stmts = [stmt for paper_stmts in results.values()
                 for stmt in paper_stmts]

    report_stmt_counts(results, plot_prefix='raw')
    report_grounding(all_stmts, plot_prefix='raw')
    report_stmt_types(all_stmts, plot_prefix='raw')
    report_stmt_participants(all_stmts)

    # Map grounding
    logger.info('Mapping grounding...')
    gmap = gm.GroundingMapper(gm.default_grounding_map)
    map_stmts = gmap.map_agents(all_stmts)
    report_grounding(map_stmts, plot_prefix='preassembled')

    # Combine duplicates
    logger.info('Removing duplicates...')
    pa = Preassembler(hierarchies, map_stmts)
    pa.combine_duplicates()
    report_evidence_distribution(pa.unique_stmts, plot_prefix='preassembled')
def run_assembly(stmts, folder, pmcid, background_assertions=None):
    '''Run assembly on a list of statements, for a given PMCID.'''
    # Folder for index card output (scored submission)
    indexcard_prefix = folder + '/index_cards/' + pmcid
    # Folder for other outputs (for analysis, debugging)
    otherout_prefix = folder + '/other_outputs/' + pmcid

    # Do grounding mapping here
    # Load the TRIPS-specific grounding map and add to the default
    # (REACH-oriented) grounding map:
    trips_gm = load_grounding_map('trips_grounding_map.csv')
    default_grounding_map.update(trips_gm)
    gm = GroundingMapper(default_grounding_map)

    mapped_agent_stmts = gm.map_agents(stmts)
    renamed_agent_stmts = gm.rename_agents(mapped_agent_stmts)

    # Filter for grounding
    grounded_stmts = []
    for st in renamed_agent_stmts:
        if all([is_protein_or_chemical(a) for a in st.agent_list()]):
            grounded_stmts.append(st)

    # Instantiate the Preassembler
    pa = Preassembler(hierarchies)
    pa.add_statements(grounded_stmts)
    print('== %s ====================' % pmcid)
    print('%d statements collected in total.' % len(pa.stmts))

    # Combine duplicates
    unique_stmts = pa.combine_duplicates()
    print('%d statements after combining duplicates.' % len(unique_stmts))

    # Run BeliefEngine on unique statements
    epe = BeliefEngine()
    epe.set_prior_probs(pa.unique_stmts)

    # Build statement hierarchy
    related_stmts = pa.combine_related()
    # Run BeliefEngine on hierarchy
    epe.set_hierarchy_probs(related_stmts)
    print('%d statements after combining related.' % len(related_stmts))

    # Instantiate the mechanism linker
    ml = MechLinker(related_stmts)
    # Link statements
    linked_stmts = ml.link_statements()
    # Run BeliefEngine on linked statements
    epe.set_linked_probs(linked_stmts)
    # Print linked statements for debugging purposes
    print('Linked\n=====')
    for ls in linked_stmts:
        print(ls.inferred_stmt.belief, ls.inferred_stmt)
    print('=============')

    # Combine all statements including linked ones
    all_statements = ml.statements + [ls.inferred_stmt
                                      for ls in linked_stmts]

    # Instantiate a new preassembler
    pa = Preassembler(hierarchies, all_statements)
    # Build hierarchy again
    pa.combine_duplicates()
    # Choose the top-level statements
    related_stmts = pa.combine_related()

    # Remove top-level statements that came only from the prior
    if background_assertions is not None:
        nonbg_stmts = [stmt for stmt in related_stmts
                       if stmt not in background_assertions]
    else:
        nonbg_stmts = related_stmts

    # Dump top-level statements in a pickle
    with open(otherout_prefix + '.pkl', 'wb') as fh:
        pickle.dump(nonbg_stmts, fh, protocol=2)

    # Flatten evidence for statements
    flattened_evidence_stmts = flatten_evidence(nonbg_stmts)

    # Start a card counter
    card_counter = 1
    # We don't limit the number of cards reported in this round
    card_lim = float('inf')
    top_stmts = []
    ###############################################
    # The belief cutoff for statements
    belief_cutoff = 0.3
    ###############################################
    # Sort by amount of evidence
    for st in sorted(flattened_evidence_stmts,
                     key=lambda x: x.belief, reverse=True):
        if st.belief >= belief_cutoff:
            print(st.belief, st)
        if st.belief < belief_cutoff:
            print('SKIP', st.belief, st)
        # If it's background knowledge, we skip the statement
        if is_background_knowledge(st):
            print('This statement is background knowledge - skipping.')
            continue
        # Assemble IndexCards
        ia = IndexCardAssembler([st], pmc_override=pmcid)
        ia.make_model()
        # If the index card was actually made
        # (not all statements can be assembled into index cards so
        # this is often not the case)
        if ia.cards:
            # Save the index card json
            ia.save_model(indexcard_prefix + '-%d.json' % card_counter)
            card_counter += 1
            top_stmts.append(st)
            if card_counter > card_lim:
                break

    # Print the English-assembled model for debugging purposes
    ea = EnglishAssembler(top_stmts)
    print('=======================')
    print(ea.make_model())
    print('=======================')
    # Print the statement graph
    graph = render_stmt_graph(nonbg_stmts)
    graph.draw(otherout_prefix + '_graph.pdf', prog='dot')
    # Print statement diagnostics
    print_stmts(pa.stmts, otherout_prefix + '_statements.tsv')
    print_stmts(related_stmts, otherout_prefix + '_related_statements.tsv')
    if args.date_range:
        min_date_str, max_date_str = args.date_range.split(':')
        if min_date_str:
            min_date = datetime.strptime(min_date_str, '%Y%m%d%H%M%S')
            clauses.add(db.RawStatements.create_date > min_date)
        if max_date_str:
            max_date = datetime.strptime(max_date_str, '%Y%m%d%H%M%S')
            clauses.add(db.RawStatements.create_date < max_date)

    all_stmts, results = load_stmts_from_db(clauses, db)

    report_stmt_counts(results['reach'], plot_prefix='raw_reach')
    report_stmt_counts(results['sparser'], plot_prefix='raw_sparser')
    report_grounding(all_stmts, plot_prefix='raw')
    report_stmt_types(all_stmts, plot_prefix='raw')
    report_stmt_participants(all_stmts)

    # Map grounding
    logger.info('Mapping grounding...')
    gmap = gm.GroundingMapper(gm.default_grounding_map)
    map_stmts = gmap.map_agents(all_stmts)
    report_grounding(map_stmts, plot_prefix='preassembled')

    # Combine duplicates
    logger.info('Removing duplicates...')
    pa = Preassembler(hierarchies, map_stmts)
    pa.combine_duplicates()
    report_evidence_distribution(pa.unique_stmts, plot_prefix='preassembled')
for fn in fnames:
    print('\n\n----------------------------')
    print('Processing %s...' % fn)
    xml_str = open(fn, 'rt').read()
    tp = trips.process_xml(xml_str)
    print('Extracted events by type')
    print('------------------------')
    for k, v in tp.extracted_events.items():
        print(k, len(v))
    print('------------------------')
    print('%s statements collected.' % len(tp.statements))
    pa.add_statements(tp.statements)
    print('----------------------------\n\n')

print('%d statements collected in total.' % len(pa.stmts))
duplicate_stmts = pa.combine_duplicates()
print('%d statements after combining duplicates.' % len(duplicate_stmts))
related_stmts = pa.combine_related()
print('%d statements after combining related.' % len(related_stmts))

# Print the statement graph
graph = render_stmt_graph(related_stmts)
graph.draw('trips_graph.pdf', prog='dot')
# Print statement diagnostics
print_stmts(pa.stmts, 'trips_statements.tsv')
print_stmts(related_stmts, 'trips_related_statements.tsv')

pya = PysbAssembler()
pya.add_statements(related_stmts)
model = pya.make_model()
def test_matches_key_fun():
    from indra.statements import WorldContext, RefContext

    def has_location(stmt):
        if not stmt.context or not stmt.context.geo_location or \
                not stmt.context.geo_location.db_refs.get('GEOID'):
            return False
        return True

    def event_location_matches(stmt):
        if isinstance(stmt, Event):
            if not has_location(stmt):
                context_key = None
            else:
                context_key = stmt.context.geo_location.db_refs['GEOID']
            matches_key = str((stmt.concept.matches_key(), context_key))
        else:
            matches_key = stmt.matches_key()
        return matches_key

    def event_location_refinement(st1, st2, ontology, entities_refined):
        if isinstance(st1, Event) and isinstance(st2, Event):
            ref = st1.refinement_of(st2, ontology)
            if not ref:
                return False
            if not has_location(st2):
                return True
            elif not has_location(st1) and has_location(st2):
                return False
            else:
                return st1.context.geo_location.db_refs['GEOID'] == \
                    st2.context.geo_location.db_refs['GEOID']

    context1 = WorldContext(
        geo_location=RefContext('x', db_refs={'GEOID': '1'}))
    context2 = WorldContext(
        geo_location=RefContext('x', db_refs={'GEOID': '2'}))

    health = 'wm/concept/causal_factor/health_and_life'
    e1 = Event(Concept('health', db_refs={'WM': [(health, 1.0)]}),
               context=context1,
               evidence=Evidence(text='1', source_api='eidos'))
    e2 = Event(Concept('health', db_refs={'WM': [(health, 1.0)]}),
               context=context2,
               evidence=Evidence(text='2', source_api='eidos'))
    e3 = Event(Concept('health', db_refs={'WM': [(health, 1.0)]}),
               context=context2,
               evidence=Evidence(text='3', source_api='eidos'))

    pa = Preassembler(world_ontology, [e1, e2, e3],
                      matches_fun=event_location_matches,
                      refinement_fun=event_location_refinement)
    unique_stmts = pa.combine_duplicates()
    assert len(unique_stmts) == 2, unique_stmts

    from indra.tools.assemble_corpus import run_preassembly
    stmts = run_preassembly([e1, e2, e3],
                            matches_fun=event_location_matches,
                            refinement_fun=event_location_refinement)
    assert len(stmts) == 2, stmts
def run_assembly(stmts, folder, pmcid, background_assertions=None):
    '''Run assembly on a list of statements, for a given PMCID.'''
    # Folder for index card output (scored submission)
    indexcard_prefix = folder + '/index_cards/' + pmcid
    # Folder for other outputs (for analysis, debugging)
    otherout_prefix = folder + '/other_outputs/' + pmcid

    # Do grounding mapping here
    # Load the TRIPS-specific grounding map and add to the default
    # (REACH-oriented) grounding map:
    trips_gm = load_grounding_map('trips_grounding_map.csv')
    default_grounding_map.update(trips_gm)
    gm = GroundingMapper(default_grounding_map)

    mapped_agent_stmts = gm.map_agents(stmts)
    renamed_agent_stmts = gm.rename_agents(mapped_agent_stmts)

    # Filter for grounding
    grounded_stmts = []
    for st in renamed_agent_stmts:
        if all([is_protein_or_chemical(a) for a in st.agent_list()]):
            grounded_stmts.append(st)

    # Instantiate the Preassembler
    pa = Preassembler(hierarchies)
    pa.add_statements(grounded_stmts)
    print('== %s ====================' % pmcid)
    print('%d statements collected in total.' % len(pa.stmts))

    # Combine duplicates
    unique_stmts = pa.combine_duplicates()
    print('%d statements after combining duplicates.' % len(unique_stmts))

    # Run BeliefEngine on unique statements
    epe = BeliefEngine()
    epe.set_prior_probs(pa.unique_stmts)

    # Build statement hierarchy
    related_stmts = pa.combine_related()
    # Run BeliefEngine on hierarchy
    epe.set_hierarchy_probs(related_stmts)
    print('%d statements after combining related.' % len(related_stmts))

    # Instantiate the mechanism linker
    # Link statements
    linked_stmts = MechLinker.infer_active_forms(related_stmts)
    linked_stmts += MechLinker.infer_modifications(related_stmts)
    linked_stmts += MechLinker.infer_activations(related_stmts)
    # Run BeliefEngine on linked statements
    epe.set_linked_probs(linked_stmts)
    # Print linked statements for debugging purposes
    print('Linked\n=====')
    for ls in linked_stmts:
        print(ls.inferred_stmt.belief, ls.inferred_stmt)
    print('=============')

    # Combine all statements including linked ones
    all_statements = related_stmts + [ls.inferred_stmt
                                      for ls in linked_stmts]

    # Instantiate a new preassembler
    pa = Preassembler(hierarchies, all_statements)
    # Build hierarchy again
    pa.combine_duplicates()
    # Choose the top-level statements
    related_stmts = pa.combine_related()

    # Remove top-level statements that came only from the prior
    if background_assertions is not None:
        nonbg_stmts = [stmt for stmt in related_stmts
                       if stmt not in background_assertions]
    else:
        nonbg_stmts = related_stmts

    # Dump top-level statements in a pickle
    with open(otherout_prefix + '.pkl', 'wb') as fh:
        pickle.dump(nonbg_stmts, fh)

    # Flatten evidence for statements
    flattened_evidence_stmts = flatten_evidence(nonbg_stmts)

    # Start a card counter
    card_counter = 1
    # We don't limit the number of cards reported in this round
    card_lim = float('inf')
    top_stmts = []
    ###############################################
    # The belief cutoff for statements
    belief_cutoff = 0.3
    ###############################################
    # Sort by amount of evidence
    for st in sorted(flattened_evidence_stmts,
                     key=lambda x: x.belief, reverse=True):
        if st.belief >= belief_cutoff:
            print(st.belief, st)
        if st.belief < belief_cutoff:
            print('SKIP', st.belief, st)
        # If it's background knowledge, we skip the statement
        if is_background_knowledge(st):
            print('This statement is background knowledge - skipping.')
            continue
        # Assemble IndexCards
        ia = IndexCardAssembler([st], pmc_override=pmcid)
        ia.make_model()
        # If the index card was actually made
        # (not all statements can be assembled into index cards so
        # this is often not the case)
        if ia.cards:
            # Save the index card json
            ia.save_model(indexcard_prefix + '-%d.json' % card_counter)
            card_counter += 1
            top_stmts.append(st)
            if card_counter > card_lim:
                break

    # Print the English-assembled model for debugging purposes
    ea = EnglishAssembler(top_stmts)
    print('=======================')
    print(ea.make_model().encode('utf-8'))
    print('=======================')
    # Print the statement graph
    graph = render_stmt_graph(nonbg_stmts)
    graph.draw(otherout_prefix + '_graph.pdf', prog='dot')
    # Print statement diagnostics
    print_stmts(pa.stmts, otherout_prefix + '_statements.tsv')
    print_stmts(related_stmts, otherout_prefix + '_related_statements.tsv')