def test_render_stmt_graph():
    """Render the refinement graph for seven related phosphorylations.

    Expects one node per statement and one edge per support relation.
    """
    kinase = Agent('BRAF', db_refs={'HGNC': '1097'})
    mek_family = Agent('MEK', db_refs={'FPLX':'MEK'})
    mek_gene = Agent('MAP2K1', db_refs={'HGNC': '6840'})
    # Statements p0..p6, in the order used by the edge tally below.
    phos_stmts = [
        Phosphorylation(kinase, mek_family),
        Phosphorylation(kinase, mek_gene),
        Phosphorylation(kinase, mek_gene, position='218'),
        Phosphorylation(kinase, mek_gene, position='222'),
        Phosphorylation(kinase, mek_gene, 'serine'),
        Phosphorylation(kinase, mek_gene, 'serine', '218'),
        Phosphorylation(kinase, mek_gene, 'serine', '222'),
    ]
    preassembler = Preassembler(hierarchies, stmts=phos_stmts)
    preassembler.combine_related()
    rendered = render_stmt_graph(preassembler.related_stmts, reduce=False)
    # One node for each statement
    node_total = len(rendered.nodes())
    assert node_total == 7
    # Expected edges:
    #   p0 supports p1-p6      -> 6
    #   p1 supports p2-p6      -> 5
    #   p2 supports p5         -> 1
    #   p3 supports p6         -> 1
    #   p4 supports p5 and p6  -> 2
    #   p5 and p6 are top-level and support nothing
    # Total: 6 + 5 + 1 + 1 + 2 = 15
    edge_total = len(rendered.edges())
    assert edge_total == 15
def test_flatten_evidence_hierarchy():
    """Flatten evidence across a two-level refinement.

    The top-level statement should end up with its own evidence plus that
    of its supporting statement, and later edits to the supporting
    statement's evidence must not show up in the flattened evidence.
    """
    enz = Agent('BRAF')
    sub = Agent('MAP2K1')
    generic = Phosphorylation(enz, sub, evidence=[Evidence(text='foo')])
    specific = Phosphorylation(enz, sub, 'S', '218',
                               evidence=[Evidence(text='bar')])
    preassembler = Preassembler(hierarchies, stmts=[generic, specific])
    preassembler.combine_related()
    assert len(preassembler.related_stmts) == 1

    flat = flatten_evidence(preassembler.related_stmts)
    assert len(flat) == 1
    top = flat[0]
    assert len(top.evidence) == 2
    top_texts = [ev.text for ev in top.evidence]
    assert 'bar' in top_texts
    assert 'foo' in top_texts

    assert len(top.supported_by) == 1
    supporter = top.supported_by[0]
    assert len(supporter.evidence) == 1
    assert supporter.evidence[0].text == 'foo'

    # Mutate the supporter's evidence; the flattened copy must be unchanged.
    supporter.evidence[0].text = 'changed_foo'
    assert supporter.evidence[0].text == 'changed_foo'
    top_texts = [ev.text for ev in top.evidence]
    assert 'changed_foo' not in top_texts
    assert 'foo' in top_texts

    support_types = {
        ev.annotations.get('support_type') for ev in top.evidence
    }
    assert support_types == {'direct', 'supported_by'}
def test_render_stmt_graph():
    """Render the refinement graph for seven related phosphorylations.

    Expects one node per statement and one edge per support relation.
    """
    kinase = Agent('BRAF', db_refs={'HGNC': '1097'})
    mek_family = Agent('MEK', db_refs={'FPLX': 'MEK'})
    mek_gene = Agent('MAP2K1', db_refs={'HGNC': '6840'})
    # Statements p0..p6, in the order used by the edge tally below.
    phos_stmts = [
        Phosphorylation(kinase, mek_family),
        Phosphorylation(kinase, mek_gene),
        Phosphorylation(kinase, mek_gene, position='218'),
        Phosphorylation(kinase, mek_gene, position='222'),
        Phosphorylation(kinase, mek_gene, 'serine'),
        Phosphorylation(kinase, mek_gene, 'serine', '218'),
        Phosphorylation(kinase, mek_gene, 'serine', '222'),
    ]
    preassembler = Preassembler(bio_ontology, stmts=phos_stmts)
    preassembler.combine_related()
    rendered = render_stmt_graph(preassembler.related_stmts, reduce=False)
    # One node for each statement
    node_total = len(rendered.nodes())
    assert node_total == 7
    # Expected edges:
    #   p0 supports p1-p6      -> 6
    #   p1 supports p2-p6      -> 5
    #   p2 supports p5         -> 1
    #   p3 supports p6         -> 1
    #   p4 supports p5 and p6  -> 2
    #   p5 and p6 are top-level and support nothing
    # Total: 6 + 5 + 1 + 1 + 2 = 15
    edge_total = len(rendered.edges())
    assert edge_total == 15
def test_translocation():
    """Three AKT translocations should reduce to two top-level statements.

    One statement has no destination; the other two name different
    destinations.
    """
    destinations = (None, 'plasma membrane', 'nucleus')
    translocations = [
        Translocation(Agent('AKT'), None, dest) for dest in destinations
    ]
    preassembler = Preassembler(hierarchies, stmts=translocations)
    preassembler.combine_related()
    top_level = preassembler.related_stmts
    assert len(top_level) == 2
def test_pathsfromto():
    """Preassemble phosphorylations from a MAP2K1 -> MAPK1 paths query.

    Checks PMIDs on the extracted statements and unicode strings on the
    unique statements left after combining related ones.
    """
    processor = biopax.process_pc_pathsfromto(['MAP2K1'], ['MAPK1'])
    processor.get_phosphorylation()
    assert_pmids(processor.statements)
    preassembler = Preassembler(hierarchies, processor.statements)
    preassembler.combine_related()
    assert unicode_strs(preassembler.unique_stmts)
def test_modification_refinement_residue_noenz():
    """Enzyme-less phosphorylations with and without a residue combine.

    Expects a single top-level statement after combining related ones.
    """
    substrate = Agent('Erbb3')
    without_residue = Phosphorylation(None, substrate)
    with_residue = Phosphorylation(None, substrate, 'Y')
    preassembler = Preassembler(bio_ontology,
                                stmts=[without_residue, with_residue])
    preassembler.combine_related()
    top_level = preassembler.related_stmts
    assert len(top_level) == 1
def test_flatten_evidence_hierarchy():
    """Flatten evidence across a two-level refinement.

    The top-level statement should end up with its own evidence plus that
    of its supporting statement, and later edits to the supporting
    statement's evidence must not show up in the flattened evidence.
    """
    enz = Agent('BRAF')
    sub = Agent('MAP2K1')
    generic = Phosphorylation(enz, sub, evidence=[Evidence(text='foo')])
    specific = Phosphorylation(enz, sub, 'S', '218',
                               evidence=[Evidence(text='bar')])
    preassembler = Preassembler(bio_ontology, stmts=[generic, specific])
    preassembler.combine_related()
    assert len(preassembler.related_stmts) == 1

    flat = flatten_evidence(preassembler.related_stmts)
    assert len(flat) == 1
    top = flat[0]
    assert len(top.evidence) == 2
    top_texts = [ev.text for ev in top.evidence]
    assert 'bar' in top_texts
    assert 'foo' in top_texts

    assert len(top.supported_by) == 1
    supporter = top.supported_by[0]
    assert len(supporter.evidence) == 1
    assert supporter.evidence[0].text == 'foo'

    # Mutate the supporter's evidence; the flattened copy must be unchanged.
    supporter.evidence[0].text = 'changed_foo'
    assert supporter.evidence[0].text == 'changed_foo'
    top_texts = [ev.text for ev in top.evidence]
    assert 'changed_foo' not in top_texts
    assert 'foo' in top_texts

    support_types = {
        ev.annotations.get('support_type') for ev in top.evidence
    }
    assert support_types == {'direct', 'supported_by'}
def test_modification_refinement_residue_noenz():
    """Enzyme-less phosphorylations with and without a residue combine.

    Expects a single top-level statement after combining related ones.
    """
    substrate = Agent('Erbb3')
    without_residue = Phosphorylation(None, substrate)
    with_residue = Phosphorylation(None, substrate, 'Y')
    preassembler = Preassembler(hierarchies,
                                stmts=[without_residue, with_residue])
    preassembler.combine_related()
    top_level = preassembler.related_stmts
    assert len(top_level) == 1
def test_translocation():
    """Three AKT translocations should reduce to two top-level statements.

    One statement has no destination; the other two name different
    destinations. On failure the related statements are shown.
    """
    destinations = (None, 'plasma membrane', 'nucleus')
    translocations = [
        Translocation(Agent('AKT'), None, dest) for dest in destinations
    ]
    preassembler = Preassembler(bio_ontology, stmts=translocations)
    preassembler.combine_related()
    assert len(preassembler.related_stmts) == 2, preassembler.related_stmts
def test_complex_refinement_order():
    """Complex refinement should not depend on member order.

    The second Complex lists the same two agents in reverse order, with a
    phosphorylation condition on ELK1; a single top-level statement is
    expected.
    """
    plain_complex = Complex([Agent('MED23'), Agent('ELK1')])
    phospho_elk1 = Agent('ELK1', mods=[ModCondition('phosphorylation')])
    modified_complex = Complex([phospho_elk1, Agent('MED23')])
    preassembler = Preassembler(hierarchies,
                                stmts=[plain_complex, modified_complex])
    preassembler.combine_duplicates()
    preassembler.combine_related()
    top_level = preassembler.related_stmts
    assert len(top_level) == 1
def test_complex_agent_refinement():
    """Complexes differing only in RAF's ubiquitination flag stay separate.

    One RAF agent has the ubiquitination condition set to True and the
    other to False; both unique and top-level counts should be two.
    """
    ras = Agent('RAS')
    raf_variants = [
        Agent('RAF', mods=[ModCondition('ubiquitination', None, None, flag)])
        for flag in (True, False)
    ]
    complexes = [Complex([ras, raf]) for raf in raf_variants]
    preassembler = Preassembler(hierarchies, stmts=complexes)
    preassembler.combine_related()
    assert len(preassembler.unique_stmts) == 2
    assert len(preassembler.related_stmts) == 2
def test_homodimer_refinement():
    """An ERBB2 homodimer and an ERBB2-EGFR complex must not be merged.

    Both statements should survive duplicate and related combination.
    """
    erbb2 = Agent('ERBB2')
    egfr = Agent('EGFR')
    homodimer = Complex([erbb2, erbb2])
    heterodimer = Complex([erbb2, egfr])
    preassembler = Preassembler(bio_ontology, stmts=[homodimer, heterodimer])
    preassembler.combine_duplicates()
    assert len(preassembler.unique_stmts) == 2
    preassembler.combine_related()
    assert len(preassembler.related_stmts) == 2
def test_complex_refinement():
    """A two-member and a three-member Complex are not refinements.

    RAS-RAF and MEK-RAS-RAF should both remain unique and top-level.
    """
    ras, raf, mek = Agent('RAS'), Agent('RAF'), Agent('MEK')
    dimer = Complex([ras, raf])
    trimer = Complex([mek, ras, raf])
    preassembler = Preassembler(bio_ontology, stmts=[dimer, trimer])
    preassembler.combine_related()
    assert len(preassembler.unique_stmts) == 2
    assert len(preassembler.related_stmts) == 2
def test_complex_refinement_order():
    """Complex refinement should not depend on member order.

    The second Complex lists the same two agents in reverse order, with a
    phosphorylation condition on ELK1; a single top-level statement is
    expected.
    """
    plain_complex = Complex([Agent('MED23'), Agent('ELK1')])
    phospho_elk1 = Agent('ELK1', mods=[ModCondition('phosphorylation')])
    modified_complex = Complex([phospho_elk1, Agent('MED23')])
    preassembler = Preassembler(bio_ontology,
                                stmts=[plain_complex, modified_complex])
    preassembler.combine_duplicates()
    preassembler.combine_related()
    top_level = preassembler.related_stmts
    assert len(top_level) == 1
def test_homodimer_refinement():
    """An ERBB2 homodimer and an ERBB2-EGFR complex must not be merged.

    Both statements should survive duplicate and related combination.
    """
    erbb2 = Agent('ERBB2')
    egfr = Agent('EGFR')
    homodimer = Complex([erbb2, erbb2])
    heterodimer = Complex([erbb2, egfr])
    preassembler = Preassembler(hierarchies, stmts=[homodimer, heterodimer])
    preassembler.combine_duplicates()
    assert len(preassembler.unique_stmts) == 2
    preassembler.combine_related()
    assert len(preassembler.related_stmts) == 2
def test_complex_refinement():
    """A two-member and a three-member Complex are not refinements.

    RAS-RAF and MEK-RAS-RAF should both remain unique and top-level.
    """
    ras, raf, mek = Agent('RAS'), Agent('RAF'), Agent('MEK')
    dimer = Complex([ras, raf])
    trimer = Complex([mek, ras, raf])
    preassembler = Preassembler(hierarchies, stmts=[dimer, trimer])
    preassembler.combine_related()
    assert len(preassembler.unique_stmts) == 2
    assert len(preassembler.related_stmts) == 2
def test_flatten_stmts():
    """Flattening unique or top-level statements yields all four statements.

    Five RAF1 phosphorylations are given, two of which differ only in
    evidence; two top-level statements are expected after preassembly.
    """
    phos_stmts = [
        Phosphorylation(Agent('MAP3K5'), Agent('RAF1'), 'S', '338'),
        Phosphorylation(None, Agent('RAF1'), 'S', '338'),
        Phosphorylation(None, Agent('RAF1')),
        Phosphorylation(Agent('PAK1'), Agent('RAF1'), 'S', '338'),
        Phosphorylation(None, Agent('RAF1'), evidence=Evidence(text='foo')),
    ]
    preassembler = Preassembler(hierarchies, stmts=phos_stmts)
    preassembler.combine_duplicates()
    preassembler.combine_related()
    assert len(preassembler.related_stmts) == 2
    flat_from_unique = flatten_stmts(preassembler.unique_stmts)
    assert len(flat_from_unique) == 4
    flat_from_related = flatten_stmts(preassembler.related_stmts)
    assert len(flat_from_related) == 4
def test_complex_agent_refinement():
    """Complexes differing only in RAF's ubiquitination flag stay separate.

    One RAF agent has the ubiquitination condition set to True and the
    other to False; both unique and top-level counts should be two.
    """
    ras = Agent('RAS')
    raf_variants = [
        Agent('RAF', mods=[ModCondition('ubiquitination', None, None, flag)])
        for flag in (True, False)
    ]
    complexes = [Complex([ras, raf]) for raf in raf_variants]
    preassembler = Preassembler(bio_ontology, stmts=complexes)
    preassembler.combine_related()
    assert len(preassembler.unique_stmts) == 2
    assert len(preassembler.related_stmts) == 2
def test_flatten_stmts():
    """Flattening unique or top-level statements yields all four statements.

    Five RAF1 phosphorylations are given, two of which differ only in
    evidence; two top-level statements are expected after preassembly.
    """
    phos_stmts = [
        Phosphorylation(Agent('MAP3K5'), Agent('RAF1'), 'S', '338'),
        Phosphorylation(None, Agent('RAF1'), 'S', '338'),
        Phosphorylation(None, Agent('RAF1')),
        Phosphorylation(Agent('PAK1'), Agent('RAF1'), 'S', '338'),
        Phosphorylation(None, Agent('RAF1'), evidence=Evidence(text='foo')),
    ]
    preassembler = Preassembler(bio_ontology, stmts=phos_stmts)
    preassembler.combine_duplicates()
    preassembler.combine_related()
    assert len(preassembler.related_stmts) == 2
    flat_from_unique = flatten_stmts(preassembler.unique_stmts)
    assert len(flat_from_unique) == 4
    flat_from_related = flatten_stmts(preassembler.related_stmts)
    assert len(flat_from_related) == 4
def test_activation_refinement():
    """Inhibition and Activation of the same pair must not be combined.

    Both statements should remain after duplicate and related combination.
    """
    alcohol_refs = {'CHEBI': 'CHEBI:16236', 'HMDB': 'HMDB00108',
                    'PUBCHEM': '702', 'TEXT': 'alcohol'}
    alcohol = Agent('alcohol', db_refs=alcohol_refs)
    endotoxin = Agent('endotoxin', db_refs={'TEXT': 'endotoxin'})
    inhibition = Inhibition(alcohol, endotoxin)
    activation = Activation(alcohol, endotoxin)
    preassembler = Preassembler(hierarchies, stmts=[inhibition, activation])
    preassembler.combine_duplicates()
    assert len(preassembler.unique_stmts) == 2
    preassembler.combine_related()
    assert len(preassembler.related_stmts) == 2
def test_activation_refinement():
    """Inhibition and Activation of the same pair must not be combined.

    Both statements should remain after duplicate and related combination.
    """
    alcohol_refs = {'CHEBI': 'CHEBI:16236', 'HMDB': 'HMDB00108',
                    'PUBCHEM': '702', 'TEXT': 'alcohol'}
    alcohol = Agent('alcohol', db_refs=alcohol_refs)
    endotoxin = Agent('endotoxin', db_refs={'TEXT': 'endotoxin'})
    inhibition = Inhibition(alcohol, endotoxin)
    activation = Activation(alcohol, endotoxin)
    preassembler = Preassembler(bio_ontology, stmts=[inhibition, activation])
    preassembler.combine_duplicates()
    assert len(preassembler.unique_stmts) == 2
    preassembler.combine_related()
    assert len(preassembler.related_stmts) == 2
def test_association_refinement():
    """Check Association deduplication and refinement with UN groundings.

    Four associations over health / food / food security events should
    give three unique statements and two top-level ones; the
    health-food security association should be supported by the
    health-food one.
    """
    health_grounding = 'UN/entities/human/health'
    food_grounding = 'UN/entities/human/food'
    food_security_grounding = 'UN/entities/human/food/food_security'
    health_ev = Event(Concept('health',
                              db_refs={'UN': [(health_grounding, 1.0)]}))
    food_ev = Event(Concept('food',
                            db_refs={'UN': [(food_grounding, 1.0)]}))
    food_sec_ev = Event(
        Concept('food security',
                db_refs={'UN': [(food_security_grounding, 1.0)]}))
    associations = [
        Association([health_ev, food_ev],
                    evidence=[Evidence(source_api='eidos1')]),
        Association([food_ev, health_ev],
                    evidence=[Evidence(source_api='eidos2')]),
        Association([health_ev, food_sec_ev],
                    evidence=[Evidence(source_api='eidos3')]),
        Association([food_ev, food_sec_ev],
                    evidence=[Evidence(source_api='eidos4')]),
    ]
    # The ontology file is resolved relative to this test module.
    this_dir = os.path.dirname(os.path.abspath(__file__))
    ontology_path = os.path.join(this_dir,
                                 '../sources/eidos/eidos_ontology.rdf')
    entity_manager = HierarchyManager(ontology_path, True, True)
    eidos_hierarchies = {'entity': entity_manager}
    preassembler = Preassembler(eidos_hierarchies, associations)

    deduplicated = preassembler.combine_duplicates()
    assert len(deduplicated) == 3
    top_level = preassembler.combine_related()
    assert len(top_level) == 2

    refined_names = {'health', 'food security'}
    candidates = [
        stmt for stmt in top_level
        if stmt.members[0].concept.name in refined_names
        and stmt.members[1].concept.name in refined_names
    ]
    refined = candidates[0]
    assert len(refined.supported_by) == 1
    supporter_members = refined.supported_by[0].members
    generic_names = {'food', 'health'}
    assert supporter_members[0].concept.name in generic_names
    assert supporter_members[1].concept.name in generic_names
def test_modification_refinement_noenz2():
    """A modification with an enzyme is supported by the enzyme-less one.

    Like test_modification_refinement_noenz, but for statements where one
    argument is associated with a component in the hierarchy (SIRT1 here)
    while the other is not (BECN1).
    """
    sirt1_refs = {'HGNC': '14929', 'UP': 'Q96EB6', 'TEXT': 'SIRT1'}
    becn1_refs = {'HGNC': '1034', 'UP': 'Q14457', 'TEXT': 'Beclin 1'}
    enzyme = Agent('SIRT1', db_refs=sirt1_refs)
    substrate = Agent('BECN1', db_refs=becn1_refs)
    with_enzyme = Deacetylation(enzyme, substrate)
    without_enzyme = Deacetylation(None, substrate)
    preassembler = Preassembler(bio_ontology,
                                stmts=[with_enzyme, without_enzyme])
    top_level = preassembler.combine_related()
    # Only the more specific statement should be top-level, backed by the
    # less specific one; the support links should point both ways.
    assert len(top_level) == 1
    top = top_level[0]
    assert top.equals(with_enzyme)
    assert len(top.supported_by) == 1
    supporter = top.supported_by[0]
    assert supporter.equals(without_enzyme)
    assert supporter.supports[0].equals(with_enzyme)
def test_association_refinement():
    """Check Association deduplication and refinement with UN groundings.

    Four associations over health / food / food security events should
    give three unique statements and two top-level ones; the
    health-food security association should be supported by the
    health-food one.
    """
    health_grounding = 'UN/entities/human/health'
    food_grounding = 'UN/entities/human/food'
    food_security_grounding = 'UN/entities/human/food/food_security'
    health_ev = Event(Concept('health',
                              db_refs={'UN': [(health_grounding, 1.0)]}))
    food_ev = Event(Concept('food',
                            db_refs={'UN': [(food_grounding, 1.0)]}))
    food_sec_ev = Event(
        Concept('food security',
                db_refs={'UN': [(food_security_grounding, 1.0)]}))
    associations = [
        Association([health_ev, food_ev],
                    evidence=[Evidence(source_api='eidos1')]),
        Association([food_ev, health_ev],
                    evidence=[Evidence(source_api='eidos2')]),
        Association([health_ev, food_sec_ev],
                    evidence=[Evidence(source_api='eidos3')]),
        Association([food_ev, food_sec_ev],
                    evidence=[Evidence(source_api='eidos4')]),
    ]
    # The ontology file is resolved relative to this test module.
    this_dir = os.path.dirname(os.path.abspath(__file__))
    ontology_path = os.path.join(this_dir,
                                 '../sources/eidos/eidos_ontology.rdf')
    entity_manager = HierarchyManager(ontology_path, True, True)
    eidos_hierarchies = {'entity': entity_manager}
    preassembler = Preassembler(eidos_hierarchies, associations)

    deduplicated = preassembler.combine_duplicates()
    assert len(deduplicated) == 3
    top_level = preassembler.combine_related()
    assert len(top_level) == 2

    refined_names = {'health', 'food security'}
    candidates = [
        stmt for stmt in top_level
        if stmt.members[0].concept.name in refined_names
        and stmt.members[1].concept.name in refined_names
    ]
    refined = candidates[0]
    assert len(refined.supported_by) == 1
    supporter_members = refined.supported_by[0].members
    generic_names = {'food', 'health'}
    assert supporter_members[0].concept.name in generic_names
    assert supporter_members[1].concept.name in generic_names
def test_association_refinement():
    """Check Association deduplication and refinement with WM groundings.

    Four associations over parent / child / unrelated events should give
    three unique statements and two top-level ones; the child-unrelated
    association should be supported by the parent-unrelated one.
    """
    unrelated_grounding = 'wm/concept/causal_factor/wild_food_sources'
    parent_grounding = 'wm/concept/causal_factor/health_and_life'
    child_grounding = ('wm/concept/causal_factor/health_and_life/'
                       'living_condition/food_safety')

    def make_event(name, grounding):
        # Wrap a concept with a single WM grounding of score 1.0.
        return Event(Concept(name, db_refs={'WM': [(grounding, 1.0)]}))

    parent_event = make_event('parent', parent_grounding)
    unrelated_event = make_event('unrelated', unrelated_grounding)
    child_event = make_event('child', child_grounding)

    associations = [
        Association([parent_event, unrelated_event],
                    evidence=[Evidence(source_api='eidos1')]),
        Association([unrelated_event, parent_event],
                    evidence=[Evidence(source_api='eidos2')]),
        Association([parent_event, child_event],
                    evidence=[Evidence(source_api='eidos3')]),
        Association([unrelated_event, child_event],
                    evidence=[Evidence(source_api='eidos4')]),
    ]
    preassembler = Preassembler(world_ontology, associations)
    deduplicated = preassembler.combine_duplicates()
    assert len(deduplicated) == 3
    top_level_stmts = preassembler.combine_related()
    assert len(top_level_stmts) == 2, top_level_stmts

    # Index the top-level statements by their sorted member names.
    by_member_names = {}
    for candidate in top_level_stmts:
        key = tuple(sorted(ev.concept.name for ev in candidate.members))
        by_member_names[key] = candidate
    refined = by_member_names[('child', 'unrelated')]
    assert len(refined.supported_by) == 1
    supporter_names = {
        ev.concept.name for ev in refined.supported_by[0].members
    }
    assert supporter_names == {'parent', 'unrelated'}
def test_return_toplevel():
    """Exercise both values of combine_related's return_toplevel flag.

    With return_toplevel=True only the refined statement comes back; with
    False both statements come back, linked through supports and
    supported_by.
    """
    kinase = Agent('SRC', db_refs={'HGNC': '11283'})
    substrate = Agent('NRAS', db_refs={'HGNC': '7989'})
    detailed = Phosphorylation(kinase, substrate, 'tyrosine', '32')
    plain = Phosphorylation(kinase, substrate)
    preassembler = Preassembler(bio_ontology, stmts=[detailed, plain])

    top_only = preassembler.combine_related(return_toplevel=True)
    assert len(top_only) == 1
    assert len(top_only[0].supported_by) == 1
    assert len(top_only[0].supported_by[0].supports) == 1

    everything = preassembler.combine_related(return_toplevel=False)
    assert len(everything) == 2
    # The returned order is not assumed: the statement with a residue is
    # the refined one, the other is the generic one.
    if everything[0].residue:
        refined, generic = everything[0], everything[1]
    else:
        refined, generic = everything[1], everything[0]
    assert len(refined.supported_by) == 1
    assert len(refined.supported_by[0].supports) == 1
    assert len(generic.supports) == 1
    assert len(generic.supports[0].supported_by) == 1
def test_return_toplevel():
    """Exercise both values of combine_related's return_toplevel flag.

    With return_toplevel=True only the refined statement comes back; with
    False both statements come back, linked through supports and
    supported_by.
    """
    kinase = Agent('SRC', db_refs={'HGNC': '11283'})
    substrate = Agent('NRAS', db_refs={'HGNC': '7989'})
    detailed = Phosphorylation(kinase, substrate, 'tyrosine', '32')
    plain = Phosphorylation(kinase, substrate)
    preassembler = Preassembler(hierarchies, stmts=[detailed, plain])

    top_only = preassembler.combine_related(return_toplevel=True)
    assert len(top_only) == 1
    assert len(top_only[0].supported_by) == 1
    assert len(top_only[0].supported_by[0].supports) == 1

    everything = preassembler.combine_related(return_toplevel=False)
    assert len(everything) == 2
    # The returned order is not assumed: the statement with a residue is
    # the refined one, the other is the generic one.
    if everything[0].residue:
        refined, generic = everything[0], everything[1]
    else:
        refined, generic = everything[1], everything[0]
    assert len(refined.supported_by) == 1
    assert len(refined.supported_by[0].supports) == 1
    assert len(generic.supports) == 1
    assert len(generic.supports[0].supported_by) == 1
def assemble_model(requester_name):
    """Ground-map, preassemble and mechanistically link the global stmts.

    The module-level ``stmts`` list is replaced by the top-level statements
    from preassembly, and every inferred statement found by the MechLinker
    is announced via ``say`` and appended to it. Finally the requester is
    notified and the layout is refreshed.

    Parameters
    ----------
    requester_name :
        Name interpolated into the completion message.
    """
    global stmts
    # Performing grounding mapping on the statements
    mapper = gm.GroundingMapper(gm.default_grounding_map)
    stmts = mapper.map_agents(stmts)

    preassembler = Preassembler(hierarchies, stmts)
    preassembler.combine_related()
    stmts = preassembler.related_stmts

    linker = MechLinker(stmts)
    links = linker.link_statements()
    # A falsy result (e.g. None or an empty list) means nothing to report.
    for link in links or ():
        if not link.inferred_stmt:
            continue
        say(mechlinker_queries.print_linked_stmt(link))
        stmts.append(link.inferred_stmt)

    say("%s: Done, updating layout." % requester_name)
    update_layout()
def test_flatten_evidence_multilevel():
    """Flatten evidence across a three-level refinement chain.

    The single top-level statement should collect three pieces of
    evidence: one annotated 'direct' and two annotated 'supported_by'.
    """
    enz = Agent('BRAF')
    sub = Agent('MAP2K1')
    chain = [
        Phosphorylation(enz, sub, evidence=[Evidence(text='foo')]),
        Phosphorylation(enz, sub, 'S', evidence=[Evidence(text='bar')]),
        Phosphorylation(enz, sub, 'S', '218',
                        evidence=[Evidence(text='baz')]),
    ]
    preassembler = Preassembler(hierarchies, stmts=chain)
    preassembler.combine_related()
    assert len(preassembler.related_stmts) == 1
    flat = flatten_evidence(preassembler.related_stmts)
    assert len(flat) == 1
    top = flat[0]
    assert len(top.evidence) == 3, len(top.evidence)
    support_types = [ev.annotations['support_type'] for ev in top.evidence]
    assert support_types.count('direct') == 1
    assert support_types.count('supported_by') == 2
def test_flatten_evidence_multilevel():
    """Flatten evidence across a three-level refinement chain.

    The single top-level statement should collect three pieces of
    evidence: one annotated 'direct' and two annotated 'supported_by'.
    """
    enz = Agent('BRAF')
    sub = Agent('MAP2K1')
    chain = [
        Phosphorylation(enz, sub, evidence=[Evidence(text='foo')]),
        Phosphorylation(enz, sub, 'S', evidence=[Evidence(text='bar')]),
        Phosphorylation(enz, sub, 'S', '218',
                        evidence=[Evidence(text='baz')]),
    ]
    preassembler = Preassembler(bio_ontology, stmts=chain)
    preassembler.combine_related()
    assert len(preassembler.related_stmts) == 1
    flat = flatten_evidence(preassembler.related_stmts)
    assert len(flat) == 1
    top = flat[0]
    assert len(top.evidence) == 3, len(top.evidence)
    support_types = [ev.annotations['support_type'] for ev in top.evidence]
    assert support_types.count('direct') == 1
    assert support_types.count('supported_by') == 2
def test_conversion_refinement():
    """Four related Conversion statements reduce to two top-level ones.

    The statements vary the subject (RAS vs. HRAS) and the form and order
    of the object-from argument.
    """
    ras_family = Agent('RAS', db_refs={'FPLX': 'RAS'})
    hras_gene = Agent('HRAS', db_refs={'HGNC': '5173'})
    gtp, gdp = Agent('GTP'), Agent('GDP')
    conversions = [
        Conversion(ras_family, gtp, gdp),
        Conversion(hras_gene, gtp, gdp),
        Conversion(hras_gene, [gtp, gdp], gdp),
        Conversion(hras_gene, [gdp, gtp], gdp),
    ]
    preassembler = Preassembler(bio_ontology, stmts=conversions)
    top_level = preassembler.combine_related()
    assert len(top_level) == 2
def test_conversion_refinement():
    """Four related Conversion statements reduce to two top-level ones.

    The statements vary the subject (RAS vs. HRAS) and the form and order
    of the object-from argument.
    """
    ras_family = Agent('RAS', db_refs={'FPLX': 'RAS'})
    hras_gene = Agent('HRAS', db_refs={'HGNC': '5173'})
    gtp, gdp = Agent('GTP'), Agent('GDP')
    conversions = [
        Conversion(ras_family, gtp, gdp),
        Conversion(hras_gene, gtp, gdp),
        Conversion(hras_gene, [gtp, gdp], gdp),
        Conversion(hras_gene, [gdp, gtp], gdp),
    ]
    preassembler = Preassembler(hierarchies, stmts=conversions)
    top_level = preassembler.combine_related()
    assert len(top_level) == 2
def test_preassemble_related_complex():
    """Complexes given in both member orders deduplicate, then refine.

    KRAS-HRAS and KRAS-RAS are each given twice with members swapped; two
    unique statements and one top-level statement are expected.
    """
    ras_family = Agent('RAS', db_refs={'FPLX': 'RAS'})
    kras_gene = Agent('KRAS', db_refs={'HGNC': '6407'})
    hras_gene = Agent('HRAS', db_refs={'HGNC': '5173'})
    member_lists = (
        [kras_gene, hras_gene],
        [kras_gene, ras_family],
        [hras_gene, kras_gene],
        [ras_family, kras_gene],
    )
    complexes = [Complex(members) for members in member_lists]
    preassembler = Preassembler(hierarchies, complexes)
    deduplicated = preassembler.combine_duplicates()
    assert len(deduplicated) == 2
    top_level = preassembler.combine_related()
    assert len(top_level) == 1
def test_preassemble_related_complex():
    """Complexes given in both member orders deduplicate, then refine.

    KRAS-HRAS and KRAS-RAS are each given twice with members swapped; two
    unique statements and one top-level statement are expected.
    """
    ras_family = Agent('RAS', db_refs={'FPLX': 'RAS'})
    kras_gene = Agent('KRAS', db_refs={'HGNC': '6407'})
    hras_gene = Agent('HRAS', db_refs={'HGNC': '5173'})
    member_lists = (
        [kras_gene, hras_gene],
        [kras_gene, ras_family],
        [hras_gene, kras_gene],
        [ras_family, kras_gene],
    )
    complexes = [Complex(members) for members in member_lists]
    preassembler = Preassembler(bio_ontology, complexes)
    deduplicated = preassembler.combine_duplicates()
    assert len(deduplicated) == 2
    top_level = preassembler.combine_related()
    assert len(top_level) == 1
def test_flatten_evidence_hierarchy():
    """Flatten evidence across a two-level refinement.

    The top-level statement should hold both 'foo' and 'bar' evidence,
    while its single supporting statement keeps only 'foo'.
    """
    enz = Agent('BRAF')
    sub = Agent('MAP2K1')
    generic = Phosphorylation(enz, sub, evidence=[Evidence(text='foo')])
    specific = Phosphorylation(enz, sub, 'S', '218',
                               evidence=[Evidence(text='bar')])
    preassembler = Preassembler(hierarchies, stmts=[generic, specific])
    preassembler.combine_related()
    assert len(preassembler.related_stmts) == 1

    flat = flatten_evidence(preassembler.related_stmts)
    assert len(flat) == 1
    top = flat[0]
    assert len(top.evidence) == 2
    top_texts = [ev.text for ev in top.evidence]
    assert 'bar' in top_texts
    assert 'foo' in top_texts

    assert len(top.supported_by) == 1
    supporter = top.supported_by[0]
    assert len(supporter.evidence) == 1
    assert supporter.evidence[0].text == 'foo'
def test_modification_norefinement_subsfamily():
    """Statements that are each more specific in one respect stay separate.

    One statement names the NRAS substrate without site detail; the other
    names the RAS family substrate with residue and position.
    """
    kinase = Agent('SRC', db_refs={'HGNC': '11283'})
    nras_gene = Agent('NRAS', db_refs={'HGNC': '7989'})
    ras_family = Agent('RAS', db_refs={'FPLX': 'RAS'})
    gene_level = Phosphorylation(kinase, nras_gene)
    family_with_site = Phosphorylation(kinase, ras_family, 'Y', '32',
                                       evidence=[Evidence(text='foo')])
    preassembler = Preassembler(bio_ontology,
                                stmts=[gene_level, family_with_site])
    combined = preassembler.combine_related()
    # The modification detail and the agent specificity point in opposite
    # directions, so neither statement refines the other.
    assert len(combined) == 2
    assert len(combined[0].evidence) == 1, combined
def test_superfamily_refinement_isa_or_partof():
    """The PRKAG1 statement should be supported by the AMPK one.

    Both statements share enzyme, residue and position and differ only in
    the substrate.
    """
    kinase = Agent('SRC', db_refs={'HGNC': '11283'})
    ampk_family = Agent('AMPK', db_refs={'FPLX': 'AMPK'})
    prkag1_gene = Agent('PRKAG1', db_refs={'HGNC': '9385'})
    family_stmt = Phosphorylation(kinase, ampk_family, 'tyrosine', '32')
    gene_stmt = Phosphorylation(kinase, prkag1_gene, 'tyrosine', '32')
    preassembler = Preassembler(bio_ontology,
                                stmts=[family_stmt, gene_stmt])
    top_level = preassembler.combine_related()
    # Only the gene-level statement is top-level; the family-level one
    # becomes its support.
    assert len(top_level) == 1
    top = top_level[0]
    assert top.equals(gene_stmt)
    assert len(top.supported_by) == 1
    assert top.supported_by[0].equals(family_stmt)
def extract_phos():
    """Extract, filter and preassemble phosphorylations from the pickle.

    Loads the model from ``stmts_fname``, keeps Phosphorylation statements
    with an enzyme, then applies grounding, kinase and site-mapping filters
    before combining duplicates and related statements. The top-level
    statements are written to ``mapped_unique_phos.pkl``; the direct,
    non-hypothesis subset is written to ``filtered_phos.pkl`` and returned.
    Each stage logs the number of statements remaining.
    """
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # stmts_fname should only ever point at a trusted file.
    with open(stmts_fname, 'rb') as fh:
        model = pickle.load(fh)

    phos = [
        stmt
        for _, pmid_stmts in model.items()
        for stmt in pmid_stmts
        if isinstance(stmt, Phosphorylation)
    ]
    logger.info('%d phosphorylations in RAS Machine' % len(phos))

    phos = [stmt for stmt in phos if stmt.enz is not None]
    logger.info('%d phosphorylations with enzyme in RAS Machine' % len(phos))

    grounded = filter_grounded(phos)
    logger.info('%d grounded phosphorylations in RAS Machine' %
                len(grounded))

    with_kinase = filter_enzkinase(grounded)
    logger.info('%d phosphorylations with kinase enzyme in RAS Machine' %
                len(with_kinase))

    site_mapper = SiteMapper(default_site_map)
    valid, _ = site_mapper.map_sites(with_kinase)
    logger.info('%d valid-sequence phosphorylations in RAS Machine' %
                len(valid))

    preassembler = Preassembler(hierarchies, valid)
    unique = preassembler.combine_duplicates()
    logger.info('%d unique phosphorylations in RAS Machine' % len(unique))
    top_level = preassembler.combine_related()
    logger.info('%d top-level phosphorylations in RAS Machine' %
                len(top_level))

    with open('mapped_unique_phos.pkl', 'wb') as fh:
        pickle.dump(top_level, fh, protocol=2)

    # Filter RAS Machine statements for direct and not hypothesis
    result = filter_direct(top_level)
    logger.info('%d direct phosphorylations in RAS Machine' % len(result))
    result = filter_non_hypothesis(result)
    logger.info('%d non-hypothesis phosphorylations in RAS Machine' %
                len(result))

    with open('filtered_phos.pkl', 'wb') as fh:
        pickle.dump(result, fh, protocol=2)
    return result
def test_superfamily_refinement_isa_or_partof():
    """The PRKAG1 statement should be supported by the AMPK one.

    Both statements share enzyme, residue and position and differ only in
    the substrate.
    """
    kinase = Agent('SRC', db_refs={'HGNC': '11283'})
    ampk_family = Agent('AMPK', db_refs={'FPLX': 'AMPK'})
    prkag1_gene = Agent('PRKAG1', db_refs={'HGNC': '9385'})
    family_stmt = Phosphorylation(kinase, ampk_family, 'tyrosine', '32')
    gene_stmt = Phosphorylation(kinase, prkag1_gene, 'tyrosine', '32')
    preassembler = Preassembler(hierarchies,
                                stmts=[family_stmt, gene_stmt])
    top_level = preassembler.combine_related()
    # Only the gene-level statement is top-level; the family-level one
    # becomes its support.
    assert len(top_level) == 1
    top = top_level[0]
    assert top.equals(gene_stmt)
    assert len(top.supported_by) == 1
    assert top.supported_by[0].equals(family_stmt)
def test_modification_norefinement_enzfamily():
    """Statements that are each more specific in one respect stay separate.

    One statement has RAF as enzyme with residue and position; the other
    has BRAF as enzyme without site detail.
    """
    substrate = Agent('MEK')
    raf = Agent('RAF')
    braf = Agent('BRAF')
    raf_with_site = Phosphorylation(raf, substrate, 'Y', '32',
                                    evidence=[Evidence(text='foo')])
    braf_plain = Phosphorylation(braf, substrate)
    preassembler = Preassembler(bio_ontology,
                                stmts=[raf_with_site, braf_plain])
    combined = preassembler.combine_related()
    # The modification detail and the enzyme specificity point in opposite
    # directions, so these statements shouldn't be combined.
    assert len(combined) == 2
    assert len(combined[1].evidence) == 1
def test_modification_norefinement_noenz():
    """Statements that are each more specific in one respect stay separate.

    One statement names the enzyme but no site; the other has no enzyme
    but gives residue and position.
    """
    kinase = Agent('SRC', db_refs={'HGNC': '11283'})
    substrate = Agent('NRAS', db_refs={'HGNC': '7989'})
    with_enzyme = Phosphorylation(kinase, substrate)
    with_site = Phosphorylation(None, substrate, 'Y', '32',
                                evidence=[Evidence(text='foo')])
    preassembler = Preassembler(hierarchies, stmts=[with_enzyme, with_site])
    combined = preassembler.combine_related()
    # The modification detail and the enzyme specificity point in opposite
    # directions, so these statements shouldn't be combined.
    assert len(combined) == 2
    assert len(combined[1].evidence) == 1
def test_modification_refinement():
    """A site-specific phosphorylation is supported by the site-less one."""
    kinase = Agent('SRC', db_refs={'HGNC': '11283'})
    substrate = Agent('NRAS', db_refs={'HGNC': '7989'})
    detailed = Phosphorylation(kinase, substrate, 'tyrosine', '32')
    plain = Phosphorylation(kinase, substrate)
    preassembler = Preassembler(bio_ontology, stmts=[detailed, plain])
    top_level = preassembler.combine_related()
    # Only the more specific modification is top-level, with the less
    # specific one as its support.
    assert len(top_level) == 1
    top = top_level[0]
    assert top.equals(detailed)
    assert len(top.supported_by) == 1
    assert top.supported_by[0].equals(plain)
def test_modification_norefinement_enzfamily():
    """Statements that are each more specific in one respect stay separate.

    One statement has RAF as enzyme with residue and position; the other
    has BRAF as enzyme without site detail.
    """
    substrate = Agent('MEK')
    raf = Agent('RAF')
    braf = Agent('BRAF')
    raf_with_site = Phosphorylation(raf, substrate, 'Y', '32',
                                    evidence=[Evidence(text='foo')])
    braf_plain = Phosphorylation(braf, substrate)
    preassembler = Preassembler(hierarchies,
                                stmts=[raf_with_site, braf_plain])
    combined = preassembler.combine_related()
    # The modification detail and the enzyme specificity point in opposite
    # directions, so these statements shouldn't be combined.
    assert len(combined) == 2
    assert len(combined[1].evidence) == 1
def test_bound_condition_norefinement():
    """A bound condition alone does not make a statement a refinement.

    The GTP-bound NRAS statement lacks the residue and position carried by
    the unbound one, so both statements should remain top-level.
    """
    kinase = Agent('SRC', db_refs={'HGNC': '11283'})
    gtp = Agent('GTP', db_refs={'CHEBI': '15996'})
    nras_free = Agent('NRAS', db_refs={'HGNC': '7989'})
    nras_gtp_bound = Agent('NRAS', db_refs={'HGNC': '7989'},
                           bound_conditions=[BoundCondition(gtp, True)])
    with_site = Phosphorylation(kinase, nras_free, 'tyrosine', '32')
    with_bound = Phosphorylation(kinase, nras_gtp_bound)
    preassembler = Preassembler(hierarchies, stmts=[with_site, with_bound])
    combined = preassembler.combine_related()
    # The second statement is more specific in its bound condition but
    # less specific in its modification, so no combination is expected.
    assert len(combined) == 2
def test_modification_refinement():
    """A site-specific phosphorylation is supported by the site-less one."""
    kinase = Agent('SRC', db_refs={'HGNC': '11283'})
    substrate = Agent('NRAS', db_refs={'HGNC': '7989'})
    detailed = Phosphorylation(kinase, substrate, 'tyrosine', '32')
    plain = Phosphorylation(kinase, substrate)
    preassembler = Preassembler(hierarchies, stmts=[detailed, plain])
    top_level = preassembler.combine_related()
    # Only the more specific modification is top-level, with the less
    # specific one as its support.
    assert len(top_level) == 1
    top = top_level[0]
    assert top.equals(detailed)
    assert len(top.supported_by) == 1
    assert top.supported_by[0].equals(plain)
def test_superfamily_refinement():
    """The gene-level statement should be backed by the family-level one."""
    kinase = Agent('SRC', db_refs={'HGNC': '11283'})
    ras_family = Agent('RAS', db_refs={'FPLX': 'RAS'})
    nras_gene = Agent('NRAS', db_refs={'HGNC': '7989'})
    family_stmt = Phosphorylation(kinase, ras_family, 'tyrosine', '32')
    gene_stmt = Phosphorylation(kinase, nras_gene, 'tyrosine', '32')
    preassembler = Preassembler(bio_ontology,
                                stmts=[family_stmt, gene_stmt])
    top_level = preassembler.combine_related()
    # Only the gene-level statement is top-level; the family-level one
    # becomes its support.
    assert len(top_level) == 1
    top = top_level[0]
    assert top.equals(gene_stmt)
    assert len(top.supported_by) == 1
    assert top.supported_by[0].equals(family_stmt)
def test_superfamily_refinement():
    """The gene-level statement should be backed by the family-level one."""
    kinase = Agent('SRC', db_refs={'HGNC': '11283'})
    ras_family = Agent('RAS', db_refs={'FPLX': 'RAS'})
    nras_gene = Agent('NRAS', db_refs={'HGNC': '7989'})
    family_stmt = Phosphorylation(kinase, ras_family, 'tyrosine', '32')
    gene_stmt = Phosphorylation(kinase, nras_gene, 'tyrosine', '32')
    preassembler = Preassembler(hierarchies,
                                stmts=[family_stmt, gene_stmt])
    top_level = preassembler.combine_related()
    # Only the gene-level statement is top-level; the family-level one
    # becomes its support.
    assert len(top_level) == 1
    top = top_level[0]
    assert top.equals(gene_stmt)
    assert len(top.supported_by) == 1
    assert top.supported_by[0].equals(family_stmt)
def test_bound_condition_norefinement():
    """A bound condition alone does not make a statement a refinement.

    The GTP-bound NRAS statement lacks the residue and position carried by
    the unbound one, so both statements should remain top-level.
    """
    kinase = Agent('SRC', db_refs={'HGNC': '11283'})
    gtp = Agent('GTP', db_refs={'CHEBI': '15996'})
    nras_free = Agent('NRAS', db_refs={'HGNC': '7989'})
    nras_gtp_bound = Agent('NRAS', db_refs={'HGNC': '7989'},
                           bound_conditions=[BoundCondition(gtp, True)])
    with_site = Phosphorylation(kinase, nras_free, 'tyrosine', '32')
    with_bound = Phosphorylation(kinase, nras_gtp_bound)
    preassembler = Preassembler(bio_ontology, stmts=[with_site, with_bound])
    combined = preassembler.combine_related()
    # The second statement is more specific in its bound condition but
    # less specific in its modification, so no combination is expected.
    assert len(combined) == 2
def test_modification_norefinement_noenz():
    """Statements that are each more specific in one respect stay separate.

    One statement names the enzyme but no site; the other has no enzyme
    but gives residue and position.
    """
    kinase = Agent('SRC', db_refs={'HGNC': '11283'})
    substrate = Agent('NRAS', db_refs={'HGNC': '7989'})
    with_enzyme = Phosphorylation(kinase, substrate)
    with_site = Phosphorylation(None, substrate, 'Y', '32',
                                evidence=[Evidence(text='foo')])
    preassembler = Preassembler(hierarchies, stmts=[with_enzyme, with_site])
    combined = preassembler.combine_related()
    # The modification detail and the enzyme specificity point in opposite
    # directions, so these statements shouldn't be combined.
    assert len(combined) == 2
    assert len(combined[1].evidence) == 1
def run_preassembly(statements, hierarchies):
    """Filter, deduplicate and refine statements; return the top-level ones.

    Statements are first restricted to grounded ones (score threshold
    0.4). Prior beliefs are set on the unique statements and hierarchy
    beliefs on the full related set before filtering to top level.
    Progress counts are printed along the way.

    Parameters
    ----------
    statements :
        Statements to preassemble.
    hierarchies :
        Passed as the first argument to Preassembler.

    Returns
    -------
    The top-level statements after preassembly.
    """
    print('%d total statements' % len(statements))
    # Filter to grounded only
    grounded = ac.filter_grounded_only(statements, score_threshold=0.4)
    # Preassembler over the supplied hierarchies, plus a BeliefEngine
    preassembler = Preassembler(hierarchies, grounded)
    belief_engine = BeliefEngine()

    # Combine duplicates and assign prior probabilities
    deduplicated = preassembler.combine_duplicates()
    print('%d unique statements' % len(deduplicated))
    belief_engine.set_prior_probs(deduplicated)

    # Combine related; keep every statement so hierarchy beliefs can be set
    all_related = preassembler.combine_related(return_toplevel=False)
    belief_engine.set_hierarchy_probs(all_related)

    # Filter to top-level Statements
    top_level = ac.filter_top_level(all_related)
    print('%d top-level statements' % len(top_level))
    return top_level
def test_bound_condition_refinement():
    """A statement with a bound condition is supported by the one without.

    Both statements carry the same residue and position; only the
    GTP-bound NRAS differs.
    """
    kinase = Agent('SRC', db_refs={'HGNC': '11283'})
    gtp = Agent('GTP', db_refs={'CHEBI': '15996'})
    nras_free = Agent('NRAS', db_refs={'HGNC': '7989'})
    nras_gtp_bound = Agent('NRAS', db_refs={'HGNC': '7989'},
                           bound_conditions=[BoundCondition(gtp, True)])
    unbound_stmt = Phosphorylation(kinase, nras_free, 'tyrosine', '32')
    bound_stmt = Phosphorylation(kinase, nras_gtp_bound, 'tyrosine', '32')
    # Only the statement with the bound condition should be top-level,
    # with the unbound statement as its support.
    preassembler = Preassembler(hierarchies,
                                stmts=[unbound_stmt, bound_stmt])
    top_level = preassembler.combine_related()
    assert len(top_level) == 1
    top = top_level[0]
    assert top.equals(bound_stmt)
    assert len(top.supported_by) == 1
    assert top.supported_by[0].equals(unbound_stmt)
def test_flatten_evidence_hierarchy_supports():
    """Check flatten_evidence with ``collect_from='supports'``.

    With this setting the less specific statement is expected to end up
    with both pieces of evidence, while the more specific one keeps only
    its own.
    """
    braf = Agent('BRAF')
    mek = Agent('MAP2K1')
    generic = Phosphorylation(braf, mek, evidence=[Evidence(text='foo')])
    specific = Phosphorylation(braf, mek, 'S', '218',
                               evidence=[Evidence(text='bar')])
    preassembler = Preassembler(hierarchies, stmts=[generic, specific])
    all_stmts = preassembler.combine_related(return_toplevel=False)
    assert len(all_stmts) == 2

    flattened = flatten_evidence(all_stmts, collect_from='supports')
    assert len(flattened) == 2

    # The second flattened statement is the more specific one
    top_stmt = flattened[1]
    assert len(top_stmt.evidence) == 1
    top_texts = [ev.text for ev in top_stmt.evidence]
    assert 'bar' in top_texts
    assert len(top_stmt.supported_by) == 1

    supporting_stmt = top_stmt.supported_by[0]
    assert len(supporting_stmt.evidence) == 2
    supporting_texts = {ev.text for ev in supporting_stmt.evidence}
    assert supporting_texts == {'foo', 'bar'}
def run_preassembly(statements, hierarchies):
    """Map, filter and preassemble statements for the demo model.

    Two pickle files are written as a side effect:
    'pi_mtg_demo_unfiltered.pkl' (statements right after ``map_onto``)
    and 'pi_mtg_demo.pkl' (the returned statements).

    Parameters
    ----------
    statements : list
        Statements to process.
    hierarchies
        Hierarchies passed unchanged to the Preassembler constructor.

    Returns
    -------
    The top-level statements after ``remove_contradicts`` has been
    applied to them.
    """
    print('%d total statements' % len(statements))

    # Map the statements and keep a dump of the unfiltered set
    mapped = map_onto(statements)
    ac.dump_statements(mapped, 'pi_mtg_demo_unfiltered.pkl')

    # Filter to grounded only
    grounded = ac.filter_grounded_only(mapped, score_threshold=0.7)

    # Restrict to the concepts of interest in the 'UN' namespace.
    # NOTE: an earlier, now disabled, variant used only 'conflict',
    # 'food_security' and 'precipitation' with policy='one'.
    un_concepts = [
        'conflict', 'food_security', 'flooding', 'food_production',
        'human_migration', 'drought', 'food_availability', 'market',
        'food_insecurity'
    ]
    relevant = ac.filter_by_db_refs(relevant_source := grounded, 'UN',
                                    un_concepts, policy='all',
                                    match_suffix=True) \
        if False else ac.filter_by_db_refs(grounded, 'UN', un_concepts,
                                           policy='all', match_suffix=True)

    # assume_polarity is called for its effect on the statements; its
    # return value is not used
    assume_polarity(relevant)
    with_polarity = filter_has_polarity(relevant)

    preassembler = Preassembler(hierarchies, with_polarity)
    belief_engine = BeliefEngine()

    # De-duplicate, then assign prior beliefs to the unique statements
    deduplicated = preassembler.combine_duplicates()
    print('%d unique statements' % len(deduplicated))
    belief_engine.set_prior_probs(deduplicated)

    # Build the refinement hierarchy over all statements
    related = preassembler.combine_related(return_toplevel=False)
    belief_engine.set_hierarchy_probs(related)
    # NOTE: a belief filter at 0.8 on the related statements is
    # currently disabled here.

    # Filter to top-level statements and make them the preassembler's
    # working set so that contradictions are searched among them
    top_level = ac.filter_top_level(related)
    preassembler.stmts = top_level
    print('%d top-level statements' % len(top_level))

    conflicts = preassembler.find_contradicts()
    top_level = remove_contradicts(top_level, conflicts)
    ac.dump_statements(top_level, 'pi_mtg_demo.pkl')
    return top_level
def test_multiprocessing():
    """Check combine_related when a worker pool is requested."""
    braf = Agent('BRAF', db_refs={'HGNC': '1097'})
    mek1 = Agent('MAP2K1', db_refs={'HGNC': '6840'})
    mek = Agent('MEK', db_refs={'FPLX': 'MEK'})
    # Seven phosphorylations of increasing specificity plus one
    # dephosphorylation
    all_stmts = [
        Phosphorylation(braf, mek),
        Phosphorylation(braf, mek1),
        Phosphorylation(braf, mek1, position='218'),
        Phosphorylation(braf, mek1, position='222'),
        Phosphorylation(braf, mek1, 'serine'),
        Phosphorylation(braf, mek1, 'serine', '218'),
        Phosphorylation(braf, mek1, 'serine', '222'),
        Dephosphorylation(braf, mek1),
    ]
    preassembler = Preassembler(hierarchies, stmts=all_stmts)
    # Size cutoff set to a low number so that one group will run remotely
    # and one locally
    toplevel = preassembler.combine_related(return_toplevel=True,
                                            poolsize=1, size_cutoff=2)
    assert len(toplevel) == 3, 'Got %d toplevel statements.' % len(toplevel)
def test_influence_refinement():
    """An Influence whose subject is a more specific concept should be
    supported by the one whose subject is the more general concept."""
    tran = 'UN/entities/human/infrastructure/transportation'
    truck = ('UN/entities/human/infrastructure/transportation/'
             'transportation_methods')
    agr = 'UN/entities/human/livelihood'
    transport_ev = Event(Concept('transportation',
                                 db_refs={'UN': [(tran, 1.0)]}))
    trucking_ev = Event(Concept('trucking', db_refs={'UN': [(truck, 1.0)]}))
    agriculture_ev = Event(Concept('agriculture',
                                   db_refs={'UN': [(agr, 1.0)]}))
    general = Influence(transport_ev, agriculture_ev,
                        evidence=[Evidence(source_api='eidos1')])
    specific = Influence(trucking_ev, agriculture_ev,
                         evidence=[Evidence(source_api='eidos2')])
    reverse = Influence(agriculture_ev, transport_ev,
                        evidence=[Evidence(source_api='eidos3')])

    # Build the hierarchy from the ontology file located relative to
    # this module
    here = os.path.dirname(os.path.abspath(__file__))
    eidos_ont = os.path.join(here, '../sources/eidos/eidos_ontology.rdf')
    manager = HierarchyManager(eidos_ont, True, True)
    eidos_hierarchies = {'entity': manager}

    preassembler = Preassembler(eidos_hierarchies,
                                [general, specific, reverse])
    rel_stmts = preassembler.combine_related()
    assert len(rel_stmts) == 2

    trucking_stmts = [st for st in rel_stmts
                      if st.subj.concept.name == 'trucking']
    truck_stmt = trucking_stmts[0]
    assert len(truck_stmt.supported_by) == 1
    supporter = truck_stmt.supported_by[0]
    assert supporter.subj.concept.name == 'transportation'
def test_modification_refinement_noenz2():
    """A modification with a named enzyme should be supported by the same
    modification without an enzyme.

    Similar to test_modification_refinement_noenz, but for statements
    where one argument is associated with a component in the hierarchy
    (SIRT1 in this case) while the other is not (BECN1).
    """
    sirt1 = Agent('SIRT1',
                  db_refs={'HGNC': '14929', 'UP': 'Q96EB6',
                           'TEXT': 'SIRT1'})
    becn1 = Agent('BECN1',
                  db_refs={'HGNC': '1034', 'UP': 'Q14457',
                           'TEXT': 'Beclin 1'})
    with_enzyme = Deacetylation(sirt1, becn1)
    without_enzyme = Deacetylation(None, becn1)
    preassembler = Preassembler(hierarchies,
                                stmts=[with_enzyme, without_enzyme])
    top_level = preassembler.combine_related()

    # Only the more specific statement should be top-level, supported by
    # the less specific one, which in turn points back at it.
    assert len(top_level) == 1
    top = top_level[0]
    assert top.equals(with_enzyme)
    assert len(top.supported_by) == 1
    supporter = top.supported_by[0]
    assert supporter.equals(without_enzyme)
    assert supporter.supports[0].equals(with_enzyme)
def run_assembly(stmts, folder, pmcid, background_assertions=None):
    """Run assembly on a list of statements, for a given PMCID.

    The statements are grounding-mapped, filtered, preassembled twice
    (before and after adding linked statements), and the resulting
    top-level statements are written out as index cards, a pickle, a
    statement graph and TSV diagnostics. Progress is printed to stdout.

    Parameters
    ----------
    stmts : list
        Statements to assemble.
    folder : str
        Base output folder. Files are written under
        ``<folder>/index_cards/`` and ``<folder>/other_outputs/``; this
        function does not create those directories.
    pmcid : str
        Identifier used as the file name prefix for every output and
        passed to the IndexCardAssembler as ``pmc_override``.
    background_assertions : optional
        If not None, top-level statements contained in it (``in`` test)
        are dropped before output.

    Returns
    -------
    None
    """
    # Path prefix for index card output (scored submission)
    indexcard_prefix = folder + '/index_cards/' + pmcid
    # Path prefix for other outputs (for analysis, debugging)
    otherout_prefix = folder + '/other_outputs/' + pmcid

    # Do grounding mapping here
    # Load the TRIPS-specific grounding map and add to the default
    # (REACH-oriented) grounding map:
    trips_gm = load_grounding_map('trips_grounding_map.csv')
    # NOTE(review): this updates default_grounding_map in place, so the
    # change presumably persists beyond this call -- confirm intended.
    default_grounding_map.update(trips_gm)
    gm = GroundingMapper(default_grounding_map)

    mapped_agent_stmts = gm.map_agents(stmts)
    renamed_agent_stmts = gm.rename_agents(mapped_agent_stmts)

    # Filter for grounding: keep a statement only if every agent in its
    # agent list passes is_protein_or_chemical
    grounded_stmts = []
    for st in renamed_agent_stmts:
        if all([is_protein_or_chemical(a) for a in st.agent_list()]):
            grounded_stmts.append(st)

    # Instantiate the Preassembler
    pa = Preassembler(hierarchies)
    pa.add_statements(grounded_stmts)
    print('== %s ====================' % pmcid)
    print('%d statements collected in total.' % len(pa.stmts))

    # Combine duplicates
    unique_stmts = pa.combine_duplicates()
    print('%d statements after combining duplicates.' % len(unique_stmts))

    # Run BeliefEngine on unique statements
    epe = BeliefEngine()
    epe.set_prior_probs(pa.unique_stmts)

    # Build statement hierarchy
    related_stmts = pa.combine_related()
    # Run BeliefEngine on hierarchy
    epe.set_hierarchy_probs(related_stmts)
    print('%d statements after combining related.' %
          len(related_stmts))

    # Link statements: collect the results of the three MechLinker
    # inference steps into a single list
    linked_stmts = MechLinker.infer_active_forms(related_stmts)
    linked_stmts += MechLinker.infer_modifications(related_stmts)
    linked_stmts += MechLinker.infer_activations(related_stmts)
    # Run BeliefEngine on linked statements
    epe.set_linked_probs(linked_stmts)
    # Print linked statements for debugging purposes
    print('Linked\n=====')
    for ls in linked_stmts:
        print(ls.inferred_stmt.belief, ls.inferred_stmt)
    print('=============')

    # Combine all statements including linked ones
    all_statements = related_stmts + [ls.inferred_stmt for ls in linked_stmts]

    # Instantiate a new preassembler (rebinds pa; the diagnostics at the
    # end of the function therefore refer to this second instance)
    pa = Preassembler(hierarchies, all_statements)
    # Build hierarchy again
    pa.combine_duplicates()
    # Choose the top-level statements
    related_stmts = pa.combine_related()

    # Remove top-level statements that came only from the prior
    if background_assertions is not None:
        nonbg_stmts = [stmt for stmt in related_stmts
                       if stmt not in background_assertions]
    else:
        nonbg_stmts = related_stmts

    # Dump top-level statements in a pickle
    with open(otherout_prefix + '.pkl', 'wb') as fh:
        pickle.dump(nonbg_stmts, fh)

    # Flatten evidence for statements
    flattened_evidence_stmts = flatten_evidence(nonbg_stmts)

    # Start a card counter; it numbers the index card files from 1
    card_counter = 1
    # We don't limit the number of cards reported in this round
    card_lim = float('inf')
    # Statements for which an index card was actually produced
    top_stmts = []
    ###############################################
    # The belief cutoff for statements
    belief_cutoff = 0.3
    ###############################################
    # Process statements in order of decreasing belief
    for st in sorted(flattened_evidence_stmts,
                     key=lambda x: x.belief, reverse=True):
        if st.belief >= belief_cutoff:
            print(st.belief, st)
        # NOTE(review): statements below the cutoff are only reported
        # with a 'SKIP' prefix here; there is no `continue`, so they are
        # still processed below -- confirm whether that is intended.
        if st.belief < belief_cutoff:
            print('SKIP', st.belief, st)

        # If it's background knowledge, we skip the statement
        if is_background_knowledge(st):
            print('This statement is background knowledge - skipping.')
            continue

        # Assemble IndexCards
        ia = IndexCardAssembler([st], pmc_override=pmcid)
        ia.make_model()
        # If the index card was actually made
        # (not all statements can be assembled into index cards, so
        # this is often not the case)
        if ia.cards:
            # Save the index card json
            ia.save_model(indexcard_prefix + '-%d.json' % card_counter)
            card_counter += 1
            top_stmts.append(st)
            # With card_lim set to infinity this never triggers
            if card_counter > card_lim:
                break

    # Print the English-assembled model for debugging purposes
    ea = EnglishAssembler(top_stmts)
    print('=======================')
    print(ea.make_model().encode('utf-8'))
    print('=======================')

    # Draw the statement graph to a PDF file
    graph = render_stmt_graph(nonbg_stmts)
    graph.draw(otherout_prefix + '_graph.pdf', prog='dot')
    # Write statement diagnostics to TSV files
    print_stmts(pa.stmts, otherout_prefix + '_statements.tsv')
    print_stmts(related_stmts, otherout_prefix + '_related_statements.tsv')
def run_preassembly(self, stmts, print_summary=True):
    """Run complete preassembly procedure on the given statements.

    Results are returned as a dict and stored in the attribute
    :py:attr:`results`. If :py:attr:`basename` is not None, they are
    also saved in the pickle file `<basename>_results.pkl`.

    Parameters
    ----------
    stmts : list of :py:class:`indra.statements.Statement`
        Statements to preassemble.
    print_summary : bool
        If True (default), logs a summary of the preassembly process
        at INFO level.

    Returns
    -------
    dict
        A dict containing the following entries:

        - `raw`: the starting set of statements before preassembly.
        - `duplicates1`: statements after initial de-duplication.
        - `valid`: statements found to have valid modification sites.
        - `mapped`: mapped statements (list of
          :py:class:`indra.preassembler.sitemapper.MappedStatement`).
        - `mapped_stmts`: combined list of valid statements and
          statements after mapping.
        - `duplicates2`: statements resulting from de-duplication of the
          statements in `mapped_stmts`.
        - `related2`: top-level statements after combining the statements
          in `duplicates2`.
    """
    # First round of preassembly: remove duplicates before sitemapping
    pa1 = Preassembler(hierarchies, stmts)
    logger.info("Combining duplicates")
    pa1.combine_duplicates()

    # Map sites
    logger.info("Mapping sites")
    (valid, mapped) = sm.map_sites(pa1.unique_stmts)

    # Combine valid and successfully mapped statements into single list.
    # A mapped statement counts as successful only if the second element
    # of every entry in its mapped_mods is not None.
    correctly_mapped_stmts = [
        ms.mapped_stmt for ms in mapped
        if all(mm[1] is not None for mm in ms.mapped_mods)
    ]
    mapped_stmts = valid + correctly_mapped_stmts

    # Second round of preassembly: de-duplicate and combine related
    pa2 = Preassembler(hierarchies, mapped_stmts)
    logger.info("Combining duplicates again")
    pa2.combine_duplicates()
    pa2.combine_related()

    # Fill out the results dict
    self.results = {}
    self.results['raw'] = stmts
    self.results['duplicates1'] = pa1.unique_stmts
    self.results['valid'] = valid
    self.results['mapped'] = mapped
    self.results['mapped_stmts'] = mapped_stmts
    self.results['duplicates2'] = pa2.unique_stmts
    self.results['related2'] = pa2.related_stmts

    # Log summary; arguments are passed to the logger so that the
    # message is only formatted if the record is actually emitted
    if print_summary:
        logger.info("\nStarting number of statements: %d", len(stmts))
        logger.info("After duplicate removal: %d", len(pa1.unique_stmts))
        logger.info("Unique statements with valid sites: %d", len(valid))
        logger.info("Unique statements with invalid sites: %d",
                    len(mapped))
        logger.info("After post-mapping duplicate removal: %d",
                    len(pa2.unique_stmts))
        logger.info("After combining related statements: %d",
                    len(pa2.related_stmts))

    # Save the results if we're caching
    if self.basename is not None:
        results_filename = '%s_results.pkl' % self.basename
        # Protocol 2 is kept as in the original for pickle compatibility
        with open(results_filename, 'wb') as f:
            pickle.dump(self.results, f, protocol=2)

    return self.results