def test_bound_condition_refinement():
    """Check refinement on bound conditions during preassembly.

    Two phosphorylations of NRAS by SRC are built that differ only in that
    one substrate carries a GTP bound condition. After combining related
    statements, only the bound-condition statement should remain at the top
    level, with the plain statement listed as its support.
    """
    kinase = Agent('SRC', db_refs={'HGNC': '11283'})
    nucleotide = Agent('GTP', db_refs={'CHEBI': '15996'})
    plain_nras = Agent('NRAS', db_refs={'HGNC': '7989'})
    bound_nras = Agent(
        'NRAS',
        db_refs={'HGNC': '7989'},
        bound_conditions=[BoundCondition(nucleotide, True)],
    )
    general = Phosphorylation(kinase, plain_nras, 'tyrosine', '32')
    specific = Phosphorylation(kinase, bound_nras, 'tyrosine', '32')

    # Only the more specific statement is expected at the top level; the
    # less specific one should show up in its supported_by list.
    preassembler = Preassembler(hierarchies, stmts=[general, specific])
    top_level = preassembler.combine_related()
    assert len(top_level) == 1
    assert top_level[0].equals(specific)
    assert len(top_level[0].supported_by) == 1
    assert top_level[0].supported_by[0].equals(general)
def test_save_sentences_unicode():
    """Check sentence extraction and export with non-BMP evidence text.

    Builds a statement whose evidence text contains the code point
    U+1F4A9, verifies that the sentences collected for the agent text pass
    ``unicode_strs``, and then exercises ``save_sentences``.

    The CSV is written into a throw-away temporary directory so that the
    test does not leave ``test_save_sentences.csv`` behind in the current
    working directory.
    """
    import os
    import shutil
    import tempfile

    mek = Agent('MEK', db_refs={'TEXT': 'MAP2K1'})
    ev = Evidence(source_api='reach', pmid='PMID000asdf',
                  text='foo\U0001F4A9bar')
    st = Phosphorylation(None, mek, evidence=[ev])
    sent = get_sentences_for_agent('MAP2K1', [st])
    assert unicode_strs(sent)
    twg = agent_texts_with_grounding([st])

    out_dir = tempfile.mkdtemp()
    try:
        save_sentences(twg, [st],
                       os.path.join(out_dir, 'test_save_sentences.csv'))
    finally:
        # Always remove the scratch directory, even if saving fails.
        shutil.rmtree(out_dir, ignore_errors=True)
def test_in_place_overwrite_of_gm():
    """Ensure mapping agents does not add keys to the grounding map entry.

    The map entry for 'ERK1' starts with only TEXT and UP; after running
    the mapper, the entry stored on the mapper must still have exactly
    those two keys.
    """
    agent = Agent('ERK1', db_refs={'TEXT': 'ERK1'})
    statement = Phosphorylation(None, agent)
    mapper = GroundingMapper({'ERK1': {'TEXT': 'ERK1', 'UP': 'P28482'}})
    # The return value is irrelevant here; only the side effect on the
    # mapper's own map is being checked.
    mapper.map_agents([statement])
    entry_keys = set(mapper.gm['ERK1'].keys())
    assert entry_keys == {'TEXT', 'UP'}
def test_simple_mapping():
    """Map an agent with TEXT 'Akt' through the default grounding map.

    Exactly one statement should come back; its substrate must keep the
    original TEXT entry and carry a BE grounding of 'AKT'.
    """
    akt = Agent('pkbA', db_refs={'TEXT': 'Akt', 'UP': 'XXXXXX'})
    stmt = Phosphorylation(None, akt)
    gm = GroundingMapper(default_grounding_map)
    results = gm.map_agents([stmt])
    assert len(results) == 1
    mapped_akt = results[0].sub
    refs = mapped_akt.db_refs
    assert refs['TEXT'] == 'Akt'
    assert refs['BE'] == 'AKT'
    assert unicode_strs((akt, stmt, gm, mapped_akt))
def test_up_and_mismatched_hgnc():
    """Check mapping when the map entry has both a UP and an HGNC ID.

    The grounding map entry pairs UP 'P28482' with HGNC '6877'. After
    mapping, the substrate is expected to keep HGNC '6877' and to carry
    UP 'P27361' rather than the UP ID given in the map.
    """
    agent = Agent('ERK1', db_refs={'TEXT': 'ERK1'})
    statement = Phosphorylation(None, agent)
    mapper = GroundingMapper(
        {'ERK1': {'TEXT': 'ERK1', 'UP': 'P28482', 'HGNC': '6877'}})
    mapped = mapper.map_stmts([statement])
    refs = mapped[0].sub.db_refs
    assert refs['HGNC'] == '6877', refs
    assert refs['UP'] == 'P27361', refs
def test_up_with_no_gene_name_with_hgnc_sym():
    """Check mapping when the map pairs UP 'A0K5Q6' with HGNC '6871'.

    After mapping, the substrate is expected to keep HGNC '6871' and to
    carry UP 'P28482' rather than the UP ID given in the map.
    """
    agent = Agent('ERK1', db_refs={'TEXT': 'ERK1'})
    statement = Phosphorylation(None, agent)
    mapper = GroundingMapper(
        {'ERK1': {'TEXT': 'ERK1', 'UP': 'A0K5Q6', 'HGNC': '6871'}})
    mapped = mapper.map_stmts([statement])
    refs = mapped[0].sub.db_refs
    assert refs['HGNC'] == '6871', refs
    assert refs['UP'] == 'P28482', refs
def test_map_standardize_up_hgnc():
    """Check that HGNC-only and UP-only agents end up with equal groundings.

    The enzyme starts with only HGNC '6871' and the substrate with only
    UP 'P28482'. After mapping, both agents must agree on their HGNC and
    UP entries. ``gm`` is not defined here; it is taken from module scope.
    """
    hgnc_only = Agent('MAPK1', db_refs={'HGNC': '6871'})
    up_only = Agent('MAPK1', db_refs={'UP': 'P28482'})
    results = gm.map_stmts([Phosphorylation(hgnc_only, up_only)])
    assert len(results) == 1
    mapped = results[0]
    enz_refs = mapped.enz.db_refs
    sub_refs = mapped.sub.db_refs
    assert enz_refs['HGNC'] == sub_refs['HGNC'], (enz_refs, sub_refs)
    assert enz_refs['UP'] == sub_refs['UP']
def get_statements(self):
    """Build statements from the relations stored in ``self._relations``.

    Each relation's arguments are turned into a role -> entity id mapping.
    The 'TRIGGER' and optional 'SITE' arguments are set aside and every
    remaining argument is resolved with ``self._get_agent``. Only
    relations of type 'PHOSPHORYLATION' produce a statement; any other
    type is reported with a warning and skipped.

    Returns
    -------
    list
        The Phosphorylation statements that could be built. Relations
        without a 'SUBSTRATE' agent contribute nothing.

    Raises
    ------
    KeyError
        If a relation has no 'TRIGGER' argument.
    """
    stmts = []
    # Only the relation payloads are needed; the keys are not used.
    for rel_info in self._relations.values():
        # Turn the arguments into a dict.
        args = {e['role']: e['entity_duid'] for e in rel_info['argument']}
        entity_args = args.copy()

        # Remove some special cases.
        trigger_id = entity_args.pop('TRIGGER')
        site_id = entity_args.pop('SITE', None)

        # Get the entity ids.
        entities = {
            role: self._get_agent(eid)
            for role, eid in entity_args.items()
        }

        rel_type = rel_info['relationType']
        if rel_type != 'PHOSPHORYLATION':
            # Pass the argument separately so formatting is deferred to
            # the logging framework.
            logger.warning("Unhandled statement type: %s", rel_type)
            continue

        # Get the agents.
        enz, enz_coords = entities.get('KINASE', (None, None))
        sub, sub_coords = entities.get('SUBSTRATE', (None, None))
        if sub is None:
            continue

        # Get the site
        residue, position, site_coords = self._get_site(site_id)

        # Get the evidence
        ev = self._get_evidence(trigger_id, args,
                                [enz_coords, sub_coords], site_coords)

        # Turn taxonomy into context, sub TAX takes precedence. The TAX
        # entry is removed from both agents' db_refs either way.
        tax = None
        if enz and 'TAX' in enz.db_refs:
            tax = enz.db_refs.pop('TAX')
        if sub and 'TAX' in sub.db_refs:
            tax = sub.db_refs.pop('TAX')
        if tax is not None:
            ev.context = \
                BioContext(species=RefContext(tax, {'TAXONOMY': tax}))

        stmts.append(
            Phosphorylation(enz, sub, residue=residue, position=position,
                            evidence=[ev]))
    return stmts
def test_ground_gilda():
    """Ground a MEK -> Erk1 phosphorylation in 'web' and 'local' modes.

    The assertions read the statement that was passed in, so grounding is
    expected to update its agents in place: the enzyme becomes FPLX 'MEK'
    and the substrate becomes MAPK3 with HGNC '6877' and UP 'P27361'.
    """
    for gilda_mode in ('web', 'local'):
        enzyme = Agent('Mek', db_refs={'TEXT': 'MEK'})
        substrate = Agent('Erk1', db_refs={'TEXT': 'Erk1'})
        statement = Phosphorylation(enzyme, substrate)
        ground_statements([statement], mode=gilda_mode)

        assert statement.enz.name == 'MEK', statement.enz
        assert statement.enz.db_refs['FPLX'] == 'MEK'

        assert statement.sub.name == 'MAPK3'
        assert statement.sub.db_refs['HGNC'] == '6877'
        assert statement.sub.db_refs['UP'] == 'P27361'
def test_get_statement_queries():
    """Check the agent keys embedded in generated statement query URLs.

    Two agents are tried: one without any grounding and one with an FPLX
    grounding. For each, the query is generated with default settings,
    with ``fallback_ns='TEXT'`` and with a custom ``pick_ns_fun``, and the
    expected ``<id>@<namespace>`` fragment is looked up in the first URL.
    """
    # Agent with no grounding: the fallback name space is used.
    ag = Agent('MAP2K1', db_refs={})
    stmt = Phosphorylation(None, ag)
    urls = get_statement_queries([stmt])
    assert 'MAP2K1@NAME' in urls[0]
    urls = get_statement_queries([stmt], fallback_ns='TEXT')
    assert 'MAP2K1@TEXT' in urls[0]
    urls = get_statement_queries([stmt],
                                 pick_ns_fun=lambda x: '%s@%s' %
                                 (x.name, 'XXX'))
    assert 'MAP2K1@XXX' in urls[0], urls[0]

    # Agent with FPLX grounding: the grounding is used regardless of the
    # fallback name space.
    ag = Agent('MEK', db_refs={'FPLX': 'MEK'})
    stmt = Phosphorylation(None, ag)
    urls = get_statement_queries([stmt])
    assert 'MEK@FPLX' in urls[0]
    urls = get_statement_queries([stmt], fallback_ns='TEXT')
    assert 'MEK@FPLX' in urls[0]
    urls = get_statement_queries([stmt],
                                 pick_ns_fun=lambda x: '%s@%s' %
                                 (x.name, 'XXX'))
    # This result was previously computed but never checked; assert it the
    # same way as the ungrounded case above.
    assert 'MEK@XXX' in urls[0], urls[0]
def test_flatten_evidence_multilevel():
    """Flatten evidence across a three-level refinement chain.

    Three BRAF -> MAP2K1 phosphorylations of increasing specificity (no
    site, residue only, residue and position) are preassembled. The single
    top-level statement should end up with all three evidences, one
    annotated as 'direct' and two as 'supported_by'.
    """
    braf = Agent('BRAF')
    mek = Agent('MAP2K1')
    no_site = Phosphorylation(braf, mek, evidence=[Evidence(text='foo')])
    residue_only = Phosphorylation(braf, mek, 'S',
                                   evidence=[Evidence(text='bar')])
    full_site = Phosphorylation(braf, mek, 'S', '218',
                                evidence=[Evidence(text='baz')])
    preassembler = Preassembler(
        hierarchies, stmts=[no_site, residue_only, full_site])
    preassembler.combine_related()
    assert len(preassembler.related_stmts) == 1

    flat = flatten_evidence(preassembler.related_stmts)
    assert len(flat) == 1
    evidences = flat[0].evidence
    assert len(evidences) == 3, len(evidences)

    support_types = [ev.annotations['support_type'] for ev in evidences]
    assert support_types.count('direct') == 1
    assert support_types.count('supported_by') == 2
def process_phosphorylations(self, skip_empty=True):
    """Create Phosphorylation statements from phosphoelm_data

    Every entry in ``self._phosphoelm_data`` that refers to a human
    protein is turned into a Phosphorylation statement and appended to
    ``self.statements``.

    Parameters
    ----------
    skip_empty : bool
        Default: True. If False, also create statements when upstream
        kinases in entry['kinases'] are not known.
    """
    for entry in self._phosphoelm_data:
        # The species name is lower-cased before the comparison, so the
        # literal has to be the plain lower-case Latin name.
        not_human = entry['species'].lower() != 'homo sapiens'
        if not_human or (skip_empty and not entry['kinases']):
            # Skip entries without any kinases or if species is other
            # than human.
            continue
        # Entries:
        # 'acc': '<UP ID>', <-- substrate
        # 'sequence': '<protein sequence>',
        # 'position': '<sequence position>',
        # 'code': '<phosphorylated residue>',
        # 'pmids': '<pmid>',
        # 'kinases': '<responsible kinase>', <-- enzyme
        # 'source': 'HTP|LTP',
        # 'species': '<species name in latin>',
        # 'entry_date': 'yyyy-mm-dd HH:MM:SS.mmmmmm'
        substrate = _agent_from_id(entry['acc'])
        enzyme = _agent_from_str(entry['kinases'])

        # Skip if enz is None instead of an Agent (only when we skip
        # empty kinase entries)
        if skip_empty and enzyme is None:
            continue

        # Drop the PMID when it does not validate as a text reference.
        pmid = entry['pmids']
        if not validate_text_refs({'PMID': pmid}):
            pmid = None

        # Build evidence, add statement
        evidence = Evidence(
            source_api='phosphoelm',
            pmid=pmid,
            annotations={
                'data_source': entry.get('source'),
                'phosphoelm_substrate_id': entry['acc'],
                'phosphoelm_kinase_name': entry.get('kinases'),
                'entry_date': entry['entry_date'],
                'sequence': entry['sequence']
            })
        self.statements.append(
            Phosphorylation(enz=enzyme, sub=substrate,
                            residue=entry['code'],
                            position=entry['position'],
                            evidence=evidence))
def make_test_statements(a, b, source_api, ev_num=None, copies=1):
    """Return `copies` Phosphorylation statements of `b` by `a`.

    Parameters
    ----------
    a : str
        Name used for the enzyme agent and in the evidence text.
    b : str
        Name used for the substrate agent and in the evidence text.
    source_api : str
        Value passed as the evidence's source_api.
    ev_num : int or None
        Number embedded in the evidence text. If None, each copy uses its
        own index (0, 1, 2, ...) so that the copies get distinct evidence
        texts; otherwise every copy uses the given number.
    copies : int
        Number of statements to create.

    Returns
    -------
    list
        The generated statements, each with a single evidence.
    """
    A = Agent(a)
    B = Agent(b)
    stmts = []
    for i in range(copies):
        # Decide the number per copy without overwriting the parameter;
        # assigning to ev_num here would pin every later copy to 0.
        num = i if ev_num is None else ev_num
        ev_text = "Evidence %d for %s phosphorylates %s." % (num, a, b)
        ev_list = [Evidence(text=ev_text, source_api=source_api)]
        # NOTE(review): Agent(A) passes an existing Agent as the first
        # argument; presumably meant to give each statement its own agent
        # objects -- confirm against the Agent constructor.
        stmts.append(Phosphorylation(Agent(A), Agent(B), evidence=ev_list))
    return stmts
def test_map_standardize_chebi_hmdb():
    """Check that HMDB-only and CHEBI-only agents map to the same CHEBI ID.

    The enzyme starts with HMDB 'HMDB0000122' and the substrate with
    CHEBI 'CHEBI:15903'; after mapping both must share a CHEBI entry and be
    named 'beta-D-glucose'. ``gm`` is taken from module scope.
    """
    # NOTE(review): a second function with this exact name is defined later
    # in this file; the later definition rebinds the name, so confirm which
    # of the two is meant to run.
    hmdb_agent = Agent('X', db_refs={'HMDB': 'HMDB0000122'})
    chebi_agent = Agent('Y', db_refs={'CHEBI': 'CHEBI:15903'})
    results = gm.map_stmts([Phosphorylation(hmdb_agent, chebi_agent)])
    assert len(results) == 1
    mapped = results[0]
    enz_refs = mapped.enz.db_refs
    sub_refs = mapped.sub.db_refs
    assert enz_refs['CHEBI'] == sub_refs['CHEBI'], (enz_refs, sub_refs)
    assert mapped.enz.name == 'beta-D-glucose', mapped.enz
    assert mapped.sub.name == 'beta-D-glucose', mapped.sub
def test_flatten_evidence_hierarchy_supports():
    """Flatten evidence in the 'supports' direction.

    A site-less and a site-specific BRAF -> MAP2K1 phosphorylation are
    preassembled with all statements returned. With
    ``collect_from='supports'``, the second flattened statement keeps only
    its own evidence ('bar'), while the statement it is supported by
    collects both 'foo' and 'bar'.
    """
    braf = Agent('BRAF')
    mek = Agent('MAP2K1')
    generic = Phosphorylation(braf, mek, evidence=[Evidence(text='foo')])
    specific = Phosphorylation(braf, mek, 'S', '218',
                               evidence=[Evidence(text='bar')])
    preassembler = Preassembler(hierarchies, stmts=[generic, specific])
    all_stmts = preassembler.combine_related(return_toplevel=False)
    assert len(all_stmts) == 2

    flat = flatten_evidence(all_stmts, collect_from='supports')
    assert len(flat) == 2
    top = flat[1]
    assert len(top.evidence) == 1
    assert 'bar' in [ev.text for ev in top.evidence]
    assert len(top.supported_by) == 1

    supporter = top.supported_by[0]
    assert len(supporter.evidence) == 2
    supporter_texts = {ev.text for ev in supporter.evidence}
    assert supporter_texts == {'foo', 'bar'}
def test_map_standardize_chebi_hmdb():
    """Check that HMDB-only and CHEBI-only agents map to the same CHEBI ID.

    Uses a mapper built from the default grounding map. The enzyme starts
    with HMDB 'HMDB0000122' and the substrate with CHEBI 'CHEBI:4167';
    after mapping both must share a CHEBI entry and be named
    'D-glucopyranose'.
    """
    hmdb_agent = Agent('X', db_refs={'HMDB': 'HMDB0000122'})
    chebi_agent = Agent('Y', db_refs={'CHEBI': 'CHEBI:4167'})
    mapper = GroundingMapper(default_grounding_map)
    statement = Phosphorylation(hmdb_agent, chebi_agent)
    results = mapper.map_agents([statement])
    assert len(results) == 1
    mapped = results[0]
    enz_refs = mapped.enz.db_refs
    sub_refs = mapped.sub.db_refs
    assert enz_refs['CHEBI'] == sub_refs['CHEBI'], (enz_refs, sub_refs)
    assert mapped.enz.name == 'D-glucopyranose', mapped.enz
    assert mapped.sub.name == 'D-glucopyranose', mapped.sub
def test_hgnc_but_not_up():
    """Check mapping when the map entry has an HGNC ID but no UP ID.

    With HGNC '6871' in the map, the mapped substrate is expected to be
    named 'MAPK1', keep its TEXT and HGNC entries, and gain UP 'P28482'.
    """
    agent = Agent('ERK1', db_refs={'TEXT': 'ERK1'})
    statement = Phosphorylation(None, agent)
    mapper = GroundingMapper({'ERK1': {'TEXT': 'ERK1', 'HGNC': '6871'}})
    results = mapper.map_stmts([statement])
    assert len(results) == 1
    mapped_erk = results[0].sub
    assert mapped_erk.name == 'MAPK1'
    refs = mapped_erk.db_refs
    assert refs['TEXT'] == 'ERK1'
    assert refs['HGNC'] == '6871'
    assert refs['UP'] == 'P28482'
def test_renaming():
    """Check which grounding drives the agent name after renaming.

    Four substrates are renamed through the module-level ``gm``: one with
    FPLX and UP groundings, one with UP only, one with TEXT only, and one
    with a UP ID for which the expected name is 'tat'.
    """
    akt_fplx = Agent('pkbA', db_refs={'TEXT': 'Akt', 'FPLX': 'AKT family',
                                      'UP': 'P31749'})
    akt_up_only = Agent('pkbA', db_refs={'TEXT': 'Akt', 'UP': 'P31749'})
    akt_text_only = Agent('pkbA', db_refs={'TEXT': 'Akt'})
    tat_no_hgnc = Agent('foo', db_refs={'TEXT': 'bar', 'UP': 'P04608'})
    agents = (akt_fplx, akt_up_only, akt_text_only, tat_no_hgnc)

    stmts = [Phosphorylation(None, agent) for agent in agents]
    renamed_stmts = gm.rename_agents(stmts)
    assert len(renamed_stmts) == 4
    first, second, third, fourth = renamed_stmts

    # The family-level grounding is used first.
    assert first.sub.name == 'AKT family'
    # Next comes the HGNC name looked up from the UniProt ID.
    assert second.sub.name == 'AKT1', second.sub.name
    # Without any grounding the name is left alone (no fallback to TEXT).
    assert third.sub.name == 'pkbA'
    assert fourth.sub.name == 'tat'
    assert unicode_strs(agents + (stmts, gm, renamed_stmts))
def test_text_and_norm_text():
    """Check that both TEXT and TEXT_NORM are consulted during mapping.

    Covers three behaviours, each tried with the relevant string in TEXT
    and then in TEXT_NORM: ignored strings drop the statement, ambiguous
    strings are disambiguated using the evidence text, and plain strings
    are grounded. Sets ``gilda_mode`` on the module-level ``gm`` to
    'local' and does not restore it.
    """
    gm.gilda_mode = 'local'

    def map_single(db_refs, ev_text=None):
        # Wrap one agent in a Phosphorylation (optionally with evidence
        # text) and run it through the mapper.
        agent = Agent('x', db_refs=db_refs)
        if ev_text is None:
            stmt = Phosphorylation(None, agent)
        else:
            stmt = Phosphorylation(None, agent,
                                   evidence=Evidence(text=ev_text))
        return gm.map_stmts([stmt])

    # We should filter out ignores in both TEXT and TEXT_NORM
    ignored_cases = (
        {'TEXT': 'XREF_BIBR', 'TEXT_NORM': 'ERK'},
        {'TEXT': 'ERK', 'TEXT_NORM': 'XREF_BIBR'},
    )
    for refs in ignored_cases:
        res = map_single(refs)
        assert not res

    # We should disambiguate based on both TEXT and TEXT_NORM
    ambiguous_cases = (
        {'TEXT': 'AA', 'TEXT_NORM': 'XXX'},
        {'TEXT': 'XXX', 'TEXT_NORM': 'AA'},
    )
    for refs in ambiguous_cases:
        res = map_single(refs, 'Arachidonic acid (AA)')
        assert res[0].sub.name == 'arachidonic acid', res[0]

    # Plain grounding should work from either key as well.
    grounded_cases = (
        {'TEXT': 'XXX', 'TEXT_NORM': 'ERK'},
        {'TEXT': 'ERK', 'TEXT_NORM': 'XXX'},
    )
    for refs in grounded_cases:
        res = map_single(refs)
        assert res[0].sub.name == 'ERK', res[0]
def test_flatten_evidence_hierarchy():
    """Flatten evidence from a supporting statement into the top level.

    A site-less and a site-specific BRAF -> MAP2K1 phosphorylation are
    preassembled. The single top-level statement should collect both
    evidence texts, while the statement supporting it keeps only its own.
    """
    braf = Agent('BRAF')
    mek = Agent('MAP2K1')
    generic = Phosphorylation(braf, mek, evidence=[Evidence(text='foo')])
    specific = Phosphorylation(braf, mek, 'S', '218',
                               evidence=[Evidence(text='bar')])
    preassembler = Preassembler(hierarchies, stmts=[generic, specific])
    preassembler.combine_related()
    assert len(preassembler.related_stmts) == 1

    flat = flatten_evidence(preassembler.related_stmts)
    assert len(flat) == 1
    top = flat[0]
    assert len(top.evidence) == 2
    assert 'bar' in [ev.text for ev in top.evidence]
    assert 'foo' in [ev.text for ev in top.evidence]
    assert len(top.supported_by) == 1

    supporter = top.supported_by[0]
    assert len(supporter.evidence) == 1
    assert supporter.evidence[0].text == 'foo'
def test_gilda_ground_ungrounded():
    """Check the effect of ``ungrounded_only`` on grounding.

    An agent that already has an FPLX grounding keeps its name when
    ``ungrounded_only=True`` and is renamed when it is False. Agents with
    only TEXT, or with TEXT plus an 'XXXXX' entry, are renamed to 'RAS'
    even with ``ungrounded_only=True``.
    """
    grounded = Agent('x', db_refs={'TEXT': 'RAS', 'FPLX': 'RAS'})
    text_only = Agent('x', db_refs={'TEXT': 'RAS'})
    unknown_ns = Agent('x', db_refs={'TEXT': 'RAS', 'XXXXX': 'XXXX'})
    grounded_stmt, text_only_stmt, unknown_ns_stmt = [
        Phosphorylation(None, agent)
        for agent in (grounded, text_only, unknown_ns)
    ]

    ground_statement(grounded_stmt, ungrounded_only=True)
    assert grounded.name == 'x'
    ground_statement(grounded_stmt, ungrounded_only=False)
    assert grounded.name == 'RAS', grounded

    ground_statement(text_only_stmt, ungrounded_only=True)
    assert text_only.name == 'RAS'

    results = ground_statements([unknown_ns_stmt], ungrounded_only=True)
    assert results[0].sub.name == 'RAS'
def test_map_standardize_chebi_pc():
    """Check that PUBCHEM-only and CHEBI-only agents are standardized alike.

    The enzyme starts with PUBCHEM '42611257' and the substrate with
    CHEBI 'CHEBI:63637'; after mapping both must agree on PUBCHEM and
    CHEBI and be named 'vemurafenib'. ``gm`` is taken from module scope.
    """
    pubchem_agent = Agent('X', db_refs={'PUBCHEM': '42611257'})
    chebi_agent = Agent('Y', db_refs={'CHEBI': 'CHEBI:63637'})
    results = gm.map_stmts([Phosphorylation(pubchem_agent, chebi_agent)])
    assert len(results) == 1
    mapped = results[0]
    enz_refs = mapped.enz.db_refs
    sub_refs = mapped.sub.db_refs
    assert enz_refs['PUBCHEM'] == sub_refs['PUBCHEM'], (enz_refs, sub_refs)
    assert enz_refs['CHEBI'] == sub_refs['CHEBI'], (enz_refs, sub_refs)
    assert mapped.enz.name == 'vemurafenib'
    assert mapped.sub.name == 'vemurafenib'
def test_up_id_with_no_hgnc_id():
    """Map an agent to a UP ID for which no HGNC entry is expected.

    The map grounds 'Gag' to UP 'P04585'. The mapped substrate is expected
    to be named 'gag-pol', keep its TEXT and UP entries, and have no HGNC
    entry.
    """
    agent = Agent('Gag', db_refs={'TEXT': 'Gag'})
    statement = Phosphorylation(None, agent)
    mapper = GroundingMapper({'Gag': {'TEXT': 'Gag', 'UP': 'P04585'}})
    results = mapper.map_stmts([statement])
    assert len(results) == 1
    mapped_gag = results[0].sub
    assert mapped_gag.name == 'gag-pol'
    refs = mapped_gag.db_refs
    assert refs['TEXT'] == 'Gag'
    assert refs.get('HGNC') is None
    assert refs['UP'] == 'P04585'
def test_intervention_query_from_stmt():
    """Build intervention queries from an Activation and a Phosphorylation.

    For the Activation, the condition and target entities must be Agents
    named 'EGF' and 'ERK', and the target must have a truthy activity. For
    the Phosphorylation, the target must have truthy mods.
    """
    activation = Activation(Agent('EGF', db_refs={'HGNC': '3229'}),
                            Agent('ERK', db_refs={'FPLX': 'ERK'}))
    act_query = SimpleInterventionProperty.from_stmt(activation)
    condition = act_query.condition_entity
    assert isinstance(condition, Agent)
    assert condition.name == 'EGF'
    assert isinstance(act_query.target_entity, Agent)
    assert act_query.target_entity.name == 'ERK'
    assert act_query.target_entity.activity

    phosphorylation = Phosphorylation(Agent('EGF', db_refs={'HGNC': '3229'}),
                                      Agent('ERK', db_refs={'FPLX': 'ERK'}))
    phos_query = SimpleInterventionProperty.from_stmt(phosphorylation)
    assert phos_query.target_entity.mods
def test_up_id_with_no_gene_name():
    """Map an agent to UP 'A0K5Q6'; expect no HGNC entry and no error.

    The mapped substrate is expected to keep its original name
    'NoGNname' along with its TEXT and UP entries.
    """
    agent = Agent('NoGNname', db_refs={'TEXT': 'NoGN'})
    statement = Phosphorylation(None, agent)
    mapper = GroundingMapper({'NoGN': {'TEXT': 'NoGN', 'UP': 'A0K5Q6'}})
    results = mapper.map_stmts([statement])
    assert len(results) == 1
    mapped_ag = results[0].sub
    assert mapped_ag.name == 'NoGNname'
    refs = mapped_ag.db_refs
    assert refs['TEXT'] == 'NoGN'
    assert refs.get('HGNC') is None
    assert refs['UP'] == 'A0K5Q6'
def test_find_contradicts_refinement():
    """Check contradictions between a family and two specific genes.

    A phosphorylation of RAS (FPLX) is combined with dephosphorylations of
    KRAS and HRAS. Exactly two contradicting pairs are expected, each
    pairing the RAS statement with one of the two dephosphorylations.
    """
    ras = Agent('RAS', db_refs={'FPLX': 'RAS'})
    kras = Agent('KRAS', db_refs={'HGNC': '6407'})
    hras = Agent('HRAS', db_refs={'HGNC': '5173'})
    phos_ras = Phosphorylation(Agent('x'), ras)
    dephos_kras = Dephosphorylation(Agent('x'), kras)
    dephos_hras = Dephosphorylation(Agent('x'), hras)
    preassembler = Preassembler(hierarchies,
                                [phos_ras, dephos_kras, dephos_hras])
    pairs = preassembler.find_contradicts()
    assert len(pairs) == 2

    # Pairs are compared as sets of UUIDs so that the order within a pair
    # does not matter.
    for left, right in pairs:
        expected = ({phos_ras.uuid, dephos_kras.uuid},
                    {phos_ras.uuid, dephos_hras.uuid})
        assert {left.uuid, right.uuid} in expected
def process_phosphorylation_statements(self):
    """Looks for Phosphorylation events in the graph and extracts them into
    INDRA statements.

    In particular, looks for a Positive_regulation event node with a child
    Phosphorylation event node.

    If Positive_regulation has an outgoing Cause edge, that's the subject
    If Phosphorylation has an outgoing Theme edge, that's the object
    If Phosphorylation has an outgoing Site edge, that's the site

    Returns
    -------
    list
        One Phosphorylation or Dephosphorylation statement for every
        matching event pair that has a theme text. Pairs without a theme
        text contribute nothing.

    Raises
    ------
    AssertionError
        If a Phosphorylation event has no related Theme node.
    """
    import logging

    log = logging.getLogger(__name__)
    statements = []
    pwcs = self.find_event_parent_with_event_child('Positive_regulation',
                                                   'Phosphorylation')
    for pos_reg, phos in pwcs:
        cause = self.get_entity_text_for_relation(pos_reg, 'Cause')
        theme = self.get_entity_text_for_relation(phos, 'Theme')
        # Diagnostic output goes through logging rather than stdout.
        log.debug('Cause: %s Theme: %s', cause, theme)

        # If the trigger word is dephosphorylate or similar, then we
        # extract a dephosphorylation statement. A missing trigger word
        # is treated as a plain phosphorylation instead of raising.
        trigger_word = self.get_entity_text_for_relation(
            phos, 'Phosphorylation')
        deph = trigger_word is not None and 'dephos' in trigger_word

        site = self.get_entity_text_for_relation(phos, 'Site')

        theme_node = self.get_related_node(phos, 'Theme')
        assert (theme_node is not None)
        evidence = self.node_to_evidence(theme_node, is_direct=False)

        if theme is not None:
            stmt_class = Dephosphorylation if deph else Phosphorylation
            statements.append(
                stmt_class(s2a(cause), s2a(theme), site,
                           evidence=evidence))
    return statements
def test_grounding_map_gilda_priority():
    """Check that mapping 'FBS' leaves no gilda entry in agent annotations.

    Sets ``gilda_mode`` on the module-level ``gm`` to 'web' and does not
    restore it. The statement's evidence carries a PMID both directly and
    in its text_refs.
    """
    gm.gilda_mode = 'web'
    pmid = '28536624'
    fbs = Agent('FBS', db_refs={'TEXT': 'FBS'})
    evidence = Evidence(pmid=pmid, text_refs={'PMID': pmid})
    statement = Phosphorylation(None, fbs, evidence=[evidence])
    results = gm.map_stmts([statement])
    annotations = results[0].evidence[0].annotations
    # Either there is no 'agents' annotation at all, or it exists but has
    # no 'gilda' key. The second clause guards against future changes that
    # start adding agent annotations during this mapping.
    assert not ('agents' in annotations
                and 'gilda' in annotations['agents'])
def test_path_property_to_json():
    """Round-trip a PathProperty query through its JSON form.

    The query is built with entity and relationship exclusion
    constraints, serialized, checked for its type fields, deserialized
    with ``Query._from_json`` and serialized again; both JSON forms must
    be equal.
    """
    statement = Phosphorylation(
        enz=Agent('EGFR', db_refs={'HGNC': '3236'}),
        sub=Agent('ERK', db_refs={'FPLX': 'ERK'}))
    entity_rules = {'exclude': [Agent('PI3K', db_refs={'FPLX': 'PI3K'})]}
    relationship_rules = {'exclude': ['IncreaseAmount', 'DecreaseAmount']}
    query = PathProperty(statement, entity_rules, relationship_rules)
    assert query

    first_json = query.to_json()
    assert first_json.get('type') == 'path_property'
    assert first_json.get('path').get('type') == 'Phosphorylation'

    restored = Query._from_json(first_json)
    second_json = restored.to_json()
    assert first_json == second_json, {'json': first_json,
                                       'json2': second_json}
def test_map_entry_hgnc_and_up():
    """Ensure an HGNC symbol in the map is replaced by its HGNC ID.

    The map entry for 'NF-kappaB p65' has UP 'Q04206' and the HGNC symbol
    'RELA'; the mapped substrate's db_refs must contain the HGNC ID '9955'
    instead of the symbol.
    """
    rela_text = 'NF-kappaB p65'
    substrate = Agent(rela_text, db_refs={'TEXT': rela_text})
    enzyme = Agent('ERK1', db_refs={'TEXT': 'ERK1'})
    statement = Phosphorylation(enzyme, substrate)
    mapper = GroundingMapper({
        'NF-kappaB p65': {'TEXT': 'NF-kappaB p65',
                          'UP': 'Q04206',
                          'HGNC': 'RELA'},
    })
    results = mapper.map_agents([statement])
    assert len(results) == 1
    expected_refs = {'TEXT': 'NF-kappaB p65',
                     'UP': 'Q04206',
                     'HGNC': '9955'}
    assert results[0].sub.db_refs == expected_refs