def __init__(self, a1_text, a2_text, sentence_segmentations, pmid):
    """Parse TEES a1/a2 output into a graph and extract grounded Statements."""
    self.pmid = pmid

    # Locate and load the grounding map shipped in the resources folder
    here = os.path.dirname(os.path.abspath(__file__))
    gm_fname = os.path.join(here, '../../resources/',
                            'extracted_reach_grounding_map.csv')
    try:
        gm = load_grounding_map(gm_fname)
    except BaseException:
        raise Exception('Could not load the grounding map from ' + gm_fname)
    mapper = GroundingMapper(gm)

    # Parse the TEES output into a networkx graph
    self.G = parse_output(a1_text, a2_text, sentence_segmentations)

    # Extract each supported statement type from the TEES graph
    self.statements = []
    extractors = (self.process_phosphorylation_statements,
                  self.process_binding_statements,
                  self.process_increase_expression_amount,
                  self.process_decrease_expression_amount)
    for extract in extractors:
        self.statements.extend(extract())

    # Ground the extracted statements
    self.statements = mapper.map_agents(self.statements)
def map_grounding(stmts_in, **kwargs):
    """Map grounding using the GroundingMapper.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to map.
    do_rename : Optional[bool]
        If True, Agents are renamed based on their mapped grounding.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of mapped statements.
    """
    logger.info('Mapping grounding on %d statements...' % len(stmts_in))
    # An absent or explicit-None do_rename both default to True
    do_rename = kwargs.get('do_rename')
    do_rename = True if do_rename is None else do_rename
    mapper = GroundingMapper(grounding_map)
    stmts_out = mapper.map_agents(stmts_in, do_rename=do_rename)
    # Optionally pickle the mapped statements
    pkl_name = kwargs.get('save')
    if pkl_name:
        dump_statements(stmts_out, pkl_name)
    return stmts_out
def __init__(self, text, pmid, tees_path, python2_path):
    """Run TEES on raw text, parse it into a graph and extract Statements."""
    self.pmid = pmid

    # Locate and load the grounding map shipped in the resources folder
    here = os.path.dirname(os.path.abspath(__file__))
    gm_fname = os.path.join(here, '../../resources/',
                            'extracted_reach_grounding_map.csv')
    try:
        gm = load_grounding_map(gm_fname)
    except BaseException:
        raise Exception('Could not load the grounding map from ' + gm_fname)
    mapper = GroundingMapper(gm)

    # Run TEES on the text and parse its output into a networkx graph
    self.G = run_and_parse_tees(text, tees_path, python2_path)

    # Extract each supported statement type from the TEES graph
    self.statements = []
    extractors = (self.process_phosphorylation_statements,
                  self.process_binding_statements,
                  self.process_increase_expression_amount,
                  self.process_decrease_expression_amount)
    for extract in extractors:
        self.statements.extend(extract())

    # Ground the extracted statements
    self.statements = mapper.map_agents(self.statements)
def _agent_from_ns_id(ag_ns, ag_id):
    """Return an Agent grounded to the given namespace/ID pair."""
    # The ID doubles as a placeholder name; standardization replaces it below
    ag = Agent(ag_id)
    if ag_id is not None:
        # Record the grounding only when an actual ID is available
        ag.db_refs[ag_ns] = ag_id
    # Standardize db_refs and derive the canonical agent name
    GroundingMapper.standardize_agent_name(ag, standardize_refs=True)
    # The (standardized) name is stored as the TEXT reference
    ag.db_refs['TEXT'] = ag.name
    return ag
def add_grounding(self):
    """Ground self.statements using the extracted REACH grounding map."""
    here = os.path.dirname(os.path.abspath(__file__))
    gm_fname = os.path.join(here, '../../resources/',
                            'extracted_reach_grounding_map.csv')
    try:
        grounding_map = load_grounding_map(gm_fname)
    except BaseException:
        raise Exception('Could not load the grounding map from ' + gm_fname)
    mapper = GroundingMapper(grounding_map)
    self.statements = mapper.map_agents(self.statements)
def filter_grounded(stmts):
    """Map groundings, then keep only statements whose agents all have some
    grounding beyond a bare TEXT reference."""
    mapper = GroundingMapper(grounding_map)
    stmts_mapped = mapper.map_agents(stmts, do_rename=True)
    stmts_grounded = []
    for stmt in stmts_mapped:
        # None agents (e.g. missing enzymes) do not count against grounding
        agents = [a for a in stmt.agent_list() if a is not None]
        if all(set(a.db_refs.keys()) != set(['TEXT']) for a in agents):
            stmts_grounded.append(stmt)
    return stmts_grounded
def get_agent(name):
    """Ground a name via the INDRA grounding service and return an Agent.

    Falls back to a TEXT-only Agent when the service request fails or
    returns no grounding results.
    """
    opts = {'text': name}
    indra_url = read_from_config('INDRA_GROUND_URL')
    res = requests.post(indra_url, json=opts)
    # Fall back if the request failed OR the result list is empty. The
    # original used `and`, which let non-200 responses through to js[0]
    # (garbage/crash) and let an empty 200 result raise IndexError. The
    # short-circuit also avoids calling .json() on failed responses whose
    # body may not be valid JSON.
    if res.status_code != 200 or not res.json():
        return Agent(name, db_refs={'TEXT': name})
    js = res.json()
    # Use the top-scoring grounding term returned by the service
    top_term = js[0]['term']
    agent = Agent(name, db_refs={'TEXT': name, top_term['db']: top_term['id']})
    GroundingMapper.standardize_agent_name(agent, standardize_refs=True)
    return agent
def test_up_with_no_gene_name_with_hgnc_sym():
    agent = Agent('ERK1', db_refs={'TEXT': 'ERK1'})
    stmt = Phosphorylation(None, agent)
    gmap = {'ERK1': {'TEXT': 'ERK1', 'UP': 'A0K5Q6', 'HGNC': '6871'}}
    mapped = GroundingMapper(gmap).map_stmts([stmt])
    refs = mapped[0].sub.db_refs
    # The HGNC grounding is kept and the UP ID corrected to match it
    assert refs['HGNC'] == '6871', refs
    assert refs['UP'] == 'P28482', refs
def test_in_place_overwrite_of_gm():
    """Make sure HGNC lookups don't modify the original grounding map by
    adding keys."""
    agent = Agent('ERK1', db_refs={'TEXT': 'ERK1'})
    stmt = Phosphorylation(None, agent)
    gmap = {'ERK1': {'TEXT': 'ERK1', 'UP': 'P28482'}}
    gm = GroundingMapper(gmap)
    gm.map_stmts([stmt])
    # The mapper's grounding map must still contain exactly the original keys
    assert set(gm.grounding_map['ERK1'].keys()) == {'TEXT', 'UP'}
def test_up_and_mismatched_hgnc():
    agent = Agent('ERK1', db_refs={'TEXT': 'ERK1'})
    stmt = Phosphorylation(None, agent)
    gmap = {'ERK1': {'TEXT': 'ERK1', 'UP': 'P28482', 'HGNC': '6877'}}
    mapped = GroundingMapper(gmap).map_stmts([stmt])
    refs = mapped[0].sub.db_refs
    # When UP and HGNC disagree, HGNC wins and UP is updated to match
    assert refs['HGNC'] == '6877', refs
    assert refs['UP'] == 'P27361', refs
def test_hgnc_but_not_up():
    agent = Agent('ERK1', db_refs={'TEXT': 'ERK1'})
    stmt = Phosphorylation(None, agent)
    gmap = {'ERK1': {'TEXT': 'ERK1', 'HGNC': '6871'}}
    mapped = GroundingMapper(gmap).map_stmts([stmt])
    assert len(mapped) == 1
    erk = mapped[0].sub
    # The UP ID should get filled in from the HGNC grounding
    assert erk.name == 'MAPK1'
    assert erk.db_refs['TEXT'] == 'ERK1'
    assert erk.db_refs['HGNC'] == '6871'
    assert erk.db_refs['UP'] == 'P28482'
def test_up_id_with_no_gene_name():
    """Expect no HGNC entry; no error raised."""
    agent = Agent('NoGNname', db_refs={'TEXT': 'NoGN'})
    stmt = Phosphorylation(None, agent)
    gmap = {'NoGN': {'TEXT': 'NoGN', 'UP': 'A0K5Q6'}}
    mapped = GroundingMapper(gmap).map_stmts([stmt])
    assert len(mapped) == 1
    sub = mapped[0].sub
    assert sub.name == 'NoGNname'
    assert sub.db_refs['TEXT'] == 'NoGN'
    # No HGNC mapping should appear for this UP entry
    assert sub.db_refs.get('HGNC') is None
    assert sub.db_refs['UP'] == 'A0K5Q6'
def test_up_id_with_no_hgnc_id():
    """Non human protein"""
    agent = Agent('Gag', db_refs={'TEXT': 'Gag'})
    stmt = Phosphorylation(None, agent)
    gmap = {'Gag': {'TEXT': 'Gag', 'UP': 'P04585'}}
    mapped = GroundingMapper(gmap).map_stmts([stmt])
    assert len(mapped) == 1
    gag = mapped[0].sub
    # A non-human UP entry gets no HGNC ref; the name comes from UP
    assert gag.name == 'gag-pol'
    assert gag.db_refs['TEXT'] == 'Gag'
    assert gag.db_refs.get('HGNC') is None
    assert gag.db_refs['UP'] == 'P04585'
def test_map_entry_hgnc_and_up():
    """Make sure that HGNC symbol is replaced with HGNC ID when grounding map
    includes both UP ID and HGNC symbol."""
    rela = Agent('NF-kappaB p65', db_refs={'TEXT': 'NF-kappaB p65'})
    erk = Agent('ERK1', db_refs={'TEXT': 'ERK1'})
    stmt = Phosphorylation(erk, rela)
    gmap = {'NF-kappaB p65': {'TEXT': 'NF-kappaB p65', 'UP': 'Q04206',
                              'HGNC': '9955'}}
    mapped = GroundingMapper(gmap).map_stmts([stmt])
    assert len(mapped) == 1
    assert mapped[0].sub.db_refs == {'TEXT': 'NF-kappaB p65',
                                     'UP': 'Q04206', 'HGNC': '9955'}
def test_name_standardize_hgnc_up():
    # An HGNC grounding standardizes to the HGNC gene symbol
    ag = Agent('x', db_refs={'HGNC': '9387'})
    GroundingMapper.standardize_agent_name(ag, True)
    assert ag.name == 'PRKAG3'
    # A human UP ID resolves to the same gene symbol
    ag = Agent('x', db_refs={'UP': 'Q9UGI9'})
    GroundingMapper.standardize_agent_name(ag, True)
    assert ag.name == 'PRKAG3'
    # A non-human UP ID yields the species-specific name (note the casing)
    ag = Agent('x', db_refs={'UP': 'Q8BGM7'})
    GroundingMapper.standardize_agent_name(ag, True)
    assert ag.name == 'Prkag3'
def run_assembly(stmts, folder, pmcid, background_assertions=None):
    '''Run assembly on a list of statements, for a given PMCID.

    Grounds and preassembles the statements, runs belief calculation and
    mechanism linking, pickles the top-level statements and writes index
    cards, an English model printout, a statement graph PDF and TSV
    diagnostics under the given folder.

    Parameters
    ----------
    stmts : list
        Statements to assemble.
    folder : str
        Output folder containing index_cards/ and other_outputs/.
    pmcid : str
        PMCID used as the output file prefix.
    background_assertions : Optional[list]
        Statements that came only from the prior; excluded from outputs.
    '''
    # Folder for index card output (scored submission)
    indexcard_prefix = folder + '/index_cards/' + pmcid
    # Folder for other outputs (for analysis, debugging)
    otherout_prefix = folder + '/other_outputs/' + pmcid

    # Do grounding mapping here
    # Load the TRIPS-specific grounding map and add to the default
    # (REACH-oriented) grounding map:
    trips_gm = load_grounding_map('trips_grounding_map.csv')
    default_grounding_map.update(trips_gm)
    gm = GroundingMapper(default_grounding_map)
    mapped_agent_stmts = gm.map_agents(stmts)
    renamed_agent_stmts = gm.rename_agents(mapped_agent_stmts)

    # Filter for grounding: keep only statements whose agents are all
    # proteins or chemicals
    grounded_stmts = []
    for st in renamed_agent_stmts:
        if all([is_protein_or_chemical(a) for a in st.agent_list()]):
            grounded_stmts.append(st)

    # Instantiate the Preassembler
    pa = Preassembler(hierarchies)
    pa.add_statements(grounded_stmts)
    print('== %s ====================' % pmcid)
    print('%d statements collected in total.' % len(pa.stmts))

    # Combine duplicates
    unique_stmts = pa.combine_duplicates()
    print('%d statements after combining duplicates.' % len(unique_stmts))

    # Run BeliefEngine on unique statements
    epe = BeliefEngine()
    epe.set_prior_probs(pa.unique_stmts)

    # Build statement hierarchy
    related_stmts = pa.combine_related()
    # Run BeliefEngine on hierarchy
    epe.set_hierarchy_probs(related_stmts)
    print('%d statements after combining related.' % len(related_stmts))

    # Instantiate the mechanism linker
    ml = MechLinker(related_stmts)
    # Link statements
    linked_stmts = ml.link_statements()
    # Run BeliefEngine on linked statements
    epe.set_linked_probs(linked_stmts)
    # Print linked statements for debugging purposes
    print('Linked\n=====')
    for ls in linked_stmts:
        print(ls.inferred_stmt.belief, ls.inferred_stmt)
    print('=============')

    # Combine all statements including linked ones
    all_statements = ml.statements + [ls.inferred_stmt for ls in linked_stmts]
    # Instantiate a new preassembler
    pa = Preassembler(hierarchies, all_statements)
    # Build hierarchy again
    pa.combine_duplicates()
    # Choose the top-level statements
    related_stmts = pa.combine_related()

    # Remove top-level statements that came only from the prior
    if background_assertions is not None:
        nonbg_stmts = [
            stmt for stmt in related_stmts
            if stmt not in background_assertions
        ]
    else:
        nonbg_stmts = related_stmts

    # Dump top-level statements in a pickle
    with open(otherout_prefix + '.pkl', 'wb') as fh:
        pickle.dump(nonbg_stmts, fh, protocol=2)

    # Flatten evidence for statements
    flattened_evidence_stmts = flatten_evidence(nonbg_stmts)

    # Start a card counter
    card_counter = 1
    # We don't limit the number of cards reported in this round
    card_lim = float('inf')
    top_stmts = []
    ###############################################
    # The belief cutoff for statements
    belief_cutoff = 0.3
    ###############################################
    # Sort by amount of evidence
    for st in sorted(flattened_evidence_stmts,
                     key=lambda x: x.belief, reverse=True):
        if st.belief >= belief_cutoff:
            print(st.belief, st)
        if st.belief < belief_cutoff:
            # NOTE(review): sub-cutoff statements are printed as SKIP but
            # NOT actually skipped (no continue) — they still get index
            # cards below. Confirm whether that is intended.
            print('SKIP', st.belief, st)
        # If it's background knowledge, we skip the statement
        if is_background_knowledge(st):
            print('This statement is background knowledge - skipping.')
            continue
        # Assemble IndexCards
        ia = IndexCardAssembler([st], pmc_override=pmcid)
        ia.make_model()
        # If the index card was actually made
        # (not all statements can be assembled into index cards, so
        # this is often not the case)
        if ia.cards:
            # Save the index card json
            ia.save_model(indexcard_prefix + '-%d.json' % card_counter)
            card_counter += 1
            top_stmts.append(st)
        if card_counter > card_lim:
            break

    # Print the English-assembled model for debugging purposes
    ea = EnglishAssembler(top_stmts)
    print('=======================')
    print(ea.make_model())
    print('=======================')
    # Print the statement graph
    graph = render_stmt_graph(nonbg_stmts)
    graph.draw(otherout_prefix + '_graph.pdf', prog='dot')
    # Print statement diagnostics
    print_stmts(pa.stmts, otherout_prefix + '_statements.tsv')
    print_stmts(related_stmts, otherout_prefix + '_related_statements.tsv')
def get_channel_agent(channel):
    """Return a standardized Agent for the given channel gene symbol."""
    hgnc_id = hgnc_client.get_hgnc_id(channel)
    agent = Agent(channel, db_refs={'HGNC': hgnc_id})
    GroundingMapper.standardize_agent_name(agent, standardize_refs=True)
    return agent
def test_name_standardize_chebi():
    # A CHEBI grounding standardizes to the chemical's name
    ag = Agent('x', db_refs={'CHEBI': '15996'})
    GroundingMapper.standardize_agent_name(ag, False)
    assert ag.name == 'GTP'
def test_name_standardize_mesh():
    # A MESH grounding standardizes to the MeSH heading
    ag = Agent('x', db_refs={'MESH': 'D008545'})
    GroundingMapper.standardize_agent_name(ag, False)
    assert ag.name == 'Melanoma', ag.name
def test_up_and_invalid_hgnc_sym():
    """Mapping with an invalid HGNC symbol should not raise an error."""
    erk = Agent('ERK1', db_refs={'TEXT': 'ERK1'})
    stmt = Phosphorylation(None, erk)
    g_map = {'ERK1': {'TEXT': 'ERK1', 'UP': 'P28482', 'HGNC': 'foobar'}}
    gm = GroundingMapper(g_map)
    # Actually run the mapping: the original built the mapper but never
    # called it, so the test exercised nothing (cf. test_hgnc_sym_with_no_id)
    mapped_stmts = gm.map_stmts([stmt])
def get_statements():
    """Build the hand-asserted EGF->ERK pathway prior Statements.

    Each statement gets belief 1 and an 'assertion' Evidence, then all
    agents are grounded via the default grounding map.

    Returns
    -------
    list
        The grounded, renamed statements.
    """
    statements = []

    # EGF binds EGFR
    egf = Agent('EGF')
    egfr = Agent('EGFR')
    st = Complex([egf, egfr])
    statements.append(st)

    # EGF-bound EGFR dimerizes (removed an accidental duplicate assignment
    # of egfre that was present in the original)
    egfre = Agent('EGFR', bound_conditions=[BoundCondition(egf, True)])
    st = Complex([egfre, egfre])
    statements.append(st)

    # Dimerized EGFR transphosphorylates itself on Y
    egfrdimer = Agent('EGFR', bound_conditions=[BoundCondition(egfr, True)])
    st = Transphosphorylation(egfrdimer, 'Y')
    statements.append(st)

    # Phospho-EGFR binds GRB2
    egfrpY = Agent('EGFR', mods=[ModCondition('phosphorylation', 'Y')])
    grb2 = Agent('GRB2')
    st = Complex([egfrpY, grb2])
    statements.append(st)

    # EGFR-bound GRB2 binds SOS1
    grb2bound = Agent('GRB2', bound_conditions=[BoundCondition(egfr, True)])
    sos1 = Agent('SOS1')
    st = Complex([grb2bound, sos1])
    statements.append(st)

    # Each RAS binds GDP
    hras = Agent('HRAS')
    kras = Agent('KRAS')
    nras = Agent('NRAS')
    gdp = Agent('GDP')
    for ras in [hras, kras, nras]:
        st = Complex([ras, gdp])
        statements.append(st)

    # GRB2-bound SOS1 binds GDP-bound RAS; GDP-bound RAS is inactive
    sos1bound = Agent('SOS1', bound_conditions=[BoundCondition(grb2, True)])
    hras_gdp = Agent('HRAS', bound_conditions=[BoundCondition(gdp, True)])
    kras_gdp = Agent('KRAS', bound_conditions=[BoundCondition(gdp, True)])
    nras_gdp = Agent('NRAS', bound_conditions=[BoundCondition(gdp, True)])
    for ras_gdp in [hras_gdp, kras_gdp, nras_gdp]:
        st = Complex([sos1bound, ras_gdp])
        statements.append(st)
        st = ActiveForm(ras_gdp, 'activity', False)
        statements.append(st)

    # GRB2-bound SOS1 binds SOS1-bound RAS
    hras_bound = Agent('HRAS', bound_conditions=[BoundCondition(sos1, True)])
    kras_bound = Agent('KRAS', bound_conditions=[BoundCondition(sos1, True)])
    nras_bound = Agent('NRAS', bound_conditions=[BoundCondition(sos1, True)])
    sos1bound = Agent('SOS1', bound_conditions=[BoundCondition(grb2, True)])
    for ras_bound in [hras_bound, kras_bound, nras_bound]:
        st = Complex([sos1bound, ras_bound])
        statements.append(st)

    # GTP-bound RAS binds BRAF and is active
    gtp = Agent('GTP')
    hras_gtp = Agent('HRAS', bound_conditions=[BoundCondition(gtp, True)])
    kras_gtp = Agent('KRAS', bound_conditions=[BoundCondition(gtp, True)])
    nras_gtp = Agent('NRAS', bound_conditions=[BoundCondition(gtp, True)])
    braf = Agent('BRAF')
    for ras_gtp in [hras_gtp, kras_gtp, nras_gtp]:
        st = Complex([ras_gtp, braf])
        statements.append(st)
        st = ActiveForm(ras_gtp, 'activity', True)
        statements.append(st)

    # RAS-bound BRAF dimerizes in all combinations
    hras_braf = Agent('BRAF', bound_conditions=[BoundCondition(hras, True)])
    kras_braf = Agent('BRAF', bound_conditions=[BoundCondition(kras, True)])
    nras_braf = Agent('BRAF', bound_conditions=[BoundCondition(nras, True)])
    for braf1 in [hras_braf, kras_braf, nras_braf]:
        for braf2 in [hras_braf, kras_braf, nras_braf]:
            st = Complex([braf1, braf2])
            statements.append(st)

    # BRAF-bound BRAF transphosphorylates
    braf_bound = Agent('BRAF', bound_conditions=[BoundCondition(braf, True)])
    st = Transphosphorylation(braf_bound)
    statements.append(st)

    # Phospho-BRAF is an active kinase and phosphorylates MEK1/2
    braf_phos = Agent('BRAF', mods=[ModCondition('phosphorylation')])
    mek1 = Agent('MAP2K1')
    mek2 = Agent('MAP2K2')
    st = ActiveForm(braf_phos, 'kinase', True)
    statements.append(st)
    st = Phosphorylation(braf_phos, mek1)
    statements.append(st)
    st = Phosphorylation(braf_phos, mek2)
    statements.append(st)

    # Phospho-MEK1/2 are active kinases
    mek1_phos = Agent('MAP2K1', mods=[ModCondition('phosphorylation')])
    mek2_phos = Agent('MAP2K2', mods=[ModCondition('phosphorylation')])
    mapk1 = Agent('MAPK1')
    mapk3 = Agent('MAPK3')
    st = ActiveForm(mek1_phos, 'kinase', True)
    statements.append(st)
    st = ActiveForm(mek2_phos, 'kinase', True)
    statements.append(st)
    # NOTE(review): these two BRAF->MEK phosphorylations repeat the ones
    # added above; kept as-is pending confirmation that the duplication
    # is intentional
    st = Phosphorylation(braf_phos, mek1)
    statements.append(st)
    st = Phosphorylation(braf_phos, mek2)
    statements.append(st)

    # Phospho-MEK1/2 phosphorylate ERK1/2
    for mek in [mek1_phos, mek2_phos]:
        for erk in [mapk1, mapk3]:
            st = Phosphorylation(mek, erk)
            # FIX: the original never appended these statements, so the
            # MEK->ERK phosphorylations were silently dropped
            statements.append(st)

    # Mark every statement as asserted with full belief
    for st in statements:
        st.belief = 1
        st.evidence.append(Evidence(source_api='assertion'))

    # Update the statements with grounding info. To do this, we set the "text"
    # field of the db_refs to copy from the agent name, then run the grounding
    # mapper
    for st in statements:
        for ag in st.agent_list():
            if ag is None:
                continue
            else:
                ag.db_refs = {'TEXT': ag.name}
    # Now load the grounding map and run
    gm = GroundingMapper(default_grounding_map)
    mapped_stmts = gm.map_agents(statements)
    # This shouldn't change anything, but just in case...
    renamed_stmts = gm.rename_agents(mapped_stmts)
    return renamed_stmts
def run_assembly(stmts, folder, pmcid, background_assertions=None):
    '''Run assembly on a list of statements, for a given PMCID.

    Grounds and preassembles the statements, runs belief calculation and
    static mechanism-linking inference, pickles the top-level statements and
    writes index cards, an English model printout, a statement graph PDF and
    TSV diagnostics under the given folder.

    Parameters
    ----------
    stmts : list
        Statements to assemble.
    folder : str
        Output folder containing index_cards/ and other_outputs/.
    pmcid : str
        PMCID used as the output file prefix.
    background_assertions : Optional[list]
        Statements that came only from the prior; excluded from outputs.
    '''
    # Folder for index card output (scored submission)
    indexcard_prefix = folder + '/index_cards/' + pmcid
    # Folder for other outputs (for analysis, debugging)
    otherout_prefix = folder + '/other_outputs/' + pmcid

    # Do grounding mapping here
    # Load the TRIPS-specific grounding map and add to the default
    # (REACH-oriented) grounding map:
    trips_gm = load_grounding_map('trips_grounding_map.csv')
    default_grounding_map.update(trips_gm)
    gm = GroundingMapper(default_grounding_map)
    mapped_agent_stmts = gm.map_agents(stmts)
    renamed_agent_stmts = gm.rename_agents(mapped_agent_stmts)

    # Filter for grounding: keep only statements whose agents are all
    # proteins or chemicals
    grounded_stmts = []
    for st in renamed_agent_stmts:
        if all([is_protein_or_chemical(a) for a in st.agent_list()]):
            grounded_stmts.append(st)

    # Instantiate the Preassembler
    pa = Preassembler(hierarchies)
    pa.add_statements(grounded_stmts)
    print('== %s ====================' % pmcid)
    print('%d statements collected in total.' % len(pa.stmts))

    # Combine duplicates
    unique_stmts = pa.combine_duplicates()
    print('%d statements after combining duplicates.' % len(unique_stmts))

    # Run BeliefEngine on unique statements
    epe = BeliefEngine()
    epe.set_prior_probs(pa.unique_stmts)

    # Build statement hierarchy
    related_stmts = pa.combine_related()
    # Run BeliefEngine on hierarchy
    epe.set_hierarchy_probs(related_stmts)
    print('%d statements after combining related.' % len(related_stmts))

    # Instantiate the mechanism linker
    # Link statements via the static inference methods
    linked_stmts = MechLinker.infer_active_forms(related_stmts)
    linked_stmts += MechLinker.infer_modifications(related_stmts)
    linked_stmts += MechLinker.infer_activations(related_stmts)
    # Run BeliefEngine on linked statements
    epe.set_linked_probs(linked_stmts)
    # Print linked statements for debugging purposes
    print('Linked\n=====')
    for ls in linked_stmts:
        print(ls.inferred_stmt.belief, ls.inferred_stmt)
    print('=============')

    # Combine all statements including linked ones
    all_statements = related_stmts + [ls.inferred_stmt for ls in linked_stmts]
    # Instantiate a new preassembler
    pa = Preassembler(hierarchies, all_statements)
    # Build hierarchy again
    pa.combine_duplicates()
    # Choose the top-level statements
    related_stmts = pa.combine_related()

    # Remove top-level statements that came only from the prior
    if background_assertions is not None:
        nonbg_stmts = [stmt for stmt in related_stmts
                       if stmt not in background_assertions]
    else:
        nonbg_stmts = related_stmts

    # Dump top-level statements in a pickle
    with open(otherout_prefix + '.pkl', 'wb') as fh:
        pickle.dump(nonbg_stmts, fh)

    # Flatten evidence for statements
    flattened_evidence_stmts = flatten_evidence(nonbg_stmts)

    # Start a card counter
    card_counter = 1
    # We don't limit the number of cards reported in this round
    card_lim = float('inf')
    top_stmts = []
    ###############################################
    # The belief cutoff for statements
    belief_cutoff = 0.3
    ###############################################
    # Sort by amount of evidence
    for st in sorted(flattened_evidence_stmts,
                     key=lambda x: x.belief, reverse=True):
        if st.belief >= belief_cutoff:
            print(st.belief, st)
        if st.belief < belief_cutoff:
            # NOTE(review): sub-cutoff statements are printed as SKIP but
            # NOT actually skipped (no continue) — they still get index
            # cards below. Confirm whether that is intended.
            print('SKIP', st.belief, st)
        # If it's background knowledge, we skip the statement
        if is_background_knowledge(st):
            print('This statement is background knowledge - skipping.')
            continue
        # Assemble IndexCards
        ia = IndexCardAssembler([st], pmc_override=pmcid)
        ia.make_model()
        # If the index card was actually made
        # (not all statements can be assembled into index cards, so
        # this is often not the case)
        if ia.cards:
            # Save the index card json
            ia.save_model(indexcard_prefix + '-%d.json' % card_counter)
            card_counter += 1
            top_stmts.append(st)
        if card_counter > card_lim:
            break

    # Print the English-assembled model for debugging purposes
    ea = EnglishAssembler(top_stmts)
    print('=======================')
    print(ea.make_model().encode('utf-8'))
    print('=======================')
    # Print the statement graph
    graph = render_stmt_graph(nonbg_stmts)
    graph.draw(otherout_prefix + '_graph.pdf', prog='dot')
    # Print statement diagnostics
    print_stmts(pa.stmts, otherout_prefix + '_statements.tsv')
    print_stmts(related_stmts, otherout_prefix + '_related_statements.tsv')
def test_hgnc_sym_with_no_id():
    # Mapping with an unresolvable HGNC symbol should not raise
    agent = Agent('ERK1', db_refs={'TEXT': 'ERK1'})
    stmt = Phosphorylation(None, agent)
    gmap = {'ERK1': {'TEXT': 'ERK1', 'HGNC': 'foobar'}}
    mapped_stmts = GroundingMapper(gmap).map_stmts([stmt])
def test_name_standardize_go():
    # A GO grounding standardizes to the GO term label
    ag = Agent('x', db_refs={'GO': 'GO:0006915'})
    GroundingMapper.standardize_agent_name(ag, False)
    assert ag.name == 'apoptotic process'