def run_assembly(stmts, folder, pmcid, background_assertions=None):
    '''Run assembly on a list of statements, for a given PMCID.'''
    # Folder for index card output (scored submission)
    indexcard_prefix = folder + '/index_cards/' + pmcid
    # Folder for other outputs (for analysis, debugging)
    otherout_prefix = folder + '/other_outputs/' + pmcid

    # Do grounding mapping here
    # Load the TRIPS-specific grounding map and add it to the default
    # (REACH-oriented) grounding map
    trips_gm = load_grounding_map('trips_grounding_map.csv')
    default_grounding_map.update(trips_gm)
    gm = GroundingMapper(default_grounding_map)
    mapped_agent_stmts = gm.map_agents(stmts)
    renamed_agent_stmts = gm.rename_agents(mapped_agent_stmts)

    # Filter for grounding
    grounded_stmts = []
    for st in renamed_agent_stmts:
        if all([is_protein_or_chemical(a) for a in st.agent_list()]):
            grounded_stmts.append(st)

    # Instantiate the Preassembler
    pa = Preassembler(hierarchies)
    pa.add_statements(grounded_stmts)
    print('== %s ====================' % pmcid)
    print('%d statements collected in total.' % len(pa.stmts))

    # Combine duplicates
    unique_stmts = pa.combine_duplicates()
    print('%d statements after combining duplicates.' % len(unique_stmts))

    # Run BeliefEngine on unique statements
    epe = BeliefEngine()
    epe.set_prior_probs(pa.unique_stmts)

    # Build statement hierarchy
    related_stmts = pa.combine_related()
    # Run BeliefEngine on hierarchy
    epe.set_hierarchy_probs(related_stmts)
    print('%d statements after combining related.' % len(related_stmts))

    # Instantiate the mechanism linker
    ml = MechLinker(related_stmts)
    # Link statements
    linked_stmts = ml.link_statements()
    # Run BeliefEngine on linked statements
    epe.set_linked_probs(linked_stmts)
    # Print linked statements for debugging purposes
    print('Linked\n=====')
    for ls in linked_stmts:
        print(ls.inferred_stmt.belief, ls.inferred_stmt)
    print('=============')

    # Combine all statements including linked ones
    all_statements = ml.statements + [ls.inferred_stmt for ls in linked_stmts]

    # Instantiate a new preassembler
    pa = Preassembler(hierarchies, all_statements)
    # Build hierarchy again
    pa.combine_duplicates()
    # Choose the top-level statements
    related_stmts = pa.combine_related()

    # Remove top-level statements that came only from the prior
    if background_assertions is not None:
        nonbg_stmts = [stmt for stmt in related_stmts
                       if stmt not in background_assertions]
    else:
        nonbg_stmts = related_stmts

    # Dump top-level statements in a pickle
    with open(otherout_prefix + '.pkl', 'wb') as fh:
        pickle.dump(nonbg_stmts, fh, protocol=2)

    # Flatten evidence for statements
    flattened_evidence_stmts = flatten_evidence(nonbg_stmts)

    # Start a card counter
    card_counter = 1
    # We don't limit the number of cards reported in this round
    card_lim = float('inf')
    top_stmts = []
    ###############################################
    # The belief cutoff for statements
    belief_cutoff = 0.3
    ###############################################
    # Sort by belief (i.e. by amount of evidence)
    for st in sorted(flattened_evidence_stmts,
                     key=lambda x: x.belief, reverse=True):
        if st.belief >= belief_cutoff:
            print(st.belief, st)
        if st.belief < belief_cutoff:
            print('SKIP', st.belief, st)

        # If it's background knowledge, we skip the statement
        if is_background_knowledge(st):
            print('This statement is background knowledge - skipping.')
            continue

        # Assemble IndexCards
        ia = IndexCardAssembler([st], pmc_override=pmcid)
        ia.make_model()
        # If the index card was actually made
        # (not all statements can be assembled into index cards, so
        # this is often not the case)
        if ia.cards:
            # Save the index card json
            ia.save_model(indexcard_prefix + '-%d.json' % card_counter)
            card_counter += 1
            top_stmts.append(st)
            if card_counter > card_lim:
                break

    # Print the English-assembled model for debugging purposes
    ea = EnglishAssembler(top_stmts)
    print('=======================')
    print(ea.make_model())
    print('=======================')

    # Print the statement graph
    graph = render_stmt_graph(nonbg_stmts)
    graph.draw(otherout_prefix + '_graph.pdf', prog='dot')
    # Print statement diagnostics
    print_stmts(pa.stmts, otherout_prefix + '_statements.tsv')
    print_stmts(related_stmts, otherout_prefix + '_related_statements.tsv')
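
# A minimal illustrative sketch (an assumption, not the original helper) of the
# is_protein_or_chemical() filter used in run_assembly() above. It assumes an
# agent counts as grounded if its db_refs contain a protein/protein-family or
# chemical namespace; the exact namespace list below is hypothetical.
def is_protein_or_chemical(agent):
    '''Return True if the agent is grounded as a protein/family or chemical.'''
    # A None agent (e.g. a missing enzyme in a modification) should not cause
    # the whole statement to be filtered out
    if agent is None:
        return True
    # Hypothetical set of acceptable grounding namespaces
    protein_or_chemical_ns = {'UP', 'HGNC', 'PFAM-DEF', 'CHEBI', 'PUBCHEM'}
    return bool(protein_or_chemical_ns.intersection(agent.db_refs.keys()))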
def get_statements():
    statements = []

    # EGF binds EGFR
    egf = Agent('EGF')
    egfr = Agent('EGFR')
    st = Complex([egf, egfr])
    statements.append(st)

    # EGF-bound EGFR dimerizes
    egfre = Agent('EGFR', bound_conditions=[BoundCondition(egf, True)])
    st = Complex([egfre, egfre])
    statements.append(st)

    # EGFR dimers transphosphorylate on tyrosine
    egfrdimer = Agent('EGFR', bound_conditions=[BoundCondition(egfr, True)])
    st = Transphosphorylation(egfrdimer, 'Y')
    statements.append(st)

    # Tyrosine-phosphorylated EGFR binds GRB2
    egfrpY = Agent('EGFR', mods=[ModCondition('phosphorylation', 'Y')])
    grb2 = Agent('GRB2')
    st = Complex([egfrpY, grb2])
    statements.append(st)

    # EGFR-bound GRB2 binds SOS1
    grb2bound = Agent('GRB2', bound_conditions=[BoundCondition(egfr, True)])
    sos1 = Agent('SOS1')
    st = Complex([grb2bound, sos1])
    statements.append(st)

    # RAS proteins bind GDP
    hras = Agent('HRAS')
    kras = Agent('KRAS')
    nras = Agent('NRAS')
    gdp = Agent('GDP')
    for ras in [hras, kras, nras]:
        st = Complex([ras, gdp])
        statements.append(st)

    # GRB2-bound SOS1 binds GDP-bound RAS; GDP-bound RAS is inactive
    sos1bound = Agent('SOS1', bound_conditions=[BoundCondition(grb2, True)])
    hras_gdp = Agent('HRAS', bound_conditions=[BoundCondition(gdp, True)])
    kras_gdp = Agent('KRAS', bound_conditions=[BoundCondition(gdp, True)])
    nras_gdp = Agent('NRAS', bound_conditions=[BoundCondition(gdp, True)])
    for ras_gdp in [hras_gdp, kras_gdp, nras_gdp]:
        st = Complex([sos1bound, ras_gdp])
        statements.append(st)
        st = ActiveForm(ras_gdp, 'activity', False)
        statements.append(st)

    # GRB2-bound SOS1 binds SOS1-bound RAS
    hras_bound = Agent('HRAS', bound_conditions=[BoundCondition(sos1, True)])
    kras_bound = Agent('KRAS', bound_conditions=[BoundCondition(sos1, True)])
    nras_bound = Agent('NRAS', bound_conditions=[BoundCondition(sos1, True)])
    sos1bound = Agent('SOS1', bound_conditions=[BoundCondition(grb2, True)])
    for ras_bound in [hras_bound, kras_bound, nras_bound]:
        st = Complex([sos1bound, ras_bound])
        statements.append(st)

    # GTP-bound RAS binds BRAF; GTP-bound RAS is active
    gtp = Agent('GTP')
    hras_gtp = Agent('HRAS', bound_conditions=[BoundCondition(gtp, True)])
    kras_gtp = Agent('KRAS', bound_conditions=[BoundCondition(gtp, True)])
    nras_gtp = Agent('NRAS', bound_conditions=[BoundCondition(gtp, True)])
    braf = Agent('BRAF')
    for ras_gtp in [hras_gtp, kras_gtp, nras_gtp]:
        st = Complex([ras_gtp, braf])
        statements.append(st)
        st = ActiveForm(ras_gtp, 'activity', True)
        statements.append(st)

    # RAS-bound BRAF dimerizes
    hras_braf = Agent('BRAF', bound_conditions=[BoundCondition(hras, True)])
    kras_braf = Agent('BRAF', bound_conditions=[BoundCondition(kras, True)])
    nras_braf = Agent('BRAF', bound_conditions=[BoundCondition(nras, True)])
    for braf1 in [hras_braf, kras_braf, nras_braf]:
        for braf2 in [hras_braf, kras_braf, nras_braf]:
            st = Complex([braf1, braf2])
            statements.append(st)

    # BRAF dimers transphosphorylate
    braf_bound = Agent('BRAF', bound_conditions=[BoundCondition(braf, True)])
    st = Transphosphorylation(braf_bound)
    statements.append(st)

    # Phosphorylated BRAF is an active kinase and phosphorylates MEK1/2
    braf_phos = Agent('BRAF', mods=[ModCondition('phosphorylation')])
    mek1 = Agent('MAP2K1')
    mek2 = Agent('MAP2K2')
    st = ActiveForm(braf_phos, 'kinase', True)
    statements.append(st)
    st = Phosphorylation(braf_phos, mek1)
    statements.append(st)
    st = Phosphorylation(braf_phos, mek2)
    statements.append(st)

    # Phosphorylated MEK1/2 is an active kinase and phosphorylates ERK1/2
    mek1_phos = Agent('MAP2K1', mods=[ModCondition('phosphorylation')])
    mek2_phos = Agent('MAP2K2', mods=[ModCondition('phosphorylation')])
    mapk1 = Agent('MAPK1')
    mapk3 = Agent('MAPK3')
    st = ActiveForm(mek1_phos, 'kinase', True)
    statements.append(st)
    st = ActiveForm(mek2_phos, 'kinase', True)
    statements.append(st)
    for mek in [mek1_phos, mek2_phos]:
        for erk in [mapk1, mapk3]:
            st = Phosphorylation(mek, erk)
            statements.append(st)

    # All assertions get full belief and an evidence entry marking them as
    # assertions
    for st in statements:
        st.belief = 1
        st.evidence.append(Evidence(source_api='assertion'))

    # Update the statements with grounding info. To do this, we set the 'TEXT'
    # entry of each agent's db_refs to the agent name, then run the grounding
    # mapper
    for st in statements:
        for ag in st.agent_list():
            if ag is None:
                continue
            ag.db_refs = {'TEXT': ag.name}
    # Now load the grounding map and run
    gm = GroundingMapper(default_grounding_map)
    mapped_stmts = gm.map_agents(statements)
    # This shouldn't change anything, but just in case...
    renamed_stmts = gm.rename_agents(mapped_stmts)
    return renamed_stmts
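
# Hedged usage sketch (not part of the original script): one way the functions
# above could be wired together. The input pickle name ('reading_stmts.pkl'),
# its assumed structure (a dict mapping PMCIDs to lists of Statements from
# reading) and the output folder name are hypothetical.
if __name__ == '__main__':
    # Prior (background) assertions about the EGFR-RAS-ERK pathway
    background_assertions = get_statements()
    # Statements extracted by reading, keyed by PMCID (hypothetical file)
    with open('reading_stmts.pkl', 'rb') as fh:
        stmts_by_paper = pickle.load(fh)
    for paper_id, reading_stmts in stmts_by_paper.items():
        # Assemble the reading output together with the background assertions;
        # passing background_assertions separately lets run_assembly filter
        # out top-level statements that come only from the prior
        run_assembly(reading_stmts + background_assertions, 'output', paper_id,
                     background_assertions=background_assertions)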
def run_assembly(stmts, folder, pmcid, background_assertions=None):
    '''Run assembly on a list of statements, for a given PMCID.'''
    # Folder for index card output (scored submission)
    indexcard_prefix = folder + '/index_cards/' + pmcid
    # Folder for other outputs (for analysis, debugging)
    otherout_prefix = folder + '/other_outputs/' + pmcid

    # Do grounding mapping here
    # Load the TRIPS-specific grounding map and add it to the default
    # (REACH-oriented) grounding map
    trips_gm = load_grounding_map('trips_grounding_map.csv')
    default_grounding_map.update(trips_gm)
    gm = GroundingMapper(default_grounding_map)
    mapped_agent_stmts = gm.map_agents(stmts)
    renamed_agent_stmts = gm.rename_agents(mapped_agent_stmts)

    # Filter for grounding
    grounded_stmts = []
    for st in renamed_agent_stmts:
        if all([is_protein_or_chemical(a) for a in st.agent_list()]):
            grounded_stmts.append(st)

    # Instantiate the Preassembler
    pa = Preassembler(hierarchies)
    pa.add_statements(grounded_stmts)
    print('== %s ====================' % pmcid)
    print('%d statements collected in total.' % len(pa.stmts))

    # Combine duplicates
    unique_stmts = pa.combine_duplicates()
    print('%d statements after combining duplicates.' % len(unique_stmts))

    # Run BeliefEngine on unique statements
    epe = BeliefEngine()
    epe.set_prior_probs(pa.unique_stmts)

    # Build statement hierarchy
    related_stmts = pa.combine_related()
    # Run BeliefEngine on hierarchy
    epe.set_hierarchy_probs(related_stmts)
    print('%d statements after combining related.' % len(related_stmts))

    # Link statements using the mechanism linker
    linked_stmts = MechLinker.infer_active_forms(related_stmts)
    linked_stmts += MechLinker.infer_modifications(related_stmts)
    linked_stmts += MechLinker.infer_activations(related_stmts)
    # Run BeliefEngine on linked statements
    epe.set_linked_probs(linked_stmts)
    # Print linked statements for debugging purposes
    print('Linked\n=====')
    for ls in linked_stmts:
        print(ls.inferred_stmt.belief, ls.inferred_stmt)
    print('=============')

    # Combine all statements including linked ones
    all_statements = related_stmts + [ls.inferred_stmt for ls in linked_stmts]

    # Instantiate a new preassembler
    pa = Preassembler(hierarchies, all_statements)
    # Build hierarchy again
    pa.combine_duplicates()
    # Choose the top-level statements
    related_stmts = pa.combine_related()

    # Remove top-level statements that came only from the prior
    if background_assertions is not None:
        nonbg_stmts = [stmt for stmt in related_stmts
                       if stmt not in background_assertions]
    else:
        nonbg_stmts = related_stmts

    # Dump top-level statements in a pickle
    with open(otherout_prefix + '.pkl', 'wb') as fh:
        pickle.dump(nonbg_stmts, fh)

    # Flatten evidence for statements
    flattened_evidence_stmts = flatten_evidence(nonbg_stmts)

    # Start a card counter
    card_counter = 1
    # We don't limit the number of cards reported in this round
    card_lim = float('inf')
    top_stmts = []
    ###############################################
    # The belief cutoff for statements
    belief_cutoff = 0.3
    ###############################################
    # Sort by belief (i.e. by amount of evidence)
    for st in sorted(flattened_evidence_stmts,
                     key=lambda x: x.belief, reverse=True):
        if st.belief >= belief_cutoff:
            print(st.belief, st)
        if st.belief < belief_cutoff:
            print('SKIP', st.belief, st)

        # If it's background knowledge, we skip the statement
        if is_background_knowledge(st):
            print('This statement is background knowledge - skipping.')
            continue

        # Assemble IndexCards
        ia = IndexCardAssembler([st], pmc_override=pmcid)
        ia.make_model()
        # If the index card was actually made
        # (not all statements can be assembled into index cards, so
        # this is often not the case)
        if ia.cards:
            # Save the index card json
            ia.save_model(indexcard_prefix + '-%d.json' % card_counter)
            card_counter += 1
            top_stmts.append(st)
            if card_counter > card_lim:
                break

    # Print the English-assembled model for debugging purposes
    ea = EnglishAssembler(top_stmts)
    print('=======================')
    print(ea.make_model().encode('utf-8'))
    print('=======================')

    # Print the statement graph
    graph = render_stmt_graph(nonbg_stmts)
    graph.draw(otherout_prefix + '_graph.pdf', prog='dot')
    # Print statement diagnostics
    print_stmts(pa.stmts, otherout_prefix + '_statements.tsv')
    print_stmts(related_stmts, otherout_prefix + '_related_statements.tsv')
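
# A minimal illustrative sketch (an assumption, not the original helper) of the
# flatten_evidence() step used in run_assembly() above, assuming its purpose is
# to copy evidence from the statements that a top-level statement supersedes
# (its supported_by list, populated by combine_related) onto that statement.
def flatten_evidence(stmts):
    '''Gather evidence from supported statements onto each top-level statement.'''
    for stmt in stmts:
        total_evidence = list(stmt.evidence)
        for supported_stmt in stmt.supported_by:
            for ev in supported_stmt.evidence:
                # Avoid adding the same evidence object twice
                if ev not in total_evidence:
                    total_evidence.append(ev)
        stmt.evidence = total_evidence
    return stmts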