def __init__(self, text, pmid, tees_path, python2_path):
    # Store pmid
    self.pmid = pmid

    # Load grounding information
    path_this = os.path.dirname(os.path.abspath(__file__))
    gm_fname = os.path.join(path_this, '../../resources/',
                            'extracted_reach_grounding_map.csv')
    try:
        gm = load_grounding_map(gm_fname)
    except BaseException:
        raise Exception('Could not load the grounding map from ' + gm_fname)
    mapper = GroundingMapper(gm)

    # Run TEES and parse into networkx graph
    self.G = run_and_parse_tees(text, tees_path, python2_path)

    # Extract statements from the TEES graph
    self.statements = []
    self.statements.extend(self.process_phosphorylation_statements())
    self.statements.extend(self.process_binding_statements())
    self.statements.extend(self.process_increase_expression_amount())
    self.statements.extend(self.process_decrease_expression_amount())

    # Ground statements
    self.statements = mapper.map_agents(self.statements)
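# Usage sketch (hedged): the enclosing class name is not shown above, so
# TEESProcessor is assumed here, and all paths are placeholders. A local
# TEES installation and a Python 2 interpreter are required at the given
# paths for the constructor to run.
tp = TEESProcessor(text='MEK1 phosphorylates ERK2.',
                   pmid='12345678',
                   tees_path='/path/to/TEES',
                   python2_path='/usr/bin/python2')
print(tp.statements)  # grounded INDRA Statements extracted from the text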
def run_with_pmids_helper(model_path, pmids):
    default_config_fname = os.path.join(model_path, 'config.yaml')
    config = get_machine_config(default_config_fname)
    belief_threshold = config.get('belief_threshold', 0.95)
    twitter_cred = get_twitter_cred(config)
    ndex_cred = get_ndex_cred(config)

    # Get optional grounding map
    gm_path = config.get('grounding_map_path')
    if gm_path:
        try:
            from indra.preassembler.grounding_mapper import load_grounding_map
            grounding_map = load_grounding_map(gm_path)
        except Exception as e:
            logger.error('Could not load grounding map from %s' % gm_path)
            logger.error(e)
            grounding_map = None
    else:
        grounding_map = None

    run_machine(model_path,
                {'enumerated': [pmid.strip() for pmid in pmids]},
                belief_threshold,
                ndex_cred=ndex_cred,
                twitter_cred=twitter_cred,
                grounding_map=grounding_map)
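# Usage sketch (hedged): assumes a model directory containing config.yaml
# and a newline-separated PMID list; the file and directory names are
# placeholders. The helper strips whitespace from each PMID itself, so
# readlines() output can be passed directly.
with open('pmids.txt') as fh:
    pmids = fh.readlines()
run_with_pmids_helper('/path/to/model', pmids)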
def __init__(self, a1_text, a2_text, sentence_segmentations, pmid):
    # Store pmid
    self.pmid = pmid

    # Load grounding information
    path_this = os.path.dirname(os.path.abspath(__file__))
    gm_fname = os.path.join(path_this, '../../resources/',
                            'extracted_reach_grounding_map.csv')
    try:
        gm = load_grounding_map(gm_fname)
    except BaseException:
        raise Exception('Could not load the grounding map from ' + gm_fname)
    mapper = GroundingMapper(gm)

    # Parse the given TEES output into a networkx graph
    self.G = parse_output(a1_text, a2_text, sentence_segmentations)

    # Extract statements from the TEES graph
    self.statements = []
    self.statements.extend(self.process_phosphorylation_statements())
    self.statements.extend(self.process_binding_statements())
    self.statements.extend(self.process_increase_expression_amount())
    self.statements.extend(self.process_decrease_expression_amount())

    # Ground statements
    self.statements = mapper.map_agents(self.statements)
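# Usage sketch (hedged): this variant consumes pre-existing TEES output
# instead of running TEES. TEESProcessor is again an assumed class name,
# the file names are placeholders, and the a1/a2 standoff files plus the
# sentence segmentation output are taken to come from an earlier TEES run.
with open('output.a1') as fh:
    a1_text = fh.read()
with open('output.a2') as fh:
    a2_text = fh.read()
with open('sentences.xml') as fh:
    sentence_segmentations = fh.read()
tp = TEESProcessor(a1_text, a2_text, sentence_segmentations, pmid='12345678')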
def add_grounding(self):
    # Load grounding information
    path_this = os.path.dirname(os.path.abspath(__file__))
    gm_fname = os.path.join(path_this, '../../resources/',
                            'extracted_reach_grounding_map.csv')
    try:
        gm = load_grounding_map(gm_fname)
    except BaseException:
        raise Exception('Could not load the grounding map from ' + gm_fname)
    mapper = GroundingMapper(gm)
    self.statements = mapper.map_agents(self.statements)
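# The same load-and-map pattern in isolation: a hedged sketch for grounding
# an arbitrary statement list `stmts` (assumed to exist). The CSV path is a
# placeholder; load_grounding_map is imported from
# indra.preassembler.grounding_mapper as in run_with_pmids_helper above,
# and GroundingMapper is assumed to live in the same module.
from indra.preassembler.grounding_mapper import (GroundingMapper,
                                                 load_grounding_map)

gm = load_grounding_map('extracted_reach_grounding_map.csv')
mapper = GroundingMapper(gm)
stmts = mapper.map_agents(stmts)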
def run_assembly(stmts, folder, pmcid, background_assertions=None):
    '''Run assembly on a list of statements, for a given PMCID.'''
    # Folder for index card output (scored submission)
    indexcard_prefix = folder + '/index_cards/' + pmcid
    # Folder for other outputs (for analysis, debugging)
    otherout_prefix = folder + '/other_outputs/' + pmcid

    # Do grounding mapping here
    # Load the TRIPS-specific grounding map and add to the default
    # (REACH-oriented) grounding map:
    trips_gm = load_grounding_map('trips_grounding_map.csv')
    default_grounding_map.update(trips_gm)
    gm = GroundingMapper(default_grounding_map)

    mapped_agent_stmts = gm.map_agents(stmts)
    renamed_agent_stmts = gm.rename_agents(mapped_agent_stmts)

    # Filter for grounding
    grounded_stmts = []
    for st in renamed_agent_stmts:
        if all([is_protein_or_chemical(a) for a in st.agent_list()]):
            grounded_stmts.append(st)

    # Instantiate the Preassembler
    pa = Preassembler(hierarchies)
    pa.add_statements(grounded_stmts)

    print('== %s ====================' % pmcid)
    print('%d statements collected in total.' % len(pa.stmts))

    # Combine duplicates
    unique_stmts = pa.combine_duplicates()
    print('%d statements after combining duplicates.' % len(unique_stmts))

    # Run BeliefEngine on unique statements
    epe = BeliefEngine()
    epe.set_prior_probs(pa.unique_stmts)

    # Build statement hierarchy
    related_stmts = pa.combine_related()
    # Run BeliefEngine on hierarchy
    epe.set_hierarchy_probs(related_stmts)
    print('%d statements after combining related.' % len(related_stmts))

    # Instantiate the mechanism linker
    ml = MechLinker(related_stmts)
    # Link statements
    linked_stmts = ml.link_statements()
    # Run BeliefEngine on linked statements
    epe.set_linked_probs(linked_stmts)
    # Print linked statements for debugging purposes
    print('Linked\n=====')
    for ls in linked_stmts:
        print(ls.inferred_stmt.belief, ls.inferred_stmt)
    print('=============')

    # Combine all statements including linked ones
    all_statements = ml.statements + [ls.inferred_stmt for ls in linked_stmts]

    # Instantiate a new preassembler
    pa = Preassembler(hierarchies, all_statements)
    # Build hierarchy again
    pa.combine_duplicates()
    # Choose the top-level statements
    related_stmts = pa.combine_related()

    # Remove top-level statements that came only from the prior
    if background_assertions is not None:
        nonbg_stmts = [stmt for stmt in related_stmts
                       if stmt not in background_assertions]
    else:
        nonbg_stmts = related_stmts

    # Dump top-level statements in a pickle
    with open(otherout_prefix + '.pkl', 'wb') as fh:
        pickle.dump(nonbg_stmts, fh, protocol=2)

    # Flatten evidence for statements
    flattened_evidence_stmts = flatten_evidence(nonbg_stmts)

    # Start a card counter
    card_counter = 1
    # We don't limit the number of cards reported in this round
    card_lim = float('inf')
    top_stmts = []
    ###############################################
    # The belief cutoff for statements
    belief_cutoff = 0.3
    ###############################################
    # Sort by amount of evidence
    for st in sorted(flattened_evidence_stmts,
                     key=lambda x: x.belief, reverse=True):
        if st.belief >= belief_cutoff:
            print(st.belief, st)
        if st.belief < belief_cutoff:
            print('SKIP', st.belief, st)

        # If it's background knowledge, we skip the statement
        if is_background_knowledge(st):
            print('This statement is background knowledge - skipping.')
            continue

        # Assemble IndexCards
        ia = IndexCardAssembler([st], pmc_override=pmcid)
        ia.make_model()
        # If the index card was actually made
        # (not all statements can be assembled into index cards, so
        # this is often not the case)
        if ia.cards:
            # Save the index card json
            ia.save_model(indexcard_prefix + '-%d.json' % card_counter)
            card_counter += 1
            top_stmts.append(st)
            if card_counter > card_lim:
                break

    # Print the English-assembled model for debugging purposes
    ea = EnglishAssembler(top_stmts)
    print('=======================')
    print(ea.make_model())
    print('=======================')

    # Print the statement graph
    graph = render_stmt_graph(nonbg_stmts)
    graph.draw(otherout_prefix + '_graph.pdf', prog='dot')
    # Print statement diagnostics
    print_stmts(pa.stmts, otherout_prefix + '_statements.tsv')
    print_stmts(related_stmts, otherout_prefix + '_related_statements.tsv')
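# Usage sketch (hedged): statements for a paper are assumed to have been
# pickled earlier; the file name, folder and PMCID are placeholders.
# run_assembly writes index card JSONs, a statement pickle, a graph PDF
# and TSV diagnostics under the given folder, which must already contain
# index_cards/ and other_outputs/ subdirectories since they are not
# created here.
import pickle

with open('PMC1234567_stmts.pkl', 'rb') as fh:
    stmts = pickle.load(fh)
run_assembly(stmts, 'output', 'PMC1234567')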
def run_with_search_helper(model_path, config, num_days=None):
    logger.info('-------------------------')
    logger.info(time.strftime('%c'))
    if not os.path.isdir(model_path):
        logger.error('%s is not a directory', model_path)
        sys.exit()
    default_config_fname = os.path.join(model_path, 'config.yaml')
    if config:
        config = get_machine_config(config)
    elif os.path.exists(default_config_fname):
        logger.info('Loading default configuration from %s',
                    default_config_fname)
        config = get_machine_config(default_config_fname)
    else:
        logger.error('Configuration file argument missing.')
        sys.exit()

    # Probability cutoff for filtering statements
    default_belief_threshold = 0.95
    belief_threshold = config.get('belief_threshold')
    if belief_threshold is None:
        belief_threshold = default_belief_threshold
        msg = 'Belief threshold argument (belief_threshold) not specified.' + \
              ' Using default belief threshold %.2f' % default_belief_threshold
        logger.info(msg)
    else:
        logger.info('Using belief threshold: %.2f' % belief_threshold)

    twitter_cred = get_twitter_cred(config)
    if twitter_cred:
        logger.info('Using Twitter with given credentials.')
    else:
        logger.info('Not using Twitter due to missing credentials.')

    gmail_cred = get_gmail_cred(config)
    if gmail_cred:
        logger.info('Using Gmail with given credentials.')
    else:
        logger.info('Not using Gmail due to missing credentials.')

    ndex_cred = get_ndex_cred(config)
    if ndex_cred:
        logger.info('Using NDEx with given credentials.')
    else:
        logger.info('Not using NDEx due to missing information.')

    pmids = {}
    # Get email PMIDs
    if gmail_cred:
        logger.info('Getting PMIDs from emails.')
        try:
            email_pmids = get_email_pmids(gmail_cred)
            # Put the email_pmids into the pmids dictionary
            pmids['Gmail'] = email_pmids
            logger.info('Collected %d PMIDs from Gmail', len(email_pmids))
        except Exception:
            logger.exception('Could not get PMIDs from Gmail, continuing.')

    # Get PMIDs for general search_terms and genes
    search_genes = config.get('search_genes')
    search_terms = config.get('search_terms')
    if not search_terms:
        logger.info('No search terms argument (search_terms) specified.')
    else:
        if search_genes is not None:
            search_terms += search_genes
        logger.info('Using search terms: %s' % ', '.join(search_terms))

        if num_days is None:
            num_days = int(config.get('search_terms_num_days', 5))
        logger.info('Searching the last %d days', num_days)

        pmids_term = get_searchterm_pmids(search_terms, num_days=num_days)
        num_pmids = len(set(itt.chain.from_iterable(pmids_term.values())))
        logger.info('Collected %d PMIDs from PubMed search_terms.', num_pmids)
        pmids = _extend_dict(pmids, pmids_term)

    # Get optional grounding map
    gm_path = config.get('grounding_map_path')
    if gm_path:
        try:
            from indra.preassembler.grounding_mapper import load_grounding_map
            grounding_map = load_grounding_map(gm_path)
        except Exception as e:
            logger.error('Could not load grounding map from %s' % gm_path)
            logger.error(e)
            grounding_map = None
    else:
        grounding_map = None

    '''
    # Get PMIDs for search_genes
    # Temporarily removed because Entrez-based article searches
    # are lagging behind and cannot be time-limited
    if not search_genes:
        logger.info('No search genes argument (search_genes) specified.')
    else:
        logger.info('Using search genes: %s' % ', '.join(search_genes))
        pmids_gene = get_searchgenes_pmids(search_genes, num_days=5)
        num_pmids = sum([len(pm) for pm in pmids_gene.values()])
        logger.info('Collected %d PMIDs from PubMed search_genes.'
                    % num_pmids)
        pmids = _extend_dict(pmids, pmids_gene)
    '''

    run_machine(model_path,
                pmids,
                belief_threshold,
                search_genes=search_genes,
                ndex_cred=ndex_cred,
                twitter_cred=twitter_cred,
                grounding_map=grounding_map)
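# Usage sketch (hedged): with config=None the helper falls back to
# <model_path>/config.yaml. Config keys read directly in this function:
# belief_threshold, search_terms, search_genes, search_terms_num_days and
# grounding_map_path; Twitter, Gmail and NDEx credentials are resolved by
# the get_*_cred helpers. The path is a placeholder.
run_with_search_helper('/path/to/model', config=None, num_days=7)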
def run_assembly(stmts, folder, pmcid, background_assertions=None):
    '''Run assembly on a list of statements, for a given PMCID.'''
    # Folder for index card output (scored submission)
    indexcard_prefix = folder + '/index_cards/' + pmcid
    # Folder for other outputs (for analysis, debugging)
    otherout_prefix = folder + '/other_outputs/' + pmcid

    # Do grounding mapping here
    # Load the TRIPS-specific grounding map and add to the default
    # (REACH-oriented) grounding map:
    trips_gm = load_grounding_map('trips_grounding_map.csv')
    default_grounding_map.update(trips_gm)
    gm = GroundingMapper(default_grounding_map)

    mapped_agent_stmts = gm.map_agents(stmts)
    renamed_agent_stmts = gm.rename_agents(mapped_agent_stmts)

    # Filter for grounding
    grounded_stmts = []
    for st in renamed_agent_stmts:
        if all([is_protein_or_chemical(a) for a in st.agent_list()]):
            grounded_stmts.append(st)

    # Instantiate the Preassembler
    pa = Preassembler(hierarchies)
    pa.add_statements(grounded_stmts)

    print('== %s ====================' % pmcid)
    print('%d statements collected in total.' % len(pa.stmts))

    # Combine duplicates
    unique_stmts = pa.combine_duplicates()
    print('%d statements after combining duplicates.' % len(unique_stmts))

    # Run BeliefEngine on unique statements
    epe = BeliefEngine()
    epe.set_prior_probs(pa.unique_stmts)

    # Build statement hierarchy
    related_stmts = pa.combine_related()
    # Run BeliefEngine on hierarchy
    epe.set_hierarchy_probs(related_stmts)
    print('%d statements after combining related.' % len(related_stmts))

    # Link statements using MechLinker's static inference methods
    linked_stmts = MechLinker.infer_active_forms(related_stmts)
    linked_stmts += MechLinker.infer_modifications(related_stmts)
    linked_stmts += MechLinker.infer_activations(related_stmts)
    # Run BeliefEngine on linked statements
    epe.set_linked_probs(linked_stmts)
    # Print linked statements for debugging purposes
    print('Linked\n=====')
    for ls in linked_stmts:
        print(ls.inferred_stmt.belief, ls.inferred_stmt)
    print('=============')

    # Combine all statements including linked ones
    all_statements = related_stmts + [ls.inferred_stmt for ls in linked_stmts]

    # Instantiate a new preassembler
    pa = Preassembler(hierarchies, all_statements)
    # Build hierarchy again
    pa.combine_duplicates()
    # Choose the top-level statements
    related_stmts = pa.combine_related()

    # Remove top-level statements that came only from the prior
    if background_assertions is not None:
        nonbg_stmts = [stmt for stmt in related_stmts
                       if stmt not in background_assertions]
    else:
        nonbg_stmts = related_stmts

    # Dump top-level statements in a pickle
    with open(otherout_prefix + '.pkl', 'wb') as fh:
        pickle.dump(nonbg_stmts, fh)

    # Flatten evidence for statements
    flattened_evidence_stmts = flatten_evidence(nonbg_stmts)

    # Start a card counter
    card_counter = 1
    # We don't limit the number of cards reported in this round
    card_lim = float('inf')
    top_stmts = []
    ###############################################
    # The belief cutoff for statements
    belief_cutoff = 0.3
    ###############################################
    # Sort by amount of evidence
    for st in sorted(flattened_evidence_stmts,
                     key=lambda x: x.belief, reverse=True):
        if st.belief >= belief_cutoff:
            print(st.belief, st)
        if st.belief < belief_cutoff:
            print('SKIP', st.belief, st)

        # If it's background knowledge, we skip the statement
        if is_background_knowledge(st):
            print('This statement is background knowledge - skipping.')
            continue

        # Assemble IndexCards
        ia = IndexCardAssembler([st], pmc_override=pmcid)
        ia.make_model()
        # If the index card was actually made
        # (not all statements can be assembled into index cards, so
        # this is often not the case)
        if ia.cards:
            # Save the index card json
            ia.save_model(indexcard_prefix + '-%d.json' % card_counter)
            card_counter += 1
            top_stmts.append(st)
            if card_counter > card_lim:
                break

    # Print the English-assembled model for debugging purposes
    ea = EnglishAssembler(top_stmts)
    print('=======================')
    print(ea.make_model().encode('utf-8'))
    print('=======================')

    # Print the statement graph
    graph = render_stmt_graph(nonbg_stmts)
    graph.draw(otherout_prefix + '_graph.pdf', prog='dot')
    # Print statement diagnostics
    print_stmts(pa.stmts, otherout_prefix + '_statements.tsv')
    print_stmts(related_stmts, otherout_prefix + '_related_statements.tsv')
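# Usage sketch (hedged): same calling convention as the variant above, here
# also passing background_assertions so that top-level statements that come
# only from prior knowledge are dropped from the output. `stmts` and
# `background_stmts` are assumed statement lists; paths and the PMCID are
# placeholders.
run_assembly(stmts, 'output', 'PMC1234567',
             background_assertions=background_stmts)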