def map_statements(stmts, source, outfile=None):
    """Tabulate valid, invalid, and mapped sites from a set of Statements."""
    # Look for errors in database statements
    sm = SiteMapper(default_site_map)
    valid_stmts, mapped_stmts = sm.map_sites(stmts)
    # Collect stats from SiteMapper itself
    sites = []
    for site_key, mapping in sm._cache.items():
        gene, res, pos = site_key
        freq = sm._sitecount[site_key]
        if mapping == 'VALID':
            valid, mapped, mapped_res, mapped_pos, explanation = \
                (1, 0, None, None, None)
        else:
            valid = 0
            # Not mapped
            if mapping is None:
                mapped, mapped_res, mapped_pos, explanation = \
                    (0, None, None, None)
            # Mapped!
            else:
                mapped_res, mapped_pos, explanation = mapping
                mapped = 1 if mapped_pos else 0
        si = SiteInfo(gene, res, pos, valid, mapped, mapped_res, mapped_pos,
                      explanation, freq, source)
        sites.append(si)
    # Write to CSV file; use SiteInfo._fields for the header so this works
    # even when no sites were collected
    if outfile:
        header = [[field.upper() for field in SiteInfo._fields]]
        rows = header + replace_nones(sites)
        write_unicode_csv(outfile, rows)
    return sites
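# Usage sketch for map_statements (not from the source; the pickle and CSV
# file names and the 'pc_pid' source label are hypothetical). Each SiteInfo
# row carries a `valid` flag, so invalid sites can be counted directly.
def _demo_map_statements():
    import pickle
    with open('db_stmts.pkl', 'rb') as fh:  # hypothetical input pickle
        stmts = pickle.load(fh)
    sites = map_statements(stmts, source='pc_pid', outfile='pid_sites.csv')
    n_invalid = sum(1 for s in sites if not s.valid)
    print('%d of %d sites are invalid' % (n_invalid, len(sites)))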
def map_sequence(stmts_in, **kwargs):
    """Map sequences using the SiteMapper.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to map.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of mapped statements.
    """
    logger.info('Mapping sites on %d statements...' % len(stmts_in))
    sm = SiteMapper(default_site_map)
    valid, mapped = sm.map_sites(stmts_in)
    correctly_mapped_stmts = []
    for ms in mapped:
        if all(mm[1] is not None for mm in ms.mapped_mods):
            correctly_mapped_stmts.append(ms.mapped_stmt)
    stmts_out = valid + correctly_mapped_stmts
    logger.info('%d statements with valid sites' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out
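# Hedged helper sketch (an assumption, not in the source): map_sequence keeps
# a MappedStatement only when every entry in mapped_mods has a non-None
# mapping (mm[1]). This variant also returns the partially mapped statements
# so they can be inspected instead of silently dropped.
def _split_mapped(mapped):
    fully_mapped, partially_mapped = [], []
    for ms in mapped:
        if all(mm[1] is not None for mm in ms.mapped_mods):
            fully_mapped.append(ms.mapped_stmt)
        else:
            partially_mapped.append(ms)
    return fully_mapped, partially_mapped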
def extract_phos():
    with open(stmts_fname, 'rb') as fh:
        model = pickle.load(fh)
    stmts = []
    for pmid, pmid_stmts in model.items():
        for stmt in pmid_stmts:
            if isinstance(stmt, Phosphorylation):
                stmts.append(stmt)
    logger.info('%d phosphorylations in RAS Machine' % len(stmts))

    stmts = [s for s in stmts if s.enz is not None]
    logger.info('%d phosphorylations with enzyme in RAS Machine' % len(stmts))

    stmts_grounded = filter_grounded(stmts)
    logger.info('%d grounded phosphorylations in RAS Machine' %
                len(stmts_grounded))

    stmts_enzkinase = filter_enzkinase(stmts_grounded)
    logger.info('%d phosphorylations with kinase enzyme in RAS Machine' %
                len(stmts_enzkinase))

    sm = SiteMapper(default_site_map)
    stmts_valid, _ = sm.map_sites(stmts_enzkinase)
    logger.info('%d valid-sequence phosphorylations in RAS Machine' %
                len(stmts_valid))

    pa = Preassembler(hierarchies, stmts_valid)
    stmts_unique = pa.combine_duplicates()
    logger.info('%d unique phosphorylations in RAS Machine' %
                len(stmts_unique))
    stmts_unique = pa.combine_related()
    logger.info('%d top-level phosphorylations in RAS Machine' %
                len(stmts_unique))
    with open('mapped_unique_phos.pkl', 'wb') as fh:
        pickle.dump(stmts_unique, fh, protocol=2)

    # Filter RAS Machine statements for direct and not hypothesis
    stmts = filter_direct(stmts_unique)
    logger.info('%d direct phosphorylations in RAS Machine' % len(stmts))
    stmts = filter_non_hypothesis(stmts)
    logger.info('%d non-hypothesis phosphorylations in RAS Machine' %
                len(stmts))
    with open('filtered_phos.pkl', 'wb') as fh:
        pickle.dump(stmts, fh, protocol=2)
    return stmts
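# Usage sketch (hypothetical, not from the source): run the extraction
# pipeline above and list the kinases that appear as enzymes. Relies only on
# attributes extract_phos already guarantees (every returned Phosphorylation
# has a non-None enz Agent).
def _demo_extract_phos():
    stmts = extract_phos()
    kinases = sorted({s.enz.name for s in stmts})
    print('%d distinct kinases: %s' % (len(kinases), ', '.join(kinases)))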
def map_sequence(stmts_in, **kwargs):
    """Map sequences using the SiteMapper.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to map.
    do_methionine_offset : boolean
        Whether to check for off-by-one errors in site position (possibly)
        attributable to site numbering from mature proteins after cleavage
        of the initial methionine. If True, checks the reference sequence
        for a known modification at 1 site position greater than the given
        one; if there exists such a site, creates the mapping. Default is
        True.
    do_orthology_mapping : boolean
        Whether to check sequence positions for known modification sites
        in mouse or rat sequences (based on PhosphoSitePlus data). If a
        mouse/rat site is found that is linked to a site in the human
        reference sequence, a mapping is created. Default is True.
    do_isoform_mapping : boolean
        Whether to check sequence positions for known modifications in
        other human isoforms of the protein (based on PhosphoSitePlus
        data). If a site is found that is linked to a site in the human
        reference sequence, a mapping is created. Default is True.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of mapped statements.
    """
    logger.info('Mapping sites on %d statements...' % len(stmts_in))
    kwarg_list = ['do_methionine_offset', 'do_orthology_mapping',
                  'do_isoform_mapping']
    sm = SiteMapper(default_site_map)
    valid, mapped = sm.map_sites(stmts_in, **_filter(kwargs, kwarg_list))
    correctly_mapped_stmts = []
    for ms in mapped:
        if all(mm[1] is not None for mm in ms.mapped_mods):
            correctly_mapped_stmts.append(ms.mapped_stmt)
    stmts_out = valid + correctly_mapped_stmts
    logger.info('%d statements with valid sites' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out
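# Usage sketch for the keyword-aware map_sequence above (the pickle file name
# is hypothetical): disable the isoform-based mapping while keeping the
# methionine-offset and orthology checks, and save the result via the `save`
# keyword handled in the function body.
def _demo_map_sequence(stmts_in):
    stmts_out = map_sequence(stmts_in,
                             do_methionine_offset=True,
                             do_orthology_mapping=True,
                             do_isoform_mapping=False,
                             save='mapped_stmts.pkl')
    return stmts_out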
gene_counts = make_bar_plot(sites)

# This script does three things:
# 1) Plots stats on invalid sites from databases
#    - showing their frequency
#      - per site
#      - per reaction
# 2) Shows the fraction of the invalid sites in DBs that are mapped
#    - per site
#    - per reaction
# 3) Shows accuracy:
#    - that the mapped sites are likely legit
#    - and that the unmapped sites are likely errors
sm = SiteMapper(default_site_map)
with open('smcache.pkl', 'rb') as f:
    (sm._cache, sm._sitecount) = pickle.load(f)
# Load the agent files
agent_files = ['pc_pid_modified_agents.pkl',
               'pc_psp_modified_agents.pkl',
               'pc_reactome_modified_agents.pkl']
# For each set of mods
all_sites = []
for agent_file in agent_files:
    db_name = agent_file.split('_')[1]
    sites = map_agents(agent_file, sm, db_name)
    all_sites += sites
    print("Stats for %s -------------" % db_name)
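# Hedged sketch of how a cache like smcache.pkl could be produced (an
# assumption inferred from the tuple unpacked above, not code from the
# source): after one mapping pass, the SiteMapper's _cache and _sitecount
# dicts are pickled together so later runs can skip remapping.
def _save_sitemapper_cache(sm, fname='smcache.pkl'):
    import pickle
    with open(fname, 'wb') as f:
        pickle.dump((sm._cache, sm._sitecount), f)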
def preassemble(self, filters=None):
    """Preassemble the Statements collected in the model.

    Use INDRA's GroundingMapper, Preassembler and BeliefEngine on the
    IncrementalModel and save the unique statements and the top-level
    statements in class attributes.

    Currently the following filter options are implemented:
    - grounding: require that all Agents in statements are grounded
    - human_only: require that all protein Agents are human proteins
    - model_one: require that at least one Agent is in the incremental
      model
    - model_all: require that all Agents are in the incremental model
    - prior_one: require that at least one Agent is in the prior model
    - prior_all: require that all Agents are in the prior model

    Note that the options from model_one through prior_all are
    increasingly more restrictive.

    Parameters
    ----------
    filters : Optional[list[str]]
        A list of filter options to apply when choosing the statements.
        See description above for more details. Default: None
    """
    stmts = self.get_statements()
    logger.info("%d raw Statements in total" % len(stmts))
    # Fix grounding
    logger.info("Running grounding map")
    twg = gm.agent_texts_with_grounding(stmts)
    prot_map = gm.protein_map_from_twg(twg)
    gm.default_grounding_map.update(prot_map)
    gmap = gm.GroundingMapper(gm.default_grounding_map)
    stmts = gmap.map_agents(stmts, do_rename=True)
    logger.info("%d Statements after grounding map" % len(stmts))
    # Fix sites
    sm = SiteMapper(default_site_map)
    stmts, _ = sm.map_sites(stmts)
    logger.info("%d Statements with valid sequence" % len(stmts))
    if filters:
        if "grounding" in filters:
            # Filter out ungrounded statements
            logger.info("Running grounding filter")
            stmts = self._relevance_filter(stmts, ["grounding"])
            logger.info("%d Statements after filter" % len(stmts))
        if "human_only" in filters:
            # Filter out non-human proteins
            logger.info("Running non-human protein filter")
            stmts = self._relevance_filter(stmts, ["human_only"])
            logger.info("%d Statements after filter" % len(stmts))
        for rel_key in ("prior_one", "model_one", "prior_all", "model_all"):
            if rel_key in filters:
                logger.info("Running %s relevance filter" % rel_key)
                stmts = self._relevance_filter(stmts, [rel_key])
                logger.info("%d Statements after filter" % len(stmts))
    # Combine duplicates
    logger.info("Preassembling %d Statements" % len(stmts))
    pa = Preassembler(hierarchies, stmts)
    self.unique_stmts = pa.combine_duplicates()
    logger.info("%d unique Statements" % len(self.unique_stmts))
    # Run BeliefEngine on unique statements
    be = BeliefEngine()
    be.set_prior_probs(self.unique_stmts)
    # Build statement hierarchy
    self.unique_stmts = pa.combine_related(return_toplevel=False)
    self.toplevel_stmts = [st for st in self.unique_stmts
                           if not st.supports]
    logger.info("%d top-level Statements" % len(self.toplevel_stmts))
    # Run BeliefEngine on hierarchy
    be.set_hierarchy_probs(self.unique_stmts)
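# Usage sketch (hypothetical): `model` is assumed to be the IncrementalModel
# instance that owns the preassemble method above. The filter names come from
# the options documented in its docstring.
def _demo_preassemble(model):
    model.preassemble(filters=["grounding", "human_only", "model_one"])
    print("%d unique, %d top-level Statements" %
          (len(model.unique_stmts), len(model.toplevel_stmts)))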