def extract_phos():
    """Extract, filter and preassemble phosphorylations from the pickled model.

    Loads the pickle at ``stmts_fname`` (a mapping whose values are iterables
    of statements), keeps the ``Phosphorylation`` instances and narrows them
    step by step, logging the number of statements left after each step:

    1. statements whose ``enz`` is not None
    2. ``filter_grounded``
    3. ``filter_enzkinase``
    4. statements reported as valid by ``SiteMapper.map_sites`` (the second
       element returned by ``map_sites`` is discarded here)
    5. ``Preassembler.combine_duplicates`` and then
       ``Preassembler.combine_related``
    6. ``filter_direct`` and then ``filter_non_hypothesis``

    Side effects: writes ``mapped_unique_phos.pkl`` (result of step 5) and
    ``filtered_phos.pkl`` (result of step 6) relative to the current working
    directory, both with pickle protocol 2.

    Returns
    -------
    stmts
        The statements returned by ``filter_non_hypothesis`` (presumably a
        list of ``Phosphorylation`` statements; confirm against that helper).
    """
    # NOTE: unpickling can execute arbitrary code, so stmts_fname must refer
    # to a trusted file.
    with open(stmts_fname, 'rb') as fh:
        model = pickle.load(fh)

    # Only the per-key statement collections are needed, not the keys.
    stmts = [stmt
             for pmid_stmts in model.values()
             for stmt in pmid_stmts
             if isinstance(stmt, Phosphorylation)]
    logger.info('%d phosphorylations in RAS Machine', len(stmts))

    stmts = [s for s in stmts if s.enz is not None]
    logger.info('%d phosphorylations with enzyme in RAS Machine', len(stmts))

    stmts_grounded = filter_grounded(stmts)
    logger.info('%d grounded phosphorylations in RAS Machine',
                len(stmts_grounded))

    stmts_enzkinase = filter_enzkinase(stmts_grounded)
    logger.info('%d phosphorylations with kinase enzyme in RAS Machine',
                len(stmts_enzkinase))

    sm = SiteMapper(default_site_map)
    stmts_valid, _ = sm.map_sites(stmts_enzkinase)
    logger.info('%d valid-sequence phosphorylations in RAS Machine',
                len(stmts_valid))

    # combine_duplicates() must run before combine_related(); its result is
    # only used for the log line, the related-combined result is what is kept.
    pa = Preassembler(hierarchies, stmts_valid)
    stmts_unique = pa.combine_duplicates()
    logger.info('%d unique phosphorylations in RAS Machine',
                len(stmts_unique))
    stmts_unique = pa.combine_related()
    logger.info('%d top-level phosphorylations in RAS Machine',
                len(stmts_unique))

    with open('mapped_unique_phos.pkl', 'wb') as fh:
        pickle.dump(stmts_unique, fh, protocol=2)

    # Filter RAS Machine statements for direct and not hypothesis
    stmts = filter_direct(stmts_unique)
    logger.info('%d direct phosphorylations in RAS Machine', len(stmts))
    stmts = filter_non_hypothesis(stmts)
    logger.info('%d non-hypothesis phosphorylations in RAS Machine',
                len(stmts))

    with open('filtered_phos.pkl', 'wb') as fh:
        pickle.dump(stmts, fh, protocol=2)
    return stmts
def map_sequence(stmts_in, **kwargs):
    """Map sequences using the SiteMapper.

    The options below are all passed as keyword arguments. The three
    ``do_*`` options are forwarded to ``SiteMapper.map_sites`` (via
    ``_filter``); ``save`` is handled by this function.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to map.
    do_methionine_offset : boolean
        Whether to check for off-by-one errors in site position (possibly)
        attributable to site numbering from mature proteins after
        cleavage of the initial methionine. If True, checks the reference
        sequence for a known modification at 1 site position greater
        than the given one; if there exists such a site, creates the
        mapping. Default is True.
    do_orthology_mapping : boolean
        Whether to check sequence positions for known modification sites
        in mouse or rat sequences (based on PhosphoSitePlus data). If a
        mouse/rat site is found that is linked to a site in the human
        reference sequence, a mapping is created. Default is True.
    do_isoform_mapping : boolean
        Whether to check sequence positions for known modifications
        in other human isoforms of the protein (based on PhosphoSitePlus
        data). If a site is found that is linked to a site in the human
        reference sequence, a mapping is created. Default is True.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of mapped statements: the statements reported as valid,
        followed by the mapped statements whose modifications were all
        successfully mapped.
    """
    logger.info('Mapping sites on %d statements...', len(stmts_in))
    kwarg_list = ['do_methionine_offset', 'do_orthology_mapping',
                  'do_isoform_mapping']
    sm = SiteMapper(default_site_map)
    valid, mapped = sm.map_sites(stmts_in, **_filter(kwargs, kwarg_list))
    # Keep a mapped statement only when every entry of its mapped_mods has a
    # non-None second element; an empty mapped_mods therefore also qualifies.
    correctly_mapped_stmts = [
        ms.mapped_stmt for ms in mapped
        if all(mm[1] is not None for mm in ms.mapped_mods)
    ]
    stmts_out = valid + correctly_mapped_stmts
    logger.info('%d statements with valid sites', len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out
gene_counts = make_bar_plot(sites) """ # This script does three things: # 1) Plots stats on invalid sites from databases # - showing their frequency # - per site # - per reaction # 2) Showing the fraction of the invalid sites in DBs that are mapped # - per site # - per reaction # 3) Showing accuracy: # - that the mapped sites are likely legit # - and that the unmapped sites are likely errors sm = SiteMapper(default_site_map) with open('smcache.pkl', 'rb') as f: (sm._cache, sm._sitecount) = pickle.load(f) # Load the agent files agent_files = [ 'pc_pid_modified_agents.pkl', 'pc_psp_modified_agents.pkl', 'pc_reactome_modified_agents.pkl' ] # For each set of mods all_sites = [] for agent_file in agent_files: db_name = agent_file.split('_')[1] sites = map_agents(agent_file, sm, db_name) all_sites += sites print("Stats for %s -------------" % db_name)