Example #1
0
def map_statements(stmts, source, outfile=None):
    """Tabulate valid, invalid, and mapped sites from a set of Statements.

    Runs the SiteMapper over ``stmts``, then walks the mapper's internal
    site cache to produce one SiteInfo record per distinct site, tagging
    each as valid, mapped, or unmapped.  If ``outfile`` is given, the
    records are additionally written out as a CSV file.
    """
    mapper = SiteMapper(default_site_map)
    # Mapping populates the mapper's cache and site counters as a side
    # effect; the returned statement lists themselves are not needed here.
    mapper.map_sites(stmts)
    sites = []
    for site_key, mapping in mapper._cache.items():
        gene, residue, position = site_key
        occurrences = mapper._sitecount[site_key]
        if mapping == 'VALID':
            # Site matched the reference sequence directly.
            is_valid, is_mapped = 1, 0
            new_res = new_pos = why = None
        elif mapping is None:
            # Invalid site for which no correction could be found.
            is_valid, is_mapped = 0, 0
            new_res = new_pos = why = None
        else:
            # Invalid site with a proposed correction from the mapper.
            is_valid = 0
            new_res, new_pos, why = mapping
            is_mapped = 1 if new_pos else 0
        record = SiteInfo(gene, residue, position, is_valid, is_mapped,
                          new_res, new_pos, why, occurrences, source)
        sites.append(record)
    # Optionally dump everything to CSV, headed by the upper-cased field
    # names of the SiteInfo namedtuple.
    if outfile:
        header = [[field.upper() for field in record._asdict().keys()]]
        write_unicode_csv(outfile, header + replace_nones(sites))
    return sites
Example #2
0
def map_sequence(stmts_in, **kwargs):
    """Map sequences using the SiteMapper.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to map.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of mapped statements.
    """
    logger.info('Mapping sites on %d statements...' % len(stmts_in))
    sm = SiteMapper(default_site_map)
    valid, mapped = sm.map_sites(stmts_in)
    # Keep only those mapped statements in which every modification site
    # actually received a mapping (mm[1] is the mapped site; None means
    # the mapper could not find a correction for that site).
    correctly_mapped_stmts = [ms.mapped_stmt for ms in mapped
                              if all(mm[1] is not None
                                     for mm in ms.mapped_mods)]
    stmts_out = valid + correctly_mapped_stmts
    logger.info('%d statements with valid sites' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out
Example #3
0
def extract_phos():
    """Extract, ground, site-map, preassemble and filter phosphorylations
    from the pickled RAS Machine model, pickling intermediate results."""
    with open(stmts_fname, 'rb') as fh:
        model = pickle.load(fh)

    # Gather every Phosphorylation statement across all PMIDs.
    stmts = [stmt for pmid_stmts in model.values()
             for stmt in pmid_stmts
             if isinstance(stmt, Phosphorylation)]
    logger.info('%d phosphorylations in RAS Machine' % len(stmts))

    # Require an enzyme on each statement.
    stmts = [s for s in stmts if s.enz is not None]
    logger.info('%d phosphorylations with enzyme in RAS Machine' % len(stmts))

    stmts_grounded = filter_grounded(stmts)
    logger.info('%d grounded phosphorylations in RAS Machine' %
                len(stmts_grounded))

    stmts_enzkinase = filter_enzkinase(stmts_grounded)
    logger.info('%d phosphorylations with kinase enzyme in RAS Machine' %
                len(stmts_enzkinase))

    # Keep only statements whose sites match the reference sequence.
    site_mapper = SiteMapper(default_site_map)
    stmts_valid, _ = site_mapper.map_sites(stmts_enzkinase)
    logger.info('%d valid-sequence phosphorylations in RAS Machine' %
                len(stmts_valid))

    pa = Preassembler(hierarchies, stmts_valid)
    stmts_unique = pa.combine_duplicates()
    logger.info('%d unique phosphorylations in RAS Machine' %
                len(stmts_unique))

    # combine_related works from the preassembler's internal state set up
    # by combine_duplicates above; its result replaces the deduplicated list.
    stmts_unique = pa.combine_related()
    logger.info('%d top-level phosphorylations in RAS Machine' %
                len(stmts_unique))

    with open('mapped_unique_phos.pkl', 'wb') as fh:
        pickle.dump(stmts_unique, fh, protocol=2)

    # Filter RAS Machine statements for direct and not hypothesis
    stmts = filter_direct(stmts_unique)
    logger.info('%d direct phosphorylations in RAS Machine' % len(stmts))
    stmts = filter_non_hypothesis(stmts)
    logger.info('%d non-hypothesis phosphorylations in RAS Machine' %
                len(stmts))

    with open('filtered_phos.pkl', 'wb') as fh:
        pickle.dump(stmts, fh, protocol=2)

    return stmts
Example #4
0
def map_sequence(stmts_in, **kwargs):
    """Map sequences using the SiteMapper.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to map.
    do_methionine_offset : boolean
        Whether to check for off-by-one errors in site position (possibly)
        attributable to site numbering from mature proteins after
        cleavage of the initial methionine. If True, checks the reference
        sequence for a known modification at 1 site position greater
        than the given one; if there exists such a site, creates the
        mapping. Default is True.
    do_orthology_mapping : boolean
        Whether to check sequence positions for known modification sites
        in mouse or rat sequences (based on PhosphoSitePlus data). If a
        mouse/rat site is found that is linked to a site in the human
        reference sequence, a mapping is created. Default is True.
    do_isoform_mapping : boolean
        Whether to check sequence positions for known modifications
        in other human isoforms of the protein (based on PhosphoSitePlus
        data). If a site is found that is linked to a site in the human
        reference sequence, a mapping is created. Default is True.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of mapped statements.
    """
    logger.info('Mapping sites on %d statements...' % len(stmts_in))
    # Forward only the mapping options that map_sites understands.
    kwarg_list = ['do_methionine_offset', 'do_orthology_mapping',
                  'do_isoform_mapping']
    sm = SiteMapper(default_site_map)
    valid, mapped = sm.map_sites(stmts_in, **_filter(kwargs, kwarg_list))
    # Keep only those mapped statements in which every modification site
    # actually received a mapping (mm[1] is the mapped site; None means
    # the mapper could not find a correction for that site).
    correctly_mapped_stmts = [ms.mapped_stmt for ms in mapped
                              if all(mm[1] is not None
                                     for mm in ms.mapped_mods)]
    stmts_out = valid + correctly_mapped_stmts
    logger.info('%d statements with valid sites' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out
Example #5
0
File: __init__.py  Project: jmuhlich/indra
def extract_phos():
    """Distill the RAS Machine model down to filtered phosphorylations.

    Loads the pickled statement model, keeps grounded kinase-catalyzed
    phosphorylations with valid sites, preassembles them, and pickles
    both the unique and the direct/non-hypothesis filtered results.
    """
    with open(stmts_fname, 'rb') as fh:
        model = pickle.load(fh)

    # Pull every Phosphorylation statement out of the per-PMID lists.
    stmts = []
    for pmid_stmts in model.values():
        for stmt in pmid_stmts:
            if isinstance(stmt, Phosphorylation):
                stmts.append(stmt)
    logger.info('%d phosphorylations in RAS Machine' % len(stmts))

    # Drop statements that have no enzyme.
    stmts = [s for s in stmts if s.enz is not None]
    logger.info('%d phosphorylations with enzyme in RAS Machine' % len(stmts))

    stmts_grounded = filter_grounded(stmts)
    logger.info('%d grounded phosphorylations in RAS Machine' % len(stmts_grounded))

    stmts_enzkinase = filter_enzkinase(stmts_grounded)
    logger.info('%d phosphorylations with kinase enzyme in RAS Machine' % len(stmts_enzkinase))

    # Keep only statements whose sites match the reference sequence.
    mapper = SiteMapper(default_site_map)
    stmts_valid, _ = mapper.map_sites(stmts_enzkinase)
    logger.info('%d valid-sequence phosphorylations in RAS Machine' % len(stmts_valid))

    pa = Preassembler(hierarchies, stmts_valid)
    stmts_unique = pa.combine_duplicates()
    logger.info('%d unique phosphorylations in RAS Machine' % len(stmts_unique))

    # combine_related uses the preassembler state built by
    # combine_duplicates; its output replaces the deduplicated list.
    stmts_unique = pa.combine_related()
    logger.info('%d top-level phosphorylations in RAS Machine' % len(stmts_unique))

    with open('mapped_unique_phos.pkl', 'wb') as fh:
        pickle.dump(stmts_unique, fh, protocol=2)

    # Filter RAS Machine statements for direct and not hypothesis
    stmts = filter_direct(stmts_unique)
    logger.info('%d direct phosphorylations in RAS Machine' % len(stmts))
    stmts = filter_non_hypothesis(stmts)
    logger.info('%d non-hypothesis phosphorylations in RAS Machine' % len(stmts))

    with open('filtered_phos.pkl', 'wb') as fh:
        pickle.dump(stmts, fh, protocol=2)

    return stmts
Example #6
0
    gene_counts = make_bar_plot(sites)

    """
    # This script does two things:
    # 1) Plots stats on invalid sites from databases
    #    - showing their frequency
    #       - per site
    #       - per reaction
    # 2) Showing the fraction of the invalid sites in DBs that are mapped
    #    - per site
    #    - per reaction
    # 3) Showing accuracy:
    #    - that the mapped sites are likely legit
    #    - and that the unmapped sites are likely errors

    sm = SiteMapper(default_site_map)
    with open('smcache.pkl', 'rb') as f:
        (sm._cache, sm._sitecount) = pickle.load(f)

    # Load the agent files
    agent_files = [
        'pc_pid_modified_agents.pkl', 'pc_psp_modified_agents.pkl',
        'pc_reactome_modified_agents.pkl'
    ]
    # For each set of mods
    all_sites = []
    for agent_file in agent_files:
        db_name = agent_file.split('_')[1]
        sites = map_agents(agent_file, sm, db_name)
        all_sites += sites
        print("Stats for %s -------------" % db_name)
Example #7
0
    def preassemble(self, filters=None):
        """Preassemble the Statements collected in the model.

        Use INDRA's GroundingMapper, Preassembler and BeliefEngine
        on the IncrementalModel and save the unique statements and
        the top level statements in class attributes.

        Currently the following filter options are implemented:
        - grounding: require that all Agents in statements are grounded
        - model_one: require that at least one Agent is in the incremental model
        - model_all: require that all Agents are in the incremental model
        - prior_one: require that at least one Agent is in the prior model
        - prior_all: require that all Agents are in the prior model
        Note that model_one -> prior_all are increasingly more restrictive
        options.

        Parameters
        ----------
        filters : Optional[list[str]]
            A list of filter options to apply when choosing the statements.
            See description above for more details. Default: None
        """
        stmts = self.get_statements()
        logger.info("%d raw Statements in total" % len(stmts))

        # Fix grounding
        logger.info("Running grounding map")
        # Derive a protein map from the agent texts seen in these statements
        # and merge it into the default grounding map before mapping.
        # NOTE(review): this mutates gm.default_grounding_map module-wide,
        # so repeated calls accumulate entries — confirm this is intended.
        twg = gm.agent_texts_with_grounding(stmts)
        prot_map = gm.protein_map_from_twg(twg)
        gm.default_grounding_map.update(prot_map)
        gmap = gm.GroundingMapper(gm.default_grounding_map)
        stmts = gmap.map_agents(stmts, do_rename=True)

        logger.info("%d Statements after grounding map" % len(stmts))

        # Fix sites
        # Only statements with valid sequence sites are kept; the mapped
        # (corrected) statements returned in the second element are dropped.
        sm = SiteMapper(default_site_map)
        stmts, _ = sm.map_sites(stmts)

        logger.info("%d Statements with valid sequence" % len(stmts))

        # Apply each requested relevance filter in turn; every pass narrows
        # the statement list further.
        if filters:
            if "grounding" in filters:
                # Filter out ungrounded statements
                logger.info("Running grounding filter")
                stmts = self._relevance_filter(stmts, ["grounding"])
                logger.info("%s Statements after filter" % len(stmts))
            if "human_only" in filters:
                # Filter out non-human proteins
                logger.info("Running non-human protein filter")
                stmts = self._relevance_filter(stmts, ["human_only"])
                logger.info("%s Statements after filter" % len(stmts))
            for rel_key in ("prior_one", "model_one", "prior_all", "model_all"):
                if rel_key in filters:
                    logger.info("Running %s relevance filter" % rel_key)
                    stmts = self._relevance_filter(stmts, [rel_key])
                    logger.info("%s Statements after filter" % len(stmts))

        # Combine duplicates
        logger.info("Preassembling %d Statements" % len(stmts))
        pa = Preassembler(hierarchies, stmts)
        self.unique_stmts = pa.combine_duplicates()
        logger.info("%d unique Statements" % len(self.unique_stmts))

        # Run BeliefEngine on unique statements
        # Prior probabilities are set before building the hierarchy so that
        # set_hierarchy_probs below can refine them.
        be = BeliefEngine()
        be.set_prior_probs(self.unique_stmts)

        # Build statement hierarchy
        # return_toplevel=False yields ALL unique statements; top-level ones
        # are those that support no other statement.
        self.unique_stmts = pa.combine_related(return_toplevel=False)
        self.toplevel_stmts = [st for st in self.unique_stmts if not st.supports]
        logger.info("%d top-level Statements" % len(self.toplevel_stmts))
        # Run BeliefEngine on hierarchy
        be.set_hierarchy_probs(self.unique_stmts)