Пример #1
0
def test_is_category():
    """Check the gene category predicates exposed by ``hgnc_client``.

    For each predicate, one gene symbol is asserted to belong to the
    category and a second gene symbol is asserted not to belong to it.
    """
    # (predicate, symbol expected to match, symbol expected not to match)
    cases = [
        (hgnc_client.is_kinase, 'MAPK1', 'EGF'),
        (hgnc_client.is_phosphatase, 'PTEN', 'KRAS'),
        (hgnc_client.is_transcription_factor, 'FOXO3', 'AKT1'),
    ]
    for predicate, member, non_member in cases:
        assert predicate(member)
        assert not predicate(non_member)
Пример #2
0
 def update(self, x, is_last=False):
     """Record whether ``x`` matches this object's ``node_id``.

     ``self.truth`` is set to the outcome of the first applicable rule:

     1. ``x`` equals ``node_id`` exactly;
     2. ``node_id`` ends with ``':'`` and ``x`` starts with ``node_id``;
     3. ``node_id`` starts with ``'CATEGORY'``: the part of ``node_id``
        after the first ``':'`` selects a gene category (``kinase``,
        ``phosphatase`` or ``tf``) and the part of ``x`` after its first
        ``':'`` is resolved to a gene name via ``get_hgnc_name`` and
        checked against that category; unknown categories give False;
     4. otherwise False.

     ``self.is_last`` is set to ``is_last`` afterwards.
     """
     node_id = self.node_id
     if x == node_id:
         # Exact match
         matched = True
     elif node_id.endswith(':') and x.startswith(node_id):
         # Prefix match
         matched = True
     elif node_id.startswith('CATEGORY'):
         # High-level term: decide by gene category membership
         from indra.databases.hgnc_client import is_kinase, is_phosphatase, \
             is_transcription_factor, get_hgnc_name
         category_checks = {
             'kinase': is_kinase,
             'phosphatase': is_phosphatase,
             'tf': is_transcription_factor,
         }
         cat = node_id.split(':')[1]
         # The gene name is resolved before the category is inspected, so
         # a malformed ``x`` fails even for an unknown category
         gene_name = get_hgnc_name(x.split(':')[1])
         check = category_checks.get(cat)
         matched = check(gene_name) if check is not None else False
     else:
         matched = False
     self.truth = matched
     self.is_last = is_last
Пример #3
0
def filter_kinase_annots(annot_sites, include_fplx=True):
    """Return the annotations whose controller is recognized as a kinase.

    Parameters
    ----------
    annot_sites : dict
        Annotations keyed by 5-tuples whose first two elements are the
        controller ID and the controller namespace.
    include_fplx : bool
        If True (the default), an entry with an ``FPLX`` controller is kept
        when at least one child returned by the expander is a kinase. If
        False, ``FPLX`` entries are dropped.

    Returns
    -------
    dict
        The entries of ``annot_sites`` that passed the kinase filter, with
        keys and values unchanged.
    """
    kept = {}
    for annot_key, annot_val in annot_sites.items():
        ctrl_id, ctrl_ns, _, _, _ = annot_key
        if ctrl_ns == 'HGNC':
            # HGNC controllers that aren't known kinases are dropped here
            ctrl_is_kinase = hgnc_client.is_kinase(ctrl_id)
        elif ctrl_ns == 'FPLX' and include_fplx:
            family = Agent(ctrl_id, db_refs={'FPLX': ctrl_id})
            # One kinase child is enough; any() stops at the first hit
            ctrl_is_kinase = any(
                hgnc_client.is_kinase(hgnc_client.get_hgnc_name(hgnc_id))
                for _, hgnc_id in expander.get_children(family))
        else:
            # Remaining entries typically have UP IDs that correspond to
            # non-human proteins or aren't proteins at all
            ctrl_is_kinase = False
        if ctrl_is_kinase:
            kept[annot_key] = annot_val
    return kept
Пример #4
0
def get_all_enzymes():
    """Return gene names of human enzymes that are neither kinases nor
    phosphatases.

    The EC code OBO file is read from ``.obo/ec-code/ec-code.obo`` under the
    user's home directory. Nodes whose identifier starts with ``uniprot``
    are collected, restricted to human UniProt entries, mapped to gene
    names, and then filtered to remove kinases and phosphatases.

    Returns
    -------
    set
        The remaining gene names, as returned by
        ``uniprot_client.get_gene_name``.
    """
    obo_path = os.path.join(str(Path.home()), '.obo/ec-code/ec-code.obo')
    if not os.path.exists(obo_path):
        # The mapping itself is discarded; the call is made only for its
        # side effect (presumably pyobo writes the OBO file to this path
        # -- TODO confirm)
        pyobo.get_id_name_mapping('ec-code')
    graph = obonet.read_obo(obo_path)

    # Drop the first 8 characters of each node identifier (presumably the
    # 'uniprot:' prefix) to obtain the bare UniProt ID
    uniprot_ids = {
        node[8:] for node in graph.nodes if node.startswith('uniprot')
    }
    gene_names = {
        uniprot_client.get_gene_name(up_id)
        for up_id in uniprot_ids
        if uniprot_client.is_human(up_id)
    }
    enzymes = {
        gene for gene in gene_names
        if not hgnc_client.is_kinase(gene)
        and not hgnc_client.is_phosphatase(gene)
    }
    logger.info(f'Filtered {len(enzymes)} enzymes in total')
    return enzymes
Пример #5
0
def create_export(site_stmts, mapping_results, export_file, evs_file):
    """Write site annotations and their supporting evidence to two CSV files.

    Parameters
    ----------
    site_stmts : dict
        Maps ``(up_id, residue, position)`` tuples to a dict whose ``'rhs'``
        entry maps a source name to a list of Statements in which the site
        appears as a substrate, e.g.
        ``site_stmts[('Q15438', 'T', '395')]['rhs']['signor']`` ->
        ``[Phosphorylation(PRKCD(), CYTH1(), T, 395)]``.
    mapping_results : dict
        Maps the same ``(up_id, residue, position)`` tuples to a mapped-site
        object. The attributes read here are ``valid``, ``up_id``,
        ``orig_res``, ``orig_pos``, ``mapped_id``, ``mapped_res``,
        ``mapped_pos`` and ``description``.
    export_file : str
        Path of the CSV file that receives one row per unique
        controller/target-site annotation.
    evs_file : str
        Path of the CSV file that receives one row per piece of evidence.
        Rows are linked to the annotation file through the ``ID`` column.
    """
    from indra.statements import Agent
    from indra.tools.expand_families import Expander
    from indra.databases import uniprot_client, hgnc_client

    expander = Expander()

    # Make header for main export file
    export_header = [
        'ID', 'CTRL_NS', 'CTRL_ID', 'CTRL_GENE_NAME', 'CTRL_IS_KINASE',
        'TARGET_UP_ID', 'TARGET_GENE_NAME', 'TARGET_RES', 'TARGET_POS',
        'SOURCES'
    ]
    # Make header for evidence export file
    evidence_header = [
        'ID', 'SOURCE', 'PMID', 'DBID', 'TEXT', 'DESCRIPTION', 'ORIG_UP_ID',
        'ORIG_RES', 'ORIG_POS', 'MAPPED_UP_ID', 'MAPPED_RES', 'MAPPED_POS'
    ]

    # Annotation key -> sequential row ID, in order of first appearance
    site_info = {}
    # Annotation key -> list of [evidence, source, mapped site]
    site_evidence = defaultdict(list)
    idx = 0
    for (orig_up_id, orig_res, orig_pos), stmt_dict in site_stmts.items():
        # We skip sites that are missing residue or position
        if not orig_res or not orig_pos:
            continue
        # Next, we construct keys for the *final* site (either valid to begin
        # with or mapped to be valid), and if there is no valid final
        # site, we skip the site
        ms = mapping_results[(orig_up_id, orig_res, orig_pos)]
        if ms.valid:
            final_site = [ms.up_id, ms.orig_res, ms.orig_pos]
        elif ms.mapped_res and ms.mapped_pos:
            final_site = [ms.mapped_id, ms.mapped_res, ms.mapped_pos]
        else:
            continue
        # Skip non-human substrates
        if not uniprot_client.is_human(final_site[0]):
            continue
        target_gene_name = uniprot_client.get_gene_name(final_site[0])
        final_site = [
            final_site[0], target_gene_name, final_site[1], final_site[2]
        ]

        # We now look at all the Statements where the given site
        # appears as a substrate and get controllers and evidences
        for source, stmts in stmt_dict['rhs'].items():
            for stmt in stmts:
                # If there is no controller, we skip the entry
                if stmt.enz is None:
                    continue
                # We next get the grounding for the controller and
                # if there is no grounding, we skip it
                ctrl_ns, ctrl_id = stmt.enz.get_grounding()
                if ctrl_ns not in ['UP', 'HGNC', 'FPLX'] or ctrl_id is None:
                    continue

                ctrl_gene_name = None
                ctrl_is_kinase = False
                if ctrl_ns == 'UP':
                    # Skip non-human protein controllers
                    if not uniprot_client.is_human(ctrl_id):
                        continue
                    # Get human gene name for UniProt entries
                    ctrl_gene_name = uniprot_client.get_gene_name(ctrl_id)
                    if hgnc_client.is_kinase(ctrl_gene_name):
                        ctrl_is_kinase = True
                elif ctrl_ns == 'HGNC':
                    gene_name = hgnc_client.get_hgnc_name(ctrl_id)
                    if hgnc_client.is_kinase(gene_name):
                        ctrl_is_kinase = True
                    # Re-key the controller by UniProt ID when one exists;
                    # otherwise it stays in the HGNC namespace and the
                    # gene name column is left empty
                    up_id = hgnc_client.get_uniprot_id(ctrl_id)
                    if up_id:
                        ctrl_ns = 'UP'
                        ctrl_gene_name = gene_name
                        ctrl_id = up_id
                elif ctrl_ns == 'FPLX':
                    # A family/complex counts as a kinase if any of its
                    # children is a kinase
                    children = expander.get_children(
                        Agent(ctrl_id, db_refs={'FPLX': ctrl_id}))
                    ctrl_is_kinase = any(
                        hgnc_client.is_kinase(
                            hgnc_client.get_hgnc_name(hgnc_id))
                        for _, hgnc_id in children)

                # We can now make a full key that contains the controller
                # as well as the target and final site
                final_annot_key = tuple(
                    [ctrl_ns, ctrl_id, ctrl_gene_name, ctrl_is_kinase] +
                    final_site)
                # We use this full key to store evidences and mapping details
                if final_annot_key not in site_info:
                    site_info[final_annot_key] = idx
                    idx += 1
                # Note: we do get multiple pieces of evidence, e.g.,
                # from biopax
                for ev in stmt.evidence:
                    site_evidence[final_annot_key].append([ev, source, ms])

    # Now make the actual export tables
    def sanitize_ev_text(txt):
        """Return evidence text on a single line ('' if there is none)."""
        if txt is None:
            return ''
        return txt.replace('\n', ' ')

    export_rows = [export_header]
    evidence_rows = [evidence_header]
    for key, row_id in site_info.items():
        (ctrl_ns, ctrl_id, ctrl_gene_name, ctrl_is_kinase, target_up_id,
         target_gene_name, target_res, target_pos) = key
        export_row = [
            str(row_id), ctrl_ns, ctrl_id, ctrl_gene_name, ctrl_is_kinase,
            target_up_id, target_gene_name, target_res, target_pos
        ]
        # Now get evidences
        evs = site_evidence[key]
        sources = sorted({src for _, src, _ in evs})
        export_row.append(','.join(sources))
        export_rows.append(export_row)
        for evidence, source, mapped_site in evs:
            # Only the first 16 characters of BEL source IDs are exported
            if source == 'bel':
                source_id = evidence.source_id[:16]
            else:
                source_id = evidence.source_id
            evidence_rows.append([
                str(row_id), source, evidence.pmid, source_id,
                sanitize_ev_text(evidence.text), mapped_site.description,
                mapped_site.up_id, mapped_site.orig_res,
                mapped_site.orig_pos, mapped_site.mapped_id,
                mapped_site.mapped_res, mapped_site.mapped_pos
            ])

    # newline='' is required by the csv module: the writer emits its own
    # line terminators, and newline translation by the file object would
    # otherwise corrupt them on platforms where os.linesep is not '\n'
    with open(export_file, 'wt', newline='') as fh:
        csv.writer(fh).writerows(export_rows)
    with open(evs_file, 'wt', newline='') as fh:
        csv.writer(fh).writerows(evidence_rows)