示例#1
0
def test_mirna_standardize():
    """Check miRNA standardization from an HGNC ID and from a miRBase ID."""
    # Starting from an HGNC ID, the miRBase ID and the name are filled in
    std_name, refs = standardize_name_db_refs({'HGNC': '31476'})
    assert refs['HGNC'] == '31476'
    assert refs['MIRBASE'] == 'MI0000060'
    assert std_name == 'MIRLET7A1'

    # Starting from a miRBase ID alone
    mirbase_only = {'MIRBASE': 'MI0001730'}
    std_name, refs = standardize_name_db_refs(mirbase_only)
    assert refs['MIRBASE'] == 'MI0001730'
    assert std_name == 'mmu-mir-451a'
示例#2
0
    def _get_agents_from_singular_entity(self, bpe: bp.PhysicalEntity):
        """Return the Agents represented by a single PhysicalEntity.

        This is for extracting one or more Agents from a PhysicalEntity
        which doesn't have member_physical_entities.

        Parameters
        ----------
        bpe : bp.PhysicalEntity
            The physical entity to extract Agents from.

        Returns
        -------
        list of Agent
            One Agent per gene/protein the entity stands for. Agents for
            which neither a standard name nor a display name is available
            are left out, so the list may be empty.
        """
        # If Agents are already stored for this entity's uid, return a deep
        # copy of them so that callers cannot modify the stored ones
        try:
            return copy.deepcopy(self._agents[bpe.uid])
        except KeyError:
            pass

        mcs = BiopaxProcessor._get_entity_mods(bpe) if _is_protein(bpe) else []
        display_name = bpe.display_name

        # We first get processed xrefs
        xrefs = BiopaxProcessor._get_processed_xrefs(bpe)

        # We now need to harmonize UP and HGNC
        nhgnc_ids = len(xrefs.get('HGNC', {}))
        nup_ids = len(xrefs.get('UP', {}))
        # One protein coded by many genes: one Agent per HGNC ID
        if nhgnc_ids > 1 and nup_ids == 1:
            raw_refs_list = [{'HGNC': hgnc_id} for hgnc_id in xrefs['HGNC']]
        # Either one gene coding for many proteins, or this is secretly a
        # family, i.e., we have equally many (more than one) gene and
        # protein IDs. In both cases we go by the UP IDs and standardize
        # from there.
        elif (nhgnc_ids == 1 and nup_ids > 1) or \
                (nhgnc_ids > 1 and nhgnc_ids == nup_ids):
            raw_refs_list = [{'UP': up_id} for up_id in xrefs['UP']]
        # Otherwise it's just a regular Agent
        else:
            raw_refs_list = [clean_up_xrefs(xrefs)]

        agents = []
        for raw_refs in raw_refs_list:
            standard_name, db_refs = standardize_name_db_refs(raw_refs)
            # The fallback is always the entity's display name. It is chosen
            # per Agent so that the standard name of one Agent is never
            # carried over to another Agent that has no standard name.
            name = standard_name or display_name
            # The name can be None if both the display name and the
            # standard name are missing. We filter these out.
            if name is not None:
                agents.append(Agent(name, db_refs=db_refs, mods=mcs))
        return agents
示例#3
0
def test_drugbank_mappings():
    """Check DrugBank-based mappings and name space prioritization."""
    drugbank_refs = {'DRUGBANK': 'DB00001'}
    std_name, refs = standardize_name_db_refs(drugbank_refs)
    assert refs.get('CHEBI') == 'CHEBI:142437', refs
    assert refs.get('CHEMBL') == 'CHEMBL1201666', refs
    assert std_name == 'lepirudin'

    # Here we test for alternative prioritization of name spaces, in which
    # case we expect to get the Drugbank standard name
    priority = ['DRUGBANK', 'CHEBI']
    std_name, refs = standardize_name_db_refs({'DRUGBANK': 'DB00001'},
                                              ns_order=priority)
    assert std_name == 'Lepirudin'
示例#4
0
    def _get_target_agent(target_element):
        """Return an Agent built from a DrugBank target element.

        The Agent's name is the text of the element's ``db:name`` tag unless
        standardization of the collected references yields a name, in which
        case the standard name is used.
        """
        name = db_find(target_element, 'db:name').text

        # The DrugBank target ID is always part of the references
        db_refs = {
            'DRUGBANKV4.TARGET': db_find(target_element, 'db:id').text,
        }

        # Collect HGNC and UniProt references from the external identifiers
        xref_path = ('db:polypeptide/'
                     'db:external-identifiers/'
                     'db:external-identifier')
        for xref_tag in db_findall(target_element, xref_path):
            resource = db_find(xref_tag, 'db:resource').text
            identifier = db_find(xref_tag, 'db:identifier').text
            if resource == 'HUGO Gene Nomenclature Committee (HGNC)':
                # Drop the first 5 characters of the identifier
                # (presumably an 'HGNC:' prefix -- verify against the data)
                db_refs['HGNC'] = identifier[5:]
            elif resource == 'UniProtKB':
                db_refs['UP'] = identifier

        standard_name, db_refs = standardize_name_db_refs(db_refs)
        return Agent(standard_name if standard_name else name,
                     db_refs=db_refs)
示例#5
0
def get_gene_agent(name, gene_entrez_id):
    """Return an Agent for a gene given its name and Entrez ID.

    The Entrez ID is always used as grounding; an HGNC ID is added if
    hgnc_client.get_hgnc_id returns one for the name. The given name is
    replaced by the standard name whenever standardization provides one.
    """
    raw_refs = {'EGID': gene_entrez_id}
    hgnc_id = hgnc_client.get_hgnc_id(name)
    if hgnc_id:
        raw_refs['HGNC'] = hgnc_id
    std_name, std_refs = standardize_name_db_refs(raw_refs)
    agent_name = std_name if std_name else name
    return Agent(agent_name, db_refs=std_refs)
示例#6
0
 def _extract_protein(self, name, gene_id):
     """Return an Agent for a protein given its name and Entrez gene ID.

     The Entrez ID is always used as grounding; an HGNC ID is added if
     hgnc_client.get_hgnc_from_entrez returns one. The given name is only
     used if standardization does not provide a name.
     """
     raw_refs = {'EGID': gene_id}
     hgnc_id = hgnc_client.get_hgnc_from_entrez(gene_id)
     if hgnc_id is not None:
         raw_refs['HGNC'] = hgnc_id
     std_name, std_refs = standardize_name_db_refs(raw_refs)
     agent_name = std_name if std_name else name
     return Agent(agent_name, db_refs=std_refs)
示例#7
0
def get_disease_agent(name, disease_id):
    """Return an Agent for a disease given its name and grounding string.

    Parameters
    ----------
    name : str
        The name to use if standardization does not provide one.
    disease_id : str
        One or more groundings of the form ``NAMESPACE:ID`` separated by
        ``|`` characters.

    Returns
    -------
    Agent
        An Agent with standardized name (where available) and db_refs.

    Raises
    ------
    ValueError
        If one of the groundings does not contain a ``:`` separator.
    """
    db_refs = {}
    for grounding in disease_id.split('|'):
        # Split on the first colon only so that identifiers which
        # themselves contain a colon are kept intact instead of failing
        # to unpack
        db_ns, db_id = grounding.split(':', 1)
        db_refs[db_ns] = db_id
    standard_name, db_refs = standardize_name_db_refs(db_refs)
    if standard_name:
        name = standard_name
    return Agent(name, db_refs=db_refs)
示例#8
0
def get_db_refs_by_ident(ns, ident, node_data):
    """Return standard name and grounding based on a namespace and an ID.

    Parameters
    ----------
    ns : str
        A name space in which the given identifier is interpreted.
    ident : str
        The identifier in the given name space to get grounding for.
    node_data : pybel.dsl.BaseAbundance
        Node data, used for its name and for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity, or the name of the
        node if no standardized name is available.
    db_refs : dict or None
        The grounding for the given entity, or None if the name space
        is not handled.

    """
    # Name spaces for which a grounding is produced. Each entry needs its
    # own trailing comma: adjacent string literals are silently
    # concatenated, which previously turned 'ECCODE' 'SDIS' into the single
    # entry 'ECCODESDIS' and left SDIS unhandled.
    ns_list = [
        'HGNC', 'UNIPROT', 'UP', 'FPLX', 'GO', 'GOBP', 'GOCC', 'MESHPP',
        'MESHD', 'MESH', 'MGI', 'RGD', 'SFAM', 'EGID', 'ENTREZ', 'NCBIGENE',
        'MIRBASE', 'CHEBI', 'ECCODE', 'SDIS', 'SCHEM', 'TEXT', 'DOID',
        'EFO', 'HP', 'PFAM', 'HGNC.GENEFAMILY', 'HGNC_GROUP', 'NCBITAXON',
        'PUBCHEM',
    ]
    # Name spaces that are renamed before standardization.
    # NOTE(review): 'CHEBIID' is mapped here but is not in ns_list, so this
    # entry is currently never used -- confirm whether 'CHEBIID' should be
    # handled.
    ns_mappings = {
        'UNIPROT': 'UP',
        'GOBP': 'GO',
        'GOCC': 'GO',
        'MESHPP': 'MESH',
        'MESHD': 'MESH',
        'ENTREZ': 'EGID',
        'NCBIGENE': 'EGID',
        'NCBITAXON': 'TAXONOMY',
        'HGNC.GENEFAMILY': 'HGNC_GROUP',
        'CHEBIID': 'CHEBI',
    }
    raw_name = node_data.name
    if ns in ns_list:
        mapped_ns = ns_mappings.get(ns, ns)
        raw_db_refs = {mapped_ns: ident}
        std_name, std_db_refs = standardize_name_db_refs(raw_db_refs)
        # Fall back to the raw values where standardization gave nothing
        if std_name is None:
            std_name = raw_name
        if std_db_refs is None:
            std_db_refs = raw_db_refs
    else:
        logger.info("Unhandled namespace %s with name %s and "
                    "identifier %s (%s).", ns, raw_name, ident, node_data)
        std_name = raw_name
        std_db_refs = None
    return std_name, std_db_refs
示例#9
0
 def _get_agent(self, ent_name, ent_type, id, database):
     """Return a single Agent corresponding to the given entity ID.

     If the ID is a SIGNOR complex (i.e., it is in self.complex_map), the
     Agent for the first constituent is returned with the remaining
     constituents attached to it as BoundConditions. Otherwise an Agent
     is created from the grounding determined by the entity type and
     database, with its name standardized where possible.

     Parameters
     ----------
     ent_name : str
         The entity name, used unless standardization yields a name.
     ent_type : str
         The entity type, used together with the database to look up the
         grounding type in _type_db_map.
     id : str
         The entity's identifier in the given database.
     database : str
         The name of the database the identifier comes from.

     Returns
     -------
     Agent
         The Agent representing the entity.

     Raises
     ------
     ValueError
         If a grounding type is found for a database that is not one of
         the expected ones.
     """
     if database == 'SIGNOR' and id in self.complex_map:
         agents = self._get_complex_agents(id)
         # Return the first agent with the remaining agents as a bound
         # condition
         agent = agents[0]
         agent.bound_conditions = \
             [BoundCondition(a, True) for a in agents[1:]]
         return agent

     name = ent_name
     # NOTE(review): a (ent_type, database) pair missing from _type_db_map
     # is not handled here; with a plain dict this raises KeyError
     gnd_type = _type_db_map[(ent_type, database)]
     if gnd_type == 'UP':
         db_refs = process_uniprot_entry(id)
     # Map SIGNOR protein families to FamPlex families
     elif ent_type == 'proteinfamily':
         db_refs = {
             database: id
         }  # Keep the SIGNOR family ID in db_refs
         key = (database, id)
         # Use SIGNOR name unless we have a mapping in FamPlex
         famplex_id = famplex_map.get(key)
         if famplex_id is None:
             logger.info('Could not find %s in FamPlex map' % str(key))
         else:
             db_refs['FPLX'] = famplex_id
     # Other possible groundings are PUBCHEM, SIGNOR, etc.
     elif gnd_type is not None:
         if database not in ('PUBCHEM', 'SIGNOR', 'ChEBI', 'miRBase',
                             'DRUGBANK'):
             raise ValueError('Unexpected database %s' % database)
         if database == 'PUBCHEM' and id.startswith('CID:'):
             # We take off the CID: prefix plus fix an issue with
             # SIGNOR's format in which it leaves extra spaces around
             # the ID, as in 'CID: 923'
             id = id[4:].strip()
         elif database == 'ChEBI' and id.startswith('SID:'):
             # An SID-prefixed ID listed under ChEBI is grounded as a
             # PubChem substance instead
             gnd_type = 'PUBCHEM.SUBSTANCE'
             id = id[4:].strip()
         db_refs = {gnd_type: id}
     # If no grounding, include as an untyped/ungrounded node
     else:
         db_refs = {}
     standard_name, db_refs = standardize_name_db_refs(db_refs)
     if standard_name:
         name = standard_name
     return Agent(name, db_refs=db_refs)
示例#10
0
 def get_ref_context(lst):
     """Return a RefContext for the first entry of a list of groundings.

     Only the first entry, of the form ``namespace:id``, is used. None is
     returned if the list is empty or if the name space is UAZ.
     """
     if not lst:
         return None
     namespace, identifier = lst[0].split(':', 1)
     namespace = namespace.upper()
     # These aren't real groundings
     if namespace == 'UAZ':
         return None
     # Here we are dealing with UniProt subcellular components
     # so we use a different namespace for those
     if namespace == 'UNIPROT':
         namespace = 'UPLOC'
     raw_refs = {namespace: identifier}
     standard_name, db_refs = standardize_name_db_refs(raw_refs)
     return RefContext(standard_name, db_refs=db_refs)
示例#11
0
    def _make_agent(self, symbol, entrez_id, swissprot_id, trembl_id):
        """Make an Agent object, appropriately grounded.

        Parameters
        ----------
        symbol : str
            A plain text symbol, or None if not listed.
        entrez_id : str
            Entrez id number
        swissprot_id : str
            Swissprot (reviewed UniProt) ID.
        trembl_id : str
            Trembl (unreviewed UniProt) ID.

        Returns
        -------
        agent : indra.statements.Agent
            A grounded agent object, or None if neither a standard name
            nor a symbol is available.
        """
        raw_refs = {}
        # The Swissprot ID takes precedence; the Trembl ID is only
        # considered when no Swissprot ID is given at all. IDs containing
        # a '|' character are not used as grounding.
        up_id = swissprot_id if swissprot_id else trembl_id
        if up_id and '|' not in up_id:
            raw_refs['UP'] = up_id
        if entrez_id:
            raw_refs['EGID'] = entrez_id

        standard_name, db_refs = standardize_name_db_refs(raw_refs)
        name = standard_name if standard_name else symbol

        # At the time of writing this, the name was never None but
        # just in case
        if name is None:
            return None

        return Agent(name, db_refs=db_refs)
示例#12
0
 def _extract_drugs(self, compound_ids, lspci_id):
     """Return a list of unique drug Agents for the given compound IDs.

     Parameters
     ----------
     compound_ids : str
         Compound IDs separated by ``|`` characters.
     lspci_id : str
         The LSPCI ID added to the db_refs of each drug.

     Returns
     -------
     list of Agent
         One Agent per distinct matches key; drugs are skipped according
         to the standardized_only and named_only attributes.
     """
     agents = []
     for compound_id in compound_ids.split('|'):
         raw_refs = {'LSPCI': lspci_id}
         if compound_id.startswith('CHEMBL'):
             raw_refs['CHEMBL'] = compound_id
         elif compound_id.startswith('HMSL'):
             raw_refs['HMS-LINCS'] = compound_id.split('HMSL')[1]
         else:
             logger.warning('Unhandled ID type: %s' % compound_id)
         # Name standardization finds correct names but because
         # ChEMBL is incomplete as a local resource, we don't
         # universally standardize its names, instead, we look
         # it up explicitly when necessary.
         name, db_refs = standardize_name_db_refs(raw_refs)
         if name is None:
             # This is one way to detect that the drug could not be
             # standardized beyond just its name so in the
             # standardized_only condition, we skip this drug
             if self.standardized_only:
                 continue
             if 'HMS-LINCS' in db_refs:
                 hmsl_id = db_refs['HMS-LINCS']
                 name = lincs_client_obj.get_small_molecule_name(hmsl_id)
             elif 'CHEMBL' in db_refs:
                 name = chembl_client.get_chembl_name(db_refs['CHEMBL'])
         if name is None:
             # With the named_only restriction, we skip drugs without
             # a proper name.
             if self.named_only:
                 continue
             # Otherwise we just use the ID as the name
             name = compound_id
         assert_valid_db_refs(db_refs)
         agents.append(Agent(name, db_refs=db_refs))
     # Keep a single Agent per matches key; of Agents sharing a key, the
     # one added last is kept
     agents_by_key = {}
     for agent in agents:
         agents_by_key[agent.matches_key()] = agent
     return list(agents_by_key.values())
示例#13
0
    def _get_drug_agent(drug_element):
        """Return an Agent built from a DrugBank drug element.

        The db_refs contain the DrugBank ID plus the CAS, ChEMBL, PubChem
        and ChEBI references where the element provides them. The name is
        the text of the ``db:name`` tag unless standardization yields a
        name.
        """
        name = db_find(drug_element, 'db:name').text

        # Sometimes there's more than one DrugBank ID, in which case we
        # choose the "smaller" one, i.e., the first one in sorted order
        candidate_ids = [
            id_tag.text
            for id_tag in db_findall(drug_element, 'db:drugbank-id')
            if id_tag.text.startswith('DB')
        ]
        candidate_ids.sort()
        db_refs = {'DRUGBANK': candidate_ids[0]}

        # The CAS ID is only added if the tag exists and has text
        cas_tag = db_find(drug_element, 'db:cas-number')
        if not (cas_tag is None or cas_tag.text is None):
            db_refs['CAS'] = cas_tag.text

        # Extract other xrefs
        xref_path = ('db:external-identifiers/'
                     'db:external-identifier')
        for xref_tag in db_findall(drug_element, xref_path):
            resource = db_find(xref_tag, 'db:resource').text
            identifier = db_find(xref_tag, 'db:identifier').text
            if resource == 'ChEMBL':
                db_refs['CHEMBL'] = ensure_chembl_prefix(identifier)
            elif resource == 'PubChem Compound':
                db_refs['PUBCHEM'] = identifier
            elif resource == 'ChEBI':
                db_refs['CHEBI'] = ensure_chebi_prefix(identifier)

        # The references are validated both before and after
        # standardization
        assert_valid_db_refs(db_refs)
        standard_name, db_refs = standardize_name_db_refs(db_refs)
        assert_valid_db_refs(db_refs)
        return Agent(standard_name if standard_name else name,
                     db_refs=db_refs)
def map_readout(stmts):
    """Re-ground the object of each statement to MESH D014779 in place.

    The object's TEXT reference is kept, and the object's name and db_refs
    are replaced by the result of standardizing the TEXT reference together
    with the MESH ID. The same list of statements is returned.
    """
    for stmt in stmts:
        raw_refs = {'TEXT': stmt.obj.db_refs['TEXT'], 'MESH': 'D014779'}
        std_name, std_refs = standardize_name_db_refs(raw_refs)
        stmt.obj.name = std_name
        stmt.obj.db_refs = std_refs
    return stmts
示例#15
0
def get_ref_context(db_ns, db_id):
    """Return a RefContext for an identifier in the given name space.

    Surrounding whitespace is stripped from the identifier and, for the
    BTO name space, ensure_prefix is applied to it before standardization.
    """
    # Name spaces whose identifiers are passed through ensure_prefix
    prefixed_namespaces = {'BTO'}
    clean_id = db_id.strip()
    if db_ns in prefixed_namespaces:
        clean_id = ensure_prefix(db_ns, clean_id)
    standard_name, db_refs = standardize_name_db_refs({db_ns: clean_id})
    return RefContext(standard_name, db_refs)
示例#16
0
def test_nonhuman_entrez():
    """Check that a non-human Entrez ID yields a name and a UniProt ID."""
    entrez_refs = {'EGID': '109880'}
    std_name, refs = standardize_name_db_refs(entrez_refs)
    assert std_name == 'Braf', std_name
    assert refs['UP'] == 'P28028', refs
示例#17
0
def test_drugbank_mappings():
    """Check the ChEBI and ChEMBL mappings and the name for a DrugBank ID."""
    drugbank_refs = {'DRUGBANK': 'DB00001'}
    std_name, refs = standardize_name_db_refs(drugbank_refs)
    assert refs.get('CHEBI') == 'CHEBI:142437', refs
    assert refs.get('CHEMBL') == 'CHEMBL1201666', refs
    assert std_name == 'lepirudin'