Пример #1
0
    def get_agent(self, acsn_agent: str) -> Union[Agent, None]:
        """Build an INDRA Agent for the given ACSN agent name.

        Parameters
        ----------
        acsn_agent :
            Agent name as it appears in the relations statement data frame.

        Returns
        -------
        :
            An Agent grounded to HGNC when the ACSN agent maps to exactly
            one entry with an HGNC ID, or to FamPlex when it maps to several
            entries whose sorted tuple is a key of the FamPlex lookup.
            None if no grounding could be found.
        """
        members = self.correspondence_dict.get(acsn_agent)
        if not members:
            return None

        # A single member is handled as an individual gene
        if len(members) == 1:
            gene_name = members[0]
            hgnc_id = get_hgnc_id(gene_name)
            if not hgnc_id:
                return None
            return get_standard_agent(gene_name, db_refs={'HGNC': hgnc_id})

        # Several members: the sorted member tuple is the family lookup key
        fplx_id = self.fplx_lookup.get(tuple(sorted(members)))
        if not fplx_id:
            return None
        return get_standard_agent(fplx_id, db_refs={'FPLX': fplx_id})
Пример #2
0
def test_get_agent():
    """Check db_refs of looked-up ACSN agents against reference Agents."""
    # Reference agents: one grounded to HGNC, one grounded to FamPlex
    expected_gene = get_standard_agent('VEGFA', db_refs={'HGNC': '12680'})
    expected_family = get_standard_agent('MIRLET7A',
                                         db_refs={'FPLX': 'MIRLET7A'})

    gene_agent = ap.get_agent('VEGFA')
    assert gene_agent.db_refs == expected_gene.db_refs, \
        expected_gene.db_refs

    family_agent = ap.get_agent('MIRLET7A*')
    assert family_agent.db_refs == expected_family.db_refs, \
        expected_family.db_refs

    # A name without any grounding yields no Agent
    assert ap.get_agent('XyZ') is None
Пример #3
0
    def _get_agents_from_singular_entity(self, bpe: bp.PhysicalEntity):
        """Return the Agents for a PhysicalEntity without member entities.

        If ``self._agents`` has an entry for the entity's ``uid``, a deep
        copy of that entry is returned. Otherwise one or more Agents are
        built from the entity's display name, its processed xrefs and, for
        proteins, its modifications. Agents whose name is None are dropped
        from the returned list.
        """
        try:
            return copy.deepcopy(self._agents[bpe.uid])
        except KeyError:
            pass

        if _is_protein(bpe):
            mods = BiopaxProcessor._get_entity_mods(bpe)
        else:
            mods = []
        display_name = bpe.display_name
        xrefs = BiopaxProcessor._get_processed_xrefs(bpe)

        # UP and HGNC references are harmonized based on how many of each
        # kind the entity has
        num_hgnc = len(xrefs.get('HGNC', {}))
        num_up = len(xrefs.get('UP', {}))

        if num_hgnc > 1 and num_up == 1:
            # One protein coded by many genes: one Agent per gene
            db_refs_list = [{'HGNC': hgnc_id} for hgnc_id in xrefs['HGNC']]
        elif (num_hgnc == 1 and num_up > 1) or \
                (num_hgnc > 1 and num_hgnc == num_up):
            # Either one gene coding for many proteins, or what is secretly
            # a family with matching numbers of gene and protein IDs; in
            # both cases we go by the UP IDs and standardize from there
            db_refs_list = [{'UP': up_id} for up_id in xrefs['UP']]
        else:
            # A regular Agent using all the (cleaned up) xrefs
            db_refs_list = [clean_up_xrefs(xrefs)]

        agents = [get_standard_agent(display_name, db_refs, mods=mods)
                  for db_refs in db_refs_list]

        # A UP ID containing a dash is moved to UPISO and the part before
        # the first dash is kept as the UP ID; done once for all cases
        for agent in agents:
            up_id = agent.db_refs.get('UP')
            if up_id is None or '-' not in up_id:
                continue
            agent.db_refs['UPISO'] = up_id
            agent.db_refs['UP'] = up_id.split('-')[0]

        # The name can be None if both the display name and the standard
        # name are missing; such Agents are filtered out
        return [agent for agent in agents if agent.name is not None]
Пример #4
0
def _process_record_helper(
    record, subject, up_stmt_cls, down_stmt_cls
) -> Iterable[Statement]:
    """Yield one Statement per regulated gene of a record.

    Up-regulated genes are yielded first as ``up_stmt_cls`` statements,
    followed by down-regulated genes as ``down_stmt_cls`` statements. Each
    statement gets ``subject``, a standardized target Agent and its own
    copy of the record's evidence.
    """
    up_genes, down_genes = _get_regulations(record)
    evidence = _get_evidence(record)
    regulated = ((up_stmt_cls, up_genes), (down_stmt_cls, down_genes))
    for stmt_cls, genes in regulated:
        for prefix, identifier, name in genes:
            target = get_standard_agent(name, {prefix: identifier})
            yield stmt_cls(subject, target, copy(evidence))
Пример #5
0
 def _get_agent(self, ent_name, ent_type, id, database):
     """Return a single Agent for an entity given its type, ID and database.

     If the database is SIGNOR and the ID is a key of ``self.complex_map``,
     the first Agent from ``self._get_complex_agents`` is returned with the
     remaining ones attached as bound conditions. Otherwise db_refs are
     assembled from the entity type, database and ID, and a standardized
     Agent named ``ent_name`` is returned.

     Raises
     ------
     ValueError
         If the grounding is neither UniProt nor a protein family and the
         database is not one of PUBCHEM, SIGNOR, ChEBI, miRBase, DRUGBANK.
     """
     # Complex: first constituent carries the others as bound conditions
     if database == 'SIGNOR' and id in self.complex_map:
         members = self._get_complex_agents(id)
         head = members[0]
         head.bound_conditions = [BoundCondition(member, True)
                                  for member in members[1:]]
         return head

     if ent_type == 'mirna' and id.startswith('URS'):
         return get_standard_agent(ent_name, db_refs={'RNACENTRAL': id})

     gnd_type = _type_db_map[(ent_type, database)]
     if gnd_type == 'UP':
         db_refs = process_uniprot_entry(id)
     elif ent_type == 'proteinfamily':
         # The original family ID is kept in db_refs, and a FamPlex ID is
         # added on top if the FamPlex map has one for it
         db_refs = {database: id}
         family_key = (database, id)
         famplex_id = famplex_map.get(family_key)
         if famplex_id is not None:
             db_refs['FPLX'] = famplex_id
         else:
             logger.info('Could not find %s in FamPlex map' %
                         str(family_key))
     elif gnd_type is None:
         # No grounding: the Agent is created without any db_refs
         db_refs = {}
     else:
         # Other possible groundings are PUBCHEM, SIGNOR, etc.
         allowed_databases = ('PUBCHEM', 'SIGNOR', 'ChEBI', 'miRBase',
                              'DRUGBANK')
         if database not in allowed_databases:
             raise ValueError('Unexpected database %s' % database)
         if database == 'PUBCHEM' and id.startswith('CID:'):
             # Drop the CID: prefix and any spaces left around the ID,
             # as in 'CID: 923'
             id = id[4:].strip()
         elif database == 'ChEBI' and id.startswith('SID:'):
             # An SID: prefixed ID is stored under PUBCHEM.SUBSTANCE
             gnd_type = 'PUBCHEM.SUBSTANCE'
             id = id[4:].strip()
         db_refs = {gnd_type: id}
     return get_standard_agent(ent_name, db_refs=db_refs)
Пример #6
0
def get_disease_agent(name, disease_id):
    """Return a standardized Agent for a disease.

    ``disease_id`` holds one or more ``namespace:identifier`` groundings
    separated by ``|``; each one becomes an entry of the Agent's db_refs.
    """
    db_refs = dict(grounding.split(':')
                   for grounding in disease_id.split('|'))
    return get_standard_agent(name, db_refs)
Пример #7
0
    def _get_drug_agent(drug_element):
        """Return a standardized Agent for a drug XML element.

        The Agent is named after the element's ``db:name`` and grounded
        with its DrugBank ID, its CAS number if present, and any ChEMBL,
        PubChem Compound and ChEBI external identifiers.
        """
        name = db_find(drug_element, 'db:name').text

        # There is sometimes more than one DrugBank ID; among those that
        # start with 'DB' we take the one that sorts first
        drugbank_ids = [tag.text
                        for tag in db_findall(drug_element, 'db:drugbank-id')
                        if tag.text.startswith('DB')]
        drugbank_ids.sort()
        db_refs = {'DRUGBANK': drugbank_ids[0]}

        # The CAS number is optional: the tag or its text may be missing
        cas_tag = db_find(drug_element, 'db:cas-number')
        cas_number = None if cas_tag is None else cas_tag.text
        if cas_number is not None:
            db_refs['CAS'] = cas_number

        # Remaining groundings come from the external identifiers
        xref_path = 'db:external-identifiers/db:external-identifier'
        for xref_tag in db_findall(drug_element, xref_path):
            resource = db_find(xref_tag, 'db:resource').text
            identifier = db_find(xref_tag, 'db:identifier').text
            if resource == 'PubChem Compound':
                db_refs['PUBCHEM'] = identifier
            elif resource == 'ChEMBL':
                db_refs['CHEMBL'] = ensure_chembl_prefix(identifier)
            elif resource == 'ChEBI':
                db_refs['CHEBI'] = ensure_chebi_prefix(identifier)
        assert_valid_db_refs(db_refs)
        return get_standard_agent(name, db_refs)
Пример #8
0
def get_std_disease(raw_string: str, db_id: str) -> List[Agent]:
    """Standardize disease names.

    Parameters
    ----------
    raw_string :
        Name of the agent in the GNBR dataset.
    db_id :
        Identifier of the agent, expected to match one of the OMIM or
        MESH patterns (with or without a prefix).

    Returns
    -------
    :
        A list containing one standardized Agent, or an empty list if
        neither a name nor an identifier is given.

    Raises
    ------
    ValueError
        If db_id is given but matches none of the expected patterns.
    """
    has_text = not pd.isna(raw_string)
    has_id = not pd.isna(db_id)
    # If neither a name nor a DB ID is given, we return empty; otherwise
    # we would create an ungrounded Agent whose name is a missing value
    if not has_text and not has_id:
        return []

    # We add TEXT to db_refs if there is a raw_string, and fall back to the
    # identifier as the name if there isn't one
    db_refs = {'TEXT': raw_string} if has_text else {}
    name = raw_string if has_text else db_id

    if has_id:
        if omim_no_prefix_pattern.match(db_id):
            db_refs['OMIM'] = db_id
        elif omim_pattern.match(db_id):
            # Drop the leading five characters (presumably the 'OMIM:'
            # prefix; the pattern itself is defined elsewhere)
            db_refs['OMIM'] = db_id[5:]
        elif mesh_no_prefix_pattern.match(db_id):
            db_refs['MESH'] = db_id
        elif mesh_pattern.match(db_id):
            # Drop the leading five characters (presumably the 'MESH:'
            # prefix; the pattern itself is defined elsewhere)
            db_refs['MESH'] = db_id[5:]
        else:
            raise ValueError('Unexpected disease identifier: %s' % db_id)
    return [get_standard_agent(name, db_refs)]
Пример #9
0
 def _process_row(row, stmt_type):
     """Return a statement of the given type for one table row, or None.

     The subject is built from the row's E3GENE / E3AC columns (the subject
     is called "E3" even in the DUB table) and the object from its
     SUBGENE / SUBAC columns. The evidence carries a PMID and text only for
     MEDLINE-sourced rows whose source ID is not 'UNIPROT'.
     """
     # A dash in the subject accession implies a complex (e.g., BMI1-RNF2);
     # for simplicity these rows are ignored
     if '-' in row['E3AC']:
         return None
     subj_agent = get_standard_agent(row['E3GENE'], {'UP': row['E3AC']})
     obj_agent = get_standard_agent(row['SUBGENE'], {'UP': row['SUBAC']})
     has_pmid = row['SOURCE'] == 'MEDLINE' and row['SOURCEID'] != 'UNIPROT'
     # Note: the source ID is sometimes an int, hence the str conversion
     pmid = str(row['SOURCEID']) if has_pmid else None
     text = row['SENTENCE'] if has_pmid else None
     evidence = Evidence(source_api='ubibrowser', pmid=pmid, text=text)
     return stmt_type(subj_agent, obj_agent, evidence=[evidence])
Пример #10
0
 def get_subject(record) -> Agent:
     """Return a standardized disease Agent for a record.

     The Agent is named after the record's ``disease_name`` and grounded
     with its ``do_id`` (as DOID) and ``umls_cui`` (as UMLS) whenever those
     values are truthy.
     """
     db_refs = {}
     for namespace, column in (("DOID", "do_id"), ("UMLS", "umls_cui")):
         value = record[column]
         if value:
             db_refs[namespace] = value
     return get_standard_agent(record["disease_name"], db_refs)
Пример #11
0
 def get_subject(record) -> Agent:
     """Return a standardized drug Agent for a record.

     The Agent is named after the record's ``drug_name`` and grounded with
     its ``smiles``, ``pubchem_cid`` (converted to a string) and
     ``drugbank_id`` whenever those values are truthy.
     """
     # (db_refs namespace, record column, optional conversion)
     columns = (
         ("SMILES", "smiles", None),
         ("PUBCHEM", "pubchem_cid", str),
         ("DRUGBANK", "drugbank_id", None),
     )
     db_refs = {}
     for namespace, column, convert in columns:
         value = record[column]
         if value:
             db_refs[namespace] = convert(value) if convert else value
     return get_standard_agent(record["drug_name"], db_refs)
Пример #12
0
def get_std_chemical(raw_string: str, db_id: str) -> List[Agent]:
    """Standardize chemical names.

    Parameters
    ----------
    raw_string :
        Name of the agent in the GNBR dataset.
    db_id :
        One or more identifiers of the agent separated by ``|``, each
        expected to match the CHEBI pattern or one of the MESH patterns.

    Returns
    -------
    :
        A list of Agents: empty if neither a name nor an identifier is
        given, a single ungrounded Agent if only a name is given, and
        otherwise one standardized Agent per identifier that was kept.

    Raises
    ------
    ValueError
        If an identifier matches none of the expected patterns.
    """
    has_text = not pd.isna(raw_string)
    if pd.isna(db_id):
        # Neither a name nor a DB ID: nothing to return
        if not has_text:
            return []
        # Only a name: it serves both as the Agent name and as TEXT
        return [Agent(raw_string, db_refs={'TEXT': raw_string})]

    # TEXT is part of every Agent's db_refs if there is a raw_string
    base_refs = {'TEXT': raw_string} if has_text else {}
    agents = []
    for chemical_id in db_id.split('|'):
        refs = deepcopy(base_refs)
        name = raw_string if has_text else chemical_id
        if cheby_pattern.match(chemical_id):
            refs['CHEBI'] = chemical_id
        else:
            if mesh_pattern.match(chemical_id):
                # The leading five characters are dropped here
                mesh_id = chemical_id[5:]
            elif mesh_no_prefix_pattern.match(chemical_id):
                mesh_id = chemical_id
            else:
                raise ValueError('Unexpected chemical identifier: %s' %
                                 chemical_id)
            # There are often non-existent MESH IDs here for some reason;
            # those have no name in the offline lookup and are skipped
            if not mesh_client.get_mesh_name(mesh_id, offline=True):
                continue
            refs['MESH'] = mesh_id
        agents.append(get_standard_agent(name, refs))
    return agents
Пример #13
0
 def get_subject(record) -> Optional[Agent]:
     """Return a standardized gene Agent for a record, or None.

     The record's ``id`` is stripped of its first ``len("gene:")``
     characters to get the gene ID, which is then converted to a UniProt
     ID. None is returned (and a debug message logged) if that conversion
     fails.
     """
     prefix_length = len("gene:")
     ncbigene_id = record["id"][prefix_length:]
     uniprot_id = uniprot_client.get_id_from_entrez(ncbigene_id)
     if uniprot_id is not None:
         name = uniprot_client.get_gene_name(uniprot_id)
         db_refs = {"EGID": ncbigene_id, "UP": uniprot_id}
         return get_standard_agent(name, db_refs)
     logger.debug(f"Could not convert ncbigene:{ncbigene_id} to UniProt")
     return None
Пример #14
0
 def get_agent_from_entity(self, entity):
     """Return a standardized Agent for an Entity XML element.

     The element's ``name`` attribute is used as the Agent name, and its
     ``text`` and ``cui`` attributes as TEXT and UMLS groundings. If
     ``self.use_gilda_grounding`` is set and standardization did not add
     anything beyond TEXT and UMLS, the top Gilda match for the name (if
     any) is added to the Agent's db_refs and the Agent is standardized
     again.
     """
     # Note: entities can be negated ("negated") and have a semantic type
     # (semtype) and score (score)
     # <Entity id="Dtest.txt.E8" cui="C3192263" name="Vemurafenib"
     # semtypes="orch,phsu" text="vemurafenib" score="851" negated="false"
     # begin="147" end="158" />
     name = entity.attrib['name']
     db_refs = {'TEXT': entity.attrib['text'], 'UMLS': entity.attrib['cui']}
     agent = get_standard_agent(name, db_refs)
     # We optionally add groundings from Gilda if standardization didn't
     # yield any additional references beyond UMLS. We look at and update
     # agent.db_refs rather than the local db_refs so that this doesn't
     # depend on get_standard_agent keeping the very same dict object on
     # the Agent.
     if self.use_gilda_grounding and set(agent.db_refs) == {'TEXT', 'UMLS'}:
         import gilda
         matches = gilda.ground(name)
         if matches:
             top_term = matches[0].term
             agent.db_refs[top_term.db] = top_term.id
             standardize_agent_name(agent, standardize_refs=True)
     return agent
Пример #15
0
def get_std_gene(raw_string: str, db_id: str) -> List[Agent]:
    """Standardize gene names.

    Parameters
    ----------
    raw_string :
        Name of the agent in the GNBR dataset.
    db_id :
        One or more Entrez identifiers of the agent separated by ``;``.

    Returns
    -------
    :
        A list of Agents: empty if neither a name nor an identifier is
        given, a single ungrounded Agent if only a name is given, and
        otherwise one standardized Agent per identifier.

    Raises
    ------
    ValueError
        If an identifier matches neither of the expected patterns.
    """
    has_text = not pd.isna(raw_string)
    if pd.isna(db_id):
        # Neither a name nor a DB ID: nothing to return
        if not has_text:
            return []
        # Only a name: it serves both as the Agent name and as TEXT
        return [Agent(raw_string, db_refs={'TEXT': raw_string})]

    # TEXT is part of every Agent's db_refs if there is a raw_string
    base_refs = {'TEXT': raw_string} if has_text else {}
    agents = []
    for gene_id in db_id.split(';'):
        refs = deepcopy(base_refs)
        name = raw_string if has_text else gene_id
        if entrez_pattern.match(gene_id):
            refs['EGID'] = gene_id
        else:
            # Fall back to the second pattern and use its first group
            tax_match = entrez_with_tax_pattern.match(gene_id)
            if not tax_match:
                raise ValueError('Unexpected gene identifier: %s' %
                                 gene_id)
            refs['EGID'] = tax_match.group(1)
        agents.append(get_standard_agent(name, refs))
    return agents
Пример #16
0
    def _get_target_agent(target_element):
        """Return a standardized Agent for a target XML element.

        The Agent is named after the element's ``db:name`` and grounded
        with its ``db:id`` (as DRUGBANKV4.TARGET) plus any HGNC and
        UniProtKB external identifiers of its polypeptide.
        """
        name = db_find(target_element, 'db:name').text

        # The DrugBank target ID is always part of the groundings
        db_refs = {'DRUGBANKV4.TARGET': db_find(target_element, 'db:id').text}

        # Remaining groundings come from the polypeptide's external IDs
        xref_path = ('db:polypeptide/db:external-identifiers/'
                     'db:external-identifier')
        for xref_tag in db_findall(target_element, xref_path):
            resource = db_find(xref_tag, 'db:resource').text
            identifier = db_find(xref_tag, 'db:identifier').text
            if resource == 'UniProtKB':
                db_refs['UP'] = identifier
            elif resource == 'HUGO Gene Nomenclature Committee (HGNC)':
                # The first five characters of the identifier are dropped
                db_refs['HGNC'] = identifier[5:]
        return get_standard_agent(name, db_refs=db_refs)
Пример #17
0
def get_chemical_agent(name, mesh_id, cas_id):
    """Return a standardized chemical Agent grounded to MESH.

    A CAS grounding is added as well if ``cas_id`` is truthy.
    """
    if cas_id:
        db_refs = {'MESH': mesh_id, 'CAS': cas_id}
    else:
        db_refs = {'MESH': mesh_id}
    return get_standard_agent(name, db_refs)
Пример #18
0
def get_gene_agent(name, gene_entrez_id):
    """Return a standardized gene Agent grounded to its Entrez ID.

    An HGNC grounding is added as well if an HGNC ID can be looked up
    from ``name``.
    """
    hgnc_id = hgnc_client.get_hgnc_id(name)
    if hgnc_id:
        db_refs = {'EGID': gene_entrez_id, 'HGNC': hgnc_id}
    else:
        db_refs = {'EGID': gene_entrez_id}
    return get_standard_agent(name, db_refs)
Пример #19
0
 def _extract_protein(self, name, gene_id):
     """Return a standardized protein Agent grounded to its Entrez ID.

     An HGNC grounding is added as well if an HGNC ID can be looked up
     from ``gene_id``.
     """
     hgnc_id = hgnc_client.get_hgnc_from_entrez(gene_id)
     if hgnc_id is None:
         db_refs = {'EGID': gene_id}
     else:
         db_refs = {'EGID': gene_id, 'HGNC': hgnc_id}
     return get_standard_agent(name, db_refs=db_refs)