Exemplo n.º 1
0
    def _get_agent_from_ref(self, ref):
        # TODO: handle collections
        if ref.attrib.get('category') == 'collection':
            logger.warning('Skipping collection Agent.')
            return None
        name_tag = ref.find("var/[@name='name']")
        if name_tag is not None:
            name = name_tag.text
        else:
            return None
        uid_tag = ref.find("var/[@name='uid']")
        if uid_tag is not None:
            uid = uid_tag.text
        else:
            uid = None

        db_refs = {}
        text_tag = ref.find("var/[@name='raw-text']")
        if text_tag is not None:
            db_refs['TEXT'] = text_tag.text

        if uid is not None and uid.startswith('UP:'):
            up_mnemonic = uid[3:]
            up_id = uniprot_client.get_id_from_mnemonic(up_mnemonic)
            if up_id is not None:
                up_name = uniprot_client.get_gene_name(up_id)
                if up_name is not None:
                    name = up_name
                db_refs['UP'] = up_id

        assert name is not None

        agent = Agent(name, db_refs=db_refs)
        return agent
Exemplo n.º 2
0
    def _get_agent_from_ref(self, ref):
        # TODO: handle collections
        if ref.attrib.get('category') == 'collection':
            logger.warning('Skipping collection Agent.')
            return None
        name_tag = ref.find("var/[@name='name']")
        if name_tag is not None:
            name = name_tag.text
        else:
            return None
        uid_tag = ref.find("var/[@name='uid']")
        if uid_tag is not None:
            uid = uid_tag.text
        else:
            uid = None

        db_refs = {}
        text_tag = ref.find("var/[@name='raw-text']")
        if text_tag is not None:
            db_refs['TEXT'] = text_tag.text

        if uid is not None and uid.startswith('UP:'):
            up_mnemonic = uid[3:]
            up_id = uniprot_client.get_id_from_mnemonic(up_mnemonic)
            if up_id is not None:
                up_name = uniprot_client.get_gene_name(up_id)
                if up_name is not None:
                    name = up_name
                db_refs['UP'] = up_id

        assert name is not None

        agent = Agent(name, db_refs=db_refs)
        return agent
Exemplo n.º 3
0
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity.
    db_refs : dict
        The grounding for the given entity.

    """
    db_refs = None
    if ns == 'HGNC':
        # Assumption: name is an HGNC symbol
        hgnc_id = hgnc_client.get_current_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        elif isinstance(hgnc_id, list):
            logger.info('More than one current HGNC ID for %s, choosing %s' %
                        (name, hgnc_id[0]))
            hgnc_id = hgnc_id[0]
        name = hgnc_client.get_hgnc_name(hgnc_id)
        db_refs = {'HGNC': hgnc_id}
        up_id = _get_up_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id

    elif ns in ('UNIPROT', 'UP'):
        up_id = None
        # This is a simple test to see if name is a valid UniProt ID,
        # if we can't get a mnemonic, we assume it's not a UP ID
        if uniprot_client.get_mnemonic(name, web_fallback=False):
            up_id = name
        # We next check if it's a mnemonic
        else:
            up_id_from_mnem = uniprot_client.get_id_from_mnemonic(name)
            if up_id_from_mnem:
                up_id = up_id_from_mnem
        if not up_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        db_refs = {'UP': up_id}
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            name = uniprot_client.get_gene_name(up_id)
    elif ns == 'FPLX':
        db_refs = {'FPLX': name}
    elif ns in ('GO', 'GOBP', 'GOCC'):
        if name == 'cell proliferation':
            name = 'cell population proliferation'
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        db_refs = {'GO': go_id}
        name = go_client.get_go_label(go_id)
    elif ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID from %s' % name)
            return name, None
        name = mesh_name
        db_refs = {'MESH': mesh_id}
    # For now, handle MGI/RGD but putting the name into the db_refs so
    # it's clear what namespace the name belongs to
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    elif ns == 'MGI':
        up_id = mouse_lookup.get(name)
        if up_id:
            db_refs = {'UP': up_id}
    elif ns == 'RGD':
        up_id = rat_lookup.get(name)
        if up_id:
            db_refs = {'UP': up_id}
    # Map Selventa families and complexes to FamPlex
    elif ns == 'SFAM':
        db_refs = {'SFAM': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    elif ns == 'SCOMP':
        db_refs = {'SCOMP': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SCOMP complex: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    # Map Entrez genes to HGNC/UP
    elif ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        db_refs = {'EGID': name}
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.info(
                    'HGNC entity %s with HGNC ID %s has no '
                    'corresponding Uniprot ID.', name, hgnc_id)
            mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
            if mirbase_id:
                db_refs['MIRBASE'] = mirbase_id
        else:
            logger.debug('Could not map EGID%s to HGNC.' % name)
            name = 'E%s' % name
    elif ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            return name, None
        db_refs = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
    # CHEBI
    elif ns == 'CHEBI':
        # We first look up BEL's own namespace map for ChEBI names to IDs
        chebi_id = chebi_name_id.get(name)
        # If that fails, we look up INDRA's ChEBI name to ID mapping
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if chebi_id:
            db_refs = {'CHEBI': chebi_id}
        else:
            logger.info('CHEBI name %s not found in map.' % name)
    # These appear in the name slot but are actually IDs
    elif ns == 'CHEBIID':
        chebi_id = identifiers.ensure_chebi_prefix(name)
        db_refs = {'CHEBI': chebi_id}
        name = chebi_client.get_chebi_name_from_id(chebi_id)
    # SDIS, SCHEM: Include the name as the ID for the namespace
    elif ns in ('SDIS', 'SCHEM', 'TEXT'):
        db_refs = {ns: name}
    elif ns == 'TAX':
        tid = taxonomy_client.get_taxonomy_id(name)
        if tid:
            db_refs = {'TAXONOMY': tid}
        else:
            logger.info('Could not get taxonomy ID for %s' % name)
    else:
        logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name, node_data))
    return name, db_refs
Exemplo n.º 4
0
    def _get_agent(self, participant):
        dbid = participant.get('identifier')
        text = participant.get('entity_text')[0]

        if dbid == 'GENERIC':
            if not text:
                return None
            else:
                return Agent(text)

        db_refs = {}
        entity_type = participant.get('entity_type')
        if entity_type in ['protein', 'chemical', 'gene']:
            # TODO: standardize name here
            name = participant.get('entity_text')[0]
            db_refs['TEXT'] = text
            if dbid:
                db_name, db_id = dbid.split(':')
                if db_name.lower() == 'uniprot':
                    uniprot_id = uniprot_client.get_id_from_mnemonic(db_id)
                    db_refs['UP'] = uniprot_id
                elif db_name.lower() == 'pubchem':
                    chebi_id = chebi_client.get_chebi_id_from_pubchem(db_id)
                    db_refs['CHEBI'] = chebi_id
                elif db_name.lower() == 'hgnc':
                    db_refs['HGNC'] = db_id
        elif entity_type == 'protein_family':
            name = text
        else:
            return None
        # TODO: handle other participant types
        agent = Agent(name, db_refs=db_refs)

        features = participant.get('features')
        if features:
            for feature in features:
                feature_type = feature.get('feature_type')
                if feature_type == 'modification_feature':
                    mc = self._get_mod_condition(feature)
                    agent.mods.append(mc)
                elif feature_type == 'binding_feature':
                    bc = self._get_bound_condition(feature)
                    agent.bound_conditions.append(bc)
                elif feature_type == 'mutation_feature':
                    mc = self._get_mut_condition(feature)
                    agent.mutations.append(mc)
                elif feature_type == 'location_feature':
                    agent.location = feature.get('location')
        not_features = participant.get('features')
        if not_features:
            for feature in not_features:
                feature_type = feature.get('feature_type')
                if feature_type == 'modification_feature':
                    mc = self._get_mod_condition(feature)
                    mc.is_modified = False
                    agent.mods.append(mc)
                elif feature_type == 'binding_feature':
                    bc = self._get_bound_condition(feature)
                    bc.is_bound = False
                    agent.bound_conditions.append(bc)
        return agent
Exemplo n.º 5
0
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity.
    db_refs : dict
        The grounding for the given entity.
    """
    db_refs = None
    if ns == 'HGNC':
        hgnc_id = hgnc_client.get_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        db_refs = {'HGNC': hgnc_id}
        up_id = _get_up_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id

    elif ns in ('UNIPROT', 'UP'):
        up_id = None
        gene_name = uniprot_client.get_gene_name(name)
        if gene_name:
            up_id = name
        else:
            up_id_from_mnem = uniprot_client.get_id_from_mnemonic(name)
            if up_id_from_mnem:
                up_id = up_id_from_mnem
                gene_name = uniprot_client.get_gene_name(up_id)
        if not up_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        db_refs = {'UP': up_id}
        if uniprot_client.is_human(up_id):
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if not hgnc_id:
                logger.info('Uniprot ID linked to invalid human gene '
                            'name %s' % name)
            else:
                db_refs['HGNC'] = hgnc_id
    elif ns == 'FPLX':
        db_refs = {'FPLX': name}
    elif ns in ('GO', 'GOBP', 'GOCC'):
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        db_refs = {'GO': go_id}
    elif ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_id = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID fro %s' % name)
            return name, None
        db_refs = {'MESH': mesh_id}
    # For now, handle MGI/RGD but putting the name into the db_refs so
    # it's clear what namespace the name belongs to
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    elif ns in ('MGI', 'RGD'):
        db_refs = {ns: name}
    # Map Selventa families to FamPlexes
    elif ns == 'SFAM':
        db_refs = {'SFAM': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    # Map Entrez genes to HGNC/UP
    elif ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        db_refs = {'EGID': name}
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.info('HGNC entity %s with HGNC ID %s has no '
                            'corresponding Uniprot ID.',
                            name, hgnc_id)
            mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
            if mirbase_id:
                db_refs['MIRBASE'] = mirbase_id
        else:
            logger.info('Could not map EGID%s to HGNC.' % name)
            name = 'E%s' % name
    elif ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            return
        db_refs = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
    # CHEBI
    elif ns == 'CHEBI':
        chebi_id = chebi_name_id.get(name)
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if chebi_id:
            db_refs = {'CHEBI': chebi_id}
        else:
            logger.info('CHEBI name %s not found in map.' % name)
    # SDIS, SCHEM: Include the name as the ID for the namespace
    elif ns in ('SDIS', 'SCHEM'):
        db_refs = {ns: name}
    else:
        logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name,
                                                          node_data))
    return name, db_refs
Exemplo n.º 6
0
    def _get_agent(self, participant):
        dbid = participant.get('identifier')
        text = participant.get('entity_text')[0]

        if dbid == 'GENERIC':
            if not text:
                return None
            else:
                return Agent(text)

        db_refs = {}
        entity_type = participant.get('entity_type')
        if entity_type in ['protein', 'chemical', 'gene']:
            # TODO: standardize name here
            name = participant.get('entity_text')[0]
            db_refs['TEXT'] = text
            if dbid:
                db_name, db_id = dbid.split(':')
                if db_name.lower() == 'uniprot':
                    uniprot_id = uniprot_client.get_id_from_mnemonic(db_id)
                    db_refs['UP'] = uniprot_id
                elif db_name.lower() == 'pubchem':
                    chebi_id = chebi_client.get_chebi_id_from_pubchem(db_id)
                    db_refs['CHEBI'] = chebi_id
                elif db_name.lower() == 'hgnc':
                    db_refs['HGNC'] = db_id
        elif entity_type == 'protein_family':
            name = text
        else:
            return None
        # TODO: handle other participant types
        agent = Agent(name, db_refs=db_refs)

        features = participant.get('features')
        if features:
            for feature in features:
                feature_type = feature.get('feature_type')
                if feature_type == 'modification_feature':
                    mc = self._get_mod_condition(feature)
                    agent.mods.append(mc)
                elif feature_type == 'binding_feature':
                    bc = self._get_bound_condition(feature)
                    agent.bound_conditions.append(bc)
                elif feature_type == 'mutation_feature':
                    mc = self._get_mut_condition(feature)
                    agent.mutations.append(mc)
                elif feature_type == 'location_feature':
                    agent.location = feature.get('location')
        not_features = participant.get('features')
        if not_features:
            for feature in not_features:
                feature_type = feature.get('feature_type')
                if feature_type == 'modification_feature':
                    mc = self._get_mod_condition(feature)
                    mc.is_modified = False
                    agent.mods.append(mc)
                elif feature_type == 'binding_feature':
                    bc = self._get_bound_condition(feature)
                    bc.is_bound = False
                    agent.bound_conditions.append(bc)
        return agent
Exemplo n.º 7
0
    def _get_agent_from_ref(self, ref):
        # TODO: handle collections
        if ref.attrib.get('category') == 'collection':
            #logger.warning('Skipping collection Agent.')
            return None

        # Find the name, uid and raw-text tags first and get their text
        # content if available
        uid_tag = ref.find("var/[@name='uid']")
        name_tag = ref.find("var/[@name='name']")
        text_tag = ref.find("var/[@name='raw-text']")
        if name_tag is not None and name_tag.text:
            name = name_tag.text
        else:
            name = None
        if uid_tag is not None and uid_tag.text:
            uid = uid_tag.text
        else:
            uid = None
        if text_tag is not None and text_tag.text:
            raw_text = text_tag.text
        else:
            raw_text = None

        # TODO: factor this out and reuse fix_agents
        db_refs = {}
        # Save raw text if available
        if raw_text:
            db_refs['TEXT'] = raw_text
        agent_name = raw_text
        # If we have a proper UID then we try to reconstruct an Agent from that
        if uid is not None and len(uid.split(':')) == 2:
            db_ns, db_id = uid.split(':')
            be_id = famplex_map.get((db_ns, db_id))
            if be_id:
                db_refs[db_ns] = db_id
                db_refs['FPLX'] = be_id
                agent_name = be_id
            elif db_ns in ['UP', 'Uniprot']:
                id_from_mnemonic = uniprot_client.get_id_from_mnemonic(db_id)
                if id_from_mnemonic:
                    db_id = id_from_mnemonic
                db_refs['UP'] = db_id
                hgnc_id = uniprot_client.get_hgnc_id(db_id)
                if hgnc_id:
                    db_refs['HGNC'] = hgnc_id
                    agent_name = hgnc_client.get_hgnc_name(hgnc_id)
                else:
                    gene_name = uniprot_client.get_gene_name(db_id)
                    if gene_name:
                        agent_name = gene_name
            elif db_ns == 'NCIT':
                db_refs['NCIT'] = db_id
                target = ncit_map.get(db_id)
                if target:
                    db_refs[target[0]] = target[1]
                    if target[0] == 'HGNC':
                        up_id = hgnc_client.get_uniprot_id(target[1])
                        agent_name = hgnc_client.get_hgnc_name(target[1])
                        if up_id:
                            db_refs['UP'] = up_id
                    elif target[0] == 'UP':
                        agent_name = uniprot_client.get_gene_name(target[1])
                        if agent_name:
                            hgnc_id = hgnc_client.get_hgnc_id(agent_name)
                            if hgnc_id:
                                db_refs['HGNC'] = hgnc_id
            elif db_ns == 'FA':
                db_refs['NXP'] = 'FA:' + db_id
            elif db_ns == 'XFAM':
                db_refs['PF'] = db_id.split('.')[0]
            elif db_ns == 'CHEBI':
                db_refs['CHEBI'] = 'CHEBI:' + db_id
            elif db_ns in ['GO', 'MESH', 'FPLX']:
                db_refs[db_ns] = db_id
            # Handle old BE mappings and add them as FPLX
            elif db_ns == 'BE':
                db_refs['FPLX'] = db_id
            elif db_ns in ['PR', 'CO', 'CVCL', 'EFO', 'ORPHANET']:
                db_refs[db_ns] = db_id
            else:
                logger.warning('Unknown database name space %s' % db_ns)
        if not agent_name:
            if raw_text is not None:
                agent_name = raw_text
            else:
                return None

        assert (agent_name)

        agent = Agent(agent_name, db_refs=db_refs)
        return agent