Пример #1
0
def fix_id(db_ns: str, db_id: str) -> Tuple[str, str]:
    """Repair namespace/identifier quirks specific to the SIF dump.

    Parameters
    ----------
    db_ns :
        The namespace of the entry.
    db_id :
        The identifier of the entry within that namespace.

    Returns
    -------
    :
        The (possibly changed) namespace and the identifier after it has
        been passed through ``ensure_prefix_if_needed``.
    """
    if db_ns == "GO":
        # Purely numeric GO identifiers are left-padded with zeros to a
        # width of seven characters.
        if db_id.isnumeric():
            db_id = db_id.zfill(7)
    elif db_ns == "EFO":
        # Drop a redundant "EFO:" prefix embedded in the identifier.
        if db_id.startswith("EFO:"):
            db_id = db_id[len("EFO:"):]
    elif db_ns == "UP":
        if db_id.startswith("SL"):
            # Identifiers starting with "SL" are moved to the UPLOC
            # namespace; the identifier itself is kept.
            db_ns = "UPLOC"
        elif "-" in db_id:
            # Keep only the part before the first dash.
            db_id = db_id.split("-")[0]
    elif db_ns == "FPLX":
        if db_id == "TCF-LEF":
            db_id = "TCF_LEF"
    return db_ns, ensure_prefix_if_needed(db_ns, db_id)
Пример #2
0
 def process_selventa_xref(xref):
     """Turn a pipe-separated xref string into a normalized db_refs string.

     Each ``prefix:id`` part of the input is mapped to a namespace through
     ``xref_mappings``; parts whose prefix has no mapping are logged and
     skipped. The collected references are validated and then serialized
     as ``namespace:id`` pairs, sorted and joined with ``|``.

     Parameters
     ----------
     xref :
         The raw xref value; a missing value (as judged by
         ``pandas.isna``) yields an empty string.

     Returns
     -------
     :
         The serialized references, or an empty string for a missing
         input.
     """
     if pandas.isna(xref):
         return ''
     grounding = {}
     for entry in xref.split('|'):
         # Split only on the first colon so the identifier may itself
         # contain colons.
         source_prefix, identifier = entry.split(':', maxsplit=1)
         namespace = xref_mappings.get(source_prefix)
         if namespace:
             grounding[namespace] = \
                 ensure_prefix_if_needed(namespace, identifier)
         else:
             logger.info('Unknown namespace: %s' % source_prefix)
     assert_valid_db_refs(grounding)
     return '|'.join('%s:%s' % pair for pair in sorted(grounding.items()))
Пример #3
0
def fix_invalidities_db_refs(db_refs: Mapping[str, str]) -> Mapping[str, str]:
    """Return a fixed version of a db_refs grounding dict.

    The input mapping is not modified; a new dict is built and returned.
    Entries whose value is None are dropped, and a set of known
    namespace/identifier invalidities is repaired (wrong namespace keys,
    missing or redundant prefixes, identifiers that cannot be salvaged).

    Parameters
    ----------
    db_refs :
        A mapping from namespace to identifier. Values may be None, in
        which case the entry is removed.

    Returns
    -------
    :
        A new dict with the fixes applied.
    """
    # Filter out None values before anything else, so that no string
    # method is ever called on None, and work on a copy so the caller's
    # mapping is left untouched.
    fixed = {k: v for k, v in db_refs.items() if v is not None}

    # Strip a "CID:" prefix from PubChem identifiers.
    if 'PUBCHEM' in fixed and fixed['PUBCHEM'].startswith('CID'):
        fixed['PUBCHEM'] = fixed['PUBCHEM'].replace('CID:', '').strip()

    # Iterate over a snapshot since keys are added/removed in the loop.
    for k, v in list(fixed.items()):
        if k == 'CHEMBL' and not v.startswith('CHEMBL'):
            fixed[k] = 'CHEMBL%s' % v
        elif k == 'ECCODE':
            fixed['ECCODE'] = fixed['ECCODE'].replace('.-', '')
        elif k == 'UNIPROT':
            fixed.pop(k)
            # This is really a location
            if v.startswith('SL-'):
                fixed['UPLOC'] = v
            # Otherwise we just fix the invalid key
            else:
                fixed['UP'] = v
        elif k == 'UP':
            # There are cases where this is an empty string
            if not v.strip():
                fixed.pop('UP', None)
            # Sometimes we have two IDs separated by a comma
            if ',' in v:
                fixed['UP'] = v.split(',')[0]
            # Location identifiers belong in the UPLOC namespace
            if v.startswith('SL-'):
                fixed['UPLOC'] = fixed.pop('UP')
        elif k == 'UAZ':
            # The UAZ key is always removed; the value is kept under CVCL
            # only if it starts with "CVCL".
            fixed.pop('UAZ')
            if v.startswith('CVCL'):
                fixed['CVCL'] = v
        elif k == 'TAXONOMY' and v == '-1':
            fixed.pop('TAXONOMY', None)
        elif k == 'LINCS' and re.match(r'\d+-\d+', v):
            fixed['HMS-LINCS'] = fixed.pop('LINCS')
        elif k == 'CVCL' and re.match(r'^[A-Z0-9]{4}$', v):
            fixed['CVCL'] = 'CVCL_%s' % v
        elif k == 'CO':
            fixed['CL'] = 'CL:%s' % fixed.pop('CO')
        elif k == 'FPLX' and '-' in v:
            fixed['FPLX'] = v.replace('-', '_')
        elif k == 'DRUGBANK' and v.startswith('DBSALT'):
            fixed['DRUGBANK.SALT'] = fixed.pop('DRUGBANK')
        # For MGI and RGD some sources added names as IDs that are invalid
        # and not easily fixable without reverse lookups so we rather
        # remove these.
        elif k == 'MGI' and not re.match(
                identifiers_registry['mgi']['pattern'], v):
            fixed.pop('MGI', None)
        elif k == 'RGD' and not re.match(
                identifiers_registry['rgd']['pattern'], v):
            fixed.pop('RGD', None)
        else:
            fixed[k] = ensure_prefix_if_needed(k, v)
    return fixed
Пример #4
0
def fix_invalidities(stmts):
    """Normalize identifier prefixes on the agents of the given statements.

    Every db_refs value of every agent returned by a statement's
    ``real_agent_list()`` is replaced, in place, by the result of
    ``ensure_prefix_if_needed`` for its namespace.

    Parameters
    ----------
    stmts :
        An iterable of statements exposing ``real_agent_list()``.

    Returns
    -------
    :
        The same ``stmts`` object that was passed in.
    """
    for statement in stmts:
        for agent in statement.real_agent_list():
            refs = agent.db_refs
            # Only values of already existing keys are reassigned, so the
            # dict keeps its size while being iterated.
            for namespace in refs:
                refs[namespace] = ensure_prefix_if_needed(
                    namespace, refs[namespace])
    return stmts
Пример #5
0
def test_ensure_prefix_if_needed():
    """Check ``ensure_prefix_if_needed`` on a table of representative inputs."""
    # Each row is (namespace, identifier, expected result).
    cases = [
        # An identifier that already carries the prefix is unchanged
        ('CHEBI', 'CHEBI:123', 'CHEBI:123'),
        # A bare CHEBI or GO identifier gets the prefix added
        ('CHEBI', '123', 'CHEBI:123'),
        ('GO', '00004', 'GO:00004'),
        # For these namespaces the identifier is returned as given
        ('EFO', '1234', '1234'),
        ('XXXX', '1234', '1234'),
    ]
    for namespace, identifier, expected in cases:
        assert ensure_prefix_if_needed(namespace, identifier) == expected