Example #1
def homologene_uniprot_dict(source, target, only_swissprot=True):
    """
    Returns orthology translation table as dict from UniProt to UniProt,
    obtained from NCBI HomoloGene data. Uses RefSeq and Entrez IDs for
    translation.

    :param int source: NCBI Taxonomy ID of the source species (keys).
    :param int target: NCBI Taxonomy ID of the target species (values).
    :param bool only_swissprot: Translate only SwissProt IDs.
    """
    result = {}

    hge = homologene_dict(source, target, 'entrez')
    hgr = homologene_dict(source, target, 'refseq')

    all_source = set(
        uniprot_input.all_uniprots(organism=source, swissprot='YES'))

    if not only_swissprot:
        all_source_trembl = uniprot_input.all_uniprots(organism=source,
                                                       swissprot='NO')
        all_source.update(set(all_source_trembl))

    for u in all_source:

        source_e = mapping.map_name(u, 'uniprot', 'entrez', source)
        source_r = mapping.map_name(u, 'uniprot', 'refseqp', source)
        target_u = set([])
        target_r = set([])
        target_e = set([])

        for e in source_e:
            if e in hge:
                target_e.update(hge[e])

        for r in source_r:
            if r in hgr:
                target_r.update(hgr[r])

        for e in target_e:
            target_u.update(mapping.map_name(e, 'entrez', 'uniprot', target))

        for r in target_r:
            target_u.update(mapping.map_name(r, 'refseqp', 'uniprot', target))

        target_u = itertools.chain(*(
            mapping.map_name(tu, 'uniprot', 'uniprot', target)
            for tu in target_u
        ))

        result[u] = sorted(list(target_u))

    return result
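
A hedged usage sketch for the function above (hypothetical call; it assumes a working pypath setup so that `mapping` and `uniprot_input` can resolve IDs, and it needs network access on first run):

# Hypothetical: translate the human SwissProt proteome to mouse orthologs.
human_to_mouse = homologene_uniprot_dict(source=9606, target=10090)

# Each human UniProt maps to a sorted (possibly empty) list of mouse UniProts.
for uniprot, orthologs in list(human_to_mouse.items())[:5]:
    print(uniprot, orthologs)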
Example #2
def _matrixdb_protein_list(category, organism=9606):
    """
    Returns a set of proteins annotated by MatrixDB.

    :arg str category:
        The protein annotation category. Possible values: `ecm`, `membrane`
        or `secreted`.
    :arg int organism:
        NCBI Taxonomy ID; if provided, the result is restricted to the
        SwissProt proteome of this organism.
    """

    url = urls.urls['matrixdb']['%s_proteins' % category]
    c = curl.Curl(url, silent=False, large=True)

    proteins = set()

    # header row
    _ = next(c.result)

    for l in c.result:
        if not l:
            continue

        proteins.add(l.strip().replace('"', '').split('\t')[0])

    proteins = mapping.map_names(proteins, 'uniprot', 'uniprot')

    if organism:

        uniprots = uniprot_input.all_uniprots(
            organism=organism,
            swissprot=True,
        )
        proteins = proteins & set(uniprots)

    return proteins
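
A minimal usage sketch (hypothetical; requires network access and the MatrixDB entries in pypath's `urls.urls`):

# Hypothetical: ECM proteins restricted to human SwissProt accessions.
ecm_proteins = _matrixdb_protein_list('ecm', organism=9606)
print(len(ecm_proteins))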
Example #3
def _phosphosite_filter_organism(psite_data, ncbi_tax_id=9606):

    all_uniprots = uniprot_input.all_uniprots(organism=ncbi_tax_id)

    return [
        rec for rec in psite_data
        if rec[0] in all_uniprots and rec[1] in all_uniprots
    ]
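
A sketch of how the filter behaves, assuming each record starts with two UniProt accessions (the tuples below are illustrative, not real PhosphoSite records):

psite_data = [
    ('P06239', 'P00533', 'Y1197'),  # both human -> kept
    ('P06239', 'XXXXXX', 'Y1197'),  # unknown second accession -> dropped
]
human_only = _phosphosite_filter_organism(psite_data, ncbi_tax_id=9606)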
Example #4
        def loader(ncbi_tax_id=9606):

            all_up = uniprot_input.all_uniprots(organism=ncbi_tax_id)

            return (pfam_input.get_pfam_regions(
                uniprots=all_up,
                dicts='uniprot',
                keepfile=True,
            ))
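
A minimal sketch of the deferred-loading pattern such loader closures serve; LazyResource is illustrative, not pypath API:

class LazyResource:

    def __init__(self, loader):
        self._loader = loader
        self._data = None

    def get(self, **kwargs):
        # the expensive download runs only on first access
        if self._data is None:
            self._data = self._loader(**kwargs)
        return self._data

pfam_regions = LazyResource(loader)  # nothing downloaded yet
# regions = pfam_regions.get(ncbi_tax_id=9606)  # triggers the download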
Example #5
    def load_proteome(self, taxon, swissprot_only=True):

        key = (taxon, swissprot_only)

        if key not in self._proteomes:

            self._proteomes[key] = (set(uniprot_input.all_uniprots(*key)))

            for protein in self._proteomes[key]:

                self._taxonomy[protein] = key

            if not swissprot_only:

                self.load_proteome(taxon, True)
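
A hedged sketch of a host object for the method above, carrying just the two dicts it touches (in pypath the method belongs to a larger manager class; `_ProteomeCache` is illustrative, and it assumes `load_proteome` from the snippet above is available as a plain function):

class _ProteomeCache:

    def __init__(self):
        self._proteomes = {}  # (taxon, swissprot_only) -> set of UniProt IDs
        self._taxonomy = {}   # UniProt ID -> (taxon, swissprot_only)

_ProteomeCache.load_proteome = load_proteome  # attach the method shown above

cache = _ProteomeCache()
# cache.load_proteome(9606, swissprot_only=False) loads SwissProt + TrEMBL,
# then recurses once so reviewed accessions in _taxonomy end up keyed to the
# stricter (9606, True) proteome.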
Example #6
def phosphosite_interactions(cache=True, ncbi_tax_id=9606):
    """
    Downloads curated and HTP data from Phosphosite,
    from preprocessed cache file if available.
    Processes BioPAX format.
    Returns list of interactions.
    """

    curated_cache = urls.files['phosphosite']['curated']
    noref_cache = urls.files['phosphosite']['noref']

    if cache and os.path.exists(curated_cache) and os.path.exists(noref_cache):

        return (
            pickle.load(open(curated_cache, 'rb')),
            pickle.load(open(noref_cache, 'rb')),
        )

    result_curated = []
    result_noref = []
    url = urls.urls['psite_bp']['url']
    c = curl.Curl(url, silent=False, large=True)
    bpax = c.gzfile
    xml = ET.parse(bpax)
    xmlroot = xml.getroot()
    bpprefix = '{http://www.biopax.org/release/biopax-level3.owl#}'
    rdfprefix = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
    proteins = {}
    for p in xmlroot.iter(bpprefix + 'ProteinReference'):
        psid = p.attrib[rdfprefix + 'ID']
        xref = p.find(bpprefix + 'xref').find(bpprefix + 'UnificationXref')
        db = xref.find(bpprefix + 'db').text
        up = xref.find(bpprefix + 'id').text
        tax = ''
        if p.find(bpprefix + 'organism') is not None:
            tmp = p.find(bpprefix + 'organism')
            if rdfprefix + 'resource' in tmp.attrib:
                tax = tmp.attrib[rdfprefix + 'resource'].split('_')[1]
        if db == 'UniProtKB':
            up = up[0:6]
        proteins[psid] = {'id': up, 'db': db, 'species': tax, 'psid': psid}
    evidences = {}
    for p in xmlroot.iter(bpprefix + 'EvidenceCodeVocabulary'):
        evid = p.attrib[rdfprefix + 'ID'].split('_')[1]
        evname = p.find(bpprefix + 'term').text
        evidences[evid] = evname
    ev_short = {'0113': 'WB', '0427': 'MS', '0074': 'MA', '0421': 'AB'}
    nosrc = []
    notgt = []
    norefs = []
    noev = []
    noth = []
    edges = []

    for c in xmlroot.findall(bpprefix + 'Catalysis'):
        if rdfprefix + 'resource' in c.find(bpprefix + 'controller').attrib:
            src = 'po_' + (
                c.find(bpprefix + 'controller')
                .attrib[rdfprefix + 'resource'].split('_')[1]
            )
        else:
            srcProt = c.find(bpprefix + 'controller').find(
                bpprefix + 'Protein')
            if srcProt is not None:
                src = 'po_' + srcProt.attrib[rdfprefix + 'ID'].split('_')[1]
            else:
                # no controller protein found: reset src so the check below
                # does not reuse the value from the previous iteration
                src = None
                nosrc.append(c)
        tgtProt = c.find(bpprefix + 'controlled').iter(
            bpprefix + 'ProteinReference')
        tgt = next(tgtProt, None)
        if tgt is not None:
            tgt = tgt.attrib[rdfprefix + 'ID']
        else:
            tgtProt = c.find(bpprefix + 'controlled').iter(
                bpprefix + 'entityReference')
            tgt = next(tgtProt, None)
            if tgt is not None:
                if rdfprefix + 'resource' in tgt.attrib:
                    tgt = tgt.attrib[rdfprefix + 'resource'][1:]
            else:
                tgtProt = c.find(bpprefix + 'controlled').iter(
                    bpprefix + 'left')
                tgt = next(tgtProt, None)
                if tgt is not None:
                    if rdfprefix + 'resource' in tgt.attrib:
                        tgt = 'po_' + (
                            tgt.attrib[rdfprefix + 'resource'].split('_')[1])
                else:
                    notgt.append(c)
        refs = c.iter(bpprefix + 'PublicationXref')
        pmids = []
        for r in refs:
            pm = r.attrib[rdfprefix + 'ID'].split('_')
            if pm[0] == 'pmid':
                pmids.append(pm[1])
        refs = c.iter(bpprefix + 'evidence')
        for r in refs:
            rrefs = r.iter(bpprefix + 'xref')
            for rr in rrefs:
                if rdfprefix + 'resource' in rr.attrib:
                    pm = rr.attrib[rdfprefix + 'resource'].split('_')
                    if pm[0] == 'pubmed':
                        pmids.append(pm[1])
        evs = []
        for e in c.iter(bpprefix + 'evidenceCode'):
            if rdfprefix + 'resource' in e.attrib:
                evs.append(ev_short[e.attrib[rdfprefix +
                                             'resource'].split('_')[1]])
            else:
                ev = e.find(bpprefix + 'EvidenceCodeVocabulary')
                evs.append(ev_short[ev.attrib[rdfprefix + 'ID'].split('_')[1]])
        for e in c.iter(bpprefix + 'evidence'):
            if rdfprefix + 'resource' in e.attrib:
                ev = e.attrib[rdfprefix + 'resource'].split('_')
                if len(ev) == 4:
                    if len(ev[3]) == 4:
                        evs.append(ev_short[ev[3]])
        if (src is not None and tgt is not None and src in proteins
                and tgt in proteins and proteins[src]['id'] is not None
                and proteins[tgt]['id'] is not None):
            edges.append({
                'src': proteins[src],
                'tgt': proteins[tgt],
                'pmids': list(set(pmids)),
                'evs': list(set(evs))
            })
            if len(evs) == 0:
                noev.append(c)
            if len(pmids) == 0:
                norefs.append(c)
            if len(evs) == 0 and len(pmids) == 0:
                noth.append(c)

    if ncbi_tax_id:

        all_uniprots = uniprot_input.all_uniprots(organism=ncbi_tax_id)

    for e in edges:

        if (ncbi_tax_id and (e['src']['id'] not in all_uniprots
                             or e['tgt']['id'] not in all_uniprots)):

            continue

        this_iaction = [
            e['src']['id'], e['tgt']['id'], e['src']['species'],
            e['tgt']['species'], ';'.join(e['evs']), ';'.join(e['pmids'])
        ]

        if len(this_iaction[-1]) > 0:

            result_curated.append(this_iaction)

        else:

            result_noref.append(this_iaction)

    pickle.dump(result_curated, open(curated_cache, 'wb'))
    pickle.dump(result_noref, open(noref_cache, 'wb'))
    return result_curated, result_noref
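
The bpprefix/rdfprefix concatenation above is the standard ElementTree idiom for namespaced tags; a self-contained sketch:

import xml.etree.ElementTree as ET

# ElementTree exposes namespaced tags as '{uri}localname', so tag names are
# built by prepending the namespace URI in braces.
BP = '{http://www.biopax.org/release/biopax-level3.owl#}'
doc = ET.fromstring(
    '<root xmlns:bp="http://www.biopax.org/release/biopax-level3.owl#">'
    '<bp:ProteinReference/></root>'
)
for p in doc.iter(BP + 'ProteinReference'):
    print(p.tag)  # prints the fully qualified '{uri}ProteinReference' tag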
Example #7
def phosphosite_regsites_one_organism(organism=9606):
    """
    Returns PhosphoSitePlus regulatory sites translated to
    one organism by orthology. Residue numbers will be translated
    where necessary, while gene symbols will be translated to
    UniProt IDs of the given organism.
    This works with human, mouse or rat.

    :param int organism:
        NCBI Taxonomy ID of the target organism. In this
        method possible values are human, mouse or rat, as these species
        provide the vast majority of the data, and are close enough to each
        other that the sites can be safely translated between orthologous
        proteins by sequence alignment.
    """
    def genesymbols2uniprots(genesymbols, tax):
        return (set(
            itertools.chain(*map(
                lambda gs: mapping.map_name(
                    gs,
                    'genesymbol',
                    'uniprot',
                    ncbi_tax_id=tax,
                ), genesymbols))))

    def translate_uniprots(uniprots, homo):
        return (set(
            itertools.chain(*map(
                lambda usrc: homo[usrc] if usrc in homo else [], uniprots))))

    result = {}

    organisms = set([9606, 10090, 10116])

    mod_types = dict(common.psite_mod_types2)

    regsites = phosphosite_regsites()

    other_organisms = organisms - set([organism])

    # homologene_uniprot_dict is the function shown in Example #1; calling it
    # directly avoids referencing the local name `homology` before it is
    # assigned (the original `homology.homologene_uniprot_dict` would raise
    # NameError inside this function).
    homology = dict(
        (other, homologene_uniprot_dict(source=other, target=organism))
        for other in other_organisms
    )

    ptm_homology = ptm_orthology()

    proteome = uniprot_input.all_uniprots(organism=organism, swissprot='YES')

    for substrate, regs in iteritems(regsites):

        subs = []

        if substrate in proteome:
            subs = [substrate]
        else:
            for other, homo in iteritems(homology):
                if substrate in homo:
                    subs = homo[substrate]

        for sub in subs:

            if sub not in result:
                result[sub] = {}

            for reg in regs:

                reg_organism = taxonomy.taxa[reg['organism']]

                if reg_organism not in organisms:
                    continue

                mod_type = mod_types[reg['modt']]
                resnum = int(reg['res'])

                psite_key = (
                    substrate,
                    reg['isoform'],
                    reg['aa'],
                    resnum,
                    reg_organism,
                    mod_type,
                )

                if reg_organism != organism:

                    regs_target = []
                    disrupts = []
                    induces = []

                    if psite_key in ptm_homology:

                        if organism in ptm_homology[psite_key]:

                            regs_target = ptm_homology[psite_key][organism]

                    if len(regs_target):

                        disrupts = genesymbols2uniprots(
                            reg['disrupts'],
                            reg_organism,
                        )
                        disrupts = translate_uniprots(
                            disrupts,
                            homology[reg_organism],
                        )
                        induces = genesymbols2uniprots(
                            reg['induces'],
                            reg_organism,
                        )
                        induces = translate_uniprots(
                            induces,
                            homology[reg_organism],
                        )

                else:

                    regs_target = [psite_key]

                    disrupts = genesymbols2uniprots(reg['disrupts'], organism)
                    induces = genesymbols2uniprots(reg['induces'], organism)

                for regt in regs_target:

                    modkey = (regt[2], regt[3], regt[5])

                    if modkey not in result[sub]:

                        result[sub][modkey] = {
                            'induces': set([]),
                            'disrupts': set([]),
                            'pmids': set([]),
                            'isoforms': set([]),
                            'process': set([]),
                            'function': set([]),
                            'positive': False,
                            'negative': False,
                            'comments': []
                        }

                    result[sub][modkey]['induces'].update(induces)
                    result[sub][modkey]['disrupts'].update(disrupts)
                    result[sub][modkey]['process'].update(reg['process'])
                    result[sub][modkey]['function'].update(reg['function'])
                    result[sub][modkey]['isoforms'].update([regt[1]])
                    result[sub][modkey]['pmids'].update(reg['pmids'])
                    result[sub][modkey]['positive'] = \
                        result[sub][modkey]['positive'] or reg['positive']
                    result[sub][modkey]['negative'] = \
                        result[sub][modkey]['negative'] or reg['negative']
                    if len(reg['comments']):
                        result[sub][modkey]['comments'].append(reg['comments'])

    return result
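
A hedged usage sketch (hypothetical; needs network access and a working pypath setup):

# Hypothetical: regulatory sites translated to mouse.
regsites_mouse = phosphosite_regsites_one_organism(organism=10090)

# Result layout: {substrate_uniprot: {(aa, resnum, mod_type): annotation}}.
for substrate, sites in list(regsites_mouse.items())[:1]:
    for (aa, resnum, mod_type), ann in sites.items():
        print(substrate, aa, resnum, mod_type, sorted(ann['induces']))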
Example #8
def get_pfam(uniprots=None, organism=9606):

    if uniprots is None:

        uniprots = uniprot_input.all_uniprots(
            organism=organism,
            swissprot=True,
        )

    u_pfam = {}
    pfam_u = {}

    if uniprots is not None:

        prg = progress.Progress(
            len(uniprots) // 30,
            'Downloading data from UniProt',
            1,
        )
        data_all = []

        for i in xrange(0, len(uniprots), 30):

            to = i + 30
            thisPart = uniprots[i:to]
            thisPart = ' OR '.join(['accession:%s' % u for u in thisPart])
            get = {
                'query': thisPart,
                'format': 'tab',
                'columns': 'id,database(Pfam)'
            }
            for j in xrange(3):
                c = curl.Curl(urls.urls['uniprot_basic']['url'], get=get)
                data = c.result
                if data is not None:
                    break
            if data is None:
                return None, None
            data = data.split('\n')
            del data[0]
            del data[-1]
            data_all += data
            prg.step()

        prg.terminate()

    else:

        organism = taxonomy.ensure_ncbi_tax_id(organism)

        if not organism:

            return None, None

        organismQuery = 'organism:%u AND reviewed:yes' % organism
        get = {
            'query': organismQuery,
            'format': 'tab',
            'columns': 'id,database(Pfam)'
        }

        for j in xrange(3):

            c = curl.Curl(
                urls.urls['uniprot_basic']['url'],
                get=get,
                silent=False,
                outf='uniprot-pfam-%u.tab' % organism,
            )
            data_all = c.result
            if data_all is not None:
                break

        if data_all is None:
            return None, None

        data_all = data_all.split('\n')
        del data_all[0]

    for l in data_all:

        l = l.split('\t')

        pfams = re.sub(';$', '', l[1]).strip()
        pfams = pfams.split(';') if pfams else []

        if l[0] not in u_pfam:

            u_pfam[l[0]] = []

        u_pfam[l[0]] += pfams

        for pfam in pfams:

            if pfam not in pfam_u:
                pfam_u[pfam] = []

            pfam_u[pfam].append(l[0])

    return u_pfam, pfam_u
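
A self-contained sketch of the batching pattern above: accessions are queried in chunks of 30, joined by ' OR ', to keep the request URLs short (chunked_queries is illustrative, not pypath API):

def chunked_queries(uniprots, size=30):
    # yield one UniProt query string per chunk of accessions
    for i in range(0, len(uniprots), size):
        chunk = uniprots[i:i + size]
        yield ' OR '.join('accession:%s' % u for u in chunk)

# list(chunked_queries(['P00533', 'P04637']))
# -> ['accession:P00533 OR accession:P04637']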
Example #9
def locate_localizations(
    organism=9606,
    literature=True,
    external=True,
    predictions=False,
):

    record = collections.namedtuple(
        'LocateAnnotation',
        ('source', 'location', 'cls', 'pmid', 'score'),
    )
    record.__new__.__defaults__ = (None, None, None)  # cls, pmid, score

    organism_uniprots = set(
        uniprot_input.all_uniprots(organism=organism, swissprot=True))

    organism_str = taxonomy.taxids[organism]
    url = urls.urls['locate']['url'] % organism_str
    fname = url.split('/')[-1][:-4]

    c = curl.Curl(
        url,
        large=True,
        default_mode='rb',
        silent=False,
        files_needed=[fname],
    )

    parser = etree.iterparse(c.result[fname], events=('start', 'end'))

    result = collections.defaultdict(set)
    root = next(parser)  # consume the opening event of the root element
    used_elements = []

    for ev, elem in parser:

        if ev == 'end' and elem.tag == 'LOCATE_protein':

            tag_protein = elem.find('protein')
            this_uniprot = None
            this_uniprots = None
            this_entrez = None
            this_organism = (tag_protein.find('organism').text
                             if tag_protein is not None else None)
            this_class = (tag_protein.find('class').text
                          if tag_protein is not None else None)

            xrefs = elem.find('xrefs')

            if xrefs is None:
                continue

            for xref in xrefs.findall('xref'):
                src = xref.find('source')
                src_name = src.find('source_name').text

                if src_name == 'UniProtKB-SwissProt':
                    this_uniprot = src.find('accn').text

                if src_name == 'Entrez Gene':
                    this_entrez = src.find('accn').text

                if src_name == 'UniProt/SPTrEMBL' and this_uniprot is None:
                    this_uniprot = src.find('accn').text

            # if we don't know what it is, does not make sense to proceed
            if this_uniprot is None and this_entrez is None:
                continue

            if this_uniprot:
                this_uniprots = mapping.map_name(
                    this_uniprot,
                    'uniprot',
                    'uniprot',
                    ncbi_tax_id=organism,
                )

            if not this_uniprots and this_entrez:
                this_uniprots = mapping.map_name(
                    this_entrez,
                    'entrez',
                    'uniprot',
                    ncbi_tax_id=organism,
                )

            this_uniprots = set(this_uniprots) & organism_uniprots

            # if we don't know what it is, does not make sense to proceed
            if not this_uniprots:
                continue

            if external:
                # External database annotations
                extannot = elem.find('externalannot')

                if extannot is not None:
                    for extannotref in extannot.findall('reference'):
                        sources = []

                        for src in extannotref.findall('source'):
                            src_name = src.find('source_name')

                            if src_name is not None:
                                sources.append(src_name.text)

                        sources = ';'.join(sources) if sources else None
                        locations = extannotref.find('locations')

                        if locations is not None:
                            for location in locations.findall('location'):
                                for loc in location.iterchildren():
                                    if loc.tag[:4] == 'tier':
                                        this_loc = loc.text.lower().split(',')

                                        for uniprot in this_uniprots:
                                            for _loc in this_loc:
                                                result[uniprot].add(
                                                    record(
                                                        source=sources,
                                                        location=_loc.strip(),
                                                        cls=this_class,
                                                        score=None,
                                                    ))

            if predictions:
                # Predictions
                sclpred = elem.find('scl_prediction')

                if sclpred is not None:
                    for sclpred_src in sclpred.findall('source'):
                        score = float(sclpred_src.find('evaluation').text)

                        if score == 0.0:
                            continue

                        this_src = sclpred_src.find('method').text
                        this_loc = sclpred_src.find('location').text.lower()

                        if this_loc == 'no prediction':
                            continue

                        for uniprot in this_uniprots:
                            result[uniprot].add(
                                record(
                                    source=this_src,
                                    location=this_loc,
                                    cls=this_class,
                                    score=score,
                                ))

            if literature:
                # Literature curation
                lit = elem.find('literature')

                if lit is not None:

                    for litref in lit.findall('reference'):

                        locs = set()

                        for lloc in (
                                litref.find('locations').findall('location')):

                            for loc in lloc.iterchildren():
                                if loc.tag[:4] == 'tier':
                                    locs.add(loc.text.lower())

                        pmid = litref.find('source')
                        pmid = (None
                                if pmid is None else pmid.find('accn').text)

                        for loc in locs:

                            for uniprot in this_uniprots:

                                result[uniprot].add(
                                    record(
                                        source='literature',
                                        location=loc,
                                        pmid=pmid,
                                        cls=this_class,
                                        score=None,
                                    ))

        used_elements.append(elem)

        # removing used elements to keep memory low
        if len(used_elements) > 1000:
            for _ in xrange(500):
                e = used_elements.pop(0)
                e.clear()

    # closing the XML
    c.fileobj.close()
    del c

    return result
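
A hedged usage sketch (hypothetical; downloads the LOCATE XML on first run):

# Hypothetical: literature-curated human localizations only.
locs = locate_localizations(organism=9606, literature=True, external=False)

# Result: {uniprot: set of LocateAnnotation(source, location, cls, pmid, score)}.
for uniprot, annotations in list(locs.items())[:3]:
    for a in annotations:
        print(uniprot, a.location, a.source, a.pmid)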