Exemplo n.º 1
0
def depod_enzyme_substrate(organism=9606):
    """
    Collects dephosphorylation enzyme-substrate records from DEPOD.

    Two tab separated tables are downloaded: the first one carries the
    substrate type, organism, residues and references; the second one
    (MITAB) carries the UniProt accessions. Rows of the two tables are
    paired by their index.

    :param int organism:
        Only rows whose organism field resolves (by
        ``taxonomy.ensure_ncbi_tax_id``) to this value are processed.

    :return:
        A list of dicts, one for each combination of enzyme UniProt ID,
        substrate UniProt ID and residue. The enzyme is stored under the
        ``kinase`` key, and ``typ`` is always ``dephosphorylation``.
    """

    uniprot_pattern = re.compile(r'uniprotkb:([A-Z0-9]+)')
    residue_pattern = re.compile(r'([A-Z][a-z]{2})-([0-9]+)')
    ref_separator = re.compile(r'[,|]\s?')

    def _download_rows(url, encoding):
        # Fetch a table and split it to fields, dropping the header row.
        content = curl.Curl(url, silent=False, encoding=encoding).result
        rows = [row.split('\t') for row in content.split('\n')]
        del rows[0]
        return rows

    substrate_rows = _download_rows(urls.urls['depod']['urls'][0], 'ascii')
    mitab_rows = _download_rows(urls.urls['depod']['urls'][1], 'iso-8859-1')

    records = []

    for idx, fields in enumerate(substrate_rows):

        if len(fields) <= 6 or fields[2] != 'protein substrate':

            continue

        taxon = taxonomy.ensure_ncbi_tax_id(fields[3].split('(')[0].strip())

        if taxon != organism or fields[4].strip() == 'N/A':

            continue

        mitab_row = mitab_rows[idx]
        enzyme_ac = uniprot_pattern.search(mitab_row[0]).group(1)
        substrate_ac = uniprot_pattern.search(mitab_row[1]).group(1)

        id_pairs = itertools.product(
            mapping.map_name(enzyme_ac, 'uniprot', 'uniprot'),
            mapping.map_name(substrate_ac, 'uniprot', 'uniprot'),
        )

        for enzyme_up, substrate_up in id_pairs:

            for aa_code, position in residue_pattern.findall(fields[4]):

                # Three letter codes are translated to one letter codes
                # if known, otherwise kept as they are.
                if aa_code in common.aminoa_3_to_1_letter:

                    aa_code = common.aminoa_3_to_1_letter[aa_code]

                records.append({
                    'instance': None,
                    'kinase': enzyme_up,
                    'resaa': aa_code,
                    'resnum': int(position),
                    # a new list for each record, nothing is shared
                    'references': ref_separator.split(fields[6].strip()),
                    'substrate': substrate_up,
                    'start': None,
                    'end': None,
                    'typ': 'dephosphorylation',
                })

    return records
Exemplo n.º 2
0
def celltalkdb_annotations(organism=9606):
    """
    Builds ligand and receptor role annotations from CellTalkDB records.
    http://tcm.zju.edu.cn/celltalkdb/index.php

    :param int,str organism:
        Human (9606) and mouse (10090) are accepted; any other value,
        after resolving it to an NCBI Taxonomy ID, is replaced by human.

    :return:
        A ``defaultdict`` with UniProt IDs as keys and sets of
        ``CellTalkDBAnnotation`` tuples (``role``, ``pmid``) as values.
    """

    CellTalkDBAnnotation = collections.namedtuple(
        'CellTalkDBAnnotation',
        ['role', 'pmid'],
    )

    taxon = taxonomy.ensure_ncbi_tax_id(organism)

    if taxon not in {9606, 10090}:

        taxon = 9606

    annotations = collections.defaultdict(set)

    for record in celltalkdb_download(organism=taxon):

        for role in ('ligand', 'receptor'):

            # the record carries one gene symbol attribute for each role
            symbol = getattr(record, '%s_gene_symbol' % role)

            for uniprot in mapping.map_name(
                symbol,
                'genesymbol',
                'uniprot',
                ncbi_tax_id=taxon,
            ):

                annotation = CellTalkDBAnnotation(
                    role=role,
                    pmid=record.evidence,
                )
                annotations[uniprot].add(annotation)

    return annotations
Exemplo n.º 3
0
def hippie_interactions(
    score_threshold=.75,
    only_human=False,
    only_sources=None,
    only_methods=None,
    methods=False,
    sources=False,
    references=True,
    organisms=False,
):
    """
    Retrieves protein-protein interactions from the HIPPIE database.

    Each line of the downloaded table is translated to UniProt IDs: both
    the UniProt entry name and the Entrez Gene ID of each partner are
    mapped, and one record is created for every combination of the
    resulting IDs.

    :param float score_threshold:
        Lines with a score below this value are dropped.
    :param bool only_human:
        Drop the lines which have a species annotation not including
        human (9606). Lines without species annotation are kept.
    :param str,set only_sources:
        Keep only the lines supported by at least one of these sources.
    :param str,set only_methods:
        Keep only the lines supported by at least one of these
        experimental methods.
    :param bool methods:
        Include the methods in the records (otherwise ``None``).
    :param bool sources:
        Include the sources in the records (otherwise ``None``).
    :param bool references:
        Include the PubMed IDs in the records (otherwise ``None``). An
        empty tuple if the line carries no PubMed IDs.
    :param bool organisms:
        Include the NCBI Taxonomy IDs in the records (otherwise ``None``).

    :return:
        A list of unique ``HippieInteraction`` named tuples, in no
        particular order.
    """

    only_sources = common.to_set(only_sources)
    only_methods = common.to_set(only_methods)

    HippieInteraction = collections.namedtuple(
        'HippieInteraction',
        [
            'id_a',
            'id_b',
            'score',
            'methods',
            'references',
            'sources',
            'organisms',
        ],
    )

    def tps(items):
        # sorted tuples make the records hashable and deterministic
        return tuple(sorted(items))

    url = urls.urls['hippie']['url']
    c = curl.Curl(url, large=True, silent=False)

    result = set()

    for line in c.result:

        fields = line.strip('\r\n').split('\t')

        score = float(fields[4])

        if score < score_threshold:

            continue

        # The details field is a list of `key:value1,value2` items
        # separated by semicolons. It is the same for all ID pairs of
        # the line, hence we parse it only once. Items without a colon
        # are ignored.
        details = {}

        for item in fields[5].split(';'):

            parts = item.split(':')

            if len(parts) > 1:

                details[parts[0]] = set(parts[1].split(','))

        experiments = details.get('experiments', set())
        _sources = details.get('sources', set())

        if only_methods and not experiments & only_methods:

            continue

        # each filter is guarded by its own argument
        if only_sources and not _sources & only_sources:

            continue

        # without species annotation we assume human
        _organisms = {9606}

        if 'species' in details:

            _organisms = {
                taxonomy.ensure_ncbi_tax_id(spec.split('(')[0].strip())
                for spec in details['species']
            }
            _organisms.discard(None)

            if only_human and 9606 not in _organisms:

                continue

        record_methods = tps(experiments) if methods else None
        record_references = (
            tps(details.get('pmids', ())) if references else None
        )
        record_sources = tps(_sources) if sources else None
        record_organisms = tps(_organisms) if organisms else None

        ids_a = (
            mapping.map_name(fields[0], 'uniprot-entry', 'uniprot') |
            mapping.map_name(fields[1], 'entrez', 'uniprot')
        )
        ids_b = (
            mapping.map_name(fields[2], 'uniprot-entry', 'uniprot') |
            mapping.map_name(fields[3], 'entrez', 'uniprot')
        )

        for id_a, id_b in itertools.product(ids_a, ids_b):

            result.add(
                HippieInteraction(
                    id_a=id_a,
                    id_b=id_b,
                    score=score,
                    methods=record_methods,
                    references=record_references,
                    sources=record_sources,
                    organisms=record_organisms,
                ))

    return list(result)
Exemplo n.º 4
0
def iptmnet_interactions(organism=9606):
    """
    Yields enzyme-substrate PTM records from iPTMnet.

    First the score table is read into a lookup dict, then the PTM table
    is processed line by line and each record is complemented by its
    score, if available.

    :param int organism:
        Only PTM records whose organism field resolves (by
        ``taxonomy.ensure_ncbi_tax_id``) to this value are yielded.
        A false value disables the organism filter.

    :return:
        Generator of ``IptmnetInteraction`` records, one for each enzyme
        identifier. ``resaa`` is the residue letter and ``resnum`` is the
        residue number as integer; ``score`` is ``None`` if the score
        table has no matching entry.
    """

    ptm_url = urls.urls['iptmnet']['ptms']
    score_url = urls.urls['iptmnet']['scores']

    c = curl.Curl(score_url, large=True, silent=False)

    scores = {}

    for line in c.result:

        line = line.strip('\n\r').split('\t')

        if not line[2]:

            continue

        site = resite.match(line[1])

        if not site:

            continue

        resaa, resnum = site.groups()

        resnum = int(resnum)
        score = int(line[4])
        substrate, isoform = inputs_common._try_isoform(line[0])
        enzyme = line[2]

        key = (
            enzyme,
            substrate,
            isoform,
            line[3].lower(),  # PTM type
            resaa,
            resnum,
        )

        scores[key] = score

    c = curl.Curl(ptm_url, large=True, silent=False)

    for line in c.result:

        line = line.strip('\n\r').split('\t')

        # blank or truncated lines and records without enzyme are skipped
        if len(line) < 10 or not line[6]:

            continue

        ncbi_tax_id = taxonomy.ensure_ncbi_tax_id(line[4].strip())

        if organism and ncbi_tax_id != organism:

            continue

        # same pattern and same group order as for the score table above
        site = resite.match(line[5])

        if not site:

            continue

        resaa, resnum = site.groups()
        resnum = int(resnum)

        substrate, s_isoform = inputs_common._try_isoform(line[2])
        ptm_type = line[0].lower()

        enzyme, e_isoform = inputs_common._try_isoform(line[6])

        enzyme_ids = (mapping.map_name(
            line[6],
            'pro',
            'uniprot',
            ncbi_tax_id=organism,
        ) if line[6].startswith('PR:') else (enzyme, ))

        refs = line[9].split(',')

        # The key must be built the same way as the keys of `scores`:
        # raw enzyme ID, substrate and its isoform, PTM type, residue
        # letter and residue number as integer.
        key = (
            line[6],
            substrate,
            s_isoform,
            ptm_type,
            resaa,
            resnum,
        )

        score = scores.get(key)

        for _enzyme in enzyme_ids:

            yield IptmnetInteraction(
                enzyme=_enzyme,
                substrate=substrate,
                enzyme_isoform=e_isoform,
                substrate_isoform=s_isoform,
                ptm_type=ptm_type,
                resaa=resaa,
                resnum=resnum,
                score=score,
                references=refs,
            )
Exemplo n.º 5
0
def get_pfam(uniprots=None, organism=9606):
    """
    Retrieves the Pfam domain annotations of proteins from UniProt.

    :param list,set uniprots:
        UniProt accessions to query. If ``None``, all SwissProt IDs of
        ``organism`` are retrieved by ``uniprot_input.all_uniprots``; if
        that also gives ``None``, one single query is sent for all the
        reviewed proteins of the organism.
    :param int,str organism:
        Organism, used only if ``uniprots`` is not provided.

    :return:
        A tuple of two dicts: the first with UniProt accessions as keys
        and lists of Pfam accessions as values, the second the other way
        around. ``(None, None)`` if the download failed or the organism
        could not be recognized.
    """

    if uniprots is None:

        uniprots = uniprot_input.all_uniprots(
            organism=organism,
            swissprot=True,
        )

    u_pfam = {}
    pfam_u = {}

    if uniprots is not None:

        # the batches are created by slicing, which needs a sequence
        uniprots = list(uniprots)

        prg = progress.Progress(
            len(uniprots) / 30,
            'Downloading data from UniProt',
            1,
        )
        data_all = []

        for i in range(0, len(uniprots), 30):

            batch = uniprots[i:i + 30]
            get = {
                'query': ' OR '.join(['accession:%s' % u for u in batch]),
                'format': 'tab',
                'columns': 'id,database(Pfam)'
            }
            data = None

            # up to 3 attempts for each batch
            for _ in range(3):

                c = curl.Curl(urls.urls['uniprot_basic']['url'], get=get)
                data = c.result

                if data is not None:

                    break

            if data is None:

                return None, None

            # Only the header is removed here, blank lines are skipped
            # at parsing below; this way the last record is never lost.
            data_all += data.split('\n')[1:]
            prg.step()

        prg.terminate()

    else:

        organism = taxonomy.ensure_ncbi_tax_id(organism)

        if not organism:

            return None, None

        get = {
            'query': 'organism:%u AND reviewed:yes' % organism,
            'format': 'tab',
            'columns': 'id,database(Pfam)'
        }
        data_all = None

        for _ in range(3):

            c = curl.Curl(
                urls.urls['uniprot_basic']['url'],
                get=get,
                silent=False,
                outf='uniprot-pfam-%u.tab' % organism,
            )
            data_all = c.result

            if data_all is not None:

                break

        if data_all is None:

            # same shape as at the other failures, so callers can
            # always unpack the result
            return None, None

        data_all = data_all.split('\n')[1:]

    for line in data_all:

        fields = line.split('\t')

        # blank (e.g. trailing) or incomplete lines
        if len(fields) < 2:

            continue

        uniprot = fields[0]
        pfams = re.sub(r';$', '', fields[1]).strip()
        pfams = pfams.split(';') if pfams else []

        u_pfam.setdefault(uniprot, []).extend(pfams)

        for pfam in pfams:

            pfam_u.setdefault(pfam, []).append(uniprot)

    return u_pfam, pfam_u
Exemplo n.º 6
0
def gpcrdb_annotations(organism=9606):
    """
    Collects class, family and subfamily annotations of GPCRs from the
    GPCRdb families table.

    The hierarchy is read from the indentation of the lines: a line with
    no leading space sets the class, a non-space character at index 4
    sets the family and at index 8 the subfamily. Deeper lines starting
    with ``gpcr`` (after removing whitespace and quotes) are the records.

    :param int,str organism:
        Only human and mouse (9606 and 10090) are supported.

    :return:
        A dict with UniProt IDs as keys and sets of ``GpcrdbAnnotation``
        tuples as values. Empty dict for unsupported organisms.
    """

    GpcrdbAnnotation = collections.namedtuple(
        'GpcrdbAnnotation',
        ['gpcr_class', 'family', 'subfamily'],
    )

    organism = taxonomy.ensure_ncbi_tax_id(organism)

    if organism not in (9606, 10090):

        return {}

    # index of the field with the UniProt ID: 31 for mouse, 15 for human
    i_uniprot = 15 if organism != 10090 else 31

    c = curl.Curl(urls.urls['gpcrdb']['families'], silent=False, large=True)

    annotations = collections.defaultdict(set)

    for row in c.result:

        if row[0] != ' ':

            # a new class resets the lower levels
            cls = row.split('|')[0].strip()
            family = subfamily = None
            continue

        if row[4] != ' ':

            # a new family resets the subfamily
            family = row.strip()
            subfamily = None
            continue

        if row[8] != ' ':

            subfamily = row.strip()
            continue

        entry = row.strip().strip('"')

        if not entry.startswith('gpcr'):

            continue

        uniprot = entry.split('","')[i_uniprot]

        if uniprot:

            annotations[uniprot].add(
                GpcrdbAnnotation(
                    gpcr_class=cls,
                    family=family,
                    subfamily=subfamily,
                )
            )

    return dict(annotations)
Exemplo n.º 7
0
def _cellchatdb_organism(organism = 9606):
    """
    Resolves the organism to one of the two accepted NCBI Taxonomy IDs.

    :param int,str organism:
        Any value accepted by ``taxonomy.ensure_ncbi_tax_id``.

    :return:
        10090 if the organism resolves to 10090, otherwise 9606.
    """

    return (
        10090
        if taxonomy.ensure_ncbi_tax_id(organism) == 10090 else
        9606
    )
Exemplo n.º 8
0
    def __init__(
        self,
        components,
        ncbi_tax_id=9606,
        name=None,
        ids=None,
        sources=None,
        interactions=None,
        references=None,
        proteins=None,
        attrs=None,
    ):
        """
        Creates an object representing a molecular complex.

        components : list,dict
            The members of the complex: either a dict with identifiers as
            keys and stoichiometric coefficients as values, or a list of
            identifiers where the stoichiometry is given by the number of
            times each identifier is repeated.
        ncbi_tax_id : int
            NCBI taxonomy identifier of the complex. It implies all members
            of the complex belong to the same organism. Support for multi-
            organism complexes will be implemented in the future.
        name : str
            A custom name or identifier of the complex.
        ids : dict
            Identifiers. If ``sources`` is a set, list or tuple it should be
            a dict with database names as keys and set of identifiers as
            values. If ``sources`` is a string, it can be a set of
            identifiers or a single identifier.
        sources : set,str
            Database(s) the complex has been defined in.
        interactions : list,dict
            Interactions between the components of the complex. Either
            a list of tuples of component IDs or a dict with tuples as
            keys and custom interaction properties as values.
        references : set,str
            References; stored after conversion by ``common.to_set``.
        proteins : list,dict
            Synonym for `components`, kept for compatibility; used only
            if `components` is empty or None.
        attrs : dict
            Custom attributes, copied into the ``attrs`` dict of the
            instance. Anything else than a dict is ignored.
        """

        members = components or proteins

        # lists are converted to a dict of counts
        self.components = (
            members
            if isinstance(members, dict) else
            dict(collections.Counter(members))
        )
        self.proteins = self.components

        self.name = name

        # the IDs are registered before `sources` is stored
        self.ids = collections.defaultdict(set)
        self.add_ids(ids, source=sources)

        self.sources = common.to_set(sources)
        self.references = common.to_set(references)
        self.ncbi_tax_id = taxonomy.ensure_ncbi_tax_id(ncbi_tax_id)
        self.attrs = dict(attrs) if isinstance(attrs, dict) else {}
        self.interactions = interactions