示例#1
0
def lit_bm_interactions():
    """
    Literature collected interactions from Luck 2020.

    Yields named tuples with two fields, ``uniprot_a`` and ``uniprot_b``.
    """

    record = collections.namedtuple(
        'LitBmInteraction',
        ('uniprot_a', 'uniprot_b'),
    )

    c = curl.Curl(urls.urls['hid']['lit-bm'], large = True, silent = False)

    for line in c.result:

        fields = line.strip().split('\t')

        # both columns carry Ensembl gene IDs; each may map to
        # multiple UniProt IDs, hence the cartesian product
        for uniprot_a, uniprot_b in itertools.product(
            mapping.map_name(fields[0], 'ensembl', 'uniprot'),
            mapping.map_name(fields[1], 'ensembl', 'uniprot'),
        ):

            yield record(uniprot_a = uniprot_a, uniprot_b = uniprot_b)
示例#2
0
def homologene_uniprot_dict(source, target, only_swissprot=True):
    """
    Returns orthology translation table as dict from UniProt to UniProt,
    obtained from NCBI HomoloGene data. Uses RefSeq and Entrez IDs for
    translation.

    :param int source: NCBI Taxonomy ID of the source species (keys).
    :param int target: NCBI Taxonomy ID of the target species (values).
    :param bool only_swissprot: Translate only SwissProt IDs.
    """
    result = {}

    # HomoloGene orthology tables keyed by Entrez and RefSeq IDs
    hge = homologene_dict(source, target, 'entrez')
    hgr = homologene_dict(source, target, 'refseq')

    all_source = set(
        uniprot_input.all_uniprots(organism=source, swissprot='YES'))

    if not only_swissprot:
        all_source_trembl = uniprot_input.all_uniprots(organism=source,
                                                       swissprot='NO')
        all_source.update(set(all_source_trembl))

    for u in all_source:

        # translate the source UniProt to Entrez and RefSeq IDs
        source_e = mapping.map_name(u, 'uniprot', 'entrez', source)
        source_r = mapping.map_name(u, 'uniprot', 'refseqp', source)
        target_u = set([])
        target_r = set([])
        target_e = set([])

        # collect the orthologous Entrez and RefSeq IDs
        for e in source_e:
            if e in hge:
                target_e.update(hge[e])

        for r in source_r:
            if r in hgr:
                target_r.update(hgr[r])

        # translate the orthologues to UniProt IDs of the target species
        for e in target_e:
            target_u.update(mapping.map_name(e, 'entrez', 'uniprot', target))

        for r in target_r:
            # BUGFIX: previously translated `e` (a leftover from the loop
            # above) instead of the RefSeq ID `r`
            target_u.update(mapping.map_name(r, 'refseqp', 'uniprot', target))


        # make sure all target IDs are primary, current UniProt IDs
        target_u = \
            itertools.chain(
                *map(
                    lambda tu:
                        mapping.map_name(tu, 'uniprot', 'uniprot', target),
                    target_u
                )
            )

        result[u] = sorted(list(target_u))

    return result
示例#3
0
def depod_enzyme_substrate(organism=9606):
    """
    Downloads dephosphorylation (phosphatase-substrate) data from the
    DEPOD database.

    :param int organism:
        NCBI Taxonomy ID; only records with substrates of this
        organism are included.

    :return:
        List of dicts, each describing one dephosphorylation event
        (enzyme, substrate, residue and references).
    """

    result = []

    # extracts UniProt IDs from MITAB fields like `uniprotkb:P00533`
    reunip = re.compile(r'uniprotkb:([A-Z0-9]+)')
    # matches residues given as 3-letter code + position, e.g. `Tyr-1234`
    reptm = re.compile(r'([A-Z][a-z]{2})-([0-9]+)')
    # splits PubMed ID lists separated by comma or pipe
    repmidsep = re.compile(r'[,|]\s?')

    # first URL: the main tab separated substrate table
    url = urls.urls['depod']['urls'][0]
    c = curl.Curl(url, silent=False, encoding='ascii')
    data = c.result
    data = [x.split('\t') for x in data.split('\n')]
    del data[0]  # drop the header row

    # second URL: the same data in MITAB format, carrying the UniProt IDs
    url_mitab = urls.urls['depod']['urls'][1]
    c_mitab = curl.Curl(url_mitab, silent=False, encoding='iso-8859-1')
    data_mitab = c_mitab.result
    data_mitab = [x.split('\t') for x in data_mitab.split('\n')]
    del data_mitab[0]  # drop the header row

    # NOTE(review): assumes the two downloads are row-aligned, i.e. row i
    # of `data` describes the same interaction as row i of `data_mitab`
    # -- TODO confirm against the DEPOD download format
    for i, l in enumerate(data):

        # keep only protein substrates of the requested organism
        # with a known dephosphorylation site (column 4)
        if (len(l) > 6 and l[2] == 'protein substrate'
                and taxonomy.ensure_ncbi_tax_id(l[3].split('(')[0].strip())
                == organism and l[4].strip() != 'N/A'):

            enzyme_uniprot = reunip.search(data_mitab[i][0]).groups()[0]
            substrate_uniprot = reunip.search(data_mitab[i][1]).groups()[0]

            # uniprot->uniprot mapping ensures primary, current IDs;
            # either side may map to multiple IDs
            for enzyme_up, substrate_up in itertools.product(
                    mapping.map_name(enzyme_uniprot, 'uniprot', 'uniprot'),
                    mapping.map_name(substrate_uniprot, 'uniprot', 'uniprot'),
            ):

                # one record per dephosphorylated residue
                for resaa, resnum in reptm.findall(l[4]):

                    resnum = int(resnum)
                    # translate 3-letter to 1-letter amino acid code
                    # where possible, otherwise keep the original
                    resaa = (common.aminoa_3_to_1_letter[resaa] if resaa
                             in common.aminoa_3_to_1_letter else resaa)

                    result.append({
                        'instance': None,
                        'kinase': enzyme_up,
                        'resaa': resaa,
                        'resnum': resnum,
                        'references': repmidsep.split(l[6].strip()),
                        'substrate': substrate_up,
                        'start': None,
                        'end': None,
                        'typ': 'dephosphorylation',
                    })

    return result
示例#4
0
    def homologene_uniprot_dict(self, source):
        """
        Builds orthology translation table as dict from UniProt to UniProt,
        obtained from NCBI HomoloGene data. Uses RefSeq and Entrez IDs for
        translation.

        :param source:
            Source organism, resolved by ``get_source``.
        """

        source = self.get_source(source)

        # BUGFIX: the attribute name was garbled (`h**o`, invalid Python);
        # restored to `homo`
        self.homo[source] = {}

        # HomoloGene orthology tables keyed by Entrez and RefSeq IDs
        hge = homologene_dict(source, self.target, 'entrez')
        hgr = homologene_dict(source, self.target, 'refseq')

        self.load_proteome(source, self.only_swissprot)

        for u in self._proteomes[(source, self.only_swissprot)]:

            # translate the source UniProt to Entrez and RefSeq IDs
            source_e = mapping.map_name(u, 'uniprot', 'entrez', source)
            source_r = mapping.map_name(u, 'uniprot', 'refseqp', source)
            target_u = set([])
            target_r = set([])
            target_e = set([])

            # collect the orthologous Entrez and RefSeq IDs
            for e in source_e:
                if e in hge:
                    target_e.update(hge[e])

            for r in source_r:
                if r in hgr:
                    target_r.update(hgr[r])

            # translate the orthologues to UniProts of the target organism
            for e in target_e:
                target_u.update(
                    set(mapping.map_name(e, 'entrez', 'uniprot', self.target)))

            for r in target_r:
                # BUGFIX: previously translated `e` (a leftover from the
                # loop above) instead of the RefSeq ID `r`
                target_u.update(
                    set(mapping.map_name(r, 'refseqp', 'uniprot',
                                         self.target)))

            # make sure all target IDs are primary, current UniProt IDs
            target_u = \
                itertools.chain(
                    *map(
                        lambda tu:
                            mapping.map_name(
                                tu, 'uniprot', 'uniprot', self.target),
                        target_u
                    )
                )

            self.homo[source][u] = sorted(list(target_u))
示例#5
0
    def build_gene(self):
        """
        Compiles the gene records from the loaded entities into
        ``self.cpdb_gene``, a set of ``CellPhoneDBGene`` tuples.
        """

        self.cpdb_gene = set()

        for entity in self._entities:

            # we add the components of the complexes to the protein data
            # frame; I don't know if it's necessary but does not harm I guess
            components = (
                entity.components
                    if hasattr(entity, 'components') else
                (entity,)
            )

            for comp in components:

                genesymbol = mapping.map_name0(comp, 'uniprot', 'genesymbol')

                for ensembl in mapping.map_name(comp, 'uniprot', 'ensembl'):

                    self.cpdb_gene.add(
                        CellPhoneDBGene(
                            gene_name = genesymbol,
                            uniprot = comp,
                            hgnc_symbol = genesymbol,
                            ensembl = ensembl,
                        )
                    )
示例#6
0
def cellcellinteractions_annotations():
    """
    Downloads and processes the annotation table of the
    cellcellinteractions resource.

    Returns dict of sets of named tuples (``mainclass`` field only),
    keyed by UniProt IDs.
    """

    record = collections.namedtuple(
        'CellcellinteractionsAnnotation',
        ['mainclass'],
    )

    c = curl.Curl(
        urls.urls['cellcellinteractions']['url'],
        silent = False,
        large = True,
    )

    # skip the header line
    _ = next(c.result)

    result = collections.defaultdict(set)

    for line in c.result:

        fields = line.strip('\r\n').split('\t')

        # column 0: gene symbol; column 1: slash separated classes
        for uniprot, cls in itertools.product(
            mapping.map_name(fields[0], 'genesymbol', 'uniprot'),
            fields[1].split('/'),
        ):

            result[uniprot].add(record(mainclass = cls))

    return dict(result)
示例#7
0
文件: huri.py 项目: rfour92/pypath
    def _map_ids(_id):
        """
        Translates one identifier to UniProt IDs; Ensembl protein and
        transcript IDs are recognised by their prefix, any other input
        is assumed to be a UniProt ID already.
        """

        prefix = _id[:4]
        id_type = prefix.lower() if prefix in {'ensp', 'enst'} else 'uniprot'

        return mapping.map_name(_id, id_type, 'uniprot')
示例#8
0
文件: lrdb.py 项目: rfour92/pypath
def lrdb_annotations():
    """
    Ligand and receptor annotations from LRdb.

    Returns dict of sets of ``LrdbAnnotation`` tuples keyed by
    UniProt IDs.
    """

    result = collections.defaultdict(set)

    for rec in lrdb_interactions():

        for role in ('ligand', 'receptor'):

            uniprots = mapping.map_name(
                getattr(rec, '%s_genesymbol' % role),
                'genesymbol',
                'uniprot',
            )

            for uniprot in uniprots:

                # fall back to a single unknown cell type
                cell_types = getattr(rec, '%s_cells' % role) or (None,)

                for cell_type in cell_types:

                    # normalise cell type labels from the source data
                    if cell_type == 'tymphocyte':
                        cell_type = 'T lymphocyte'
                    elif cell_type:
                        cell_type = cell_type.replace('cells', 'cell')
                    else:
                        cell_type = None

                    result[uniprot].add(
                        LrdbAnnotation(
                            role = role,
                            cell_type = cell_type,
                            sources = tuple(sorted(rec.sources)),
                            references = tuple(sorted(rec.references)),
                        )
                    )

    return dict(result)
示例#9
0
文件: netpath.py 项目: rfour92/pypath
def netpath_pathway_annotations():
    """
    Downloads and processes pathway memberships from NetPath.

    :return:
        Dict of sets of ``NetpathPathway`` tuples keyed by UniProt IDs.
    """

    NetpathPathway = collections.namedtuple(
        'NetpathPathway',
        ['pathway'],
    )

    result = collections.defaultdict(set)

    url_template = urls.urls['netpath_pw']['url']

    # fetch the main page first only to obtain session cookies,
    # which the per-pathway pages require
    url_main = urls.urls['netpath_pw']['mainpage']
    c = curl.Curl(url_main, cache = False)
    cookie = [
        h.decode().split(':')[1].split(';')[0].strip()
        for h in c.resp_headers
        if h.startswith(b'Set-Cookie')
    ]
    cookie_hdr = ['Cookie: %s' % '; '.join(cookie)]

    # dict of NetPath pathway IDs to pathway names
    pathway_ids = netpath_names()

    for _id, pathway in iteritems(pathway_ids):

        url = url_template % int(_id)
        c = curl.Curl(
            url,
            req_headers = cookie_hdr,
            silent = False,
            encoding = 'iso-8859-1',
        )

        soup = bs4.BeautifulSoup(c.result, 'html.parser')

        # locate the table whose header starts with `Molecules Invol...`;
        # all other tables on the page are ignored
        for tbl in soup.find_all('table'):
            hdr = tbl.find('td', {'class': 'barhead'})

            if not hdr or not hdr.text.strip().startswith('Molecules Invol'):
                continue

            for td in tbl.find_all('td'):
                genesymbol = td.text.strip()

                if not genesymbol:
                    continue

                uniprots = mapping.map_name(
                    genesymbol,
                    'genesymbol',
                    'uniprot',
                )

                for uniprot in uniprots:
                    result[uniprot].add(
                        NetpathPathway(
                            pathway = pathway
                        )
                    )

    return result
示例#10
0
def cspa_cell_types(organism = 9606):
    """
    Cell type specific values from the Cell Surface Protein Atlas (CSPA).

    Returns dict of dicts: cell type -> UniProt ID -> float value
    (``None`` where the cell of the table is not numeric).

    :param int organism:
        NCBI Taxonomy ID; human (9606) and mouse (10090) are available.
    """

    sheets = {
        'Human': 'Table_E',
        'Mouse': 'Table_F',
    }

    str_organism = taxonomy.taxids[organism].capitalize()

    c = curl.Curl(urls.urls['cspa']['url_s1'], large = True, silent = False)
    xlsname = c.fname
    del c
    raw = inputs_common.read_xls(xlsname, sheets[str_organism])

    result = collections.defaultdict(lambda: collections.defaultdict(dict))

    # the first row holds the cell type labels (first column is the ID)
    cell_types = raw[0][1:]

    for row in raw[1:]:

        for uniprot in mapping.map_name(row[0], 'uniprot', 'uniprot'):

            for col, cell_type in enumerate(cell_types):

                value = row[col + 1]

                result[cell_type][uniprot] = (
                    float(value) if common.is_float(value) else None
                )

    return result
示例#11
0
def disgenet_annotations(dataset='curated'):
    """
    Downloads and processes the list of all human disease related proteins
    from DisGeNet.
    Returns dict of dicts.

    @dataset : str
        Name of DisGeNet dataset to be obtained:
        `curated`, `literature`, `befree` or `all`.
    """

    record = collections.namedtuple('DisGeNetAnnotation', [
        'disease',
        'score',
        'dsi',
        'dpi',
        'nof_pmids',
        'nof_snps',
        'source',
    ])

    c = curl.Curl(
        urls.urls['disgenet']['url'] % dataset,
        silent=False,
        large=True,
        encoding='utf-8',
        default_mode='r',
    )

    data = collections.defaultdict(set)

    for rec in csv.DictReader(c.result, delimiter='\t'):

        # gene symbols may translate to zero or more UniProt IDs
        for uniprot in mapping.map_name(
            rec['geneSymbol'],
            'genesymbol',
            'uniprot',
        ):

            data[uniprot].add(
                record(
                    disease=rec['diseaseName'],
                    # DSI and DPI may be missing
                    score=float(rec['score']),
                    dsi=float(rec['DSI']) if rec['DSI'] else None,
                    dpi=float(rec['DPI']) if rec['DPI'] else None,
                    nof_pmids=int(rec['NofPmids']),
                    nof_snps=int(rec['NofSnps']),
                    source=tuple(
                        s.strip() for s in rec['source'].split(';')
                    ),
                )
            )

    return data
示例#12
0
文件: kea.py 项目: rfour92/pypath
def kea_interactions():
    """
    Kinase-substrate interactions from KEA.

    Returns list of ``KeaRecord`` named tuples.
    """

    record = collections.namedtuple('KeaRecord', [
        'enzyme',
        'substrate',
        'residue_type',
        'residue_offset',
        'pmid',
        'resource',
    ])

    # matches substrate sites in the form `GENESYMBOL_X123`
    resub = re.compile(r'(\w+)_([A-Z])([0-9]+)')

    c = curl.Curl(
        urls.urls['kea']['kinase_substrate'],
        silent=False,
        large=True,
    )

    result = []

    for line in c.result:

        fields = line.strip().split('\t')

        site = resub.match(fields[1].strip())

        # skip lines where the substrate site can not be parsed
        if not site:

            continue

        target, resaa, resnum = site.groups()

        for enz, sub in itertools.product(
            mapping.map_name(fields[0], 'genesymbol', 'uniprot'),
            mapping.map_name(target, 'genesymbol', 'uniprot'),
        ):

            result.append(
                record(
                    enzyme=enz,
                    substrate=sub,
                    residue_type=resaa,
                    residue_offset=int(resnum),
                    pmid=fields[2].strip(),
                    resource=_resources[fields[3].strip()],
                )
            )

    return result
示例#13
0
def almen2009_annotations():
    """
    Membrane protein annotations from the supplementary table of
    Almen et al. 2009.

    Returns dict of sets of named tuples keyed by UniProt IDs.
    """

    # classes are separated either by semicolon or slash
    resep = re.compile(r'[;/]')

    record = collections.namedtuple(
        'Almen2009Annotation',
        [
            'mainclass',
            'classes',
            'phobius_secreted',
            'phobius_transmembrane',
            'sosui_transmembrane',
            'tmhmm_transmembrane',
        ]
    )

    c = curl.Curl(urls.urls['almen2009']['url'], silent = False, large = True)

    xls = c.fileobj
    xlsfile = xls.name
    xls.close()
    tbl = inputs_common.read_xls(xlsfile, sheet = 'Data')[1:]

    result = collections.defaultdict(set)

    for row in tbl:

        # column 0 contains IPI identifiers
        uniprots = mapping.map_name(row[0], 'ipi', 'uniprot')

        annot = record(
            mainclass = row[2],
            # fix a typo in the source data before splitting
            classes = tuple(
                sorted(resep.split(row[3].replace('KInase', 'Kinase')))
            ),
            phobius_secreted = row[6] == 'Y',
            phobius_transmembrane = int(float(row[5])),
            # SOSUI occasionally reports `ERROR` instead of a number
            sosui_transmembrane = (
                int(float(row[8])) if row[8] != 'ERROR' else 0
            ),
            tmhmm_transmembrane = int(float(row[10])),
        )

        for uniprot in uniprots:

            result[uniprot].add(annot)

    return result
示例#14
0
    def process_name(name):
        """
        Translates one name to UniProt IDs; names of known complexes
        are resolved from ``complexes_by_name`` instead of the ID
        translation service.
        """

        if name in complexes_by_name:

            return (complexes_by_name[name],)

        return mapping.map_name(
            name,
            'genesymbol',
            'uniprot',
            ncbi_tax_id = ncbi_tax_id,
        )
示例#15
0
def signalink_annotations(organism = 9606):
    """
    Pathway and function annotations from SignaLink.

    Returns a dict with two keys, ``pathway`` and ``function``, each
    holding a dict of sets of named tuples keyed by UniProt IDs.
    """

    SignalinkPathway = collections.namedtuple(
        'SignalinkPathway',
        ['pathway'],
    )

    SignalinkFunction = collections.namedtuple(
        'SignalinkFunction',
        ['function'],
    )

    result = {
        'pathway': collections.defaultdict(set),
        'function': collections.defaultdict(set),
    }

    for i in signalink_interactions(organism = organism):

        # process both interacting partners
        for postfix in ('_a', '_b'):

            _id = getattr(i, 'id%s' % postfix)

            for uniprot in mapping.map_name(_id, 'uniprot', 'uniprot'):

                for attr, record in (
                    ('pathway', SignalinkPathway),
                    ('function', SignalinkFunction),
                ):

                    # e.g. `pathways_a`, `functions_b`
                    for value in getattr(i, '%ss%s' % (attr, postfix)):

                        result[attr][uniprot].add(record(value))

    return result
示例#16
0
def cspa_annotations(organism = 9606):
    """
    Cell surface protein annotations from the Cell Surface Protein
    Atlas (CSPA).

    :param int organism:
        NCBI Taxonomy ID; human (9606) and mouse (10090) are available.
    """

    record = collections.namedtuple(
        'CspaAnnotation',
        [
            'high_confidence',
            'n_cell_types',
            'tm',
            'gpi',
            'uniprot_cell_surface',
        ],
    )

    sheets = {
        'Human': 'Table A',
        'Mouse': 'Table B',
    }

    str_organism = taxonomy.taxids[organism].capitalize()

    c = curl.Curl(urls.urls['cspa']['url_s2'], large = True, silent = False)
    xlsname = c.fname
    del c
    raw = inputs_common.read_xls(xlsname, sheets[str_organism])[1:]

    result = collections.defaultdict(set)

    for row in raw:

        for uniprot in mapping.map_name(row[1], 'uniprot', 'uniprot'):

            result[uniprot].add(
                record(
                    high_confidence = 'high confidence' in row[2],
                    n_cell_types = int(float(row[9])),
                    tm = int(float(row[11])),
                    gpi = int(float(row[12])),
                    uniprot_cell_surface = row[13] == 'yes',
                )
            )

    return dict(result)
示例#17
0
文件: embrace.py 项目: rfour92/pypath
def _embrace_id_translation(mouse_genesymbol, organism=9606):
    """
    Translates a mouse gene symbol to UniProt IDs of the requested
    organism; non-mouse targets go through orthology translation.
    Returns a list with at least one element (``None`` if no
    translation is available).
    """

    uniprots = mapping.map_name(
        mouse_genesymbol,
        'genesymbol',
        'uniprot',
        ncbi_tax_id=10090,
    )

    if organism != 10090:

        # orthology translation from mouse to the requested organism
        uniprots = homology.translate(
            uniprots,
            source=10090,
            target=organism,
        )

    return uniprots or [None]
示例#18
0
def surfaceome_annotations():
    """
    Downloads the "In silico human surfaceome".
    Dict with UniProt IDs as key and tuples of surface prediction score,
    class and subclass as values (columns B, N, S and T of table S3).
    """

    c = curl.Curl(urls.urls['surfaceome']['url'], large=True, silent=False)
    xlsname = c.fname
    del c
    raw = inputs_common.read_xls(xlsname, 'in silico surfaceome only')[2:]

    result = {}

    for r in raw:

        for uniprot in mapping.map_name(r[1], 'uniprot', 'uniprot'):

            result[uniprot] = (
                float(r[13]),  # score
                r[18] if r[18] else None,  # class
                # subclass; fix a typo in the source data first
                set(r[19].replace('KInase', 'Kinase').split(';'))
                    if r[19] else set(),
            )

    return result
示例#19
0
def cancersea_annotations():
    """
    Retrieves genes annotated with cancer functional states from the
    CancerSEA database.
    """

    record = collections.namedtuple(
        'CancerseaAnnotation',
        ['state'],
    )

    annotations = collections.defaultdict(set)

    c = curl.Curl(urls.urls['cancersea']['url'], silent=False, large=False)
    soup = bs4.BeautifulSoup(c.result, 'html.parser')

    # the second table lists the functional states, each with a link
    # to its per-state data file in the last column
    for row in soup.find_all('tbody')[1].find_all('tr'):

        cells = row.find_all('td')
        state = cells[0].text
        url_end = cells[-1].find('a').attrs['href']

        c_data = curl.Curl(
            urls.urls['cancersea']['data_url'] % url_end,
            silent=False,
            large=True,
        )

        # skip the header line
        _ = next(c_data.result)

        for line in c_data.result:

            fields = line.strip().split('\t')

            # column 1 holds the gene symbol
            for uniprot in mapping.map_name(
                fields[1], 'genesymbol', 'uniprot'
            ):

                annotations[uniprot].add(record(state=state))

    return dict(annotations)