Exemplo n.º 1
0
def kegg_pathways():
    """
    Groups the KEGG interactions by pathway.

    Calls ``kegg_interactions()`` and collects, for every pathway, the
    interaction partners and the partner pairs annotated to it.

    Only the first, second and fourth field of each record are read
    (partner A, partner B and pathway), hence both plain 4-tuples and
    longer records carrying additional trailing fields are accepted.

    :return:
        Tuple of two dicts, both keyed by pathway: the first maps to the
        set of partners, the second to the set of
        ``(partner_a, partner_b)`` tuples.
    """

    data = kegg_interactions()
    pws = common.uniq_list([rec[3] for rec in data])
    proteins_pws = {pw: set() for pw in pws}
    interactions_pws = {pw: set() for pw in pws}

    for rec in data:
        # positional access instead of fixed-length unpacking, so records
        # with more than four fields do not raise ValueError
        u1, u2, pw = rec[0], rec[1], rec[3]
        proteins_pws[pw].update((u1, u2))
        interactions_pws[pw].add((u1, u2))

    return proteins_pws, interactions_pws
Exemplo n.º 2
0
def li2012_interactions():
    """
    Builds a list of interactions from the table returned by
    ``get_li2012``.

    Every row of the table contributes two records: a
    ``phosphorylation`` record and a ``phosphomotif_binding`` record,
    both carrying the value of the fifth column as the route.
    Duplicates are removed and each record is returned as a list.
    """

    records = []

    for row in get_li2012():

        # text before the slash in the second column
        substrate = row[1].split('/')[0]
        # first whitespace separated token of the third and fourth columns
        kinase = row[2].split()[0]
        reader = row[3].split()[0]
        route = row[4]

        records.extend((
            (kinase, substrate, route, 'phosphorylation'),
            (substrate, reader, route, 'phosphomotif_binding'),
        ))

    return list(map(list, common.uniq_list(records)))
Exemplo n.º 3
0
def li2012_enzyme_substrate():
    """
    Builds a list of kinase-substrate records from the table returned
    by ``get_li2012``.

    Returns a list of dicts with the keys ``substrate``, ``kinase``,
    ``instance``, ``start``, ``end``, ``resaa`` and ``resnum``.
    ``instance``, ``start`` and ``end`` are always ``None`` and ``resaa``
    is always ``'Y'``. Duplicate records are removed.
    """

    keys = (
        'substrate',
        'kinase',
        'instance',
        'start',
        'end',
        'resaa',
        'resnum',
    )
    not_a_digit = re.compile(r'[^\d]+')
    sites = []

    for row in get_li2012():

        # the second column reads like `<substrate>/<residue>`
        site_parts = row[1].split('/')
        substrate = site_parts[0]
        kinase = row[2].split()[0]
        # keep only the digits of the residue part
        resnum = int(not_a_digit.sub('', site_parts[1]))

        sites.append((substrate, kinase, None, None, None, 'Y', resnum))

    return [dict(zip(keys, site)) for site in common.uniq_list(sites)]
Exemplo n.º 4
0
def kegg_interactions():
    """
    Downloads and processes KEGG Pathways.

    Retrieves the pathway list, then the KGML of each pathway, maps the
    names found in the graphics of the entries from gene symbols to
    UniProt and expands each relation into pairwise interactions.

    Returns a list of unique tuples of
    ``(id_a, id_b, subtype name, pathway name)``.
    """

    hsa_pattern = re.compile(r'.*(hsa[0-9]+).*')
    headers = [
        'Referer: http://www.genome.jp/kegg-bin/show_pathway'
        '?map=hsa04710&show_description=show'
    ]

    listing = curl.Curl(urls.urls['kegg_pws']['list_url'], silent = True)
    listing_soup = bs4.BeautifulSoup(listing.result, 'html.parser')

    # pairs of (pathway identifier, text of the link)
    pathways = []

    for link in listing_soup.find_all('a', href = True):
        matched = hsa_pattern.match(link['href'])

        if matched:
            pathways.append((matched.group(1), link.text))

    prg = progress.Progress(
        len(pathways), 'Processing KEGG Pathways', 1, percent = False)
    collected = []

    for hsa, pw in pathways:

        prg.step()
        download = curl.Curl(
            urls.urls['kegg_pws']['kgml_url_2'] % hsa,
            silent = True,
            req_headers = headers,
        )
        soup = bs4.BeautifulSoup(download.result, 'html.parser')

        # entry id -> names listed in the graphics element of the entry
        names_by_entry = {}

        for entry in soup.find_all('entry'):
            graphics = entry.find('graphics')

            if not graphics or 'name' not in graphics.attrs:
                continue

            raw_names = graphics.attrs['name'].replace('...', '')
            names_by_entry[entry.attrs['id']] = [
                name.strip() for name in raw_names.split(',')
            ]

        # entry id -> unique UniProt IDs of all names of the entry
        uniprots_by_entry = {}

        for eid, gns in iteritems(names_by_entry):
            mapped = [
                mapping.map_name(gn, 'genesymbol', 'uniprot', strict = True)
                for gn in gns
            ]
            uniprots_by_entry[eid] = common.uniq_list(
                common.flat_list(mapped))

        for rel in soup.find_all('relation'):
            subtype = rel.find('subtype')

            source_id = rel.attrs['entry1']

            if source_id not in uniprots_by_entry:
                continue

            target_id = rel.attrs['entry2']

            if target_id not in uniprots_by_entry:
                continue

            if not subtype or 'name' not in subtype.attrs:
                continue

            # all-against-all between the members of the two entries
            collected.extend(
                (u1, u2, subtype.attrs['name'], pw)
                for u1 in uniprots_by_entry[source_id]
                for u2 in uniprots_by_entry[target_id]
            )

    prg.terminate()

    return common.uniq_list(collected)
Exemplo n.º 5
0
def kegg_interactions():
    """
    Downloads and processes KEGG Pathways.

    Retrieves the pathway list, then the KGML of each pathway, maps the
    names found in the graphics of the entries from gene symbols to
    UniProt and expands each relation into pairwise interactions.

    :return:
        List of unique ``KeggInteraction`` named tuples with the fields

        * ``id_a``, ``id_b``: UniProt IDs of the partners
        * ``effect``: ``'inhibition'``, ``'activation'`` or ``'unknown'``;
          inhibition takes precedence if both kinds of terms are present
        * ``pathway``: text of the link of the pathway in the list page
        * ``mechanism``: the mechanism subtype names of the relation,
          sorted and joined by ``;`` (empty string if none)
        * ``is_direct``: ``False`` if the relation has an
          ``indirect effect`` subtype
        * ``transcriptional``: ``True`` if the relation has an
          ``expression`` or ``repression`` subtype
    """

    positive_terms = {'activation', 'expression'}
    negative_terms = {'inhibition', 'repression'}
    transc_terms = {'expression', 'repression'}
    mechanism_terms = {
        'phosphorylation',
        'binding/association',
        'dissociation',
        'ubiquitination',
        'dephosphorylation',
        'glycosylation',
        'state change',
        'methylation',
    }
    indirect_terms = {'indirect effect'}

    KeggInteraction = collections.namedtuple(
        'KeggInteraction',
        [
            'id_a',
            'id_b',
            'effect',
            'pathway',
            'mechanism',
            'is_direct',
            'transcriptional',
        ],
    )

    rehsa = re.compile(r'.*(hsa[0-9]+).*')
    req_hdrs = [
        'Referer: http://www.genome.jp/kegg-bin/show_pathway'
        '?map=hsa04710&show_description=show'
    ]
    hsa_list = []
    interactions = []

    c = curl.Curl(urls.urls['kegg_pws']['list_url'], silent=True)
    lstsoup = bs4.BeautifulSoup(c.result, 'html.parser')

    for a in lstsoup.find_all('a', href=True):
        m = rehsa.match(a['href'])

        if m:
            hsa_list.append((m.group(1), a.text))

    prg = progress.Progress(len(hsa_list),
                            'Processing KEGG Pathways',
                            1,
                            percent=False)

    for hsa, pw in hsa_list:

        prg.step()
        c = curl.Curl(urls.urls['kegg_pws']['kgml_url_2'] % hsa,
                      silent=True,
                      req_headers=req_hdrs)
        kgmlsoup = bs4.BeautifulSoup(c.result, 'html.parser')

        # entry id -> names listed in the graphics element of the entry
        entries = {}

        for ent in kgmlsoup.find_all('entry'):
            gr = ent.find('graphics')

            if gr and 'name' in gr.attrs:
                entries[ent.attrs['id']] = [
                    n.strip()
                    for n in gr.attrs['name'].replace('...', '').split(',')
                ]

        # entry id -> unique UniProt IDs of all names of the entry
        uentries = {
            eid: common.uniq_list(
                common.flat_list([
                    mapping.map_name(gn, 'genesymbol', 'uniprot', strict=True)
                    for gn in gns
                ]))
            for eid, gns in iteritems(entries)
        }

        for rel in kgmlsoup.find_all('relation'):

            # subtype elements without a name attribute carry no usable
            # information, skip them instead of raising KeyError
            subtypes = {
                st.attrs['name']
                for st in rel.find_all('subtype')
                if 'name' in st.attrs
            }

            if not (rel.attrs['entry1'] in uentries
                    and rel.attrs['entry2'] in uentries and subtypes):
                continue

            is_direct = not (indirect_terms & subtypes)

            if negative_terms & subtypes:
                effect = 'inhibition'
            elif positive_terms & subtypes:
                effect = 'activation'
            else:
                effect = 'unknown'

            # sorted, because the iteration order of a set of strings is
            # not reproducible and would yield unstable labels
            mechanism = ';'.join(sorted(mechanism_terms & subtypes))
            transcriptional = bool(transc_terms & subtypes)

            # all-against-all between the members of the two entries
            for u1 in uentries[rel.attrs['entry1']]:

                for u2 in uentries[rel.attrs['entry2']]:

                    interactions.append(
                        KeggInteraction(
                            id_a=u1,
                            id_b=u2,
                            effect=effect,
                            pathway=pw,
                            mechanism=mechanism,
                            is_direct=is_direct,
                            transcriptional=transcriptional,
                        ))

    prg.terminate()

    return common.uniq_list(interactions)
Exemplo n.º 6
0
def get_pubmed_data(pp, cachefile=None, htp_threshold=20):
    """
    For one PyPath object, obtains metadata for all PubMed IDs
    through NCBI E-utils.

    Records already present in the cache file are reused, only the
    missing ones are requested; the cache file is rewritten only if new
    records have been added to it.

    :param pp:
        ``pypath.PyPath`` object
    :param cachefile:
        Path of the pickle file used as cache. If ``None``, the value of
        the ``pubmed_cache`` setting is used.
    :param htp_threshold:
        The number of interactions for one reference
        above which the study is considered to be high-throughput.
        References of such studies are excluded. If ``None``, no
        reference is excluded.

    :return:
        Tuple of two data frames. ``points`` has one row for each unique
        combination of database, reference and edge, with the columns
        ``database``, ``pmid``, ``year``, ``journal`` and ``eid``.
        ``earliest`` has the earliest year for each database and edge,
        with the columns ``database``, ``none``, ``year``, ``none`` and
        ``eid``.
    """

    def to_frame(records, columns):
        # assigning column labels to a frame built from zero records
        # raises ValueError, hence the empty case is created directly
        if not records:
            return pd.DataFrame(columns=columns)

        frame = pd.DataFrame.from_records(records)
        frame.columns = columns

        return frame

    if cachefile is None:

        cachefile = settings.get('pubmed_cache')

    htrefs = None

    if htp_threshold is not None:
        pp.htp_stats()
        htrefs = pp.htp[htp_threshold]['htrefs']

    # a set in all cases, to have cheap membership tests below
    pubmeds = set(
        common.flat_list([[r.pmid for r in e['references']]
                          for e in pp.graph.es]))

    if htrefs is not None:
        pubmeds = pubmeds - htrefs

    notpmid = [i for i in pubmeds if not i.isdigit()]

    sys.stdout.write('\t:: Number of non PubMed ID references: %u\n' %
                     len(notpmid))

    pmdata = {}
    if os.path.exists(cachefile):
        sys.stdout.write('\t:: Loading data previously downloaded '
                         'from PubMed, from file `%s`\n' % cachefile)
        # NOTE: unpickling executes arbitrary code, the cache file must
        # come from a trusted location
        with open(cachefile, 'rb') as fp:
            pmdata = pickle.load(fp)

    missing = list(pubmeds - set(pmdata.keys()))
    sys.stdout.write('\t:: Downloading data from PubMed about %s papers\n' %
                     len(missing))
    cached_pubmeds_len = len(pmdata)
    pmdata_new = pubmed_input.get_pubmeds(missing)
    pmdata.update(pmdata_new)

    if len(pmdata) > cached_pubmeds_len:
        sys.stdout.write('\t:: Saving PubMed data to file `%s`\n' % cachefile)

        with open(cachefile, 'wb') as fp:
            pickle.dump(pmdata, fp)

    # the cache may contain records not referred by this network
    pmdata = {pmid: rec for pmid, rec in pmdata.items() if pmid in pubmeds}

    points = []
    earliest = []

    for e in pp.graph.es:

        for s, rs in iteritems(e['refs_by_source']):

            pms = [
                r.pmid for r in rs
                if (htrefs is None or r.pmid not in htrefs)
                and r.pmid in pmdata and 'pubdate' in pmdata[r.pmid]
            ]

            if not pms:
                continue

            # the first four characters of `pubdate` are read as the year
            years = {pm: int(pmdata[pm]['pubdate'][:4]) for pm in pms}
            earliest.append((s, 0, min(years.values()), '', e.index))

            for pm in pms:
                points.append(
                    (s, pm, years[pm], pmdata[pm]['source'], e.index))

    points = to_frame(
        common.uniq_list(points),
        ['database', 'pmid', 'year', 'journal', 'eid'],
    )
    earliest = to_frame(
        common.uniq_list(earliest),
        ['database', 'none', 'year', 'none', 'eid'],
    )

    return points, earliest
Exemplo n.º 7
0
 def write_set(self, id_list, setname, id_type, map_ids=True):
     """
     Stores a set of identifiers in ``self.sets`` under ``setname``.

     If ``map_ids`` is true, each element of ``id_list`` is passed to
     ``self.mapper.map_name`` to translate it from ``self.ids[id_type]``
     to ``self.target_id`` and the set consists of the mapped
     identifiers; otherwise the elements of ``id_list`` are stored as
     they are.
     """
     if map_ids:
         mapped = common.flat_list(
             self.mapper.map_name(name, self.ids[id_type], self.target_id)
             for name in id_list
         )
         members = set(common.uniq_list(mapped))
     else:
         members = set(id_list)

     self.sets[setname] = members
Exemplo n.º 8
0
def trip_process(
    exclude_methods=('Inference', 'Speculation'),
    predictions=False,
    species='Human',
    strict=False,
):
    """
    Downloads TRIP data by calling `take_a_trip()` and
    further processes it.

    Returns dict of dicts with TRIP data: the top level keys are the
    keys of the TRIP tables, each value is a dict with the keys `refs`,
    `methods`, `tissues`, `effect` and `regions`, all of them sets.
    Only the entries with at least one reference are included.

    @exclude_methods : list
        Interaction detection methods to be discarded.
    @predictions : bool
        Whether to include predicted interactions.
    @species : str
        Organism name, e.g. `Human`.
    @strict : bool
        Whether to discard the interactions where the species is
        not specified or not used as a bait.
    """

    nd = 'Not determined'
    # the accepted values of the species fields
    spec = set() if strict \
        else {'Not specified', 'Not used as a bait', ''}
    spec.add(species)
    result = {}
    data = take_a_trip()

    for uniprots in common.uniq_list(
            common.flat_list([v.keys() for v in data.values()])):
        refs = set()
        mets = set()
        tiss = set()
        reg = set()
        eff = set()

        if uniprots in data['sc']:
            for sc in data['sc'][uniprots]:
                if sc[4] in spec and sc[6] in spec and \
                    (predictions or sc[9] != 'Prediction') and \
                        sc[3] not in exclude_methods:
                    refs.add(sc[10])
                    mets.add(sc[3])
                    tiss.add(sc[7])

        if uniprots in data['vtc']:
            for vtc in data['vtc'][uniprots]:
                if vtc[4] in spec and vtc[7] in spec and \
                        vtc[3] not in exclude_methods:
                    refs.add(vtc[10])
                    mets.add(vtc[3])

        if uniprots in data['vvc']:
            for vvc in data['vvc'][uniprots]:
                if vvc[6] in spec and vvc[8] in spec and \
                        vvc[3] not in exclude_methods:
                    refs.add(vvc[10])
                    mets.add(vvc[3])

                    if vvc[4]:
                        tiss.add(vvc[4])

                    if vvc[5]:
                        tiss.add(vvc[5])

        if uniprots in data['cc']:
            for cc in data['cc'][uniprots]:
                if cc[4] in spec and cc[6] in spec and \
                        cc[3] not in exclude_methods:
                    refs.add(cc[10])
                    mets.add(cc[3])

                    # the pair is kept if at least one of the two fields
                    # is determined and not empty
                    if (cc[5] != nd and cc[5]) or \
                            (cc[7] != nd and cc[7]):
                        reg.add((cc[5], cc[7]))

        if uniprots in data['fc']:
            # no species or method filter is applied on this table
            for fc in data['fc'][uniprots]:
                mets.add(fc[3])
                refs.add(fc[7])

                if fc[5]:
                    eff.add(fc[5])

                if fc[6]:
                    eff.add(fc[6])

        if refs:
            result[uniprots] = {
                'refs': refs,
                'methods': mets,
                'tissues': tiss,
                'effect': eff,
                'regions': reg
            }

    return result