Example no. 1
def toplist(self,
            length=None,
            alpha=None,
            significant=True,
            min_set_size=0,
            groups=None,
            filtr=lambda x: True,
            **kwargs):
    args = get_args(locals(), ['filtr', 'groups'])
    if groups is None:
        groups = self.gsea.groups.keys()  # all by default
    # all elements of the selected groups
    sets = set(
        common.flat_list(s for g, s in iteritems(self.gsea.groups)
                         if g in groups))
    # keep only elements in the selected groups that also pass the caller's filter
    return super(GSEABinaryEnrichmentSet, self).toplist(
        filtr=lambda x: x[0] in sets and filtr(x), **args)
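
The interesting part of this method is how it narrows the inherited toplist: it builds the union of all members of the selected groups, then wraps the caller's own predicate in a new lambda. A minimal, self-contained sketch of that filter composition; all names and values below are invented for illustration:

sets = {'TP53', 'EGFR', 'MYC'}              # members of the selected groups
user_filtr = lambda x: x[1] < 0.05          # the caller's own predicate

combined = lambda x: x[0] in sets and user_filtr(x)

rows = [('TP53', 0.01), ('BRCA1', 0.02), ('EGFR', 0.2)]
print([r for r in rows if combined(r)])     # [('TP53', 0.01)]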
Example no. 2
def kegg_interactions():
    """
    Downloads and processes KEGG Pathways.
    Returns a list of interactions.
    """

    rehsa = re.compile(r'.*(hsa[0-9]+).*')
    req_hdrs = [
        'Referer: http://www.genome.jp/kegg-bin/show_pathway'
        '?map=hsa04710&show_description=show'
    ]
    hsa_list = []
    interactions = []

    c = curl.Curl(urls.urls['kegg_pws']['list_url'], silent = True)
    htmllst = c.result
    lstsoup = bs4.BeautifulSoup(htmllst, 'html.parser')

    for a in lstsoup.find_all('a', href = True):
        m = rehsa.match(a['href'])

        if m:
            hsa_list.append((m.groups(0)[0], a.text))

    prg = progress.Progress(
        len(hsa_list), 'Processing KEGG Pathways', 1, percent = False)

    for hsa, pw in hsa_list:

        prg.step()
        c = curl.Curl(urls.urls['kegg_pws']['kgml_url_2'] % hsa,
                      silent = True,
                      req_headers = req_hdrs)
        kgml = c.result
        kgmlsoup = bs4.BeautifulSoup(kgml, 'html.parser')
        entries = {}

        for ent in kgmlsoup.find_all('entry'):
            gr = ent.find('graphics')

            if gr and 'name' in gr.attrs:
                entries[ent.attrs['id']] = [
                    n.strip()
                    for n in gr.attrs['name'].replace('...', '').split(',')
                ]

        # map each entry's gene symbols to UniProt IDs
        uentries = {
            eid: common.uniq_list(common.flat_list([
                mapping.map_name(gn, 'genesymbol', 'uniprot', strict = True)
                for gn in gns
            ]))
            for eid, gns in iteritems(entries)
        }

        for rel in kgmlsoup.find_all('relation'):
            st = rel.find('subtype')

            if (
                rel.attrs['entry1'] in uentries and
                rel.attrs['entry2'] in uentries and
                st and
                'name' in st.attrs
            ):
                for u1 in uentries[rel.attrs['entry1']]:
                    for u2 in uentries[rel.attrs['entry2']]:
                        interactions.append((u1, u2, st.attrs['name'], pw))

    prg.terminate()

    return common.uniq_list(interactions)
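
The core of the loop is parsing the KGML returned for each pathway: `entry` elements map an internal id to gene symbols, and `relation` elements connect two entry ids with a `subtype`. A minimal sketch of that parsing, run with BeautifulSoup on an invented inline fragment instead of a downloaded pathway:

import bs4

# invented KGML-like fragment, for illustration only
kgml = '''
<entry id="10"><graphics name="TP53..., TP73"/></entry>
<entry id="11"><graphics name="MDM2"/></entry>
<relation entry1="10" entry2="11"><subtype name="activation"/></relation>
'''
soup = bs4.BeautifulSoup(kgml, 'html.parser')

# entry id -> list of gene symbols shown in the graphics element
entries = {
    ent.attrs['id']: [
        n.strip()
        for n in ent.find('graphics').attrs['name'].replace('...', '').split(',')
    ]
    for ent in soup.find_all('entry')
}

for rel in soup.find_all('relation'):
    st = rel.find('subtype')
    print(entries[rel.attrs['entry1']], entries[rel.attrs['entry2']], st.attrs['name'])
# ['TP53', 'TP73'] ['MDM2'] activation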
Example no. 3
def kegg_interactions():
    """
    Downloads and processes KEGG Pathways.
    Returns a list of interactions.
    """

    positive_terms = {'activation', 'expression'}
    negative_terms = {'inhibition', 'repression'}
    transc_terms = {'expression', 'repression'}
    mechanism_terms = {
        'phosphorylation',
        'binding/association',
        'dissociation',
        'ubiquitination',
        'dephosphorylation',
        'glycosylation',
        'state change',
        'methylation',
    }
    direct_terms = {'indirect effect'}

    KeggInteraction = collections.namedtuple(
        'KeggInteraction',
        [
            'id_a',
            'id_b',
            'effect',
            'pathway',
            'mechanism',
            'is_direct',
            'transcriptional',
        ],
    )

    rehsa = re.compile(r'.*(hsa[0-9]+).*')
    req_hdrs = [
        'Referer: http://www.genome.jp/kegg-bin/show_pathway'
        '?map=hsa04710&show_description=show'
    ]
    hsa_list = []
    interactions = []

    c = curl.Curl(urls.urls['kegg_pws']['list_url'], silent=True)
    htmllst = c.result
    lstsoup = bs4.BeautifulSoup(htmllst, 'html.parser')

    for a in lstsoup.find_all('a', href=True):
        m = rehsa.match(a['href'])

        if m:
            hsa_list.append((m.groups(0)[0], a.text))

    prg = progress.Progress(len(hsa_list),
                            'Processing KEGG Pathways',
                            1,
                            percent=False)

    for hsa, pw in hsa_list:

        prg.step()
        c = curl.Curl(urls.urls['kegg_pws']['kgml_url_2'] % hsa,
                      silent=True,
                      req_headers=req_hdrs)
        kgml = c.result
        kgmlsoup = bs4.BeautifulSoup(kgml, 'html.parser')
        entries = {}

        for ent in kgmlsoup.find_all('entry'):
            gr = ent.find('graphics')

            if gr and 'name' in gr.attrs:
                entries[ent.attrs['id']] = [
                    n.strip()
                    for n in gr.attrs['name'].replace('...', '').split(',')
                ]

        # map each entry's gene symbols to UniProt IDs
        uentries = {
            eid: common.uniq_list(common.flat_list([
                mapping.map_name(gn, 'genesymbol', 'uniprot', strict=True)
                for gn in gns
            ]))
            for eid, gns in iteritems(entries)
        }

        for rel in kgmlsoup.find_all('relation'):

            subtypes = {st.attrs['name'] for st in rel.find_all('subtype')}

            if (rel.attrs['entry1'] in uentries
                    and rel.attrs['entry2'] in uentries and subtypes):

                is_direct = 'indirect effect' not in subtypes
                effect = (
                    'inhibition' if negative_terms & subtypes else
                    'activation' if positive_terms & subtypes else
                    'unknown'
                )
                mechanism = ';'.join(mechanism_terms & subtypes)
                transcriptional = bool(transc_terms & subtypes)

                for u1 in uentries[rel.attrs['entry1']]:

                    for u2 in uentries[rel.attrs['entry2']]:

                        interactions.append(
                            KeggInteraction(
                                id_a=u1,
                                id_b=u2,
                                effect=effect,
                                pathway=pw,
                                mechanism=mechanism,
                                is_direct=is_direct,
                                transcriptional=transcriptional,
                            ))

    prg.terminate()

    return common.uniq_list(interactions)
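
Compared to the previous version, this one collects all `subtype` names of a relation into a set and classifies them with simple set intersections. A minimal sketch of that classification step, using an invented subtype set:

positive_terms = {'activation', 'expression'}
negative_terms = {'inhibition', 'repression'}
mechanism_terms = {'phosphorylation', 'ubiquitination', 'binding/association'}

# invented subtype set, for illustration only
subtypes = {'phosphorylation', 'inhibition', 'indirect effect'}

is_direct = 'indirect effect' not in subtypes
effect = (
    'inhibition' if negative_terms & subtypes else
    'activation' if positive_terms & subtypes else
    'unknown'
)
mechanism = ';'.join(mechanism_terms & subtypes)
print(effect, mechanism, is_direct)   # inhibition phosphorylation False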
Example no. 4
def get_pubmed_data(pp, cachefile=None, htp_threshold=20):
    """
    For one PyPath object, obtains metadata for all PubMed IDs
    through NCBI E-utils.

    :param pp:
        ``pypath.PyPath`` object
    :param cachefile:
        Path to the pickle file caching the downloaded PubMed metadata.
        By default the path is taken from the ``settings`` module.
    :param htp_threshold:
        The number of interactions per reference above which the study
        is considered to be high-throughput.
    """

    if cachefile is None:

        cachefile = settings.get('pubmed_cache')

    if htp_threshold is not None:
        pp.htp_stats()

    pubmeds = common.uniq_list(
        common.flat_list([[r.pmid for r in e['references']]
                          for e in pp.graph.es]))

    if htp_threshold is not None:
        pubmeds = set(pubmeds) - pp.htp[htp_threshold]['htrefs']

    notpmid = [i for i in pubmeds if not i.isdigit()]

    sys.stdout.write('\t:: Number of non PubMed ID references: %u\n' %
                     len(notpmid))

    pmdata = {}
    if os.path.exists(cachefile):
        sys.stdout.write('\t:: Loading data previously downloaded '
                         'from PubMed, from file `%s`\n' % cachefile)
        pmdata = pickle.load(open(cachefile, 'rb'))

    missing = list(set(pubmeds) - set(pmdata.keys()))
    sys.stdout.write('\t:: Downloading data from PubMed about %s papers\n' %
                     len(missing))
    cached_pubmeds_len = len(pmdata)
    pmdata_new = pubmed_input.get_pubmeds(missing)
    pmdata.update(pmdata_new)

    sys.stdout.write('\t:: Saving PubMed data to file `%s`\n' % cachefile)

    if len(pmdata) > cached_pubmeds_len:
        pickle.dump(pmdata, open(cachefile, 'wb'))

    pmdata = dict(i for i in pmdata.items() if i[0] in pubmeds)

    points = []
    earliest = []

    for e in pp.graph.es:

        for s, rs in iteritems(e['refs_by_source']):

            pms = [
                r.pmid for r in rs
                if (htp_threshold is None
                    or r.pmid not in pp.htp[htp_threshold]['htrefs'])
                and r.pmid in pmdata and 'pubdate' in pmdata[r.pmid]
            ]
            if len(pms) > 0:
                yrs = [int(pmdata[pm]['pubdate'][:4]) for pm in pms]
                earliest.append((s, 0, min(yrs), '', e.index))
                for pm in pms:
                    points.append((s, pm, int(pmdata[pm]['pubdate'][:4]),
                                   pmdata[pm]['source'], e.index))

    points = common.uniq_list(points)
    earliest = common.uniq_list(earliest)

    points = pd.DataFrame.from_records(points)
    earliest = pd.DataFrame.from_records(earliest)
    points.columns = ['database', 'pmid', 'year', 'journal', 'eid']
    earliest.columns = ['database', 'none', 'year', 'none', 'eid']

    return points, earliest
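
Before touching NCBI, the function reuses a pickle cache: load whatever was downloaded earlier, fetch only the missing PubMed IDs, then persist the merged dict. A generic, self-contained sketch of that pattern; `fetch_metadata()` is a hypothetical stand-in for the `pubmed_input.get_pubmeds()` call:

import os
import pickle

def fetch_metadata(ids):
    # hypothetical stand-in for the PubMed download step
    return {i: {'pubdate': '2020 Jan'} for i in ids}

cachefile = 'pubmed_cache.pickle'
wanted = {'111', '222', '333'}

# load the cache if present, fetch only what is missing, then persist
cached = pickle.load(open(cachefile, 'rb')) if os.path.exists(cachefile) else {}
missing = wanted - set(cached)
cached.update(fetch_metadata(missing))
pickle.dump(cached, open(cachefile, 'wb'))

print({k: v for k, v in cached.items() if k in wanted})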
Example no. 5
def write_set(self, id_list, setname, id_type, map_ids=True):
    # map input IDs to the target ID type (if requested) and store as a unique set
    self.sets[setname] = (
        set(common.uniq_list(common.flat_list(
            self.mapper.map_name(n, self.ids[id_type], self.target_id)
            for n in id_list
        )))
        if map_ids
        else set(id_list)
    )
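
Each input name may map to zero or more target identifiers, so the results are flattened and de-duplicated into one set. A minimal sketch of that step, with an invented mapping table in place of `self.mapper.map_name()`:

id_map = {'EGFR': ['P00533'], 'TP53': ['P04637'], 'UNKNOWN': []}

id_list = ['EGFR', 'TP53', 'UNKNOWN', 'EGFR']
mapped = {u for name in id_list for u in id_map.get(name, [])}
print(mapped)   # {'P00533', 'P04637'} (a set, so order may vary)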
Example no. 6
def take_a_trip(cachefile=None):
    """
    Downloads TRIP data from its webpage and preprocesses it.
    Saves the preprocessed data into `cachefile` and loads it from
    there on subsequent calls.

    :arg str cachefile:
        Path to a pickle dump of the preprocessed TRIP database. If the
        file does not exist, the database will be downloaded and saved
        to this path. By default the path is queried from the
        ``settings`` module.
    """

    cachefile = cachefile or settings.get('trip_preprocessed')

    if os.path.exists(cachefile):
        _log('Loading preprocessed TRIP database '
             'content from `%s`' % cachefile)
        result = pickle.load(open(cachefile, 'rb'))

        return result

    _log('No cache found, downloading and preprocessing TRIP database.')

    result = {'sc': {}, 'cc': {}, 'vvc': {}, 'vtc': {}, 'fc': {}}
    intrs = {}
    titles = {
        'Characterization': 'cc',
        'Screening': 'sc',
        'Validation: In vitro validation': 'vtc',
        'Validation: In vivo validation': 'vvc',
        'Functional consequence': 'fc',
    }

    interactors = {}
    base_url = urls.urls['trip']['base']
    show_url = urls.urls['trip']['show']
    c = curl.Curl(base_url)
    mainhtml = c.result
    mainsoup = bs4.BeautifulSoup(mainhtml, 'html.parser')
    trppages = common.flat_list(
        [[a.attrs['href'] for a in ul.find_all('a')] for ul in mainsoup.find(
            'div', id='trp_selector').find('ul').find_all('ul')])

    for trpp in trppages:
        trp = trpp.split('/')[-1]
        trpurl = show_url % trp
        c = curl.Curl(trpurl, silent=False)
        trphtml = c.result
        trpsoup = bs4.BeautifulSoup(trphtml, 'html.parser')
        trp_uniprot = trip_find_uniprot(trpsoup)

        if trp_uniprot is None or len(trp_uniprot) < 6:
            _log('Could not find UniProt for %s' % trp)

        for tab in trpsoup.find_all('th', colspan=['11', '13']):
            ttl = titles[tab.text.strip()]
            tab = tab.find_parent('table')
            trip_process_table(tab, result[ttl], intrs, trp_uniprot)

    _log('Saving processed TRIP database content to `%s`' % cachefile)
    pickle.dump(result, open(cachefile, 'wb'))

    return result
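
The scraping step locates each section of a TRIP protein page by its header cell (a `th` with colspan 11 or 13) and then climbs to the enclosing table before handing it to `trip_process_table()`. A minimal sketch of that lookup on an invented HTML fragment:

import bs4

# invented HTML fragment, for illustration only
html = '''
<table>
  <tr><th colspan="11">Screening</th></tr>
  <tr><td>row data</td></tr>
</table>
'''
soup = bs4.BeautifulSoup(html, 'html.parser')

for th in soup.find_all('th', colspan=['11', '13']):
    table = th.find_parent('table')
    print(th.text.strip(), '->', len(table.find_all('tr')), 'rows')
# Screening -> 2 rows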
Example no. 7
def trip_process(
    exclude_methods=['Inference', 'Speculation'],
    predictions=False,
    species='Human',
    strict=False,
):
    """
    Downloads TRIP data by calling `pypath.dataio.take_a_trip()` and
    processes it further.
    Returns a dict of dicts with the TRIP data.

    @exclude_methods : list
        Interaction detection methods to be discarded.
    @predictions : bool
        Whether to include predicted interactions.
    @species : str
        Organism name, e.g. `Human`.
    @strict : bool
        If ``True``, keep only interactions where the organism matches
        `species` exactly; if ``False``, also accept entries where the
        organism is 'Not specified' or 'Not used as a bait'.
    """

    nd = 'Not determined'
    spec = set([]) if strict \
        else set(['Not specified', 'Not used as a bait', ''])
    spec.add(species)
    result = {}
    data = take_a_trip()

    for uniprots in common.uniq_list(
            common.flat_list([v.keys() for v in data.values()])):
        to_process = False
        refs = set([])
        mets = set([])
        tiss = set([])
        reg = set([])
        eff = set([])

        if uniprots in data['sc']:
            for sc in data['sc'][uniprots]:
                if sc[4] in spec and sc[6] in spec and \
                    (predictions or sc[9] != 'Prediction') and \
                        sc[3] not in exclude_methods:
                    refs.add(sc[10])
                    mets.add(sc[3])
                    tiss.add(sc[7])

        if uniprots in data['vtc']:
            for vtc in data['vtc'][uniprots]:
                if vtc[4] in spec and vtc[7] in spec and \
                        vtc[3] not in exclude_methods:
                    refs.add(vtc[10])
                    mets.add(vtc[3])

        if uniprots in data['vvc']:
            for vvc in data['vvc'][uniprots]:
                if vvc[6] in spec and vvc[8] in spec and \
                        vvc[3] not in exclude_methods:
                    refs.add(vvc[10])
                    mets.add(vvc[3])

                    if len(vvc[4]) > 0:
                        tiss.add(vvc[4])

                    if len(vvc[5]) > 0:
                        tiss.add(vvc[5])

        if uniprots in data['cc']:
            for cc in data['cc'][uniprots]:
                if cc[4] in spec and cc[6] in spec and \
                        cc[3] not in exclude_methods:
                    refs.add(cc[10])
                    mets.add(cc[3])

                    if (cc[5] != nd and len(cc[5]) > 0) or \
                            (cc[7] != nd and len(cc[7]) > 0):
                        reg.add((cc[5], cc[7]))

        if uniprots in data['fc']:
            for fc in data['fc'][uniprots]:
                mets.add(fc[3])
                refs.add(fc[7])

                if len(fc[5]) > 0:
                    eff.add(fc[5])

                if len(fc[6]) > 0:
                    eff.add(fc[6])

        if len(refs) > 0:
            result[uniprots] = {
                'refs': refs,
                'methods': mets,
                'tissues': tiss,
                'effect': eff,
                'regions': reg
            }

    return result
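
The `spec` set built at the top controls how strictly the organism columns are matched: with `strict=False` the unspecified values are accepted as well. A minimal sketch of that filter; the rows below (two organism columns per row) are invented for illustration:

species = 'Human'
strict = False

spec = set() if strict else {'Not specified', 'Not used as a bait', ''}
spec.add(species)

rows = [('Human', 'Human'), ('Mouse', 'Human'), ('Not specified', 'Human')]
print([r for r in rows if r[0] in spec and r[1] in spec])
# [('Human', 'Human'), ('Not specified', 'Human')]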