def toplist(self,
            length=None,
            alpha=None,
            significant=True,
            min_set_size=0,
            groups=None,
            filtr=lambda x: True,
            **kwargs):

    args = get_args(locals(), ['filtr', 'groups'])

    if groups is None:
        # all groups by default
        groups = self.gsea.groups.keys()

    sets = set(
        common.flat_list(
            s for g, s in iteritems(self.gsea.groups) if g in groups
        )
    )

    return super(GSEABinaryEnrichmentSet, self).toplist(
        filtr=lambda x: x[0] in sets and filtr(x),
        **args
    )


def kegg_interactions():
    """
    Downloads and processes KEGG Pathways.
    Returns list of interactions.
    """

    rehsa = re.compile(r'.*(hsa[0-9]+).*')
    req_hdrs = [
        'Referer: http://www.genome.jp/kegg-bin/show_pathway'
        '?map=hsa04710&show_description=show'
    ]
    hsa_list = []
    interactions = []

    # collect the `hsa` identifiers of all human pathways
    c = curl.Curl(urls.urls['kegg_pws']['list_url'], silent = True)
    htmllst = c.result
    lstsoup = bs4.BeautifulSoup(htmllst, 'html.parser')

    for a in lstsoup.find_all('a', href = True):
        m = rehsa.match(a['href'])

        if m:
            hsa_list.append((m.groups(0)[0], a.text))

    prg = progress.Progress(
        len(hsa_list), 'Processing KEGG Pathways', 1, percent = False)

    for hsa, pw in hsa_list:
        prg.step()

        # download the KGML of one pathway
        c = curl.Curl(urls.urls['kegg_pws']['kgml_url_2'] % hsa,
                      silent = True,
                      req_headers = req_hdrs)
        kgml = c.result
        kgmlsoup = bs4.BeautifulSoup(kgml, 'html.parser')
        entries = {}

        # gene symbols belonging to each entry
        for ent in kgmlsoup.find_all('entry'):
            gr = ent.find('graphics')

            if gr and 'name' in gr.attrs:
                entries[ent.attrs['id']] = [
                    n.strip()
                    for n in gr.attrs['name'].replace('...', '').split(',')
                ]

        # translate gene symbols to UniProt IDs
        uentries = dict([
            (
                eid,
                common.uniq_list(
                    common.flat_list([
                        mapping.map_name(
                            gn, 'genesymbol', 'uniprot', strict = True)
                        for gn in gns
                    ])
                )
            )
            for eid, gns in iteritems(entries)
        ])

        # one interaction for each pair of UniProts of the two entries
        for rel in kgmlsoup.find_all('relation'):
            st = rel.find('subtype')

            if (
                rel.attrs['entry1'] in uentries and
                rel.attrs['entry2'] in uentries and
                st and
                'name' in st.attrs
            ):
                for u1 in uentries[rel.attrs['entry1']]:
                    for u2 in uentries[rel.attrs['entry2']]:
                        interactions.append(
                            (u1, u2, st.attrs['name'], pw))

    prg.terminate()

    return common.uniq_list(interactions)


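# Illustrative post-processing sketch for the plain-tuple records returned
# above (not part of the original module; the helper name is hypothetical):
# index the (id_a, id_b, subtype, pathway) tuples by pathway name.

def kegg_interactions_by_pathway(interactions):
    """Group (id_a, id_b, subtype, pathway) tuples by pathway name."""

    by_pathway = {}

    for id_a, id_b, subtype, pathway in interactions:
        by_pathway.setdefault(pathway, []).append((id_a, id_b, subtype))

    return by_pathway

# Example (assumes ``kegg_interactions`` above has been run):
#     pw_index = kegg_interactions_by_pathway(kegg_interactions())

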
def kegg_interactions():
    """
    Downloads and processes KEGG Pathways.
    Returns list of interactions.
    """

    positive_terms = {'activation', 'expression'}
    negative_terms = {'inhibition', 'repression'}
    transc_terms = {'expression', 'repression'}
    mechanism_terms = {
        'phosphorylation',
        'binding/association',
        'dissociation',
        'ubiquitination',
        'dephosphorylation',
        'glycosylation',
        'state change',
        'methylation',
    }
    direct_terms = {'indirect effect'}

    KeggInteraction = collections.namedtuple(
        'KeggInteraction',
        [
            'id_a',
            'id_b',
            'effect',
            'pathway',
            'mechanism',
            'is_direct',
            'transcriptional',
        ],
    )

    rehsa = re.compile(r'.*(hsa[0-9]+).*')
    req_hdrs = [
        'Referer: http://www.genome.jp/kegg-bin/show_pathway'
        '?map=hsa04710&show_description=show'
    ]
    hsa_list = []
    interactions = []

    # collect the `hsa` identifiers of all human pathways
    c = curl.Curl(urls.urls['kegg_pws']['list_url'], silent=True)
    htmllst = c.result
    lstsoup = bs4.BeautifulSoup(htmllst, 'html.parser')

    for a in lstsoup.find_all('a', href=True):
        m = rehsa.match(a['href'])

        if m:
            hsa_list.append((m.groups(0)[0], a.text))

    prg = progress.Progress(
        len(hsa_list), 'Processing KEGG Pathways', 1, percent=False)

    for hsa, pw in hsa_list:
        prg.step()

        # download the KGML of one pathway
        c = curl.Curl(urls.urls['kegg_pws']['kgml_url_2'] % hsa,
                      silent=True, req_headers=req_hdrs)
        kgml = c.result
        kgmlsoup = bs4.BeautifulSoup(kgml, 'html.parser')
        entries = {}

        # gene symbols belonging to each entry
        for ent in kgmlsoup.find_all('entry'):
            gr = ent.find('graphics')

            if gr and 'name' in gr.attrs:
                entries[ent.attrs['id']] = [
                    n.strip()
                    for n in gr.attrs['name'].replace('...', '').split(',')
                ]

        # translate gene symbols to UniProt IDs
        uentries = dict([
            (
                eid,
                common.uniq_list(
                    common.flat_list([
                        mapping.map_name(gn, 'genesymbol', 'uniprot',
                                         strict=True)
                        for gn in gns
                    ])
                )
            )
            for eid, gns in iteritems(entries)
        ])

        # classify each relation by its subtype annotations
        for rel in kgmlsoup.find_all('relation'):
            subtypes = {st.attrs['name'] for st in rel.find_all('subtype')}

            if (rel.attrs['entry1'] in uentries and
                    rel.attrs['entry2'] in uentries and
                    subtypes):

                is_direct = not (direct_terms & subtypes)
                effect = (
                    'inhibition' if negative_terms & subtypes else
                    'activation' if positive_terms & subtypes else
                    'unknown'
                )
                mechanism = ';'.join(mechanism_terms & subtypes)
                transcriptional = bool(transc_terms & subtypes)

                for u1 in uentries[rel.attrs['entry1']]:
                    for u2 in uentries[rel.attrs['entry2']]:
                        interactions.append(
                            KeggInteraction(
                                id_a=u1,
                                id_b=u2,
                                effect=effect,
                                pathway=pw,
                                mechanism=mechanism,
                                is_direct=is_direct,
                                transcriptional=transcriptional,
                            ))

    prg.terminate()

    return common.uniq_list(interactions)


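# Usage sketch for the namedtuple-based records above (illustrative only,
# not part of the original module): tally effect labels per pathway. Only
# the fields defined by ``KeggInteraction`` are used.

import collections

def kegg_effect_counts(interactions):
    """Return {pathway: Counter of effect labels} for KeggInteraction records."""

    counts = collections.defaultdict(collections.Counter)

    for rec in interactions:
        counts[rec.pathway][rec.effect] += 1

    return dict(counts)

# Example (assumes ``interactions = kegg_interactions()`` has been run):
#     counts = kegg_effect_counts(interactions)
#     counts['<pathway name>']
#     # e.g. Counter({'activation': ..., 'inhibition': ..., 'unknown': ...})

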
def get_pubmed_data(pp, cachefile=None, htp_threshold=20):
    """
    For one PyPath object, obtains metadata for all PubMed IDs
    through NCBI E-utils.

    :param pp:
        ``pypath.PyPath`` object
    :param htp_threshold:
        The number of interactions per reference above which the study
        is considered high-throughput.
    """

    if cachefile is None:
        cachefile = settings.get('pubmed_cache')

    if htp_threshold is not None:
        pp.htp_stats()

    # all PubMed IDs referenced by the network
    pubmeds = common.uniq_list(
        common.flat_list([[r.pmid for r in e['references']]
                          for e in pp.graph.es]))

    if htp_threshold is not None:
        pubmeds = set(pubmeds) - pp.htp[htp_threshold]['htrefs']

    notpmid = [i for i in pubmeds if not i.isdigit()]

    sys.stdout.write('\t:: Number of non PubMed ID references: %u\n' %
                     len(notpmid))

    pmdata = {}

    if os.path.exists(cachefile):
        sys.stdout.write('\t:: Loading data previously downloaded '
                         'from PubMed, from file `%s`\n' % cachefile)
        pmdata = pickle.load(open(cachefile, 'rb'))

    missing = list(set(pubmeds) - set(pmdata.keys()))
    sys.stdout.write('\t:: Downloading data from PubMed about %s papers\n' %
                     len(missing))
    cached_pubmeds_len = len(pmdata)
    pmdata_new = pubmed_input.get_pubmeds(missing)
    pmdata.update(pmdata_new)

    sys.stdout.write('\t:: Saving PubMed data to file `%s`\n' % cachefile)

    if len(pmdata) > cached_pubmeds_len:
        pickle.dump(pmdata, open(cachefile, 'wb'))

    pmdata = dict(i for i in pmdata.items() if i[0] in pubmeds)

    points = []
    earliest = []

    for e in pp.graph.es:
        for s, rs in iteritems(e['refs_by_source']):
            pms = [
                r.pmid for r in rs
                if (htp_threshold is None or
                    r.pmid not in pp.htp[htp_threshold]['htrefs']) and
                r.pmid in pmdata and 'pubdate' in pmdata[r.pmid]
            ]

            if len(pms) > 0:
                yrs = [int(pmdata[pm]['pubdate'][:4]) for pm in pms]
                earliest.append((s, 0, min(yrs), '', e.index))

                for pm in pms:
                    points.append((s, pm, int(pmdata[pm]['pubdate'][:4]),
                                   pmdata[pm]['source'], e.index))

    points = common.uniq_list(points)
    earliest = common.uniq_list(earliest)
    points = pd.DataFrame.from_records(points)
    earliest = pd.DataFrame.from_records(earliest)
    points.columns = ['database', 'pmid', 'year', 'journal', 'eid']
    earliest.columns = ['database', 'none', 'year', 'none', 'eid']

    return points, earliest


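# Usage sketch (illustrative, not part of the original module): summarise the
# per-reference table returned by ``get_pubmed_data``. It relies only on the
# column names assigned above; the helper name is an assumption.

import pandas as pd

def refs_per_database_year(points):
    """Count unique PubMed IDs per resource and publication year."""

    return (
        points
        .drop_duplicates(subset=['database', 'pmid'])
        .groupby(['database', 'year'])
        .size()
        .reset_index(name='n_refs')
    )

# Example:
#     points, earliest = get_pubmed_data(pp)
#     summary = refs_per_database_year(points)

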
def write_set(self, id_list, setname, id_type, map_ids=True):

    self.sets[setname] = (
        set(common.uniq_list(common.flat_list(
            self.mapper.map_name(n, self.ids[id_type], self.target_id)
            for n in id_list
        )))
        if map_ids else
        set(id_list)
    )


def take_a_trip(cachefile=None):
    """
    Downloads TRIP data from webpage and preprocesses it.
    Saves preprocessed data into `cachefile` and next time loads from
    this file.

    :arg cachefile str:
        Path to pickle dump of preprocessed TRIP database. If the file
        does not exist, the database will be downloaded and saved to
        this file. By default the path is queried from the ``settings``
        module.
    """

    cachefile = cachefile or settings.get('trip_preprocessed')

    if os.path.exists(cachefile):
        _log('Loading preprocessed TRIP database '
             'content from `%s`' % cachefile)
        result = pickle.load(open(cachefile, 'rb'))

        return result

    _log('No cache found, downloading and preprocessing TRIP database.')

    result = {'sc': {}, 'cc': {}, 'vvc': {}, 'vtc': {}, 'fc': {}}
    intrs = {}
    titles = {
        'Characterization': 'cc',
        'Screening': 'sc',
        'Validation: In vitro validation': 'vtc',
        'Validation: In vivo validation': 'vvc',
        'Functional consequence': 'fc',
    }

    interactors = {}
    base_url = urls.urls['trip']['base']
    show_url = urls.urls['trip']['show']
    c = curl.Curl(base_url)
    mainhtml = c.result
    mainsoup = bs4.BeautifulSoup(mainhtml, 'html.parser')

    # links to the individual TRP channel pages
    trppages = common.flat_list(
        [[a.attrs['href'] for a in ul.find_all('a')]
         for ul in mainsoup.find(
             'div', id='trp_selector').find('ul').find_all('ul')])

    for trpp in trppages:
        trp = trpp.split('/')[-1]
        trpurl = show_url % trp
        c = curl.Curl(trpurl, silent=False)
        trphtml = c.result
        trpsoup = bs4.BeautifulSoup(trphtml, 'html.parser')
        trp_uniprot = trip_find_uniprot(trpsoup)

        if trp_uniprot is None or len(trp_uniprot) < 6:
            _log('Could not find UniProt for %s' % trp)

        # process each evidence table on the page
        for tab in trpsoup.find_all('th', colspan=['11', '13']):
            ttl = titles[tab.text.strip()]
            tab = tab.find_parent('table')
            trip_process_table(tab, result[ttl], intrs, trp_uniprot)

    _log('Saving processed TRIP database content to `%s`' % cachefile)
    pickle.dump(result, open(cachefile, 'wb'))

    return result


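# Quick inspection sketch for the cache structure built above (illustrative,
# not part of the original module): report how many interactor pairs each
# TRIP evidence table ('sc', 'cc', 'vvc', 'vtc', 'fc') holds.

def trip_table_sizes(trip_data):
    """Return {table_key: number of interactor pairs} for a TRIP cache dict."""

    return {table: len(pairs) for table, pairs in trip_data.items()}

# Example:
#     trip_data = take_a_trip()        # downloads or loads the pickle cache
#     print(trip_table_sizes(trip_data))

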
def trip_process(
        exclude_methods=['Inference', 'Speculation'],
        predictions=False,
        species='Human',
        strict=False,
    ):
    """
    Downloads TRIP data by calling `pypath.dataio.take_a_trip()` and
    further processes it. Returns dict of dicts with TRIP data.

    @exclude_methods : list
        Interaction detection methods to be discarded.
    @predictions : bool
        Whether to include predicted interactions.
    @species : str
        Organism name, e.g. `Human`.
    @strict : bool
        If `False`, also accept interactions where the species is
        `Not specified` or `Not used as a bait`.
    """

    nd = 'Not determined'
    spec = set([]) if strict \
        else set(['Not specified', 'Not used as a bait', ''])
    spec.add(species)
    result = {}
    data = take_a_trip()

    for uniprots in common.uniq_list(
            common.flat_list([v.keys() for v in data.values()])):

        to_process = False
        refs = set([])
        mets = set([])
        tiss = set([])
        reg = set([])
        eff = set([])

        # screening data
        if uniprots in data['sc']:
            for sc in data['sc'][uniprots]:
                if sc[4] in spec and sc[6] in spec and \
                        (predictions or sc[9] != 'Prediction') and \
                        sc[3] not in exclude_methods:
                    refs.add(sc[10])
                    mets.add(sc[3])
                    tiss.add(sc[7])

        # in vitro validation
        if uniprots in data['vtc']:
            for vtc in data['vtc'][uniprots]:
                if vtc[4] in spec and vtc[7] in spec and \
                        vtc[3] not in exclude_methods:
                    refs.add(vtc[10])
                    mets.add(vtc[3])

        # in vivo validation
        if uniprots in data['vvc']:
            for vvc in data['vvc'][uniprots]:
                if vvc[6] in spec and vvc[8] in spec and \
                        vvc[3] not in exclude_methods:
                    refs.add(vvc[10])
                    mets.add(vvc[3])

                    if len(vvc[4]) > 0:
                        tiss.add(vvc[4])

                    if len(vvc[5]) > 0:
                        tiss.add(vvc[5])

        # characterization
        if uniprots in data['cc']:
            for cc in data['cc'][uniprots]:
                if cc[4] in spec and cc[6] in spec and \
                        cc[3] not in exclude_methods:
                    refs.add(cc[10])
                    mets.add(cc[3])

                    if (cc[5] != nd and len(cc[5]) > 0) or \
                            (cc[7] != nd and len(cc[7]) > 0):
                        reg.add((cc[5], cc[7]))

        # functional consequence
        if uniprots in data['fc']:
            for fc in data['fc'][uniprots]:
                mets.add(fc[3])
                refs.add(fc[7])

                if len(fc[5]) > 0:
                    eff.add(fc[5])

                if len(fc[6]) > 0:
                    eff.add(fc[6])

        if len(refs) > 0:
            result[uniprots] = {
                'refs': refs,
                'methods': mets,
                'tissues': tiss,
                'effect': eff,
                'regions': reg,
            }

    return result


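# Filtering sketch for the dict returned above (illustrative, not part of the
# original module): keep only interactions supported by a minimum number of
# references. It relies only on the 'refs' key set in ``trip_process``; the
# helper name and threshold are assumptions.

def trip_min_references(processed, min_refs=2):
    """Keep interactions with at least ``min_refs`` distinct references."""

    return {
        uniprots: record
        for uniprots, record in processed.items()
        if len(record['refs']) >= min_refs
    }

# Example:
#     well_supported = trip_min_references(trip_process(), min_refs=2)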