def homologene_uniprot_dict(source, target, only_swissprot=True):
    """
    Returns orthology translation table as dict from UniProt to UniProt,
    obtained from NCBI HomoloGene data. Uses RefSeq and Entrez IDs for
    translation.

    :param int source:
        NCBI Taxonomy ID of the source species (keys).
    :param int target:
        NCBI Taxonomy ID of the target species (values).
    :param bool only_swissprot:
        Translate only SwissProt IDs.
    """

    result = {}

    hge = homologene_dict(source, target, 'entrez')
    hgr = homologene_dict(source, target, 'refseq')

    all_source = set(
        uniprot_input.all_uniprots(organism=source, swissprot='YES')
    )

    if not only_swissprot:
        all_source_trembl = uniprot_input.all_uniprots(
            organism=source,
            swissprot='NO',
        )
        all_source.update(set(all_source_trembl))

    for u in all_source:

        source_e = mapping.map_name(u, 'uniprot', 'entrez', source)
        source_r = mapping.map_name(u, 'uniprot', 'refseqp', source)
        target_u = set()
        target_r = set()
        target_e = set()

        for e in source_e:
            if e in hge:
                target_e.update(hge[e])

        for r in source_r:
            if r in hgr:
                target_r.update(hgr[r])

        for e in target_e:
            target_u.update(mapping.map_name(e, 'entrez', 'uniprot', target))

        for r in target_r:
            # fixed: the RefSeq ID `r` was mistakenly passed as `e` here
            target_u.update(mapping.map_name(r, 'refseqp', 'uniprot', target))

        # map to primary UniProt IDs of the target organism
        target_u = itertools.chain(
            *map(
                lambda tu: mapping.map_name(tu, 'uniprot', 'uniprot', target),
                target_u
            )
        )

        result[u] = sorted(target_u)

    return result
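# Usage sketch (hypothetical, assuming the pypath-style modules used above
# are importable and network access is available): translate human SwissProt
# IDs to their mouse orthologs.
def _example_homologene_uniprot_dict():
    human_to_mouse = homologene_uniprot_dict(source=9606, target=10090)
    # keys are human UniProt IDs, values are sorted lists of mouse UniProt
    # IDs; proteins without a HomoloGene entry map to an empty list
    return human_to_mouse.get('P00533', [])  # EGFR; likely ['Q01279'] (mouse Egfr)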
def _matrixdb_protein_list(category, organism=9606):
    """
    Returns a set of proteins annotated by MatrixDB.

    :arg str category:
        The protein annotation category. Possible values: `ecm`, `membrane`
        or `secreted`.
    """

    url = urls.urls['matrixdb']['%s_proteins' % category]
    c = curl.Curl(url, silent=False, large=True)

    proteins = set()

    # skip the header row
    _ = next(c.result)

    for l in c.result:

        if not l:
            continue

        proteins.add(l.strip().replace('"', '').split('\t')[0])

    proteins = mapping.map_names(proteins, 'uniprot', 'uniprot')

    if organism:
        uniprots = uniprot_input.all_uniprots(
            organism=organism,
            swissprot=True,
        )
        proteins = proteins & set(uniprots)

    return proteins
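# Usage sketch (hypothetical): collect the MatrixDB extracellular matrix
# protein set; by default the result is restricted to the human SwissProt
# proteome, pass organism=None to skip that filter.
def _example_matrixdb_ecm():
    ecm_proteins = _matrixdb_protein_list('ecm', organism=9606)
    return len(ecm_proteins)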
def _phosphosite_filter_organism(psite_data, ncbi_tax_id=9606):

    # a set for fast membership tests
    all_uniprots = set(uniprot_input.all_uniprots(organism=ncbi_tax_id))

    return [
        rec
        for rec in psite_data
        if rec[0] in all_uniprots and rec[1] in all_uniprots
    ]
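# Usage sketch (hypothetical): `psite_data` is assumed to be an iterable of
# records whose fields 0 and 1 are the enzyme and substrate UniProt IDs;
# this keeps only records fully within the mouse proteome.
def _example_filter_mouse(psite_data):
    return _phosphosite_filter_organism(psite_data, ncbi_tax_id=10090)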
def loader(ncbi_tax_id=9606):

    all_up = uniprot_input.all_uniprots(organism=ncbi_tax_id)

    return pfam_input.get_pfam_regions(
        uniprots=all_up,
        dicts='uniprot',
        keepfile=True,
    )
def load_proteome(self, taxon, swissprot_only=True):

    key = (taxon, swissprot_only)

    if key not in self._proteomes:

        self._proteomes[key] = set(uniprot_input.all_uniprots(*key))

        for protein in self._proteomes[key]:
            self._taxonomy[protein] = key

        if not swissprot_only:
            self.load_proteome(taxon, True)
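# Minimal host-class sketch (hypothetical): `load_proteome` is a method, so
# it expects a class providing the `_proteomes` and `_taxonomy` dicts; note
# that loading with swissprot_only=False also loads the SwissProt-only
# proteome, so SwissProt entries are keyed as (taxon, True) in `_taxonomy`.
class _ProteomeRegistryExample(object):

    def __init__(self):
        self._proteomes = {}  # (taxon, swissprot_only) -> set of UniProt IDs
        self._taxonomy = {}   # UniProt ID -> (taxon, swissprot_only)

    load_proteome = load_proteome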
def phosphosite_interactions(cache=True, ncbi_tax_id=9606):
    """
    Downloads curated and HTP data from PhosphoSitePlus, from preprocessed
    cache file if available. Processes BioPAX format.
    Returns list of interactions.
    """

    curated_cache = urls.files['phosphosite']['curated']
    noref_cache = urls.files['phosphosite']['noref']

    if cache and os.path.exists(curated_cache) and os.path.exists(noref_cache):

        return (
            pickle.load(open(curated_cache, 'rb')),
            pickle.load(open(noref_cache, 'rb')),
        )

    result_curated = []
    result_noref = []

    url = urls.urls['psite_bp']['url']
    c = curl.Curl(url, silent=False, large=True)
    bpax = c.gzfile
    xml = ET.parse(bpax)
    xmlroot = xml.getroot()
    bpprefix = '{http://www.biopax.org/release/biopax-level3.owl#}'
    rdfprefix = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
    proteins = {}

    for p in xmlroot.iter(bpprefix + 'ProteinReference'):

        psid = p.attrib[rdfprefix + 'ID']
        db = p.find(bpprefix + 'xref').find(
            bpprefix + 'UnificationXref').find(bpprefix + 'db').text
        up = p.find(bpprefix + 'xref').find(
            bpprefix + 'UnificationXref').find(bpprefix + 'id').text
        tax = ''

        if p.find(bpprefix + 'organism') is not None:

            tmp = p.find(bpprefix + 'organism')

            if rdfprefix + 'resource' in tmp.attrib:
                tax = tmp.attrib[rdfprefix + 'resource'].split('_')[1]

        if db == 'UniProtKB':
            up = up[0:6]

        proteins[psid] = {'id': up, 'db': db, 'species': tax, 'psid': psid}

    evidences = {}

    for p in xmlroot.iter(bpprefix + 'EvidenceCodeVocabulary'):

        evid = p.attrib[rdfprefix + 'ID'].split('_')[1]
        evname = p.find(bpprefix + 'term').text
        evidences[evid] = evname

    ev_short = {'0113': 'WB', '0427': 'MS', '0074': 'MA', '0421': 'AB'}
    nosrc = []
    notgt = []
    norefs = []
    noev = []
    noth = []
    edges = []

    for c in xmlroot.findall(bpprefix + 'Catalysis'):

        # fixed: reset `src` each iteration so a missing controller does not
        # silently reuse the value from the previous Catalysis element
        src = None

        if rdfprefix + 'resource' in c.find(bpprefix + 'controller').attrib:

            src = 'po_' + c.find(
                bpprefix + 'controller'
            ).attrib[rdfprefix + 'resource'].split('_')[1]

        else:

            srcProt = c.find(bpprefix + 'controller').find(
                bpprefix + 'Protein')

            if srcProt is not None:
                src = 'po_' + srcProt.attrib[rdfprefix + 'ID'].split('_')[1]
            else:
                nosrc.append(c)

        tgtProt = c.find(bpprefix + 'controlled').iter(
            bpprefix + 'ProteinReference')
        tgt = next(tgtProt, None)

        if tgt is not None:

            tgt = tgt.attrib[rdfprefix + 'ID']

        else:

            tgtProt = c.find(bpprefix + 'controlled').iter(
                bpprefix + 'entityReference')
            tgt = next(tgtProt, None)

            if tgt is not None:

                if rdfprefix + 'resource' in tgt.attrib:
                    tgt = tgt.attrib[rdfprefix + 'resource'][1:]

            else:

                tgtProt = c.find(bpprefix + 'controlled').iter(
                    bpprefix + 'left')
                tgt = next(tgtProt, None)

                if tgt is not None:
                    if rdfprefix + 'resource' in tgt.attrib:
                        tgt = 'po_' + tgt.attrib[
                            rdfprefix + 'resource'].split('_')[1]
                else:
                    notgt.append(c)

        refs = c.iter(bpprefix + 'PublicationXref')
        pmids = []

        for r in refs:

            pm = r.attrib[rdfprefix + 'ID'].split('_')

            if pm[0] == 'pmid':
                pmids.append(pm[1])

        refs = c.iter(bpprefix + 'evidence')

        for r in refs:

            rrefs = r.iter(bpprefix + 'xref')

            for rr in rrefs:

                if rdfprefix + 'resource' in rr.attrib:

                    pm = rr.attrib[rdfprefix + 'resource'].split('_')

                    if pm[0] == 'pubmed':
                        pmids.append(pm[1])

        evs = []

        for e in c.iter(bpprefix + 'evidenceCode'):

            if rdfprefix + 'resource' in e.attrib:
                evs.append(
                    ev_short[e.attrib[rdfprefix + 'resource'].split('_')[1]]
                )
            else:
                ev = e.find(bpprefix + 'EvidenceCodeVocabulary')
                evs.append(ev_short[ev.attrib[rdfprefix + 'ID'].split('_')[1]])

        for e in c.iter(bpprefix + 'evidence'):

            if rdfprefix + 'resource' in e.attrib:

                ev = e.attrib[rdfprefix + 'resource'].split('_')

                if len(ev) == 4:
                    if len(ev[3]) == 4:
                        evs.append(ev_short[ev[3]])

        if (src is not None and tgt is not None and src in proteins and
                tgt in proteins and proteins[src]['id'] is not None and
                proteins[tgt]['id'] is not None):

            edges.append({
                'src': proteins[src],
                'tgt': proteins[tgt],
                'pmids': list(set(pmids)),
                'evs': list(set(evs)),
            })

        if len(evs) == 0:
            noev.append(c)

        if len(pmids) == 0:
            norefs.append(c)

        if len(evs) == 0 and len(pmids) == 0:
            noth.append(c)

    if ncbi_tax_id:
        all_uniprots = uniprot_input.all_uniprots(organism=ncbi_tax_id)

    for e in edges:

        if (ncbi_tax_id and (e['src']['id'] not in all_uniprots or
                             e['tgt']['id'] not in all_uniprots)):
            continue

        this_iaction = [
            e['src']['id'],
            e['tgt']['id'],
            e['src']['species'],
            e['tgt']['species'],
            ';'.join(e['evs']),
            ';'.join(e['pmids']),
        ]

        if len(this_iaction[-1]) > 0:
            result_curated.append(this_iaction)
        else:
            result_noref.append(this_iaction)

    pickle.dump(result_curated, open(curated_cache, 'wb'))
    pickle.dump(result_noref, open(noref_cache, 'wb'))

    return result_curated, result_noref
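# Usage sketch (hypothetical): the first call downloads and parses the BioPAX
# dump and writes the two pickle caches; later calls with cache=True load
# the pickles instead of re-parsing.
def _example_phosphosite_interactions():
    curated, noref = phosphosite_interactions(cache=True, ncbi_tax_id=9606)
    # each record: [source UniProt, target UniProt, source taxon,
    # target taxon, ';'-joined evidence codes, ';'-joined PubMed IDs]
    return len(curated), len(noref)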
def phosphosite_regsites_one_organism(organism=9606):
    """
    Returns PhosphoSitePlus regulatory sites translated to one organism
    by orthology. Residue numbers will be translated where necessary,
    while gene symbols will be translated to UniProt IDs of the given
    organism. This works with human, mouse or rat.

    :param int organism:
        NCBI Taxonomy ID of the target organism. In this method possible
        values are human, mouse or rat, as these species provide the vast
        majority of the data, and are close enough to each other that the
        sites can be safely translated between orthologous proteins by
        sequence alignment.
    """

    def genesymbols2uniprots(genesymbols, tax):
        return set(
            itertools.chain(
                *map(
                    lambda gs: mapping.map_name(
                        gs,
                        'genesymbol',
                        'uniprot',
                        ncbi_tax_id=tax,
                    ),
                    genesymbols
                )
            )
        )

    def translate_uniprots(uniprots, homo):
        return set(
            itertools.chain(
                *map(
                    lambda usrc: homo[usrc] if usrc in homo else [],
                    uniprots
                )
            )
        )

    result = {}

    organisms = set([9606, 10090, 10116])
    mod_types = dict(common.psite_mod_types2)
    regsites = phosphosite_regsites()
    other_organisms = organisms - set([organism])

    # fixed: the local name `homology` appeared on its own right hand side
    # (`homology.homologene_uniprot_dict`), which would raise
    # UnboundLocalError; `homologene_uniprot_dict` (defined above) is
    # called directly instead
    homology = dict(
        map(
            lambda other: (
                other,
                homologene_uniprot_dict(
                    source=other,
                    target=organism,
                ),
            ),
            other_organisms
        )
    )

    ptm_homology = ptm_orthology()

    proteome = uniprot_input.all_uniprots(
        organism=organism,
        swissprot='YES',
    )

    for substrate, regs in iteritems(regsites):

        subs = []

        if substrate in proteome:
            subs = [substrate]
        else:
            for other, homo in iteritems(homology):
                if substrate in homo:
                    subs = homo[substrate]

        for sub in subs:

            if sub not in result:
                result[sub] = {}

            for reg in regs:

                reg_organism = taxonomy.taxa[reg['organism']]

                if reg_organism not in organisms:
                    continue

                mod_type = mod_types[reg['modt']]
                resnum = int(reg['res'])

                psite_key = (
                    substrate,
                    reg['isoform'],
                    reg['aa'],
                    resnum,
                    reg_organism,
                    mod_type,
                )

                if reg_organism != organism:

                    regs_target = []
                    disrupts = []
                    induces = []

                    if psite_key in ptm_homology:
                        if organism in ptm_homology[psite_key]:
                            regs_target = ptm_homology[psite_key][organism]

                    if len(regs_target):

                        disrupts = genesymbols2uniprots(
                            reg['disrupts'],
                            reg_organism,
                        )
                        disrupts = translate_uniprots(
                            disrupts,
                            homology[reg_organism],
                        )
                        induces = genesymbols2uniprots(
                            reg['induces'],
                            reg_organism,
                        )
                        induces = translate_uniprots(
                            induces,
                            homology[reg_organism],
                        )

                else:

                    regs_target = [psite_key]
                    disrupts = genesymbols2uniprots(reg['disrupts'], organism)
                    induces = genesymbols2uniprots(reg['induces'], organism)

                for regt in regs_target:

                    modkey = (regt[2], regt[3], regt[5])

                    if modkey not in result[sub]:
                        result[sub][modkey] = {
                            'induces': set(),
                            'disrupts': set(),
                            'pmids': set(),
                            'isoforms': set(),
                            'process': set(),
                            'function': set(),
                            'positive': False,
                            'negative': False,
                            'comments': [],
                        }

                    result[sub][modkey]['induces'].update(induces)
                    result[sub][modkey]['disrupts'].update(disrupts)
                    result[sub][modkey]['process'].update(reg['process'])
                    result[sub][modkey]['function'].update(reg['function'])
                    result[sub][modkey]['isoforms'].update([regt[1]])
                    result[sub][modkey]['pmids'].update(reg['pmids'])
                    result[sub][modkey]['positive'] = (
                        result[sub][modkey]['positive'] or reg['positive']
                    )
                    result[sub][modkey]['negative'] = (
                        result[sub][modkey]['negative'] or reg['negative']
                    )

                    if len(reg['comments']):
                        result[sub][modkey]['comments'].append(reg['comments'])

    return result
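# Usage sketch (hypothetical): regulatory sites translated to mouse; outer
# keys are mouse UniProt IDs, inner keys are (residue, number, modification)
# tuples built as `modkey` above, e.g. something like ('Y', 1172,
# 'phosphorylation') depending on the `psite_mod_types2` labels.
def _example_regsites_mouse():
    regsites = phosphosite_regsites_one_organism(organism=10090)
    return {
        sub: set(modkeys.keys())
        for sub, modkeys in regsites.items()
    }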
def get_pfam(uniprots=None, organism=9606):

    if uniprots is None:
        uniprots = uniprot_input.all_uniprots(
            organism=organism,
            swissprot=True,
        )

    u_pfam = {}
    pfam_u = {}

    if uniprots is not None:

        prg = progress.Progress(
            len(uniprots) / 30,
            'Downloading data from UniProt',
            1,
        )
        data_all = []

        for i in xrange(0, len(uniprots), 30):

            to = i + 30
            thisPart = uniprots[i:to]
            thisPart = ' OR '.join(['accession:%s' % u for u in thisPart])
            get = {
                'query': thisPart,
                'format': 'tab',
                'columns': 'id,database(Pfam)',
            }

            for j in xrange(3):

                c = curl.Curl(urls.urls['uniprot_basic']['url'], get=get)
                data = c.result

                if data is not None:
                    break

            if data is None:
                return None, None

            data = data.split('\n')
            del data[0]
            del data[-1]
            data_all += data
            prg.step()

        prg.terminate()

    else:

        organism = taxonomy.ensure_ncbi_tax_id(organism)

        if not organism:
            return None, None

        organismQuery = 'organism:%u AND reviewed:yes' % organism
        get = {
            'query': organismQuery,
            'format': 'tab',
            'columns': 'id,database(Pfam)',
        }

        for j in xrange(3):

            c = curl.Curl(
                urls.urls['uniprot_basic']['url'],
                get=get,
                silent=False,
                outf='uniprot-pfam-%u.tab' % organism,
            )
            data_all = c.result

            if data_all is not None:
                break

        if data_all is None:
            # fixed: was `return None`, inconsistent with the two-tuple
            # returned everywhere else in this function
            return None, None

        data_all = data_all.split('\n')
        del data_all[0]

    for l in data_all:

        l = l.split('\t')
        pfams = re.sub(';$', '', l[1]).strip()
        pfams = pfams.split(';') if pfams else []

        if l[0] not in u_pfam:
            u_pfam[l[0]] = []

        u_pfam[l[0]] += pfams

        for pfam in pfams:

            if pfam not in pfam_u:
                pfam_u[pfam] = []

            pfam_u[pfam].append(l[0])

    return u_pfam, pfam_u
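# Usage sketch (hypothetical; note the function assumes the legacy UniProt
# `tab` query API behind `urls.urls['uniprot_basic']`): fetch Pfam domains
# for a couple of SwissProt accessions.
def _example_get_pfam():
    u_pfam, pfam_u = get_pfam(uniprots=['P00533', 'P04637'])
    # u_pfam: UniProt ID -> list of Pfam accessions
    # pfam_u: Pfam accession -> list of UniProt IDs
    return u_pfam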
def locate_localizations(
        organism=9606,
        literature=True,
        external=True,
        predictions=False,
    ):

    record = collections.namedtuple(
        'LocateAnnotation',
        ('source', 'location', 'cls', 'pmid', 'score'),
    )
    # defaults for the last three fields: cls, pmid, score
    record.__new__.__defaults__ = (None, None, None)

    organism_uniprots = set(
        uniprot_input.all_uniprots(organism=organism, swissprot=True)
    )

    organism_str = taxonomy.taxids[organism]
    url = urls.urls['locate']['url'] % organism_str
    fname = url.split('/')[-1][:-4]

    c = curl.Curl(
        url,
        large=True,
        default_mode='rb',
        silent=False,
        files_needed=[fname],
    )

    parser = etree.iterparse(c.result[fname], events=('start', 'end'))

    result = collections.defaultdict(set)
    root = next(parser)
    used_elements = []

    for ev, elem in parser:

        if ev == 'end' and elem.tag == 'LOCATE_protein':

            tag_protein = elem.find('protein')
            this_uniprot = None
            this_uniprots = None
            this_entrez = None
            this_organism = (
                tag_protein.find('organism').text
                if tag_protein is not None else
                None
            )
            this_class = (
                tag_protein.find('class').text
                if tag_protein is not None else
                None
            )

            xrefs = elem.find('xrefs')

            if xrefs is None:
                continue

            for xref in xrefs.findall('xref'):

                src = xref.find('source')
                src_name = src.find('source_name').text

                if src_name == 'UniProtKB-SwissProt':
                    this_uniprot = src.find('accn').text

                if src_name == 'Entrez Gene':
                    this_entrez = src.find('accn').text

                if src_name == 'UniProt/SPTrEMBL' and this_uniprot is None:
                    this_uniprot = src.find('accn').text

            # if we don't know what it is, does not make sense to proceed
            if this_uniprot is None and this_entrez is None:
                continue

            if this_uniprot:
                this_uniprots = mapping.map_name(
                    this_uniprot,
                    'uniprot',
                    'uniprot',
                    ncbi_tax_id=organism,
                )

            if not this_uniprots and this_entrez:
                this_uniprots = mapping.map_name(
                    this_entrez,
                    'entrez',
                    'uniprot',
                    ncbi_tax_id=organism,
                )

            this_uniprots = set(this_uniprots) & organism_uniprots

            # if we don't know what it is, does not make sense to proceed
            if not this_uniprots:
                continue

            if external:

                # External database annotations
                extannot = elem.find('externalannot')

                if extannot is not None:

                    for extannotref in extannot.findall('reference'):

                        sources = []

                        for src in extannotref.findall('source'):

                            src_name = src.find('source_name')

                            if src_name is not None:
                                sources.append(src_name.text)

                        sources = ';'.join(sources) if sources else None

                        locations = extannotref.find('locations')

                        if locations is not None:

                            for location in locations.findall('location'):

                                for loc in location.iterchildren():

                                    if loc.tag[:4] == 'tier':

                                        this_loc = loc.text.lower().split(',')

                                        for uniprot in this_uniprots:
                                            for _loc in this_loc:
                                                result[uniprot].add(record(
                                                    source=sources,
                                                    location=_loc.strip(),
                                                    cls=this_class,
                                                    score=None,
                                                ))

            if predictions:

                # Predictions
                sclpred = elem.find('scl_prediction')

                if sclpred is not None:

                    for sclpred_src in sclpred.findall('source'):

                        score = float(sclpred_src.find('evaluation').text)

                        if score == 0.0:
                            continue

                        this_src = sclpred_src.find('method').text
                        this_loc = sclpred_src.find('location').text.lower()

                        if this_loc == 'no prediction':
                            continue

                        for uniprot in this_uniprots:
                            result[uniprot].add(record(
                                source=this_src,
                                location=this_loc,
                                cls=this_class,
                                score=score,
                            ))

            if literature:

                # Literature curation
                lit = elem.find('literature')

                if lit is not None:

                    for litref in lit.findall('reference'):

                        locs = set()

                        for lloc in (
                            litref.find('locations').findall('location')
                        ):

                            for loc in lloc.iterchildren():

                                if loc.tag[:4] == 'tier':
                                    locs.add(loc.text.lower())

                        pmid = litref.find('source')
                        pmid = (
                            None if pmid is None else pmid.find('accn').text
                        )

                        for loc in locs:
                            for uniprot in this_uniprots:
                                result[uniprot].add(record(
                                    source='literature',
                                    location=loc,
                                    pmid=pmid,
                                    cls=this_class,
                                    score=None,
                                ))

            used_elements.append(elem)

            # removing used elements to keep memory low
            if len(used_elements) > 1000:
                for _ in xrange(500):
                    e = used_elements.pop(0)
                    e.clear()

    # closing the XML
    c.fileobj.close()
    del c

    return result
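# Usage sketch (hypothetical): literature-only LOCATE annotations for human;
# `locate_localizations` returns a defaultdict mapping UniProt IDs to sets
# of `LocateAnnotation` records.
def _example_locate_literature():
    annotations = locate_localizations(
        organism=9606,
        literature=True,
        external=False,
        predictions=False,
    )
    return {
        uniprot: {a.location for a in recs}
        for uniprot, recs in annotations.items()
    }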