def pathwayInfo(code):
    """Return ``[code, name, class]`` describing one KEGG pathway.

    Args:
        code: KEGG pathway identifier passed to ``KEGG.get``.

    Returns:
        A three-item list: the code itself, the first ``NAME`` value and the
        stringified ``CLASS`` value. Commas in both values are replaced by
        semicolons so they cannot break comma-separated output later on.
        A field that is absent is reported as ``'NA'``; when KEGG answers
        with an integer status instead of an entry, both fields are ``'NA'``.
    """
    # Initialize searcher and fetch the raw entry
    kSearcher = KEGG()
    result = kSearcher.get(code)

    # KEGG hands back an int when the code matches nothing (enzymeInfo
    # checks the same condition); there is nothing to parse in that case.
    if isinstance(result, int):
        return [code, 'NA', 'NA']

    dictResult = kSearcher.parse(result)

    # The code always comes first in the returned list
    pathwayList = [code]

    # Pathway name: only the first name is kept
    if 'NAME' in dictResult:
        nameStr = str(dictResult['NAME'][0].replace(',', ';'))
        pathwayList.append(nameStr)
    else:
        pathwayList.append('NA')

    # Pathway class: the whole value is stringified
    if 'CLASS' in dictResult:
        classStr = str(dictResult['CLASS']).replace(',', ';')
        pathwayList.append(classStr)
    else:
        pathwayList.append('NA')

    return pathwayList
def get_kegg_info(stId):
    """Fetch the KEGG entry for ``stId`` and return whatever ``KEGG.parse`` yields for it."""
    client = KEGG()
    return client.parse(client.get(stId))
def get_genes_from_kegg_pathway(pathway):
    """Return the second column of the ``GENE`` section of a KEGG pathway.

    The pathway entry is fetched with the client's organism set to ``'hsa'``.
    Every ``GENE`` item is split on a single space; the split rows are
    transposed and must yield exactly two columns, of which the second is
    returned as a tuple.
    """
    from bioservices.kegg import KEGG

    client = KEGG()
    client.organism = 'hsa'

    raw_entry = client.get(pathway)
    gene_items = client.parse(raw_entry)['GENE']

    split_rows = [item.split(' ') for item in gene_items]
    # Transpose rows into columns; anything other than two columns fails here.
    _first_column, symbol = zip(*split_rows)
    return symbol
def get_single_compound_metadata_online(compound_id):
    """Look up one compound online, choosing the service from the id prefix.

    Ids starting with ``C`` (case-insensitive) are fetched from KEGG and
    returned parsed; every other id is prefixed with ``CHEBI:`` and resolved
    through ``ChEBI.getCompleteEntity``.
    """
    if not compound_id.upper().startswith('C'):
        chebi_client = ChEBI()
        return chebi_client.getCompleteEntity('CHEBI:' + compound_id)

    kegg_client = KEGG()
    return kegg_client.parse(kegg_client.get(compound_id))
def extract_orthologs(filename):
    '''
    Build a dictionary mapping each KEGG id to its orthologs.

    arg: filename -- tab-separated file with a ``kegg_id`` column
    return: dict ``{kegg_id: [[org:gene, ...], ...]}``; one inner list per
            organism of the entry's ``GENES`` section whose lower-cased code
            is listed in the ``KEGG`` column of ``gammaproteo.csv``

    Rows whose id is the literal ``"no"`` and ids for which the parsed
    result is an int are skipped.
    '''
    orthos_dict = {}
    k = KEGG()

    # ``tupleize_cols`` is not passed any more: pandas removed that keyword
    # in 1.0, so supplying it raises TypeError on current releases.
    df = pd.read_csv(filename, sep="\t")
    df_gamma = pd.read_csv('gammaproteo.csv', sep="\t")
    # A set gives constant-time membership tests inside the loops below.
    gamma_codes = set(df_gamma['KEGG'].tolist())

    # loop through keggid to get orthologs
    for keggid in df['kegg_id']:
        if keggid == "no":
            continue
        print(str(keggid))

        # get orthologs on kegg
        data = k.get(keggid)
        dict_data = k.parse(data)
        if isinstance(dict_data, int):
            continue

        ortho_list = []
        # keep only organisms present in the reference list
        for key, value in dict_data['GENES'].items():
            organism = key.lower()
            if organism not in gamma_codes:
                continue
            # gene ids are the whitespace-separated tokens before the first '('
            gene_ids = value.split('(')[0].split()
            ortho_list.append([organism + ":" + gene_id for gene_id in gene_ids])

        orthos_dict[keggid] = ortho_list

    return orthos_dict
def id2seq(self, hsa):
    """Write the amino-acid sequence of KEGG entry ``hsa`` to ``dummy.txt``.

    The file is overwritten with a FASTA-style record: a ``>`` header
    carrying ``hsa`` followed by the ``AASEQ`` value with all whitespace
    removed. When the parsed entry has no usable ``AASEQ`` value the
    sequence part is left empty.

    Returns:
        None
    """
    s = KEGG()
    dict_d = s.parse(s.get(hsa))

    try:
        seq = re.sub(r'\s+', '', dict_d['AASEQ'])
    except (KeyError, TypeError):
        # KeyError: entry has no AASEQ field.
        # TypeError: the parsed result is not subscriptable by key (e.g. an
        # int status) or the field is not a string.
        seq = ''

    # The context manager closes the file even if the write fails.
    with open("dummy.txt", "w") as text_file:
        text_file.write('>' + str(hsa) + '\n' + seq)
    return None
def get_metabs(KEGG, reac_id):
    """Split a KEGG reaction equation into substrates and products.

    Args:
        KEGG: client object exposing ``get`` and ``parse``.
        reac_id: KEGG reaction identifier.

    Returns:
        ``[substrates, products]`` -- two lists of stripped terms taken from
        the left and right side of ``<=>`` in the entry's ``EQUATION`` field.
    """
    def _terms(side):
        # Metabolites on one side are separated by plus signs.
        return [term.strip() for term in side.split('+')]

    equation = KEGG.parse(KEGG.get(reac_id))['EQUATION']
    sides = equation.split('<=>')

    substrates = _terms(sides[0])
    products = _terms(sides[1])
    return [substrates, products]
def get_compound_metadata_online(kegg_ids):
    """Fetch a display name for every KEGG id in ``kegg_ids``.

    Args:
        kegg_ids: sequence of KEGG identifiers.

    Returns:
        dict mapping each id to ``{'display_name': name}`` where ``name`` is
        the first ``NAME`` value with every ``;`` removed. Ids whose lookup
        raises ``TypeError`` are reported on stdout and left out of the map.
    """
    s = KEGG()
    metadata_map = {}
    total = len(kegg_ids)

    for i, kegg_id in enumerate(kegg_ids):
        if i % 10 == 0:
            print("Retrieving %d/%d KEGG records" % (i, total))

        # Reset per record so the error message below can never show the
        # previous record's data (or hit an unbound name on the first one).
        d = None
        try:
            res = s.get(kegg_id)
            d = s.parse(res)
            first_name = d['NAME'][0]
            first_name = first_name.replace(';', '')  # drop ';' separators
            metadata_map[kegg_id] = {'display_name': first_name}
        except TypeError:
            print('kegg_id=%s parsed_data=%s' % (kegg_id, d))

    return metadata_map
def get_seq(filename):
    '''
    Group the sequences of one alignment by organism.

    arg: filename -- FASTA file name inside ``alignments_nogaps/``; record
         ids are expected to look like ``<org>_<gene>``
    return: dict mapping an organism name (second and third word of the
            KEGG ``ORGANISM`` field) to a list of sequences, each sequence
            being a list of single characters

    Records for which KEGG returns an int are printed and skipped.
    '''
    k = KEGG()
    alignment_path = os.path.join('alignments_nogaps/', filename)
    orgdict = {}

    # go through sequences and search for organism name on kegg
    for record in SeqIO.parse(alignment_path, "fasta"):
        idsplit = record.id.split('_', 1)
        # KEGG ids use ':' where the record id uses the first '_'
        kegg_id = idsplit[0] + ':' + idsplit[1]

        handle = k.get(kegg_id)
        if isinstance(handle, int):
            print(kegg_id)
            continue

        org_words = k.parse(handle)['ORGANISM'].split()
        org = org_words[1] + " " + org_words[2]

        # Group while reading instead of rescanning all organisms afterwards.
        orgdict.setdefault(org, []).append(list(str(record.seq)))

    return orgdict
def queryKegg(theIDs):
    """Query KEGG for each id and return the parsed entries with the trimmed ids.

    For every element of ``theIDs`` the first three characters are dropped,
    the remainder is searched in the ``acb`` database with ``KEGG.find``, and
    the text before the first tab of the search result is fetched and parsed.

    Returns:
        ``(keggData, IDlist)`` -- parsed entries and the trimmed ids, in
        input order.
    """
    print("Currently querying KEGG...")
    client = KEGG()
    keggData = []
    IDlist = []

    for raw_id in theIDs:
        trimmed = raw_id[3:]
        first_hit = client.find("acb", trimmed).split('\t')[0]
        keggData.append(client.parse(client.get(first_hit)))
        IDlist.append(trimmed)

    return keggData, IDlist
def main():
    """Download the parsed KEGG entry of every gene and dump them to JSON.

    Reads the gene ids (keys) from ``hsa_gene_list.json``, fetches and parses
    each one through KEGG, and writes the ``{gene: parsed_entry}`` mapping to
    ``ginfo.json``.
    """
    # Start KEGG interface
    k = KEGG()

    # Read in KEGG gene ID & gene symbol pairs
    with open("hsa_gene_list.json", "r") as g:
        gene_data = json.load(g)

    # Create a dict to store final result
    data = {}
    for gene in gene_data:
        # print() with one argument behaves the same on Python 2 and 3,
        # unlike the ``print gene`` statement, which Python 3 rejects.
        print(gene)
        data[gene] = k.parse(k.get(gene))

    with open('ginfo.json', 'w') as fw:
        json.dump(data, fw)
def get_reaction_ECs_from_kegg(self):
    """Populate ``self.reaction_ECs`` with the ``ENZYME`` values KEGG reports.

    ``self.reaction_ECs`` is reset to an empty ``defaultdict(set)``. Each item
    of ``self.model.reactions`` is split on single spaces into reaction ids;
    ids not yet present are looked up and the enzyme values gathered so far
    for the current item are added under that id. Any exception raised while
    handling an item is printed and the next item is processed.
    """
    self.reaction_ECs = defaultdict(set)
    kegg_client = KEGG()

    for reaction in self.model.reactions:
        # Accumulates across all ids belonging to the same model reaction.
        gathered = []
        try:
            for reaction_id in reaction.split(" "):
                if reaction_id in self.reaction_ECs:
                    continue
                print("KEGG reaction", reaction_id)
                gathered += kegg_client.parse(kegg_client.get(reaction_id))['ENZYME']
                for ec_number in gathered:
                    self.reaction_ECs[reaction_id].add(ec_number)
        except Exception as inst:
            print(inst)

    print("EC data loaded from KEGG")
def mapSpecies(mousepeptrackfilename):
    """Add human UniProt accessions to a mouse peptide TSV and cache human annotations.

    Steps, in order:
      1. Read the unique ``Gene`` values of ``mousepeptrackfilename`` and send
         them, 1000 at a time, to the UniProt ``uploadlists`` endpoint
         (GENENAME -> ACC); keep reviewed rows whose organism id is 9606.
      2. Rewrite ``mousepeptrackfilename`` in place with the columns listed in
         ``colname``, the last one being 'Human UniProtKB Accession'.
      3. For every distinct mapped accession download its UniProt XML and
         collect protein, gene and organism names, the taxonomy id, diseases,
         DrugBank links and GO annotations; pickle the resulting dict to
         ``humanUniprotfuncinfodic.obj``.
      4. Map the accessions to KEGG ids, 2000 at a time, fetch each KEGG entry
         and pickle ``{accession: pathway values}`` to ``humankeggdic.obj``.

    Nothing is returned; the results are the rewritten input file and the two
    pickle files.

    NOTE(review): this block relies on Python 2 only constructs (``urllib2``,
    ``httplib``, ``urllib.urlencode``, the ``print`` statement, ``'wb'`` for
    ``csv.writer``).
    """
    #increase the field size of CSV
    csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
    uniproturl = 'https://www.uniprot.org/uploadlists/'
    # seconds to wait before repeating a failed UniProt request
    RETRY_TIME = 20.0
    mdf= pd.read_csv(mousepeptrackfilename, delimiter='\t')
    mouseGenes=list(mdf['Gene'].unique())
    # drop missing gene names, which stringify as 'nan'
    mouseGenes=[g for g in mouseGenes if str(g) !='nan']
    # gene name -> human UniProt accession
    mousehumandic={}
    for gx in range(0,len(mouseGenes),1000):
        genecodes=' '.join(mouseGenes[gx:gx+1000])
        geneuniprotparams = {
            'from':'GENENAME',
            'to':'ACC',
            'format':'tab',
            'query':genecodes,
            'columns':'id,genes(PREFERRED),organism-id,reviewed'
        }
        # repeat the request until it goes through
        while True:
            try:
                geneuniprotdata = urllib.urlencode(geneuniprotparams)
                geneuniprotrequest = urllib2.Request(uniproturl, geneuniprotdata)
                geneuniprotresponse = urllib2.urlopen(geneuniprotrequest)
                for guniprotline in geneuniprotresponse:
                    gudata=guniprotline.strip()
                    # skip the header row
                    if not gudata.startswith("Entry"):
                        guinfo=gudata.split("\t")
                        # keep rows with organism id 9606 that are reviewed and whose
                        # last column equals the second column (case-insensitive)
                        if '9606' == guinfo[2].lower() and 'reviewed' == guinfo[3].lower() and guinfo[-1].lower() ==guinfo[1].lower() and len(guinfo[0].strip())>1:
                            mousehumandic[guinfo[-1].strip()]=guinfo[0].strip()
                break
            except urllib2.HTTPError:
                time.sleep(RETRY_TIME)
                print ('Hey, I am trying again until succeeds to get data from uniprot data!',str(datetime.datetime.now()))
            except httplib.BadStatusLine:
                time.sleep(RETRY_TIME)
                print ('Hey, I am trying again until succeeds to get data from uniprot data!',str(datetime.datetime.now()))
    # output columns; all but the last are read from the input file by name
    colname=['UniProtKB Accession','Protein','Gene','Organism','Peptide Sequence','Summary Concentration Range Data','All Concentration Range Data','All Concentration Range Data-Sample LLOQ Based','Peptide ID',\
    'Special Residues','Molecular Weight','GRAVY Score','Transitions','Retention Time','Analytical inofrmation',\
    'Gradients','AAA Concentration','CZE Purity','Panel','Knockout','LLOQ','ULOQ','Sample LLOQ','Protocol','Trypsin','QC. Conc. Data','Human UniProtKB Accession']
    finalresult=[]
    finalresult.append(colname)
    humanUniprotID=[]
    with open(mousepeptrackfilename) as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            templist=[]
            for i in colname[:-1]:
                tempdata=str(row[i]).strip()
                templist.append(tempdata)
            # templist[2] is the 'Gene' column
            # NOTE(review): nesting of the else below was reconstructed from
            # flattened source -- confirm it pairs with the inner if
            if len(str(templist[2]).strip())>0:
                if templist[2] in mousehumandic:
                    huUniId=mousehumandic[templist[2]]
                    humanUniprotID.append(huUniId)
                    templist.append(huUniId)
                else:
                    templist.append('NA')
            finalresult.append(templist)
    # overwrite the input file with the extended table
    with open(mousepeptrackfilename,'wb') as pf:
        pwriter =csv.writer(pf,delimiter='\t')
        pwriter.writerows(finalresult)
    unqhumanUniprotID=list(set(humanUniprotID))
    # accession -> [PN, GN, OG, OGID, diseases, drugbank, GO ids, GO names, GO terms]
    humanUniprotfuncinfodic={}
    countProt=0
    for subcode in unqhumanUniprotID:
        # pause between UniProt requests
        time.sleep(2)
        drugbanklist=[]
        PN='NA'
        GN='NA'
        OG='NA'
        OGID='NA'
        dislist=[]
        GoIDList=[]
        GoNamList=[]
        GoTermList=[]
        try:
            countProt+=1
            if countProt%1000 ==0:
                print str(countProt), "th protein Protein Name, Gene, Organism Name,drug bank data,disease data job starts",str(datetime.datetime.now())
            SGrequestURL="https://www.uniprot.org/uniprot/"+str(subcode)+".xml"
            SGunifile=urllib.urlopen(SGrequestURL)
            SGunidata= SGunifile.read()
            SGunifile.close()
            try:
                SGunidata=minidom.parseString(SGunidata)
                # first pass over dbReference: DrugBank links and taxonomy id
                try:
                    drugdata=(SGunidata.getElementsByTagName('dbReference'))
                    for duItem in drugdata:
                        if (duItem.attributes['type'].value).upper() == 'DRUGBANK':
                            try:
                                drugname=(str(duItem.getElementsByTagName('property')[0].attributes['value'].value).strip())
                                drugid=str(duItem.attributes['id'].value).strip()
                                durl='<a target="_blank" href="https://www.drugbank.ca/drugs/'+drugid+'">'+drugname+'</a>'
                                drugbanklist.append(durl)
                            except:
                                pass
                        if (duItem.attributes['type'].value).strip() == 'NCBI Taxonomy':
                            try:
                                OGID=str(duItem.attributes['id'].value).strip()
                            except:
                                pass
                except IndexError:
                    pass
                # second pass over dbReference: GO annotations (and taxonomy id again)
                try:
                    godata=(SGunidata.getElementsByTagName('dbReference'))
                    for gItem in godata:
                        if (gItem.attributes['type'].value).upper() == 'GO':
                            try:
                                # the first property value is split on ':' into
                                # a one-letter aspect code and the GO name
                                gonamedetails=(str(gItem.getElementsByTagName('property')[0].attributes['value'].value).strip()).split(':')[1]
                                gotermdetails=(str(gItem.getElementsByTagName('property')[0].attributes['value'].value).strip()).split(':')[0]
                                GoNamList.append(gonamedetails)
                                goid=str(gItem.attributes['id'].value).strip()
                                GoIDList.append(goid)
                                if gotermdetails.lower()=='p':
                                    GoTermList.append('Biological Process')
                                if gotermdetails.lower()=='f':
                                    GoTermList.append('Molecular Function')
                                if gotermdetails.lower()=='c':
                                    GoTermList.append('Cellular Component')
                            except:
                                pass
                        if (gItem.attributes['type'].value).strip() == 'NCBI Taxonomy':
                            try:
                                OGID=str(gItem.attributes['id'].value).strip()
                            except:
                                pass
                except IndexError:
                    pass
                # protein name: recommendedName first, submittedName as fallback
                try:
                    try:
                        PN=(((SGunidata.getElementsByTagName('protein')[0]).getElementsByTagName('recommendedName')[0]).getElementsByTagName('fullName')[0]).firstChild.nodeValue
                    except:
                        PN=(((SGunidata.getElementsByTagName('protein')[0]).getElementsByTagName('submittedName')[0]).getElementsByTagName('fullName')[0]).firstChild.nodeValue
                except IndexError:
                    pass
                # gene name: first <name> of the first <gene>
                try:
                    try:
                        GN=((SGunidata.getElementsByTagName('gene')[0]).getElementsByTagName('name')[0]).firstChild.nodeValue
                    except:
                        GN='NA'
                except IndexError:
                    pass
                # organism name: first <name> of the first <organism>
                try:
                    try:
                        OG=((SGunidata.getElementsByTagName('organism')[0]).getElementsByTagName('name')[0]).firstChild.nodeValue
                    except:
                        OG='NA'
                except IndexError:
                    pass
                # diseases, stored as 'name(acronym)'
                try:
                    disdata=SGunidata.getElementsByTagName('disease')
                    for dItem in disdata:
                        disname=''
                        disshort=''
                        try:
                            disname=(dItem.getElementsByTagName('name')[0]).firstChild.nodeValue
                        except:
                            pass
                        try:
                            disshort=(dItem.getElementsByTagName('acronym')[0]).firstChild.nodeValue
                        except:
                            pass
                        if len(disname.strip())>0:
                            dislist.append(str(disname.strip())+'('+str(disshort)+')')
                except IndexError:
                    pass
            except ExpatError:
                # XML that cannot be parsed leaves every field at its default
                pass
        except IOError:
            # a failed download leaves every field at its default
            pass
        # collapse each list into a '|'-joined string of distinct values ('NA' if empty)
        drugbankdata='NA'
        disdata='NA'
        goiddata='NA'
        gonamedata='NA'
        gotermdata='NA'
        if len(GoIDList)>0:
            goiddata='|'.join(list(set(GoIDList)))
        if len(GoNamList)>0:
            gonamedata='|'.join(list(set(GoNamList)))
        if len(GoTermList)>0:
            gotermdata='|'.join(list(set(GoTermList)))
        if len(drugbanklist)>0:
            drugbankdata='|'.join(list(set(drugbanklist)))
        if len(dislist)>0:
            disdata='|'.join(list(set(dislist)))
        humanUniprotfuncinfodic[subcode]=[PN,GN,OG,OGID,disdata,drugbankdata,goiddata,gonamedata,gotermdata]
    hudicfile='humanUniprotfuncinfodic.obj'
    hudicf = open(hudicfile, 'wb')
    pickle.dump(humanUniprotfuncinfodic, hudicf , pickle.HIGHEST_PROTOCOL)
    hudicf.close()
    print ("Extracting KEGG pathway name, job starts",str(datetime.datetime.now()))
    # accession -> values of the KEGG entry's PATHWAY field
    hkeggdictfile={}
    hk = KEGG()
    for hkx in range(0,len(unqhumanUniprotID),2000):
        # NOTE(review): countProt carries over from the XML loop above and is
        # advanced by hkx+2000 per batch, so the modulo test below is not a
        # true per-protein progress count
        countProt+=hkx+2000
        if countProt%2000 ==0:
            print (str(countProt), "th protein kegg job starts",str(datetime.datetime.now()))
        huniprotcodes=' '.join(unqhumanUniprotID[hkx:hkx+2000])
        huniprotparams = {
            'from':'ACC',
            'to':'KEGG_ID',
            'format':'tab',
            'query':huniprotcodes
        }
        # repeat the request until it goes through
        while True:
            try:
                hkuniprotdata = urllib.urlencode(huniprotparams)
                hkuniprotrequest = urllib2.Request(uniproturl, hkuniprotdata)
                hkuniprotresponse = urllib2.urlopen(hkuniprotrequest)
                for hkuniprotline in hkuniprotresponse:
                    hkudata=hkuniprotline.strip()
                    # skip the header row
                    if not hkudata.startswith("From"):
                        hkuinfo=hkudata.split("\t")
                        if len(hkuinfo[1].strip()):
                            hkegg=hk.get(hkuinfo[1].strip())
                            hkudict_data = hk.parse(hkegg)
                            # TypeError: parsed result cannot be indexed by key;
                            # KeyError: entry has no PATHWAY field
                            try:
                                try:
                                    if len(str(hkuinfo[0]).strip()) >5:
                                        hkeggdictfile[hkuinfo[0].strip()]=hkudict_data['PATHWAY'].values()
                                except TypeError:
                                    pass
                            except KeyError:
                                pass
                break
            except urllib2.HTTPError:
                time.sleep(RETRY_TIME)
                print ('Hey, I am trying again until succeeds to get data from KEGG!',str(datetime.datetime.now()))
                pass
    hkdicfile='humankeggdic.obj'
    hkdicf = open(hkdicfile, 'wb')
    pickle.dump(hkeggdictfile, hkdicf , pickle.HIGHEST_PROTOCOL)
    hkdicf.close()
from bioservices.kegg import KEGG

# Scratch script: fetch the KEGG entry K00855, show its parsed form and the
# interactive help for the parsed object, then save the entry's field names
# (one per line) to play.out.
k = KEGG()
path = k.get("K00855")
kdict = k.parse(path)

print(kdict)
help(kdict)

field_names = "\n".join(kdict.keys())
with open("play.out", "wt") as result:
    result.write(field_names)
ko ) #here append may be used, bc only one item is added at a time else: EC2KO_dic[item] = [ko] else: KO_dic[ko] = [definition] #here a list of KOs is made to be searched for - if pathway is defined kostoget = {} if args.pathway != 'none': output = open('eclist.txt', 'w') kegg = KEGG() pathway = kegg.get(args.pathway) dict_data = kegg.parse(pathway) if dict_data == 404: print("WARNING: BAD PATHWAY SUBMITTED TO KEGG!") elif dict_data == None: print("WARNING: ERROR CONNECTING TO KEGG SERVER!") #print(dict_data) print("ECQUERY: PROCESSING PATHWAY/MODULE") try: for key in dict_data['ORTHOLOGY'].keys(): print("adding ortholog {} to eclist".format(key)) value = dict_data['ORTHOLOGY'][key] print(value) if "," in key: for item in key.split(","): kostoget[item] = value output.write(item + '\n')
from bioservices.kegg import KEGG

# Script: list every module of KEGG pathway ko01230 together with its
# orthologs. One tab-separated line per module is written to modules.txt:
# module id, NAME, DEFINITION and the orthologs joined by "//", each ortholog
# rendered as "<id>_<description>". modules_dict maps ortholog id -> module id.
kegg = KEGG()
pathway = kegg.get("ko01230")
dict_data = kegg.parse(pathway)
print(dict_data)

output = open("modules.txt", "w")
modules_dict = {}

for key in dict_data['MODULE'].keys():
    pathway = kegg.get(key)
    module_data = kegg.parse(pathway)
    #print(module_data)

    orthology = module_data['ORTHOLOGY']
    orthologs = []
    for ortholog in orthology.keys():
        data = [ortholog, orthology[ortholog]]
        orthologs.append("_".join(data))
        modules_dict[ortholog] = key

    joined_orthologs = "//".join(orthologs)
    output.write('{}\t{}\t{}\t{}\n'.format(
        key, module_data['NAME'], module_data['DEFINITION'], joined_orthologs))
def enzymeInfo(code, ignored, stats, verbosity):
    """Collect information about one enzyme and each of its pathways.

    Args:
        code: enzyme identifier passed to ``KEGG.get``.
        ignored: container of pathway ids that must not be looked up.
        stats: dict of counters and lists, updated in place
            (``NB_PATHWAY``, ``ENZYME_ONLY_IGNORED_PATHWAY``,
            ``LIST_ENZYME_ONLY_IGNORED_PATHWAY``, ``MISSING_PATHWAY_IN_KEGG``,
            ``LIST_MISSING_PATHWAY_IN_KEGG``).
        verbosity: forwarded to ``KEGG(verbose=...)``.

    Returns:
        ``False`` when KEGG answers with an int (unknown code). Otherwise a
        list of lists: first the enzyme row ``[code, names, definition]``,
        then one row per pathway as produced by ``pathwayInfo``; a single
        ``['NA']`` row stands in when the enzyme has no pathway or when its
        only pathway is ignored.
    """
    searcher = KEGG(verbose=verbosity)

    print(f"[+] Get info about enzyme {code}")
    raw_entry = searcher.get(code)

    # An int means the code matched nothing in the KEGG databases.
    if type(raw_entry) is int:
        return False

    entry = searcher.parse(raw_entry)
    fields = entry.keys()

    # Names: stringified list without the surrounding "['...']", with commas
    # turned into semicolons so later column formatting is not broken.
    if 'NAME' in fields:
        names = str(entry['NAME']).strip("'[]'").replace(',', ';')
    else:
        names = 'NA'

    # Definition: same comma replacement.
    if 'DEFINITION' in fields:
        definition = str(entry['DEFINITION']).replace(',', ';')
    else:
        definition = 'NA'

    # The enzyme's own row always comes first.
    finalList = [[code, names, definition]]

    if 'PATHWAY' not in fields:
        print(f"[!] No pathway detected for enzyme {code}\n")
        stats['MISSING_PATHWAY_IN_KEGG'] += 1
        stats['LIST_MISSING_PATHWAY_IN_KEGG'].append(code)
        # Placeholder pathway row.
        finalList.append(['NA'])
        return finalList

    pathway_ids = list(entry['PATHWAY'].keys())
    single_pathway = len(pathway_ids) == 1

    for pathway in pathway_ids:
        if pathway not in ignored:
            print(f" [-] Get info about {pathway} pathway")
            pathway_row = pathwayInfo(pathway)
            stats['NB_PATHWAY'] += 1
            finalList.append(pathway_row)
        elif single_pathway:
            # The enzyme's only pathway is an ignored one.
            print(
                f" [!] Enzyme {code} have only 1 pathway : {pathway}")
            print(f" [!] and this pathway is ignored")
            stats['ENZYME_ONLY_IGNORED_PATHWAY'] += 1
            stats['LIST_ENZYME_ONLY_IGNORED_PATHWAY'].append(code)
            # Placeholder pathway row.
            finalList.append(['NA'])
        else:
            print(f" [!] Ignored pathway : {pathway}")

    return finalList
def mapSpecies(mousepeptrackfilename):
    """Add human UniProt accessions to a mouse peptide TSV and cache human annotations.

    Steps, in order:
      1. Download the mouse/human homology report from informatics.jax.org to
         ``MouseToHuman.tsv`` in the current directory and build a mouse ->
         human mapping from its 'SWISS_PROT IDs' column, pairing rows that
         share a 'HomoloGene ID'.
      2. Rewrite ``mousepeptrackfilename`` in place with the columns listed in
         ``colname``, the last one being 'Human UniProtKB Accession'.
      3. For every distinct mapped accession download its UniProt XML and
         collect protein, gene and organism names, the taxonomy id, diseases,
         DrugBank links and GO annotations, merge in the entries that the
         ``disGenData()`` pickle holds for the gene name, and pickle the
         resulting dict to ``humanUniprotfuncinfodic.obj``.
      4. Map the accessions to KEGG ids, 2000 at a time, fetch each KEGG entry
         and pickle ``{accession: [pathway values, 'id;name|...']}`` to
         ``humankeggdic.obj``.

    Nothing is returned; the results are the rewritten input file, the
    downloaded report and the two pickle files.

    NOTE(review): this block relies on Python 2 only constructs (``urllib2``,
    ``urllib.urlretrieve``, ``cPickle``, the ``print`` statement, ``'wb'`` for
    ``csv.writer``).
    """
    # seconds to wait before repeating a failed UniProt request
    RETRY_TIME = 20.0
    mouseTohumanfilepath = os.path.join(os.getcwd(), 'MouseToHuman.tsv')
    print("Extracting Mouse to Human Map data, job starts",
          str(datetime.datetime.now()))
    #increase the field size of CSV
    csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
    # a failed download is only reported; the open() below then uses
    # whatever file is already on disk, if any
    try:
        urllib.urlretrieve(
            'http://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt',
            mouseTohumanfilepath)
        urllib.urlcleanup()
    except:
        print("Can't able to download MouseToHuman.tsv file!!")
    # columns read from the homology report, in this order
    colnameMousHu = [
        'HomoloGene ID', 'Common Organism Name', 'NCBI Taxon ID', 'Symbol',
        'EntrezGene ID', 'Mouse MGI ID', 'HGNC ID', 'OMIM Gene ID',
        'Genetic Location', 'Genomic Coordinates (mouse: , human: )',
        'Nucleotide RefSeq IDs', 'Protein RefSeq IDs', 'SWISS_PROT IDs'
    ]
    mouseHumandata = []
    homologID = []
    with open(mouseTohumanfilepath) as mhtsvfile:
        mhreader = csv.DictReader(mhtsvfile, delimiter='\t')
        for mhrow in mhreader:
            mhtemplist = []
            for i in colnameMousHu:
                mhtempdata = str(mhrow[i]).strip()
                mhtemplist.append(mhtempdata)
            # keep only rows that carry a 'SWISS_PROT IDs' value
            if len(mhtemplist[-1].strip()) > 0:
                homologID.append(mhtemplist[0])
                mouseHumandata.append(mhtemplist)
    homologID = list(set(homologID))
    homologID.sort()
    # mouse 'SWISS_PROT IDs' value -> human 'SWISS_PROT IDs' value
    mousehumandic = {}
    for homologidItem in homologID:
        tempHumanHomoUniID = ''
        tempMouseHomoUniID = ''
        # rows whose organism name contains 'mouse' supply the mouse id,
        # all other rows of the group supply the human id (last row wins)
        for item in mouseHumandata:
            if homologidItem == item[0]:
                if 'mouse' in item[1].strip().lower():
                    tempMouseHomoUniID = item[-1].strip()
                else:
                    tempHumanHomoUniID = item[-1].strip()
        if len(tempMouseHomoUniID.strip()) > 0 and len(
                tempHumanHomoUniID.strip()) > 0 and tempHumanHomoUniID.strip(
                ).upper() != 'NA':
            mousehumandic[tempMouseHomoUniID] = tempHumanHomoUniID
    # output columns; all but the last are read from the input file by name
    colname=['UniProtKB Accession','Protein','Gene','Organism','Peptide Sequence','Summary Concentration Range Data','All Concentration Range Data','All Concentration Range Data-Sample LLOQ Based','Peptide ID',\
    'Special Residues','Molecular Weight','GRAVY Score','Transitions','Retention Time','Analytical inofrmation',\
    'Gradients','AAA Concentration','CZE Purity','Panel','Knockout','LLOQ','ULOQ','Sample LLOQ','Protocol','Trypsin','QC. Conc. Data','Human UniProtKB Accession']
    finalresult = []
    finalresult.append(colname)
    humanUniprotID = []
    with open(mousepeptrackfilename) as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            templist = []
            for i in colname[:-1]:
                tempdata = str(row[i]).strip()
                templist.append(tempdata)
            # templist[0] is the 'UniProtKB Accession' column; the part
            # before the first '-' is used as the lookup key
            # NOTE(review): nesting of the else below was reconstructed from
            # flattened source -- confirm it pairs with the inner if
            if len(str(templist[0]).strip()) > 0:
                if templist[0].split('-')[0] in mousehumandic:
                    humanUniprotID.append(
                        mousehumandic[templist[0].split('-')[0]])
                    templist.append(mousehumandic[templist[0].split('-')[0]])
                else:
                    templist.append('NA')
            finalresult.append(templist)
    # overwrite the input file with the extended table
    with open(mousepeptrackfilename, 'wb') as pf:
        pwriter = csv.writer(pf, delimiter='\t')
        pwriter.writerows(finalresult)
    # disGenData() is defined outside this block; its return value is used
    # as the path of a pickle file
    disGenDataDicName = disGenData()
    #disGenDataDicName='disGen.obj'
    # disGenDataDic is indexed below by gene name, then by position 0 and 1
    disGenDataDic = cPickle.load(open(disGenDataDicName, 'rb'))
    unqhumanUniprotID = list(set(humanUniprotID))
    # accession -> list of 14 annotation fields (see the assignment below)
    humanUniprotfuncinfodic = {}
    countProt = 0
    for subcode in unqhumanUniprotID:
        # pause between UniProt requests
        time.sleep(2)
        drugbanklist = []
        PN = 'NA'
        GN = 'NA'
        OG = 'NA'
        OGID = 'NA'
        dislist = []
        unidislist = []
        unidisURLlist = []
        disgendislist = []
        disgendisURLlist = []
        GoIDList = []
        GoNamList = []
        GoTermList = []
        GOinfo = []
        try:
            countProt += 1
            if countProt % 1000 == 0:
                print str(
                    countProt
                ), "th protein Protein Name, Gene, Organism Name,drug bank data,disease data job starts", str(
                    datetime.datetime.now())
            SGrequestURL = "https://www.uniprot.org/uniprot/" + str(
                subcode) + ".xml"
            SGunifile = urllib.urlopen(SGrequestURL)
            SGunidata = SGunifile.read()
            SGunifile.close()
            try:
                SGunidata = minidom.parseString(SGunidata)
                # first pass over dbReference: DrugBank links and taxonomy id
                try:
                    drugdata = (SGunidata.getElementsByTagName('dbReference'))
                    for duItem in drugdata:
                        if (duItem.attributes['type'].value
                            ).upper() == 'DRUGBANK':
                            try:
                                drugname = (str(
                                    duItem.getElementsByTagName('property')
                                    [0].attributes['value'].value).strip())
                                drugid = str(
                                    duItem.attributes['id'].value).strip()
                                durl = '<a target="_blank" href="https://www.drugbank.ca/drugs/' + drugid + '">' + drugname + '</a>'
                                drugbanklist.append(durl)
                            except:
                                pass
                        if (duItem.attributes['type'].value
                            ).strip() == 'NCBI Taxonomy':
                            try:
                                OGID = str(
                                    duItem.attributes['id'].value).strip()
                            except:
                                pass
                except IndexError:
                    pass
                # second pass over dbReference: GO annotations (and taxonomy id again)
                try:
                    godata = (SGunidata.getElementsByTagName('dbReference'))
                    for gItem in godata:
                        if (gItem.attributes['type'].value).upper() == 'GO':
                            try:
                                # the first property value is split on ':' into
                                # a one-letter aspect code and the GO name
                                gonamedetails = (str(
                                    gItem.getElementsByTagName('property')
                                    [0].attributes['value'].value).strip()
                                                 ).split(':')[1]
                                gotermdetails = (str(
                                    gItem.getElementsByTagName('property')
                                    [0].attributes['value'].value).strip()
                                                 ).split(':')[0]
                                GoNamList.append(gonamedetails)
                                goid = str(
                                    gItem.attributes['id'].value).strip()
                                GoIDList.append(goid)
                                tempGoTerm = None
                                if gotermdetails.lower() == 'p':
                                    tempGoTerm = 'Biological Process'
                                if gotermdetails.lower() == 'f':
                                    tempGoTerm = 'Molecular Function'
                                if gotermdetails.lower() == 'c':
                                    tempGoTerm = 'Cellular Component'
                                GoTermList.append(tempGoTerm)
                                # if tempGoTerm is still None the concatenation
                                # raises and is swallowed by the bare except,
                                # after the three lists above were extended
                                tempGOData = gonamedetails + ';' + goid + ';' + tempGoTerm
                                GOinfo.append(tempGOData)
                            except:
                                pass
                        if (gItem.attributes['type'].value
                            ).strip() == 'NCBI Taxonomy':
                            try:
                                OGID = str(
                                    gItem.attributes['id'].value).strip()
                            except:
                                pass
                except IndexError:
                    pass
                # protein name: recommendedName first, submittedName as fallback
                try:
                    try:
                        PN = (((SGunidata.getElementsByTagName('protein')[0]
                                ).getElementsByTagName('recommendedName')[0]
                               ).getElementsByTagName('fullName')[0]
                              ).firstChild.nodeValue
                    except:
                        PN = (((SGunidata.getElementsByTagName('protein')[0]
                                ).getElementsByTagName('submittedName')[0]
                               ).getElementsByTagName('fullName')[0]
                              ).firstChild.nodeValue
                except IndexError:
                    pass
                # gene name: first <name> of the first <gene>
                try:
                    try:
                        GN = ((
                            SGunidata.getElementsByTagName('gene')[0]
                        ).getElementsByTagName('name')[0]).firstChild.nodeValue
                    except:
                        GN = 'NA'
                except IndexError:
                    pass
                # organism name: first <name> of the first <organism>
                try:
                    try:
                        OG = ((
                            SGunidata.getElementsByTagName('organism')[0]
                        ).getElementsByTagName('name')[0]).firstChild.nodeValue
                    except:
                        OG = 'NA'
                except IndexError:
                    pass
                # diseases: 'name(acronym)' plus an HTML link built from the disease id
                try:
                    disdata = SGunidata.getElementsByTagName('disease')
                    for dItem in disdata:
                        disname = ''
                        disshort = ''
                        disURL = ''
                        disID = ''
                        try:
                            disname = (dItem.getElementsByTagName('name')[0]
                                       ).firstChild.nodeValue
                            disID = (dItem.attributes['id'].value).upper()
                        except:
                            pass
                        try:
                            disshort = (dItem.getElementsByTagName('acronym')
                                        [0]).firstChild.nodeValue
                        except:
                            pass
                        if len(disname.strip()) > 0:
                            disURL = '<a target="_blank" href="https://www.uniprot.org/diseases/' + disID + '">' + str(
                                disname.strip()) + '(' + str(
                                    disshort) + ')' + '</a>'
                            dislist.append(
                                str(disname.strip()) + '(' + str(disshort) +
                                ')')
                            unidislist.append(
                                str(disname.strip()) + '(' + str(disshort) +
                                ')')
                            unidisURLlist.append(disURL)
                except IndexError:
                    pass
            except ExpatError:
                # XML that cannot be parsed leaves every field at its default
                pass
        except IOError:
            # a failed download leaves every field at its default
            pass
        drugbankdata = 'NA'
        disdata = 'NA'
        uniDisData = 'NA'
        uniDisURLData = 'NA'
        disgenDisData = 'NA'
        disgenDisURLData = 'NA'
        goiddata = 'NA'
        gonamedata = 'NA'
        gotermdata = 'NA'
        goData = 'NA'
        # add the entries stored for this gene name in the disGenData pickle;
        # element 0 is appended to the combined disease list, element 1 is
        # kept as the matching URL list
        if GN != 'NA' and GN in disGenDataDic:
            disgendislist = disGenDataDic[GN][0]
            disgendisURLlist = disGenDataDic[GN][1]
            if len(dislist) > 0:
                dislist = dislist + disGenDataDic[GN][0]
            else:
                dislist = disGenDataDic[GN][0]
        # collapse each list into a '|'-joined string of distinct values ('NA' if empty)
        if len(GoIDList) > 0:
            goiddata = '|'.join(list(set(GoIDList)))
        if len(GoNamList) > 0:
            gonamedata = '|'.join(list(set(GoNamList)))
        if len(GoTermList) > 0:
            gotermdata = '|'.join(list(set(GoTermList)))
        if len(GOinfo) > 0:
            goData = '|'.join(list(set(GOinfo)))
        if len(drugbanklist) > 0:
            drugbankdata = '|'.join(list(set(drugbanklist)))
        if len(dislist) > 0:
            disdata = '|'.join(list(set(dislist)))
        if len(unidislist) > 0:
            uniDisData = '|'.join(list(set(unidislist)))
        if len(unidisURLlist) > 0:
            uniDisURLData = '|'.join(list(set(unidisURLlist)))
        if len(disgendislist) > 0:
            disgenDisData = '|'.join(list(set(disgendislist)))
        if len(disgendisURLlist) > 0:
            disgenDisURLData = '|'.join(list(set(disgendisURLlist)))
        humanUniprotfuncinfodic[subcode] = [
            PN, GN, OG, OGID, disdata, uniDisData, uniDisURLData,
            disgenDisData, disgenDisURLData, drugbankdata, goiddata,
            gonamedata, gotermdata, goData
        ]
    hudicfile = 'humanUniprotfuncinfodic.obj'
    hudicf = open(hudicfile, 'wb')
    pickle.dump(humanUniprotfuncinfodic, hudicf, pickle.HIGHEST_PROTOCOL)
    hudicf.close()
    print("Extracting KEGG pathway name, job starts",
          str(datetime.datetime.now()))
    # accession -> [values of the PATHWAY field, 'key;value|key;value...' string]
    hkeggdictfile = {}
    huniproturl = 'https://www.uniprot.org/uploadlists/'
    hk = KEGG()
    for hkx in range(0, len(unqhumanUniprotID), 2000):
        # NOTE(review): countProt carries over from the XML loop above and is
        # advanced by hkx+2000 per batch, so the modulo test below is not a
        # true per-protein progress count
        countProt += hkx + 2000
        if countProt % 2000 == 0:
            print(str(countProt), "th protein kegg job starts",
                  str(datetime.datetime.now()))
        huniprotcodes = ' '.join(unqhumanUniprotID[hkx:hkx + 2000])
        huniprotparams = {
            'from': 'ACC',
            'to': 'KEGG_ID',
            'format': 'tab',
            'query': huniprotcodes
        }
        # repeat the request until it goes through
        while True:
            try:
                hkuniprotdata = urllib.urlencode(huniprotparams)
                hkuniprotrequest = urllib2.Request(huniproturl, hkuniprotdata)
                hkuniprotresponse = urllib2.urlopen(hkuniprotrequest)
                for hkuniprotline in hkuniprotresponse:
                    hkudata = hkuniprotline.strip()
                    # skip the header row
                    if not hkudata.startswith("From"):
                        hkuinfo = hkudata.split("\t")
                        if len(hkuinfo[1].strip()):
                            hkegg = hk.get(hkuinfo[1].strip())
                            hkudict_data = hk.parse(hkegg)
                            # TypeError: parsed result cannot be indexed by key;
                            # KeyError: entry has no PATHWAY field
                            try:
                                try:
                                    if len(str(hkuinfo[0]).strip()) > 5:
                                        tempkeggData = '|'.join(
                                            '{};{}'.format(key, value)
                                            for key, value in
                                            hkudict_data['PATHWAY'].items())
                                        hkeggdictfile[hkuinfo[0].strip()] = [
                                            hkudict_data['PATHWAY'].values(),
                                            tempkeggData
                                        ]
                                except TypeError:
                                    pass
                            except KeyError:
                                pass
                break
            except urllib2.HTTPError:
                time.sleep(RETRY_TIME)
                print(
                    'Hey, I am trying again until succeeds to get data from KEGG!',
                    str(datetime.datetime.now()))
                pass
    hkdicfile = 'humankeggdic.obj'
    hkdicf = open(hkdicfile, 'wb')
    pickle.dump(hkeggdictfile, hkdicf, pickle.HIGHEST_PROTOCOL)
    hkdicf.close()
def get_kegg_info(stId):
    """Return the parsed KEGG entry for the identifier ``stId``."""
    kegg_service = KEGG()
    raw_entry = kegg_service.get(stId)
    return kegg_service.parse(raw_entry)