def create_keggids_csv(filename, org):
    '''
    Extract KEGG ids for an organism and save them to a csv file.
    args:
        filename: file containing the gene name / locus for all of the organism's genes
        org: abbreviation of the organism in KEGG
    '''
    # Open csv as a pandas dataframe (df)
    df = pd.read_csv(filename, sep="\t")
    gene_list = tuple(df['Locus'].tolist())
    bid_list = tuple(df['Locus tag'].tolist())
    kid_list = []
    k = KEGG()
    # Find the KEGG id for each gene
    for gene in bid_list:
        kstrg = k.find(org, gene)
        kid_list.append(kstrg.split()[1])
    # Create a new df and save it to csv
    new_df = pd.DataFrame(columns=['gene', 'b_id', 'kegg_id'])
    new_df.gene = gene_list
    new_df.b_id = bid_list
    new_df.kegg_id = kid_list
    new_df.to_csv("ecoli_keggids.csv", sep="\t", index=False)
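# Minimal usage sketch (not from the original script; the file name
# "ecoli_genes.tsv" and the organism code "eco" are assumptions). The input is
# expected to be a tab-separated file with 'Locus' and 'Locus tag' columns, and
# the output is written to ecoli_keggids.csv in the working directory.
create_keggids_csv("ecoli_genes.tsv", "eco")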
def print_alignment_kegg(model):
    f = open("cor.txt")
    f_o = open("cor_readable.txt", "w")
    kegg = KEGG()
    for i in f:
        if ":***:" in i:
            k, b = i.split(":***:")
            b = b.strip()
            if not k == "MULTIR":
                k = kegg.get(k)
                i1 = k.find("NAME") + 4
                i2 = k[i1:].find("\n")
                k = k[i1:i1 + i2].strip()
            if not b == "MULTIR":
                b = model.reactions[b]
            print(k, ":***:", b)
            f_o.write(k + ":***:" + b + "\n")
    f.close()
    f_o.close()
def show_pathway():
    """ Function that shows the p53 pathway in KEGG """
    k = KEGG(verbose=True)
    k.lookfor_pathway("p53 signaling pathway - Homo sapiens (human)")
    print(k.show_pathway("path:hsa04115"))
def extract_sequences(dict, flist):
    '''
    Get ortholog sequences from KEGG and write a fasta file for each KEGG id.
    arg: dictionary with keggid as key and orthologs as value (list)
    '''
    k = KEGG()
    ocount = {}
    # Loop through the orthologs dictionary to get sequences from KEGG
    for key, list in dict.items():
        # print(key)
        if (key + ".fas") in flist:
            print(key + " is already created !!!")
            continue
        # Create a string with the sequences to write a fasta file for each gene
        string = ""
        for x in range(0, len(list)):
            for i in range(0, len(list[x])):
                data_seq = k.get(list[x][i], option="ntseq", parse=True)
                string = string + data_seq + "\n"
                # print(data_seq)
        print("writing : " + key + ".fas")
        # Write file
        with open(os.path.join('orthologs_fastas/', key + '.fas'), 'w') as f:
            read_data = f.write(string)
        f.closed
def pathwayInfo(code):
    # Function to get info about a pathway, from its code
    # Initialize the searcher
    kSearcher = KEGG()
    # Get the result and parse it into a dictionary
    result = kSearcher.get(code)
    dictResult = kSearcher.parse(result)
    # Initialize an empty list and add the code at the beginning
    pathwayList = []
    pathwayList.append(code)
    # If NAME exists as a key in the dictionary, use it, else 'NA' instead
    if 'NAME' in dictResult.keys():
        # If the pathway name is a comma-separated string, replace commas by semicolons
        # Fix to avoid wrong column formatting at the end of the script
        nameStr = str(dictResult['NAME'][0].replace(',', ';'))
        pathwayList.append(nameStr)
    else:
        pathwayList.append('NA')
    # If CLASS exists as a key in the dictionary, use it, else 'NA' instead
    if 'CLASS' in dictResult.keys():
        # Same comma-to-semicolon fix as for the name
        classStr = str(dictResult['CLASS']).replace(',', ';')
        pathwayList.append(classStr)
    else:
        pathwayList.append('NA')
    return pathwayList
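# Hedged example (not from the original code): summarise a single pathway
# record with pathwayInfo. "path:hsa04115" is the p53 signaling pathway id that
# also appears elsewhere in this collection; any valid pathway code works.
print(pathwayInfo("path:hsa04115"))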
def get_genes_from_kegg_pathway(pathway):
    from bioservices.kegg import KEGG
    k = KEGG()
    k.organism = 'hsa'
    pathway = k.get(pathway)
    genes = k.parse(pathway)['GENE']
    entrez, symbol = zip(*[i.split(' ') for i in genes])
    return symbol
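# Hedged usage sketch (an assumption, not from the source repository): fetch
# the gene symbols annotated on the p53 signaling pathway, whose KEGG id
# "hsa04115" is taken from the show_pathway snippet above.
symbols = get_genes_from_kegg_pathway("hsa04115")
print(symbols[:10])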
def get_kegg_info(stId):
    """ Get kegg dict by pathway id. """
    k = KEGG()
    data = k.get(stId)
    dict_data = k.parse(data)
    return dict_data
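# Illustrative call (an assumption for this collection): parse the raw KEGG
# entry for a pathway id and read one field from the resulting dictionary.
info = get_kegg_info("hsa04115")
print(info.get('NAME'))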
def retrieve_kegg_formula(reactome_compound_name):
    k = KEGG()
    compound_name = reactome_compound_name.replace('COMPOUND', 'cpd')
    res = k.get(compound_name).split('\n')
    for line in res:
        if line.startswith('FORMULA'):
            formula = line.split()[1]  # get the second token
            return formula
    return None
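# Small sketch of a call (the identifier is an assumption): a Reactome-style
# compound name such as "COMPOUND:C00002" (ATP) is mapped to "cpd:C00002"
# before the KEGG lookup, so this should return the FORMULA field of the record.
print(retrieve_kegg_formula('COMPOUND:C00002'))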
def get_single_compound_metadata_online(compound_id):
    if compound_id.upper().startswith('C'):
        s = KEGG()
        res = s.get(compound_id)
        return s.parse(res)
    else:
        ch = ChEBI()
        res = ch.getCompleteEntity('CHEBI:' + compound_id)
        return res
def load_kegg(gene, organism):
    k = KEGG()
    result_line = ''
    try:
        a = k.get_pathway_by_gene(gene, organism)
        if a:
            k_list = list(a.values())
            result_line = ', '.join(k_list)
    except:
        print(" Gene '{0}' is not in KEGG database".format(gene))
    return result_line
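# Hedged usage sketch (identifiers are assumptions): list the KEGG pathways a
# gene belongs to; "7157" is the Entrez id for TP53 and "hsa" the human
# organism code.
print(load_kegg("7157", "hsa"))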
def __connect(self, organism_code):
    """
    Purpose: Connect to the KEGG database specified by organism_code.

    @param organism_code: Use 'hsa' to connect to Homo sapiens.
    @return: a KEGG client configured for the given organism
    """
    k = KEGG()
    k.organism = organism_code
    return k
def extract_orthologs(filename):
    '''
    Create a dictionary with keggid as key and a list of orthologs as value.
    arg: csv with keggids
    return: dict with orthologs
    '''
    orthos_dict = {}
    k = KEGG()
    # Get the list of gammaproteobacteria from csv
    df = pd.read_csv(filename, sep="\t")
    df_gamma = pd.read_csv('gammaproteo.csv', sep="\t")
    gamma_list = df_gamma['KEGG'].tolist()
    # Loop through keggids to get orthologs
    for keggid in df['kegg_id']:
        if keggid == "no":
            continue
        print(str(keggid))
        ortho_list = []
        # Get orthologs from KEGG
        data = k.get(keggid)
        dict_data = k.parse(data)
        if isinstance(dict_data, int):
            continue
        # Loop through the KEGG ortholog data and verify that the organisms are gammaproteobacteria
        for key, value in dict_data['GENES'].items():
            if key.lower() in gamma_list:
                # print(key.lower(), value.split('(')[0].split())
                para_num = len(value.split('(')[0].split())
                para_list = []
                for i in range(0, para_num):
                    # print(value.split('(')[0].split()[i])
                    para_list.append(key.lower() + ":" + value.split('(')[0].split()[i])
                ortho_list.append(para_list)
        orthos_dict[keggid] = ortho_list
    return orthos_dict
def id2seq(self, hsa):
    s = KEGG()
    d = s.get(hsa)
    dict_d = s.parse(d)
    pattern = re.compile(r'\s+')
    try:
        seq = re.sub(pattern, '', dict_d['AASEQ'])
    except:
        seq = ''
    # print('SEQ:', seq)
    text_file = open("dummy.txt", "w")
    text_file.write('>' + str(hsa) + '\n' + seq)
    text_file.close()
    return None
def kegg(inputInteractions):
    from bioservices.kegg import KEGG
    k = KEGG()
    interactions = []
    for items in inputInteractions:
        print(items[1].getName())
        try:
            pathways = k.get_pathway_by_gene(items[1].getName(), "hsa")
            # print(pathways)
            if pathways:
                for key, value in list(pathways.items()):
                    interactions.append([items[0], value])
        except AttributeError:
            print("Gene name error!!!!!!!!!")
    return interactions
def get_metabs(KEGG, reac_id):
    subs_list = []
    prod_list = []
    # Get reaction data from KEGG using a KEGG reaction ID
    r_data = KEGG.get(reac_id)
    # Parse the information retrieved
    r_parsed = KEGG.parse(r_data)
    # Split the equation into substrates and products
    split_eq = re.split('<=>', r_parsed['EQUATION'])
    # Remove the plus signs between the metabolites
    subs_list = [s.strip() for s in split_eq[0].split('+')]
    prod_list = [p.strip() for p in split_eq[1].split('+')]
    return [subs_list, prod_list]
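# Illustrative call (an assumption): split one KEGG reaction equation into its
# substrate and product compound ids; "rn:R00200" is just an example reaction.
k = KEGG()
substrates, products = get_metabs(k, "rn:R00200")
print(substrates, products)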
def parse_kgml(self, ec_file=""): # http://biopython.org/DIST/docs/api/Bio.KEGG.KGML.KGML_parser-pysrc.html # https://github.com/deep-introspection/kegg-kgml-parser-python/blob/master/keggparser/parse_KGML.py tree = ET.fromstring(self.kgml) for reaction in tree.getiterator('reaction'): r_id = reaction.get('id') r_name = reaction.get( 'name') # lahko je vec imen locenih s presledki r_names = set(reaction.get('name').split()) # mnozica imen self.reactions[r_id] = r_names self.reaction_ids[r_name] = r_id self.listed_reactions.append(r_id) for sub in reaction.getiterator('substrate'): self.reaction_metabolites[r_id].add(sub.get('id')) self.reaction_reactants[r_id].add(sub.get('id')) #substrates.append(sub.get('name')) for prod in reaction.getiterator('product'): self.reaction_metabolites[r_id].add(prod.get('id')) self.reaction_products[r_id].add(prod.get('id')) #products.append(prod.get('name')) self.reversibility_reactions[r_id] = 1 if reaction.get( 'type') == 'reversible' else 0 #reactions[i] = {'reaction': reaction, 'substrates': substrates, 'products': products, 'gene':[], 'reversible': reversible} EC_file_loaded = False if ec_file: try: self.load_ECs(ec_file) EC_file_loaded = True except: self.kegg = KEGG() for entry in tree.getiterator('entry'): if not EC_file_loaded: if entry.get( 'type' ) == 'gene': # or entry.get('type') == 'ortholog': genes = entry.get('name').split() gene_reaction_name = entry.get('reaction') #print(gene_reaction_name) gene_reaction_id = self.reaction_ids[gene_reaction_name] for g in genes: #self.reaction_genes[gene_reaction_id].add(g) EC = self.get_EC(g) #self.gene_EC[g] = EC for e in EC: self.reaction_ECs[gene_reaction_id].add(e) if entry.get('type') == 'compound': metabolite = entry.get('name') metabolite_id = entry.get('id') self.metabolites[metabolite_id] = metabolite self.metabolite_ids[metabolite] = metabolite_id self.listed_metabolites.append(metabolite_id)
def get_compound_metadata_online(kegg_ids):
    s = KEGG()
    metadata_map = {}
    for i in range(len(kegg_ids)):
        try:
            if i % 10 == 0:
                print("Retrieving %d/%d KEGG records" % (i, len(kegg_ids)))
            kegg_id = kegg_ids[i]
            res = s.get(kegg_id)
            d = s.parse(res)
            first_name = d['NAME'][0]
            first_name = first_name.replace(';', '')  # strip last ';' character
            metadata_map[kegg_id] = {'display_name': first_name}
        except TypeError:
            print('kegg_id=%s parsed_data=%s' % (kegg_id, d))
    return metadata_map
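# Minimal sketch (the compound ids are assumptions): fetch display names for a
# couple of KEGG compounds, e.g. ATP (C00002) and pyruvate (C00022).
print(get_compound_metadata_online(['C00002', 'C00022']))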
def get_seq(filename):
    '''
    Create a dictionary with species as keys and sequences as values for an alignment.
    arg: filename with gene name
    return: organism dictionary with sequences
    '''
    k = KEGG()
    records = list(SeqIO.parse(os.path.join('alignments_nogaps/', filename), "fasta"))
    idlist = []
    orglist = []
    seqlist = []
    orgdict = {}
    # Go through the sequences and search for the organism name on KEGG
    for record in records:
        idsplit = (record.id).split('_', 1)
        id = idsplit[0] + ':' + idsplit[1]
        handle = k.get(id)
        if isinstance(handle, int):
            print(id)
            continue
        org = k.parse(handle)['ORGANISM']
        org = org.split()
        org = org[1] + " " + org[2]
        seqlist.append(list(str(record.seq)))
        orglist.append(org)
        idlist.append(id)
    duplist = set(orglist)
    # Create a dict with organism as key and the sequences for that organism as values
    for org in duplist:
        indices = [i for i, x in enumerate(orglist) if x == org]
        seqs = []
        for e in indices:
            seqs.append(seqlist[e])
        orgdict[org] = seqs
    # print(orgdict)
    return orgdict
def queryKegg(theIDs):
    print("Currently querying KEGG...")
    k = KEGG()
    keggData = list()
    IDlist = list()
    for id in theIDs:
        ids = id[3:]
        query = k.find("acb", ids)
        query = query.split('\t')
        finalQuery = query[0]
        data = k.get(finalQuery)
        dictData = k.parse(data)
        keggData.append(dictData)
        IDlist.append(ids)
    return keggData, IDlist
def main():
    # Start KEGG interface
    k = KEGG()
    # Create a dict to store the final result
    data = dict()
    # Read in KEGG gene ID & gene symbol pairs
    with open("hsa_gene_list.json", "r") as g:
        gene_data = json.load(g)
    for gene in gene_data.keys():
        print gene
        g_data = k.get(gene)
        g_prsd = k.parse(g_data)
        data[gene] = g_prsd
    with open('ginfo.json', 'w') as fw:
        json.dump(data, fw)
def get_kegg(self, pathway_id):
    # try:
    self.kegg = KEGG()
    kegg = self.kegg
    # self.kgml = kegg.parse(kegg.get(pathway_id))
    # self.pathway = kegg.parse_kgml_pathway(pathway_id)
    self.kgml = kegg.get(pathway_id, "kgml")
    self.parse_kgml(pathway_id)
    self.save_kegg(pathway_id)
def get_reaction_ECs_from_kegg(self):
    self.reaction_ECs = defaultdict(set)
    kegg = KEGG()
    for r in self.model.reactions:
        ECs = []
        try:
            reacts = r.split(" ")
            for i in reacts:
                if i not in self.reaction_ECs:
                    print("KEGG reaction", i)
                    ECs += kegg.parse(kegg.get(i))['ENZYME']
                    for e in ECs:
                        self.reaction_ECs[i].add(e)
        except Exception as inst:
            print(inst)
        # for e in ECs:
        #     self.reaction_ECs[r].add(e)
    print("EC data loaded from KEGG")
def getData(self):
    '''
    Gets all the data for the drugs.
    Obs. IT TAKES TIME.
    '''
    mykegg = KEGG()
    print 'There are', len(mykegg.drugIds), 'drugs in Kegg'
    data = dict()
    # Get data from the Kegg database.
    for num, ID in enumerate(mykegg.drugIds):
        data[ID] = mykegg.get(ID)
    print 'Finish!'
    return data
def main():
    k = KEGG()
    # Create a dict to store final result
    data = dict()
    # Create list of hsa (human) pathways
    list_path = open("../hsa_list.txt").read().replace('path:', '').split('\n')
    # Remove the trailing blank entry
    list_path.pop()
    i = 0
    for hsa in list_path:
        i += 1
        print "# of pathways processed: ", i
        # Request the KGML file for a pathway
        req_url = 'http://rest.kegg.jp/get/' + hsa + '/kgml'
        kgml = requests.get(req_url)
        out = open('pathways/path_' + hsa, 'w')
        out.write(kgml.text)
        out.close()
``` python3 structure_processor.py "" "" --filter_genes "TP53" ```

saves the data for the gene named "TP53" to separate files (this parameter can
be a comma-separated list of gene names). Fragments, if found, are saved to
pictures/; otherwise the data are saved to separate files. After processing,
the gene names from the parameter list are written to "processed_genes.log".
To rerun with the same gene list, remove the lines corresponding to those names
from this file or delete the whole file; currently it is used to skip gene
names that were already processed.
The other call option may be broken at the moment.
"""
from bioservices.kegg import KEGG
keggParser = KEGG()

import pickle
import argparse

ORGANISM = "hsa"
GENES = ["p53"]  # sample gene
PDB_PATH = "pdb"

import os, prody, pystache, logging

if not os.path.exists(PDB_PATH):
    os.mkdir(PDB_PATH)
    # TODO: for now I haven't checked if pathPDBFolder creates this folder -
    # if it is created, this check should be removed.
prody.proteins.localpdb.pathPDBFolder(PDB_PATH)
def search(query, source="wikipathways", result_format="xml", species=None, genes=None, user=None): path_array = [] if source.lower() in ["wikipathways", "all"] and species is None: url = "http://webservice.wikipathways.org/" ext = "/findPathwaysByText?query=" + str(query) r = requests.get(url + ext, headers={"Content-Type": "application/json"}) if not r.ok: r.raise_for_status() sys.exit() tree = ET.ElementTree(ET.fromstring(r.text)) root = tree.getroot() for child in root: temp_path_dict = {} for subchild in child: if subchild.tag == "{http://www.wikipathways.org/webservice}id": temp_path_dict["identifier"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}score": temp_path_dict["score"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}url": temp_path_dict["url"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}name": temp_path_dict["name"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}species": temp_path_dict["species"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}revision": temp_path_dict["revision"] = subchild.text temp_path = gnomics.objects.pathway.Pathway( identifier=temp_path_dict["identifier"], identifier_type="WikiPathways ID", name=temp_path_dict["name"], taxon=temp_path_dict["species"], source="WikiPathways") if temp_path_dict["identifier"] not in path_array: path_array.append(temp_path) elif source.lower() in ["wikipathways", "all"] and species is not None: url = "http://webservice.wikipathways.org/" ext = "/findPathwaysByText?query=" + str(query) + "&species=" + str( species) r = requests.get(url + ext, headers={"Content-Type": "application/json"}) if not r.ok: r.raise_for_status() sys.exit() tree = ET.ElementTree(ET.fromstring(r.text)) root = tree.getroot() path_array = [] for child in root: temp_path_dict = {} for subchild in child: if subchild.tag == "{http://www.wikipathways.org/webservice}id": temp_path_dict["identifier"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}score": temp_path_dict["score"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}url": temp_path_dict["url"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}name": temp_path_dict["name"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}species": temp_path_dict["species"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}revision": temp_path_dict["revision"] = subchild.text temp_path = gnomics.objects.pathway.Pathway( identifier=temp_path_dict["identifier"], identifier_type="WikiPathways ID", name=temp_path_dict["name"], taxon=temp_path_dict["species"], source="WikiPathways") if temp_path_dict["identifier"] not in path_array: path_array.append(temp_path) if source.lower() in ["kegg", "all"] and genes is not None: k = KEGG() elif source.lower() in ["kegg", "all"] and genes is None: k = KEGG() list_of_pathways = k.find("pathway", query) temp_path_list = list_of_pathways.split("\n") for thing in temp_path_list: temp_split = thing.split("\t") if len(temp_split) != 1: path_id = temp_split[0].strip().split(":")[1] path_name = temp_split[1].strip() if "map" in path_id: temp_path = gnomics.objects.pathway.Pathway( identifier=path_id, identifier_type="KEGG MAP PATHWAY ID", source="KEGG", name=path_name) path_array.append(temp_path) elif "ko" in path_id: temp_path = gnomics.objects.pathway.Pathway( identifier=path_id, 
identifier_type="KEGG KO PATHWAY ID", source="KEGG", name=path_name) path_array.append(temp_path) elif "ec" in path_id: temp_path = gnomics.objects.pathway.Pathway( identifier=path_id, identifier_type="KEGG EC PATHWAY ID", source="KEGG", name=path_name) path_array.append(temp_path) elif "rn" in path_id: temp_path = gnomics.objects.pathway.Pathway( identifier=path_id, identifier_type="KEGG RN PATHWAY ID", source="KEGG", name=path_name) path_array.append(temp_path) else: print(k.get(path_id)) return path_array
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bioservices.kegg import KEGG
from sklearn.cluster import KMeans

np.set_printoptions(threshold=np.nan)

data = []
df = pd.read_csv('RPKMs.csv', delimiter=",")
k = KEGG()

# for i in range(100):
#     print(i, "****")
#     print("//\n", k.get_pathway_by_gene(str(df["symbol"][i]), "hsa"))


def search_pathways_4_list(list_of_genes):
    matrix = [[0 for j in range(len(list_of_genes))] for i in range(0)]
    list_of_pathways = []
    dict_of_genes = {}
    for i, gene in enumerate(list_of_genes):
        try:
            pathways = k.get_pathway_by_gene(gene, "hsa")
            if pathways != None:
                pathways = pathways.values()
def search(query, source="wikipathways", result_format="xml", species=None, genes=None, user=None): path_array = [] if source.lower() in ["wikipathways", "all"] and species is None: url = "http://webservice.wikipathways.org/" ext = "/findPathwaysByText?query=" + str(query) r = requests.get(url+ext, headers={"Content-Type": "application/json"}) if not r.ok: r.raise_for_status() sys.exit() tree = ET.ElementTree(ET.fromstring(r.text)) root = tree.getroot() for child in root: temp_path_dict = {} for subchild in child: if subchild.tag == "{http://www.wikipathways.org/webservice}id": temp_path_dict["identifier"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}score": temp_path_dict["score"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}url": temp_path_dict["url"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}name": temp_path_dict["name"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}species": temp_path_dict["species"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}revision": temp_path_dict["revision"] = subchild.text temp_path = gnomics.objects.pathway.Pathway(identifier = temp_path_dict["identifier"], identifier_type = "WikiPathways ID", name = temp_path_dict["name"], taxon = temp_path_dict["species"], source = "WikiPathways") if temp_path_dict["identifier"] not in path_array: path_array.append(temp_path) elif source.lower() in ["wikipathways", "all"] and species is not None: url = "http://webservice.wikipathways.org/" ext = "/findPathwaysByText?query=" + str(query) + "&species=" + str(species) r = requests.get(url+ext, headers={"Content-Type": "application/json"}) if not r.ok: r.raise_for_status() sys.exit() tree = ET.ElementTree(ET.fromstring(r.text)) root = tree.getroot() path_array = [] for child in root: temp_path_dict = {} for subchild in child: if subchild.tag == "{http://www.wikipathways.org/webservice}id": temp_path_dict["identifier"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}score": temp_path_dict["score"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}url": temp_path_dict["url"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}name": temp_path_dict["name"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}species": temp_path_dict["species"] = subchild.text elif subchild.tag == "{http://www.wikipathways.org/webservice}revision": temp_path_dict["revision"] = subchild.text temp_path = gnomics.objects.pathway.Pathway(identifier = temp_path_dict["identifier"], identifier_type = "WikiPathways ID", name = temp_path_dict["name"], taxon = temp_path_dict["species"], source = "WikiPathways") if temp_path_dict["identifier"] not in path_array: path_array.append(temp_path) if source.lower() in ["kegg", "all"] and genes is not None: k = KEGG() elif source.lower() in ["kegg", "all"] and genes is None: k = KEGG() list_of_pathways = k.find("pathway", query) temp_path_list = list_of_pathways.split("\n") for thing in temp_path_list: temp_split = thing.split("\t") if len(temp_split) != 1: path_id = temp_split[0].strip().split(":")[1] path_name = temp_split[1].strip() if "map" in path_id: temp_path = gnomics.objects.pathway.Pathway(identifier=path_id, identifier_type="KEGG MAP PATHWAY ID", source="KEGG", name=path_name) path_array.append(temp_path) elif "ko" in path_id: temp_path = 
gnomics.objects.pathway.Pathway(identifier=path_id, identifier_type="KEGG KO PATHWAY ID", source="KEGG", name=path_name) path_array.append(temp_path) elif "ec" in path_id: temp_path = gnomics.objects.pathway.Pathway(identifier=path_id, identifier_type="KEGG EC PATHWAY ID", source="KEGG", name=path_name) path_array.append(temp_path) elif "rn" in path_id: temp_path = gnomics.objects.pathway.Pathway(identifier=path_id, identifier_type="KEGG RN PATHWAY ID", source="KEGG", name=path_name) path_array.append(temp_path) else: print(k.get(path_id)) return path_array
# from Bio import Entrez
from bioservices.kegg import KEGG
import sys

k = KEGG()
# Entrez.email = "*****@*****.**"
# file = open(sys.argv[1], "r")
file = open("../data/ids5.txt", "r")
result = ""
k.organism = "lpl"
k.get()
# for line in file.readlines():
#     file.close()
print(result)

# for line in file.readlines():
#     handle = Entrez.esearch(db="pubmed", term=line)
#     record = Entrez.read(handle)
#     ids = record["IdList"]
#     print(ids)
t_fa = FactorAnalysis(n_components=3).fit(trans_n.T)

t_hfac = DataFrame(t_fa.components_, index=['Factor %d' % (i + 1) for i in range(3)], columns=trans_n.index).T
t_hfac['type'] = ['N' if i.split('-')[3].startswith('11') else 'T' for i in t_hfac.index]

t_feat = DataFrame(t_fa.transform(trans_n.T), index=trans_n.columns, columns=['Factor %d' % (i + 1) for i in range(3)])
print t_feat['Factor 2'].sort_values()

sns.set(style='ticks', context='paper', rc={'axes.linewidth': .3, 'xtick.major.width': .3, 'ytick.major.width': .3})
g = sns.pairplot(t_hfac, hue='type', palette=pal)
plt.savefig('%s/reports/transcriptomics_pairplot.pdf' % wd, bbox_inches='tight')
plt.close('all')
print '[INFO] Corr plotted!'

# -- Bioservices KEGG information
bioser = KEGG(cache=True)
bioser.organism = 'hsa'

# Get pathways
keggp = {p: bioser.get(p) for p in bioser.pathwayIds}
print '[INFO] Pathways fetched'

keggp_name = {p: re.findall('NAME\s+(.*)?\n', keggp[p])[0].split(' - ')[0] for p in keggp}

keggp_comp = {p: {c for keggc in re.findall('(COMPOUND.*?)\n[A-Z]', keggp[p], re.S)[0].split('\n') for c in re.findall('\s+(C[0-9]+)\s+', keggc)} for p in keggp if 'COMPOUND' in keggp[p]}

keggp_gene = {p: {g for keggg in re.findall('(GENE.*?)\n[A-Z]', keggp[p], re.S)[0].split('\n') for g in re.findall('\s+([A-Z]+.+);', keggg)} for p in keggp if 'GENE' in keggp[p]}

keggp_tf = {p: {tf for tf in tf_targets_dict if len(tf_targets_dict[tf].intersection(keggp_gene[p])) > 0} for p in keggp_gene}
print '[INFO] Pathways genes fetched'

keggp_comp_m = DataFrame([(p, m, 1) for p in keggp_comp for m in keggp_comp[p]], columns=['pathway', 'metabolite', 'value'])
keggp_comp_m = pivot_table(keggp_comp_m, index='pathway', columns='metabolite', values='value', fill_value=0)
print keggp_comp_m.head()
def main():
    # Start KEGG interface for querying
    k = KEGG()
    # Create a dict to store the final network output
    data = dict()
    # Create list of hsa (human) pathways
    list_path = open("hsa_list.txt").read().replace('path:', '').split('\n')
    # Remove newline
    list_path.pop()
    # Read in KEGG reaction ID & reversibility information
    with open("KEGG_Reac.json", "r") as fp:
        reac_data = json.load(fp)
    # Read in KEGG gene data
    with open("ginfo.json", "r") as fp2:
        gene_data = json.load(fp2)
    # Keep track of # of pathways processed
    i = 0
    for hsa in list_path:
        i += 1
        print "# of pathways processed: ", i
        # Open previously extracted KGML files
        kgml = open("etc_scripts/KEGG_DB_PATH/pathways/path_" + hsa).read()
        # Construct element tree
        root = ET.fromstring(kgml)
        # Iterate through ALL reactions
        for reaction in root.findall("./reaction"):
            gene_ids = []
            gene_names = []
            subs_list = []
            prods_list = []
            # 'id' to look up in 'graphics' to extract gene name
            id_look = reaction.attrib["id"]
            # Iterate through 'entry' to retrieve gene IDs
            for entry in root.findall("./entry"):
                if entry.attrib["id"] == id_look:
                    gene_ids = entry.attrib["name"].split(' ')
            # Define dict for storing {gene id: reaction id's}
            r_ids = dict()
            # Iterate through the gene IDs to retrieve the corresponding list of reaction IDs
            for g_id in gene_ids:
                r_ids[g_id] = []
                # Open previously extracted reaction information
                with open('reacs/reac_' + g_id, 'r') as rp:
                    line = rp.readline()
                    # With gene ids as key, store corresponding reaction ids
                    while line:
                        r_ids[g_id].append(line.split()[1].split('rn:')[1])
                        line = rp.readline()
            # Loop to organize into the final output
            for g_id, r_ids in r_ids.items():
                # Stores reaction ids and their info
                vals = dict()
                # Iterate through the list of reactions to get metabolite information
                for r_id in r_ids:
                    # Get the list of substrates and products
                    metabs = get_metabs(k, r_id)
                    # Check if the reaction exists in the reaction DB
                    if r_id in reac_data.keys():
                        r_type = reac_data[r_id]
                    else:
                        # If it doesn't exist, assign NA as direction
                        r_type = "NA"
                    # Intermediate result to add to a gene of the current loop iteration
                    vals[r_id] = {
                        "DIRECTION": r_type,
                        "R_SUBS": metabs[0],
                        "R_PROD": metabs[1]
                    }
                # Check to see if the gene has been encountered previously
                if g_id in data:
                    # Store the current info to a temp reaction information
                    temp = data[g_id]
                    # Retrieve the current reaction information for the gene
                    temp_list = get_react(temp)
                    # Iterate through the existing information on reactions...
                    # If a new reaction is seen, it is added to temp reaction information
                    for r in vals.keys():
                        if r not in temp_list:
                            temp[r] = vals[r]
                    # Finalize reaction information to be added to the gene
                    data[g_id] = temp
                else:
                    data[g_id] = vals
    with open('keggMetabNetwork.json', 'w') as f:
        json.dump(data, f)
import re

from bioservices.kegg import KEGG


# -- KEGG bioservice
bioser = KEGG(cache=True)
bioser.organism = 'hsa'

# Get pathways
keggp = {p: bioser.get(p) for p in bioser.pathwayIds}
print '[INFO] Pathways fetched'

# Get reactions
keggr = {r: bioser.get(r) for r in bioser.reactionIds}
print '[INFO] Reactions fetched'

# Get enzymes
kegge = {e: bioser.get(e) for e in bioser.enzymeIds}
print '[INFO] Enzymes fetched'

# Get compounds
# keggc = {c: bioser.get(c) for c in bioser.compoundIds}
# print '[INFO] Compounds fetched'

# Get modules
# keggm = {m: bioser.get(m) for m in bioser.moduleIds}
# print '[INFO] Modules fetched'


# -- KEGG methods
def get_pathway_names(pathways=None):
    pathways_ = pathways if pathways else set(keggp)
    return {p: re.findall('NAME\s+(.*)?\n', keggp[p])[0].split(' - ')[0] for p in pathways_}
        blast_text = blastHits[ids]
    else:
        blast_text = 'NULL'

    if pfamHits.get(ids) != None:
        pfam_text = pfamHits[ids]
    else:
        pfam_text = 'NULL'

    if prositeHits.get(ids) != None:
        prosite_text = prositeHits[ids]
    else:
        prosite_text = 'NULL'

    # Get the KEGG hits
    kegg = KEGG()
    kegg_text = ''
    gene_id = gene_ids[ids]
    KEGG_IDs = kegg.get_pathway_by_gene(gene_id, "acb")
    if KEGG_IDs != None:
        for KEGG_ID in KEGG_IDs:
            kegg_text += KEGG_IDs[KEGG_ID] + ' [' + KEGG_ID + ']; '
        kegg_text = kegg_text[:-2]
    else:
        kegg_text = 'NULL'

    comments = 'NULL'
    row = ids + '\t' + blast_text + '\t' + pfam_text + '\t' + prosite_text + '\t' + kegg_text + '\t' + GO_IDs + '\t' + comments + '\n'
    output.write(row)

output.close()
            ko2locus[ko].append(locus)
    ########################################################
    tqdm.write("collected all KO ids, start iterating over all KO info")
    if not exists(join(tmp_dir, 'ko2info')):
        ko2info = {}
        ko_list = list(ko2locus.keys())
        pack10_up = batch_iter(ko_list, 10)
        for ko_list in tqdm(pack10_up):
            ko_info = get_KO_info('+'.join(ko_list))
            if ko_info is None:
                continue
            ko2info.update(ko_info)
        pickle.dump(ko2info, open(join(tmp_dir, 'ko2info'), 'wb'))
    else:
        ko2info = pickle.load(open(join(tmp_dir, 'ko2info'), 'rb'))
    locus_df = pack_it_up(ko2info, locus2ko, locus2info)
    locus_df = locus_df.reindex(columns=[
        'locus_tag', 'ko', 'definition', 'gene_name', 'ncbi_id',
        'uniprot_refID', 'source_organism', 'ID', 'AA_seq', 'reference_t'
    ])
    locus_df.to_csv(output_tab, sep='\t', index=1, index_label='locus_tag')
    with open(output_tab + '.null_ID', 'w') as f1:
        f1.write('\n'.join(null_ID))
    return locus_df


if __name__ == '__main__':
    kegg = KEGG()
    main()
""" Author: Daniel Esposito Date: 28/12/2015 Purpose: Wrapper Class for accessing KEGG via the bioservices interface. So far this class implements methods to obtain all pathways and then all reactions from those pathways in edgelist format tagged with the type of reaction. """ from bioservices.kegg import KEGG from predict.parsing import PPI import pandas as pd # ----------------------------------- UTILS -------------------------------- # kegg = KEGG() kegg.organism = 'hsa' reactions_to_exclude = [ 'missing-interaction', 'indirect-effect', 'expression', 'repression', 'compound', 'hidden-compound' ] def uniprot_cmp(x, y): t = {'P':0, 'Q':1, 'O':2} try: x_num = t[x[0]] except KeyError:
""" KEGG module example ==================== Histogram of KEGG pathways relations """ ################################################# # from pylab import * # extract all relations from all pathways from bioservices.kegg import KEGG s = KEGG() s.organism = "hsa" # retrieve more than 260 pathways so it takes time max_pathways = 10 results = [s.parse_kgml_pathway(x) for x in s.pathwayIds[0:max_pathways]] relations = [x['relations'] for x in results] # plot hist([len(this) for this in relations], 20) xlabel('number of relations') ylabel('#') title("number of relations per pathways") grid(True)