Exemplo n.º 1
0
def create_keggids_csv(filename, org, output="ecoli_keggids.csv"):
    """
    Look up a KEGG id for every gene of an organism and save them as csv.

    Args:
        filename: tab-separated file with (at least) the columns ``Locus``
            and ``Locus tag`` for all the organism genes.
        org: abbreviation of the organism in KEGG.
        output: path of the tab-separated file to write.  Defaults to the
            file name that used to be hard-coded.
    """
    # ``tupleize_cols`` is gone: pandas removed the argument from read_csv,
    # so passing it raises a TypeError on current releases.
    df = pd.read_csv(filename, sep="\t")
    gene_list = df['Locus'].tolist()
    bid_list = df['Locus tag'].tolist()

    k = KEGG()

    # One KEGG "find" request per locus tag; the second whitespace-separated
    # token of the answer is what gets stored as the KEGG id.
    kid_list = [k.find(org, gene).split()[1] for gene in bid_list]

    new_df = pd.DataFrame(
        {'gene': gene_list, 'b_id': bid_list, 'kegg_id': kid_list},
        columns=['gene', 'b_id', 'kegg_id'])
    new_df.to_csv(output, sep="\t", index=False)
Exemplo n.º 2
0
def print_alignment_kegg(model):
    """
    Write a readable version of the id pairs stored in ``cor.txt``.

    Every line of ``cor.txt`` that contains ``:***:`` is split into a left
    part (a KEGG id) and a right part (a model reaction id).  Unless a part
    is the literal ``MULTIR``, the left part is replaced by the text that
    follows ``NAME`` on the same line of its KEGG entry, and the right part
    by ``model.reactions[<id>]``.  Each resulting pair is printed and
    appended to ``cor_readable.txt``.

    Args:
        model: object whose ``reactions`` attribute can be indexed with the
            reaction ids found in ``cor.txt``.
    """
    kegg = KEGG()

    # Context managers close both files even when a lookup fails midway.
    with open("cor.txt") as f, open("cor_readable.txt", "w") as f_o:
        for line in f:
            if ":***:" not in line:
                continue

            k, b = line.split(":***:")
            b = b.strip()

            if k != "MULTIR":
                entry = kegg.get(k)
                # Cut out the remainder of the line that starts with NAME.
                start = entry.find("NAME") + 4
                length = entry[start:].find("\n")
                k = entry[start:start + length].strip()

            if b != "MULTIR":
                b = model.reactions[b]

            print(k, ":***:", b)
            # str(): model.reactions[...] is not guaranteed to be a string,
            # and plain concatenation would raise a TypeError.
            f_o.write(k + ":***:" + str(b) + "\n")
Exemplo n.º 3
0
def show_pathway():
    """
    Show the p53 pathway in KEGG.

    Looks the pathway up by its title, then prints whatever
    ``show_pathway`` returns for ``path:hsa04115``.
    """
    k = KEGG(verbose=True)
    # The organism name in the title had been mangled to "H**o sapiens",
    # which cannot match the KEGG pathway title.
    k.lookfor_pathway("p53 signaling pathway - Homo sapiens (human)")
    print(k.show_pathway("path:hsa04115"))
Exemplo n.º 4
0
def extract_sequences(dict, flist):
    """
    Get ortholog sequences from KEGG and write one fasta file per KEGG id.

    Args:
        dict: dictionary with a KEGG id as key and its orthologs as value
            (a list of lists of KEGG gene ids).
        flist: names of the fasta files that already exist; a key whose
            ``<key>.fas`` is listed here is skipped.

    The files are written to ``orthologs_fastas/<key>.fas``.
    """
    k = KEGG()

    for key, ortholog_groups in dict.items():
        fasta_name = key + ".fas"
        if fasta_name in flist:
            print(key + " is already created !!!")
            continue

        # Collect the sequences first and join once instead of growing a
        # string with += for every gene.
        sequences = []
        for group in ortholog_groups:
            for gene_id in group:
                data_seq = k.get(gene_id, option="ntseq", parse=True)
                # get_seq() in this file guards against k.get() handing
                # back an int instead of text; such a result used to crash
                # the concatenation here, so it is skipped the same way.
                if isinstance(data_seq, int):
                    continue
                sequences.append(data_seq)
        string = "".join(seq + "\n" for seq in sequences)

        print("writing : " + key + ".fas")
        with open(os.path.join('orthologs_fastas/', fasta_name), 'w') as f:
            f.write(string)
Exemplo n.º 5
0
def pathwayInfo(code):
    """
    Return ``[code, name, class]`` for a KEGG pathway code.

    The entry is fetched and parsed with the KEGG client.  ``name`` is the
    first element of the parsed ``NAME`` field and ``class`` the string
    form of the ``CLASS`` field; either one is ``'NA'`` when the field is
    absent.  Commas are replaced by semicolons so the values do not break
    the column formatting at the end of the script.
    """
    searcher = KEGG()
    parsed = searcher.parse(searcher.get(code))
    available = parsed.keys()

    if 'NAME' in available:
        name = str(parsed['NAME'][0].replace(',', ';'))
    else:
        name = 'NA'

    if 'CLASS' in available:
        pathway_class = str(parsed['CLASS']).replace(',', ';')
    else:
        pathway_class = 'NA'

    return [code, name, pathway_class]
Exemplo n.º 6
0
def get_genes_from_kegg_pathway(pathway):
    """
    Return the gene symbols listed in the ``GENE`` field of a KEGG pathway.

    Each ``GENE`` item is split on a double space into two parts; the
    second parts are returned as a tuple.
    """
    from bioservices.kegg import KEGG
    client = KEGG()
    client.organism = 'hsa'
    entry = client.parse(client.get(pathway))
    pairs = [item.split('  ') for item in entry['GENE']]
    # Transpose the (first, second) pairs into two parallel tuples.
    _ids, symbols = zip(*pairs)
    return symbols
Exemplo n.º 7
0
def get_kegg_info(stId):
    """
    Fetch the KEGG entry for ``stId`` and return it parsed by the client.
    """
    client = KEGG()
    return client.parse(client.get(stId))
Exemplo n.º 8
0
def retrieve_kegg_formula(reactome_compound_name):
    """
    Return the formula stored in the KEGG entry of a compound.

    ``COMPOUND`` in the given name is replaced by ``cpd`` before the entry
    is requested.  The result is the second whitespace-separated token of
    the first line starting with ``FORMULA``, or ``None`` when the entry
    has no such line.
    """
    client = KEGG()
    kegg_name = reactome_compound_name.replace('COMPOUND', 'cpd')
    formula_rows = (row for row in client.get(kegg_name).split('\n')
                    if row.startswith('FORMULA'))
    first_row = next(formula_rows, None)
    if first_row is None:
        return None
    return first_row.split()[1]
Exemplo n.º 9
0
def get_single_compound_metadata_online(compound_id):
    """
    Fetch the metadata of one compound from KEGG or ChEBI.

    Ids starting with ``C`` (case-insensitive) are fetched from KEGG and
    returned parsed; every other id is prefixed with ``CHEBI:`` and looked
    up through ``ChEBI.getCompleteEntity``.
    """
    if not compound_id.upper().startswith('C'):
        return ChEBI().getCompleteEntity('CHEBI:' + compound_id)

    kegg = KEGG()
    return kegg.parse(kegg.get(compound_id))
Exemplo n.º 10
0
def load_kegg(gene, organism):
    """
    Return the KEGG pathways of a gene as one comma-separated string.

    Args:
        gene: gene identifier passed to ``get_pathway_by_gene``.
        organism: KEGG organism code passed to ``get_pathway_by_gene``.

    Returns:
        The values of the lookup result joined with ``', '``, or an empty
        string when the lookup returns nothing or raises.
    """
    k = KEGG()
    result_line = ''
    try:
        a = k.get_pathway_by_gene(gene, organism)
        if a:
            result_line = ', '.join(a.values())
    except Exception:
        # Still best effort, but a bare ``except:`` would also have
        # swallowed KeyboardInterrupt and SystemExit.
        print("    Gene '{0}' is not in KEGG database".format(gene))
    return result_line
Exemplo n.º 11
0
    def __connect(self, organism_code):
        """
        Purpose: Create a KEGG client set to the given organism.

        @param organism_code: KEGG organism code, e.g. 'hsa' for Homo sapiens.

        @return: the KEGG instance with its ``organism`` attribute assigned.

        """
        client = KEGG()
        client.organism = organism_code
        return client
Exemplo n.º 12
0
def extract_orthologs(filename):
    """
    Create a dictionary with a KEGG id as key and its orthologs as value.

    Args:
        filename: tab-separated csv with a ``kegg_id`` column.

    Returns:
        dict mapping each KEGG id to a list with one entry per organism
        listed in the ``KEGG`` column of ``gammaproteo.csv``; every entry
        is a list of ``<organism>:<gene>`` strings.  Ids equal to ``"no"``
        and ids whose parsed entry is an int are left out.
    """
    orthos_dict = {}
    k = KEGG()

    # ``tupleize_cols`` was dropped from both calls: pandas removed the
    # argument from read_csv.
    df = pd.read_csv(filename, sep="\t")
    df_gamma = pd.read_csv('gammaproteo.csv', sep="\t")
    # A set gives constant-time membership tests inside the loops below.
    gamma_codes = set(df_gamma['KEGG'].tolist())

    for keggid in df['kegg_id']:
        if keggid == "no":
            continue

        print(str(keggid))

        dict_data = k.parse(k.get(keggid))
        if isinstance(dict_data, int):
            continue

        ortho_list = []
        # Keep only the organisms that are listed in gammaproteo.csv.
        for key, value in dict_data['GENES'].items():
            organism = key.lower()
            if organism not in gamma_codes:
                continue
            # Gene names are the tokens in front of the first "(".
            gene_names = value.split('(')[0].split()
            ortho_list.append([organism + ":" + name for name in gene_names])

        orthos_dict[keggid] = ortho_list

    return orthos_dict
Exemplo n.º 13
0
 def id2seq(self, hsa):
     """
     Write the amino-acid sequence of a KEGG entry to ``dummy.txt``.

     The file gets a ``>`` header line with the id, followed by the
     ``AASEQ`` field of the parsed entry with all whitespace removed.  The
     sequence is empty when that field cannot be read.

     Args:
         hsa: KEGG entry id.

     Returns:
         None
     """
     s = KEGG()
     dict_d = s.parse(s.get(hsa))
     try:
         seq = re.sub(r'\s+', '', dict_d['AASEQ'])
     except (KeyError, TypeError):
         # Missing field, or the parsed entry is not a mapping of strings.
         # The bare ``except:`` also hid KeyboardInterrupt and SystemExit.
         seq = ''
     # The context manager closes the file even if the write fails.
     with open("dummy.txt", "w") as text_file:
         text_file.write('>' + str(hsa) + '\n' + seq)
     return None
Exemplo n.º 14
0
def kegg(inputInteractions):
    """
    Pair the first element of every input item with its KEGG pathways.

    For each item, the name of ``item[1]`` is looked up with
    ``get_pathway_by_gene`` for organism ``hsa``; one ``[item[0], value]``
    pair is collected per value of the result.  An AttributeError raised
    during the lookup is reported and the item is skipped.
    """
    from bioservices.kegg import KEGG
    client = KEGG()
    collected = []
    for pair in inputInteractions:
        print(pair[1].getName())
        try:
            found = client.get_pathway_by_gene(pair[1].getName(), "hsa")
            if not found:
                continue
            collected.extend([pair[0], name] for name in found.values())
        except AttributeError:
            print("Gene name error!!!!!!!!!")
    return collected
Exemplo n.º 15
0
def get_metabs(KEGG, reac_id):
    """
    Return the substrates and products of a KEGG reaction.

    Args:
        KEGG: client object providing ``get`` and ``parse``.
        reac_id: KEGG reaction id.

    Returns:
        ``[substrates, products]``: the two sides of the parsed
        ``EQUATION`` field (split at ``<=>``), each as a list of the
        stripped terms between the plus signs.
    """
    equation = KEGG.parse(KEGG.get(reac_id))['EQUATION']
    sides = re.split('<=>', equation)
    left_side, right_side = sides[0], sides[1]

    def terms(side):
        # Remove the plus signs and the padding around each metabolite.
        return [term.strip() for term in side.split('+')]

    return [terms(left_side), terms(right_side)]
Exemplo n.º 16
0
    def parse_kgml(self, ec_file=""):
        """
        Populate the reaction and metabolite tables from ``self.kgml``.

        Every ``reaction`` element is recorded with its names, substrates,
        products and reversibility, and every ``compound`` entry is
        recorded as a metabolite.  Unless the EC numbers were loaded from
        ``ec_file``, the EC numbers of each ``gene`` entry are obtained
        through ``self.get_EC`` and attached to the entry's reaction.

        Args:
            ec_file: optional file handed to ``self.load_ECs``.  Nothing is
                loaded when it is empty; when loading fails, a KEGG client
                is stored in ``self.kegg`` instead.
        """
        # http://biopython.org/DIST/docs/api/Bio.KEGG.KGML.KGML_parser-pysrc.html
        # https://github.com/deep-introspection/kegg-kgml-parser-python/blob/master/keggparser/parse_KGML.py

        tree = ET.fromstring(self.kgml)

        # Element.getiterator() was removed in Python 3.9; iter() is its
        # direct replacement.
        for reaction in tree.iter('reaction'):
            r_id = reaction.get('id')
            # The name attribute may hold several names separated by spaces.
            r_name = reaction.get('name')
            r_names = set(r_name.split())

            self.reactions[r_id] = r_names
            self.reaction_ids[r_name] = r_id
            self.listed_reactions.append(r_id)

            for sub in reaction.iter('substrate'):
                self.reaction_metabolites[r_id].add(sub.get('id'))
                self.reaction_reactants[r_id].add(sub.get('id'))

            for prod in reaction.iter('product'):
                self.reaction_metabolites[r_id].add(prod.get('id'))
                self.reaction_products[r_id].add(prod.get('id'))

            self.reversibility_reactions[r_id] = (
                1 if reaction.get('type') == 'reversible' else 0)

        EC_file_loaded = False
        if ec_file:
            try:
                self.load_ECs(ec_file)
                EC_file_loaded = True
            except Exception:
                # Fall back to a KEGG client.  Unlike the former bare
                # ``except:`` this lets KeyboardInterrupt/SystemExit pass.
                self.kegg = KEGG()

        for entry in tree.iter('entry'):
            entry_type = entry.get('type')

            if not EC_file_loaded and entry_type == 'gene':
                genes = entry.get('name').split()
                gene_reaction_id = self.reaction_ids[entry.get('reaction')]
                for g in genes:
                    for e in self.get_EC(g):
                        self.reaction_ECs[gene_reaction_id].add(e)

            if entry_type == 'compound':
                metabolite = entry.get('name')
                metabolite_id = entry.get('id')
                self.metabolites[metabolite_id] = metabolite
                self.metabolite_ids[metabolite] = metabolite_id
                self.listed_metabolites.append(metabolite_id)
Exemplo n.º 17
0
def get_compound_metadata_online(kegg_ids):
    """
    Fetch the display name of every given KEGG id.

    Args:
        kegg_ids: sequence of KEGG ids.

    Returns:
        dict mapping each id to ``{'display_name': <name>}``, where the
        name is the first element of the parsed ``NAME`` field with every
        ``;`` removed.  Ids whose record raises a TypeError are reported
        and left out.
    """
    s = KEGG()
    metadata_map = {}
    total = len(kegg_ids)
    for i, kegg_id in enumerate(kegg_ids):
        if i % 10 == 0:
            print("Retrieving %d/%d KEGG records" % (i, total))
        # Reset for every record: the error report below used to show the
        # data of the previous id (or fail with a NameError on the first
        # one) when the failure happened before ``d`` was assigned.
        d = None
        try:
            d = s.parse(s.get(kegg_id))
            first_name = d['NAME'][0].replace(';', '')
            metadata_map[kegg_id] = {'display_name': first_name}
        except TypeError:
            print('kegg_id=%s parsed_data=%s' % (kegg_id, d))
    return metadata_map
Exemplo n.º 18
0
def get_seq(filename):
    """
    Group the sequences of an alignment by organism.

    Args:
        filename: name of a fasta file inside ``alignments_nogaps/``; the
            record ids are expected to have the form ``<org>_<gene>``.

    Returns:
        dict with an organism name (second and third token of the KEGG
        ``ORGANISM`` field) as key and, as value, the list of that
        organism's sequences, each given as a list of characters.  Records
        for which KEGG returns an int are printed and skipped.
    """
    k = KEGG()
    records = list(SeqIO.parse(os.path.join('alignments_nogaps/', filename), "fasta"))

    orgdict = {}

    for record in records:
        # Turn "<org>_<gene>" into the "<org>:<gene>" form used by KEGG.
        idsplit = record.id.split('_', 1)
        entry_id = idsplit[0] + ':' + idsplit[1]

        handle = k.get(entry_id)
        if isinstance(handle, int):
            print(entry_id)
            continue

        org_tokens = k.parse(handle)['ORGANISM'].split()
        org = org_tokens[1] + " " + org_tokens[2]
        # Group while reading, instead of rescanning the whole organism
        # list once per distinct organism afterwards.
        orgdict.setdefault(org, []).append(list(str(record.seq)))

    return orgdict
def queryKegg(theIDs):
    """
    Fetch the parsed KEGG entry for every given id.

    The first three characters of each id are dropped; the remainder is
    searched in the ``acb`` database, and the text in front of the first
    tab of the search result is used as the entry to fetch.

    Returns:
        ``(entries, short_ids)``: the parsed entries and the shortened ids,
        in input order.
    """
    print("Currently querying KEGG...")
    client = KEGG()
    entries = []
    short_ids = []

    for full_id in theIDs:
        short_id = full_id[3:]
        first_hit = client.find("acb", short_id).split('\t')[0]
        entries.append(client.parse(client.get(first_hit)))
        short_ids.append(short_id)

    return entries, short_ids
Exemplo n.º 20
0
def main():
    """
    Download the parsed KEGG entry of every gene in ``hsa_gene_list.json``.

    The keys of that JSON file are used as KEGG gene ids; the parsed
    entries are stored under the same keys in ``ginfo.json``.
    """
    # Start KEGG interface
    k = KEGG()
    # Final result: gene id -> parsed KEGG entry
    data = dict()

    # Read in KEGG gene ID & gene symbol pairs
    with open("hsa_gene_list.json", "r") as g:
        gene_data = json.load(g)

    for gene in gene_data:
        # The parenthesised form prints the same text on Python 2 and is
        # valid syntax on Python 3, unlike the former print statement.
        print(gene)
        data[gene] = k.parse(k.get(gene))

    with open('ginfo.json', 'w') as fw:
        json.dump(data, fw)
Exemplo n.º 21
0
    def get_kegg(self, pathway_id):
        """
        Download the KGML of a pathway, parse it and save the result.

        A new KEGG client is stored in ``self.kegg`` and the downloaded
        KGML in ``self.kgml`` before ``parse_kgml`` and ``save_kegg`` are
        called.
        """
        client = KEGG()
        self.kegg = client
        self.kgml = client.get(pathway_id, "kgml")
        # NOTE(review): pathway_id is passed as the only argument of
        # parse_kgml - confirm this is what that method expects.
        self.parse_kgml(pathway_id)

        self.save_kegg(pathway_id)
Exemplo n.º 22
0
    def get_reaction_ECs_from_kegg(self):
        """
        Fill ``self.reaction_ECs`` with the EC numbers of the model reactions.

        Every item of ``self.model.reactions`` is split on spaces into KEGG
        reaction ids.  For each id not seen before, the ``ENZYME`` field of
        its parsed KEGG entry is added to ``self.reaction_ECs[id]``.  Any
        exception raised while handling one item is printed, and processing
        continues with the next item.
        """
        self.reaction_ECs = defaultdict(set)

        kegg = KEGG()
        for r in self.model.reactions:
            try:
                for i in r.split(" "):
                    if i in self.reaction_ECs:
                        continue
                    print("KEGG reaction", i)
                    # Only this id's own EC numbers are stored.  A single
                    # list used to be shared by all ids of ``r``, so later
                    # ids also received the ECs of the earlier ones.
                    enzymes = kegg.parse(kegg.get(i))['ENZYME']
                    for e in enzymes:
                        self.reaction_ECs[i].add(e)

            except Exception as inst:
                print(inst)

        print("EC data loaded from KEGG")
Exemplo n.º 23
0
    def getData(self):
        """
        Get the KEGG entry of every drug.

        Note: this takes time, since one request is made per drug id.

        Returns:
            dict mapping each id in ``drugIds`` of the KEGG client to the
            result of ``get`` for that id.
        """
        mykegg = KEGG()

        # Single-argument print: same output on Python 2, valid on Python 3.
        print('There are %d drugs in Kegg' % len(mykegg.drugIds))

        # Get data from Kegg database.  The loop used to refer to ``k``,
        # a name that is not defined in this method.
        data = {ID: mykegg.get(ID) for ID in mykegg.drugIds}

        print('Finish!')

        return data
Exemplo n.º 24
0
def main():
    """
    Download the KGML file of every pathway listed in ``../hsa_list.txt``.

    ``path:`` prefixes are removed from the listing, and the KGML of each
    remaining line is requested from the KEGG REST service and written to
    ``pathways/path_<id>``.
    """
    # NOTE(review): the client is not used below; kept in case creating it
    # is relied upon - confirm and remove.
    k = KEGG()

    # Create list of hsa (human) pathways
    with open("../hsa_list.txt") as listing:
        list_path = listing.read().replace('path:', '').split('\n')
    # Drop the last entry, expected to be the blank one after the final
    # line break.
    list_path.pop()

    for i, hsa in enumerate(list_path, 1):
        # Same text as the former Python 2 print statement.
        print("# of pathways processed:  %d" % i)
        # Request KGML file for a pathway
        req_url = 'http://rest.kegg.jp/get/' + hsa + '/kgml'
        kgml = requests.get(req_url)
        # The context manager closes the file even if the write fails.
        with open('pathways/path_' + hsa, 'w') as out:
            out.write(kgml.text)
Exemplo n.º 25
0
```
python3 structure_processor.py "" "" --filter_genes "TP53"
```
saves the data for the gene named "TP53" to separate files (this parameter can be a comma-separated list of gene names).
Fragments are saved to pictures/ if they are found.
Otherwise the data is saved to different files.
After processing, the gene names from the parameter list are saved to "processed_genes.log".
To rerun with the same gene list, remove the lines corresponding to those names from this file, or remove the whole file -
it is currently used to skip gene names which were already processed.

The other call option might be incorrect now.
"""
from bioservices.kegg import KEGG

# Module-level KEGG client, created once when the module is loaded.
keggParser = KEGG()

import pickle
import argparse

ORGANISM = "hsa"  # organism code; presumably used for KEGG queries elsewhere - TODO confirm
GENES = ["p53"]  # sample gene
PDB_PATH = "pdb"  # directory created below and passed to ProDy

import os, prody, pystache, logging

# Make sure the PDB directory exists before it is handed to ProDy.
if not os.path.exists(PDB_PATH):
    os.mkdir(PDB_PATH)
    # TODO: for now I haven't checked if pathPDBFolder creates this folder -
    # if it is created, this check should be removed.
# Presumably registers PDB_PATH as ProDy's local PDB folder - TODO confirm.
prody.proteins.localpdb.pathPDBFolder(PDB_PATH)
Exemplo n.º 26
0
def _wikipathways_text_search(query, species=None):
    """Run a WikiPathways text search and return the hits as Pathway objects."""
    url = "http://webservice.wikipathways.org/"
    ext = "/findPathwaysByText?query=" + str(query)
    if species is not None:
        ext += "&species=" + str(species)
    r = requests.get(url + ext,
                     headers={"Content-Type": "application/json"})

    if not r.ok:
        # raise_for_status() always raises for a response that is not ok,
        # so the sys.exit() that used to follow could never run.
        r.raise_for_status()

    namespace = "{http://www.wikipathways.org/webservice}"
    field_by_tag = {
        namespace + "id": "identifier",
        namespace + "score": "score",
        namespace + "url": "url",
        namespace + "name": "name",
        namespace + "species": "species",
        namespace + "revision": "revision",
    }

    pathways = []
    seen_ids = set()
    for child in ET.fromstring(r.text):
        fields = {}
        for subchild in child:
            field = field_by_tag.get(subchild.tag)
            if field is not None:
                fields[field] = subchild.text

        temp_path = gnomics.objects.pathway.Pathway(
            identifier=fields["identifier"],
            identifier_type="WikiPathways ID",
            name=fields["name"],
            taxon=fields["species"],
            source="WikiPathways")

        # Deduplicate on the identifier itself.  The former check compared
        # the identifier string with the Pathway objects already collected,
        # so it never filtered anything.
        if fields["identifier"] not in seen_ids:
            seen_ids.add(fields["identifier"])
            pathways.append(temp_path)
    return pathways


def _kegg_pathway_search(query):
    """Run a KEGG pathway search and return the hits as Pathway objects."""
    k = KEGG()
    # Checked in this order; the first marker found in the id wins.
    id_types = (
        ("map", "KEGG MAP PATHWAY ID"),
        ("ko", "KEGG KO PATHWAY ID"),
        ("ec", "KEGG EC PATHWAY ID"),
        ("rn", "KEGG RN PATHWAY ID"),
    )

    pathways = []
    for line in k.find("pathway", query).split("\n"):
        columns = line.split("\t")
        if len(columns) == 1:
            continue
        path_id = columns[0].strip().split(":")[1]
        path_name = columns[1].strip()

        for marker, identifier_type in id_types:
            if marker in path_id:
                pathways.append(gnomics.objects.pathway.Pathway(
                    identifier=path_id,
                    identifier_type=identifier_type,
                    source="KEGG",
                    name=path_name))
                break
        else:
            # Unknown kind of pathway id: show the raw entry instead.
            print(k.get(path_id))
    return pathways


def search(query,
           source="wikipathways",
           result_format="xml",
           species=None,
           genes=None,
           user=None):
    """
    Search pathway databases for ``query``.

    Args:
        query: search text.
        source: ``"wikipathways"``, ``"kegg"`` or ``"all"``
            (case-insensitive); any other value yields an empty result.
        result_format: accepted for compatibility; not used.
        species: optional species filter for the WikiPathways search.
        genes: when given, the KEGG search is skipped (a gene-based KEGG
            search is not implemented).
        user: accepted for compatibility; not used.

    Returns:
        list of ``gnomics.objects.pathway.Pathway`` objects, WikiPathways
        hits first.

    Raises:
        requests.HTTPError: when the WikiPathways service answers with an
            error status.
    """
    path_array = []
    wanted = source.lower()

    if wanted in ["wikipathways", "all"]:
        path_array.extend(_wikipathways_text_search(query, species))

    # With ``genes`` given the old code only created an unused KEGG client.
    if wanted in ["kegg", "all"] and genes is None:
        path_array.extend(_kegg_pathway_search(query))

    return path_array
Exemplo n.º 27
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bioservices.kegg import KEGG
from sklearn.cluster import KMeans

import sys

# Print arrays in full.  NumPy rejects NaN as a threshold ("threshold must
# be non-NAN") and suggests sys.maxsize for untruncated output.
np.set_printoptions(threshold=sys.maxsize)

data = []

# Expression table read from RPKMs.csv.
df = pd.read_csv('RPKMs.csv', delimiter=",")

# Shared KEGG client used by the functions below.
k = KEGG()

#for i in range(100):
#    print(i,"****")
#    print("//\n",k.get_pathway_by_gene(str(df["symbol"][i]), "hsa"))


def search_pathways_4_list(list_of_genes):

    matrix = [[0 for j in range(len(list_of_genes))] for i in range(0)]
    list_of_pathways = []
    dict_of_genes = {}

    for i, gene in enumerate(list_of_genes):
        try:
            pathways = k.get_pathway_by_gene(gene, "hsa")

            if pathways != None:
                pathways = pathways.values()
Exemplo n.º 28
0
def _wikipathways_text_search(query, species=None):
    """Run a WikiPathways text search and return the hits as Pathway objects."""
    url = "http://webservice.wikipathways.org/"
    ext = "/findPathwaysByText?query=" + str(query)
    if species is not None:
        ext += "&species=" + str(species)
    r = requests.get(url+ext, headers={"Content-Type": "application/json"})

    if not r.ok:
        # raise_for_status() always raises for a response that is not ok,
        # so the sys.exit() that used to follow could never run.
        r.raise_for_status()

    namespace = "{http://www.wikipathways.org/webservice}"
    field_by_tag = {
        namespace + "id": "identifier",
        namespace + "score": "score",
        namespace + "url": "url",
        namespace + "name": "name",
        namespace + "species": "species",
        namespace + "revision": "revision",
    }

    pathways = []
    seen_ids = set()
    for child in ET.fromstring(r.text):
        fields = {}
        for subchild in child:
            field = field_by_tag.get(subchild.tag)
            if field is not None:
                fields[field] = subchild.text

        temp_path = gnomics.objects.pathway.Pathway(identifier = fields["identifier"], identifier_type = "WikiPathways ID", name = fields["name"], taxon = fields["species"], source = "WikiPathways")

        # Deduplicate on the identifier itself.  The former check compared
        # the identifier string with the Pathway objects already collected,
        # so it never filtered anything.
        if fields["identifier"] not in seen_ids:
            seen_ids.add(fields["identifier"])
            pathways.append(temp_path)
    return pathways


def _kegg_pathway_search(query):
    """Run a KEGG pathway search and return the hits as Pathway objects."""
    k = KEGG()
    # Checked in this order; the first marker found in the id wins.
    id_types = (
        ("map", "KEGG MAP PATHWAY ID"),
        ("ko", "KEGG KO PATHWAY ID"),
        ("ec", "KEGG EC PATHWAY ID"),
        ("rn", "KEGG RN PATHWAY ID"),
    )

    pathways = []
    for line in k.find("pathway", query).split("\n"):
        columns = line.split("\t")
        if len(columns) == 1:
            continue
        path_id = columns[0].strip().split(":")[1]
        path_name = columns[1].strip()

        for marker, identifier_type in id_types:
            if marker in path_id:
                pathways.append(gnomics.objects.pathway.Pathway(identifier=path_id, identifier_type=identifier_type, source="KEGG", name=path_name))
                break
        else:
            # Unknown kind of pathway id: show the raw entry instead.
            print(k.get(path_id))
    return pathways


def search(query, source="wikipathways", result_format="xml", species=None, genes=None, user=None):
    """
    Search pathway databases for ``query``.

    Args:
        query: search text.
        source: ``"wikipathways"``, ``"kegg"`` or ``"all"``
            (case-insensitive); any other value yields an empty result.
        result_format: accepted for compatibility; not used.
        species: optional species filter for the WikiPathways search.
        genes: when given, the KEGG search is skipped (a gene-based KEGG
            search is not implemented).
        user: accepted for compatibility; not used.

    Returns:
        list of ``gnomics.objects.pathway.Pathway`` objects, WikiPathways
        hits first.

    Raises:
        requests.HTTPError: when the WikiPathways service answers with an
            error status.
    """
    path_array = []
    wanted = source.lower()

    if wanted in ["wikipathways", "all"]:
        path_array.extend(_wikipathways_text_search(query, species))

    # With ``genes`` given the old code only created an unused KEGG client.
    if wanted in ["kegg", "all"] and genes is None:
        path_array.extend(_kegg_pathway_search(query))

    return path_array
Exemplo n.º 29
0
#from Bio import Entrez
from bioservices.kegg import KEGG
import sys

# Scratch script: creates a KEGG client for organism "lpl", opens the id
# file and prints ``result``, which is never filled in.
k = KEGG()
#Entrez.email = "*****@*****.**"

#file = open(sys.argv[1], "r")
# NOTE(review): the file is opened and closed without being read - the loop
# that would read it is commented out below.
file = open("../data/ids5.txt", "r")
result = ""
k.organism = "lpl"
# NOTE(review): get() is called without an entry id here, while every other
# call in this collection passes one - confirm what was meant to be fetched.
k.get()
#for line in file.readlines():
#
file.close()
print(result)

# for line in file.readlines():
#     handle = Entrez.esearch(db="pubmed", term=line)
#     record = Entrez.read(handle)
#     ids = record["IdList"]
#     print(ids)
Exemplo n.º 30
0
# Three-factor analysis of the transposed ``trans_n`` matrix.
t_fa = FactorAnalysis(n_components=3).fit(trans_n.T)
# Components as a table with one row per ``trans_n`` index entry and one
# column per factor.
t_hfac = DataFrame(t_fa.components_, index=['Factor %d' % (i + 1) for i in range(3)], columns=trans_n.index).T
# Label a row 'N' when the fourth dash-separated field of its index starts
# with '11', otherwise 'T' (presumably normal vs tumour - TODO confirm).
t_hfac['type'] = ['N' if i.split('-')[3].startswith('11') else 'T' for i in t_hfac.index]

# Factor scores, one row per ``trans_n`` column.
t_feat = DataFrame(t_fa.transform(trans_n.T), index=trans_n.columns, columns=['Factor %d' % (i + 1) for i in range(3)])
# Single-argument print calls give the same output on Python 2 and are
# valid on Python 3, unlike the former print statements.
print(t_feat['Factor 2'].sort_values())

sns.set(style='ticks', context='paper', rc={'axes.linewidth': .3, 'xtick.major.width': .3, 'ytick.major.width': .3})
g = sns.pairplot(t_hfac, hue='type', palette=pal)
plt.savefig('%s/reports/transcriptomics_pairplot.pdf' % wd, bbox_inches='tight')
plt.close('all')
print('[INFO] Corr plotted!')


# -- Bioservices KEGG information
bioser = KEGG(cache=True)
bioser.organism = 'hsa'

# Get pathways: raw KEGG entry text for every pathway id of the organism
keggp = {p: bioser.get(p) for p in bioser.pathwayIds}
# Single-argument print calls give the same output on Python 2 and are valid
# on Python 3, unlike the former print statements.
print('[INFO] Pathways fetched')

# The patterns are raw strings now: "\s" in a normal string literal is an
# invalid escape sequence.  The regular expressions themselves are unchanged.
# Pathway name: text after NAME, up to the first " - ".
keggp_name = {p: re.findall(r'NAME\s+(.*)?\n', keggp[p])[0].split(' - ')[0] for p in keggp}
# Compound ids (C followed by digits) found in the COMPOUND section.
keggp_comp = {p: {c for keggc in re.findall(r'(COMPOUND.*?)\n[A-Z]', keggp[p], re.S)[0].split('\n') for c in re.findall(r'\s+(C[0-9]+)\s+', keggc)} for p in keggp if 'COMPOUND' in keggp[p]}
# Gene names found in the GENE section (the text in front of the ";").
keggp_gene = {p: {g for keggg in re.findall(r'(GENE.*?)\n[A-Z]', keggp[p], re.S)[0].split('\n') for g in re.findall(r'\s+([A-Z]+.+);', keggg)} for p in keggp if 'GENE' in keggp[p]}
# Transcription factors whose target set shares a gene with the pathway.
keggp_tf = {p: {tf for tf in tf_targets_dict if len(tf_targets_dict[tf].intersection(keggp_gene[p])) > 0} for p in keggp_gene}
print('[INFO] Pathways genes fetched')

# Binary pathway x metabolite membership matrix.
keggp_comp_m = DataFrame([(p, m, 1) for p in keggp_comp for m in keggp_comp[p]], columns=['pathway', 'metabolite', 'value'])
keggp_comp_m = pivot_table(keggp_comp_m, index='pathway', columns='metabolite', values='value', fill_value=0)
# head() is called now; the statement used to print the bound method.
print(keggp_comp_m.head())
Exemplo n.º 31
0
def main():
    """Assemble a gene -> reaction -> metabolite network and dump it as JSON.

    Inputs are read from the working directory: ``hsa_list.txt`` (one
    pathway id per line), ``KEGG_Reac.json`` (reaction id -> direction),
    ``ginfo.json``, the KGML files under
    ``etc_scripts/KEGG_DB_PATH/pathways/`` and the per-gene reaction
    listings under ``reacs/``.

    The result, ``{gene id: {reaction id: {"DIRECTION", "R_SUBS",
    "R_PROD"}}}``, is written to ``keggMetabNetwork.json``.
    """
    # Start KEGG interface for querying
    k = KEGG()
    # Final network output
    data = dict()

    # Create list of hsa (human) pathways
    with open("hsa_list.txt") as list_file:
        list_path = list_file.read().replace('path:', '').split('\n')
    # Drop the empty entry produced by a trailing newline; when the file
    # has no trailing newline the last pathway is kept
    if list_path and not list_path[-1]:
        list_path.pop()

    # Read in KEGG reaction ID & reversibility information
    with open("KEGG_Reac.json", "r") as fp:
        reac_data = json.load(fp)

    # Read in KEGG gene data
    # NOTE(review): gene_data is not used below; the load is kept so a
    # missing or malformed ginfo.json still fails early, as before.
    with open("ginfo.json", "r") as fp2:
        gene_data = json.load(fp2)

    # Keep track of # of pathways processed
    for i, hsa in enumerate(list_path, 1):
        print("# of pathways processed:  %d" % i)
        # Open previously extracted KGML files and build the element tree
        with open("etc_scripts/KEGG_DB_PATH/pathways/path_" + hsa) as kgml_file:
            root = ET.fromstring(kgml_file.read())

        # Iterate through ALL reactions
        for reaction in root.findall("./reaction"):
            # 'id' shared between the reaction and its 'entry' element
            id_look = reaction.attrib["id"]
            # Gene IDs come from the 'name' attribute of the matching entry
            gene_ids = []
            for entry in root.findall("./entry"):
                if entry.attrib["id"] == id_look:
                    gene_ids = entry.attrib["name"].split(' ')

            # {gene id: [reaction ids]} from the previously extracted listings
            gene_reactions = dict()
            for g_id in gene_ids:
                with open('reacs/reac_' + g_id, 'r') as rp:
                    # Second whitespace-separated field is 'rn:<reaction id>'
                    gene_reactions[g_id] = [
                        line.split()[1].split('rn:')[1] for line in rp
                    ]

            # Organize into the final output
            for g_id, reaction_ids in gene_reactions.items():
                # Reaction ids and their info for this gene
                vals = dict()
                for r_id in reaction_ids:
                    # metabs[0] is stored as substrates, metabs[1] as products
                    metabs = get_metabs(k, r_id)
                    vals[r_id] = {
                        # 'NA' when the reaction is absent from the reaction DB
                        "DIRECTION": reac_data.get(r_id, "NA"),
                        "R_SUBS": metabs[0],
                        "R_PROD": metabs[1]
                    }

                if g_id in data:
                    # Gene seen before: only add reactions not recorded yet
                    known = get_react(data[g_id])
                    for r_id, info in vals.items():
                        if r_id not in known:
                            data[g_id][r_id] = info
                else:
                    data[g_id] = vals

    with open('keggMetabNetwork.json', 'w') as f:
        json.dump(data, f)
Exemplo n.º 32
0
import re
from bioservices.kegg import KEGG

# -- KEGG bioservice (human, with caching enabled)
bioser = KEGG(cache=True)
bioser.organism = 'hsa'


def _fetch_entries(identifiers):
    """Return {identifier: bioser.get(identifier)} for every identifier."""
    return dict((identifier, bioser.get(identifier)) for identifier in identifiers)


# Pathway entries
keggp = _fetch_entries(bioser.pathwayIds)
print('[INFO] Pathways fetched')

# Reaction entries
keggr = _fetch_entries(bioser.reactionIds)
print('[INFO] Reactions fetched')

# Enzyme entries
kegge = _fetch_entries(bioser.enzymeIds)
print('[INFO] Enzymes fetched')

# keggc = {c: bioser.get(c) for c in bioser.compoundIds}
# print '[INFO] Compounds fetched'
#
# # Get modules
# keggm = {m: bioser.get(m) for m in bioser.moduleIds}
# print '[INFO] Modules fetched'


# -- KEGG methods
def get_pathway_names(pathways=None):
    """Map pathway ids to their names as parsed from the cached entries.

    The name is the text of the entry's NAME field up to the first
    ``' - '`` separator.

    :param pathways: iterable of keys of ``keggp``; when omitted or empty,
        every pathway in ``keggp`` is used.
    :return: dict ``{pathway id: name}``.
    :raises KeyError: if an id is not in ``keggp``.
    :raises IndexError: if an entry has no NAME line.
    """
    pathways_ = pathways if pathways else set(keggp)
    # Raw string: '\s' is not a valid escape in a plain string literal
    return {p: re.findall(r'NAME\s+(.*)?\n', keggp[p])[0].split(' - ')[0] for p in pathways_}
Exemplo n.º 33
0
        blast_text = blastHits[ids]
    else:
        blast_text = 'NULL'

    # Pfam and PROSITE hits fall back to the literal 'NULL' when absent
    pfam_text = pfamHits.get(ids)
    if pfam_text is None:
        pfam_text = 'NULL'

    prosite_text = prositeHits.get(ids)
    if prosite_text is None:
        prosite_text = 'NULL'

    # Get the KEGG hits
    # NOTE(review): a new KEGG client is built every time this code runs;
    # if this sits inside a loop, consider creating it once outside.
    kegg = KEGG()
    gene_id = gene_ids[ids]
    KEGG_IDs = kegg.get_pathway_by_gene(gene_id, "acb")
    if KEGG_IDs is not None:
        # '<value> [<key>]' for each hit, joined with '; '
        kegg_text = '; '.join(
            KEGG_IDs[KEGG_ID] + ' [' + KEGG_ID + ']' for KEGG_ID in KEGG_IDs
        )
    else:
        kegg_text = 'NULL'
    comments = 'NULL'

    # One tab-separated record per id
    row = '\t'.join([ids, blast_text, pfam_text, prosite_text, kegg_text, GO_IDs, comments]) + '\n'
    output.write(row)

output.close()
Exemplo n.º 34
0
            ko2locus[ko].append(locus)

    ########################################################
    tqdm.write("collect all KO id, start iterate all KO info")
    # KO info is cached as a pickle in tmp_dir and reused on later runs
    ko2info_path = join(tmp_dir, 'ko2info')
    if not exists(ko2info_path):
        ko2info = {}
        ko_list = list(ko2locus.keys())
        # presumably batch_iter yields chunks of up to 10 ids — confirm
        pack10_up = batch_iter(ko_list, 10)
        for ko_batch in tqdm(pack10_up):
            ko_info = get_KO_info('+'.join(ko_batch))
            if ko_info is None:
                continue
            ko2info.update(ko_info)
        # Close the handle explicitly so the cache is flushed to disk
        with open(ko2info_path, 'wb') as cache_file:
            pickle.dump(ko2info, cache_file)
    else:
        with open(ko2info_path, 'rb') as cache_file:
            ko2info = pickle.load(cache_file)
    locus_df = pack_it_up(ko2info, locus2ko, locus2info)
    locus_df = locus_df.reindex(columns=[
        'locus_tag', 'ko', 'definition', 'gene_name', 'ncbi_id',
        'uniprot_refID', 'source_organism', 'ID', 'AA_seq', 'reference_t'
    ])
    locus_df.to_csv(output_tab, sep='\t', index=1, index_label='locus_tag')
    with open(output_tab + '.null_ID', 'w') as f1:
        f1.write('\n'.join(null_ID))
    return locus_df


# Script entry point: runs only when the file is executed directly.
if __name__ == '__main__':
    # NOTE(review): `kegg` is bound at module level before main() runs;
    # presumably helper functions read it as a global — confirm.
    kegg = KEGG()
    main()
Exemplo n.º 35
0
"""
Author: Daniel Esposito

Date: 28/12/2015

Purpose: Wrapper Class for accessing KEGG via the bioservices interface. So far this class implements
methods to obtain all pathways and then all reactions from those pathways in edgelist format
tagged with the type of reaction.
"""

from bioservices.kegg import KEGG
from predict.parsing import PPI
import pandas as pd

# ----------------------------------- UTILS -------------------------------- #
# Module-level KEGG client with its organism set to 'hsa'.
kegg = KEGG()
kegg.organism = 'hsa'
# NOTE(review): presumably the KGML relation/reaction type labels that are
# filtered out when building the edgelist — confirm against usage.
reactions_to_exclude = [
    'missing-interaction',
    'indirect-effect',
    'expression',
    'repression',
    'compound',
    'hidden-compound'
]

def uniprot_cmp(x, y):
    t = {'P':0, 'Q':1, 'O':2}
    try:
        x_num = t[x[0]]
    except KeyError:
Exemplo n.º 36
0
"""
KEGG module example
====================

Histogram of KEGG pathways relations
"""
#################################################
#
from pylab import *


# extract all relations from all pathways
from bioservices.kegg import KEGG
s = KEGG()
s.organism = "hsa"

# there are more than 260 pathways and retrieving them all takes time,
# so only the first max_pathways are parsed
max_pathways = 10
results = []
for pathway_id in s.pathwayIds[:max_pathways]:
    results.append(s.parse_kgml_pathway(pathway_id))
relations = [parsed['relations'] for parsed in results]

# plot the distribution of relation counts using 20 bins
relation_counts = list(map(len, relations))
hist(relation_counts, 20)
xlabel('number of relations')
ylabel('#')
title("number of relations per pathways")
grid(True)