Exemplo n.º 1
0
def create_keggids_csv(filename, org, output="ecoli_keggids.csv"):
    '''
    Extract keggids for an organism and save them to a tab-separated file

        args: filename is the tab-separated file containing gene name / locus
                  for all the organism genes (columns 'Locus' and 'Locus tag')
              org is the abbreviation of the organism in kegg
              output is the path of the file to write; it defaults to the
                  name that used to be hard-coded

    '''

    # Open csv as pandas dataframe (df).  The old ``tupleize_cols`` argument
    # is no longer passed: pandas 1.0 removed it and rejects the keyword.
    df = pd.read_csv(filename, sep="\t")
    gene_list = df['Locus'].tolist()
    bid_list = df['Locus tag'].tolist()

    k = KEGG()

    # find keggid for each gene: the second whitespace-separated token of the
    # find() answer is kept
    kid_list = [k.find(org, gene).split()[1] for gene in bid_list]

    # create new df and save it to csv
    new_df = pd.DataFrame({
        'gene': gene_list,
        'b_id': bid_list,
        'kegg_id': kid_list,
    })
    new_df.to_csv(output, sep="\t", index=False)
Exemplo n.º 2
0
def print_alignment_kegg(model):
    """
    Rewrite "cor.txt" into "cor_readable.txt" with readable names.

    Every line of "cor.txt" containing the ":***:" separator is split into a
    left part and a right part. Unless a part equals "MULTIR", the left part
    is replaced by the text following "NAME" in the KEGG entry fetched for
    it, and the right part by ``model.reactions[<right part>]``. Each pair is
    printed and written to the output file; other lines are ignored.

    args: model is an object whose ``reactions`` attribute can be indexed
          with the right-hand identifiers
    """
    # Both files are closed even if a lookup fails half-way through.
    with open("cor.txt") as f, open("cor_readable.txt", "w") as f_o:
        kegg = KEGG()

        for i in f:
            if ":***:" not in i:
                continue

            k, b = i.split(":***:")
            b = b.strip()

            if not k == "MULTIR":
                k = kegg.get(k)

                # keep the remainder of the line that starts with "NAME"
                i1 = k.find("NAME") + 4
                i2 = k[i1:].find("\n")

                k = k[i1:i1 + i2].strip()

            if not b == "MULTIR":
                b = model.reactions[b]

            print(k, ":***:", b)
            # %-formatting so a non-string value looked up in
            # model.reactions does not break the concatenation
            f_o.write("%s:***:%s\n" % (k, b))
Exemplo n.º 3
0
def show_pathway():
    """
    function that shows p53 pathway in KEGG

    Runs a pathway lookup for the p53 signaling pathway name and then prints
    the result of ``show_pathway`` for "path:hsa04115".
    """
    k = KEGG(verbose=True)
    # The species part of the name had been mangled to "H**o sapiens".
    k.lookfor_pathway("p53 signaling pathway - Homo sapiens (human)")
    print(k.show_pathway("path:hsa04115"))
Exemplo n.º 4
0
    def parse_kgml(self, ec_file=""):
        """
        Fill the reaction and metabolite tables from the KGML in ``self.kgml``.

        For every <reaction> element the id, the set of names, the substrate
        and product ids and a reversibility flag (1 when type is
        'reversible', else 0) are recorded. For every <entry> of type
        'compound' the id/name pair is recorded in both directions.

        args: ec_file, when non-empty, is handed to ``self.load_ECs``. If no
              EC file was loaded (none given, or loading failed), the ECs of
              every 'gene' entry are looked up with ``self.get_EC`` and added
              to the reaction named by the entry's 'reaction' attribute.
        """
        # http://biopython.org/DIST/docs/api/Bio.KEGG.KGML.KGML_parser-pysrc.html
        # https://github.com/deep-introspection/kegg-kgml-parser-python/blob/master/keggparser/parse_KGML.py

        tree = ET.fromstring(self.kgml)

        # Element.iter() replaces getiterator(), which Python 3.9 removed.
        for reaction in tree.iter('reaction'):
            r_id = reaction.get('id')
            # there can be several names separated by spaces
            r_name = reaction.get('name')
            r_names = set(r_name.split())  # set of names

            self.reactions[r_id] = r_names
            self.reaction_ids[r_name] = r_id
            self.listed_reactions.append(r_id)

            for sub in reaction.iter('substrate'):
                self.reaction_metabolites[r_id].add(sub.get('id'))
                self.reaction_reactants[r_id].add(sub.get('id'))

            for prod in reaction.iter('product'):
                self.reaction_metabolites[r_id].add(prod.get('id'))
                self.reaction_products[r_id].add(prod.get('id'))

            self.reversibility_reactions[r_id] = (
                1 if reaction.get('type') == 'reversible' else 0)

        EC_file_loaded = False
        if ec_file:
            try:
                self.load_ECs(ec_file)
                EC_file_loaded = True
            except Exception:
                # Best effort: fall back to online lookups when the EC file
                # cannot be loaded.
                self.kegg = KEGG()

        for entry in tree.iter('entry'):
            entry_type = entry.get('type')

            # or entry_type == 'ortholog'
            if not EC_file_loaded and entry_type == 'gene':
                genes = entry.get('name').split()
                gene_reaction_name = entry.get('reaction')
                gene_reaction_id = self.reaction_ids[gene_reaction_name]
                for g in genes:
                    for e in self.get_EC(g):
                        self.reaction_ECs[gene_reaction_id].add(e)

            if entry_type == 'compound':
                metabolite = entry.get('name')
                metabolite_id = entry.get('id')
                self.metabolites[metabolite_id] = metabolite
                self.metabolite_ids[metabolite] = metabolite_id
                self.listed_metabolites.append(metabolite_id)
Exemplo n.º 5
0
def extract_sequences(dict, flist):
    '''
    Get orthologs sequences on KEGG and write to a fasta file for each kegg id

        arg: dict is a dictionary with keggid as key and orthologs as value
                 (a list of lists of KEGG gene identifiers)
             flist is the collection of fasta file names that already exist;
                 a key whose "<key>.fas" is in it is skipped

    Files are written to the 'orthologs_fastas/' directory.
    (The parameter name ``dict`` shadows the builtin; it is kept because it
    is part of the signature.)
    '''
    k = KEGG()

    # loop through orthologs dictionary to get sequences from kegg
    for key, ortholog_groups in dict.items():
        fasta_name = key + ".fas"
        if fasta_name in flist:
            print(key + " is already created !!!")
            continue

        # one nucleotide sequence per gene, each followed by a newline;
        # the generator keeps the lookups lazy, exactly like the former loop
        fasta_text = "".join(
            k.get(gene_id, option="ntseq", parse=True) + "\n"
            for group in ortholog_groups
            for gene_id in group
        )

        print("writing : " + fasta_name)
        with open(os.path.join('orthologs_fastas/', fasta_name), 'w') as f:
            f.write(fasta_text)
Exemplo n.º 6
0
def pathwayInfo(code):
    # Summarise a pathway as the list [code, name, class]

    # Fetch the entry for the code and parse it into a dictionary
    searcher = KEGG()
    parsed = searcher.parse(searcher.get(code))

    # First name of the pathway, or 'NA' when the entry has no NAME field.
    # Commas are replaced by semicolons so the value cannot break the
    # column layout of the final output.
    if 'NAME' in parsed.keys():
        name_field = str(parsed['NAME'][0].replace(',', ';'))
    else:
        name_field = 'NA'

    # Same treatment for the CLASS field (the whole value is stringified)
    if 'CLASS' in parsed.keys():
        class_field = str(parsed['CLASS']).replace(',', ';')
    else:
        class_field = 'NA'

    return [code, name_field, class_field]
Exemplo n.º 7
0
def get_kegg_info(stId):
    """
    Fetch the KEGG entry for ``stId`` and return the parsed result.
    """
    service = KEGG()
    return service.parse(service.get(stId))
Exemplo n.º 8
0
def get_genes_from_kegg_pathway(pathway):
    """Return the gene symbols listed in the GENE field of a KEGG pathway.

    Each GENE item is split on a double space into an (entrez, symbol) pair;
    only the symbols are returned, as a tuple.
    """
    from bioservices.kegg import KEGG
    client = KEGG()
    client.organism = 'hsa'
    entry = client.get(pathway)
    gene_items = client.parse(entry)['GENE']
    pairs = [item.split('  ') for item in gene_items]
    entrez, symbol = zip(*pairs)
    return symbol
Exemplo n.º 9
0
def retrieve_kegg_formula(reactome_compound_name):
    """
    Return the formula of a compound from its KEGG entry, or None.

    'COMPOUND' in the given name is replaced by 'cpd' before the lookup. The
    second token of the first line starting with 'FORMULA' is returned.
    None is returned when the lookup yields an integer instead of text,
    when no FORMULA line exists, or when that line carries no value.
    """
    k = KEGG()
    compound_name = reactome_compound_name.replace('COMPOUND', 'cpd')
    res = k.get(compound_name)
    # Other callers in this file guard against an integer result from the
    # KEGG client; treat it as "nothing found" here as well.
    if isinstance(res, int):
        return None
    for line in res.split('\n'):
        if line.startswith('FORMULA'):
            tokens = line.split()
            return tokens[1] if len(tokens) > 1 else None
    return None
Exemplo n.º 10
0
def get_single_compound_metadata_online(compound_id):
    """Look up one compound online.

    Ids starting with 'C' (case-insensitive) are fetched from KEGG and
    returned parsed; any other id is prefixed with 'CHEBI:' and resolved
    through ChEBI's getCompleteEntity.
    """
    if not compound_id.upper().startswith('C'):
        chebi = ChEBI()
        return chebi.getCompleteEntity('CHEBI:' + compound_id)

    kegg = KEGG()
    return kegg.parse(kegg.get(compound_id))
Exemplo n.º 11
0
def load_kegg(gene, organism):
    """
    Return the pathways of a gene as one comma-separated string.

    The values of the mapping returned by ``get_pathway_by_gene`` are joined
    with ', '. An empty string is returned when nothing is found or when the
    lookup fails; a failure is reported on stdout.
    """
    k = KEGG()
    result_line = ''
    try:
        pathways = k.get_pathway_by_gene(gene, organism)
        if pathways:
            result_line = ', '.join(pathways.values())
    except Exception:
        # Deliberately best-effort, but no longer swallows
        # KeyboardInterrupt / SystemExit the way a bare except did.
        print("    Gene '{0}' is not in KEGG database".format(gene))
    return result_line
Exemplo n.º 12
0
    def get_kegg(self, pathway_id):
        """Download the KGML of a pathway, parse it and save the result.

        A new KEGG client is stored on ``self.kegg`` and the downloaded KGML
        on ``self.kgml``; ``parse_kgml`` and ``save_kegg`` are then called
        with the pathway id.
        """
        client = KEGG()
        self.kegg = client

        self.kgml = client.get(pathway_id, "kgml")
        self.parse_kgml(pathway_id)

        self.save_kegg(pathway_id)
Exemplo n.º 13
0
def extract_orthologs(filename):
    '''
    Create dictionary with keggid as key and list of orthologs as value

        arg: tab-separated csv with a 'kegg_id' column
        return : dict mapping each keggid to a list of lists of
                 "<organism>:<gene>" strings, restricted to the organisms
                 listed in the 'KEGG' column of 'gammaproteo.csv'

    Rows whose kegg_id is "no", and ids whose parsed entry is an integer,
    are skipped.
    '''

    orthos_dict = {}
    k = KEGG()

    # The old ``tupleize_cols`` argument is no longer passed: pandas 1.0
    # removed it and rejects the keyword.
    df = pd.read_csv(filename, sep="\t")
    df_gamma = pd.read_csv('gammaproteo.csv', sep="\t")
    # set: membership is tested once per organism of every entry
    gamma_codes = set(df_gamma['KEGG'].tolist())

    # loop through keggid to get orthologs
    for keggid in df['kegg_id']:

        if keggid == "no":
            continue

        print(str(keggid))

        # get orthologs on kegg
        dict_data = k.parse(k.get(keggid))

        if isinstance(dict_data, int):
            continue

        # keep only the organisms that are gammaproteobacteria
        ortho_list = []
        for key, value in dict_data['GENES'].items():
            org_code = key.lower()
            if org_code not in gamma_codes:
                continue

            # tokens found before the first '(' of the gene string
            gene_ids = value.split('(')[0].split()
            ortho_list.append([org_code + ":" + gene for gene in gene_ids])

        orthos_dict[keggid] = ortho_list

    return orthos_dict
Exemplo n.º 14
0
 def id2seq(self, hsa):
     """
     Write the amino-acid sequence of a KEGG entry to "dummy.txt" as fasta.

     The AASEQ field of the parsed entry is stripped of all whitespace and
     written under a ">hsa" header line. When the field is missing or the
     entry could not be parsed, an empty sequence is written. Returns None.
     """
     s = KEGG()
     dict_d = s.parse(s.get(hsa))
     try:
         seq = re.sub(r'\s+', '', dict_d['AASEQ'])
     except (KeyError, TypeError):
         # no AASEQ field, or the parse result is not a mapping of strings
         seq = ''
     with open("dummy.txt", "w") as text_file:
         text_file.write('>' + str(hsa) + '\n' + seq)
     return None
Exemplo n.º 15
0
def kegg(inputInteractions):
    """Pair the first element of every input item with its KEGG pathways.

    For each item, the pathways of ``items[1].getName()`` are looked up for
    organism "hsa" and one ``[items[0], pathway_value]`` pair is appended per
    pathway. An AttributeError raised during the lookup is reported on
    stdout and the item is skipped.
    """
    from bioservices.kegg import KEGG
    client = KEGG()
    interactions = []
    for items in inputInteractions:
        print(items[1].getName())
        try:
            pathways = client.get_pathway_by_gene(items[1].getName(), "hsa")
            if pathways:
                interactions.extend(
                    [items[0], pathway] for pathway in pathways.values())
        except AttributeError:
            print("Gene name error!!!!!!!!!")
    return interactions
Exemplo n.º 16
0
    def getData(self):
        '''
        Gets all the data for the drugs
        Obs. IT TAKES TIME.

        Returns a dict mapping every id in the KEGG client's ``drugIds`` to
        the result of fetching that id.
        '''
        mykegg = KEGG()

        # The loop used to read an undefined name ``k`` instead of ``mykegg``.
        drug_ids = mykegg.drugIds
        print('There are %d drugs in Kegg' % len(drug_ids))

        # Get data from Kegg database.
        data = dict()
        for ID in drug_ids:
            data[ID] = mykegg.get(ID)

        print('Finish!')

        return data
Exemplo n.º 17
0
def get_compound_metadata_online(kegg_ids):
    """
    Fetch the display name of every KEGG id.

    Returns a dict mapping each id to ``{'display_name': <first NAME>}``
    with every ';' removed from the name. Ids whose lookup raises a
    TypeError are reported on stdout and left out of the result.
    Progress is printed every 10 ids.
    """
    s = KEGG()
    metadata_map = {}
    total = len(kegg_ids)
    for i, kegg_id in enumerate(kegg_ids):
        if i % 10 == 0:
            print("Retrieving %d/%d KEGG records" % (i, total))
        # Reset per id so the error message never shows the parsed data of a
        # previous id (or fails on an unbound name for the first one).
        d = None
        try:
            d = s.parse(s.get(kegg_id))
            first_name = d['NAME'][0].replace(';', '')
            metadata_map[kegg_id] = {'display_name': first_name}
        except TypeError:
            print('kegg_id=%s parsed_data=%s' % (kegg_id, d))
    return metadata_map
Exemplo n.º 18
0
def get_seq(filename):
    '''
    Create dictionary with species as keys and sequences as values for an alignment

    arg: filename of a fasta alignment inside 'alignments_nogaps/'; record
         ids are expected to look like "<organism>_<gene>"
    return: dict mapping an organism name (second and third token of the
            ORGANISM field of the KEGG entry) to the list of its sequences,
            each sequence being a list of characters

    Records whose KEGG lookup returns an integer are printed and skipped.
    '''

    k = KEGG()
    records = list(SeqIO.parse(os.path.join('alignments_nogaps/', filename), "fasta"))

    orgdict = {}

    # go through sequences and search for organism name on kegg
    for record in records:

        id_parts = record.id.split('_', 1)
        kegg_id = id_parts[0] + ':' + id_parts[1]

        handle = k.get(kegg_id)
        if isinstance(handle, int):
            print(kegg_id)
            continue

        org_tokens = k.parse(handle)['ORGANISM'].split()
        org = org_tokens[1] + " " + org_tokens[2]

        # Group while reading: one pass instead of rescanning the organism
        # list once per distinct organism. Sequences stay in record order.
        orgdict.setdefault(org, []).append(list(str(record.seq)))

    return orgdict
def queryKegg(theIDs):
    """Look up every id in KEGG organism "acb".

    The first three characters of each id are dropped; the remainder is
    searched with ``find`` and the text before the first tab of the answer
    is fetched and parsed.

    Returns a pair (list of parsed entries, list of shortened ids), both in
    input order.
    """
    print("Currently querying KEGG...")
    client = KEGG()
    keggData = []
    IDlist = []

    for raw_id in theIDs:
        short_id = raw_id[3:]
        first_hit = client.find("acb", short_id).split('\t')[0]
        keggData.append(client.parse(client.get(first_hit)))
        IDlist.append(short_id)

    return keggData, IDlist
Exemplo n.º 20
0
def main():
    """
    Fetch and parse the KEGG entry of every gene in "hsa_gene_list.json".

    The keys of the JSON file are used as KEGG gene ids; the parsed entries
    are written to "ginfo.json" keyed by gene id.
    """
    # Start KEGG interface
    k = KEGG()

    # Read in KEGG gene ID & gene symbol pairs
    with open("hsa_gene_list.json", "r") as g:
        gene_data = json.load(g)

    # Final result: gene id -> parsed KEGG entry
    data = dict()
    for gene in gene_data:
        # print() with one argument behaves the same on Python 2 and 3
        print(gene)
        data[gene] = k.parse(k.get(gene))

    with open('ginfo.json', 'w') as fw:
        json.dump(data, fw)
Exemplo n.º 21
0
def main():
    """
    Download the KGML file of every pathway listed in "../hsa_list.txt".

    Each line of the list has its 'path:' prefix removed; the KGML of the
    remaining id is requested from the KEGG REST service and written to
    'pathways/path_<id>'.
    """
    # Create list of hsa (human) pathways
    with open("../hsa_list.txt") as listing:
        list_path = listing.read().replace('path:', '').split('\n')
    # Drop the last element (blank entry left by the final split)
    list_path.pop()

    for i, hsa in enumerate(list_path, 1):
        # Same text the Python 2 print statement produced
        print("# of pathways processed:  %d" % i)
        # Request KGML file for a pathway
        req_url = 'http://rest.kegg.jp/get/' + hsa + '/kgml'
        kgml = requests.get(req_url)
        with open('pathways/path_' + hsa, 'w') as out:
            out.write(kgml.text)
Exemplo n.º 22
0
    def get_reaction_ECs_from_kegg(self):
        """Rebuild ``self.reaction_ECs`` from the ENZYME field of KEGG entries.

        Every element of ``self.model.reactions`` is split on spaces into
        reaction ids; each id not yet present in ``self.reaction_ECs`` is
        fetched from KEGG. Any exception raised while handling one model
        reaction is printed and processing moves on to the next one.

        NOTE(review): the EC list is accumulated across the ids of one model
        reaction, so a later id also receives the ECs of the earlier ids of
        the same reaction -- confirm this is intended.
        """
        self.reaction_ECs = defaultdict(set)

        client = KEGG()
        for r in self.model.reactions:
            collected = []
            try:
                for reaction_id in r.split(" "):
                    if reaction_id in self.reaction_ECs:
                        continue
                    print("KEGG reaction", reaction_id)
                    collected += client.parse(client.get(reaction_id))['ENZYME']
                    # only touch the defaultdict when there is something to
                    # add, so no empty entry is created for this id
                    if collected:
                        self.reaction_ECs[reaction_id].update(collected)
            except Exception as inst:
                print(inst)

        print("EC data loaded from KEGG")
Exemplo n.º 23
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bioservices.kegg import KEGG
from sklearn.cluster import KMeans

# Print arrays in full, without summarization. A NaN threshold is rejected
# by current NumPy releases, so an infinite threshold is used instead.
np.set_printoptions(threshold=np.inf)

data = []

# Input table read at import time from the working directory
df = pd.read_csv('RPKMs.csv', delimiter=",")

# KEGG client shared by the functions of this module
k = KEGG()

#for i in range(100):
#    print(i,"****")
#    print("//\n",k.get_pathway_by_gene(str(df["symbol"][i]), "hsa"))


def search_pathways_4_list(list_of_genes):

    matrix = [[0 for j in range(len(list_of_genes))] for i in range(0)]
    list_of_pathways = []
    dict_of_genes = {}

    for i, gene in enumerate(list_of_genes):
        try:
            pathways = k.get_pathway_by_gene(gene, "hsa")

            if pathways != None:
                pathways = pathways.values()
Exemplo n.º 24
0
# XML namespace prefix of the elements in a WikiPathways search answer
_WIKIPATHWAYS_NS = "{http://www.wikipathways.org/webservice}"

# Element name in a WikiPathways result -> key used for the collected fields
_WIKIPATHWAYS_FIELDS = {
    _WIKIPATHWAYS_NS + "id": "identifier",
    _WIKIPATHWAYS_NS + "score": "score",
    _WIKIPATHWAYS_NS + "url": "url",
    _WIKIPATHWAYS_NS + "name": "name",
    _WIKIPATHWAYS_NS + "species": "species",
    _WIKIPATHWAYS_NS + "revision": "revision",
}

# Substring of a KEGG pathway id -> identifier type, tested in this order
_KEGG_IDENTIFIER_TYPES = (
    ("map", "KEGG MAP PATHWAY ID"),
    ("ko", "KEGG KO PATHWAY ID"),
    ("ec", "KEGG EC PATHWAY ID"),
    ("rn", "KEGG RN PATHWAY ID"),
)


def _search_wikipathways(query, species=None):
    """Run a WikiPathways text search and return one Pathway per distinct id."""
    url = "http://webservice.wikipathways.org/"
    ext = "/findPathwaysByText?query=" + str(query)
    if species is not None:
        ext += "&species=" + str(species)
    r = requests.get(url + ext,
                     headers={"Content-Type": "application/json"})

    # Raises for every response whose ``ok`` flag is False; the sys.exit()
    # that used to follow could never be reached.
    r.raise_for_status()

    pathways = []
    seen_identifiers = set()
    for child in ET.fromstring(r.text):
        fields = {}
        for subchild in child:
            key = _WIKIPATHWAYS_FIELDS.get(subchild.tag)
            if key is not None:
                fields[key] = subchild.text

        # The former duplicate check compared the id string with the Pathway
        # objects already collected and therefore never filtered anything.
        identifier = fields["identifier"]
        if identifier in seen_identifiers:
            continue
        seen_identifiers.add(identifier)

        pathways.append(
            gnomics.objects.pathway.Pathway(
                identifier=identifier,
                identifier_type="WikiPathways ID",
                name=fields["name"],
                taxon=fields["species"],
                source="WikiPathways"))
    return pathways


def _kegg_identifier_type(path_id):
    """Return the identifier type for a KEGG pathway id, or None if unknown."""
    for token, identifier_type in _KEGG_IDENTIFIER_TYPES:
        if token in path_id:
            return identifier_type
    return None


def search(query,
           source="wikipathways",
           result_format="xml",
           species=None,
           genes=None,
           user=None):
    """
    Search pathways by free text and return them as Pathway objects.

    Args:
        query: text to search for
        source: "wikipathways", "kegg" or "all" (case-insensitive)
        result_format: accepted but not used by this function
        species: optional species filter, WikiPathways search only
        genes: when not None the KEGG search is skipped (no gene-based
            search is implemented in this function)
        user: accepted but not used by this function

    Returns:
        list of gnomics Pathway objects; WikiPathways hits first, then KEGG
        hits. KEGG ids of an unrecognised kind are not returned; their entry
        is printed instead.

    Raises:
        requests.HTTPError: when the WikiPathways request fails.
    """
    source_key = source.lower()
    path_array = []

    if source_key in ("wikipathways", "all"):
        path_array.extend(_search_wikipathways(query, species))

    if source_key in ("kegg", "all") and genes is None:
        k = KEGG()
        list_of_pathways = k.find("pathway", query)

        for line in list_of_pathways.split("\n"):
            columns = line.split("\t")
            # lines without a tab carry no "<id>\t<name>" pair
            if len(columns) == 1:
                continue

            path_id = columns[0].strip().split(":")[1]
            path_name = columns[1].strip()

            identifier_type = _kegg_identifier_type(path_id)
            if identifier_type is None:
                print(k.get(path_id))
                continue

            path_array.append(
                gnomics.objects.pathway.Pathway(
                    identifier=path_id,
                    identifier_type=identifier_type,
                    source="KEGG",
                    name=path_name))

    return path_array
def _edge_label(attrs):
    """Return the text shown for an edge, given its attribute dict."""
    name = attrs['name']
    if name == 'phosphorylation' or name == 'dephosphorylation':
        return attrs['value']
    # 'dephosphorylation' contains 'phosphorylation', so it is tested first
    if 'dephosphorylation' in name:
        return name.replace('dephosphorylation', '-p').replace('\n', ', ')
    if 'phosphorylation' in name:
        return name.replace('phosphorylation', '+p').replace('\n', ', ')
    # activation / inhibition are shown through the edge color only, so the
    # word is removed from the label (str has no remove(); use replace())
    if 'activation' in name:
        return name.replace('activation', '').replace('\n', ', ')
    if 'inhibition' in name:
        return name.replace('inhibition', '').replace('\n', ', ')
    return name


def pathwayVisualization(KEGG_id, path_to_csv, redirect=True, compound=False):
    """
    The pathwayVisualization function returns a graph visualization based on user input

    Args:
        KEGG_id     (str): string specifying KEGG pathway ID to visualize
        path_to_csv (str): string specifying data to overlay on graph
        redirect    (bool): True to split nodes into their components. Defaults to True
        compound    (bool): True to display compounds (such as Ca2+). Defaults to False

    Returns:
        A graph visualization using the visjs_network function from visjs_2_jupyter
    """

    s = KEGG()
    result = s.parse_kgml_pathway(KEGG_id)

    ETroot = parsingXML(KEGG_id, s)

    G = nx.DiGraph()

    max_id, compound_array = addNodes(G, result)
    setCoord(G, ETroot)

    if redirect is False:
        getNodeSymbols(G, s, compound)
    else:
        parent_list, parent_dict = splitNodes(G, s, max_id)

    complex_array, component_array, node_dict, comp_dict = undefNodes(G, ETroot)

    if redirect is False:
        addEdges(G, result, component_array, node_dict)
    else:
        addAndRedirectEdges(G, result, complex_array, component_array,
                            parent_list, parent_dict, node_dict, comp_dict)

    # add reactions to graph
    addReaction(G, ETroot)

    # labels and (transparent) colors of the edges; computed before any node
    # is removed from the graph
    edge_to_name = dict()
    edge_to_color = dict()
    for edge in G.edges():
        print(edge)
        attrs = G.edge[edge[0]][edge[1]]
        edge_to_name[edge] = _edge_label(attrs)

        if 'activation' in attrs['name']:
            edge_to_color[edge] = 'rgba(26, 148, 49, 0.3)'  # green
        elif 'inhibition' in attrs['name']:
            edge_to_color[edge] = 'rgba(255, 0, 0, 0.3)'  # red
        else:
            edge_to_color[edge] = 'rgba(0, 0, 255, 0.3)'  # blue

    # for graph with split nodes
    if redirect is True:
        # remove undefined nodes from graph
        G.remove_nodes_from(complex_array)

        # remove nodes with more than one gene
        G.remove_nodes_from(parent_list)

    if compound is False:
        # remove compound nodes
        G.remove_nodes_from(compound_array)

    # per-node display symbol, gene names and coordinates
    node_to_symbol = dict()
    node_to_gene = dict()
    node_to_x = dict()
    node_to_y = dict()
    for node in G.node:
        attrs = G.node[node]
        if attrs['type'] == 'map':
            node_to_symbol[node] = attrs['gene_names']
        elif 'symbol' in attrs:
            node_to_symbol[node] = attrs['symbol']
        elif 'gene_names' in attrs:
            node_to_symbol[node] = attrs['gene_names']
        else:
            node_to_symbol[node] = attrs['name']

        node_to_gene[node] = attrs['gene_names']
        node_to_x[node] = attrs['x']
        node_to_y[node] = attrs['y']

    id_to_log2fold = log2FoldChange(G, path_to_csv)

    # Create color scale with negative as green and positive as red
    my_scale = spectra.scale(["green", "#CCC", "red"]).domain([-4, 0, 4])

    # color nodes based on log2fold data; nodes without data are light grey
    node_to_color = dict()
    for node in G.nodes():
        if node in id_to_log2fold:
            node_to_color[node] = my_scale(id_to_log2fold[node][0]).hexcode
        else:
            node_to_color[node] = '#f1f1f1'

    # getting nodes in graph
    nodes = G.nodes()
    numnodes = len(nodes)
    # map to indices for source/target in edges
    node_map = dict(zip(nodes, range(numnodes)))

    # getting edges in graph
    edges = G.edges()

    # lists of dicts that hold per node and per edge attributes
    nodes_dict = [{"id": node_to_gene[n], "degree": G.degree(n),
                   "color": node_to_color[n], "node_shape": "box",
                   "node_size": 10, 'border_width': 1,
                   "id_num": node_to_symbol[n],
                   "x": node_to_x[n], "y": node_to_y[n]} for n in nodes]

    edges_dict = [{"source": node_map[edge[0]], "target": node_map[edge[1]],
                   "color": edge_to_color[edge], "id": edge_to_name[edge],
                   "edge_label": '', "hidden": 'false', "physics": 'true'}
                  for edge in edges]

    # html file label for first graph (must manually increment later)
    time = 1700

    # create graph here
    return visJS_module.visjs_network(
        nodes_dict, edges_dict, time_stamp=time, node_label_field="id_num",
        edge_width=3, border_color="black", edge_arrow_to=True,
        edge_font_size=15, edge_font_align="top", physics_enabled=False,
        graph_width=1000, graph_height=1000)
Exemplo n.º 26
0
def main():
    """
    Build the gene -> reaction network and write it to 'keggMetabNetwork.json'.

    For every pathway listed in "hsa_list.txt" the previously saved KGML file
    is read. For each of its reactions, the genes of the entry sharing the
    reaction's id are collected, their reaction ids are read from
    'reacs/reac_<gene id>', and for every reaction id the direction (from
    "KEGG_Reac.json", "NA" when unknown) plus the substrates and products
    returned by ``get_metabs`` are stored under the gene.
    """
    # Start KEGG interface for querying
    k = KEGG()
    # Final network output: gene id -> {reaction id -> reaction info}
    data = dict()

    # Create list of hsa (human) pathways
    with open("hsa_list.txt") as listing:
        list_path = listing.read().replace('path:', '').split('\n')
    # Drop the last element (left over from the final split)
    list_path.pop()

    # Read in KEGG reaction ID & reversibility information
    with open("KEGG_Reac.json", "r") as fp:
        reac_data = json.load(fp)

    # Read in KEGG gene data (loaded as before; not consulted below)
    with open("ginfo.json", "r") as fp2:
        gene_data = json.load(fp2)

    for i, hsa in enumerate(list_path, 1):
        # Same text the Python 2 print statement produced
        print("# of pathways processed:  %d" % i)

        # Open previously extracted KGML file and construct element tree
        with open("etc_scripts/KEGG_DB_PATH/pathways/path_" + hsa) as kgml_file:
            root = ET.fromstring(kgml_file.read())

        # Index the entries by id once per pathway instead of rescanning
        # them for every reaction; a later entry with the same id wins.
        entries_by_id = {}
        for entry in root.findall("./entry"):
            entries_by_id[entry.attrib["id"]] = entry

        # Iterate through ALL reactions
        for reaction in root.findall("./reaction"):
            # The entry sharing the reaction's id carries the gene ids
            entry = entries_by_id.get(reaction.attrib["id"])
            gene_ids = [] if entry is None else entry.attrib["name"].split(' ')

            # gene id -> reaction ids read from the extracted reaction files
            reactions_by_gene = dict()
            for g_id in gene_ids:
                with open('reacs/reac_' + g_id, 'r') as rp:
                    reactions_by_gene[g_id] = [
                        line.split()[1].split('rn:')[1] for line in rp
                    ]

            # Loop to organize into the final output
            for g_id, gene_reactions in reactions_by_gene.items():
                # Stores reaction ids and their info
                vals = dict()
                for r_id in gene_reactions:
                    # Get the list of substrates and products
                    metabs = get_metabs(k, r_id)
                    vals[r_id] = {
                        # "NA" when the reaction is not in the reaction DB
                        "DIRECTION": reac_data.get(r_id, "NA"),
                        "R_SUBS": metabs[0],
                        "R_PROD": metabs[1]
                    }

                if g_id in data:
                    # Gene seen before: only add the reactions not yet known
                    known = data[g_id]
                    known_reactions = get_react(known)
                    for r in vals:
                        if r not in known_reactions:
                            known[r] = vals[r]
                else:
                    data[g_id] = vals

    with open('keggMetabNetwork.json', 'w') as f:
        json.dump(data, f)
Exemplo n.º 27
0
        blast_text = blastHits[ids]
    else:
        blast_text = 'NULL'

    if pfamHits.get(ids) != None:
        pfam_text = pfamHits[ids]
    else:
        pfam_text = 'NULL'

    if prositeHits.get(ids) != None:
        prosite_text = prositeHits[ids]
    else:
        prosite_text = 'NULL'

    # Get the KEGG hits
    kegg = KEGG()
    kegg_text = ''
    gene_id = gene_ids[ids]
    KEGG_IDs = kegg.get_pathway_by_gene(gene_id, "acb")
    if KEGG_IDs != None:
        for KEGG_ID in KEGG_IDs:
            kegg_text += KEGG_IDs[KEGG_ID] + ' [' + KEGG_ID + ']; '
        kegg_text = kegg_text[:-2]
    else:
        kegg_text = 'NULL'
    comments = 'NULL'

    row = ids + '\t' + blast_text + '\t' + pfam_text + '\t' + prosite_text + '\t' + kegg_text + '\t' + GO_IDs + '\t' + comments + '\n'
    output.write(row)

output.close()
Exemplo n.º 28
0
 def KeggAPI(self):
     """Fetch the KEGG entry for ``self.kegg_id`` and return it parsed."""
     # one client for both the download and the parsing
     client = KEGG()
     return client.parse(client.get(self.kegg_id))
Exemplo n.º 29
0
```
python3 structure_processor.py "" "" --filter_genes "TP53"
```
Saves the data for the gene named "TP53" to separate files (this parameter can be a comma-separated list of gene names).
Saves to pictures/ fragments if they are found.
Otherwise saves to different files.
After processing, the gene names from the parameter list are saved to "processed_genes.log".
To rerun with the same gene list, remove the lines corresponding to those names from this file, or remove the whole file -
it is currently used to skip gene names which were already processed.

Another call option might be incorrect now.
"""
from bioservices.kegg import KEGG

keggParser = KEGG()

import pickle
import argparse

ORGANISM = "hsa"
GENES = ["p53"]  # sample gene
PDB_PATH = "pdb"

import os, prody, pystache, logging

# Make sure the local PDB folder exists before handing it to ProDy.
# exist_ok=True avoids the check-then-create race of
# os.path.exists() followed by os.mkdir().
# TODO: for now I haven't checked if pathPDBFolder creates this folder -
# if it is created, this call should be removed.
os.makedirs(PDB_PATH, exist_ok=True)
prody.proteins.localpdb.pathPDBFolder(PDB_PATH)
def mapSpecies(mousepeptrackfilename):
    RETRY_TIME = 20.0
    mouseTohumanfilepath = os.path.join(os.getcwd(), 'MouseToHuman.tsv')
    print("Extracting Mouse to Human Map data, job starts",
          str(datetime.datetime.now()))
    #increase the field size of CSV
    csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
    try:
        urllib.urlretrieve(
            'http://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt',
            mouseTohumanfilepath)
        urllib.urlcleanup()
    except:
        print("Can't able to download MouseToHuman.tsv file!!")

    colnameMousHu = [
        'HomoloGene ID', 'Common Organism Name', 'NCBI Taxon ID', 'Symbol',
        'EntrezGene ID', 'Mouse MGI ID', 'HGNC ID', 'OMIM Gene ID',
        'Genetic Location', 'Genomic Coordinates (mouse: , human: )',
        'Nucleotide RefSeq IDs', 'Protein RefSeq IDs', 'SWISS_PROT IDs'
    ]

    mouseHumandata = []
    homologID = []
    with open(mouseTohumanfilepath) as mhtsvfile:
        mhreader = csv.DictReader(mhtsvfile, delimiter='\t')
        for mhrow in mhreader:
            mhtemplist = []
            for i in colnameMousHu:
                mhtempdata = str(mhrow[i]).strip()
                mhtemplist.append(mhtempdata)
            if len(mhtemplist[-1].strip()) > 0:
                homologID.append(mhtemplist[0])
                mouseHumandata.append(mhtemplist)
    homologID = list(set(homologID))
    homologID.sort()

    mousehumandic = {}
    for homologidItem in homologID:
        tempHumanHomoUniID = ''
        tempMouseHomoUniID = ''
        for item in mouseHumandata:
            if homologidItem == item[0]:
                if 'mouse' in item[1].strip().lower():
                    tempMouseHomoUniID = item[-1].strip()
                else:
                    tempHumanHomoUniID = item[-1].strip()
        if len(tempMouseHomoUniID.strip()) > 0 and len(
                tempHumanHomoUniID.strip()) > 0 and tempHumanHomoUniID.strip(
                ).upper() != 'NA':
            mousehumandic[tempMouseHomoUniID] = tempHumanHomoUniID

    colname=['UniProtKB Accession','Protein','Gene','Organism','Peptide Sequence','Summary Concentration Range Data','All Concentration Range Data','All Concentration Range Data-Sample LLOQ Based','Peptide ID',\
    'Special Residues','Molecular Weight','GRAVY Score','Transitions','Retention Time','Analytical inofrmation',\
    'Gradients','AAA Concentration','CZE Purity','Panel','Knockout','LLOQ','ULOQ','Sample LLOQ','Protocol','Trypsin','QC. Conc. Data','Human UniProtKB Accession']

    finalresult = []
    finalresult.append(colname)
    humanUniprotID = []
    with open(mousepeptrackfilename) as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            templist = []
            for i in colname[:-1]:
                tempdata = str(row[i]).strip()
                templist.append(tempdata)
            if len(str(templist[0]).strip()) > 0:
                if templist[0].split('-')[0] in mousehumandic:
                    humanUniprotID.append(
                        mousehumandic[templist[0].split('-')[0]])
                    templist.append(mousehumandic[templist[0].split('-')[0]])
                else:
                    templist.append('NA')

            finalresult.append(templist)

    with open(mousepeptrackfilename, 'wb') as pf:
        pwriter = csv.writer(pf, delimiter='\t')
        pwriter.writerows(finalresult)

    disGenDataDicName = disGenData()
    #disGenDataDicName='disGen.obj'
    disGenDataDic = cPickle.load(open(disGenDataDicName, 'rb'))
    unqhumanUniprotID = list(set(humanUniprotID))
    humanUniprotfuncinfodic = {}
    countProt = 0
    for subcode in unqhumanUniprotID:
        time.sleep(2)
        drugbanklist = []
        PN = 'NA'
        GN = 'NA'
        OG = 'NA'
        OGID = 'NA'
        dislist = []
        unidislist = []
        unidisURLlist = []
        disgendislist = []
        disgendisURLlist = []
        GoIDList = []
        GoNamList = []
        GoTermList = []
        GOinfo = []
        try:
            countProt += 1
            if countProt % 1000 == 0:
                print str(
                    countProt
                ), "th protein Protein Name, Gene, Organism Name,drug bank data,disease data job starts", str(
                    datetime.datetime.now())

            SGrequestURL = "https://www.uniprot.org/uniprot/" + str(
                subcode) + ".xml"
            SGunifile = urllib.urlopen(SGrequestURL)
            SGunidata = SGunifile.read()
            SGunifile.close()

            try:
                SGunidata = minidom.parseString(SGunidata)
                try:
                    drugdata = (SGunidata.getElementsByTagName('dbReference'))
                    for duItem in drugdata:
                        if (duItem.attributes['type'].value
                            ).upper() == 'DRUGBANK':
                            try:
                                drugname = (str(
                                    duItem.getElementsByTagName('property')
                                    [0].attributes['value'].value).strip())
                                drugid = str(
                                    duItem.attributes['id'].value).strip()
                                durl = '<a target="_blank" href="https://www.drugbank.ca/drugs/' + drugid + '">' + drugname + '</a>'
                                drugbanklist.append(durl)
                            except:
                                pass
                        if (duItem.attributes['type'].value
                            ).strip() == 'NCBI Taxonomy':
                            try:
                                OGID = str(
                                    duItem.attributes['id'].value).strip()
                            except:
                                pass
                except IndexError:
                    pass

                try:
                    godata = (SGunidata.getElementsByTagName('dbReference'))
                    for gItem in godata:
                        if (gItem.attributes['type'].value).upper() == 'GO':
                            try:
                                gonamedetails = (str(
                                    gItem.getElementsByTagName('property')
                                    [0].attributes['value'].value).strip()
                                                 ).split(':')[1]
                                gotermdetails = (str(
                                    gItem.getElementsByTagName('property')
                                    [0].attributes['value'].value).strip()
                                                 ).split(':')[0]
                                GoNamList.append(gonamedetails)
                                goid = str(
                                    gItem.attributes['id'].value).strip()
                                GoIDList.append(goid)
                                tempGoTerm = None

                                if gotermdetails.lower() == 'p':
                                    tempGoTerm = 'Biological Process'
                                if gotermdetails.lower() == 'f':
                                    tempGoTerm = 'Molecular Function'
                                if gotermdetails.lower() == 'c':
                                    tempGoTerm = 'Cellular Component'
                                GoTermList.append(tempGoTerm)
                                tempGOData = gonamedetails + ';' + goid + ';' + tempGoTerm
                                GOinfo.append(tempGOData)
                            except:
                                pass

                        if (gItem.attributes['type'].value
                            ).strip() == 'NCBI Taxonomy':
                            try:
                                OGID = str(
                                    gItem.attributes['id'].value).strip()
                            except:
                                pass
                except IndexError:
                    pass

                try:
                    try:
                        PN = (((SGunidata.getElementsByTagName('protein')[0]
                                ).getElementsByTagName('recommendedName')[0]
                               ).getElementsByTagName('fullName')[0]
                              ).firstChild.nodeValue

                    except:
                        PN = (((SGunidata.getElementsByTagName('protein')[0]
                                ).getElementsByTagName('submittedName')[0]
                               ).getElementsByTagName('fullName')[0]
                              ).firstChild.nodeValue

                except IndexError:
                    pass

                try:
                    try:
                        GN = ((
                            SGunidata.getElementsByTagName('gene')[0]
                        ).getElementsByTagName('name')[0]).firstChild.nodeValue
                    except:
                        GN = 'NA'
                except IndexError:
                    pass

                try:
                    try:
                        OG = ((
                            SGunidata.getElementsByTagName('organism')[0]
                        ).getElementsByTagName('name')[0]).firstChild.nodeValue
                    except:
                        OG = 'NA'
                except IndexError:
                    pass

                try:
                    disdata = SGunidata.getElementsByTagName('disease')
                    for dItem in disdata:
                        disname = ''
                        disshort = ''
                        disURL = ''
                        disID = ''
                        try:
                            disname = (dItem.getElementsByTagName('name')[0]
                                       ).firstChild.nodeValue
                            disID = (dItem.attributes['id'].value).upper()
                        except:
                            pass
                        try:
                            disshort = (dItem.getElementsByTagName('acronym')
                                        [0]).firstChild.nodeValue
                        except:
                            pass
                        if len(disname.strip()) > 0:
                            disURL = '<a target="_blank" href="https://www.uniprot.org/diseases/' + disID + '">' + str(
                                disname.strip()) + '(' + str(
                                    disshort) + ')' + '</a>'
                            dislist.append(
                                str(disname.strip()) + '(' + str(disshort) +
                                ')')
                            unidislist.append(
                                str(disname.strip()) + '(' + str(disshort) +
                                ')')
                            unidisURLlist.append(disURL)
                except IndexError:
                    pass

            except ExpatError:
                pass
        except IOError:
            pass
        drugbankdata = 'NA'
        disdata = 'NA'
        uniDisData = 'NA'
        uniDisURLData = 'NA'
        disgenDisData = 'NA'
        disgenDisURLData = 'NA'
        goiddata = 'NA'
        gonamedata = 'NA'
        gotermdata = 'NA'
        goData = 'NA'
        if GN != 'NA' and GN in disGenDataDic:
            disgendislist = disGenDataDic[GN][0]
            disgendisURLlist = disGenDataDic[GN][1]
            if len(dislist) > 0:
                dislist = dislist + disGenDataDic[GN][0]
            else:
                dislist = disGenDataDic[GN][0]

        if len(GoIDList) > 0:
            goiddata = '|'.join(list(set(GoIDList)))
        if len(GoNamList) > 0:
            gonamedata = '|'.join(list(set(GoNamList)))
        if len(GoTermList) > 0:
            gotermdata = '|'.join(list(set(GoTermList)))
        if len(GOinfo) > 0:
            goData = '|'.join(list(set(GOinfo)))
        if len(drugbanklist) > 0:
            drugbankdata = '|'.join(list(set(drugbanklist)))
        if len(dislist) > 0:
            disdata = '|'.join(list(set(dislist)))
        if len(unidislist) > 0:
            uniDisData = '|'.join(list(set(unidislist)))
        if len(unidisURLlist) > 0:
            uniDisURLData = '|'.join(list(set(unidisURLlist)))
        if len(disgendislist) > 0:
            disgenDisData = '|'.join(list(set(disgendislist)))
        if len(disgendisURLlist) > 0:
            disgenDisURLData = '|'.join(list(set(disgendisURLlist)))
        humanUniprotfuncinfodic[subcode] = [
            PN, GN, OG, OGID, disdata, uniDisData, uniDisURLData,
            disgenDisData, disgenDisURLData, drugbankdata, goiddata,
            gonamedata, gotermdata, goData
        ]
    hudicfile = 'humanUniprotfuncinfodic.obj'
    hudicf = open(hudicfile, 'wb')
    pickle.dump(humanUniprotfuncinfodic, hudicf, pickle.HIGHEST_PROTOCOL)
    hudicf.close()

    print("Extracting KEGG pathway name, job starts",
          str(datetime.datetime.now()))
    hkeggdictfile = {}
    huniproturl = 'https://www.uniprot.org/uploadlists/'
    hk = KEGG()
    for hkx in range(0, len(unqhumanUniprotID), 2000):
        countProt += hkx + 2000
        if countProt % 2000 == 0:
            print(str(countProt), "th protein kegg job starts",
                  str(datetime.datetime.now()))

        huniprotcodes = ' '.join(unqhumanUniprotID[hkx:hkx + 2000])
        huniprotparams = {
            'from': 'ACC',
            'to': 'KEGG_ID',
            'format': 'tab',
            'query': huniprotcodes
        }

        while True:
            try:
                hkuniprotdata = urllib.urlencode(huniprotparams)
                hkuniprotrequest = urllib2.Request(huniproturl, hkuniprotdata)
                hkuniprotresponse = urllib2.urlopen(hkuniprotrequest)
                for hkuniprotline in hkuniprotresponse:
                    hkudata = hkuniprotline.strip()
                    if not hkudata.startswith("From"):
                        hkuinfo = hkudata.split("\t")
                        if len(hkuinfo[1].strip()):
                            hkegg = hk.get(hkuinfo[1].strip())
                            hkudict_data = hk.parse(hkegg)
                            try:
                                try:
                                    if len(str(hkuinfo[0]).strip()) > 5:
                                        tempkeggData = '|'.join(
                                            '{};{}'.format(key, value)
                                            for key, value in
                                            hkudict_data['PATHWAY'].items())
                                        hkeggdictfile[hkuinfo[0].strip()] = [
                                            hkudict_data['PATHWAY'].values(),
                                            tempkeggData
                                        ]
                                except TypeError:
                                    pass
                            except KeyError:
                                pass
                break
            except urllib2.HTTPError:
                time.sleep(RETRY_TIME)
                print(
                    'Hey, I am trying again until succeeds to get data from KEGG!',
                    str(datetime.datetime.now()))
                pass

    hkdicfile = 'humankeggdic.obj'
    hkdicf = open(hkdicfile, 'wb')
    pickle.dump(hkeggdictfile, hkdicf, pickle.HIGHEST_PROTOCOL)
    hkdicf.close()