Exemplo n.º 1
0
"""
KEGG module example
====================

Histogram of KEGG pathways relations
"""
#################################################
#
# Explicit pyplot import instead of the ``from pylab import *`` wildcard, so
# every plotting name used below is traceable to its module.
import matplotlib.pyplot as plt

# extract all relations from all pathways
from bioservices.kegg import KEGG

s = KEGG()
s.organism = "hsa"

# The organism has more than 260 pathways (per the original note) and each
# one is parsed separately, which takes time; only the first
# ``max_pathways`` are retrieved to keep the example quick.
max_pathways = 10
results = [s.parse_kgml_pathway(x) for x in s.pathwayIds[:max_pathways]]
relations = [x['relations'] for x in results]

# plot: one histogram sample per pathway = its number of relations, 20 bins
plt.hist([len(this) for this in relations], 20)
plt.xlabel('number of relations')
plt.ylabel('#')
plt.title("number of relations per pathways")
plt.grid(True)
Exemplo n.º 2
0
class KEGGPathways:
    """
    KEGG PATHWAY Database API

    Wraps a ``KEGG`` service object (``self.database``) and remembers the KEGG
    code of one organism (``self.organism``) that gene and pathway queries
    are run against.
    """
    def __init__(self, organism="Homo sapiens"):
        """

        Args:
            organism: organism name (ex. 'Homo sapiens', 'human'), case-insensitive

        Raises:
            KeyError: if the organism name is not present in the KEGG organism listing

        """
        self.database = KEGG()
        self.organism = self.get_organism_code(organism.lower())

    def search_by_gene(self, gene_name: str):
        """

        Args:
            gene_name: gene name (ex. 'BRCA2')

        Returns:
            Dictionary with ids of all pathways containing given gene as keys and their full names as values.
            An empty dictionary is returned when nothing is found.

        """
        try:
            pathways = self.database.get_pathway_by_gene(
                gene_name, self.organism)
        except AttributeError:
            # best effort: the lookup raising AttributeError is treated as "no pathways"
            return {}
        return pathways if pathways else {}

    def get_pathway(self, pathway_id: str, self_loops: bool = False):
        """

        Args:
            pathway_id: KEGG pathway id (ex. 'hsa04110')
            self_loops: information about whether or not include self loops in returned graph

        Returns:
            `networkx.DiGraph` object: Directed graph depicting pathway, with a comma-separated string
            containing gene names as graph nodes and directed edges representing interactions between genes.
            Each edge has weight 'type', which is a list of interaction types between two nodes.
            The graph is empty when the pathway cannot be parsed.

        """

        G = nx.DiGraph()
        try:
            pathway = self.database.parse_kgml_pathway(pathway_id)
        except TypeError:
            # incorrect pathway_id
            pathway = None

        if not pathway:
            return G

        # only intra-pathway interactions taken into account:
        # entries without gene names are left out of the lookup table
        names = {
            entry['id']: {'name': entry['gene_names'], 'type': entry['type']}
            for entry in pathway['entries']
            if entry['gene_names']
        }

        for rel in pathway['relations']:
            source = names.get(rel['entry1'])
            target = names.get(rel['entry2'])
            if source is None or target is None:
                continue
            e1, e2 = source['name'], target['name']
            G.add_node(e1, type=source['type'])
            G.add_node(e2, type=target['type'])
            if G.has_edge(e1, e2):
                # several relations between the same pair accumulate their types
                G[e1][e2]['type'].append(rel['name'])
            elif e1 != e2 or self_loops:
                # assumption of interaction direction entry1 -> entry2 #TODO: validate
                G.add_edge(e1, e2, type=[rel['name']])

        # only interactions between genes: every non-gene node is bypassed by
        # connecting each of its predecessors to each of its successors, then removed
        not_gene_nodes = []
        for node in list(G.nodes()):
            if G.nodes[node]['type'] == 'gene':
                continue
            # snapshots: edges are added inside the loops, and iterating the
            # live edge views of a node with a self-loop would raise RuntimeError
            predecessors = [edge[0] for edge in G.in_edges(node)]
            successors = [edge[1] for edge in G.out_edges(node)]
            for pred in predecessors:
                for succ in successors:
                    if pred != succ or self_loops:
                        # NOTE(review): add_edge replaces the 'type' list of an
                        # already existing pred -> succ edge with ['indirect'];
                        # confirm whether direct types should be kept instead
                        G.add_edge(pred, succ, type=['indirect'])
            not_gene_nodes.append(node)
        G.remove_nodes_from(not_gene_nodes)

        return G

    def fetch_organism_codes(self):
        """

        Returns:
            Dictionary with lowercase organism names as keys, and KEGG organism codes as values
            {   'homo sapiens' : 'hsa',
                'human' : 'hsa',
                ...
            }

        """
        codes = {}
        for line in self.database.list('organism').split('\n'):
            # expected layout: <entry id> TAB <code> TAB <name> [TAB ...]
            fields = line.split('\t')
            if len(fields) < 3:
                # blank or malformed line
                continue
            code = fields[1]
            org = fields[2].strip()
            if '(' in org:
                # 'Scientific name (common name)': register both spellings
                if org.endswith(')'):
                    org = org[:-1]
                names = org.split('(')
            else:
                names = [org]
            for name in names:
                # keys are always lowercase so that lookups are case-insensitive
                key = name.strip().lower()
                if key:
                    codes[key] = code
        return codes

    def get_organism_code(self, org: str):
        """

        Args:
            org: organism name (ex. 'Homo sapiens', 'human') - lowercase and uppercase optional

        Returns:
            str: KEGG organism code

        Raises:
            KeyError: if the organism name is unknown (a message is printed first)

        """
        codes = self.fetch_organism_codes()
        try:
            return codes[org.strip().lower()]
        except KeyError:
            print('Invalid organism name.')
            raise

    def get_gene_code(self, gen: str):
        """

        Args:
            gen: gene name (ex. 'FGR', 'NIPAL1')

        Returns:
            KEGG gene code, or an empty string (after printing a message) when
            the search result is a lone newline, which is treated as "not found"

        """
        code_gen = self.database.find(self.organism, gen)

        if code_gen == '\n':
            print('Invalid gene name: ' + str(gen))
            return ''
        return code_gen
Exemplo n.º 3
0
def pathwayVisualization(KEGG_id, path_to_csv, redirect=True, compound=False):
    """
    The pathwayVisualization function returns a graph visualization based on user input

    The pathway is fetched from KEGG, turned into a directed graph by the
    helper functions of this module, and rendered with nodes colored by
    log2 fold change and edges colored by interaction kind.

    Args:
        KEGG_id     (str): string specifying KEGG pathway ID to visualize
        path_to_csv (str): string specifying data to overlay on graph
        redirect    (bool): True to split nodes into their components. Defaults to True
        compound    (bool): True to display compounds (such as Ca2+). Defaults to False

    Returns:
        A graph visualization using the visjs_network function from visjs_2_jupyter,
        or None (after printing a message) when KEGG_id is not a valid KEGG ID
    """

    s = KEGG()

    res = s.get(KEGG_id, "kgml")

    # an unknown ID apparently comes back as the HTTP status code
    if res == 404 or res == 400:
        # parenthesised form prints the same text on Python 2 and Python 3
        print(KEGG_id + ' is not a valid KEGG ID')
        return

    result = s.parse_kgml_pathway(KEGG_id)

    ETroot = parsingXML(KEGG_id, s)

    G = nx.DiGraph()

    max_id, compound_array = addNodes(G, result)
    setCoord(G, ETroot)

    # identity checks are kept on purpose: only the literal booleans select a mode
    if redirect is False:
        getNodeSymbols(G, s, compound)
    else:
        parent_list, parent_dict = splitNodes(G, s, max_id)

    complex_array, component_array, node_dict, comp_dict = undefNodes(G, ETroot)

    if redirect is False:
        addEdges(G, result, component_array, node_dict)
    else:
        addAndRedirectEdges(G, result, complex_array, component_array, parent_list, parent_dict, node_dict, comp_dict)

    #add reactions to graph
    addReaction(G, ETroot)

    # per-edge label and color, keyed by the (source, target) tuple;
    # edges(data=True) works on both networkx 1.x and 2.x
    edge_to_name = dict()
    edge_to_color = dict()
    for source, target, attrs in G.edges(data=True):
        edge = (source, target)
        name = attrs['name']

        if name == 'phosphorylation' or name == 'dephosphorylation':
            # a bare (de)phosphorylation is labelled with the edge's 'value'
            label = attrs['value']
        elif 'dephosphorylation' in name:
            label = name.replace('dephosphorylation', '-p')
        elif 'phosphorylation' in name:
            label = name.replace('phosphorylation', '+p')
        else:
            label = name

        # activation/inhibition is shown through the color, not the label
        for redundant in ('activation, ', 'inhibition, ', 'activation', 'inhibition'):
            label = label.replace(redundant, '')
        edge_to_name[edge] = label

        #edges are transparent
        if 'activation' in name:
            edge_to_color[edge] = 'rgba(26, 148, 49, 0.3)' #green
        elif 'inhibition' in name:
            edge_to_color[edge] = 'rgba(255, 0, 0, 0.3)' #red
        else:
            edge_to_color[edge] = 'rgba(0, 0, 255, 0.3)' #blue

    #for graph with split nodes
    if redirect is True:
        #remove undefined nodes from graph
        G.remove_nodes_from(complex_array)

        #remove nodes with more than one gene
        G.remove_nodes_from(parent_list)

    if compound is False:
        #remove compound nodes
        G.remove_nodes_from(compound_array)

    # per-node display symbol, gene names and coordinates
    node_to_symbol = dict()
    node_to_gene = dict()
    node_to_x = dict()
    node_to_y = dict()
    for node, attrs in G.nodes(data=True):
        if attrs['type'] == 'map':
            node_to_symbol[node] = attrs['gene_names']
        elif 'symbol' in attrs:
            node_to_symbol[node] = attrs['symbol']
        elif 'gene_names' in attrs:
            node_to_symbol[node] = attrs['gene_names']
        else:
            node_to_symbol[node] = attrs['name']

        node_to_gene[node] = attrs['gene_names']
        node_to_x[node] = attrs['x']
        node_to_y[node] = attrs['y']

    id_to_log2fold = log2FoldChange(G, path_to_csv)

    # Create color scale with negative as green and positive as red
    my_scale = spectra.scale([ "green", "#CCC", "red" ]).domain([ -4, 0, 4 ])

    # color nodes based on log2fold data; nodes without data get a neutral grey
    node_to_color = dict()
    for node in G.nodes():
        if node in id_to_log2fold:
            node_to_color[node] = my_scale(id_to_log2fold[node][0]).hexcode
        else:
            node_to_color[node] = '#f1f1f1'

    # getting nodes in graph (materialised so the order is fixed)
    nodes = list(G.nodes())
    node_map = dict((node, index) for index, node in enumerate(nodes))  # map to indices for source/target in edges

    # getting edges in graph
    edges = list(G.edges())

    # dictionaries that hold per node and per edge attributes
    nodes_dict = [{"id":node_to_gene[n],"degree":G.degree(n),"color":node_to_color[n], "node_shape":"box",
                 "node_size":10,'border_width':1, "id_num":node_to_symbol[n], "x":node_to_x[n], "y":node_to_y[n]} for n in nodes]

    edges_dict = [{"source":node_map[edge[0]], "target":node_map[edge[1]],
                  "color":edge_to_color[edge], "id":edge_to_name[edge], "edge_label":'',
                 "hidden":'false', "physics":'true'} for edge in edges]

    # html file label for first graph (must manually increment later)
    time_stamp = 1700

    # create graph here
    return visJS_module.visjs_network(nodes_dict, edges_dict, time_stamp = time_stamp, node_label_field = "id_num",
                               edge_width = 3, border_color = "black", edge_arrow_to = True, edge_font_size = 15,
                               edge_font_align= "top", physics_enabled = False, graph_width = 1000, graph_height = 1000)
Exemplo n.º 4
0
"""
KEGG module example
====================

Histogram of KEGG pathways relations
"""
#################################################
#
# Explicit pyplot import instead of the ``from pylab import *`` wildcard, so
# every plotting name used below is traceable to its module.
import matplotlib.pyplot as plt

# extract all relations from all pathways
from bioservices.kegg import KEGG

s = KEGG()
s.organism = "hsa"

# The organism has more than 260 pathways (per the original note) and each
# one is parsed separately, which takes time; only the first
# ``max_pathways`` are retrieved to keep the example quick.
max_pathways = 10
results = [s.parse_kgml_pathway(x) for x in s.pathwayIds[:max_pathways]]
relations = [x['relations'] for x in results]

# plot: one histogram sample per pathway = its number of relations, 20 bins
plt.hist([len(this) for this in relations], 20)
plt.xlabel('number of relations')
plt.ylabel('#')
plt.title("number of relations per pathways")
plt.grid(True)