Exemplo n.º 1
0
def teste4():
    """Return the ``PATHWAY`` field of the first human KEGG module.

    The first identifier in ``KEGG().moduleIds`` (organism ``"hsa"``) is
    fetched and parsed. The ``COMPOUND`` and ``NAME`` fields are looked up
    as well, but only the ``PATHWAY`` field is returned.

    Returns
    -------
    The parsed ``PATHWAY`` entry; per the original author's example it looks
    like ``{'map00010': 'Glycolysis / Gluconeogenesis', ...}``.
    """
    kegg_service = KEGG()
    kegg_service.organism = "hsa"  # Homo sapiens (human)

    module_ids = kegg_service.moduleIds  # pathway modules
    raw_entry = kegg_service.get(module_ids[0])
    parsed = kegg_service.parse(raw_entry)

    # Example shapes, as noted by the original author:
    #   COMPOUND -> {'C00074': 'Phosphoenolpyruvate', ...}
    #   NAME     -> ['Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate']
    # The two extra lookups are kept so a missing field still raises KeyError.
    compounds = parsed["COMPOUND"]
    pathway = parsed["PATHWAY"]
    module_name = parsed["NAME"]
    return pathway
Exemplo n.º 2
0
    def __init__(self, gene_lists, taxon, dataframe,
                 kegg_organism=None,
                 enrichment_params=None,
                 go_only=False,
                 kegg_only=False,
                 command=""
                 ):
        """.. rubric:: constructor

        :param gene_lists: stored unchanged on ``self.gene_lists``.
        :param taxon: taxon identifier. ``10090`` selects the KEGG organism
            ``"mmu"`` and ``9606`` selects ``"hsa"``; any other value
            requires *kegg_organism*.
        :param dataframe: stored unchanged on ``self.data``.
        :param kegg_organism: KEGG organism code, only consulted when
            *taxon* is neither 10090 nor 9606.
        :param enrichment_params: dictionary of enrichment settings. When
            ``None`` (default) a fresh dictionary holding the default values
            listed in the body is created for this instance. A dictionary
            supplied by the caller is stored as-is and must contain the
            ``'preload_directory'`` key.
        :param go_only: forwarded to :meth:`create_report_content`.
        :param kegg_only: forwarded to :meth:`create_report_content`.
        :param command: stored unchanged on ``self.command``.

        :raises NotImplementedError: if *taxon* is neither 10090 nor 9606
            and *kegg_organism* is ``None``.

        Calls ``sys.exit(1)`` when ``enrichment_params['preload_directory']``
        is set but the path does not exist.
        """
        super().__init__()
        self.title = "Enrichment"

        # Build the defaults per call: a dict literal in the signature would
        # be shared (and mutable) across every instance using the default.
        if enrichment_params is None:
            enrichment_params = {
                "padj": 0.05,
                "log2_fc": 3,
                "max_entries": 3000,
                "kegg_background": None,
                "mapper": None,
                "preload_directory": None,
                'plot_compute_levels': False,
                'plot_logx': True,
            }

        self.command = command
        self.gene_lists = gene_lists
        self.enrichment_params = enrichment_params
        self.data = dataframe
        self.taxon = taxon

        if taxon == 10090:
            self.organism = "mmu"
        elif taxon == 9606:
            self.organism = "hsa"
        elif kegg_organism is None:
            # Inferring the organism from an arbitrary taxon is not implemented.
            logger.error("You must specify the kegg organism name if not human or mouse: eg., eco for ecoli")
            raise NotImplementedError
        else:
            from bioservices import KEGG
            k = KEGG()
            # Per the original author, this assignment validates the name.
            k.organism = kegg_organism
            self.organism = kegg_organism

        pathname = self.enrichment_params['preload_directory']
        if pathname and not os.path.exists(pathname):
            logger.error(f"{pathname} does not exist")
            sys.exit(1)

        self.rnadiff = {}

        self.create_report_content(go_only=go_only, kegg_only=kegg_only)
        self.create_html("enrichment.html")
Exemplo n.º 3
0
def teste5():
    """Print the name of KEGG module ``M00627`` if it is a pentose phosphate cycle.

    The module entry is fetched and parsed. The first element of its
    ``NAME`` field is printed when it contains ``"Pentose phosphate cycle"``;
    otherwise the string ``"haha"`` is printed.
    """
    service = KEGG()
    service.organism = "hsa"  # Homo sapiens (human)
    modules = service.moduleIds  # pathway modules (queried, not used below)

    entry = service.parse(service.get("M00627"))
    module_name = entry["NAME"][0]
    reactions = entry["REACTION"]  # unused; still raises KeyError if absent

    message = module_name if "Pentose phosphate cycle" in module_name else "haha"
    print(message)
Exemplo n.º 4
0
def teste2():
    """Map reaction ids of the fourth human KEGG module to tokenised text.

    The identifier at index 3 of ``KEGG().moduleIds`` is printed, then its
    entry is fetched and parsed.

    Returns
    -------
    dict
        Reaction ids from the ``REACTION`` field as keys; each value is the
        reaction description split on single spaces (per the original
        author, a list of compounds).
    """
    service = KEGG()
    service.organism = "hsa"

    module_ids = service.moduleIds
    print(module_ids[3])

    parsed = service.parse(service.get(module_ids[3]))
    reactions = parsed["REACTION"]
    return {
        reaction_id: reactions[reaction_id].split(" ")
        for reaction_id in reactions
    }
Exemplo n.º 5
0
def teste6():
    """Collect tokenised reactions for a fixed set of KEGG modules.

    The modules ``M00001``, ``M00002``, ``M00013`` and ``M00034`` are fetched
    in that order. A reaction id that appears in more than one module keeps
    the value from the module processed last.

    Returns
    -------
    dict
        Reaction ids as keys; each value is the reaction description split
        on single spaces.
    """
    service = KEGG()
    service.organism = "hsa"

    collected = {}
    for module_id in ("M00001", "M00002", "M00013", "M00034"):
        reactions = service.parse(service.get(module_id))["REACTION"]
        collected.update(
            (reaction_id, reactions[reaction_id].split(" "))
            for reaction_id in reactions
        )
    return collected
Exemplo n.º 6
0
def download_pathway_ids(organism, cache=False):
    """
    Fetch the pathway identifiers KEGG currently lists for an organism.

    Parameters
    ----------
    organism : str
        KEGG organism code, e.g. ``'hsa'``.

    cache : bool, optional, default: False
        Passed to the ``KEGG`` constructor. When True, `bioservices` caches
        the results; this saves time, but an old cache will not reflect
        newer database releases.

    Returns
    -------
    `list`
        Pathway identifiers (str) as exposed by ``KEGG.pathwayIds``.
    """
    service = KEGG(cache=cache)
    service.organism = organism
    return service.pathwayIds
Exemplo n.º 7
0
def kegg():
    """Return a ``KEGG`` client whose organism is set to human (``"hsa"``)."""
    client = KEGG()
    # The organism list is read before an organism is selected and the value
    # is discarded; presumably this pre-loads the list -- confirm against
    # bioservices.
    client.organismIds
    client.organism = "hsa"
    return client
Exemplo n.º 8
0
def find_pathways_organism(cvDict,
                           preDefList=None,
                           writeGraphml=True,
                           organism="hsa"):
    """Build, filter and save KEGG pathway graphs for one organism.

    For each pathway id a directed graph is assembled from the generic
    ``ko`` pathway and from the organism-specific pathway. A graph that
    shares more than five nodes with the keys of ``cvDict`` is simplified
    with ``simplifyNetworkpathwayAnalysis`` and written to
    ``<organism><digits>.graphml`` and ``<organism><digits>.gpickle``.
    If the simplified graph still has nodes, the per-sample values of those
    nodes are pickled to ``<organism><digits>_sss.pickle``.

    Parameters
    ----------
    cvDict : dict
        Its keys are the genes present in the dataset; it is also handed to
        ``simplifyNetworkpathwayAnalysis``.
    preDefList : sequence of str, optional
        Pathway ids to process. When ``None`` or empty, every id in
        ``KEGG().pathwayIds`` for ``organism`` is processed.
    writeGraphml : bool, optional
        Currently not consulted: the graph files are always written.
    organism : str, optional
        KEGG organism code (default ``"hsa"``).

    Notes
    -----
    * Uses ``urllib2`` for the gene-name download; a failure there is
      reported with a message and processing continues with whatever was
      collected so far.
    * NOTE(review): ``geneDict`` is read below but is not defined in this
      function; it must exist at module level -- confirm.
    """
    # Lookup tables filled by the parsers below.
    aliasDict, koDict, orgDict = {}, {}, {}
    nc.parseKEGGdict('inputData/ko00001.keg', aliasDict,
                     koDict)  # parse the dictionary of ko codes

    # Best effort: fetch the organism's gene-code -> gene-name table.
    # ``Exception`` (not a bare except) so Ctrl-C and SystemExit still
    # propagate.
    try:
        url = urllib2.urlopen('http://rest.kegg.jp/list/' + organism)
        try:
            text = url.readlines()
        finally:
            url.close()  # do not leak the HTTP response
        for line in text:
            line_split = line.split('\t')
            geneCode = line_split[0].split(':')[1]
            name = line_split[1].split(';')[0]
            if ',' in name:
                # First name is the primary one; the rest are aliases.
                nameline = name.split(',')
                name = nameline[0]
                for alias in nameline[1:]:
                    aliasDict[alias.strip()] = name.upper()
            orgDict[geneCode] = name
    except Exception:
        print('Could not get library: ' + organism)

    k = KEGG()  # read KEGG from bioservices
    k.organism = organism
    minOverlap = 5
    if preDefList is None or len(preDefList) == 0:
        pathwayList = list(k.pathwayIds)
    else:
        pathwayList = list(preDefList)

    genes = set(cvDict.keys())  # genes included in the dataset
    for x in pathwayList:
        x = x.replace("path:", "")
        # Keep only the digits of the pathway code (drops organism letters).
        code = "".join(ch for ch in str(x) if ch in string.digits)
        graph = nx.DiGraph()
        nc.uploadKEGGcodes(['ko' + code], graph, koDict)  # get ko pathway
        coder = str(organism + code)  # same code with organism prefix
        uploadKEGGcodes_org([coder], graph, orgDict, koDict,
                            organism)  # get org pathway

        allNodes = set(graph.nodes())
        test = len(allNodes.intersection(genes))
        print("Pathway: ", x, " Overlap: ", test, " Edges: ",
              len(graph.edges()))

        # True as soon as the graph has at least one connected component.
        if len(
                list(nx.connected_component_subgraphs(graph.to_undirected()))
        ) > 0:
            # Require strictly more than ``minOverlap`` shared genes.
            if len(genes.intersection(graph.nodes())) > minOverlap:
                graph = simplifyNetworkpathwayAnalysis(
                    graph, cvDict)  # simplify graph to nodes in dataset
                nx.write_graphml(graph, coder + '.graphml')
                nx.write_gpickle(graph, coder + '.gpickle')
                print('nodes: ', str(len(graph.nodes())), ',   edges:',
                      str(len(graph.edges())))
                print(graph.nodes())
                if len(graph.nodes()) > 0:
                    # One dict per sample, holding the values of only the
                    # nodes kept in this pathway.
                    firstNode = list(graph.nodes())[0]
                    pathwaySampleList = [
                        {} for _ in range(len(geneDict[firstNode]))
                    ]
                    for noder in graph.nodes():
                        for jn in range(len(pathwaySampleList)):
                            pathwaySampleList[jn][noder] = geneDict[noder][jn]
                    # ``with`` guarantees the pickle file is flushed/closed.
                    with open(coder + "_sss.pickle", "wb") as handle:
                        pickle.dump(pathwaySampleList, handle)
0
def pathway_to_dataframe(pathway_id, org='hsa', verbose=False, cache=False):
    """
    Extract the relations of a KEGG pathway into an interaction frame.

    The pathway's KGML is parsed with ``KEGG.parse_kgml_pathway``. For every
    relation whose link type is in ``links_to_include`` and whose two
    entries have a type in ``types_to_include``, each pair of
    space-separated names (entry1 x entry2) is emitted, provided both names
    contain the organism code or ``'ec'`` and are keys of the mapping
    returned by :func:`kegg_to_uniprot`.

    Parameters
    ----------
    pathway_id : str
        Pathway identifier to parse. Example: 'path:hsa00010'

    org : str or None, optional, default: 'hsa'
        KEGG organism code; it is assigned to the KEGG client and passed to
        :func:`kegg_to_uniprot`.
        NOTE(review): the original documentation says ``None`` disables the
        UniProt-mappability filter, but this body applies the filter
        unconditionally -- confirm the intended behaviour.

    verbose : bool, optional, default: False
        If True, an info message naming the pathway is logged.

    cache : bool, optional, default: False
        Passed to the ``KEGG`` constructor and to :func:`kegg_to_uniprot`.
        If True, HTTP responses are cached by `bioservices`; this saves
        time, but an old cache will not reflect newer database releases.

    Returns
    -------
    `pd.DataFrame`
        The result of ``make_interaction_frame(sources, targets, labels)``.
        The original documentation lists the columns 'source', 'target',
        'label', 'pubmed' and 'experiment_type' and notes that interactions
        are to be treated as directionless; neither can be verified from
        this function.

    Raises
    ------
    ValueError
        If a relation refers to an entry id that is not among the parsed
        entries.
    """
    kegg = KEGG(cache=cache)
    kegg.organism = org
    kegg_to_up = kegg_to_uniprot(org, cache)
    res = kegg.parse_kgml_pathway(pathway_id)
    sources = []
    targets = []
    labels = []

    if verbose:
        logger.info("Parsing pathway {}".format(pathway_id))

    # Index the entries once instead of rebuilding and scanning the id list
    # four times per relation. Iterating in reverse makes the FIRST entry
    # with a given id win, matching ``list.index`` semantics.
    entry_by_id = {}
    for entry in reversed(res['entries']):
        entry_by_id[entry['id']] = entry

    organism = kegg.organism

    def _is_mappable(name):
        # A name qualifies when it carries the organism code or 'ec' and
        # can be mapped through kegg_to_uniprot.
        return (organism in name or 'ec' in name) and name in kegg_to_up

    for rel in res['relations']:
        id1 = rel['entry1']
        id2 = rel['entry2']
        for entry_id in (id1, id2):
            if entry_id not in entry_by_id:
                # Same exception type the former list.index lookup raised.
                raise ValueError(
                    "{!r} is not an entry id of pathway {}".format(
                        entry_id, pathway_id))
        entry1 = entry_by_id[id1]
        entry2 = entry_by_id[id2]
        name1 = entry1['name']
        name2 = entry2['name']
        type1 = entry1['type']
        type2 = entry2['type']
        reaction_type = rel['name'].replace(' ', '-')
        link_type = rel['link']

        if link_type not in links_to_include:
            continue

        if type1 not in types_to_include or type2 not in types_to_include:
            continue

        # Filter each side once rather than re-checking inside the product.
        valid_sources = [a for a in name1.strip().split(' ')
                         if _is_mappable(a)]
        valid_targets = [b for b in name2.strip().split(' ')
                         if _is_mappable(b)]
        for a in valid_sources:
            for b in valid_targets:
                sources.append(a)
                targets.append(b)
                labels.append(reaction_type)

    interactions = make_interaction_frame(sources, targets, labels)
    return interactions
Exemplo n.º 10
0
# Top-level script section: loads a protein network, sets up the UniProt,
# QuickGO and KEGG bioservices clients, and builds a per-pathway set of
# UniProt identifiers for mouse ('mmu') KEGG pathways.
# NOTE: uses Python 2 `print` statements.
sns.set(style='ticks', palette='pastel', color_codes=True)

# ---- Import network
# `wd` is not defined in this section; it must be set earlier in the file.
network = read_csv('%s/files/string_mouse_network_filtered_800.txt' % wd, sep='\t')
# Only identifiers that appear in BOTH the 'protein1' and 'protein2' columns.
network_proteins = set(network['protein1']).intersection(network['protein2'])

# ---- Set-up UniProt
uniprot = UniProt(cache=True)

# ---- Set-up QuickGO bioservice
quickgo = QuickGO(cache=True)

# ---- Set-up KEGG bioservice
# NOTE(review): `kegg_parser` is not used in the lines visible here.
kegg, kegg_parser = KEGG(cache=True), KEGGParser()

kegg.organism = 'mmu'
print '[INFO] KEGG service configured'

# Parse the KGML of every pathway id listed for the organism:
# {pathway id: parsed pathway}.
kegg_pathways = {p: kegg.parse_kgml_pathway(p) for p in kegg.pathwayIds}
print '[INFO] KEGG pathways extracted: ', len(kegg_pathways)

# Convert KEGG pathways Gene Name to UniProt
k2u = kegg.conv('uniprot', 'mmu')

# Per pathway: for each entry of type 'gene', split its name on spaces, keep
# the names present in k2u and store the part after ':' of the mapped value.
kegg_pathways_proteins = {p: {k2u[x].split(':')[1] for i in kegg_pathways[p]['entries'] if i['type'] == 'gene' for x in i['name'].split(' ') if x in k2u} for p in kegg_pathways}

# First the set of all identifiers across pathways, then rebound to a dict
# {identifier: third '|'-separated field of the first space-delimited token
# of its FASTA text}. One `get_fasta` call is made per identifier.
# NOTE(review): presumably that field is the UniProt entry name -- confirm.
kegg_uniprot_acc_map = {x for p in kegg_pathways_proteins for x in kegg_pathways_proteins[p]}
kegg_uniprot_acc_map = {p: uniprot.get_fasta(str(p)).split(' ')[0].split('|')[2] for p in kegg_uniprot_acc_map}

# Replace each pathway's identifiers with their mapped values.
kegg_pathways_proteins = {p: {kegg_uniprot_acc_map[i] for i in kegg_pathways_proteins[p]} for p in kegg_pathways_proteins}
# The reported number is the count of pathways (length of the dict).
print '[INFO] KEGG pathways Ids converted to UniProt: ', len(kegg_pathways_proteins)
Exemplo n.º 11
0
def kegg():
    """Return a ``KEGG`` client whose organism is set to human (``"hsa"``)."""
    client = KEGG()
    # The organism list is read before an organism is selected and the value
    # is discarded; presumably this pre-loads the list -- confirm against
    # bioservices.
    client.organismIds
    client.organism = "hsa"
    return client