def getLevelMapper(level, arguments):
    """
    Return a one-argument callable that translates a gene ID to `level`.

    Parameters:
        level: the requested level. If it is in `koSyns`, the returned
            callable is the identity. If it is in `level3Syns`, it is looked
            up as level '3'.
        arguments: object providing `heirarchyType` ('cazy', 'kegg', 'seed'
            or 'cog') and `heirarchyFile` (path handed to the kegg parsers).

    Returns:
        A callable taking a gene. For 'cazy' this is `getCazyGroup`; for
        the file-backed hierarchies it looks the gene up in the parsed
        mapping and falls back to the gene itself when there is no entry.

    Raises:
        ValueError: if a translation is required and `heirarchyType` is not
            one of the supported values.
    """
    if level in koSyns:
        # Gene-level output requested: nothing to translate
        return lambda h: h
    if arguments.heirarchyType == 'cazy':
        return getCazyGroup

    lookupLevel = level if level not in level3Syns else '3'
    hierarchyType = arguments.heirarchyType
    if hierarchyType == 'kegg':
        # Ideally, we'd be able to parse the hierarchy once, but the current
        # KEGG code just returns simple mappings
        logging.info("Reading KEGG level %s assignments from %s",
                     level, arguments.heirarchyFile)
        geneTranslation = kegg.parse_KEGG_file(arguments.heirarchyFile,
                                               lookupLevel)
    elif hierarchyType == 'seed':
        logging.info("Reading SEED subsystem assignments from %s",
                     arguments.heirarchyFile)
        seedTree = kegg.readSEEDTree(arguments.heirarchyFile)
        geneTranslation = seedTree[lookupLevel]
    elif hierarchyType == 'cog':
        logging.info("Reading COG subsystem assignments from %s",
                     arguments.heirarchyFile)
        seedTree = kegg.readCogTree(arguments.heirarchyFile)
        geneTranslation = seedTree[lookupLevel]
    else:
        # Previously this fell through to an unbound `seedTree` lookup
        raise ValueError(
            "Unsupported hierarchy type for level mapping: %r"
            % (hierarchyType,))

    return lambda gene: geneTranslation.get(gene, gene)
def getLevelMapper(level, arguments):
    """
    Build the gene -> `level` translation function for one hierarchy.

    Parameters:
        level: the requested level. Levels in `koSyns` need no translation;
            levels in `level3Syns` are looked up as '3'.
        arguments: object with `heirarchyType` ('cazy', 'kegg', 'seed' or
            'cog') and `heirarchyFile` (path passed to the kegg readers).

    Returns:
        A callable accepting a single gene. It is the identity for `koSyns`
        levels, `getCazyGroup` for 'cazy', and otherwise a lookup in the
        mapping read from `heirarchyFile` that returns the gene unchanged
        when the mapping has no entry for it.

    Raises:
        ValueError: if `heirarchyType` is not a supported value and a
            translation table is needed.
    """
    if level in koSyns:
        # Requested level is the gene itself
        return lambda h: h

    if arguments.heirarchyType == 'cazy':
        return getCazyGroup

    lookupLevel = level if level not in level3Syns else '3'

    if arguments.heirarchyType == 'kegg':
        # Ideally, we'd be able to parse the hierarchy once, but the current
        # KEGG code just returns simple mappings
        logging.info(
            "Reading KEGG level %s assignments from %s",
            level, arguments.heirarchyFile)
        geneTranslation = kegg.readKEGGFile(
            arguments.heirarchyFile, lookupLevel)
    else:
        # SEED or COG/KOG
        if arguments.heirarchyType == 'seed':
            logging.info(
                "Reading SEED subsystem assignments from %s",
                arguments.heirarchyFile)
            seedTree = kegg.readSEEDTree(arguments.heirarchyFile)
        elif arguments.heirarchyType == 'cog':
            logging.info(
                "Reading COG subsystem assignments from %s",
                arguments.heirarchyFile)
            seedTree = kegg.readCogTree(arguments.heirarchyFile)
        else:
            # Without this guard `seedTree` would be unbound below
            raise ValueError(
                "Unsupported hierarchy type for level mapping: %r"
                % (arguments.heirarchyType,))
        geneTranslation = seedTree[lookupLevel]

    return lambda gene: geneTranslation.get(gene, gene)
def printCountTablesByLevel(fileCounts, totals, fileNames, options):
    """
    Create a new file for each level with a tab separated table of counts

    Parameters:
        fileCounts: mapping of file name -> {gene: count}.
        totals: mapping of file name -> total count; multiplied by
            `options.cutoff` to get the per-file threshold.
        fileNames: ordered file names, used for the column order.
        options: object providing `cutoff`, `heirarchyType`,
            `heirarchyFile`, `levels` and `output_file`.

    For every level the gene counts are translated to that level, entries
    that do not exceed the threshold in any file are folded into an 'Other'
    row, and the table is written to stdout (no `output_file`), to
    `output_file` (one level) or to `<output_file>.<level>` (several
    levels).

    Raises:
        ValueError: if a path-level table is requested and `heirarchyType`
            is not one of 'cazy', 'kegg', 'seed' or 'cog'.
    """
    cutoff = options.cutoff
    seedTree = None
    if options.heirarchyType == 'seed':
        logging.info(
            "Reading SEED subsystem assignments from %s",
            options.heirarchyFile)
        seedTree = kegg.readSEEDTree(options.heirarchyFile)
    elif options.heirarchyType == 'cog':
        logging.info(
            "Reading COG subsystem assignments from %s",
            options.heirarchyFile)
        seedTree = kegg.readCogTree(options.heirarchyFile)

    # create an output table for each requested level
    for level in options.levels:
        logging.debug("Processing level %s", level)
        translateToPaths = level not in koSyns
        descString = None
        if translateToPaths:
            if options.heirarchyType == 'cazy':
                geneTranslator = getCazyGroup
            else:
                lookupLevel = level if level not in level3Syns else '3'
                if options.heirarchyType == 'kegg':
                    # Ideally, we'd be able to parse the hierarchy once, but
                    # the current KEGG code just returns simple mappings
                    logging.info(
                        "Reading KEGG level %s assignments from %s",
                        level, options.heirarchyFile)
                    geneTranslation = kegg.readKEGGFile(
                        options.heirarchyFile, lookupLevel)
                else:
                    # SEED or COG/KOG
                    if seedTree is None:
                        # No tree was loaded above for this hierarchy type
                        raise ValueError(
                            "Unsupported hierarchy type for level %r: %r"
                            % (level, options.heirarchyType))
                    geneTranslation = seedTree[lookupLevel]
                geneTranslator = dict_lookup_default_to_query(geneTranslation)
        elif level is not None and options.heirarchyType == 'kegg':
            # return descriptions if level explicitly set to ko (or syn.)
            descString = "Description"
            logging.info(
                "Reading KO descriptions from %s", options.heirarchyFile)
            geneTranslation = kegg.readKEGGFile(
                options.heirarchyFile, "DESCRIPTION")
            geneTranslator = lambda_get_gene_and_translation(geneTranslation)
        elif level is not None and options.heirarchyType == 'cog':
            # return descriptions if level explicitly set to ko (or syn.)
            descString = "Description\tCategories"
            geneTranslator = lambda_get_seed_translations(seedTree)
        else:
            # just return gene if no level set or not KEGG/COG/KOG
            geneTranslator = passThrough

        # For each level, try to force all counts to be at that level
        fileLevelTotals = {}
        levelCounts = {}
        levelPaths = {}
        thresholds = {}
        for (filename, counts) in fileCounts.items():
            fileLevelTotals[filename] = 0
            thresholds[filename] = totals[filename] * cutoff
            fileLevelCounts = levelCounts.setdefault(filename, {})

            fileTotal = 0
            for gene in sorted(counts.keys(),
                               key=lambda s: "" if s is None else s):
                # get the counts from this node
                geneCount = counts[gene]
                fileTotal += geneCount

                # translate gene to pathway (or not depending on above code)
                pathway = geneTranslator(gene)

                # update counts
                # Some KOs will map to multiple pathways,
                # so... allow for multiple translated values
                if not isinstance(pathway, (list, tuple)):
                    pathway = [pathway, ]
                for indPathway in pathway:
                    fileLevelCounts[indPathway] = fileLevelCounts.get(
                        indPathway, 0) + geneCount
                    levelPaths[indPathway] = True

            logging.debug(
                "File %s has %d hits (had %d)",
                filename, fileTotal, totals[filename])

        if logging.getLogger().level <= logging.DEBUG:
            for (filename, counts) in levelCounts.items():
                logging.debug("File %s has %d counts",
                              filename, sum(counts.values()))

        # apply cutoff (iterate over a snapshot: levelPaths is modified)
        for pathway in list(levelPaths.keys()):
            # check to see if pathway is over cutoff in any file
            over = False
            for (filename, fileLevelCount) in levelCounts.items():
                flPathCount = fileLevelCount.get(pathway, 0)
                fileLevelTotals[filename] += flPathCount
                if flPathCount > thresholds[filename]:
                    over = True

            if not over:
                # this pathway is not over the cutoff for any file
                levelPaths.pop(pathway)
                other = 'Other'
                levelPaths[other] = True
                for (filename, fileLevelCount) in levelCounts.items():
                    # Pop before reading 'Other' so that a pathway that is
                    # itself named 'Other' is not counted twice
                    moved = fileLevelCount.pop(pathway, 0)
                    fileLevelCount[other] = fileLevelCount.get(
                        other, 0) + moved

        if logging.getLogger().level <= logging.DEBUG:
            for (filename, counts) in levelCounts.items():
                logging.debug("File %s has %d counts",
                              filename, sum(counts.values()))
                missed = False
                for path in counts.keys():
                    if path not in levelPaths:
                        missed = True
                        logging.debug(
                            "Missing pathway %s has %d counts for %s",
                            path, counts[path], filename)
                if not missed:
                    logging.debug(
                        "There are no missing pathways from %s", filename)

        logging.debug("Final file counts: %r", fileLevelTotals)

        # output file
        if options.output_file is None:
            outs = sys.stdout
        else:
            if len(options.levels) > 1:
                outfile = "%s.%s" % (options.output_file, level)
            else:
                outfile = options.output_file
            outs = open(outfile, 'w')

        try:
            # header
            if level in koSyns:
                # Header for when level is the gene
                if descString is not None:
                    outs.write("Gene\t%s\t%s\n" %
                               (descString, '\t'.join(fileNames)))
                else:
                    outs.write("Gene\t%s\n" % ('\t'.join(fileNames)))
            else:
                # Header for when level is a pathway or group
                outs.write("Pathway\t%s\n" % ('\t'.join(fileNames)))

            for pathway in sorted(levelPaths.keys(),
                                  key=lambda s: "" if s is None else s):
                outs.write(str(pathway))
                for filename in fileNames:
                    outs.write("\t")
                    outs.write(str(levelCounts[filename].get(pathway, 0)))
                outs.write("\n")
        finally:
            # close out stream, even if a write failed (never close stdout)
            if options.output_file is not None:
                outs.close()
def printCountTablesByLevel(fileCounts, totals, fileNames, options):
    """
    Create a new file for each level with a tab separated table of counts

    Parameters:
        fileCounts: mapping of file name -> {gene: count}.
        totals: mapping of file name -> total count; the per-file cutoff
            threshold is `totals[name] * options.cutoff`.
        fileNames: ordered file names; one output column is written per
            name, in this order.
        options: object providing `cutoff`, `heirarchyType`,
            `heirarchyFile`, `levels` and `output_file`.

    Each requested level gets its own table: gene counts are translated to
    the level, rows that are not above the threshold in at least one file
    are merged into an 'Other' row, and the result is written to stdout
    when `output_file` is None, to `output_file` when there is one level,
    or to `<output_file>.<level>` when there are several.

    Raises:
        ValueError: if a path-level table is requested and `heirarchyType`
            is not one of 'cazy', 'kegg', 'seed' or 'cog'.
    """
    cutoff = options.cutoff
    seedTree = None
    if options.heirarchyType == 'seed':
        logging.info("Reading SEED subsystem assignments from %s",
                     options.heirarchyFile)
        seedTree = kegg.readSEEDTree(options.heirarchyFile)
    elif options.heirarchyType == 'cog':
        logging.info("Reading COG subsystem assignments from %s",
                     options.heirarchyFile)
        seedTree = kegg.readCogTree(options.heirarchyFile)

    # create an output table for each requested level
    for level in options.levels:
        logging.debug("Processing level %s", level)
        translateToPaths = level not in koSyns
        descString = None
        if translateToPaths:
            if options.heirarchyType == 'cazy':
                geneTranslator = getCazyGroup
            else:
                lookupLevel = level if level not in level3Syns else '3'
                if options.heirarchyType == 'kegg':
                    # Ideally, we'd be able to parse the hierarchy once, but
                    # the current KEGG code just returns simple mappings
                    logging.info("Reading KEGG level %s assignments from %s",
                                 level, options.heirarchyFile)
                    geneTranslation = kegg.readKEGGFile(
                        options.heirarchyFile, lookupLevel)
                else:
                    # SEED or COG/KOG
                    if seedTree is None:
                        # No tree was loaded above for this hierarchy type
                        raise ValueError(
                            "Unsupported hierarchy type for level %r: %r"
                            % (level, options.heirarchyType))
                    geneTranslation = seedTree[lookupLevel]
                geneTranslator = dict_lookup_default_to_query(geneTranslation)
        elif level is not None and options.heirarchyType == 'kegg':
            # return descriptions if level explicitly set to ko (or syn.)
            descString = "Description"
            logging.info("Reading KO descriptions from %s",
                         options.heirarchyFile)
            geneTranslation = kegg.readKEGGFile(options.heirarchyFile,
                                                "DESCRIPTION")
            geneTranslator = lambda_get_gene_and_translation(geneTranslation)
        elif level is not None and options.heirarchyType == 'cog':
            # return descriptions if level explicitly set to ko (or syn.)
            descString = "Description\tCategories"
            geneTranslator = lambda_get_seed_translations(seedTree)
        else:
            # just return gene if no level set or not KEGG/COG/KOG
            geneTranslator = passThrough

        # For each level, try to force all counts to be at that level
        fileLevelTotals = {}
        levelCounts = {}
        levelPaths = {}
        thresholds = {}
        for (filename, counts) in fileCounts.items():
            fileLevelTotals[filename] = 0
            thresholds[filename] = totals[filename] * cutoff
            fileLevelCounts = levelCounts.setdefault(filename, {})

            fileTotal = 0
            for gene in sorted(counts.keys(),
                               key=lambda s: "" if s is None else s):
                # get the counts from this node
                geneCount = counts[gene]
                fileTotal += geneCount

                # translate gene to pathway (or not depending on above code)
                pathway = geneTranslator(gene)

                # update counts
                # Some KOs will map to multiple pathways,
                # so... allow for multiple translated values
                if not isinstance(pathway, (list, tuple)):
                    pathway = [
                        pathway,
                    ]
                for indPathway in pathway:
                    fileLevelCounts[indPathway] = fileLevelCounts.get(
                        indPathway, 0) + geneCount
                    levelPaths[indPathway] = True

            logging.debug("File %s has %d hits (had %d)", filename, fileTotal,
                          totals[filename])

        if logging.getLogger().level <= logging.DEBUG:
            for (filename, counts) in levelCounts.items():
                logging.debug("File %s has %d counts", filename,
                              sum(counts.values()))

        # apply cutoff (iterate over a snapshot: levelPaths is modified)
        for pathway in list(levelPaths.keys()):
            # check to see if pathway is over cutoff in any file
            over = False
            for (filename, fileLevelCount) in levelCounts.items():
                flPathCount = fileLevelCount.get(pathway, 0)
                fileLevelTotals[filename] += flPathCount
                if flPathCount > thresholds[filename]:
                    over = True

            if not over:
                # this pathway is not over the cutoff for any file
                levelPaths.pop(pathway)
                other = 'Other'
                levelPaths[other] = True
                for (filename, fileLevelCount) in levelCounts.items():
                    # Remove the pathway first: if it is itself named
                    # 'Other', reading the 'Other' total before the pop
                    # would count it twice
                    folded = fileLevelCount.pop(pathway, 0)
                    fileLevelCount[other] = fileLevelCount.get(
                        other, 0) + folded

        if logging.getLogger().level <= logging.DEBUG:
            for (filename, counts) in levelCounts.items():
                logging.debug("File %s has %d counts", filename,
                              sum(counts.values()))
                missed = False
                for path in counts.keys():
                    if path not in levelPaths:
                        missed = True
                        logging.debug(
                            "Missing pathway %s has %d counts for %s", path,
                            counts[path], filename)
                if not missed:
                    logging.debug("There are no missing pathways from %s",
                                  filename)

        logging.debug("Final file counts: %r", fileLevelTotals)

        # output file
        writeToFile = options.output_file is not None
        if not writeToFile:
            outs = sys.stdout
        else:
            if len(options.levels) > 1:
                outfile = "%s.%s" % (options.output_file, level)
            else:
                outfile = options.output_file
            outs = open(outfile, 'w')

        try:
            # header
            if level in koSyns:
                # Header for when level is the gene
                if descString is not None:
                    outs.write("Gene\t%s\t%s\n" %
                               (descString, '\t'.join(fileNames)))
                else:
                    outs.write("Gene\t%s\n" % ('\t'.join(fileNames)))
            else:
                # Header for when level is a pathway or group
                outs.write("Pathway\t%s\n" % ('\t'.join(fileNames)))

            for pathway in sorted(levelPaths.keys(),
                                  key=lambda s: "" if s is None else s):
                outs.write(str(pathway))
                for filename in fileNames:
                    outs.write("\t")
                    outs.write(str(levelCounts[filename].get(pathway, 0)))
                outs.write("\n")
        finally:
            # close the stream even when a write fails; stdout stays open
            if writeToFile:
                outs.close()