def main():
    """CLI entry point: print reads (or their matching hit lines) whose
    best hits fall in a target taxon group (-g) and whose remaining hits
    all fall in that group or a wider group (-G).
    """
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list
    Finally, print out either the hits (that match the target group) for
    these reads or just read names (-r). The -F filter limits which hits
    are used in part (2) as well as which are printed. The countMethod (-C)
    option is not used.
    """
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(
        parser,
        defaults={"mapFile": None,
                  "parseStyle": ACCS,
                  "filterPct": -1,
                  "countMethod": "all",
                  "taxdir": None})
    parser.add_argument(
        "-g", "--targetTaxonGroup",
        dest="group1",
        default=None,
        metavar="TAXON",
        action="append",
        help="Taxon to identify reads in. Top hits (as defined by "
        "--topHitPct) must be in this group. It can be a taxid, "
        "a name, or a file listing taxids. Use multiple times to "
        "specify a list of organisms. Use -a to specify whether "
        "all or at least one of the top hits must match.",
    )
    parser.add_argument(
        "-a", "--any",
        default=False,
        action="store_true",
        help="If specified, accept reads where any top hit is to an organism "
        "in the target taxon/taxa. By default, all top hits must be "
        "in the target group.",
    )
    parser.add_argument(
        "-t", "--topHitPct",
        default=0,
        type=float,
        help="How close(as a percentage to the best score a hit must be "
        "to qualify as a top hit. Default is 0, ie must have the best "
        "score. Use 100 to get all hits.",
    )
    parser.add_argument(
        "-G", "--outerTaxonGroup",
        dest="group2",
        default=None,
        metavar="TAXON",
        action="append",
        help="Broader taxon to limit reads. All hits (use -F to limit "
        "these hits) must be in the target group or this group. Again, "
        "it can be a taxid, a name, or a file listing taxids. "
        "It can also be inkoved multiple times to choose multiple "
        "groups.",
    )
    parser.add_argument(
        "-r", "--reads",
        default=False,
        action="store_true",
        help="Output just read names. By default, print the relevant hit "
        "lines for each read",
    )

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check args
    if arguments.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    if arguments.taxdir is not None:
        taxonomy = readTaxonomy(arguments.taxdir, namesMap=True)
    else:
        taxonomy = None

    group_1_set = get_group_set(arguments.group1, taxonomy)
    group_2_set = get_group_set(arguments.group2, taxonomy)
    logging.debug("Group 1 has %d entries and 439482 in group1 is %s" %
                  (len(group_1_set), 439482 in group_1_set))
    if group_2_set is not None:
        logging.debug("Group 2 has %d entries and 439482 in group2 is %s" %
                      (len(group_2_set), 439482 in group_2_set))

    # map reads to hits
    if arguments.parseStyle == GIS:
        keyType = int
    else:
        keyType = None
    accToTaxMap = parseMapFile(
        arguments.mapFile, valueType=int, keyType=keyType)

    # set up some function pointers
    global hitRE
    hitRE = parsingREs.get(arguments.parseStyle, None)
    if arguments.parseStyle == ORGS:
        getTaxid = _getOrgTaxid
    elif arguments.parseStyle == HITID:
        getTaxid = _getHitidTaxid
    elif arguments.parseStyle == HITDESC:
        getTaxid = _getHitdescTaxid
    else:
        getTaxid = _getExprTaxid

    # for filtering:
    filterParams = FilterParams.create_from_arguments(arguments)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle, outhandle) in inputIterator(arguments):
        readCount = 0
        goodReadCount = 0
        printCount = 0

        # parse file
        for (read, hits) in filterM8Stream(
                inhandle, filterParams, returnLines=False):
            readCount += 1
            bestScore = 0
            hitTaxids = {}
            for hit in hits:
                score = hit.score
                taxids = []
                # does this hit have at least one associated taxid in group2?
                for taxid in getTaxid(hit, accToTaxMap, taxonomy):
                    if taxid is None:
                        break
                    if group_2_set is not None and taxid not in group_2_set:
                        break
                    taxids.append(taxid)
                if len(taxids) == 0:
                    # nothing matched in the wider group; skip this read
                    break
                hitTaxids[hit] = taxids
                # find the top score
                if score > bestScore:
                    bestScore = score
            else:
                # for/else: only reached when no hit broke out above,
                # i.e. every hit was in wider taxon list
                logging.debug("Checking best hits for %s (top score: %.1f)" %
                              (read, bestScore))
                # BUGFIX: renamed local 'all' -> 'all_in_group'; the
                # original shadowed the builtin all()
                all_in_group = True
                recognized = []
                for hit, taxids in _getBestHitTaxids(
                        hitTaxids, bestScore, arguments.topHitPct):
                    if _anyTaxidInGroup(taxids, group_1_set):
                        logging.debug("%s (%r) is in group 1" % (hit, taxids))
                        recognized.append(hit)
                    else:
                        logging.debug("%s (%r) is not in group 1" %
                                      (hit, taxids))
                        all_in_group = False
                if len(recognized) == 0:
                    # if none of the best are in our target list, next read
                    logging.debug("No best hits for %s are in group 1" %
                                  (read))
                    continue
                if (not arguments.any) and (not all_in_group):
                    # next read unless user said any or all hits are in list
                    logging.debug("Not all best hits for %s are in group 1" %
                                  (read))
                    continue

                # if we get here, then the read is a match
                goodReadCount += 1
                if arguments.reads:
                    logging.debug("Keeping %s" % (read))
                    outhandle.write(read)
                    outhandle.write("\n")
                else:
                    logging.debug("Keeping %d hits for %s" %
                                  (len(recognized), read))
                    for hit in sorted(recognized,
                                      key=lambda h: (h.score, h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount += 1

        if arguments.reads:
            logging.info("Printed %d of %d reads" %
                         (goodReadCount, readCount))
        else:
            logging.info("Printed %d lines for %d of %d reads" %
                         (printCount, goodReadCount, readCount))
def main():
    """CLI entry point: map each read's hits onto functional heirarchy
    levels (KEGG/SEED/CAZy/COG) and write one assignment table per input.
    """
    description = __doc__
    parser = argparse.ArgumentParser(description)
    add_IO_arguments(parser)
    parser.add_argument("-l", "--level", dest="levels", default=None,
                        metavar="LEVEL", action="append",
                        help=""" Level(s) to collect counts on. Use flag
                        multiple times to specify multiple levels. If
                        multiple values given, one table produced for each
                        with rank name appended to file name. Levels can be
                        an integer (1-3) for KEGG or SEED levels, any one of
                        'gene', 'role', 'family', 'ko', or 'ortholog' (which
                        are all synonyms), or anything not synonymous with
                        'gene' to get CAZy groups. Defaults to ortholog/role
                        and levels 1, 2, and 3 for KEGG and SEED and gene
                        and group for CAZy and COG.""")
    parser.add_argument(
        '-s', '--squash', dest='splitForLevels',
        default=True, action='store_false',
        help="Don't split assignment rows if gene maps to multiple pathways, "
             "just squash them into one row using python list syntax")

    # format, ortholog heirarchy, and more
    kegg.add_path_arguments(parser)

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only; [None] is a sentinel replaced by 'Hit' later
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            # logging.warn is deprecated; use logging.warning
            logging.warning("Type: %s" % (arguments.heirarchyType))
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if arguments.levels is None:
            # set a default
            # BUGFIX: original used string identity ("x is 'kegg'"), which
            # is unreliable, and lacked an elif, so the KEGG default was
            # always overwritten by the CAZy/COG default below
            if arguments.heirarchyType == 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            elif arguments.heirarchyType == 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the level list makes sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            parser.error(str(e))

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            # sniff the map format from the first non-blank, non-comment line
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            elif cogMapRE.search(firstLine):
                arguments.mapStyle = 'cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s" % (arguments.mapStyle))
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        elif arguments.mapStyle == 'cog':
            valueMap = kegg.parseCogMap(arguments.mapFile)
        else:
            # plain tab-separated map; GI keys are integers
            if arguments.parseStyle == hits.GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(
                arguments.mapFile,
                valueType=None,
                keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s" %
                         (len(valueMap), next(iter(valueMap.items()))))
        else:
            logging.warning("Read 0 items into value map!")
    else:
        valueMap = None

    # set up level mapping (one mapper per requested level)
    levelMappers = [getLevelMapper(level, arguments)
                    for level in arguments.levels]

    # parse input files
    for (inhandle, outhandle) in inputIterator(arguments):
        logging.debug(
            "Reading from %s and writing to %s" % (inhandle, outhandle))
        hitMapIter = hits.parseM8FileIter(
            inhandle,
            valueMap,
            arguments.hitTableFormat,
            arguments.filterTopPct,
            arguments.parseStyle,
            arguments.countMethod,
            ignoreEmptyHits=arguments.mappedHitsOnly)

        if arguments.levels == [None]:
            arguments.levels = ['Hit']
        outhandle.write("Read\t%s\n" % ('\t'.join(arguments.levels)))
        for read, hitIter in hitMapIter:
            assignments = []
            for hit in hitIter:
                logging.debug("Hit: %s" % (hit))
                assignment = []
                for levelMapper in levelMappers:
                    assignment.append(levelMapper(hit))
                assignments.append(assignment)
            logging.debug("Read %s has %d hits" % (read, len(assignments)))
            for assignment in assignments:
                # optionally split multi-pathway assignments into rows
                for assignmentList in handleMultipleMappings(
                        assignment, arguments):
                    outhandle.write(
                        "%s\t%s\n" % (read, "\t".join(assignmentList)))
def main():
    """CLI entry point: translate a KO column in a tabular file into a
    new column of KEGG pathway (or other level) values.
    """
    # set up CLI
    description = """
    Takes a tabular text file and translate a column of KO values into
    a new column of KEGG pathways. KO column can have multiple entries per
    row. Output column will have multiple entries per pathway cell.
    """
    parser = argparse.ArgumentParser(description=description)
    util.add_IO_arguments(parser)
    parser.add_argument("-l", "--level", dest="level", default="PATHWAY",
                        metavar="LEVEL",
                        help=""" Level to collect counts on. Level can be
                        one of: NAME, PATHWAY, EC, DEFINITION, or a level
                        in the CLASS heirachy: 1, 2, or 3 """)
    parser.add_argument(
        "-f", "--fill_missing", dest="fill",
        metavar="FILL", default="No Pathway",
        help="Put FILL in column when KO has no pathway assigned. "
             "Defaults to 'No Pathway'")
    # format, ortholog heirarchy, and more
    parser.add_argument("-k", "--ko_file", dest="ko_file",
                        metavar="MAPFILE", help="Location of kegg ko file")
    parser.add_argument("-c", "--ko_column", type=int, default=1,
                        help="Column number (first column is 1)",
                        metavar="COLUMN")
    parser.add_argument(
        "-C", "--new_column", type=int, default=None,
        help="Column number to insert new column after. Default is the "
             "after the source column. 0=>make it the first column. "
             "-1=>make it the last column.",
        metavar="COLUMN")
    parser.add_argument(
        "-L", "--long_output", default=False, action='store_true',
        help="Insert new duplicate rows if KO maps to multiple values")
    parser.add_argument(
        "-H", "--header", default=None, metavar='HEADER',
        help="Put HEADER in first row instead of trying to translate")
    parser.add_argument("-Q", "--quotes",
                        default=False, action="store_true",
                        help="Encase translated values in double quotes")
    parser.add_argument(
        "-s", "--sep", dest='sep', default='\t',
        help="""Character separating table cells. Default is tab""")
    parser.add_argument(
        "-S", "--ko_sep", dest='ko_sep', default=';',
        help="""Character separating multiple KO values in iput table and
        used to separate multiple values in output column. Default is ";".
        Ignored for output if --longOutput requested""")

    # log level and help
    util.add_universal_arguments(parser)
    arguments = parser.parse_args()
    util.setup_logging(arguments)

    # BUGFIX: -k has no default and was never checked; without this guard
    # the logging call below raised "TypeError: can only concatenate str"
    # instead of a usable error message when -k was omitted
    if arguments.ko_file is None:
        parser.error("Please supply a KEGG ko file with -k/--ko_file")

    logging.info("KO mapping from: " + arguments.ko_file)
    logging.debug("Fill: '%s'" % (arguments.fill))

    translation = kegg.parse_KEGG_file(arguments.ko_file, arguments.level)

    # switch to zero indexing
    # NOTE(review): the truthiness test deliberately leaves 0 alone
    # (0 => first column), but -1 becomes -2 here — presumably
    # translate_ko_column expects that; confirm against its implementation
    if arguments.new_column:
        arguments.new_column -= 1
    arguments.ko_column -= 1

    for (inhandle, outhandle) in util.inputIterator(arguments):
        for new_line in translate_ko_column(
                inhandle,
                sep=arguments.sep,
                ko_sep=arguments.ko_sep,
                ko_column=arguments.ko_column,
                new_column=arguments.new_column,
                translation=translation,
                default=arguments.fill,
                quotes=arguments.quotes,
                header=arguments.header,
                long_out=arguments.long_output,
        ):
            outhandle.write(new_line)
def main():
    """CLI entry point: pick the best hit per read (redistributing ties by
    abundance) and write the chosen hit lines back out."""
    # module docstring doubles as the CLI description
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(parser, defaults={
        'filter_top_pct': 0,
        'parseStyle': ACCS,
        'countMethod': 'tophit'
    }, choices={'countMethod': ('tophit', 'toporg')})
    parser.add_argument(
        "-P", "--proportional",
        dest="proportional",
        default=False,
        action="store_true",
        help="Assign reads that have multiple equal top hits to taxa such "
        "that the overal proportion of taxa is consistent with the "
        "unambiguious hits. This is meant for use with the 'toporg' "
        "count method.")
    parser.add_argument(
        "-i", "--individualFiles",
        dest="individual",
        default=False,
        action="store_true",
        help="Use this flag to process files independently. Normally, "
        "counts from all files are pooled for making choices.")
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # load necessary maps
    params = FilterParams.create_from_arguments(arguments)
    # taxonomy/hitStringMap are only defined (and only used) for 'toporg'
    if arguments.countMethod == 'toporg':
        (taxonomy, hitStringMap) = readMaps(arguments)

    # winner-take-all tie-breaking is the default; -P switches it off
    wta = not (arguments.proportional)

    if len(arguments.input_files) <= 1 or arguments.individual:
        # loop over input, processing each file independently
        for (inhandle, outhandle) in inputIterator(arguments):
            logging.debug("Reading from %s and writing to %s" %
                          (inhandle, outhandle))
            m8stream = M8Stream(inhandle)
            if arguments.countMethod == 'tophit':
                # don't give any taxonomy, just map to accessions for
                # redistribution
                readHits = redistribute.pickBestHitByAbundance(
                    m8stream,
                    filterParams=params,
                    returnLines=True,
                    winnerTakeAll=wta,
                    parseStyle=arguments.parseStyle)
            else:
                # translate to organism before finding most abundant
                readHits = redistribute.pickBestHitByAbundance(
                    m8stream,
                    filterParams=params,
                    returnLines=True,
                    winnerTakeAll=wta,
                    taxonomy=taxonomy,
                    hitStringMap=hitStringMap,
                    parseStyle=arguments.parseStyle)

            # returnLines=True: readHits yields raw output lines
            for line in readHits:
                outhandle.write(line)

    else:
        # process all files at once so abundance counts are pooled
        multifile = redistribute.multipleFileWrapper(arguments.input_files)

        # Build a map from input file name to output handle
        outputMap = {}
        for infile_handle in arguments.input_files:
            infile_name = infile_handle.name
            if arguments.output_file is None:
                outputMap[infile_name] = sys.stdout
            elif len(arguments.input_files) <= 1:
                outputMap[infile_name] = open(arguments.output_file, 'w')
            else:
                # use outfileName as suffix
                if arguments.cwd:
                    # strip path info first
                    (infilePath, infileFile) = os.path.split(infile_name)
                    outfile = "./" + infileFile + arguments.output_file
                else:
                    outfile = infile_name + arguments.output_file
                outputMap[infile_name] = open(outfile, 'w')

        if arguments.countMethod == 'tophit':
            # don't give any taxonomy, just map to accessions for
            # redistribution
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=wta,
                parseStyle=arguments.parseStyle)
        else:
            # translate to organism before finding most abundant
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=wta,
                taxonomy=taxonomy,
                hitStringMap=hitStringMap,
                parseStyle=arguments.parseStyle)

        for (read, hit) in readHits:
            # reads come back prefixed with "<source file>/"; the
            # unquote_plus below suggests the wrapper URL-quotes names —
            # NOTE(review): confirm against redistribute.multipleFileWrapper
            infile_name, read = read.split("/", 1)
            outhandle = outputMap[unquote_plus(infile_name)]
            # strip the same file-name prefix from the hit line
            outhandle.write(hit.line.split("/", 1)[1])

        if arguments.output_file is not None:
            for outhandle in outputMap.values():
                outhandle.close()
def main():
    """CLI entry point: map each read's hits onto functional heirarchy
    levels (KEGG/SEED/CAZy/COG) and write one assignment table per input.
    """
    description = __doc__
    parser = argparse.ArgumentParser(description)
    add_IO_arguments(parser)
    parser.add_argument("-l", "--level", dest="levels", default=None,
                        metavar="LEVEL", action="append",
                        help=""" Level(s) to collect counts on. Use flag
                        multiple times to specify multiple levels. If
                        multiple values given, one table produced for each
                        with rank name appended to file name. Levels can be
                        an integer (1-3) for KEGG or SEED levels, any one of
                        'gene', 'role', 'family', 'ko', or 'ortholog' (which
                        are all synonyms), or anything not synonymous with
                        'gene' to get CAZy groups. Defaults to ortholog/role
                        and levels 1, 2, and 3 for KEGG and SEED and gene
                        and group for CAZy and COG.""")
    parser.add_argument(
        '-S', '--squash', dest='splitForLevels',
        default=True, action='store_false',
        help="Don't split assignment rows if gene maps to multiple pathways, "
             "just squash them into one row using python list syntax")

    # format, ortholog heirarchy, and more
    kegg.add_path_arguments(parser)

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only; [None] is a sentinel replaced by 'Hit' later
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            # logging.warn is deprecated; use logging.warning
            logging.warning("Type: %s" % (arguments.heirarchyType))
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if arguments.levels is None:
            # set a default
            # BUGFIX: original used string identity ("x is 'kegg'"), which
            # is unreliable, and lacked an elif, so the KEGG default was
            # always overwritten by the CAZy/COG default below
            if arguments.heirarchyType == 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            elif arguments.heirarchyType == 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the level list makes sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            parser.error(str(e))

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            # sniff the map format from the first non-blank, non-comment line
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            elif cogMapRE.search(firstLine):
                arguments.mapStyle = 'cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s" % (arguments.mapStyle))
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        elif arguments.mapStyle == 'cog':
            valueMap = kegg.parseCogMap(arguments.mapFile)
        else:
            # plain tab-separated map; GI keys are integers
            if arguments.parseStyle == hits.GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(arguments.mapFile,
                                    valueType=None,
                                    valueDelim=arguments.tab_map_delim,
                                    keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s" %
                         (len(valueMap), next(iter(valueMap.items()))))
        else:
            logging.warning("Read 0 items into value map!")
    else:
        valueMap = None

    # set up level mapping (one mapper per requested level)
    levelMappers = [getLevelMapper(lvl, arguments) for lvl in arguments.levels]

    # parse input files
    for (inhandle, outhandle) in inputIterator(arguments):
        logging.debug("Reading from %s and writing to %s" %
                      (inhandle, outhandle))
        hitMapIter = hits.parseM8FileIter(
            inhandle,
            valueMap,
            hits.FilterParams.create_from_arguments(arguments),
            arguments.parseStyle,
            arguments.countMethod,
            ignoreEmptyHits=arguments.mappedHitsOnly)

        if arguments.levels == [None]:
            arguments.levels = ['Hit']
        outhandle.write("Read\t%s\n" % ('\t'.join(arguments.levels)))
        for read, hitIter in hitMapIter:
            assignments = []
            for hit in hitIter:
                logging.debug("Hit: %s" % (hit))
                assignment = []
                for levelMapper in levelMappers:
                    assignment.append(levelMapper(hit))
                assignments.append(assignment)
            logging.debug("Read %s has %d hits" % (read, len(assignments)))
            for assignment in assignments:
                # optionally split multi-pathway assignments into rows
                for assignmentList in handleMultipleMappings(
                        assignment, arguments):
                    outhandle.write("%s\t%s\n" %
                                    (read, "\t".join(assignmentList)))
def main():
    """CLI entry point: print reads (or their matching hit lines) whose
    best hits fall in a target taxon group (-g) and whose remaining hits
    all fall in that group or a wider group (-G).
    """
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list
    Finally, print out either the hits (that match the target group) for
    these reads or just read names (-r). The -F filter limits which hits
    are used in part (2) as well as which are printed. The countMethod (-C)
    option is not used.
    """
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(
        parser,
        defaults={
            'mapFile': None,
            'parseStyle': ACCS,
            'filter_top_pct': -1,
            'countMethod': 'all',
            'taxdir': None})
    parser.add_argument(
        "-g", "--targetTaxonGroup",
        dest="group1",
        default=None,
        metavar="TAXON",
        action='append',
        help="Taxon to identify reads in. Top hits (as defined by "
        "--topHitPct) must be in this group. It can be a taxid, "
        "a name, or a file listing taxids. Use multiple times to "
        "specify a list of organisms. Use -a to specify whether "
        "all or at least one of the top hits must match.")
    parser.add_argument(
        "-a", "--any",
        default=False,
        action="store_true",
        help="If specified, accept reads where any top hit is to an organism "
        "in the target taxon/taxa. By default, all top hits must be "
        "in the target group.")
    parser.add_argument(
        '-t', '--topHitPct', default=0, type=float,
        help="How close(as a percentage to the best score a hit must be "
        "to qualify as a top hit. Default is 0, ie must have the best "
        "score. Use 100 to get all hits.")
    parser.add_argument(
        "-G", "--outerTaxonGroup",
        dest="group2",
        default=None,
        metavar="TAXON",
        action="append",
        help="Broader taxon to limit reads. All hits (use -F to limit "
        "these hits) must be in the target group or this group. Again, "
        "it can be a taxid, a name, or a file listing taxids. "
        "It can also be inkoved multiple times to choose multiple "
        "groups.")
    parser.add_argument(
        '-r', '--reads', default=False, action="store_true",
        help="Output just read names. By default, print the relevant hit "
        "lines for each read")

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check args
    if arguments.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    if arguments.taxdir is not None:
        taxonomy = readTaxonomy(arguments.taxdir, namesMap=True)
    else:
        taxonomy = None

    group_1_set = get_group_set(arguments.group1, taxonomy)
    group_2_set = get_group_set(arguments.group2, taxonomy)
    logging.debug(
        "Group 1 has %d entries and 439482 in group1 is %s" %
        (len(group_1_set), 439482 in group_1_set))
    if group_2_set is not None:
        logging.debug(
            "Group 2 has %d entries and 439482 in group2 is %s" %
            (len(group_2_set), 439482 in group_2_set))

    # map reads to hits
    if arguments.parseStyle == GIS:
        keyType = int
    else:
        keyType = None
    accToTaxMap = parseMapFile(
        arguments.mapFile, valueType=int, keyType=keyType)

    # set up some function pointers
    global hitRE
    hitRE = parsingREs.get(arguments.parseStyle, None)
    if arguments.parseStyle == ORGS:
        getTaxid = _getOrgTaxid
    elif arguments.parseStyle == HITID:
        getTaxid = _getHitidTaxid
    elif arguments.parseStyle == HITDESC:
        getTaxid = _getHitdescTaxid
    else:
        getTaxid = _getExprTaxid

    # for filtering:
    filterParams = FilterParams.create_from_arguments(arguments)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle, outhandle) in inputIterator(arguments):
        readCount = 0
        goodReadCount = 0
        printCount = 0

        # parse file
        for (read, hits) in filterM8Stream(
                inhandle, filterParams, return_lines=False):
            readCount += 1
            bestScore = 0
            hitTaxids = {}
            for hit in hits:
                score = hit.score
                taxids = []
                # does this hit have at least one associated taxid in group2?
                for taxid in getTaxid(hit, accToTaxMap, taxonomy):
                    if taxid is None:
                        break
                    if group_2_set is not None and taxid not in group_2_set:
                        break
                    taxids.append(taxid)
                if len(taxids) == 0:
                    # nothing matched in the wider group; skip this read
                    break
                hitTaxids[hit] = taxids
                # find the top score
                if score > bestScore:
                    bestScore = score
            else:
                # for/else: only reached when no hit broke out above,
                # i.e. every hit was in wider taxon list
                logging.debug(
                    "Checking best hits for %s (top score: %.1f)" %
                    (read, bestScore))
                # BUGFIX: renamed local 'all' -> 'all_in_group'; the
                # original shadowed the builtin all()
                all_in_group = True
                recognized = []
                for hit, taxids in _getBestHitTaxids(
                        hitTaxids, bestScore, arguments.topHitPct):
                    if _anyTaxidInGroup(taxids, group_1_set):
                        logging.debug("%s (%r) is in group 1" % (hit, taxids))
                        recognized.append(hit)
                    else:
                        logging.debug(
                            "%s (%r) is not in group 1" % (hit, taxids))
                        all_in_group = False
                if len(recognized) == 0:
                    # if none of the best are in our target list, next read
                    logging.debug(
                        "No best hits for %s are in group 1" % (read))
                    continue
                if (not arguments.any) and (not all_in_group):
                    # next read unless user said any or all hits are in list
                    logging.debug(
                        "Not all best hits for %s are in group 1" % (read))
                    continue

                # if we get here, then the read is a match
                goodReadCount += 1
                if arguments.reads:
                    logging.debug("Keeping %s" % (read))
                    outhandle.write(read)
                    outhandle.write('\n')
                else:
                    logging.debug(
                        "Keeping %d hits for %s" % (len(recognized), read))
                    for hit in sorted(recognized,
                                      key=lambda h: (h.score, h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount += 1

        if arguments.reads:
            logging.info("Printed %d of %d reads" %
                         (goodReadCount, readCount))
        else:
            logging.info(
                "Printed %d lines for %d of %d reads" %
                (printCount, goodReadCount, readCount))
def main():
    """CLI entry point: assign each read in a hit table to a taxon and
    print one row per read (names, taxids, or a rank table)."""
    description = """
    Takes a hit table (reads searched against a database) and
    assigns each read to a taxon. Hit table may be specified with -i
    or piped to STDIN.

    Notes:

     * Specifying a top score precent (-F) will force hits to be sorted
       by score within each read. However, it is assumed that the hits in
       the input table(s) are already grouped by read. This program does
       not attempt to sort the entire input.
    """
    parser = argparse.ArgumentParser(description)
    util.add_IO_arguments(parser)
    parser.add_argument("-T", "--taxids", default=False, action="store_true",
                        help="Output taxids instead of names")
    edlhits.add_taxon_arguments(parser)
    parser.add_argument(
        "-r", "--rank", dest="rank", default=None,
        metavar="RANK",
        help=" Rank to collect counts on. Defaults to None (whatever "
        "the annotation was). Corresponds to rank names in nodes.dmp. "
        "To see list run: 'cut -f5 nodes.dmp | uniq | sort | uniq' in "
        "ncbi tax dir")
    parser.add_argument(
        "-R", "--printRank", dest="printRanks", action="append",
        help="Include indeicated rank(s) in lineage of printed taxa. "
        "Will be ignored if beyond the rank of the taxa "
        "(IE We can't include species if the taxon being counted "
        "is genus)")
    parser.add_argument(
        "--no-header", dest="no_header",
        default=False, action='store_true',
        help="do not write header line")
    util.add_universal_arguments(parser)
    arguments = parser.parse_args()
    util.setup_logging(arguments)

    logging.debug("Parsing style is: %s", arguments.parseStyle)

    # Handle the case where Galaxy tries to set None as a string
    arguments.printRanks = util.checkNoneOption(arguments.printRanks)

    # check arguments: -T, -r, and -R all require a taxonomy directory
    if arguments.taxids and arguments.taxdir is None:
        parser.error("Only use -T when a taxonomy is specified")
    if arguments.rank is not None and arguments.taxdir is None:
        parser.error(
            "Please supply NCBI phylogeny(-n) if specifying a rank(-r).")
    if arguments.printRanks is not None and arguments.taxdir is None:
        parser.error(
            "Please supply NCBI phylogeny(-n) if specifying a rank(-R).")

    if arguments.rank is not None:
        # 'domain' is not an NCBI rank name; map it to 'superkingdom'
        if arguments.rank == 'domain':
            logging.warning('translating domain to superkingdom')
            arguments.rank = 'superkingdom'
        if arguments.rank not in ranks:
            parser.error("Unknown rank: %s" % (arguments.rank))

    try:
        # Make sure the rank lists make sense
        if arguments.printRanks is not None:
            arguments.printRanks = cleanRanks(arguments.printRanks)
    except Exception as exc:
        parser.error(str(exc))

    # load necessary maps
    (taxonomy, value_map) = edlhits.readMaps(arguments)

    # loop over inputs
    for (inhandle, outhandle) in util.inputIterator(arguments):
        logging.debug(
            "Reading from %s and writing to %s", inhandle, outhandle)
        hit_iter = edlhits.parseM8FileIter(
            inhandle,
            value_map,
            edlhits.FilterParams.create_from_arguments(arguments),
            arguments.parseStyle,
            arguments.countMethod,
            taxonomy=taxonomy,
            rank=arguments.rank)

        ##
        # print output
        # choose output method: taxids, plain names, or a rank table
        if arguments.taxids:
            hit_header = 'taxid'
            printer = taxid_printer
        else:
            if arguments.printRanks is None:
                hit_header = 'Hit(s)'
                printer = default_printer
            else:
                # one column per requested rank; printer closes over
                # arguments so it can pass rank settings along
                hit_header = '\t'.join(arguments.printRanks)

                def printer(read, hits):
                    " Inline function to reduce number of arguments "
                    return tax_table_printer(read,
                                             hits,
                                             arguments.rank,
                                             arguments.printRanks)

        # loop over reads
        if not arguments.no_header:
            outhandle.write("Read\t{}\n".format(hit_header))
        for (read, hits) in hit_iter:
            outhandle.write(printer(read, hits))
def main():
    """CLI entry point: pick the best hit per read (redistributing ties by
    abundance) and write the chosen hit lines back out."""
    # module docstring doubles as the CLI description
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(
        parser,
        defaults={
            'filter_top_pct': 0,
            'parseStyle': ACCS,
            'countMethod': 'tophit'},
        choices={
            'countMethod': (
                'tophit',
                'toporg')})
    parser.add_argument(
        "-P", "--proportional",
        dest="proportional",
        default=False,
        action="store_true",
        help="Assign reads that have multiple equal top hits to taxa such "
        "that the overal proportion of taxa is consistent with the "
        "unambiguious hits. This is meant for use with the 'toporg' "
        "count method.")
    parser.add_argument(
        "-i", "--individualFiles",
        dest="individual",
        default=False,
        action="store_true",
        help="Use this flag to process files independently. Normally, "
        "counts from all files are pooled for making choices.")
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # load necessary maps
    params = FilterParams.create_from_arguments(arguments)
    # taxonomy/hitStringMap are only defined (and only used) for 'toporg'
    if arguments.countMethod == 'toporg':
        (taxonomy, hitStringMap) = readMaps(arguments)

    # winner-take-all tie-breaking is the default; -P switches it off
    wta = not (arguments.proportional)

    if len(arguments.input_files) <= 1 or arguments.individual:
        # loop over input, processing each file independently
        for (inhandle, outhandle) in inputIterator(arguments):
            logging.debug(
                "Reading from %s and writing to %s" % (inhandle, outhandle))
            m8stream = M8Stream(inhandle)
            if arguments.countMethod == 'tophit':
                # don't give any taxonomy, just map to accessions for
                # redistribution
                readHits = redistribute.pickBestHitByAbundance(
                    m8stream,
                    filterParams=params,
                    returnLines=True,
                    winnerTakeAll=wta,
                    parseStyle=arguments.parseStyle)
            else:
                # translate to organism before finding most abundant
                readHits = redistribute.pickBestHitByAbundance(
                    m8stream,
                    filterParams=params,
                    returnLines=True,
                    winnerTakeAll=wta,
                    taxonomy=taxonomy,
                    hitStringMap=hitStringMap,
                    parseStyle=arguments.parseStyle)

            # returnLines=True: readHits yields raw output lines
            for line in readHits:
                outhandle.write(line)

    else:
        # process all files at once so abundance counts are pooled
        multifile = redistribute.multipleFileWrapper(arguments.input_files)

        # Build a map from input file name to output handle
        outputMap = {}
        for infile_handle in arguments.input_files:
            infile_name = infile_handle.name
            if arguments.output_file is None:
                outputMap[infile_name] = sys.stdout
            elif len(arguments.input_files) <= 1:
                outputMap[infile_name] = open(arguments.output_file, 'w')
            else:
                # use outfileName as suffix
                if arguments.cwd:
                    # strip path info first
                    (infilePath, infileFile) = os.path.split(infile_name)
                    outfile = "./" + infileFile + arguments.output_file
                else:
                    outfile = infile_name + arguments.output_file
                outputMap[infile_name] = open(outfile, 'w')

        if arguments.countMethod == 'tophit':
            # don't give any taxonomy, just map to accessions for
            # redistribution
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=wta,
                parseStyle=arguments.parseStyle)
        else:
            # translate to organism before finding most abundant
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=wta,
                taxonomy=taxonomy,
                hitStringMap=hitStringMap,
                parseStyle=arguments.parseStyle)

        for (read, hit) in readHits:
            # reads come back prefixed with "<source file>/"; the
            # unquote_plus below suggests the wrapper URL-quotes names —
            # NOTE(review): confirm against redistribute.multipleFileWrapper
            infile_name, read = read.split("/", 1)
            outhandle = outputMap[unquote_plus(infile_name)]
            # strip the same file-name prefix from the hit line
            outhandle.write(hit.line.split("/", 1)[1])

        if arguments.output_file is not None:
            for outhandle in outputMap.values():
                outhandle.close()