def add_taxon_arguments(parser, defaults=None, choices=None):
    """Add taxon-parsing options (-m, -p, -C, taxonomy dir) to an argparse parser.

    :param parser: argparse.ArgumentParser (or group) to add arguments to
    :param defaults: optional dict overriding the built-in argument defaults
    :param choices: optional dict overriding the choices for 'countMethod'
    """
    # Don't use mutable default arguments: a shared dict would leak state
    # between calls if a caller ever mutated it.
    if defaults is None:
        defaults = {}
    if choices is None:
        choices = {}

    # get format and filter_top_pct options from blastm8
    add_hit_table_arguments(parser, defaults, flags=['format',
                                                     'filter_top_pct'])

    # specific to taxon parsing:
    parser.add_argument(
        "-m", "--mapFile", dest="mapFile",
        default=defaults.get("mapFile", None),
        metavar="MAPFILE",
        help="Location of file containing table with db hit name "
             "as first column and taxa or taxonids in second column. "
             "Defaults to '%s'" % (defaults.get("mapFile", None)))
    parser.add_argument(
        "-p", "--parseStyle",
        default=defaults.get("parseStyle", ACCS),
        choices=[ACCS, GIS, ORGS, HITID, HITDESC],
        help="What should be parsed from the hit table: accessions('accs'), "
             "'gis', organism names in brackets ('orgs'), the full hit "
             "name('hitid'), or the full hit description('hitdesc'). "
             # BUG FIX: key was "parseStyles" (plural), so a caller-supplied
             # parseStyle default was never reflected in the help text.
             "(defaults to '%s')" % (defaults.get("parseStyle", ACCS)))
    parser.add_argument(
        "-C", "--countMethod", dest="countMethod",
        default=defaults.get("countMethod", "first"),
        choices=choices.get(
            'countMethod',
            ('first', 'most', 'all', 'LCA', 'consensus')),
        help="How to deal with counts from multiple hits. (first, most: "
             "can return multiple hits in case of a tie, LCA: MEGAN-like, "
             "all: return every hit, consensus: return None unless all "
             "the same). Default is %s"
             % (defaults.get("countMethod", "first")),
        metavar="COUNTMETHOD")
    add_taxonomy_dir_argument(parser, defaults)
def main():
    """Filter one or more blast hit tables, writing passing hits (or GFF)."""
    description = """
    Take a blast result table and output a subset of hits based on the
    chosen filtering options. If more than one blast file given, use -O
    to get multiple output files, otherwise all output data will be
    concatenated into one output.
    """

    # command line arguments
    parser = argparse.ArgumentParser(description=description,
                                     conflict_handler='resolve')
    add_hit_table_arguments(parser, flags='all')
    # keep the historical misspelled flag (--outfilenome) as a hidden alias
    # for backwards compatibility, but advertise the corrected spelling
    parser.add_argument("-o", "--outfilename", "--outfilenome",
                        dest="outfilename",
                        default=None, metavar="OUTFILENAME",
                        help="Write masked fasta output to OUTFILENAME.")
    parser.add_argument(
        '-O', '--autoOutName', default=False,
        action='store_true',
        help="Automatically generate output file name from input name "
             "and options. Overridden by -o, cannot be used with data "
             "from STDIN.")
    parser.add_argument('-G', '--gff', default=False, action='store_true',
                        help="output GFF format instead of input format")
    # BUG FIX: FileType('rU') raises ValueError on Python 3.11+ ('U' mode
    # was removed); plain text-mode 'r' has universal newlines by default.
    parser.add_argument('hit_table', nargs='*',
                        type=argparse.FileType('r'),
                        default=[sys.stdin, ],
                        help="Table of search results to be filtered. "
                             "If absent, data will be read from STDIN")

    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check that we have blast file as argument

    # if we're not doing auto file names, write all outputs to same file
    if not arguments.autoOutName:
        if arguments.outfilename is not None:
            logging.info("Writing data to %s" % (arguments.outfilename))
            outfile_handle = open(arguments.outfilename, 'w')
        else:
            logging.info("writing data to STDOUT")
            outfile_handle = sys.stdout

    if arguments.gff:
        logging.info("Converting to GFF")

    # loop over inputs
    for infile_handle in arguments.hit_table:
        logging.info("reading data from %s" % (infile_handle.name))
        if arguments.autoOutName:
            outfile_handle = open(
                getOutputFile(infile_handle.name, arguments), 'w')

        # filter
        params = FilterParams.create_from_arguments(arguments)
        filterM8(infile_handle, outfile_handle, params,
                 to_gff=arguments.gff)

        # only per-input outputs are closed here; a shared output file stays
        # open across inputs (and is flushed at interpreter exit)
        if arguments.autoOutName:
            outfile_handle.close()
        infile_handle.close()
def add_path_arguments(parser, defaults=None, choices=None, helps=None):
    """Add pathway/function-mapping options (-m, -M, -p, -C, -r, ...) to parser.

    :param parser: argparse.ArgumentParser to add the argument group to
    :param defaults: optional dict overriding built-in argument defaults
    :param choices: optional dict overriding choices for 'countMethod'
    :param helps: optional dict overriding help text for 'countMethod'
    """
    # Don't use mutable default arguments: a shared dict would leak state
    # between calls if a caller ever mutated it.
    if defaults is None:
        defaults = {}
    if choices is None:
        choices = {}
    if helps is None:
        helps = {}

    # get format and filter_top_pct arguments from blastm8
    from edl.hits import HITID, ACCS, GIS, KEGG, HITDESC, PFAM
    from edl.blastm8 import add_hit_table_arguments
    add_hit_table_arguments(parser, defaults,
                            flags=['format', 'filter_top_pct', 'sort'])

    # specific to pathway parsing:
    pgroup = parser.add_argument_group(
        "Pathway Arguments",
        "These arguments control the mapping of hits to gene "
        "function hierarchies like KEGG or SEED")
    pgroup.add_argument(
        "-m", "--mapFile", dest="mapFile",
        default=defaults.get("mapFile", None),
        metavar="MAPFILE",
        help="Location of file containing table with db hit name as "
             "first column and geneIDs (Knumber) in second column.")
    pgroup.add_argument(
        "-M", "--mapStyle",
        default='auto',
        choices=['auto', 'kegg', 'tab', 'seed'],
        help="What type of mapping file are you using: simple tab "
             "separated list of IDs and kos/subsystems/domains, the "
             "genes_ko.list file from KEGG (which adds ko: to the K "
             "numbers and can have multiple records for each gene id), "
             "or the 3 column file from SEED. By default, this script "
             "will inspect the file and guess, but you can force 'kegg', "
             "'seed' or 'tab' with this argument.")
    default = defaults.get('tab_map_delim', None)
    pgroup.add_argument(
        "--tab_map_delim", default=default,
        help=("Delimiter to parse multiple assignments in "
              "map from ids to ko/path/fam. Only used for "
              "tabular mapping tables. Defaults to {}"
              .format(str(default))))
    pgroup.add_argument(
        "-p", "--parseStyle",
        default=defaults.get("parseStyle", HITID),
        choices=[ACCS, GIS, KEGG, HITID, HITDESC, PFAM],
        help="What should be parsed from the hit table: accessions('accs'), "
             "'gis', K numbers in description ('kegg'), the full hit "
             "name('hitid'), or the full hit description('hitdesc'). "
             "(defaults to '%s')" % (defaults.get("parseStyle", HITID)))
    pgroup.add_argument(
        "-C", "--countMethod", dest="countMethod",
        default=defaults.get("countMethod", "first"),
        choices=choices.get('countMethod',
                            ('first', 'most', 'all', 'consensus')),
        help=helps.get(
            "countMethod",
            "How to deal with counts from multiple hits. (first, most: "
            "can return multiple hits, all: return every hit, consensus: "
            "return None unless all the same). Do not use most or consensus "
            "with more than one level at a time. Default is %s"
            % (defaults.get("countMethod", "first"))),
        metavar="COUNTMETHOD")

    # The sense of -r/--filter_for_path flips depending on the configured
    # default so the flag always toggles away from the default behavior.
    if defaults.get("filter_for_path", False):
        action = 'store_false'
        default = True
        helpstr = ('Consider all hits. By default, only hits with path '
                   'assignments are used.')
    else:
        action = 'store_true'
        default = False
        helpstr = ('Ignore hits with no entry in pathway map (-m). By '
                   'default all hits are used and if the best hit(s) '
                   'is(are) to sequences with no path, then the read will '
                   'not be assigned to a path')
    pgroup.add_argument("-r", "--filter_for_path", action=action,
                        dest="mappedHitsOnly", default=default,
                        help=helpstr)

    add_pathways_argument(pgroup, defaults)
    # BUG FIX: removed trailing parser.add_argument_group(pgroup) -- that
    # method takes a title/description, not a group object, and pgroup was
    # already created on (and registered with) this parser above.
def main():
    """Filter one or more blast hit tables, writing passing hits (or GFF)."""
    description = """
    Take a blast result table and output a subset of hits based on the
    chosen filtering options. If more than one blast file given, use -O
    to get multiple output files, otherwise all output data will be
    concatenated into one output.
    """

    # command line arguments
    parser = argparse.ArgumentParser(
        description=description,
        conflict_handler='resolve')
    add_hit_table_arguments(parser, flags='all')
    # keep the historical misspelled flag (--outfilenome) as an alias for
    # backwards compatibility, but advertise the corrected spelling
    parser.add_argument(
        "-o", "--outfilename", "--outfilenome", dest="outfilename",
        default=None, metavar="OUTFILENAME",
        help="Write masked fasta output to OUTFILENAME.")
    parser.add_argument(
        '-O', '--autoOutName', default=False,
        action='store_true',
        help="Automatically generate output file name from input name "
             "and options. Overridden by -o, cannot be used with data "
             "from STDIN.")
    parser.add_argument('-G', '--gff', default=False, action='store_true',
                        help="output GFF format instead of input format")
    # BUG FIX: FileType('rU') raises ValueError on Python 3.11+ ('U' mode
    # was removed); plain text-mode 'r' has universal newlines by default.
    parser.add_argument('hit_table', nargs='*',
                        type=argparse.FileType('r'),
                        default=[sys.stdin, ],
                        help="Table of search results to be filtered. "
                             "If absent, data will be read from STDIN")

    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check that we have blast file as argument

    # if we're not doing auto file names, write all outputs to same file
    if not arguments.autoOutName:
        if arguments.outfilename is not None:
            logging.info("Writing data to %s" % (arguments.outfilename))
            outfile_handle = open(arguments.outfilename, 'w')
        else:
            logging.info("writing data to STDOUT")
            outfile_handle = sys.stdout

    if arguments.gff:
        logging.info("Converting to GFF")

    # loop over inputs
    for infile_handle in arguments.hit_table:
        logging.info("reading data from %s" % (infile_handle.name))
        if arguments.autoOutName:
            outfile_handle = open(
                getOutputFile(infile_handle.name, arguments), 'w')

        # filter
        params = FilterParams.create_from_arguments(arguments)
        filterM8(infile_handle, outfile_handle, params,
                 to_gff=arguments.gff)

        # only per-input outputs are closed here; a shared output file stays
        # open across inputs (and is flushed at interpreter exit)
        if arguments.autoOutName:
            outfile_handle.close()
        infile_handle.close()
def main():
    """Aggregate hits per query/reference pair into a summary table.

    For each (query, reference) pair, overlapping hits are merged (unless
    disabled) and one line of "query<TAB>ref<TAB>length<TAB>score<TAB>pctid"
    is written to the output.
    """
    # command line arguments
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler='resolve')
    # default to non-overlapping=0
    add_hit_table_arguments(parser, flags='all',
                            defaults={'nonoverlapping': 0})
    # keep the historical misspelled flag (--outfilenome) as an alias for
    # backwards compatibility, but advertise the corrected spelling
    parser.add_argument("-o", "--outfilename", "--outfilenome",
                        dest="outfilename",
                        default=None, metavar="OUTFILENAME",
                        help="Write masked fasta output to OUTFILENAME.")
    # BUG FIX: FileType('rU') raises ValueError on Python 3.11+ ('U' mode
    # was removed); plain text-mode 'r' has universal newlines by default.
    parser.add_argument('hit_table', nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="Table of search results to be filtered. "
                             "If absent, data will be read from STDIN")
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # output file or STDOUT
    if arguments.outfilename is not None:
        logging.info("Writing data to %s" % (arguments.outfilename))
        outfile_handle = open(arguments.outfilename, 'w')
    else:
        logging.info("writing data to STDOUT")
        outfile_handle = sys.stdout

    # input file or STDIN (handled by argparse)
    infile_handle = arguments.hit_table
    logging.info("reading data from %s" % (infile_handle.name))

    # filter, but don't apply nonoverlapping yet
    # non-overlapping should be applied per-reference only
    params = FilterParams.create_from_arguments(arguments)
    # save user supplied value for later
    overlap_buffer = params.nonoverlapping
    # turn off for now
    params.set_nonoverlapping(-1)

    # merge
    hit_iter = filterM8Stream(infile_handle, params, return_lines=False)
    for query, query_hits in hit_iter:
        # group by reference hit
        hits_by_ref = defaultdict(list)
        for hit in query_hits:
            hits_by_ref[hit.hit].append(hit)

        # one output for query/reference pair
        for ref, ref_hits in hits_by_ref.items():
            # remove overlaps unless the buffer has been set to <0
            # NOTE(review): buffer=params.nonoverlapping reads the value
            # after set_nonoverlapping(-1); possibly overlap_buffer was
            # intended here -- confirm against set_nonoverlapping().
            if overlap_buffer >= 0:
                ref_hits = remove_overlapping_hits(
                    ref_hits, on_hit=True, buffer=params.nonoverlapping)
                ref_hits = remove_overlapping_hits(
                    ref_hits, on_hit=False, buffer=params.nonoverlapping)

            # aggregate values
            length, score, identities = 0, 0, 0
            for hit in ref_hits:
                length += hit.mlen
                score += hit.score
                try:
                    # this will be off by 100x
                    identities += hit.pctid * hit.mlen
                # BUG FIX: narrowed from a bare except that hid real errors
                # (even KeyboardInterrupt); only the absent/None pctid column
                # cases are expected here.
                except (AttributeError, TypeError):
                    # just report pctid=0 if no pctid column in input
                    pass

            # guard against zero total length (empty ref_hits after merging)
            outfile_handle.write(
                "%s\t%s\t%d\t%d\t%0.2f\n" %
                (query, ref, length, score,
                 identities / length if length else 0.0))

    outfile_handle.close()
    infile_handle.close()
def main():
    """Extract sequence fragments from a fasta file based on hit-table regions."""
    description = __doc__

    # command line options
    # BUG FIX: description must be passed by keyword -- the first positional
    # parameter of ArgumentParser is 'prog', so the original call set the
    # program name to the doc text and left the description empty.
    parser = argparse.ArgumentParser(description=description,
                                     conflict_handler='resolve')
    parser.add_argument("input_files", nargs=1,
                        default=[], metavar="INFILE",
                        help="Hit table to process")
    parser.add_argument(
        "-o", "--outfile", dest="outfile", metavar="OUTFILE",
        help="Write masked fasta output to OUTFILE (default is STDOUT).")
    parser.add_argument(
        "-i", "--infile", dest="fasta", metavar="FILE",
        help=" File containing the fasta (defaults to STDIN)")
    parser.add_argument(
        "-M", "--mask", dest="keep", default=True, action="store_false",
        help="Return unmatched sequence fragments instead of hits.")
    parser.add_argument("-m", "--minLength", dest="minLength",
                        type=int, metavar="BASES", default=1,
                        help="minimum number of bases for sequences in output")
    parser.add_argument(
        "-n", "--numbering_prefix", default=None,
        help="If given, name extracted sequence with this string followed "
             "by a simple counting index of all extracted sequences. For "
             "example, -n \"r\" would add _r1 to the end of the first "
             "extracted sequence and _r2 to the second, and so on. By "
             "default, extracted sequences are named with start_end "
             "positions.")
    parser.add_argument(
        "-t", "--translate", default=False, action='store_true',
        help="Translate to Amino Acid sequences")

    add_hit_table_arguments(parser, flags='all')

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check that we have blast file as argument
    if len(arguments.input_files) != 1:
        parser.error(
            "Please supply the name of a hit table as the only argument")
    blastFile = arguments.input_files[0]

    # set up input/output streams
    if arguments.fasta is None:
        fastaHandle = sys.stdin
        fastaStr = 'STDIN'
    else:
        # BUG FIX: mode "rU" is invalid on Python 3.11+ ('U' was removed);
        # text mode "rt" already uses universal newlines
        fastaHandle = open(arguments.fasta, "rt")
        fastaStr = arguments.fasta
    logging.info(
        "Extracting sequence fragments from %s based on hits in %s" %
        (fastaStr, blastFile))

    if arguments.outfile is None:
        logging.info("Writing %s sequences to STDOUT" % ('fasta'))
        outputHandle = sys.stdout
    else:
        logging.info(
            "Writing %s sequences to %s" % ('fasta', arguments.outfile))
        outputHandle = open(arguments.outfile, 'w')

    # load hit regions
    if arguments.keep:
        minHitLength = arguments.minLength
    else:
        minHitLength = 1
    readHits = loadHitRegions(blastFile, minHitLength, arguments)
    logging.info("Found hits for %d reads" % (len(readHits)))

    # process the fasta file with hits
    extractHits(fastaHandle, outputHandle, readHits,
                arguments.translate, arguments.minLength,
                arguments.keep, arguments.numbering_prefix)
def main():
    """Extract sequence fragments from a fasta file based on hit-table regions."""
    description = __doc__

    # command line options
    # BUG FIX: description must be passed by keyword -- the first positional
    # parameter of ArgumentParser is 'prog', so the original call set the
    # program name to the doc text and left the description empty.
    parser = argparse.ArgumentParser(description=description,
                                     conflict_handler='resolve')
    parser.add_argument("input_files", nargs=1,
                        default=[], metavar="INFILE",
                        help="Hit table to process")
    parser.add_argument(
        "-o", "--outfile", dest="outfile", metavar="OUTFILE",
        help="Write masked fasta output to OUTFILE (default is STDOUT).")
    parser.add_argument(
        "-i", "--infile", dest="fasta", metavar="FILE",
        help=" File containing the fasta (defaults to STDIN)")
    parser.add_argument(
        "-M", "--mask", dest="keep", default=True, action="store_false",
        help="Return unmatched sequence fragments instead of hits.")
    parser.add_argument("-m", "--minLength", dest="minLength",
                        type=int, metavar="BASES", default=1,
                        help="minimum number of bases for sequences in output")
    parser.add_argument(
        "-n", "--numbering_prefix", default=None,
        help="If given, name extracted sequence with this string followed "
             "by a simple counting index of all extracted sequences. For "
             "example, -n \"r\" would add _r1 to the end of the first "
             "extracted sequence and _r2 to the second, and so on. By "
             "default, extracted sequences are named with start_end "
             "positions.")
    parser.add_argument(
        "-t", "--translate", default=False, action='store_true',
        help="Translate to Amino Acid sequences")

    add_hit_table_arguments(parser, flags='all')

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check that we have blast file as argument
    if len(arguments.input_files) != 1:
        parser.error(
            "Please supply the name of a hit table as the only argument")
    blastFile = arguments.input_files[0]

    # set up input/output streams
    if arguments.fasta is None:
        fastaHandle = sys.stdin
        fastaStr = 'STDIN'
    else:
        fastaHandle = open(arguments.fasta, "rt")
        fastaStr = arguments.fasta
    logging.info(
        "Extracting sequence fragments from %s based on hits in %s" %
        (fastaStr, blastFile))

    if arguments.outfile is None:
        logging.info("Writing %s sequences to STDOUT" % ('fasta'))
        outputHandle = sys.stdout
    else:
        logging.info(
            "Writing %s sequences to %s" % ('fasta', arguments.outfile))
        outputHandle = open(arguments.outfile, 'w')

    # load hit regions
    if arguments.keep:
        minHitLength = arguments.minLength
    else:
        minHitLength = 1
    readHits = loadHitRegions(blastFile, minHitLength, arguments)
    logging.info("Found hits for %d reads" % (len(readHits)))

    # process the fasta file with hits
    extractHits(fastaHandle, outputHandle, readHits,
                arguments.translate, arguments.minLength,
                arguments.keep, arguments.numbering_prefix)