def loadHitRegions(blastFile, minLength, options):
    """
    Parse a hit table into a map from read names to lists of
    (start, end, annot)
    """
    hitMap = {}
    with InputFile(blastFile) as m8stream:
        params = FilterParams.create_from_arguments(options)
        hitcount = 0
        readcount = 0
        keepcount = 0
        for (read, hits) in filterM8Stream(m8stream, params,
                                           return_lines=False):
            readcount += 1
            hitTuples = []
            for hit in hits:
                hitcount += 1
                if abs(hit.qstart - hit.qend) + 1 < minLength:
                    continue
                keepcount += 1

                if hit.format == GFF:
                    annot = "# %d # %d # %s # %s;evalue=%s" % (
                        hit.qstart, hit.qend, hit.strand,
                        hit.hitDesc, hit.evalue)
                else:
                    try:
                        annot = "%s [%d,%d] %0.1f%% %d bits" % (
                            hit.hit, hit.hstart, hit.hend,
                            hit.pctid, hit.score)
                    except AttributeError:
                        annot = "%s [%d,%d] score: %d" % (
                            hit.hit, hit.hstart, hit.hend, hit.score)

                if hit.format == GFF:
                    reverse = hit.strand != "+"
                else:
                    reverse = hit.hstart > hit.hend

                if reverse:
                    # reverse if hit is backwards
                    hitTuples.append((hit.qend, hit.qstart, annot))
                else:
                    hitTuples.append((hit.qstart, hit.qend, annot))
            hitMap[read] = hitTuples

        logging.debug(
            "Kept %d of %d hits from %d lines to %d reads" %
            (keepcount, hitcount, m8stream.lines, readcount))
    return hitMap
def loadHitRegions(blastFile, minLength, options):
    """
    Parse a hit table into a map from read names to lists of
    (start, end, annot)
    """
    hitMap = {}
    params = FilterParams.create_from_arguments(options)
    m8stream = M8Stream(blastFile)
    hitcount = 0
    readcount = 0
    keepcount = 0
    for (read, hits) in filterM8Stream(m8stream, params, returnLines=False):
        readcount += 1
        hitTuples = []
        for hit in hits:
            hitcount += 1
            if abs(hit.qstart - hit.qend) + 1 < minLength:
                continue
            keepcount += 1

            if hit.format == GFF:
                annot = "# %d # %d # %s # %s;evalue=%s" % (
                    hit.qstart, hit.qend, hit.strand,
                    hit.hitDesc, hit.evalue)
            else:
                try:
                    annot = "%s [%d,%d] %0.1f%% %d bits" % (
                        hit.hit, hit.hstart, hit.hend,
                        hit.pctid, hit.score)
                except AttributeError:
                    annot = "%s [%d,%d] score: %d" % (
                        hit.hit, hit.hstart, hit.hend, hit.score)

            if hit.format == GFF:
                reverse = hit.strand != "+"
            else:
                reverse = hit.hstart > hit.hend

            if reverse:
                # reverse if hit is backwards
                hitTuples.append((hit.qend, hit.qstart, annot))
            else:
                hitTuples.append((hit.qstart, hit.qend, annot))
        hitMap[read] = hitTuples

    logging.debug(
        "Kept %d of %d hits to %d reads" %
        (keepcount, hitcount, readcount))
    return hitMap
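# A minimal usage sketch (not part of the original source) showing how the
# hitMap returned by loadHitRegions might be consumed, here to mask the
# matched region of each read with N's. Biopython's SeqIO is an assumed
# dependency, and coordinates are taken as 1-based and inclusive
# (BLAST-style). Reversed hits store their coordinates high-to-low, so they
# are normalized before masking.
from Bio import SeqIO


def mask_hit_regions(fasta_file, hit_map):
    """ Yield (read_name, masked_sequence) for every read in fasta_file """
    for record in SeqIO.parse(fasta_file, "fasta"):
        seq = list(str(record.seq))
        for start, end, annot in hit_map.get(record.id, []):
            # normalize reversed coordinates before masking
            left, right = sorted((start, end))
            for pos in range(left - 1, right):
                seq[pos] = "N"
        yield record.id, "".join(seq)

# e.g.: for name, masked in mask_hit_regions("reads.fasta", hitMap): ...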
def main():
    description = """
    Take a blast result table and output a subset of hits based on the
    chosen filtering options. If more than one blast file is given, use -O
    to get multiple output files, otherwise all output data will be
    concatenated into one output.
    """

    # command line arguments
    parser = argparse.ArgumentParser(description=description,
                                     conflict_handler='resolve')
    add_hit_table_arguments(parser, flags='all')
    parser.add_argument(
        "-o", "--outfilename", dest="outfilename",
        default=None, metavar="OUTFILENAME",
        help="Write filtered output to OUTFILENAME.")
    parser.add_argument(
        '-O', '--autoOutName', default=False, action='store_true',
        help="Automatically generate output file name from input name "
             "and options. Overridden by -o, cannot be used with data "
             "from STDIN.")
    parser.add_argument('-G', '--gff', default=False, action='store_true',
                        help="output GFF format instead of input format")
    parser.add_argument('hit_table', nargs='*',
                        type=argparse.FileType('rU'),
                        default=[sys.stdin, ],
                        help="Table of search results to be filtered. "
                             "If absent, data will be read from STDIN")

    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check that we have blast file as argument

    # if we're not doing auto file names, write all outputs to same file
    if not arguments.autoOutName:
        if arguments.outfilename is not None:
            logging.info("Writing data to %s" % (arguments.outfilename))
            outfile_handle = open(arguments.outfilename, 'w')
        else:
            logging.info("writing data to STDOUT")
            outfile_handle = sys.stdout

    if arguments.gff:
        logging.info("Converting to GFF")

    # loop over inputs
    for infile_handle in arguments.hit_table:
        logging.info("reading data from %s" % (infile_handle.name))
        if arguments.autoOutName:
            outfile_handle = open(getOutputFile(infile_handle.name,
                                                arguments), 'w')

        # filter
        params = FilterParams.create_from_arguments(arguments)
        filterM8(infile_handle, outfile_handle, params,
                 to_gff=arguments.gff)

        if arguments.autoOutName:
            outfile_handle.close()
        infile_handle.close()
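# Example invocations of the filtering script above (the script file name,
# filter_blast_m8.py, is an assumption; substitute whatever file this main()
# lives in). Filtering flags added by add_hit_table_arguments are omitted:
#
#   filter_blast_m8.py -o filtered.m8 hits.m8       # one combined output
#   filter_blast_m8.py -O -G hits1.m8 hits2.m8      # auto-named GFF output per input
#   cat hits.m8 | filter_blast_m8.py > filtered.m8  # read STDIN, write STDOUT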
def main():
    # command line arguments
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler='resolve')

    # default to non-overlapping=0
    add_hit_table_arguments(parser, flags='all',
                            defaults={'nonoverlapping': 0})
    parser.add_argument(
        "-o", "--outfilename", dest="outfilename",
        default=None, metavar="OUTFILENAME",
        help="Write aggregated hit table to OUTFILENAME.")
    parser.add_argument('hit_table', nargs='?',
                        type=argparse.FileType('rU'),
                        default=sys.stdin,
                        help="Table of search results to be filtered. "
                             "If absent, data will be read from STDIN")

    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # output file or STDOUT
    if arguments.outfilename is not None:
        logging.info("Writing data to %s" % (arguments.outfilename))
        outfile_handle = open(arguments.outfilename, 'w')
    else:
        logging.info("writing data to STDOUT")
        outfile_handle = sys.stdout

    # input file or STDIN (handled by argparse)
    infile_handle = arguments.hit_table
    logging.info("reading data from %s" % (infile_handle.name))

    # filter, but don't apply nonoverlapping yet
    # non-overlapping should be applied per-reference only
    params = FilterParams.create_from_arguments(arguments)

    # save user supplied value for later
    overlap_buffer = params.nonoverlapping
    # turn off for now
    params.set_nonoverlapping(-1)

    # merge
    hit_iter = filterM8Stream(infile_handle, params, return_lines=False)
    for query, query_hits in hit_iter:
        # group by reference hit
        hits_by_ref = defaultdict(list)
        for hit in query_hits:
            hits_by_ref[hit.hit].append(hit)

        # one output line for each query/reference pair
        for ref, ref_hits in hits_by_ref.items():

            # remove overlaps unless the buffer has been set to <0,
            # using the user-supplied buffer saved above
            if overlap_buffer >= 0:
                ref_hits = remove_overlapping_hits(
                    ref_hits, on_hit=True, buffer=overlap_buffer)
                ref_hits = remove_overlapping_hits(
                    ref_hits, on_hit=False, buffer=overlap_buffer)

            # aggregate values
            length, score, identities = 0, 0, 0
            for hit in ref_hits:
                length += hit.mlen
                score += hit.score
                try:
                    # pctid is a percentage, so this sum is 100x the
                    # identity count; dividing by length below yields the
                    # length-weighted percent identity
                    identities += hit.pctid * hit.mlen
                except AttributeError:
                    # just report pctid=0 if no pctid column in input
                    pass

            outfile_handle.write(
                "%s\t%s\t%d\t%d\t%0.2f\n" %
                (query, ref, length, score, identities / length))

    outfile_handle.close()
    infile_handle.close()
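# A small worked example (illustrative numbers, not from the original source)
# of the aggregation above: summing pctid * mlen and dividing by the total
# aligned length gives a length-weighted mean percent identity, matching the
# last column written for each query/reference pair.
example_hits = [
    {"mlen": 100, "score": 180, "pctid": 90.0},
    {"mlen": 50, "score": 60, "pctid": 60.0},
]
length = sum(h["mlen"] for h in example_hits)        # 150
score = sum(h["score"] for h in example_hits)        # 240
identities = sum(h["pctid"] * h["mlen"] for h in example_hits)
print("%d\t%d\t%0.2f" % (length, score, identities / length))
# prints: 150	240	80.00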