def countHits(infile, **kwargs):
    """
    Count hits from a hit table.

    Calls edl.hits.parseM8FileIter with the following optional parameters:
        hitStringMap (None): dictionary (or file) mapping hit IDs to
            something else
        format (GENE): hit table format
        filter_top_pct (0): only consider hits within this % of top score
            for each read
        parseStyle (ACCS): how to process hit data into an identifying
            string
        countMethod ('all'): how to resolve hits to multiple sequences
        taxonomy (None): An edl.taxon.Taxonomy object or directory
            containing taxdmp
        rank (None): Maximum rank to resolve hits

    Other keyword arguments consumed here:
        namesMap (False): passed to readTaxonomy; only read when taxonomy
            is given as a string
        valueType: passed to parseMapFile; only read when hitStringMap is
            given as a string. Defaults to int when a taxonomy is in use
            and to None otherwise.
        allMethod (ALLEQ): passed to countIterHits

    infile may be a file name (opened and closed here) or an already open
    handle (left open for the caller).

    Returns the counts element of the (total, counts) pair produced by
    countIterHits.
    """
    # if taxonomy or hitStringMap are file names, parse them
    taxonomy = kwargs.pop('taxonomy', None)
    if isinstance(taxonomy, str):
        taxonomy = readTaxonomy(taxonomy,
                                namesMap=kwargs.pop('namesMap', False))

    hitStringMap = kwargs.pop('hitStringMap', None)
    if isinstance(hitStringMap, str):
        if taxonomy is not None:
            # the mapped hit ids will need to be ints
            valueType = kwargs.pop('valueType', int)
        else:
            valueType = kwargs.pop('valueType', None)
        hitStringMap = parseMapFile(hitStringMap, valueType=valueType)

    # if infile is name (and not handle), open as a handle
    opened_here = isinstance(infile, str)
    inhandle = open(infile) if opened_here else infile

    try:
        # get iterator over reads that will parse hits
        hitIter = parseM8FileIter(
            inhandle,
            hitStringMap,
            FilterParams(
                format=kwargs.pop('format', GENE),
                top_pct=kwargs.pop('filter_top_pct', 0),
            ),
            kwargs.pop('parseStyle', ACCS),
            kwargs.pop('countMethod', 'all'),
            taxonomy=taxonomy,
            rank=kwargs.pop('rank', None))

        # count the hits
        (total, counts) = countIterHits(
            hitIter,
            allMethod=kwargs.pop('allMethod', ALLEQ),
            returnMap=False)

        logger.info("Total hits: %s" % total)
    finally:
        # only close what we opened, and do so even if parsing fails
        if opened_here:
            inhandle.close()

    return counts
def parseM8FileIter(
        inhandle,
        hitStringMap,
        format,
        scorePct,
        parsingStyle,
        countMethod,
        taxonomy=None,
        rank=None,
        ignoreEmptyHits=True,
        sortReads=False):
    """
    Build an iterator over (read, hits) tuples from a hit table handle.

    This chains three stages:
        filterM8Stream: filter and parse hits using format and scorePct
        processHits: translate hits (hitStringMap, parsingStyle,
            taxonomy, rank)
        applyCountMethod: resolve hits per read using countMethod

    If taxonomy is not None, hits will be TaxNode objects.
    countMethod can only be LCA if taxonomy given.

    A countMethod of 'first' turns off score filtering (scorePct is
    forced to -1).
    """
    # check filtering options
    if countMethod == 'first':
        scorePct = -1

    logger.info("Parsing hits")
    filter_options = FilterParams()
    filter_options.format = format

    # NOTE(review): nesting reconstructed from a whitespace-collapsed
    # source; only the log message is assumed to depend on
    # scorePct >= 0 alone -- confirm against the original file.
    if scorePct >= 0 or sortReads:
        if scorePct >= 0:
            # filter hits on score if requested
            logger.info(
                "Filtering for scores within %s pct of best" % scorePct)
        filter_options.topPct = scorePct
        filter_options.sort = 'score'
        filter_options.sortReads = sortReads
    filter_options.parseStyle = parsingStyle

    # stage 1: filter and parse the raw table
    parsed_hits = filterM8Stream(inhandle, filter_options,
                                 returnLines=False)

    # stage 2: org/acc translation, hit name mapping, taxon lookup
    translated_hits = processHits(
        parsed_hits,
        hitStringMap=hitStringMap,
        parseStyle=parsingStyle,
        taxonomy=taxonomy,
        rank=rank)

    # stage 3: apply count method
    return applyCountMethod(translated_hits, countMethod, ignoreEmptyHits)
def loadHitRegions(blastFile, minLength, options):
    """
    Read a hit table and return a dict mapping each read name to a list of
    (start, end, annotation) tuples.

    Hits whose query span (abs(qstart - qend) + 1) is shorter than
    minLength are dropped. For hits judged to be reversed (GFF strand
    other than "+", or hstart > hend for other formats) the tuple is
    stored as (qend, qstart, annotation).

    Filtering parameters are built from options with
    FilterParams.create_from_arguments.
    """
    regions_by_read = {}
    with InputFile(blastFile) as m8stream:
        params = FilterParams.create_from_arguments(options)
        n_hits = 0
        n_reads = 0
        n_kept = 0
        read_hit_iter = filterM8Stream(m8stream, params,
                                       return_lines=False)
        for read, hits in read_hit_iter:
            n_reads += 1
            regions = []
            for hit in hits:
                n_hits += 1
                query_span = abs(hit.qstart - hit.qend) + 1
                if query_span < minLength:
                    continue
                n_kept += 1

                # build the annotation text and decide orientation
                if hit.format == GFF:
                    annot = "# %d # %d # %s # %s;evalue=%s" % \
                        (hit.qstart, hit.qend, hit.strand,
                         hit.hitDesc, hit.evalue)
                    reverse = hit.strand != "+"
                else:
                    try:
                        annot = "%s [%d,%d] %0.1f%% %d bits" % \
                            (hit.hit, hit.hstart, hit.hend,
                             hit.pctid, hit.score)
                    except AttributeError:
                        # fall back when the hit lacks an attribute
                        # used above
                        annot = "%s [%d,%d] score: %d" % (
                            hit.hit, hit.hstart, hit.hend, hit.score)
                    reverse = hit.hstart > hit.hend

                # swap the query coordinates if hit is backwards
                if reverse:
                    region = (hit.qend, hit.qstart, annot)
                else:
                    region = (hit.qstart, hit.qend, annot)
                regions.append(region)
            regions_by_read[read] = regions

        logging.debug(
            "Kept %d of %d hits from %d lines to %d reads" %
            (n_kept, n_hits, m8stream.lines, n_reads))
    return regions_by_read
def loadHitRegions(blastFile, minLength, options):
    """
    Turn a hit table into a dict of read name -> list of
    (start, end, annot) tuples.

    Only hits covering at least minLength query positions
    (abs(qstart - qend) + 1) are kept. When a hit is considered reversed
    (GFF strand is not "+", or hstart > hend otherwise) its query
    coordinates are stored swapped.
    """
    params = FilterParams.create_from_arguments(options)
    m8stream = M8Stream(blastFile)

    hitMap = {}
    total_hits, total_reads, kept_hits = 0, 0, 0
    for read, hits in filterM8Stream(m8stream, params, returnLines=False):
        total_reads += 1
        read_regions = []
        for hit in hits:
            total_hits += 1
            if abs(hit.qstart - hit.qend) + 1 < minLength:
                # too short: counted as a hit but not kept
                continue
            kept_hits += 1

            # annotation text depends on the table format
            if hit.format == GFF:
                annot = "# %d # %d # %s # %s;evalue=%s" % (
                    hit.qstart, hit.qend, hit.strand,
                    hit.hitDesc, hit.evalue)
            else:
                try:
                    annot = "%s [%d,%d] %0.1f%% %d bits" % (
                        hit.hit, hit.hstart, hit.hend,
                        hit.pctid, hit.score)
                except AttributeError:
                    annot = "%s [%d,%d] score: %d" % (
                        hit.hit, hit.hstart, hit.hend, hit.score)

            # orientation test also depends on the table format
            if hit.format == GFF:
                reverse = hit.strand != "+"
            else:
                reverse = hit.hstart > hit.hend

            # reverse if hit is backwards
            first, second = (
                (hit.qend, hit.qstart) if reverse
                else (hit.qstart, hit.qend))
            read_regions.append((first, second, annot))
        hitMap[read] = read_regions

    logging.debug(
        "Kept %d of %d hits to %d reads" %
        (kept_hits, total_hits, total_reads))
    return hitMap
def parseM8FileIter(inhandle, hitStringMap, format, scorePct, parsingStyle,
                    countMethod, taxonomy=None, rank=None,
                    ignoreEmptyHits=True, sortReads=False):
    """
    Wrapper method that combines filterM8, parseHits, and process hits to:
        filter hits using format and scorePct
        map reads to hits using parseHits
        translate hits using processHits

    If taxonomy is not None, hits will be TaxNode objects
    countMethod can only be LCA if taxonomy given

    A countMethod of 'first' disables score filtering (scorePct is forced
    to -1).

    Return an iterator over (read,hits) tuples.
    """
    # check filtering options
    if countMethod == 'first':
        scorePct = -1

    # get map from reads to lists of hit strings
    logger.info("Parsing hits")
    options = FilterParams()
    options.format = format

    # NOTE(review): nesting reconstructed from a whitespace-collapsed
    # source; only the log message is assumed to depend on
    # scorePct >= 0 alone -- confirm against the original file.
    if scorePct >= 0 or sortReads:
        # filter hits on score if requested
        if scorePct >= 0:
            logger.info(
                "Filtering for scores within %s pct of best" % scorePct)
        options.topPct = scorePct
        options.sort = 'score'
        options.sortReads = sortReads

    # filters and parses
    options.parseStyle = parsingStyle
    hitIter = filterM8Stream(inhandle, options, returnLines=False)

    # apply org or acc translation
    # apply map of hit names if given
    # look up taxon node
    hitIter = processHits(hitIter,
                          hitStringMap=hitStringMap,
                          parseStyle=parsingStyle,
                          taxonomy=taxonomy,
                          rank=rank)

    # apply count method
    hitIter = applyCountMethod(hitIter, countMethod, ignoreEmptyHits)
    return hitIter
def main():
    """
    Command line entry point: filter one or more hit tables.

    Each input table is passed through filterM8 using parameters built
    from the command line. Without -O every input is written to a single
    destination (the -o file, or STDOUT). With -O each input gets its own
    output file named by getOutputFile.
    """
    description = (
        " Take a blast result table and output a subset of hits based on"
        " the chosen filtering options. If more than one blast file given,"
        " use -O to get multiple output files, otherwise all output data"
        " will be concatenated into one output. ")

    # command line arguments
    parser = argparse.ArgumentParser(description=description,
                                     conflict_handler='resolve')
    add_hit_table_arguments(parser, flags='all')
    parser.add_argument("-o", "--outfilenome", dest="outfilename",
                        default=None, metavar="OUTFILENAME",
                        help="Write masked fasta output to OUTFILENAME.")
    # NOTE(review): the help text says -O is overridden by -o, but the
    # code below lets -O win when both are given -- confirm the intent.
    parser.add_argument(
        '-O', '--autoOutName', default=False, action='store_true',
        help="Automatically generate output file name from input name "
             "and options. Overridden by -o, cannot be used with data "
             "from STDIN.")
    parser.add_argument('-G', '--gff', default=False, action='store_true',
                        help="output GFF format instead of input format")
    # plain 'r' mode: the 'U' flag was removed from open() in Python 3.11
    # and text mode already applies universal newlines in Python 3
    parser.add_argument('hit_table', nargs='*',
                        type=argparse.FileType('r'),
                        default=[sys.stdin, ],
                        help="Table of search results to be filtered. "
                             "If absent, data will be read from STDIN")
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # if we're not doing auto file names, write all outputs to same file
    shared_outfile = None
    opened_shared_outfile = False
    if not arguments.autoOutName:
        if arguments.outfilename is not None:
            logging.info("Writing data to %s" % (arguments.outfilename))
            shared_outfile = open(arguments.outfilename, 'w')
            opened_shared_outfile = True
        else:
            logging.info("writing data to STDOUT")
            shared_outfile = sys.stdout

    if arguments.gff:
        logging.info("Converting to GFF")

    try:
        # loop over inputs
        for infile_handle in arguments.hit_table:
            logging.info("reading data from %s" % (infile_handle.name))
            try:
                if arguments.autoOutName:
                    outfile_handle = open(
                        getOutputFile(infile_handle.name, arguments), 'w')
                else:
                    outfile_handle = shared_outfile

                try:
                    # filter
                    params = FilterParams.create_from_arguments(arguments)
                    filterM8(infile_handle, outfile_handle, params,
                             to_gff=arguments.gff)
                finally:
                    # per-input outputs are finished after each input
                    if arguments.autoOutName:
                        outfile_handle.close()
            finally:
                infile_handle.close()
    finally:
        # flush and release the -o file (STDOUT is left alone)
        if opened_shared_outfile:
            shared_outfile.close()
def main():
    """
    Filter hit tables named on the command line (or read from STDIN).

    Every input is run through filterM8 with a FilterParams object built
    from the parsed arguments. Output goes to one shared destination (the
    -o file or STDOUT) unless -O is given, in which case each input is
    written to the file name returned by getOutputFile.
    """
    description = (
        " Take a blast result table and output a subset of hits based on"
        " the chosen filtering options. If more than one blast file given,"
        " use -O to get multiple output files, otherwise all output data"
        " will be concatenated into one output. ")

    # command line arguments
    parser = argparse.ArgumentParser(
        description=description,
        conflict_handler='resolve')
    add_hit_table_arguments(parser, flags='all')
    parser.add_argument(
        "-o", "--outfilenome", dest="outfilename", default=None,
        metavar="OUTFILENAME",
        help="Write masked fasta output to OUTFILENAME.")
    # NOTE(review): the help text claims -o overrides -O, yet the logic
    # below ignores -o whenever -O is set -- confirm which is intended.
    parser.add_argument(
        '-O', '--autoOutName', default=False, action='store_true',
        help="Automatically generate output file name from input name "
             "and options. Overridden by -o, cannot be used with data "
             "from STDIN.")
    parser.add_argument(
        '-G', '--gff', default=False, action='store_true',
        help="output GFF format instead of input format")
    # 'rU' is rejected by open() from Python 3.11 on; 'r' gives the same
    # universal-newline text reading in Python 3
    parser.add_argument(
        'hit_table', nargs='*',
        type=argparse.FileType('r'),
        default=[sys.stdin, ],
        help="Table of search results to be filtered. "
             "If absent, data will be read from STDIN")
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    per_input_output = arguments.autoOutName

    # if we're not doing auto file names, write all outputs to same file
    common_out = None
    close_common_out = False
    if not per_input_output:
        if arguments.outfilename is not None:
            logging.info("Writing data to %s" % (arguments.outfilename))
            common_out = open(arguments.outfilename, 'w')
            close_common_out = True
        else:
            logging.info("writing data to STDOUT")
            common_out = sys.stdout

    if arguments.gff:
        logging.info("Converting to GFF")

    try:
        # loop over inputs
        for infile_handle in arguments.hit_table:
            logging.info("reading data from %s" % (infile_handle.name))
            try:
                if per_input_output:
                    outfile_handle = open(
                        getOutputFile(infile_handle.name, arguments),
                        'w')
                else:
                    outfile_handle = common_out

                try:
                    # filter
                    params = FilterParams.create_from_arguments(arguments)
                    filterM8(infile_handle, outfile_handle, params,
                             to_gff=arguments.gff)
                finally:
                    if per_input_output:
                        outfile_handle.close()
            finally:
                infile_handle.close()
    finally:
        # the -o file was previously never closed; STDOUT stays open
        if close_common_out:
            common_out.close()
def main():
    """
    Command line entry point (optparse version): filter hit tables.

    With zero or one positional argument the input is STDIN or the named
    file, and output goes to the -o file, else to an automatically named
    file if -O is set, else to STDOUT.

    With several positional arguments each file is filtered in turn. The
    output is one shared destination (-o file or STDOUT) unless -O is
    set, in which case each input gets its own getOutputFile name.

    Files opened here are closed here, even if filtering fails; STDIN and
    STDOUT are left open.
    """
    usage = "usage: %prog [OPTIONS] BLAST_FILE"
    description = (
        " Take a blast result table and output a subset of hits based on"
        " the chosen filtering options. If more than one blast file given,"
        " use -O to get multiple output files, otherwise all output data"
        " will be concatenated into one output. ")

    # command line options
    parser = OptionParser(usage, description=description,
                          conflict_handler='resolve')
    addHitTableOptions(parser, flags='all')
    parser.add_option("-o", "--outfilenome", dest="outfilename",
                      default=None, metavar="OUTFILENAME",
                      help="Write masked fasta output to OUTFILENAME.")
    parser.add_option(
        '-O', '--autoOutName', default=False, action='store_true',
        help="Automatically generate output file name from input name "
             "and options. Overridden by -o, cannot be used with data "
             "from STDIN.")
    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    if options.hitTableFormat == 'last':
        if options.hitTableSort == 'evalue':
            parser.error("The last format has no evalue to sort by, sorry")

    if len(args) <= 1:
        # input
        if len(args) == 1:
            infile = args[0]
            logging.info("reading data from %s" % (infile))
            # 'U' mode was removed from open() in Python 3.11; text mode
            # already handles universal newlines in Python 3
            instream = open(infile, 'r')
        else:
            infile = './stdin'
            logging.info("reading data from STDIN")
            instream = sys.stdin

        try:
            # output
            if options.outfilename is not None:
                logging.info("Writing data to %s" % (options.outfilename))
                outstream = open(options.outfilename, 'w')
            elif options.autoOutName:
                outfile = getOutputFile(infile, options)
                logging.info("Writing data to %s" % (outfile))
                outstream = open(outfile, 'w')
            else:
                logging.info("writing data to STDOUT")
                outstream = sys.stdout

            try:
                # filter
                params = FilterParams.createFromOptions(options)
                filterM8(instream, outstream, params)
            finally:
                if outstream is not sys.stdout:
                    outstream.close()
        finally:
            if instream is not sys.stdin:
                instream.close()
    else:
        # several inputs: share one output unless auto naming is on
        shared_outstream = None
        close_shared = False
        if not options.autoOutName:
            if options.outfilename is not None:
                logging.info("Writing data to %s" % (options.outfilename))
                shared_outstream = open(options.outfilename, 'w')
                close_shared = True
            else:
                logging.info("writing data to STDOUT")
                shared_outstream = sys.stdout

        try:
            for infilename in args:
                logging.info("reading data from %s" % (infilename))
                instream = open(infilename, 'r')
                try:
                    if options.autoOutName:
                        outstream = open(
                            getOutputFile(infilename, options), 'w')
                    else:
                        outstream = shared_outstream

                    try:
                        # filter
                        params = FilterParams.createFromOptions(options)
                        filterM8(instream, outstream, params)
                    finally:
                        if options.autoOutName:
                            outstream.close()
                finally:
                    instream.close()
        finally:
            if close_shared:
                shared_outstream.close()
def plotHitStats(axes,
                 sequenceFile,
                 hitsFile,
                 referenceLengths=None,
                 sequenceFormat='fasta',
                 bins=20,
                 hlog=False,
                 lengthRange=None,
                 barcolor='b',
                 baredgecolor='k',
                 hcolor='r',
                 params=None,
                 **kwargs):
    """
    Draw hit statistics, binned by sequence length, onto two or three
    matplotlib axes:

     * axes[0]: plotTranscriptHitRateByLengthBins (histogram of sequence
       lengths over the fraction of sequences in each bin with a hit)
     * axes[1]: plotTranscriptCoverageByLengthBins (same bins, total
       sequence bases over the fraction of bases covered by hits)
     * axes[2]: plotHitCoverageByLengthBins (fraction of reference bases
       used in hits); only drawn when referenceLengths is not None

    Positional Arguments:
     * axes: indexable collection of ax objects (2, or 3 if
       referenceLengths is given)
     * sequenceFile: fasta or similar file of sequence data
     * hitsFile: text hit table

    Parameters:
     * hit parsing
      * params=None: FilterParams object used to filter hits
      * **kwargs: used to create a FilterParams object if params is None
     * sequence parsing
      * sequenceFormat='fasta': handed to getSequenceLengths as format
      * referenceLengths=None: reference length data handed to
        plotHitCoverageByLengthBins (the original documentation says a
        dictionary of lengths or a reference fasta is accepted;
        that handling is not part of this function)
     * plotting (all forwarded unchanged to each plotting call):
      * bins=20: number of length bins to divide sequence data into
      * barcolor='b': color of data bars
      * baredgecolor='k': color of data bar edges
      * hcolor='r': color of histogram line and axis labels
      * lengthRange=None: can be used to force the x axis range
      * hlog=False: if True, histogram data plotted in log scale
    """
    # get sequence lengths
    lengths = getSequenceLengths(sequenceFile, format=sequenceFormat)

    # parse hit file, building filter parameters only if none supplied
    if params is None:
        params = FilterParams(**kwargs)
    hits = getSequenceHits(hitsFile, params)

    # every panel gets the same binning and styling keywords
    panel_options = {
        'bins': bins,
        'lengthRange': lengthRange,
        'barcolor': barcolor,
        'baredgecolor': baredgecolor,
        'hcolor': hcolor,
        'hlog': hlog,
    }

    # plot data
    plotTranscriptHitRateByLengthBins(axes[0], lengths, hits,
                                      **panel_options)
    plotTranscriptCoverageByLengthBins(axes[1], lengths, hits,
                                       **panel_options)
    if referenceLengths is None:
        return
    plotHitCoverageByLengthBins(axes[2], lengths, hits, referenceLengths,
                                **panel_options)
def main():
    """
    Command line entry point: merge the hits of each query/reference pair.

    Hits are read with filterM8Stream with the non-overlapping filter
    switched off, grouped per query by reference (hit.hit), optionally
    reduced with remove_overlapping_hits (on the hit and then on the
    query), and summed. One tab separated line is written per pair:
    query, reference, total mlen, total score, and the sum of
    pctid * mlen divided by total mlen.
    """
    # command line arguments
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler='resolve')

    # default to non-overlapping=0
    add_hit_table_arguments(parser, flags='all',
                            defaults={'nonoverlapping': 0})
    parser.add_argument("-o", "--outfilenome", dest="outfilename",
                        default=None, metavar="OUTFILENAME",
                        help="Write masked fasta output to OUTFILENAME.")
    # plain 'r' mode: the 'U' flag was removed from open() in Python 3.11
    parser.add_argument('hit_table', nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="Table of search results to be filtered. "
                             "If absent, data will be read from STDIN")

    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # output file or STDOUT
    if arguments.outfilename is not None:
        logging.info("Writing data to %s" % (arguments.outfilename))
        outfile_handle = open(arguments.outfilename, 'w')
    else:
        logging.info("writing data to STDOUT")
        outfile_handle = sys.stdout

    # input file or STDIN (handled by argparse)
    infile_handle = arguments.hit_table
    logging.info("reading data from %s" % (infile_handle.name))

    try:
        # filter, but don't apply nonoverlapping yet
        # non-overlapping should be applied per-reference only
        params = FilterParams.create_from_arguments(arguments)
        # save user supplied value for later
        overlap_buffer = params.nonoverlapping
        # turn off for now
        params.set_nonoverlapping(-1)

        # merge
        hit_iter = filterM8Stream(infile_handle, params,
                                  return_lines=False)
        for query, query_hits in hit_iter:
            # group by reference hit
            hits_by_ref = defaultdict(list)
            for hit in query_hits:
                hits_by_ref[hit.hit].append(hit)

            # one output for query/reference pair
            for ref, ref_hits in hits_by_ref.items():
                # remove overlaps unless the buffer has been set to <0
                if overlap_buffer >= 0:
                    # Use the saved user value here: params.nonoverlapping
                    # was switched off above (assuming set_nonoverlapping
                    # updates that attribute), so reading it back would
                    # discard the user's buffer.
                    ref_hits = remove_overlapping_hits(
                        ref_hits, on_hit=True, buffer=overlap_buffer)
                    ref_hits = remove_overlapping_hits(
                        ref_hits, on_hit=False, buffer=overlap_buffer)

                # aggregate values
                length, score, identities = 0, 0, 0
                for hit in ref_hits:
                    length += hit.mlen
                    score += hit.score
                    try:
                        # this will be off by 100x
                        identities += hit.pctid * hit.mlen
                    except (AttributeError, TypeError):
                        # just report pctid=0 if no pctid column in input
                        pass

                # guard against a zero total length instead of crashing
                mean_pctid = identities / length if length else 0
                outfile_handle.write(
                    "%s\t%s\t%d\t%d\t%0.2f\n" %
                    (query, ref, length, score, mean_pctid))
    finally:
        # release the handles even if parsing or writing fails
        outfile_handle.close()
        infile_handle.close()