示例#1
0
def countHits(infile, **kwargs):
    """
    Count hits from a hit table.

    infile may be a file name (opened and closed here) or an already open
    handle (left open for the caller).

    Calls edl.hits.parseM8FileIter with the following optional parameters:
        hitStringMap (None): dictionary (or file) mapping hit IDs to
        something else
        format (GENE): hit table format
        filter_top_pct (0): only consider hits within this % of top score for
        each read
        parseStyle (ACCS): how to process hit data into an identifying string
        countMethod ('all'): how to resolve hits to multiple sequences
        taxonomy (None): An edl.taxon.Taxonomy object or directory
        containing taxdmp
        rank (None): Maximum rank to resolve hits

    Additional optional parameters consumed here:
        namesMap (False): passed to readTaxonomy when taxonomy is a string
        valueType (int if a taxonomy is given, else None): passed to
        parseMapFile when hitStringMap is a string
        allMethod (ALLEQ): passed to countIterHits

    Returns the counts object produced by countIterHits.
    """

    # if taxonomy or hitStringMap are file names, parse them
    taxonomy = kwargs.pop('taxonomy', None)
    if isinstance(taxonomy, str):
        taxonomy = readTaxonomy(taxonomy,
                                namesMap=kwargs.pop('namesMap', False))
    hitStringMap = kwargs.pop('hitStringMap', None)
    if isinstance(hitStringMap, str):
        # with a taxonomy, the mapped hit ids will need to be ints
        defaultValueType = int if taxonomy is not None else None
        valueType = kwargs.pop('valueType', defaultValueType)
        hitStringMap = parseMapFile(hitStringMap, valueType=valueType)

    # if infile is name (and not handle), open as a handle
    openedHere = isinstance(infile, str)
    inhandle = open(infile) if openedHere else infile

    try:
        # get iterator over reads that will parse hits
        hitIter = parseM8FileIter(inhandle,
                                  hitStringMap,
                                  FilterParams(
                                      format=kwargs.pop('format', GENE),
                                      top_pct=kwargs.pop('filter_top_pct', 0),
                                  ),
                                  kwargs.pop('parseStyle', ACCS),
                                  kwargs.pop('countMethod', 'all'),
                                  taxonomy=taxonomy,
                                  rank=kwargs.pop('rank', None))

        # count the hits; the iterator is consumed here, so the handle has
        # to stay open until this call returns
        (total, counts) = countIterHits(hitIter,
                                        allMethod=kwargs.pop('allMethod',
                                                             ALLEQ),
                                        returnMap=False)
    finally:
        # only close handles we opened, even if parsing/counting failed
        if openedHere:
            inhandle.close()

    logger.info("Total hits: %s", total)

    return counts
示例#2
0
def parseM8FileIter(
        inhandle,
        hitStringMap,
        format,
        scorePct,
        parsingStyle,
        countMethod,
        taxonomy=None,
        rank=None,
        ignoreEmptyHits=True,
        sortReads=False):
    """
    Build the full hit-parsing pipeline for a hit table handle.

    Chains three steps:
        filterM8Stream: filter/parse hits using format and scorePct
        processHits: translate hits (hitStringMap, parsingStyle, taxonomy,
        rank)
        applyCountMethod: apply countMethod to each read's hits

    A countMethod of 'first' switches the score filter off (scorePct is
    forced to -1). A negative scorePct means no score filtering.

    Returns the iterator produced by applyCountMethod.
    """

    # the 'first' count method overrides any requested score cutoff
    if countMethod == 'first':
        scorePct = -1

    logger.info("Parsing hits")
    filterParams = FilterParams()
    filterParams.format = format

    # score filtering is only configured for non-negative cutoffs
    scoreFilterOn = scorePct >= 0
    if scoreFilterOn:
        logger.info(
            "Filtering for scores within %s pct of best" %
            scorePct)
        filterParams.topPct = scorePct
        filterParams.sort = 'score'
    if scoreFilterOn or sortReads:
        filterParams.sortReads = sortReads
    filterParams.parseStyle = parsingStyle

    # step 1: filter and parse the raw table
    parsedHits = filterM8Stream(inhandle, filterParams, returnLines=False)

    # step 2: translate hits (name map, parse style, taxonomy lookup)
    translatedHits = processHits(
        parsedHits,
        hitStringMap=hitStringMap,
        parseStyle=parsingStyle,
        taxonomy=taxonomy,
        rank=rank)

    # step 3: apply the count method
    return applyCountMethod(translatedHits, countMethod, ignoreEmptyHits)
def loadHitRegions(blastFile, minLength, options):
    """
    Read a hit table and collect, per read, the regions covered by hits.

    Hits whose query span (abs(qstart - qend) + 1) is shorter than minLength
    are dropped. Filtering parameters are built from options with
    FilterParams.create_from_arguments.

    Returns a dict mapping each read name to a list of (start, end, annot)
    tuples; start and end are swapped for hits flagged as reversed.
    """
    regionsByRead = {}
    with InputFile(blastFile) as m8stream:
        params = FilterParams.create_from_arguments(options)
        totalHits = 0
        totalReads = 0
        keptHits = 0
        readIter = filterM8Stream(m8stream, params, return_lines=False)
        for (read, hits) in readIter:
            totalReads += 1
            regions = []
            for hit in hits:
                totalHits += 1
                # skip hits that cover too little of the query
                if abs(hit.qstart - hit.qend) + 1 < minLength:
                    continue
                keptHits += 1

                if hit.format == GFF:
                    annot = "# %d # %d # %s # %s;evalue=%s" % (
                        hit.qstart, hit.qend,
                        hit.strand, hit.hitDesc, hit.evalue)
                    # GFF hits carry an explicit strand
                    reverse = hit.strand != "+"
                else:
                    try:
                        annot = "%s [%d,%d] %0.1f%% %d bits" % (
                            hit.hit, hit.hstart, hit.hend,
                            hit.pctid, hit.score)
                    except AttributeError:
                        # fall back to a shorter annotation when an
                        # attribute (e.g. pctid) is missing
                        annot = "%s [%d,%d] score: %d" % (
                            hit.hit, hit.hstart, hit.hend, hit.score)
                    # otherwise orientation comes from the hit coordinates
                    reverse = hit.hstart > hit.hend

                if reverse:
                    # reverse if hit is backwards
                    region = (hit.qend, hit.qstart, annot)
                else:
                    region = (hit.qstart, hit.qend, annot)
                regions.append(region)
            regionsByRead[read] = regions

        logging.debug(
            "Kept %d of %d hits from %d lines to %d reads" %
            (keptHits, totalHits, m8stream.lines, totalReads))
    return regionsByRead
示例#4
0
def loadHitRegions(blastFile, minLength, options):
    """
    Build a map from read names to lists of (start, end, annot) tuples.

    The hit table is wrapped in an M8Stream and filtered with parameters
    created from options. Hits with a query span (abs(qstart - qend) + 1)
    below minLength are ignored. For reversed hits the query coordinates
    are stored as (qend, qstart).
    """
    params = FilterParams.create_from_arguments(options)
    m8stream = M8Stream(blastFile)

    hitMap = {}
    nHits = nReads = nKept = 0
    for (read, hits) in filterM8Stream(m8stream, params, returnLines=False):
        nReads += 1
        hitTuples = []
        for hit in hits:
            nHits += 1
            longEnough = abs(hit.qstart - hit.qend) + 1 >= minLength
            if not longEnough:
                continue
            nKept += 1

            # annotation text and orientation depend on the hit format
            if hit.format == GFF:
                annot = "# %d # %d # %s # %s;evalue=%s" % (
                    hit.qstart, hit.qend, hit.strand, hit.hitDesc, hit.evalue)
                reverse = hit.strand != "+"
            else:
                try:
                    annot = "%s [%d,%d] %0.1f%% %d bits" % (
                        hit.hit, hit.hstart, hit.hend, hit.pctid, hit.score)
                except AttributeError:
                    # shorter annotation when an attribute is unavailable
                    annot = "%s [%d,%d] score: %d" % (
                        hit.hit, hit.hstart, hit.hend, hit.score)
                reverse = hit.hstart > hit.hend

            # reverse if hit is backwards
            if reverse:
                span = (hit.qend, hit.qstart)
            else:
                span = (hit.qstart, hit.qend)
            hitTuples.append(span + (annot,))
        hitMap[read] = hitTuples

    logging.debug(
        "Kept %d of %d hits to %d reads" %
        (nKept, nHits, nReads))
    return hitMap
示例#5
0
def parseM8FileIter(inhandle, hitStringMap, format, scorePct, parsingStyle,
                    countMethod, taxonomy=None, rank=None,
                    ignoreEmptyHits=True, sortReads=False):
    """
    Wrapper method that combines filterM8Stream, processHits, and
    applyCountMethod to:
        filter hits using format and scorePct
        translate hits using processHits
        resolve each read's hits using countMethod

    A countMethod of 'first' disables score filtering (scorePct is forced
    to -1); a negative scorePct means no score filtering.

    taxonomy and rank are passed through to processHits.

    Return the iterator produced by applyCountMethod.
    """

    # check filtering options
    if countMethod == 'first':
        scorePct = -1

    # get map from reads to lists of hit strings
    logger.info("Parsing hits")
    options = FilterParams()
    options.format = format
    if scorePct >= 0 or sortReads:
        # filter hits on score if requested
        if scorePct >= 0:
            logger.info("Filtering for scores within %s pct of best",
                        scorePct)
            options.topPct = scorePct
            options.sort = 'score'
        options.sortReads = sortReads
    options.parseStyle = parsingStyle

    # filters and parses
    hitIter = filterM8Stream(inhandle, options, returnLines=False)

    # apply org or acc translation
    # apply map of hit names if given
    # look up taxon node
    hitIter = processHits(hitIter,
                          hitStringMap=hitStringMap,
                          parseStyle=parsingStyle,
                          taxonomy=taxonomy,
                          rank=rank)

    # apply count method
    hitIter = applyCountMethod(hitIter, countMethod, ignoreEmptyHits)

    return hitIter
示例#6
0
def main():
    """
    Command line entry point: filter one or more hit tables.

    Inputs are the positional hit_table files (STDIN if none given). Output
    goes to, in order of precedence:
        -o OUTFILENAME: one file receiving the output for all inputs
        -O: one automatically named file per input (see getOutputFile)
        otherwise: STDOUT

    Filtering itself is done by filterM8 using parameters built from the
    parsed arguments.
    """
    description = """
    Take a blast result table and output a subset of hits based on the
    chosen filtering options. If more than one blast file given, use -O
    to get multiple output files, otherwise all output data will be
    concatenated into one output.
    """

    # command line arguments
    parser = argparse.ArgumentParser(description=description,
                                     conflict_handler='resolve')
    add_hit_table_arguments(parser, flags='all')
    parser.add_argument("-o",
                        "--outfilenome",
                        dest="outfilename",
                        default=None,
                        metavar="OUTFILENAME",
                        help="Write masked fasta output to OUTFILENAME.")
    parser.add_argument(
        '-O',
        '--autoOutName',
        default=False,
        action='store_true',
        help="Automatically generate output file name from input name "
        "and options. Overridden by -o, cannot be used with data "
        "from STDIN.")
    parser.add_argument('-G',
                        '--gff',
                        default=False,
                        action='store_true',
                        help="output GFF format instead of input format")
    # plain 'r': the 'U' flag was removed from open() in Python 3.11 and
    # text mode already handles universal newlines in Python 3
    parser.add_argument('hit_table',
                        nargs='*',
                        type=argparse.FileType('r'),
                        default=[
                            sys.stdin,
                        ],
                        help="Table of search results to be filtered. "
                        "If absent, data will be read from STDIN")

    add_universal_arguments(parser)

    arguments = parser.parse_args()

    setup_logging(arguments)

    # -o overrides -O, as promised by the help text of -O
    auto_name = arguments.autoOutName and arguments.outfilename is None

    # if we're not doing auto file names, write all outputs to same file
    shared_handle = None
    if not auto_name:
        if arguments.outfilename is not None:
            logging.info("Writing data to %s" % (arguments.outfilename))
            shared_handle = open(arguments.outfilename, 'w')
        else:
            logging.info("writing data to STDOUT")
            shared_handle = sys.stdout

    if arguments.gff:
        logging.info("Converting to GFF")

    try:
        # loop over inputs
        for infile_handle in arguments.hit_table:
            logging.info("reading data from %s" % (infile_handle.name))
            try:
                if auto_name:
                    outfile_handle = open(
                        getOutputFile(infile_handle.name, arguments), 'w')
                else:
                    outfile_handle = shared_handle

                try:
                    # filter
                    params = FilterParams.create_from_arguments(arguments)
                    filterM8(infile_handle,
                             outfile_handle,
                             params,
                             to_gff=arguments.gff)
                finally:
                    # per-input output files are closed after each input
                    if auto_name:
                        outfile_handle.close()
            finally:
                infile_handle.close()
    finally:
        # close the -o file (but never STDOUT) even if filtering failed
        if arguments.outfilename is not None:
            shared_handle.close()
示例#7
0
def main():
    """
    Filter hit tables from the command line.

    Reads every positional hit_table (or STDIN when none is given), runs
    filterM8 on it with parameters built from the parsed arguments, and
    writes the result to:
        the -o file if given (all inputs concatenated),
        else an automatically named file per input if -O is set,
        else STDOUT.
    """
    description = """
    Take a blast result table and output a subset of hits based on the
    chosen filtering options. If more than one blast file given, use -O
    to get multiple output files, otherwise all output data will be
    concatenated into one output.
    """

    # command line arguments
    parser = argparse.ArgumentParser(
        description=description,
        conflict_handler='resolve')
    add_hit_table_arguments(parser, flags='all')
    parser.add_argument(
        "-o",
        "--outfilenome",
        dest="outfilename",
        default=None,
        metavar="OUTFILENAME",
        help="Write masked fasta output to OUTFILENAME.")
    parser.add_argument(
        '-O',
        '--autoOutName',
        default=False,
        action='store_true',
        help="Automatically generate output file name from input name "
             "and options. Overridden by -o, cannot be used with data "
             "from STDIN.")
    parser.add_argument('-G', '--gff', default=False, action='store_true',
                        help="output GFF format instead of input format")
    # mode 'r' instead of 'rU': open() rejects the 'U' flag since
    # Python 3.11, and Python 3 text mode has universal newlines anyway
    parser.add_argument('hit_table', nargs='*',
                        type=argparse.FileType('r'), default=[sys.stdin, ],
                        help="Table of search results to be filtered. "
                             "If absent, data will be read from STDIN")

    add_universal_arguments(parser)

    arguments = parser.parse_args()

    setup_logging(arguments)

    # the help text states that -o overrides -O; honor that here
    use_auto_names = (arguments.autoOutName
                      and arguments.outfilename is None)
    writes_to_named_file = arguments.outfilename is not None

    # if we're not doing auto file names, write all outputs to same file
    common_output = None
    if writes_to_named_file:
        logging.info("Writing data to %s" % (arguments.outfilename))
        common_output = open(arguments.outfilename, 'w')
    elif not use_auto_names:
        logging.info("writing data to STDOUT")
        common_output = sys.stdout

    if arguments.gff:
        logging.info("Converting to GFF")

    try:
        # loop over inputs
        for infile_handle in arguments.hit_table:
            logging.info("reading data from %s" % (infile_handle.name))
            try:
                if use_auto_names:
                    outfile_handle = open(
                        getOutputFile(
                            infile_handle.name,
                            arguments),
                        'w')
                else:
                    outfile_handle = common_output

                try:
                    # filter
                    params = FilterParams.create_from_arguments(arguments)
                    filterM8(infile_handle, outfile_handle, params,
                             to_gff=arguments.gff)
                finally:
                    # each automatically named output belongs to one input
                    if use_auto_names:
                        outfile_handle.close()
            finally:
                infile_handle.close()
    finally:
        # release the -o file even on errors; STDOUT is left open
        if writes_to_named_file:
            common_output.close()
def main():
    """
    Command line entry point (optparse version): filter hit tables.

    Each positional argument is a hit table file; with no arguments data
    is read from STDIN. Output goes to, in order of precedence:
        -o OUTFILENAME: one file receiving the output for all inputs
        -O: one automatically named file per input (see getOutputFile)
        otherwise: STDOUT

    The same precedence is applied whether one or many inputs are given.
    """
    usage = "usage: %prog [OPTIONS] BLAST_FILE"
    description = """
    Take a blast result table and output a subset of hits based on the chosen filtering options. If more than one blast file given, use -O to get multiple output files, otherwise all output data will be concatenated into one output.
    """

    # command line options
    parser = OptionParser(usage, description=description,
                          conflict_handler='resolve')
    addHitTableOptions(parser, flags='all')
    parser.add_option("-o", "--outfilenome", dest="outfilename", default=None,
                      metavar="OUTFILENAME",
                      help="Write masked fasta output to OUTFILENAME.")
    parser.add_option('-O', '--autoOutName', default=False,
                      action='store_true',
                      help="Automatically generate output file name from input name and options. Overridden by -o, cannot be used with data from STDIN.")

    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    if options.hitTableFormat == 'last' and options.hitTableSort == 'evalue':
        parser.error("The last format has no evalue to sort by, sorry")

    # -o overrides -O (as stated in the help text) for any number of inputs
    autoName = options.autoOutName and options.outfilename is None

    # a single shared output stream unless every input gets its own file
    sharedOutstream = None
    if options.outfilename is not None:
        logging.info("Writing data to %s" % (options.outfilename))
        sharedOutstream = open(options.outfilename, 'w')
    elif not autoName:
        logging.info("writing data to STDOUT")
        sharedOutstream = sys.stdout

    # None stands for "read from STDIN"
    inputs = args if args else [None]

    try:
        for infile in inputs:
            if infile is None:
                # placeholder name, only used to build an automatic
                # output file name
                infile = './stdin'
                logging.info("reading data from STDIN")
                instream = sys.stdin
            else:
                logging.info("reading data from %s" % (infile))
                # plain 'r': the 'U' flag was removed from open() in
                # Python 3.11; Python 3 text mode has universal newlines
                instream = open(infile, 'r')

            try:
                if autoName:
                    outfile = getOutputFile(infile, options)
                    logging.info("Writing data to %s" % (outfile))
                    outstream = open(outfile, 'w')
                else:
                    outstream = sharedOutstream

                try:
                    # filter
                    params = FilterParams.createFromOptions(options)
                    filterM8(instream, outstream, params)
                finally:
                    # automatically named outputs belong to one input only
                    if autoName:
                        outstream.close()
            finally:
                # close files we opened, but leave STDIN alone
                if instream is not sys.stdin:
                    instream.close()
    finally:
        # close the -o file (never STDOUT), even if filtering failed
        if options.outfilename is not None:
            sharedOutstream.close()
示例#9
0
def plotHitStats(axes,
                 sequenceFile,
                 hitsFile,
                 referenceLengths=None,
                 sequenceFormat='fasta',
                 bins=20,
                 hlog=False,
                 lengthRange=None,
                 barcolor='b',
                 baredgecolor='k',
                 hcolor='r',
                 params=None,
                 **kwargs):
    """
    Draw hit statistics, binned by sequence length, onto the given axes.

    Plots produced:

     * axes[0]: plotTranscriptHitRateByLengthBins (per the original
       documentation: histogram of sequence lengths over the fraction of
       sequences in each bin that have a hit)
     * axes[1]: plotTranscriptCoverageByLengthBins (per the original
       documentation: total sequence bases over the fraction of bases
       covered by hits)
     * axes[2]: plotHitCoverageByLengthBins, only when referenceLengths is
       not None (per the original documentation: fraction of reference
       bases used in hits)

    Positional Arguments:
     * axes: indexable collection of axes objects; two are needed, three
       when referenceLengths is given
     * sequenceFile: sequence data, read with getSequenceLengths
     * hitsFile: hit table, read with getSequenceHits

    Parameters:
     * hit parsing
      * params=None FilterParams object used to filter hits
      * **kwargs used to create a FilterParams object if params is None
     * sequence parsing
      * sequenceFormat='fasta' passed to getSequenceLengths as format
      * referenceLengths=None passed on to plotHitCoverageByLengthBins;
        per the original documentation this is a dictionary of reference
        lengths or a fasta file of the reference sequences
     * plotting (forwarded unchanged to every plotting function):
      * bins=20
      * barcolor='b'
      * baredgecolor='k'
      * hcolor='r'
      * lengthRange=None
      * hlog=False
    """

    # sequence lengths first, then the (filtered) hits
    lengths = getSequenceLengths(sequenceFile, format=sequenceFormat)

    filterParams = FilterParams(**kwargs) if params is None else params
    hits = getSequenceHits(hitsFile, filterParams)

    # keyword arguments shared by all of the plotting calls
    plotOptions = {
        'bins': bins,
        'lengthRange': lengthRange,
        'barcolor': barcolor,
        'baredgecolor': baredgecolor,
        'hcolor': hcolor,
        'hlog': hlog,
    }

    plotTranscriptHitRateByLengthBins(axes[0], lengths, hits, **plotOptions)
    plotTranscriptCoverageByLengthBins(axes[1], lengths, hits, **plotOptions)

    # the third plot needs reference lengths
    if referenceLengths is None:
        return
    plotHitCoverageByLengthBins(axes[2],
                                lengths,
                                hits,
                                referenceLengths,
                                **plotOptions)
示例#10
0
def main():
    """
    Command line entry point: merge hits per query/reference pair.

    Reads a hit table (positional argument or STDIN), filters it with
    filterM8Stream, groups each query's hits by reference (hit.hit) and
    writes one tab separated line per query/reference pair:

        query, reference, summed mlen, summed score, identities / length

    Overlap removal is switched off during filtering and applied afterwards
    per reference (on both hit and query coordinates) using the user
    supplied nonoverlapping buffer; a negative buffer disables it.

    Output goes to the -o file if given, otherwise to STDOUT.
    """
    # command line arguments
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler='resolve')

    # default to non-overlapping=0
    add_hit_table_arguments(parser,
                            flags='all',
                            defaults={'nonoverlapping': 0})
    parser.add_argument("-o",
                        "--outfilenome",
                        dest="outfilename",
                        default=None,
                        metavar="OUTFILENAME",
                        help="Write masked fasta output to OUTFILENAME.")
    # plain 'r': the 'U' flag was removed from open() in Python 3.11 and
    # text mode already handles universal newlines in Python 3
    parser.add_argument('hit_table',
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="Table of search results to be filtered. "
                        "If absent, data will be read from STDIN")

    add_universal_arguments(parser)

    arguments = parser.parse_args()

    setup_logging(arguments)

    # output file or STDOUT
    if arguments.outfilename is not None:
        logging.info("Writing data to %s" % (arguments.outfilename))
        outfile_handle = open(arguments.outfilename, 'w')
    else:
        logging.info("writing data to STDOUT")
        outfile_handle = sys.stdout

    # input file or STDIN (handled by argparse)
    infile_handle = arguments.hit_table
    logging.info("reading data from %s" % (infile_handle.name))

    try:
        # filter, but don't apply nonoverlapping yet
        # non-overlapping should be applied per-reference only
        params = FilterParams.create_from_arguments(arguments)
        # save user supplied value for later
        overlap_buffer = params.nonoverlapping
        # turn off for now
        params.set_nonoverlapping(-1)

        # merge
        hit_iter = filterM8Stream(infile_handle, params, return_lines=False)
        for query, query_hits in hit_iter:
            # group by reference hit
            hits_by_ref = defaultdict(list)
            for hit in query_hits:
                hits_by_ref[hit.hit].append(hit)

            # one output for query/reference pair
            for ref, ref_hits in hits_by_ref.items():

                # remove overlaps unless the buffer has been set to <0
                if overlap_buffer >= 0:
                    # use the saved user value: params.nonoverlapping was
                    # switched off (-1) above
                    ref_hits = remove_overlapping_hits(
                        ref_hits, on_hit=True, buffer=overlap_buffer)
                    ref_hits = remove_overlapping_hits(
                        ref_hits, on_hit=False, buffer=overlap_buffer)

                # aggregate values
                length, score, identities = 0, 0, 0
                for hit in ref_hits:
                    length += hit.mlen
                    score += hit.score
                    try:
                        # presumably pctid is a percentage, so this sum is
                        # 100x the identity count; dividing by length below
                        # gives a length weighted pctid -- TODO confirm
                        identities += hit.pctid * hit.mlen
                    except (AttributeError, TypeError):
                        # just report pctid=0 if no pctid column in input
                        pass

                # avoid dividing by zero when no aligned length was found
                mean_pctid = identities / length if length else 0
                outfile_handle.write(
                    "%s\t%s\t%d\t%d\t%0.2f\n" %
                    (query, ref, length, score, mean_pctid))
    finally:
        # release handles even on errors; STDOUT itself is left open
        if outfile_handle is not sys.stdout:
            outfile_handle.close()
        infile_handle.close()