Example #1
def main():
    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument("contigs_fasta",
                        help="The contigs as a fasta files")
    parser.add_argument("output_file_prefix",
                        help="Three files are created with this prefix \
                              and one of hte suffixes: '.gff','.fna','.faa'")
    parser.add_argument("-r", "--rna-gff", action='append', default=[],
                        help="GFF file containing RNA annotations. \
                              May be specified multiple times for \
                              multiple files")
    parser.add_argument("-c", "--cds-gff", action='append', default=[],
                        help="GFF file containing RNA annotations. \
                              May be specified multiple times for \
                              multiple files")

    add_universal_arguments(parser)
    args = parser.parse_args()
    setup_logging(args)

    logging.info("Generating annotations for {contigs_fasta}".
                 format(**vars(args)))

    merge_gffs(args.rna_gff,
               args.cds_gff,
               args.contigs_fasta,
               args.output_file_prefix)
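
Every example on this page calls the same pair of helpers before doing any real work. Their actual implementation ships with the package these `main()` functions come from; the following is only a minimal sketch of what they might look like, assuming a single repeatable verbosity flag:

import argparse
import logging

def add_universal_arguments(parser):
    # Hypothetical sketch; the real helper may add more flags (e.g. --version)
    parser.add_argument("-v", "--verbose", action="count", default=0,
                        help="Increase log verbosity (repeatable)")

def setup_logging(args):
    # Map repeated -v flags onto levels: WARNING -> INFO -> DEBUG
    level = max(logging.DEBUG, logging.WARNING - 10 * args.verbose)
    logging.basicConfig(level=level,
                        format="%(asctime)s %(levelname)s: %(message)s")
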
Example #3
def main():
    description = __doc__

    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("-t", "--taxfile", dest="taxfile",
                      metavar="FILE", help="Read Silva ranks from FILE")
    parser.add_argument("-d","--dbType",default="silva",choices=['silva','pr2'],
            help="Which database are we importing: 'silva' (default) or 'pr2'")
    parser.add_argument("-f","--fastaout",default=None, metavar='FILE',
            help="Write fasta with modified headers to FILE. Only used for pr2")
    parser.add_argument("-i","--idStart", default=0, type=int,
            help="Start taxid counting with this number")
    parser.add_argument("-o", "--output_map", default="lastdb.tax",
            help="File (relative to OUTPUT_DIR) to write id->taxid map to. Defaults to lastdb.tax")
    parser.add_argument('silva_fasta', nargs=1, 
            metavar='SILVA_FASTA')
    parser.add_argument('output_dir', nargs=1, 
            metavar="OUTPUT_DIR", help="Directory in which to create files")

    # logging and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # name logger
    logger = logging.getLogger(sys.argv[0])

    fastafile = arguments.silva_fasta[0]
    dumpDir = arguments.output_dir[0]

    # parse the input files
    if arguments.dbType == 'pr2':
        rootNode, taxmap = buildPR2Tree(fastafile, 
                                        fastaout=arguments.fastaout,
                                        nextId=arguments.idStart,
                                        )
    else:
        if arguments.taxfile is None:
            parser.error("You must provide the tax file for Silva (-t)")
        rootNode, taxmap = buildSilvaTree(arguments.taxfile, fastafile, logger)

    nodesFile = os.path.join(dumpDir, 'nodes.dmp')
    namesFile = os.path.join(dumpDir, 'names.dmp')
    taxFile = os.path.join(dumpDir, arguments.output_map)

    logger.info("Writing taxonomy to %s and %s" % (nodesFile, namesFile))
    with open(nodesFile, 'w') as nodeFile, open(namesFile, 'w') as nameFile:
        writeDumpFiles(rootNode, nodeFile, nameFile)

    logger.info("Writing taxid table to %s" % (taxFile))
    with open(taxFile, 'w') as taxMapFile:
        for (hitid, taxid) in taxmap.items():
            taxMapFile.write("%s\t%s\n" % (hitid, taxid))
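
`writeDumpFiles` is imported from the same package. The NCBI taxdump convention it targets separates fields with "\t|\t"; a hedged sketch of the writer, assuming tree nodes carry `id`, `parent`, `rank`, `name`, and `children` attributes (all assumptions, not confirmed by the excerpt):

def writeDumpFiles(node, nodeFile, nameFile):
    # nodes.dmp columns: tax_id | parent tax_id | rank | ...
    parent_id = node.parent.id if node.parent is not None else node.id
    nodeFile.write("%s\t|\t%s\t|\t%s\t|\n" % (node.id, parent_id, node.rank))
    # names.dmp columns: tax_id | name_txt | unique name | name class |
    nameFile.write("%s\t|\t%s\t|\t\t|\tscientific name\t|\n"
                   % (node.id, node.name))
    for child in node.children:
        writeDumpFiles(child, nodeFile, nameFile)
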
Example #4
def main():
    """
    Sets up the command line interface
    """
    description = __doc__

    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('infile',
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help=("Input file (defaults to STDIN) containing "
                              "a value on each line"))
    parser.add_argument('outfile',
                        nargs='?',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="File to which to write histogram")
    parser.add_argument('-b', '--bins', type=int, default=50)
    parser.add_argument('-l', '--label', default="value")
    parser.add_argument('-w', '--width', default=75, type=int)
    parser.add_argument('-L', '--log', action='store_true')
    parser.add_argument('-W', '--max-label-width', type=int, default=10)

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    values = []
    for line in arguments.infile:
        try:
            values.append(float(line.strip()))
        except ValueError:
            if len(line.strip()) > 0:
                logging.warning("Skipping bad line:\n%s", line.strip())
    logging.info("Read in %d values", len(values))
    arguments.outfile.write(
        ascii_histogram(
            histogram(values, bins=arguments.bins),
            label=arguments.label,
            width=arguments.width,
            log=arguments.log,
            maxLabelWidth=arguments.max_label_width,
        ))
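
`histogram` and `ascii_histogram` come from elsewhere in the package. A self-contained sketch of the pair, assuming `histogram` mirrors `numpy.histogram`'s `(counts, bin_edges)` return value; the real renderer's output format may differ:

import math
import numpy

def histogram(values, bins=50):
    # Thin wrapper returning (counts, bin_edges)
    return numpy.histogram(values, bins=bins)

def ascii_histogram(hist, label="value", width=75, log=False, maxLabelWidth=10):
    counts, edges = hist
    heights = [math.log(c + 1) if log else c for c in counts]
    scale = float(width - maxLabelWidth - 2) / max(max(heights), 1)
    lines = ["%s:" % label]
    for count, height, left_edge in zip(counts, heights, edges):
        bar = "#" * int(height * scale)
        lines.append("%*.4g %s %d" % (maxLabelWidth, left_edge, bar, count))
    return "\n".join(lines) + "\n"
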
Example #5
def main():
    """ The CLI """
    description = """
Takes a hit table (reads searched against a database) and assigns
each read to a taxon. Hit table may be specified with -i or piped to STDIN.

    Notes:

     * Specifying a top score percent (-F) will force hits to be sorted
       by score within each read. However, it is assumed that the hits in
       the input table(s) are already grouped by read. This program does
       not attempt to sort the entire input.
    """
    parser = argparse.ArgumentParser(description=description)
    util.add_IO_arguments(parser)
    parser.add_argument("-T", "--taxids", default=False, action="store_true",
                        help="Output taxids instead of names")
    edlhits.add_taxon_arguments(parser)
    parser.add_argument(
        "-r",
        "--rank",
        dest="rank",
        default=None,
        metavar="RANK",
        help=" Rank to collect counts on.  Defaults to None (whatever "
             "the annotation was). Corresponds to rank names in nodes.dmp. "
             "To see list run: 'cut -f5 nodes.dmp | uniq | sort | uniq' in "
             "ncbi tax dir")
    parser.add_argument(
        "-R",
        "--printRank",
        dest="printRanks",
        action="append",
        help="Include indeicated rank(s) in lineage of printed taxa. "
             "Will be ignored if beyond the rank of the taxa "
             "(IE We can't include species if the taxon being counted "
             "is genus)")
    parser.add_argument(
        "--no-header",
        dest="no_header",
        default=False,
        action='store_true',
        help="do not write header line")

    util.add_universal_arguments(parser)
    arguments = parser.parse_args()
    util.setup_logging(arguments)

    logging.debug("Parsing style is: %s", arguments.parseStyle)

    # Handle the case where Galaxy tries to set None as a string
    arguments.printRanks = util.checkNoneOption(arguments.printRanks)

    # check arguments
    if arguments.taxids and arguments.taxdir is None:
        parser.error("Only use -T when a taxonomy is specified")
    if arguments.rank is not None and arguments.taxdir is None:
        parser.error(
            "Please supply NCBI phylogeny (-n) if specifying a rank (-r).")
    if arguments.printRanks is not None and arguments.taxdir is None:
        parser.error(
            "Please supply NCBI phylogeny (-n) if specifying a rank (-R).")
    if arguments.rank is not None:
        if arguments.rank == 'domain':
            logging.warning('translating domain to superkingdom')
            arguments.rank = 'superkingdom'
        if arguments.rank not in ranks:
            parser.error("Unknown rank: %s" % (arguments.rank))

    try:
        # Make sure the rank lists make sense
        if arguments.printRanks is not None:
            arguments.printRanks = cleanRanks(arguments.printRanks)
    except Exception as exc:
        parser.error(str(exc))

    # load necessary maps
    (taxonomy, value_map) = edlhits.readMaps(arguments)

    # loop over inputs
    for (inhandle, outhandle) in util.inputIterator(arguments):
        logging.debug(
            "Reading from %s and writing to %s",
            inhandle, outhandle)
        hit_iter = edlhits.parseM8FileIter(
            inhandle,
            value_map,
            edlhits.FilterParams.create_from_arguments(arguments),
            arguments.parseStyle,
            arguments.countMethod,
            taxonomy=taxonomy,
            rank=arguments.rank)

        ##
        # print output
        # choose output method
        if arguments.taxids:
            hit_header = 'taxid'
            printer = taxid_printer
        else:
            if arguments.printRanks is None:
                hit_header = 'Hit(s)'
                printer = default_printer
            else:
                hit_header = '\t'.join(arguments.printRanks)

                def printer(read, hits):
                    " Inline function to reduce number of arguments "
                    return tax_table_printer(read,
                                             hits,
                                             arguments.rank,
                                             arguments.printRanks)

        # loop over reads
        if not arguments.no_header:
            outhandle.write("Read\t{}\n".format(hit_header))
        for (read, hits) in hit_iter:
            outhandle.write(printer(read, hits))
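
All three printers share one contract: take a read name plus its list of assigned hits and return a complete, tab-terminated output line. Plausible sketches of the two simple ones (hypothetical; the real formatting may differ):

def taxid_printer(read, hits):
    # Assumes each hit is a taxon object with an .id attribute
    return "%s\t%s\n" % (read, ";".join(str(h.id) for h in hits))

def default_printer(read, hits):
    # Falls back on each hit's string form (taxon name or raw hit id)
    return "%s\t%s\n" % (read, ";".join(str(h) for h in hits))
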
Example #6
def main():
    # set up CLI
    description = """
    Takes a tabular text file and translate a column of KO values into a new
    column of KEGG pathways. KO column can have multiple entries per row.
    Output column will have multiple entries per pathway cell.
    """

    parser = argparse.ArgumentParser(description=description)
    util.add_IO_arguments(parser)
    parser.add_argument("-l",
                        "--level",
                        dest="level",
                        default="PATHWAY",
                        metavar="LEVEL",
                        help=""" Level to collect counts on. Level can
                        be one of:
                          NAME, PATHWAY, EC, DEFINITION,
                          or a level in the CLASS heirachy: 1, 2, or 3
                        """)
    parser.add_argument(
        "-f",
        "--fill_missing",
        dest="fill",
        metavar="FILL",
        default="No Pathway",
        help="Put FILL in column when KO has no pathway assigned. "
        "Defaults to 'No Pathway'")
    # format, ortholog hierarchy, and more
    parser.add_argument("-k",
                        "--ko_file",
                        dest="ko_file",
                        metavar="MAPFILE",
                        help="Location of kegg ko file")
    parser.add_argument("-c",
                        "--ko_column",
                        type=int,
                        default=1,
                        help="Column number (first column is 1)",
                        metavar="COLUMN")
    parser.add_argument(
        "-C",
        "--new_column",
        type=int,
        default=None,
        help="Column number to insert new column after. Default is the "
        "after the source column. 0=>make it the first column. "
        "-1=>make it the last column.",
        metavar="COLUMN")
    parser.add_argument(
        "-L",
        "--long_output",
        default=False,
        action='store_true',
        help="Insert new duplicate rows if KO maps to multiple values")
    parser.add_argument(
        "-H",
        "--header",
        default=None,
        metavar='HEADER',
        help="Put HEADER in first row instead of trying to translate")
    parser.add_argument("-Q",
                        "--quotes",
                        default=False,
                        action="store_true",
                        help="Encase translated values in double quotes")
    parser.add_argument(
        "-s",
        "--sep",
        dest='sep',
        default='\t',
        help="""Character separating table cells. Default is tab""")
    parser.add_argument(
        "-S",
        "--ko_sep",
        dest='ko_sep',
        default=';',
        help="""Character separating multiple KO values in iput table
            and used to separate multiple values in output column.
            Default is ";". Ignored for output if --longOutput
            requested""")

    # log level and help
    util.add_universal_arguments(parser)
    arguments = parser.parse_args()
    util.setup_logging(arguments)

    logging.info("KO mapping from: " + arguments.ko_file)
    logging.debug("Fill: '%s'" % (arguments.fill))

    translation = kegg.parse_KEGG_file(arguments.ko_file, arguments.level)

    # switch to zero indexing
    if arguments.new_column:
        arguments.new_column -= 1
    arguments.ko_column -= 1

    for (inhandle, outhandle) in util.inputIterator(arguments):
        for new_line in translate_ko_column(
                inhandle,
                sep=arguments.sep,
                ko_sep=arguments.ko_sep,
                ko_column=arguments.ko_column,
                new_column=arguments.new_column,
                translation=translation,
                default=arguments.fill,
                quotes=arguments.quotes,
                header=arguments.header,
                long_out=arguments.long_output,
        ):
            outhandle.write(new_line)
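
`translate_ko_column` does the real work here. A hedged sketch of its core loop, written as a generator that yields finished lines; it ignores the `header` and `long_out` conveniences and assumes `translation` is a plain dict:

def translate_ko_column(inhandle, sep='\t', ko_sep=';', ko_column=0,
                        new_column=None, translation=None,
                        default='No Pathway', quotes=False,
                        header=None, long_out=False):
    # Look up each KO in the translation map and splice the joined
    # results in as a new column after the chosen position.
    insert_at = ko_column + 1 if new_column is None else new_column + 1
    for line in inhandle:
        cells = line.rstrip('\r\n').split(sep)
        kos = cells[ko_column].split(ko_sep)
        value = ko_sep.join(str(translation.get(ko, default)) for ko in kos)
        if quotes:
            value = '"%s"' % value
        cells.insert(insert_at, value)
        yield sep.join(cells) + '\n'
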
Example #7
def main():
    description = __doc__

    # command line options
    parser = argparse.ArgumentParser(description=description,
                                     conflict_handler='resolve')
    parser.add_argument("input_files", nargs=1,
                        default=[],
                        metavar="INFILE",
                        help="Hit table to process")
    parser.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        metavar="OUTFILE",
        help="Write masked fasta output to OUTFILE (default is STDOUT).")
    parser.add_argument(
        "-i",
        "--infile",
        dest="fasta",
        metavar="FILE",
        help=" File containing the fasta (defaults to STDIN)")
    parser.add_argument(
        "-M",
        "--mask",
        dest="keep",
        default=True,
        action="store_false",
        help="Return unmatched sequence fragments instead of hits.")
    parser.add_argument("-m", "--minLength", dest="minLength", type=int,
                        metavar="BASES", default=1,
                        help="minimum number of bases for sequences in output")
    parser.add_argument(
        "-n",
        "--numbering_prefix",
        default=None,
        help="If given, name extracted sequence with this scring followed "
             "by a sinmple counting index of all extracted sequences. For "
             "example, -n \"r\" would add _r1 to the end of the first "
             "extracted sequence and _r2 to the second, and so on. By "
             "default, extracted sequences are named with start_end "
             "positions.")

    parser.add_argument(
        "-t",
        "--translate",
        default=False,
        action='store_true',
        help="Transalte to Amino Acid sequences")

    add_hit_table_arguments(parser, flags='all')

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check that we have blast file as argument
    if len(arguments.input_files) != 1:
        parser.error(
            "Please supply the name of a hit table as the only argument")
    blastFile = arguments.input_files[0]

    # set up input/output streams
    if arguments.fasta is None:
        fastaHandle = sys.stdin
        fastaStr = 'STDIN'
    else:
        fastaHandle = open(arguments.fasta, "rt")
        fastaStr = arguments.fasta
    logging.info(
        "Extrating sequence fragments from %s based on hits in %s" %
        (fastaStr, blastFile))

    if arguments.outfile is None:
        logging.info("Writing fasta sequences to STDOUT")
        outputHandle = sys.stdout
    else:
        logging.info("Writing fasta sequences to %s" % (arguments.outfile))
        outputHandle = open(arguments.outfile, 'w')

    # load hit regions
    if arguments.keep:
        minHitLength = arguments.minLength
    else:
        minHitLength = 1
    readHits = loadHitRegions(blastFile, minHitLength, arguments)
    logging.info("Found hits for %d reads" % (len(readHits)))

    # process the fasta file with hits
    extractHits(
        fastaHandle,
        outputHandle,
        readHits,
        arguments.translate,
        arguments.minLength,
        arguments.keep,
        arguments.numbering_prefix)
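
`extractHits` walks the fasta and cuts out (or, with -M, masks) each read's hit regions. The slicing logic for a single sequence might look like this sketch, assuming 1-based, inclusive `(start, end)` regions (an assumption; the real coordinate convention isn't shown):

def extract_fragments(seq, regions, keep=True):
    # keep=True yields the hit fragments; keep=False yields the
    # unmatched sequence between them (the -M/--mask behavior)
    if keep:
        for start, end in regions:
            yield start, end, seq[start - 1:end]
    else:
        prev_end = 0
        for start, end in sorted(regions):
            if start - 1 > prev_end:
                yield prev_end + 1, start - 1, seq[prev_end:start - 1]
            prev_end = max(prev_end, end)
        if prev_end < len(seq):
            yield prev_end + 1, len(seq), seq[prev_end:]
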
Example #8
def main():
    """ set up the command line interface """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-1", "--input_file_1",
                        default=None,
                        type=argparse.FileType('r'),
                        metavar=("INPUT_TABLE_1"),
                        help="Input table 1")
    parser.add_argument("-2", "--input_file_2",
                        default=None,
                        type=argparse.FileType('r'),
                        metavar=("INPUT_TABLE_2"),
                        help="Input table 2")
    parser.add_argument("-m", "--multiplier",
                        default=None,
                        metavar=("MULTIPLIER_TABLE"),
                        help=("Table of values to multiply each sequence. "
                              "EG assembly coverages."))
    parser.add_argument("-T", "--total_reads",
                        default=0,
                        metavar="TOTAL_READS",
                        type=int,
                        help="Total number of reads to expect. (This allows "
                             "the reporting of unknown read count)")
    parser.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        type=argparse.FileType('w'),
        default=sys.stdout,
        metavar="OUTFILE",
        help="Write count table to OUTFILE. (Defaults to STDOUT")
    parser.add_argument(
        "-L",
        "--long_output",
        default=False,
        action="store_true",
        help="Print one number per row (prefixed by two keys) instead "
             "of a table with one seet of keys as column names and one "
             "set as row names.")
    parser.add_argument(
        "-H",
        "--hitCol1",
        dest="hitCol1",
        type=int,
        default=-1,
        help="Index (starting at 0) of column in file 1 with hit name, -1 "
             "is default meaning all columns that are not the read name are "
             "hit names.",
        metavar="HITCOL")
    parser.add_argument(
        "-I",
        "--hitCol2",
        dest="hitCol2",
        type=int,
        default=-1,
        help="Index (starting at 0) of column in file 2 with hit name, -1 "
             "is default meaning all columns that are not the read name "
             "are hit names.",
        metavar="HITCOL")
    parser.add_argument(
        "-S",
        "--skipFirstRow",
        action="store_true",
        default=False,
        help="hit tables have a header row which needs to be skipped")

    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if arguments.input_file_1 is None or arguments.input_file_2 is None:
        parser.error("Please supply two input files")

    logging.info("reading hits from %s", arguments.input_file_1.name)
    hits1 = parseHits(arguments.input_file_1,
                      0,
                      arguments.hitCol1,
                      arguments.skipFirstRow,
                      None)
    logging.info("reading hits from %s", arguments.input_file_2.name)
    hits2 = parseHits(arguments.input_file_2,
                      0,
                      arguments.hitCol2,
                      arguments.skipFirstRow,
                      None)

    hits1 = tupleIteratorToMap(hits1)
    hits2 = tupleIteratorToMap(hits2)

    if arguments.multiplier is not None:
        multipliers = parseMapFile(arguments.multiplier, valueType=float)
    else:
        multipliers = None

    logging.info("counting hits")
    (table, cols) = combine_counts(hits1, hits2, multipliers,
                                   total_reads=arguments.total_reads)

    # print out hit table
    logging.info("printing table to " + arguments.outfile.name)
    print_table(arguments.outfile, table, cols,
                is_multiplied=multipliers is not None,
                long_output=arguments.long_output)
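
`parseHits` yields `(read, hit)` tuples, and `tupleIteratorToMap` presumably just gathers them per read. A likely sketch (hypothetical, but the name suggests little more than this):

from collections import defaultdict

def tupleIteratorToMap(tuples):
    # Collapse an iterator of (read, hit) pairs into read -> [hits]
    hit_map = defaultdict(list)
    for read, hit in tuples:
        hit_map[read].append(hit)
    return hit_map
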
Example #9
def main():
    """" Set up the CLI """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input_files",
                        nargs="+",
                        default=[],
                        metavar="INFILE",
                        help="List of hit tables to process")
    parser.add_argument("-o",
                        "--outfile",
                        dest="outfile",
                        metavar="OUTFILE",
                        help="Write count table to OUTFILE")
    parser.add_argument("-r",
                        "--rank",
                        dest="ranks",
                        default=None,
                        metavar="RANK",
                        action="append",
                        help=""" Rank(s) to collect counts on. Use flag
                        multiple
                        times to specify multiple ranks. If multiple values
                        given, one table produced for each with rank name
                        appended to file name. Defaults to all major ranks
                        between phylum and species. Corresponds to rank names
                        in nodes.dmp. To see list run:
                        'cut -f5 nodes.dmp | uniq | sort | uniq'
                        in ncbi tax dir. Will also accept 'organism' to mean
                        no rank (ie, just the organism name).""")
    parser.add_argument(
        "-s",
        "--collapseToDomain",
        default=False,
        action="store_true",
        help="Collapse all taxa below given rank down to "
        "superkingdom/domain. EG: in the genus output, anything "
        "assigned to Cyanobactia, will be lumped in with all "
        "other bacteria")
    parser.add_argument(
        "-R",
        "--printRank",
        dest="printRanks",
        action="append",
        help="Include indeicated rank(s) in lineage of printed taxa. "
        "Will be ignored if beyond the rank of the taxa "
        "(IE We can't include species if the taxon being counted "
        "is genus)")

    # option for deconvoluting clusters or assemblies
    add_weight_arguments(parser, multiple=True)

    # cutoff options
    add_count_arguments(parser)

    # format, tax dir, and more
    add_taxon_arguments(parser,
                        choices={
                            'countMethod': ('LCA', 'all', 'first', 'most',
                                            'tophit', 'toporg', 'consensus')
                        })

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if len(arguments.input_files) == 0:
        parser.error("Must supply at least one m8 file to parse")

    # Handle the case where Galaxy tries to set None as a string
    arguments.ranks = checkNoneOption(arguments.ranks)
    arguments.printRanks = checkNoneOption(arguments.printRanks)

    logging.info("Printing out ranks: %r", arguments.ranks)

    # Set defaults and check for some conflicts
    if arguments.ranks is None and arguments.taxdir is None:
        # using hit names only
        arguments.ranks = [ORG_RANK]
        if arguments.printRanks is not None:
            parser.error("Display ranks are not used without taxonomic info")
    else:
        if arguments.taxdir is None:
            parser.error("Cannot select ranks without a taxonomy")
        if arguments.ranks is None:
            # set a default
            arguments.ranks = [
                'phylum', 'class', 'order', 'family', 'genus', 'species'
            ]

        try:
            # Make sure the rank lists make sense
            arguments.ranks = cleanRanks(arguments.ranks)
            if arguments.printRanks is not None:
                arguments.printRanks = cleanRanks(arguments.printRanks)
        except Exception as e:
            parser.error(str(e))

    # load weights file
    sequenceWeights = loadSequenceWeights(arguments.weights)

    # only print to stdout if there is a single rank
    if len(arguments.ranks) > 1 and arguments.outfile is None:
        parser.error("STDOUT only works if a single rank is chosen!")

    # Because rank is used in parsing hits, we can only do multiple ranks for
    # certain kinds of count methods
    if len(arguments.ranks) > 1:
        rank = None
        if arguments.countMethod in ['consensus', 'most']:
            parser.error(
                "Using multiple ranks does not work with the 'consensus' "
                "or 'most' counting methods. LCA should give the same "
                "results as consensus. If you really want to do this, "
                "use a bash loop:'for rank in phylum order genus; do "
                "COMMAND -r ${rank}; done'")
    else:
        rank = arguments.ranks[0]

    # load necessary maps
    (taxonomy, hitStringMap) = readMaps(arguments)

    # parse input files
    fileCounts = {}
    totals = {}
    fileLabels = {}
    sortedLabels = []

    # Allow for file names to be preceded with TAG=
    for filename in arguments.input_files:
        bits = filename.split("=", 1)
        if len(bits) > 1:
            (filetag, filename) = bits
        else:
            filetag = filename
        fileLabels[filename] = filetag
        # keep order so that column order matches arguments
        sortedLabels.append(filetag)
        fileCounts[filetag] = {}
        totals[filetag] = 0

    if arguments.countMethod == 'tophit' or arguments.countMethod == 'toporg':
        # Process all files at once and use overall abundance to pick best hits
        from edl import redistribute
        params = FilterParams.create_from_arguments(arguments)
        multifile = redistribute.multipleFileWrapper(fileLabels.keys())

        if arguments.countMethod == 'tophit':
            # don't give any taxonomy, just map to accessions for
            # redistribution
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=True,
                parseStyle=arguments.parseStyle,
                sequenceWeights=sequenceWeights)
            # define method to turn Hits into organisms
            hitTranslator = getHitTranslator(parseStyle=arguments.parseStyle,
                                             taxonomy=taxonomy,
                                             hitStringMap=hitStringMap)

            translateHit = lambda hit: hitTranslator.translateHit(hit=hit)[0]

        else:
            # translate to organism before finding most abundant
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                returnTranslations=True,
                winnerTakeAll=True,
                taxonomy=taxonomy,
                hitStringMap=hitStringMap,
                parseStyle=ACCS)

            # Organisms will be returned, make translator trivial:
            translateHit = passThrough

        # use read->file mapping and hit translator to get file based counts
        #  from returned (read,Hit) pairs
        increment = 1
        for (read_name, hit) in readHits:
            file_name, read_name = read_name.split("/", 1)
            file_tag = fileLabels[unquote_plus(file_name)]
            taxon = translateHit(hit)
            taxcount = fileCounts[file_tag].setdefault(taxon, 0)
            if sequenceWeights is not None:
                increment = sequenceWeights.get(read_name, 1)
            fileCounts[file_tag][taxon] = taxcount + increment
            totals[file_tag] += increment
        logging.debug(str(totals))

    else:
        # Original way, just process each file separately
        for (filename, filetag) in fileLabels.items():
            infile = open(filename, 'r')

            hitIter = parseM8FileIter(infile,
                                      hitStringMap,
                                      arguments.hitTableFormat,
                                      arguments.filter_top_pct,
                                      arguments.parseStyle,
                                      arguments.countMethod,
                                      taxonomy=taxonomy,
                                      rank=rank)

            (total, counts, hitMap) = \
                countIterHits(hitIter,
                              allMethod=arguments.allMethod,
                              weights=sequenceWeights)
            fileCounts[filetag] = counts
            totals[filetag] = total

            logging.info("parsed %d hits (%d unique) for %d reads from %s",
                         total, len(counts), len(hitMap), filename)

            infile.close()

    printCountTablesByRank(fileCounts, totals, sortedLabels, arguments)
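
`checkNoneOption` (also used in Example #5) papers over Galaxy passing the literal string "None" for unset options. A sketch of the normalization it presumably performs:

def checkNoneOption(value):
    # Hypothetical sketch: treat the string "None" (alone or as the
    # sole list element) as if the option were never given
    if value is None or value == 'None':
        return None
    if isinstance(value, list) and value == ['None']:
        return None
    return value
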
Example #10
def main():
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("input_files", nargs="+",
                        default=[],
                        metavar="INFILE",
                        help="List of hit tables to process")
    parser.add_argument("-o", "--outfile", dest="output_file",
                        metavar="OUTFILE", help="Write count table to OUTFILE")
    parser.add_argument("-l", "--level", dest="levels", default=None,
                        metavar="LEVEL", action="append",
                        help=""" Level(s) to collect counts on. Use flag
                      multiple times to specify multiple levels. If multiple
                      values given, one table produced for each with rank
                      name appended to file name. Levels can be an integer
                      (1-3) for KEGG or SEED levels, any one of 'gene',
                      'role', 'family',
                      'ko', or 'ortholog' (which are all synonyms), or
                      anything not synonymous with 'gene' to
                      get CAZy groups. Defaults to ortholog/role and
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")

    # option for deconvoluting clusters or assemblies
    add_weight_arguments(parser, multiple=True)

    # cutoff options
    add_count_arguments(parser)

    # format, ortholog hierarchy, and more
    kegg.add_path_arguments(
        parser,
        defaults={'countMethod': 'tophit'},
        choices={'countMethod':
                 ('tophit',
                  'first',
                  'most',
                  'all',
                  'consensus')},
        helps={'countMethod':
               ("How to deal with counts from multiple hits. ('first': "
                "just use the first hit, 'most': "
                "can return multiple hits, 'all': return every hit, "
                "consensus: return None unless all the same). Do not "
                "use most or consensus with more than one level at a time. "
                "Default is 'tophit': This breaks any ties by choosing "
                "the most abundant hit based on other unambiguous "
                "assignments.")})

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if len(arguments.input_files) == 0:
        parser.error("Must supply at least one m8 file to parse")

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            logging.warning("Type: %s", arguments.heirarchyType)
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if arguments.levels is None:
            # set a default
            if arguments.heirarchyType == 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            elif arguments.heirarchyType == 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the rank lists make sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            parser.error(str(e))

    # load weights file
    sequenceWeights = loadSequenceWeights(arguments.weights)

    # only print to stdout if there is a single level
    if len(arguments.levels) > 1 and arguments.output_file is None:
        parser.error("STDOUT only works if a single level is chosen!")

    cutoff = arguments.cutoff

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            # elif cogMapRE.search(firstLine):
            #    arguments.mapStyle='cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s", arguments.mapStyle)
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        # elif arguments.mapStyle=='cog':
        #    valueMap=kegg.parseCogMap(arguments.mapFile)
        else:
            if arguments.parseStyle == GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(
                arguments.mapFile,
                valueType=None,
                valueDelim=arguments.tab_map_delim,
                keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s",
                         len(valueMap), next(iter(valueMap.items())))
        else:
            logging.warn("Read 0 items into value map!")
    else:
        valueMap = None

    # parse input files
    fileCounts = {}
    totals = {}
    fileLabels = {}
    sortedLabels = []

    # Allow for file names to be preceded with TAG=
    for filename in arguments.input_files:
        bits = filename.split("=", 1)
        if len(bits) > 1:
            (filetag, filename) = bits
        else:
            filetag = filename
        fileLabels[filename] = filetag
        # keep order so that column order matches arguments
        sortedLabels.append(filetag)
        fileCounts[filetag] = {}
        totals[filetag] = 0

    params = FilterParams.create_from_arguments(arguments)
    # TODO: incorporate weights into tophit algorithm!
    if arguments.countMethod == 'tophit':
        # Process all files at once and use overall abundance to pick best hits
        from edl import redistribute
        multifile = redistribute.multipleFileWrapper(fileLabels.items())

        # don't give any hit translation, just use hit ids for redistribution
        readHits = redistribute.pickBestHitByAbundance(
            multifile,
            filterParams=params,
            returnLines=False,
            winnerTakeAll=True,
            parseStyle=arguments.parseStyle,
            sequenceWeights=sequenceWeights)
        # define method to turn Hits into Genes (kos, families)
        hitTranslator = getHitTranslator(parseStyle=arguments.parseStyle,
                                         hitStringMap=valueMap)
        # translateHit = lambda hit: hitTranslator.translateHit(hit)[0]

        # use read->file mapping and hit translator to get file based counts
        #  from returned (read,Hit) pairs
        increment = 1
        for (read_name, hit) in readHits:
            file_tag, read_name = read_name.split("/", 1)
            file_tag = unquote_plus(file_tag)
            gene = hitTranslator.translateHit(hit)[0]
            if gene is None:
                gene = "None"
            logging.debug(
                "READ: %s\t%s\t%s\t%s",
                file_tag, read_name, hit.hit, gene)
            genecount = fileCounts[file_tag].setdefault(gene, 0)
            if sequenceWeights is not None:
                increment = sequenceWeights.get(read_name, 1)
            fileCounts[file_tag][gene] = genecount + increment
            totals[file_tag] += increment
        logging.debug(str(totals))

    else:
        # Original way, just process each file separately
        for (filename, filetag) in fileLabels.items():
            infile = open(filename, 'r')

            hitIter = parseM8FileIter(infile,
                                      valueMap,
                                      params,
                                      arguments.parseStyle,
                                      arguments.countMethod,
                                      ignoreEmptyHits=arguments.mappedHitsOnly)

            (total, counts, hitMap) = \
                countIterHits(hitIter,
                              allMethod=arguments.allMethod,
                              weights=sequenceWeights)
            fileCounts[filetag] = counts
            totals[filetag] = total

            logging.info(
                "parsed %d hits (%d unique) for %d reads from %s",
                total, len(counts), len(hitMap), filename)

            infile.close()

    logging.debug(repr(fileCounts))
    printCountTablesByLevel(fileCounts, totals, sortedLabels, arguments)
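
The map-style autodetection above relies on module-level regexes (`koMapRE`, `seedMapRE`, `tabMapRE`) that aren't shown in the excerpt. Hypothetical stand-ins, purely to illustrate the shape of the test; the real patterns live next to these scripts in the package:

import re

koMapRE = re.compile(r'\bko:K\d{5}\b')      # KEGG link files: "... ko:K00001"
seedMapRE = re.compile(r'^\d+\t\S+\t')      # SEED map guess: id, role, ...
tabMapRE = re.compile(r'^[^\t]+\t[^\t]+')   # generic two-column tab map
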
Example #11
def main():
    # set up CLI
    description = __doc__

    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("-i", "--infile", dest="infile",
                        metavar="FILE", help="Read raw table from INFILE")
    parser.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        metavar="OUTFILE",
        help="Write collapsed table to OUTFILE")
    parser.add_argument("-d", "--delim", dest="delim", default="\t",
                        help="Input table delimiter", metavar="DELIM")
    parser.add_argument("-D", "--delimOut", dest="delimOut", default="\t",
                        help="Output table delimiter", metavar="DELIM")
    parser.add_argument(
        '-F',
        '--countFirst',
        action='store_true',
        default=False,
        help="Don't skip the first line, it's NOT a header")
    parser.add_argument(
        "-R",
        "--readColumn",
        dest="readCol",
        type=int,
        default=0,
        help="Index (starting at 0) of column with read name, 0 is default",
        metavar="READCOL")
    parser.add_argument(
        "-H",
        "--hitColumn",
        dest="hitCol",
        type=int,
        default=2,
        help="Index (starting at 0) of column with hit name (for counting), "
             "2 is default, if less than zero, all (non-read) columns will "
             "be used as multiple hits",
        metavar="HITCOL")
    parser.add_argument(
        '-s',
        '--hitSep',
        default=None,
        help="Use this string to split multiple values in single hit cell. "
             "Default is 'None' to leave hits as is, use 'eval' to parse "
             "as python repr strings")
    add_weight_arguments(parser, multiple=False)
    parser.add_argument("-T", "--total", default=False, action="store_true",
                        help="Report 'Total' in the first row")

    # cutoff options
    add_count_arguments(parser, {'cutoff': 0})

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # make sure we have something to do
    if arguments.infile is None:
        logging.info("Reading table from: STDIN")
    else:
        logging.info("Reading table from: " + arguments.infile)

    if arguments.outfile is None:
        logging.info("Writing counts to: STDOUT")
    else:
        logging.info("Writing counts to: " + arguments.outfile)

    # process arguments
    takeFirst = (arguments.allMethod == 'first')
    splitHits = (arguments.hitSep is not None and arguments.hitSep != 'None')
    uncluster = (arguments.weights is not None)

    if arguments.hitSep == 'eval':
        parser.error("Sorry, parsing with eval is not yet supported!")

    # inform the curious user
    logging.info("Delimiter: '" + arguments.delim)
    logging.info("Read names in col: '" + str(arguments.readCol))
    logging.info("Hit names in col: '" + str(arguments.hitCol))
    if splitHits:
        logging.info("Splitting hits with: %s" % (arguments.hitSep))
        logging.warning(
            "Splitting hits has not been tested yet! Let me know how it goes.")
    if takeFirst:
        logging.info("Taking first hit for each read.")
    else:
        if arguments.allMethod == 'portion':
            logging.info("Dividing count among all hits for each read.")
        else:
            logging.info("Adding 1 to every hit for each read")
    if uncluster:
        logging.info(
            "Getting read cluster sizes from: %s" %
            (arguments.weights))
    if arguments.countFirst:
        logging.info("First line is data")
    else:
        logging.info("Skipping first line")

    # Do the counting!
    counts = {}
    countHitsForRead = getAllMethod(arguments.allMethod)

    clusteredReadCounts = {}
    if uncluster:
        clusteredReadCounts = parseMapFile(
            arguments.clusterFile, valueType=int)

    currentRead = ''
    readCount = 1
    hits = []

    if arguments.infile is None:
        infile = sys.stdin
    else:
        infile = open(arguments.infile)

    # loop over lines
    if not arguments.countFirst:
        # skip first line
        try:
            next(infile)
        except StopIteration:
            raise Exception("No lines in %s" % str(infile))

    for line in infile:
        line = line.rstrip('\r\n')
        rowcells = line.split(arguments.delim)
        # get read
        read = rowcells[arguments.readCol]

        # if it's a new read, process previous read
        if currentRead == '':
            currentRead = read
        elif read != currentRead and currentRead != '':
            readCount += 1
            logging.info("Checking hits for %s" % currentRead)

            # was it part of a cluster?
            multiplier = 1
            if uncluster:
                multiplier = clusteredReadCounts[currentRead]

            # where does the count for this read go
            countHitsForRead(hits, counts, multiplier=multiplier)

            hits = []
            currentRead = read

        # get hit from this line
        if arguments.hitCol >= 0:
            hit = rowcells[arguments.hitCol]
            if splitHits:
                hits.extend(hit.split(arguments.hitSep))
            else:
                hits.append(hit)
        else:
            rowcells.pop(arguments.readCol)
            hits.extend(rowcells)

    # check last read!
    logging.info("Checking hits for %s" % currentRead)
    # was it part of a cluster?
    multiplier = 1
    if uncluster:
        multiplier = clusteredReadCounts[currentRead]
    # where does the count for this read go
    countHitsForRead(hits, counts, multiplier=multiplier)

    # apply cutoff
    if arguments.cutoff > 0:
        applyFractionalCutoff(counts, threshold=arguments.cutoff * readCount)

    # print output
    if arguments.outfile is None:
        outhandle = sys.stdout
    else:
        outhandle = open(arguments.outfile, 'w')

    if arguments.total:
        outhandle.write("Total%s%d\n" % (arguments.delimOut, readCount))

    if arguments.allMethod == 'portion':
        outFmtString = "%s%s%f\n"
    else:
        outFmtString = "%s%s%d\n"

    delimRE = re.compile(arguments.delimOut)
    for hit in sorted(counts.keys()):
        count = counts[hit]
        hit = delimRE.sub('_', hit)
        outhandle.write(outFmtString % (hit, arguments.delimOut, count))
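
`getAllMethod` returns the per-read counting strategy selected by `--allMethod`. A sketch of the three behaviors the log messages above describe ('first', 'all', and 'portion'); the real functions may differ in detail:

def getAllMethod(all_method):
    # Each returned function adds one read's hits into `counts`
    def add_first(hits, counts, multiplier=1):
        if hits:
            counts[hits[0]] = counts.get(hits[0], 0) + multiplier

    def add_all(hits, counts, multiplier=1):
        for hit in hits:
            counts[hit] = counts.get(hit, 0) + multiplier

    def add_portion(hits, counts, multiplier=1):
        if hits:
            share = float(multiplier) / len(hits)
            for hit in hits:
                counts[hit] = counts.get(hit, 0) + share

    return {'first': add_first, 'all': add_all,
            'portion': add_portion}[all_method]
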
Example #12
def main():
    description = """
    Take a blast result table and output a subset of hits based on the
    chosen filtering options. If more than one blast file given, use -O
    to get multiple output files, otherwise all output data will be
    concatenated into one output.
    """

    # command line arguments
    parser = argparse.ArgumentParser(description=description,
                                     conflict_handler='resolve')
    add_hit_table_arguments(parser, flags='all')
    parser.add_argument("-o",
                        "--outfilenome",
                        dest="outfilename",
                        default=None,
                        metavar="OUTFILENAME",
                        help="Write masked fasta output to OUTFILENAME.")
    parser.add_argument(
        '-O',
        '--autoOutName',
        default=False,
        action='store_true',
        help="Automatically generate output file name from input name "
        "and options. Overridden by -o, cannot be used with data "
        "from STDIN.")
    parser.add_argument('-G',
                        '--gff',
                        default=False,
                        action='store_true',
                        help="output GFF format instead of input format")
    parser.add_argument('hit_table',
                        nargs='*',
                        type=argparse.FileType('r'),
                        default=[
                            sys.stdin,
                        ],
                        help="Table of search results to be filtered. "
                        "If absent, data will be read from STDIN")

    add_universal_arguments(parser)

    arguments = parser.parse_args()

    setup_logging(arguments)

    # if we're not doing auto file names, write all outputs to the same file
    if not arguments.autoOutName:
        if arguments.outfilename is not None:
            logging.info("Writing data to %s" % (arguments.outfilename))
            outfile_handle = open(arguments.outfilename, 'w')
        else:
            logging.info("writing data to STDOUT")
            outfile_handle = sys.stdout

    if arguments.gff:
        logging.info("Converting to GFF")

    # loop over inputs
    for infile_handle in arguments.hit_table:
        logging.info("reading data from %s" % (infile_handle.name))
        if arguments.autoOutName:
            outfile_handle = open(getOutputFile(infile_handle.name, arguments),
                                  'w')

        # filter
        params = FilterParams.create_from_arguments(arguments)
        filterM8(infile_handle, outfile_handle, params, to_gff=arguments.gff)

        if arguments.autoOutName:
            outfile_handle.close()
        infile_handle.close()
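
`getOutputFile` builds the auto-generated name used with -O. A hypothetical sketch; the real helper may also encode the filter cutoffs in the suffix:

def getOutputFile(input_name, arguments):
    # Derive an output name from the input name and the output format
    suffix = '.gff' if arguments.gff else '.filtered'
    return input_name + suffix
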
Example #13
def main():
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("input_files",
                        nargs="+",
                        default=[],
                        metavar="INFILE",
                        help="List of hit tables to process")
    parser.add_argument("-o",
                        "--outfile",
                        dest="output_file",
                        metavar="OUTFILE",
                        help="Write count table to OUTFILE")
    parser.add_argument("-l",
                        "--level",
                        dest="levels",
                        default=None,
                        metavar="LEVEL",
                        action="append",
                        help=""" Level(s) to collect counts on. Use flag
                      multiple times to specify multiple levels. If multiple
                      values given, one table produced for each with rank
                      name appended to file name. Levels can be an integer
                      (1-3) for KEGG or SEED levels, any one of 'gene',
                      'role', 'family',
                      'ko', or 'ortholog' (which are all synonyms), or
                      anything not synonymous with 'gene' to
                      get CAZy groups. Defaults to ortholog/role and
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")

    # option for deconvoluting clusters or assemblies
    add_weight_arguments(parser, multiple=True)

    # cutoff options
    add_count_arguments(parser)

    # format, ortholog hierarchy, and more
    kegg.add_path_arguments(
        parser,
        defaults={'countMethod': 'tophit'},
        choices={
            'countMethod': ('tophit', 'first', 'most', 'all', 'consensus')
        },
        helps={
            'countMethod':
            ("How to deal with counts from multiple hits. ('first': "
             "just use the first hit, 'most': "
             "can return multiple hits, 'all': return every hit, "
             "consensus: return None unless all the same). Do not "
             "use most or consensus with more than one level at a time. "
             "Default is 'tophit': This breaks any ties by choosing "
             "the most abundant hit based on other unambiguous "
             "assignments.")
        })

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if len(arguments.input_files) == 0:
        parser.error("Must supply at least one m8 file to parse")

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            logging.warning("Type: %s", arguments.heirarchyType)
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if arguments.levels is None:
            # set a default
            if arguments.heirarchyType == 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            elif arguments.heirarchyType == 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the rank lists make sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            parser.error(str(e))

    # load weights file
    sequenceWeights = loadSequenceWeights(arguments.weights)

    # only print to stdout if there is a single level
    if len(arguments.levels) > 1 and arguments.output_file is None:
        parser.error("STDOUT only works if a single level is chosen!")

    cutoff = arguments.cutoff

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            # elif cogMapRE.search(firstLine):
            #    arguments.mapStyle='cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s", arguments.mapStyle)
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        # elif arguments.mapStyle=='cog':
        #    valueMap=kegg.parseCogMap(arguments.mapFile)
        else:
            if arguments.parseStyle == GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(arguments.mapFile,
                                    valueType=None,
                                    valueDelim=arguments.tab_map_delim,
                                    keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s", len(valueMap),
                         next(iter(valueMap.items())))
        else:
            logging.warn("Read 0 items into value map!")
    else:
        valueMap = None

    # parse input files
    fileCounts = {}
    totals = {}
    fileLabels = {}
    sortedLabels = []

    # Allow for file names to be preceded with TAG=
    for filename in arguments.input_files:
        bits = filename.split("=", 1)
        if len(bits) > 1:
            (filetag, filename) = bits
        else:
            filetag = filename
        fileLabels[filename] = filetag
        # keep order so that column order matches arguments
        sortedLabels.append(filetag)
        fileCounts[filetag] = {}
        totals[filetag] = 0

    # TODO: incorporate weights into tophit algorithm!
    if arguments.countMethod == 'tophit':
        # Process all files at once and use overall abundance to pick best hits
        from edl import redistribute
        params = FilterParams.create_from_arguments(arguments)
        multifile = redistribute.multipleFileWrapper(fileLabels.items())

        # don't give any hit translation, just use hit ids for redistribution
        readHits = redistribute.pickBestHitByAbundance(
            multifile,
            filterParams=params,
            returnLines=False,
            winnerTakeAll=True,
            parseStyle=arguments.parseStyle,
            sequenceWeights=sequenceWeights)
        # define method to turn Hits into Genes (kos, families)
        hitTranslator = getHitTranslator(parseStyle=arguments.parseStyle,
                                         hitStringMap=valueMap)
        # translateHit = lambda hit: hitTranslator.translateHit(hit)[0]

        # use read->file mapping and hit translator to get file based counts
        #  from returned (read,Hit) pairs
        increment = 1
        for (read_name, hit) in readHits:
            file_tag, read_name = read_name.split("/", 1)
            file_tag = unquote_plus(file_tag)
            gene = hitTranslator.translateHit(hit)[0]
            if gene is None:
                gene = "None"
            logging.debug("READ: %s\t%s\t%s\t%s", file_tag, read_name, hit.hit,
                          gene)
            genecount = fileCounts[file_tag].setdefault(gene, 0)
            if sequenceWeights is not None:
                increment = sequenceWeights.get(read_name, 1)
            fileCounts[file_tag][gene] = genecount + increment
            totals[file_tag] += increment
        logging.debug(str(totals))

    else:
        # Original way, just process each file separately
        for (filename, filetag) in fileLabels.items():
            infile = open(filename, 'r')  # 'rU' mode is gone in Python 3.11+

            hitIter = parseM8FileIter(infile,
                                      valueMap,
                                      arguments.hitTableFormat,
                                      arguments.filter_top_pct,
                                      arguments.parseStyle,
                                      arguments.countMethod,
                                      ignoreEmptyHits=arguments.mappedHitsOnly)

            (total, counts, hitMap) = \
                countIterHits(hitIter,
                              allMethod=arguments.allMethod,
                              weights=sequenceWeights)
            fileCounts[filetag] = counts
            totals[filetag] = total

            logging.info("parsed %d hits (%d unique) for %d reads from %s",
                         total, len(counts), len(hitMap), filename)

            infile.close()

    logging.debug(repr(fileCounts))
    printCountTablesByLevel(fileCounts, totals, sortedLabels, arguments)
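
A minimal sketch (not part of the original script) of the TAG= file-labeling convention used above: an input path may be prefixed with "LABEL=", and otherwise the path itself serves as the column label.

def parse_file_label(filename):
    """Split an optional 'TAG=' prefix off a file argument."""
    bits = filename.split("=", 1)
    if len(bits) > 1:
        return bits[0], bits[1]  # explicit tag
    return filename, filename   # bare path: the path is its own tag

assert parse_file_label("sampleA=hits/a.m8") == ("sampleA", "hits/a.m8")
assert parse_file_label("hits/b.m8") == ("hits/b.m8", "hits/b.m8")
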
Exemplo n.º 14
0
def main():
    description = __doc__
    # pass description by keyword; a bare positional would set prog instead
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    parser.add_argument("-l",
                        "--level",
                        dest="levels",
                        default=None,
                        metavar="LEVEL",
                        action="append",
                        help=""" Level(s) to collect counts on. Use flag
                      multiple times to specify multiple levels. If multiple
                      values given, one table produced for each with rank
                      name appended to file name. Levels can be an integer
                      (1-3) for KEGG or SEED levels, any one of 'gene',
                      'role', 'family',
                      'ko', or 'ortholog' (which are all synonyms), or
                      anything not synonymous with 'gene' to
                      get CAZy groups. Defaults to ortholog/role and
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")
    parser.add_argument(
        '-S',
        '--squash',
        dest='splitForLevels',
        default=True,
        action='store_false',
        help="Don't split assignment rows if gene maps to multiple pathways, "
        "just squash them into one row using python list syntax")

    # format, ortholog hierarchy, and more
    kegg.add_path_arguments(parser)

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            logging.warning("Type: %s", arguments.heirarchyType)
            parser.error("Cannot select levels without a hierarchy (ko) file")
        if arguments.levels is None:
            # set a default
            if arguments.heirarchyType == 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            elif arguments.heirarchyType == 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the level list makes sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            parser.error(str(e))

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            elif cogMapRE.search(firstLine):
                arguments.mapStyle = 'cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s" % (arguments.mapStyle))
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        elif arguments.mapStyle == 'cog':
            valueMap = kegg.parseCogMap(arguments.mapFile)
        else:
            if arguments.parseStyle == hits.GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(arguments.mapFile,
                                    valueType=None,
                                    valueDelim=arguments.tab_map_delim,
                                    keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s" %
                         (len(valueMap), next(iter(valueMap.items()))))
        else:
            logging.warning("Read 0 items into value map!")
    else:
        valueMap = None

    # set up level mapping
    levelMappers = [getLevelMapper(lvl, arguments) for lvl in arguments.levels]

    # parse input files
    for (inhandle, outhandle) in inputIterator(arguments):
        logging.debug("Reading from %s and writing to %s" %
                      (inhandle, outhandle))
        hitMapIter = hits.parseM8FileIter(
            inhandle,
            valueMap,
            hits.FilterParams.create_from_arguments(arguments),
            arguments.parseStyle,
            arguments.countMethod,
            ignoreEmptyHits=arguments.mappedHitsOnly)

        if arguments.levels == [None]:
            arguments.levels = ['Hit']
        outhandle.write("Read\t%s\n" % ('\t'.join(arguments.levels)))
        for read, hitIter in hitMapIter:
            assignments = []
            for hit in hitIter:
                logging.debug("Hit: %s" % (hit))
                assignment = []
                for levelMapper in levelMappers:
                    assignment.append(levelMapper(hit))
                assignments.append(assignment)
            logging.debug("Read %s has %d hits" % (read, len(assignments)))
            for assignment in assignments:
                for assignmentList in handleMultipleMappings(
                        assignment, arguments):
                    outhandle.write("%s\t%s\n" %
                                    (read, "\t".join(assignmentList)))
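
For illustration only (handleMultipleMappings itself is defined elsewhere in this package), a minimal sketch of the split-versus-squash behavior controlled by the -S/--squash flag above: by default an assignment whose level value maps to multiple pathways is expanded into one output row per value, while --squash keeps it as a single cell rendered with Python list syntax.

from itertools import product

def expand_assignment(assignment, split=True):
    # assignment holds one value per level; any value may be a list
    as_lists = [v if isinstance(v, list) else [v] for v in assignment]
    if split:
        # one output row per combination of multi-mapped values
        return [list(combo) for combo in product(*as_lists)]
    # squashed: one row; multi-mapped cells keep their list syntax
    return [[str(v) if isinstance(v, list) else v for v in assignment]]

print(expand_assignment(["K00001", ["Glycolysis", "TCA cycle"]]))
# [['K00001', 'Glycolysis'], ['K00001', 'TCA cycle']]
print(expand_assignment(["K00001", ["Glycolysis", "TCA cycle"]], split=False))
# [['K00001', "['Glycolysis', 'TCA cycle']"]]
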
Exemplo n.º 15
0
def main():
    # command line arguments
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler='resolve')

    # default to non-overlapping=0
    add_hit_table_arguments(parser,
                            flags='all',
                            defaults={'nonoverlapping': 0})
    parser.add_argument("-o",
                        "--outfilenome",
                        dest="outfilename",
                        default=None,
                        metavar="OUTFILENAME",
                        help="Write masked fasta output to OUTFILENAME.")
    parser.add_argument('hit_table',
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="Table of search results to be filtered. "
                        "If absent, data will be read from STDIN")

    add_universal_arguments(parser)

    arguments = parser.parse_args()

    setup_logging(arguments)

    # output file or STDOUT
    if arguments.outfilename is not None:
        logging.info("Writing data to %s" % (arguments.outfilename))
        outfile_handle = open(arguments.outfilename, 'w')
    else:
        logging.info("writing data to STDOUT")
        outfile_handle = sys.stdout

    # input file or STDIN (handled by argparse)
    infile_handle = arguments.hit_table
    logging.info("reading data from %s" % (infile_handle.name))

    # filter, but don't apply nonoverlapping yet
    # non-overlapping should be applied per-reference only
    params = FilterParams.create_from_arguments(arguments)
    # save user supplied value for later
    overlap_buffer = params.nonoverlapping
    # turn off for now
    params.set_nonoverlapping(-1)

    # merge
    hit_iter = filterM8Stream(infile_handle, params, return_lines=False)
    for query, query_hits in hit_iter:
        # group by reference hit
        hits_by_ref = defaultdict(list)
        for hit in query_hits:
            hits_by_ref[hit.hit].append(hit)

        # one output for query/reference pair
        for ref, ref_hits in hits_by_ref.items():

            # remove overlaps unless the buffer has been set to <0
            if overlap_buffer >= 0:
                ref_hits = remove_overlapping_hits(
                    ref_hits, on_hit=True, buffer=params.nonoverlapping)
                ref_hits = remove_overlapping_hits(
                    ref_hits, on_hit=False, buffer=params.nonoverlapping)

            # aggregate values
            length, score, identities = 0, 0, 0
            for hit in ref_hits:
                length += hit.mlen
                score += hit.score
                try:
                    # pctid is a percentage, so this sum is 100x the
                    # true count of identical bases
                    identities += hit.pctid * hit.mlen
                except AttributeError:
                    # just report pctid=0 if no pctid column in input
                    pass

            outfile_handle.write(
                "%s\t%s\t%d\t%d\t%0.2f\n" %
                (query, ref, length, score, identities / length))

    outfile_handle.close()
    infile_handle.close()
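
A worked example of the aggregation above: summing pctid * mlen and dividing by the total aligned length gives a length-weighted mean percent identity for each query/reference pair (the intermediate sum is 100x the raw identity count, as noted in the comment).

# two HSPs between the same query/reference pair: (mlen, pctid)
hsps = [(200, 90.0), (100, 99.0)]

length = sum(mlen for mlen, _ in hsps)                  # 300
identities = sum(mlen * pctid for mlen, pctid in hsps)  # 27900.0
print("%0.2f" % (identities / length))                  # 93.00, not the unweighted 94.5
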
Exemplo n.º 16
0
def main():
    description = __doc__

    # command line options
    parser = argparse.ArgumentParser(description=description,
                                     conflict_handler='resolve')
    parser.add_argument("input_files", nargs=1,
                        default=[],
                        metavar="INFILE",
                        help="Hit table to process")
    parser.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        metavar="OUTFILE",
        help="Write masked fasta output to OUTFILE (default is STDOUT).")
    parser.add_argument(
        "-i",
        "--infile",
        dest="fasta",
        metavar="FILE",
        help=" File containing the fasta (defaults to STDIN)")
    parser.add_argument(
        "-M",
        "--mask",
        dest="keep",
        default=True,
        action="store_false",
        help="Return unmatched sequence fragments instead of hits.")
    parser.add_argument("-m", "--minLength", dest="minLength", type=int,
                        metavar="BASES", default=1,
                        help="minimum number of bases for sequences in output")
    parser.add_argument(
        "-n",
        "--numbering_prefix",
        default=None,
        help="If given, name extracted sequence with this scring followed "
             "by a sinmple counting index of all extracted sequences. For "
             "example, -n \"r\" would add _r1 to the end of the first "
             "extracted sequence and _r2 to the second, and so on. By "
             "default, extracted sequences are named with start_end "
             "positions.")

    parser.add_argument(
        "-t",
        "--translate",
        default=False,
        action='store_true',
        help="Transalte to Amino Acid sequences")

    add_hit_table_arguments(parser, flags='all')

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check that we have blast file as argument
    if len(arguments.input_files) != 1:
        parser.error(
            "Please supply the name of a hit table as the only argument")
    blastFile = arguments.input_files[0]

    # set up input/output streams
    if arguments.fasta is None:
        fastaHandle = sys.stdin
        fastaStr = 'STDIN'
    else:
        fastaHandle = open(arguments.fasta, "rU")
        fastaStr = arguments.fasta
    logging.info(
        "Extracting sequence fragments from %s based on hits in %s" %
        (fastaStr, blastFile))

    if arguments.outfile is None:
        logging.info("Writing %s sequences to STDOUT" % ('fasta'))
        outputHandle = sys.stdout
    else:
        logging.info(
            "Writing %s sequences to %s" %
            ('fasta', arguments.outfile))
        outputHandle = open(arguments.outfile, 'w')

    # load hit regions
    if arguments.keep:
        minHitLength = arguments.minLength
    else:
        minHitLength = 1
    readHits = loadHitRegions(blastFile, minHitLength, arguments)
    logging.info("Found hits for %d reads" % (len(readHits)))

    # process the fasta file with hits
    extractHits(
        fastaHandle,
        outputHandle,
        readHits,
        arguments.translate,
        arguments.minLength,
        arguments.keep,
        arguments.numbering_prefix)
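
An illustrative sketch of the two fragment-naming schemes the -n option selects between (the helper name is hypothetical; the real naming happens inside extractHits): positional start_end suffixes by default, or a user prefix plus a running index.

import itertools

_counter = itertools.count(1)

def fragment_name(read, start, end, prefix=None):
    # hypothetical stand-in for the naming logic inside extractHits
    if prefix is None:
        return "%s_%d_%d" % (read, start, end)
    return "%s_%s%d" % (read, prefix, next(_counter))

print(fragment_name("read1", 100, 250))            # read1_100_250
print(fragment_name("read2", 10, 80, prefix="r"))  # read2_r1
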
Exemplo n.º 17
0
def main():
    """ set up the command line interface """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-1", "--input_file_1",
                        default=None,
                        type=argparse.FileType('r'),
                        metavar=("INPUT_TABLE_1"),
                        help="Input table 1")
    parser.add_argument("-2", "--input_file_2",
                        default=None,
                        type=argparse.FileType('r'),
                        metavar=("INPUT_TABLE_2"),
                        help="Input table 2")
    parser.add_argument("-m", "--multiplier",
                        default=None,
                        metavar=("MULTIPLIER_TABLE"),
                        help=("Table of values to multiply each sequence. "
                              "EG assembly coverages."))
    parser.add_argument("-T", "--total_reads",
                        default=0,
                        metavar="TOTAL_READS",
                        type=int,
                        help="Total number of reads to expect. (This allows "
                             "the reporting of unknown read count)")
    parser.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        type=argparse.FileType('w'),
        default=sys.stdout,
        metavar="OUTFILE",
        help="Write count table to OUTFILE. (Defaults to STDOUT")
    parser.add_argument(
        "-L",
        "--long_output",
        default=False,
        action="store_true",
        help="Print one number per row (prefixed by two keys) instead "
             "of a table with one seet of keys as column names and one "
             "set as row names.")
    parser.add_argument(
        "-H",
        "--hitCol1",
        dest="hitCol1",
        type=int,
        default=-1,
        help="Index (starting at 0) of column in file 1 with hit name, -1 "
             "is default meaning all columns that are not the read name are "
             "hit names.",
        metavar="HITCOL")
    parser.add_argument(
        "-I",
        "--hitCol2",
        dest="hitCol2",
        type=int,
        default=-1,
        help="Index (starting at 0) of column in file 2 with hit name, -1 "
             "is default meaning all columns that are not the read name "
             "are hit names.",
        metavar="HITCOL")
    parser.add_argument(
        "-S",
        "--skipFirstRow",
        action="store_true",
        default=False,
        help="hit tables have a header row which needs to be skipped")

    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if arguments.input_file_1 is None or arguments.input_file_2 is None:
        parser.error("Please supply two input files")

    logging.info("reading hits from %s", arguments.input_file_1.name)
    hits1 = parseHits(arguments.input_file_1,
                      0,
                      arguments.hitCol1,
                      arguments.skipFirstRow,
                      None)
    logging.info("reading hits from %s", arguments.input_file_2.name)
    hits2 = parseHits(arguments.input_file_2,
                      0,
                      arguments.hitCol2,
                      arguments.skipFirstRow,
                      None)

    hits1 = tupleIteratorToMap(hits1)
    hits2 = tupleIteratorToMap(hits2)

    if arguments.multiplier is not None:
        multipliers = parseMapFile(arguments.multiplier, valueType=float)
    else:
        multipliers = None

    logging.info("counting hits")
    (table, cols) = combine_counts(hits1, hits2, multipliers,
                                   total_reads=arguments.total_reads)

    # print out hit table
    logging.info("printing table to " + arguments.outfile.name)
    print_table(arguments.outfile, table, cols,
                is_multiplied=multipliers is not None,
                long_output=arguments.long_output)
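
A minimal sketch of the two layouts the -L/--long_output flag chooses between (print_table itself is defined elsewhere): the wide form uses one key set as column names and the other as row names; the long form emits one count per row, prefixed by both keys.

table = {"geneA": {"s1": 5, "s2": 2}, "geneB": {"s1": 1}}
cols = ["s1", "s2"]

# wide: rows are one key set, columns the other
print("\t" + "\t".join(cols))
for row in sorted(table):
    print(row + "\t" + "\t".join(str(table[row].get(c, 0)) for c in cols))

# long: one (row, column, count) triple per line
for row in sorted(table):
    for col, count in sorted(table[row].items()):
        print("%s\t%s\t%d" % (row, col, count))
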
Exemplo n.º 18
0
def main():
    description = """
    Take a blast result table and output a subset of hits based on the
    chosen filtering options. If more than one blast file given, use -O
    to get multiple output files, otherwise all output data will be
    concatenated into one output.
    """

    # command line arguments
    parser = argparse.ArgumentParser(
        description=description,
        conflict_handler='resolve')
    add_hit_table_arguments(parser, flags='all')
    parser.add_argument(
        "-o",
        "--outfilenome",
        dest="outfilename",
        default=None,
        metavar="OUTFILENAME",
        help="Write masked fasta output to OUTFILENAME.")
    parser.add_argument(
        '-O',
        '--autoOutName',
        default=False,
        action='store_true',
        help="Automatically generate output file name from input name "
             "and options. Overridden by -o, cannot be used with data "
             "from STDIN.")
    parser.add_argument('-G', '--gff', default=False, action='store_true',
                        help="output GFF format instead of input format")
    parser.add_argument('hit_table', nargs='*',
                        type=argparse.FileType('r'), default=[sys.stdin, ],
                        help="Table of search results to be filtered. "
                             "If absent, data will be read from STDIN")

    add_universal_arguments(parser)

    arguments = parser.parse_args()

    setup_logging(arguments)

    # check that we have blast file as argument

    # if we're not doing auto file names, write all outputs to the same file
    if not arguments.autoOutName:
        if arguments.outfilename is not None:
            logging.info("Writing data to %s" % (arguments.outfilename))
            outfile_handle = open(arguments.outfilename, 'w')
        else:
            logging.info("writing data to STDOUT")
            outfile_handle = sys.stdout

    if arguments.gff:
        logging.info("Converting to GFF")

    # loop over inputs
    for infile_handle in arguments.hit_table:
        logging.info("reading data from %s" % (infile_handle.name))
        if arguments.autoOutName:
            outfile_handle = open(
                getOutputFile(
                    infile_handle.name,
                    arguments),
                'w')

        # filter
        params = FilterParams.create_from_arguments(arguments)
        filterM8(infile_handle, outfile_handle, params, to_gff=arguments.gff)

        if arguments.autoOutName:
            outfile_handle.close()
        infile_handle.close()
Exemplo n.º 19
0
def main():
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(
        parser,
        defaults={
            'filter_top_pct': 0,
            'parseStyle': ACCS,
            'countMethod': 'tophit'},
        choices={
            'countMethod': (
                'tophit',
                'toporg')})
    parser.add_argument(
        "-P",
        "--proportional",
        dest="proportional",
        default=False,
        action="store_true",
        help="Assign reads that have multiple equal top hits to taxa such "
             "that the overal proportion of taxa is consistent with the "
             "unambiguious hits. This is meant for use with the 'toporg' "
             "count method.")
    parser.add_argument(
        "-i",
        "--individualFiles",
        dest="individual",
        default=False,
        action="store_true",
        help="Use this flag to process files independently. Normally, "
             "counts from all files are pooled for making choices.")

    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # load necessary maps
    params = FilterParams.create_from_arguments(arguments)
    if arguments.countMethod == 'toporg':
        (taxonomy, hitStringMap) = readMaps(arguments)

    wta = not arguments.proportional

    if len(arguments.input_files) <= 1 or arguments.individual:
        # loop over input
        for (inhandle, outhandle) in inputIterator(arguments):
            logging.debug(
                "Reading from %s and writing to %s" %
                (inhandle, outhandle))

            m8stream = M8Stream(inhandle)
            if arguments.countMethod == 'tophit':
                # don't give any taxonomy, just map to accessions for
                # redistribution
                readHits = redistribute.pickBestHitByAbundance(
                    m8stream,
                    filterParams=params,
                    returnLines=True,
                    winnerTakeAll=wta,
                    parseStyle=arguments.parseStyle)
            else:
                # translate to organism before finding most abundant
                readHits = redistribute.pickBestHitByAbundance(
                    m8stream,
                    filterParams=params,
                    returnLines=True,
                    winnerTakeAll=wta,
                    taxonomy=taxonomy,
                    hitStringMap=hitStringMap,
                    parseStyle=arguments.parseStyle)

            for line in readHits:
                outhandle.write(line)

    else:
        # process all files at once
        multifile = redistribute.multipleFileWrapper(arguments.input_files)

        # Build a map from input file name to output handle
        outputMap = {}
        for infile_handle in arguments.input_files:
            infile_name = infile_handle.name
            if arguments.output_file is None:
                outputMap[infile_name] = sys.stdout
            elif len(arguments.input_files) <= 1:
                outputMap[infile_name] = open(arguments.output_file, 'w')
            else:
                # use outfileName as suffix
                if arguments.cwd:
                    # strip path info first
                    (infilePath, infileFile) = os.path.split(infile_name)
                    outfile = "./" + infileFile + arguments.output_file
                else:
                    outfile = infile_name + arguments.output_file
                outputMap[infile_name] = open(outfile, 'w')

        if arguments.countMethod == 'tophit':
            # don't give any taxonomy, just map to accessions for
            # redistribution
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=wta,
                parseStyle=arguments.parseStyle)
        else:
            # translate to organism before finding most abundant
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=wta,
                taxonomy=taxonomy,
                hitStringMap=hitStringMap,
                parseStyle=arguments.parseStyle)

        for (read, hit) in readHits:
            infile_name, read = read.split("/", 1)
            outhandle = outputMap[unquote_plus(infile_name)]
            outhandle.write(hit.line.split("/", 1)[1])

        if arguments.output_file is not None:
            for outhandle in outputMap.values():
                outhandle.close()
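
The -P flag toggles how pickBestHitByAbundance (from edl.redistribute) assigns reads whose top hits tie. A toy numeric sketch of the two policies, assuming unambiguous counts of 3:1 between two taxa:

unambiguous = {"taxonA": 3, "taxonB": 1}
tied_reads = 4  # reads whose top hits tie between taxonA and taxonB
total = sum(unambiguous.values())

# winner-take-all (the default): every tied read goes to the most abundant taxon
wta_counts = dict(unambiguous)
wta_counts[max(unambiguous, key=unambiguous.get)] += tied_reads
print(wta_counts)  # {'taxonA': 7, 'taxonB': 1}

# proportional (-P): tied reads are split 3:1, matching the unambiguous ratio
prop_counts = {t: c + tied_reads * c / total for t, c in unambiguous.items()}
print(prop_counts)  # {'taxonA': 6.0, 'taxonB': 2.0}
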
Exemplo n.º 20
0
def main():
    """" Set up the CLI """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input_files", nargs="+",
                        default=[],
                        metavar="INFILE",
                        help="List of hit tables to process")
    parser.add_argument("-o", "--outfile", dest="outfile",
                        metavar="OUTFILE",
                        help="Write count table to OUTFILE")
    parser.add_argument("-r", "--rank", dest="ranks", default=None,
                        metavar="RANK", action="append",
                        help=""" Rank(s) to collect counts on. Use flag
                        multiple times to specify multiple ranks. If multiple values
                        given, one table produced for each with rank name
                        appended to file name. Defaults to all major ranks
                        between phylum and species. Corresponds to rank names
                        in nodes.dmp. To see list run:
                        'cut -f5 nodes.dmp | uniq | sort | uniq'
                        in ncbi tax dir. Will also accept 'organism' to mean
                        no rank (ie, just the organism name).""")
    parser.add_argument(
        "-s",
        "--collapseToDomain",
        default=False,
        action="store_true",
        help="Collapse all taxa below given rank down to "
             "superkingdom/domain. EG: in the genus output, anything "
             "assigned to Cyanobactia, will be lumped in with all "
             "other bacteria")
    parser.add_argument(
            "--proportional",
            dest="proportional",
            default=False,
            action="store_true",
            help="""When using tophit or toporg, redistribute proportionally
            instead of winner take all""")
    parser.add_argument(
        "-R",
        "--printRank",
        dest="printRanks",
        action="append",
        help="Include indeicated rank(s) in lineage of printed taxa. "
             "Will be ignored if beyond the rank of the taxa "
             "(IE We can't include species if the taxon being counted "
             "is genus)")

    # option for deconvoluting clusters or assemblies
    add_weight_arguments(parser, multiple=True)

    # cutoff options
    add_count_arguments(parser)

    # format, tax dir, and more
    add_taxon_arguments(
        parser,
        choices={
            'countMethod': (
                'LCA',
                'all',
                'first',
                'most',
                'tophit',
                'toporg',
                'consensus')})

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if arguments.proportional and \
            arguments.countMethod not in ['tophit', 'toporg']:
        parser.error("--proportinal only has meaning "
                     "if using tophit or toporg")

    if len(arguments.input_files) == 0:
        parser.error("Must supply at least one m8 file to parse")

    # Handle the case where Galaxy tries to set None as a string
    arguments.ranks = checkNoneOption(arguments.ranks)
    arguments.printRanks = checkNoneOption(arguments.printRanks)

    logging.info("Printing out ranks: %r", arguments.ranks)

    # Set defaults and check for some conflicts
    if arguments.ranks is None and arguments.taxdir is None:
        # using hit names only
        arguments.ranks = [ORG_RANK]
        if arguments.printRanks is not None:
            parser.error("Display ranks are not used without taxonomic info")
    else:
        if arguments.taxdir is None:
            parser.error("Cannot select ranks without a taxonomy")
        if arguments.ranks is None:
            # set a default
            arguments.ranks = [
                'phylum',
                'class',
                'order',
                'family',
                'genus',
                'species']

        try:
            # Make sure the rank lists make sense
            arguments.ranks = cleanRanks(arguments.ranks)
            if arguments.printRanks is not None:
                arguments.printRanks = cleanRanks(arguments.printRanks)
        except Exception as e:
            parser.error(str(e))

    # load weights file
    sequenceWeights = loadSequenceWeights(arguments.weights)

    # only print to stdout if there is a single rank
    if len(arguments.ranks) > 1 and arguments.outfile is None:
        parser.error("STDOUT only works if a single rank is chosen!")

    # Because rank is used in parsing hits, we can only do multiple ranks for
    # certain kinds of count methods
    if len(arguments.ranks) > 1:
        rank = None
        if arguments.countMethod in ['consensus', 'most']:
            parser.error(
                "Using multiple ranks does not work with the 'consensus' "
                "or 'most' counting methods. LCA should give the same "
                "results as consensus. If you really want to do this, "
                "use a bash loop:'for rank in phylum order genus; do "
                "COMMAND -r ${rank}; done'")
    else:
        rank = arguments.ranks[0]

    # load necessary maps
    (taxonomy, hitStringMap) = readMaps(arguments)

    # parse input files
    fileCounts = {}
    totals = {}
    fileLabels = {}
    sortedLabels = []

    # Allow for file names to be preceded with TAG=
    for filename in arguments.input_files:
        bits = filename.split("=", 1)
        if len(bits) > 1:
            (filetag, filename) = bits
        else:
            filetag = filename
        fileLabels[filename] = filetag
        # keep order so that column order matches arguments
        sortedLabels.append(filetag)
        fileCounts[filetag] = {}
        totals[filetag] = 0

    params = FilterParams.create_from_arguments(arguments)
    if arguments.countMethod == 'tophit' or arguments.countMethod == 'toporg':
        # Process all files at once and use overall abundance to pick best hits
        from edl import redistribute
        multifile = redistribute.multipleFileWrapper(fileLabels.keys())

        if arguments.countMethod == 'tophit':
            # don't give any taxonomy, just map to accessions for
            # redistribution
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=not arguments.proportional,
                parseStyle=arguments.parseStyle,
                sequenceWeights=sequenceWeights)
            # define method to turn Hits into organisms
            hitTranslator = getHitTranslator(parseStyle=arguments.parseStyle,
                                             taxonomy=taxonomy,
                                             hitStringMap=hitStringMap)

            def translateHit(hit):
                return hitTranslator.translateHit(hit=hit)[0]

        else:
            # translate to organism before finding most abundant
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                returnTranslations=True,
                winnerTakeAll=not arguments.proportional,
                taxonomy=taxonomy,
                hitStringMap=hitStringMap,
                parseStyle=ACCS)

            # Organisms will be returned, make translator trivial:
            translateHit = passThrough

        # use read->file mapping and hit translator to get file based counts
        #  from returned (read,Hit) pairs
        increment = 1
        for (read_name, hit) in readHits:
            file_name, read_name = read_name.split("/", 1)
            file_tag = fileLabels[unquote_plus(file_name)]
            taxon = translateHit(hit)
            taxcount = fileCounts[file_tag].setdefault(taxon, 0)
            if sequenceWeights is not None:
                increment = sequenceWeights.get(read_name, 1)
            fileCounts[file_tag][taxon] = taxcount + increment
            totals[file_tag] += increment
        logging.debug(str(totals))

    else:
        # Original way, just process each file separately
        for (filename, filetag) in fileLabels.items():
            infile = open(filename, 'r')  # 'rU' mode is gone in Python 3.11+

            hitIter = parseM8FileIter(infile,
                                      hitStringMap,
                                      params,
                                      arguments.parseStyle,
                                      arguments.countMethod,
                                      taxonomy=taxonomy,
                                      rank=rank)

            (total, counts, hitMap) = \
                countIterHits(hitIter,
                              allMethod=arguments.allMethod,
                              weights=sequenceWeights)
            fileCounts[filetag] = counts
            totals[filetag] = total

            logging.info(
                "parsed %d hits (%d unique) for %d reads from %s",
                total, len(counts), len(hitMap), filename)

            infile.close()

    printCountTablesByRank(fileCounts, totals, sortedLabels, arguments)
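
For clarity, a small self-contained sketch of the weighted counting used in both branches above: when a weights map is supplied (e.g. assembly coverage per contig, via loadSequenceWeights), each read contributes its weight to the taxon tally instead of 1.

def count_hits(read_taxa, weights=None):
    # sum per-taxon counts, weighting each read if weights are given
    counts, total = {}, 0
    for read, taxon in read_taxa:
        increment = 1 if weights is None else weights.get(read, 1)
        counts[taxon] = counts.get(taxon, 0) + increment
        total += increment
    return counts, total

pairs = [("contig1", "taxonA"), ("contig2", "taxonB")]
print(count_hits(pairs))                            # ({'taxonA': 1, 'taxonB': 1}, 2)
print(count_hits(pairs, weights={"contig1": 9.5}))  # ({'taxonA': 9.5, 'taxonB': 1}, 10.5)
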
Exemplo n.º 21
0
def main():
    description = __doc__
    # pass description by keyword; a bare positional would set prog instead
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    parser.add_argument("-l", "--level", dest="levels", default=None,
                        metavar="LEVEL", action="append",
                        help=""" Level(s) to collect counts on. Use flag
                      multiple times to specify multiple levels. If multiple
                      values given, one table produced for each with rank
                      name appended to file name. Levels can be an integer
                      (1-3) for KEGG or SEED levels, any one of 'gene',
                      'role', 'family',
                      'ko', or 'ortholog' (which are all synonyms), or
                      anything not synonymous with 'gene' to
                      get CAZy groups. Defaults to ortholog/role and
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")
    parser.add_argument(
        '-s',
        '--squash',
        dest='splitForLevels',
        default=True,
        action='store_false',
        help="Don't split assignment rows if gene maps to multiple pathways, "
             "just squash them into one row using python list syntax")

    # format, ortholog hierarchy, and more
    kegg.add_path_arguments(parser)

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            logging.warning("Type: %s", arguments.heirarchyType)
            parser.error("Cannot select levels without a hierarchy (ko) file")
        if arguments.levels is None:
            # set a default
            if arguments.heirarchyType == 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            elif arguments.heirarchyType == 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the level list makes sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            parser.error(str(e))

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            elif cogMapRE.search(firstLine):
                arguments.mapStyle = 'cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s" % (arguments.mapStyle))
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        elif arguments.mapStyle == 'cog':
            valueMap = kegg.parseCogMap(arguments.mapFile)
        else:
            if arguments.parseStyle == hits.GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(
                arguments.mapFile,
                valueType=None,
                keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s" %
                         (len(valueMap), next(iter(valueMap.items()))))
        else:
            logging.warning("Read 0 items into value map!")
    else:
        valueMap = None

    # set up level mapping
    levelMappers = [getLevelMapper(lvl, arguments) for lvl in arguments.levels]

    # parse input files
    for (inhandle, outhandle) in inputIterator(arguments):
        logging.debug(
            "Reading from %s and writing to %s" %
            (inhandle, outhandle))
        hitMapIter = hits.parseM8FileIter(
            inhandle,
            valueMap,
            arguments.hitTableFormat,
            arguments.filterTopPct,
            arguments.parseStyle,
            arguments.countMethod,
            ignoreEmptyHits=arguments.mappedHitsOnly)

        if arguments.levels == [None]:
            arguments.levels = ['Hit']
        outhandle.write("Read\t%s\n" % ('\t'.join(arguments.levels)))
        for read, hitIter in hitMapIter:
            assignments = []
            for hit in hitIter:
                logging.debug("Hit: %s" % (hit))
                assignment = []
                for levelMapper in levelMappers:
                    assignment.append(levelMapper(hit))
                assignments.append(assignment)
            logging.debug("Read %s has %d hits" % (read, len(assignments)))
            for assignment in assignments:
                for assignmentList in handleMultipleMappings(
                        assignment, arguments):
                    outhandle.write(
                        "%s\t%s\n" %
                        (read, "\t".join(assignmentList)))
Exemplo n.º 22
0
def main():
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(parser,
                        defaults={
                            'filter_top_pct': 0,
                            'parseStyle': ACCS,
                            'countMethod': 'tophit'
                        },
                        choices={'countMethod': ('tophit', 'toporg')})
    parser.add_argument(
        "-P",
        "--proportional",
        dest="proportional",
        default=False,
        action="store_true",
        help="Assign reads that have multiple equal top hits to taxa such "
        "that the overal proportion of taxa is consistent with the "
        "unambiguious hits. This is meant for use with the 'toporg' "
        "count method.")
    parser.add_argument(
        "-i",
        "--individualFiles",
        dest="individual",
        default=False,
        action="store_true",
        help="Use this flag to process files independently. Normally, "
        "counts from all files are pooled for making choices.")

    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # load necessary maps
    params = FilterParams.create_from_arguments(arguments)
    if arguments.countMethod == 'toporg':
        (taxonomy, hitStringMap) = readMaps(arguments)

    wta = not arguments.proportional

    if len(arguments.input_files) <= 1 or arguments.individual:
        # loop over input
        for (inhandle, outhandle) in inputIterator(arguments):
            logging.debug("Reading from %s and writing to %s" %
                          (inhandle, outhandle))

            m8stream = M8Stream(inhandle)
            if arguments.countMethod == 'tophit':
                # don't give any taxonomy, just map to accessions for
                # redistribution
                readHits = redistribute.pickBestHitByAbundance(
                    m8stream,
                    filterParams=params,
                    returnLines=True,
                    winnerTakeAll=wta,
                    parseStyle=arguments.parseStyle)
            else:
                # translate to organism before finding most abundant
                readHits = redistribute.pickBestHitByAbundance(
                    m8stream,
                    filterParams=params,
                    returnLines=True,
                    winnerTakeAll=wta,
                    taxonomy=taxonomy,
                    hitStringMap=hitStringMap,
                    parseStyle=arguments.parseStyle)

            for line in readHits:
                outhandle.write(line)

    else:
        # process all files at once
        multifile = redistribute.multipleFileWrapper(arguments.input_files)

        # Build a map from input file name to output handle
        outputMap = {}
        for infile_handle in arguments.input_files:
            infile_name = infile_handle.name
            if arguments.output_file is None:
                outputMap[infile_name] = sys.stdout
            elif len(arguments.input_files) <= 1:
                outputMap[infile_name] = open(arguments.output_file, 'w')
            else:
                # use outfileName as suffix
                if arguments.cwd:
                    # strip path info first
                    (infilePath, infileFile) = os.path.split(infile_name)
                    outfile = "./" + infileFile + arguments.output_file
                else:
                    outfile = infile_name + arguments.output_file
                outputMap[infile_name] = open(outfile, 'w')

        if arguments.countMethod == 'tophit':
            # don't give any taxonomy, just map to accessions for
            # redistribution
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=wta,
                parseStyle=arguments.parseStyle)
        else:
            # translate to organism before finding most abundant
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=wta,
                taxonomy=taxonomy,
                hitStringMap=hitStringMap,
                parseStyle=arguments.parseStyle)

        for (read, hit) in readHits:
            infile_name, read = read.split("/", 1)
            outhandle = outputMap[unquote_plus(infile_name)]
            outhandle.write(hit.line.split("/", 1)[1])

        if arguments.output_file is not None:
            for outhandle in outputMap.values():
                outhandle.close()
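
A small sketch of the read-demultiplexing convention visible above: reads from the pooled stream arrive prefixed with their (quoted) source file name, which is split off and unquoted to route each hit to the right output handle. That the tagging side uses quote_plus is an assumption inferred from the unquote_plus calls here; multipleFileWrapper handles it internally.

from urllib.parse import quote_plus, unquote_plus

def tag_read(file_name, read_name):
    # quoting removes any '/' from the file name, so split('/', 1) is safe
    return quote_plus(file_name) + "/" + read_name

tagged = tag_read("my hits/sample1.m8", "read_42")
file_name, read = tagged.split("/", 1)
print(unquote_plus(file_name), read)  # my hits/sample1.m8 read_42
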
Exemplo n.º 23
0
def main():
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list

    Finally, print out either the hits (that match the target group) for
    these reads or just read names (-r). The -F filter limits which hits
    are used in part (2) as well as which are printed.

    The countMethod (-C) option is not used.
    """
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(
        parser,
        defaults={
            'mapFile': None,
            'parseStyle': ACCS,
            'filter_top_pct': -1,
            'countMethod': 'all',
            'taxdir': None})
    parser.add_argument(
        "-g",
        "--targetTaxonGroup",
        dest="group1",
        default=None,
        metavar="TAXON",
        action='append',
        help="Taxon to identify reads in. Top hits (as defined by "
             "--topHitPct) must be in this group. It can be a taxid, "
             "a name, or a file listing taxids. Use multiple times to "
             "specify a list of organisms. Use -a to specify whether "
             "all or at least one of the top hits must match.")
    parser.add_argument(
        "-a",
        "--any",
        default=False,
        action="store_true",
        help="If specified, accept reads where any top hit is to an organism "
             "in the target taxon/taxa. By default, all top hits must be "
             "in the target group.")
    parser.add_argument(
        '-t',
        '--topHitPct',
        default=0,
        type=float,
        help="How close(as a percentage to the best score a hit must be "
             "to qualify as a top hit. Default is 0, ie must have the best "
             "score. Use 100 to get all hits.")
    parser.add_argument(
        "-G",
        "--outerTaxonGroup",
        dest="group2",
        default=None,
        metavar="TAXON",
        action="append",
        help="Broader taxon to limit reads. All hits (use -F to limit "
             "these hits) must be in the target group or this group. Again, "
             "it can be a taxid, a name, or a file listing taxids. "
             "It can also be inkoved multiple times to choose multiple "
             "groups.")
    parser.add_argument(
        '-r',
        '--reads',
        default=False,
        action="store_true",
        help="Output just read names. By default, print the relevant hit "
             "lines for each read")

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check args
    if arguments.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    if arguments.taxdir is not None:
        taxonomy = readTaxonomy(arguments.taxdir, namesMap=True)
    else:
        taxonomy = None

    group_1_set = get_group_set(arguments.group1, taxonomy)
    group_2_set = get_group_set(arguments.group2, taxonomy)
    logging.debug(
        "Group 1 has %d entries and 439482 in group1 is %s" %
        (len(group_1_set), 439482 in group_1_set))
    if group_2_set is not None:
        logging.debug(
            "Group 2 has %d entries and 439482 in group2 is %s" %
            (len(group_2_set), 439482 in group_2_set))

    # map reads to hits
    if arguments.parseStyle == GIS:
        keyType = int
    else:
        keyType = None
    accToTaxMap = parseMapFile(
        arguments.mapFile,
        valueType=int,
        keyType=keyType)

    # set up some function pointers
    global hitRE
    hitRE = parsingREs.get(arguments.parseStyle, None)
    if arguments.parseStyle == ORGS:
        getTaxid = _getOrgTaxid
    elif arguments.parseStyle == HITID:
        getTaxid = _getHitidTaxid
    elif arguments.parseStyle == HITDESC:
        getTaxid = _getHitdescTaxid
    else:
        getTaxid = _getExprTaxid

    # for filtering:
    filterParams = FilterParams.create_from_arguments(arguments)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle, outhandle) in inputIterator(arguments):
        readCount = 0
        goodReadCount = 0
        printCount = 0

        # parse file
        for (
                read,
                hits) in filterM8Stream(
                inhandle,
                filterParams,
                return_lines=False):
            readCount += 1
            bestScore = 0
            hitTaxids = {}
            for hit in hits:
                score = hit.score
                taxids = []
                # does this hit have at least one associated taxid in group2?
                for taxid in getTaxid(hit, accToTaxMap, taxonomy):
                    if taxid is None:
                        break
                    if group_2_set is not None and taxid not in group_2_set:
                        break
                    taxids.append(taxid)
                if len(taxids) == 0:
                    # nothing matched in the wider group
                    break
                hitTaxids[hit] = taxids

                # find the top score
                if score > bestScore:
                    bestScore = score
            else:
                # if we get here, then every hit was in the wider taxon list
                logging.debug(
                    "Checking best hits for %s (top score: %.1f)" %
                    (read, bestScore))
                all_in_group = True  # do all best hits fall in group 1?
                recognized = []
                for hit, taxids in _getBestHitTaxids(
                        hitTaxids, bestScore, arguments.topHitPct):
                    if _anyTaxidInGroup(taxids, group_1_set):
                        logging.debug("%s (%r)  is in group 1" % (hit, taxids))

                        recognized.append(hit)
                    else:
                        logging.debug(
                            "%s (%r) is not in group 1" %
                            (hit, taxids))
                        all_in_group = False
                if len(recognized) == 0:
                    # if none of the best are in our target list, next read
                    logging.debug(
                        "No best hits for %s are in group 1" %
                        (read))
                    continue
                if (not arguments.any) and (not all_in_group):
                    # next read unless user said any or all hits are in list
                    logging.debug(
                        "Not all best hits for %s are in group 1" %
                        (read))
                    continue

                # if we get here, then the read is a match
                goodReadCount += 1
                if arguments.reads:
                    logging.debug("Keeping %s" % (read))
                    outhandle.write(read)
                    outhandle.write('\n')
                else:
                    logging.debug(
                        "Keeping %d hits for %s" %
                        (len(recognized), read))
                    for hit in sorted(
                        recognized,
                        key=lambda h: (
                            h.score,
                            h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount += 1

        if arguments.reads:
            logging.info("Printed %d of %d reads" % (goodReadCount, readCount))
        else:
            logging.info(
                "Printed %d lines for %d of %d reads" %
                (printCount, goodReadCount, readCount))
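
The read-screening loop above relies on Python's for/else: the else block runs only when the loop over hits completes without a break, i.e. when every hit passed the group-2 check. A self-contained sketch of the pattern:

def all_hits_allowed(hits, allowed):
    for hit in hits:
        if hit not in allowed:
            break  # disqualifies the read and skips the else block
    else:
        return True  # reached only when no break occurred
    return False

print(all_hits_allowed(["a", "b"], {"a", "b", "c"}))  # True
print(all_hits_allowed(["a", "x"], {"a", "b", "c"}))  # False
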
Exemplo n.º 24
0
def main():
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list

    Finally, print out either the hits (that match the target group) for
    these reads or just read names (-r). The -F filter limits which hits
    are used in part (2) as well as which are printed.

    The countMethod (-C) option is not used.
    """
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(
        parser, defaults={"mapFile": None, "parseStyle": ACCS, "filterPct": -1, "countMethod": "all", "taxdir": None}
    )
    parser.add_argument(
        "-g",
        "--targetTaxonGroup",
        dest="group1",
        default=None,
        metavar="TAXON",
        action="append",
        help="Taxon to identify reads in. Top hits (as defined by "
        "--topHitPct) must be in this group. It can be a taxid, "
        "a name, or a file listing taxids. Use multiple times to "
        "specify a list of organisms. Use -a to specify whether "
        "all or at least one of the top hits must match.",
    )
    parser.add_argument(
        "-a",
        "--any",
        default=False,
        action="store_true",
        help="If specified, accept reads where any top hit is to an organism "
        "in the target taxon/taxa. By default, all top hits must be "
        "in the target group.",
    )
    parser.add_argument(
        "-t",
        "--topHitPct",
        default=0,
        type=float,
        help="How close(as a percentage to the best score a hit must be "
        "to qualify as a top hit. Default is 0, ie must have the best "
        "score. Use 100 to get all hits.",
    )
    parser.add_argument(
        "-G",
        "--outerTaxonGroup",
        dest="group2",
        default=None,
        metavar="TAXON",
        action="append",
        help="Broader taxon to limit reads. All hits (use -F to limit "
        "these hits) must be in the target group or this group. Again, "
        "it can be a taxid, a name, or a file listing taxids. "
        "It can also be inkoved multiple times to choose multiple "
        "groups.",
    )
    parser.add_argument(
        "-r",
        "--reads",
        default=False,
        action="store_true",
        help="Output just read names. By default, print the relevant hit " "lines for each read",
    )

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check args
    if arguments.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    if arguments.taxdir is not None:
        taxonomy = readTaxonomy(arguments.taxdir, namesMap=True)
    else:
        taxonomy = None

    group_1_set = get_group_set(arguments.group1, taxonomy)
    group_2_set = get_group_set(arguments.group2, taxonomy)
    logging.debug("Group 1 has %d entries and 439482 in group1 is %s" % (len(group_1_set), 439482 in group_1_set))
    if group_2_set is not None:
        logging.debug("Group 2 has %d entries and 439482 in group2 is %s" % (len(group_2_set), 439482 in group_2_set))

    # map reads to hits
    if arguments.parseStyle == GIS:
        keyType = int
    else:
        keyType = None
    accToTaxMap = parseMapFile(arguments.mapFile, valueType=int, keyType=keyType)

    # set up some function pointers
    global hitRE
    hitRE = parsingREs.get(arguments.parseStyle, None)
    if arguments.parseStyle == ORGS:
        getTaxid = _getOrgTaxid
    elif arguments.parseStyle == HITID:
        getTaxid = _getHitidTaxid
    elif arguments.parseStyle == HITDESC:
        getTaxid = _getHitdescTaxid
    else:
        getTaxid = _getExprTaxid

    # for filtering:
    filterParams = FilterParams.create_from_arguments(arguments)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle, outhandle) in inputIterator(arguments):
        readCount = 0
        goodReadCount = 0
        printCount = 0

        # parse file
        for (read, hits) in filterM8Stream(inhandle, filterParams, returnLines=False):
            readCount += 1
            bestScore = 0
            hitTaxids = {}
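            # NOTE: for/else below: if any hit has no acceptable taxid,
            # the outer 'break' skips this read entirely; the else clause
            # only runs when the loop finishes without breaking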
            for hit in hits:
                score = hit.score
                taxids = []
                # does this hit have at least one associated taxid in group2?
                for taxid in getTaxid(hit, accToTaxMap, taxonomy):
                    if taxid is None:
                        break
                    if group_2_set is not None and taxid not in group_2_set:
                        break
                    taxids.append(taxid)
                if len(taxids) == 0:
                    # nothing matched in the wider group
                    break
                hitTaxids[hit] = taxids

                # find the top score
                if score > bestScore:
                    bestScore = score
            else:
                # if we get here, every hit was in the wider taxon list
                logging.debug("Checking best hits for %s (top score: %.1f)" % (read, bestScore))
                all_in_group = True
                recognized = []
                for hit, taxids in _getBestHitTaxids(hitTaxids, bestScore, arguments.topHitPct):
                    if _anyTaxidInGroup(taxids, group_1_set):
                        logging.debug("%s (%r) is in group 1" % (hit, taxids))
                        recognized.append(hit)
                    else:
                        logging.debug("%s (%r) is not in group 1" % (hit, taxids))
                        all_in_group = False
                if len(recognized) == 0:
                    # if none of the best are in our target list, next read
                    logging.debug("No best hits for %s are in group 1" % (read))
                    continue
                if (not arguments.any) and (not all_in_group):
                    # next read unless user said any or all hits are in list
                    logging.debug("Not all best hits for %s are in group 1" % (read))
                    continue

                # if we get here, then the read is a match
                goodReadCount += 1
                if arguments.reads:
                    logging.debug("Keeping %s" % (read))
                    outhandle.write(read)
                    outhandle.write("\n")
                else:
                    logging.debug("Keeping %d hits for %s" % (len(recognized), read))
                    for hit in sorted(recognized, key=lambda h: (h.score, h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount += 1

        if arguments.reads:
            logging.info("Printed %d of %d reads" % (goodReadCount, readCount))
        else:
            logging.info("Printed %d lines for %d of %d reads" % (printCount, goodReadCount, readCount))
Exemplo n.º 25
0
def main():
    # set up CLI
    description = __doc__

    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("-i", "--infile", dest="infile",
                        metavar="FILE", help="Read raw table from INFILE")
    parser.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        metavar="OUTFILE",
        help="Write collapsed table to OUTFILE")
    parser.add_argument("-d", "--delim", dest="delim", default="\t",
                        help="Input table delimiter", metavar="DELIM")
    parser.add_argument("-D", "--delimOut", dest="delimOut", default="\t",
                        help="Output table delimiter", metavar="DELIM")
    parser.add_argument(
        '-F',
        '--countFirst',
        action='store_true',
        default=False,
        help="Don't skip the first line, it's NOT a header")
    parser.add_argument(
        "-R",
        "--readColumn",
        dest="readCol",
        type=int,
        default=0,
        help="Index (starting at 0) of column with read name, 0 is default",
        metavar="READCOL")
    parser.add_argument(
        "-H",
        "--hitColumn",
        dest="hitCol",
        type=int,
        default=2,
        help="Index (starting at 0) of column with hit name (for counting), "
             "2 is default, if less than zero, all (non-read) columns will "
             "be used as multiple hits",
        metavar="HITCOL")
    parser.add_argument(
        '-s',
        '--hitSep',
        default=None,
        help="Use this string to split multiple values in single hit cell. "
             "Default is 'None' to leave hits as is, use 'eval' to parse "
             "as python repr strings")
    add_weight_arguments(parser, multiple=False)
    parser.add_argument("-T", "--total", default=False, action="store_true",
                        help="Report 'Total' in the first row")

    # cutoff options
    add_count_arguments(parser, {'cutoff': 0})
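    # 'cutoff' is a fraction of the total read count: hits whose count
    # falls below cutoff * readCount are dropped after counting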

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # make sure we have something to do
    if (arguments.infile is None):
        logging.info("Reading table from: STDIN")
    else:
        logging.info("Reading table from: " + arguments.infile)

    if (arguments.outfile is None):
        logging.info("Writing counts to: STDOUT")
    else:
        logging.info("Writing counts to: " + arguments.outfile)

    # process arguments
    takeFirst = (arguments.allMethod == 'first')
    splitHits = (arguments.hitSep is not None and arguments.hitSep != 'None')
    uncluster = (arguments.weights is not None)

    if arguments.hitSep == 'eval':
        parser.error("Sorry, parsing with eval is not yet supported!")

    # inform the curious user
    logging.info("Delimiter: '" + arguments.delim)
    logging.info("Read names in col: '" + str(arguments.readCol))
    logging.info("Hit names in col: '" + str(arguments.hitCol))
    if splitHits:
        logging.info("Splitting hits with: %s" % (arguments.hitSep))
        logging.warning(
            "Splitting hits has not been tested yet! Let me know how it goes.")
    if takeFirst:
        logging.info("Taking first hit for each read.")
    else:
        if arguments.allMethod == 'portion':
            logging.info("Dividing count among all hits for each read.")
        else:
            logging.info("Adding 1 to every hit for each read")
    if uncluster:
        logging.info(
            "Getting read cluster sizes from: %s" %
            (arguments.weights))
    if arguments.countFirst:
        logging.info("First line is data")
    else:
        logging.info("Skipping first line")

    # Do the counting!
    counts = {}
    countHitsForRead = getAllMethod(arguments.allMethod)
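    # countHitsForRead applies the chosen scheme: 'all' adds 1 per hit,
    # 'first' counts only the first hit, 'portion' splits 1/N among the
    # read's N hits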

    clusteredReadCounts = {}
    if uncluster:
        clusteredReadCounts = parseMapFile(
            arguments.clusterFile, valueType=int)

    currentRead = ''
    readCount = 1
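    # start counting at 1: the increment below only fires on a read
    # *change*, so the first read is counted here and the last one is
    # flushed after the loop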
    hits = []

    if arguments.infile is None:
        infile = sys.stdin
    else:
        infile = open(arguments.infile)

    # loop over lines
    if not arguments.countFirst:
        # skip first line
        try:
            next(infile)
        except StopIteration:
            raise Exception("No lines in %s" % str(infile))

    for line in infile:
        line = line.rstrip('\r\n')
        rowcells = line.split(arguments.delim)
        # get read
        read = rowcells[arguments.readCol]

        # if it's a new read, process previous read
        if currentRead == '':
            currentRead = read
        elif read != currentRead:
            readCount += 1
            logging.info("Checking hits for %s" % currentRead)

            # was it part of a cluster?
            multiplier = 1
            if uncluster:
                multiplier = clusteredReadCounts[currentRead]

            # where does the count for this read go
            countHitsForRead(hits, counts, multiplier=multiplier)

            hits = []
            currentRead = read

        # get hit from this line
        if arguments.hitCol >= 0:
            hit = rowcells[arguments.hitCol]
            if splitHits:
                hits.extend(hit.split(arguments.hitSep))
            else:
                hits.append(hit)
        else:
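            # negative hitCol: drop the read column and treat every
            # remaining column as a separate hit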
            rowcells.pop(arguments.readCol)
            hits.extend(rowcells)

    # check last read!
    logging.info("Checking hits for %s" % currentRead)
    # was it part of a cluster?
    multiplier = 1
    if uncluster:
        multiplier = clusteredReadCounts[currentRead]
    # where does the count for this read go
    countHitsForRead(hits, counts, multiplier=multiplier)

    # apply cutoff
    if arguments.cutoff > 0:
        applyFractionalCutoff(counts, threshold=arguments.cutoff * readCount)

    # print output
    if arguments.outfile is None:
        outhandle = sys.stdout
    else:
        outhandle = open(arguments.outfile, 'w')

    if arguments.total:
        outhandle.write("Total%s%d\n" % (arguments.delimOut, readCount))

    if arguments.allMethod == 'portion':
        outFmtString = "%s%s%f\n"
    else:
        outFmtString = "%s%s%d\n"

    # escape the delimiter so regex metacharacters (e.g. '|') match literally
    delimRE = re.compile(re.escape(arguments.delimOut))
    for hit in sorted(counts.keys()):
        count = counts[hit]
        hit = delimRE.sub('_', hit)
        outhandle.write(outFmtString % (hit, arguments.delimOut, count))
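
# applyFractionalCutoff is imported from the package's utilities and is
# not shown here. Judging only from its call site above (a threshold of
# cutoff * readCount applied to the counts dict), a plausible sketch;
# this is an assumption, not the package's actual implementation:
def applyFractionalCutoff(counts, threshold):
    """ Remove hits whose count falls below threshold, in place """
    for hit in list(counts.keys()):
        if counts[hit] < threshold:
            del counts[hit]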