Пример #1
0
def crossTabulateHits(hitFile1, hitFile2,
                      skipHeader1=True, skipHeader2=True,
                      readCol1=0, readCol2=0,
                      hitCol1=-1, hitCol2=-1,
                      hitSep1=None, hitSep2=None):
    """
    Given two hit tables, cross reference reads to build a 2D DataFrame
    of hit counts.

    Each table is handed to parseHits() together with its read column,
    hit column, skip-header flag and hit separator (the ``*1`` arguments
    for file 1, the ``*2`` arguments for file 2). File 1 is loaded fully
    into a read -> hits map; file 2 is consumed as an iterator.

    For every read present in both tables, the count for each
    (file 1 hit, file 2 hit) pair is incremented by one.

    Returns a pandas.DataFrame built from a dict of dicts keyed first by
    file 2 hit and then by file 1 hit, i.e. file 2 hits are the columns
    and file 1 hits are the rows.

    Two warnings are logged: the number of reads seen only in file 1 and
    the number seen only in file 2. In both messages the second number is
    the count of reads found in both files.
    """
    # get a map from reads to hits in file 1
    hitmap = tupleIteratorToMap(
        parseHits(hitFile1, readCol1, hitCol1, skipHeader1, hitSep1))
    # get an iterator over hits in file 2
    crossHits = parseHits(hitFile2, readCol2, hitCol2, skipHeader2, hitSep2)

    counts = {}
    readsOnlyIn2 = []
    readMatchCount = 0
    for read, hits2 in crossHits:
        # Matched reads are removed from the file 1 map, so whatever is
        # left in it after the loop was never seen in file 2.
        # NOTE(review): if parseHits can yield the same read twice, the
        # second occurrence is counted as "only in file 2" -- confirm.
        try:
            hits1 = hitmap.pop(read)
        except KeyError:
            readsOnlyIn2.append(read)
            continue
        readMatchCount += 1

        for h1 in hits1:
            for h2 in hits2:
                h2counts = counts.setdefault(h2, {})
                h2counts[h1] = h2counts.get(h1, 0) + 1

    # generate dataframe
    # Logger.warn() is a deprecated alias; warning() is the supported name.
    logger.warning("%d of %d reads only in file 1",
                   len(hitmap), readMatchCount)
    logger.warning("%d of %d reads only in file 2",
                   len(readsOnlyIn2), readMatchCount)
    return pandas.DataFrame(counts)
Пример #2
0
def crossTabulateHits(hitFile1,
                      hitFile2,
                      skipHeader1=True,
                      skipHeader2=True,
                      readCol1=0,
                      readCol2=0,
                      hitCol1=-1,
                      hitCol2=-1,
                      hitSep1=None,
                      hitSep2=None):
    """
    Given two hit tables, cross reference reads to build a 2D DataFrame
    of hit counts.

    Both tables are read through parseHits(), each with its own read
    column, hit column, skip-header flag and hit separator. The first
    table is turned into a read -> hits map up front; the second is
    walked once as an iterator.

    Every read found in both tables adds one to the count of each
    (file 1 hit, file 2 hit) combination for that read.

    Returns a pandas.DataFrame whose columns are the hits from file 2 and
    whose rows are the hits from file 1.

    Logs two warnings giving how many reads appeared only in file 1 and
    only in file 2, each alongside the number of reads found in both.
    """
    # get a map from reads to hits in file 1
    hitmap = tupleIteratorToMap(
        parseHits(hitFile1, readCol1, hitCol1, skipHeader1, hitSep1))
    # get an iterator over hits in file 2
    crossHits = parseHits(hitFile2, readCol2, hitCol2, skipHeader2, hitSep2)

    counts = {}
    readsOnlyIn2 = []
    readMatchCount = 0
    for read, file2Hits in crossHits:
        # pop() consumes the file 1 entry so that the leftovers in hitmap
        # are exactly the reads that never showed up in file 2.
        # NOTE(review): a read repeated in file 2 would land in
        # readsOnlyIn2 on its second occurrence -- confirm parseHits
        # yields each read once.
        try:
            file1Hits = hitmap.pop(read)
        except KeyError:
            readsOnlyIn2.append(read)
            continue
        readMatchCount += 1

        for hit1 in file1Hits:
            for hit2 in file2Hits:
                column = counts.setdefault(hit2, {})
                column[hit1] = column.get(hit1, 0) + 1

    # generate dataframe
    # warning() replaces the deprecated warn() alias; message text is the
    # same as before.
    logger.warning("%d of %d reads only in file 1",
                   len(hitmap), readMatchCount)
    logger.warning("%d of %d reads only in file 2",
                   len(readsOnlyIn2), readMatchCount)
    return pandas.DataFrame(counts)
def main():
    """
    Set up the command line interface, read two hit tables and print the
    combined count table.

    Steps:
     * parse the command line (two input tables, optional multiplier
       table, expected total read count, output file, output layout, hit
       columns, header flag)
     * load each hit table into a read -> hits map; the read name is
       always taken from column 0 and --skipFirstRow applies to both
       tables
     * optionally load per-sequence multipliers as floats
     * combine the counts and write the table to the output file
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-1", "--input_file_1",
                        default=None,
                        type=argparse.FileType('r'),
                        metavar="INPUT_TABLE_1",
                        help="Input table 1")
    parser.add_argument("-2", "--input_file_2",
                        default=None,
                        type=argparse.FileType('r'),
                        metavar="INPUT_TABLE_2",
                        help="Input table 2")
    parser.add_argument("-m", "--multiplier",
                        default=None,
                        metavar="MULTIPLIER_TABLE",
                        help=("Table of values to multiply each sequence. "
                              "EG assembly coverages."))
    parser.add_argument("-T", "--total_reads",
                        default=0,
                        metavar="TOTAL_READS",
                        type=int,
                        help="Total number of reads to expect. (This allows "
                             "the reporting of unknown read count)")
    parser.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        type=argparse.FileType('w'),
        default=sys.stdout,
        metavar="OUTFILE",
        help="Write count table to OUTFILE. (Defaults to STDOUT)")
    parser.add_argument(
        "-L",
        "--long_output",
        default=False,
        action="store_true",
        help="Print one number per row (prefixed by two keys) instead "
             "of a table with one set of keys as column names and one "
             "set as row names.")
    parser.add_argument(
        "-H",
        "--hitCol1",
        dest="hitCol1",
        type=int,
        default=-1,
        help="Index (starting at 0) of column in file 1 with hit name, -1 "
             "is default meaning all columns that are not the read name are "
             "hit names.",
        metavar="HITCOL")
    parser.add_argument(
        "-I",
        "--hitCol2",
        dest="hitCol2",
        type=int,
        default=-1,
        help="Index (starting at 0) of column in file 2 with hit name, -1 "
             "is default meaning all columns that are not the read name "
             "are hit names.",
        metavar="HITCOL")
    parser.add_argument(
        "-S",
        "--skipFirstRow",
        action="store_true",
        default=False,
        help="hit tables have a header row which needs to be skipped")

    # project-wide options and logging configuration (helpers defined
    # elsewhere)
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if arguments.input_file_1 is None or arguments.input_file_2 is None:
        parser.error("Please supply two input files")

    # Build each map right after its log message so the message matches
    # the moment the file is actually consumed, even if parseHits is lazy.
    # File 1 is still read before file 2.
    logging.info("reading hits from %s", arguments.input_file_1.name)
    hits1 = tupleIteratorToMap(
        parseHits(arguments.input_file_1,
                  0,
                  arguments.hitCol1,
                  arguments.skipFirstRow,
                  None))
    logging.info("reading hits from %s", arguments.input_file_2.name)
    hits2 = tupleIteratorToMap(
        parseHits(arguments.input_file_2,
                  0,
                  arguments.hitCol2,
                  arguments.skipFirstRow,
                  None))

    if arguments.multiplier is not None:
        multipliers = parseMapFile(arguments.multiplier, valueType=float)
    else:
        multipliers = None

    logging.info("counting hits")
    (table, cols) = combine_counts(hits1, hits2, multipliers,
                                   total_reads=arguments.total_reads)

    # print out hit table
    logging.info("printing table to %s", arguments.outfile.name)
    print_table(arguments.outfile, table, cols,
                is_multiplied=multipliers is not None,
                long_output=arguments.long_output)
Пример #4
0
def main():
    """
    Set up the command line interface, then cross-count the hits of two
    tables and print the result.

    The two input tables are mandatory. For both of them the read name is
    read from column 0 and the same --skipFirstRow flag is used; the hit
    column is configurable per table. If a multiplier table is given it
    is loaded with float values and passed on to the counting step. The
    resulting table is written to the chosen output file, either as a
    2D table or, with --long_output, one number per row.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-1", "--input_file_1",
                        default=None,
                        type=argparse.FileType('r'),
                        metavar="INPUT_TABLE_1",
                        help="Input table 1")
    parser.add_argument("-2", "--input_file_2",
                        default=None,
                        type=argparse.FileType('r'),
                        metavar="INPUT_TABLE_2",
                        help="Input table 2")
    parser.add_argument("-m", "--multiplier",
                        default=None,
                        metavar="MULTIPLIER_TABLE",
                        help=("Table of values to multiply each sequence. "
                              "EG assembly coverages."))
    parser.add_argument("-T", "--total_reads",
                        default=0,
                        metavar="TOTAL_READS",
                        type=int,
                        help="Total number of reads to expect. (This allows "
                             "the reporting of unknown read count)")
    parser.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        type=argparse.FileType('w'),
        default=sys.stdout,
        metavar="OUTFILE",
        help="Write count table to OUTFILE. (Defaults to STDOUT)")
    parser.add_argument(
        "-L",
        "--long_output",
        default=False,
        action="store_true",
        help="Print one number per row (prefixed by two keys) instead "
             "of a table with one set of keys as column names and one "
             "set as row names.")
    parser.add_argument(
        "-H",
        "--hitCol1",
        dest="hitCol1",
        type=int,
        default=-1,
        help="Index (starting at 0) of column in file 1 with hit name, -1 "
             "is default meaning all columns that are not the read name are "
             "hit names.",
        metavar="HITCOL")
    parser.add_argument(
        "-I",
        "--hitCol2",
        dest="hitCol2",
        type=int,
        default=-1,
        help="Index (starting at 0) of column in file 2 with hit name, -1 "
             "is default meaning all columns that are not the read name "
             "are hit names.",
        metavar="HITCOL")
    parser.add_argument(
        "-S",
        "--skipFirstRow",
        action="store_true",
        default=False,
        help="hit tables have a header row which needs to be skipped")

    # shared options and logging setup come from helpers defined elsewhere
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if arguments.input_file_1 is None or arguments.input_file_2 is None:
        parser.error("Please supply two input files")

    # Each table is turned into a map immediately after its log line, so
    # the "reading" message is accurate even when parseHits returns a
    # lazy iterator. The files are still consumed in the order 1, 2.
    logging.info("reading hits from %s", arguments.input_file_1.name)
    hits1 = tupleIteratorToMap(
        parseHits(arguments.input_file_1,
                  0,
                  arguments.hitCol1,
                  arguments.skipFirstRow,
                  None))
    logging.info("reading hits from %s", arguments.input_file_2.name)
    hits2 = tupleIteratorToMap(
        parseHits(arguments.input_file_2,
                  0,
                  arguments.hitCol2,
                  arguments.skipFirstRow,
                  None))

    if arguments.multiplier is not None:
        multipliers = parseMapFile(arguments.multiplier, valueType=float)
    else:
        multipliers = None

    logging.info("counting hits")
    (table, cols) = combine_counts(hits1, hits2, multipliers,
                                   total_reads=arguments.total_reads)

    # print out hit table
    logging.info("printing table to %s", arguments.outfile.name)
    print_table(arguments.outfile, table, cols,
                is_multiplied=multipliers is not None,
                long_output=arguments.long_output)