def crossTabulateHits(hitFile1, hitFile2,
                      skipHeader1=True, skipHeader2=True,
                      readCol1=0, readCol2=0,
                      hitCol1=-1, hitCol2=-1,
                      hitSep1=None, hitSep2=None):
    """
    Given two hit tables, cross reference reads to build a 2D DataFrame
    of hit counts.

    The first table is loaded completely into a read -> hits map; the
    second table is then streamed and every read found in both tables
    adds one count for each (file 1 hit, file 2 hit) pair.

    Parameters:
        hitFile1, hitFile2: the two hit tables, handed to parseHits()
        skipHeader1, skipHeader2: header flag handed to parseHits()
        readCol1, readCol2: read-name column handed to parseHits()
        hitCol1, hitCol2: hit column handed to parseHits()
        hitSep1, hitSep2: hit separator handed to parseHits()

    Returns:
        pandas.DataFrame built from a dict of dicts keyed first by the
        hit from file 2 and then by the hit from file 1, i.e. file 2
        hits become the columns and file 1 hits become the index.
    """
    # get a map from reads to hits in file 1
    hitmap = tupleIteratorToMap(
        parseHits(hitFile1, readCol1, hitCol1, skipHeader1, hitSep1))

    # get an iterator over hits in file 2
    crossHits = parseHits(hitFile2, readCol2, hitCol2, skipHeader2, hitSep2)

    counts = {}
    readsOnlyIn2 = []
    readMatchCount = 0
    for read, file2Hits in crossHits:
        # pop() removes the read from the file 1 map, so whatever is left
        # in hitmap after the loop was never seen in file 2. It also means
        # a read name repeated in file 2 is matched only the first time;
        # later occurrences are recorded in readsOnlyIn2.
        try:
            file1Hits = hitmap.pop(read)
        except KeyError:
            readsOnlyIn2.append(read)
            continue
        readMatchCount += 1

        for file1Hit in file1Hits:
            for file2Hit in file2Hits:
                file2HitCounts = counts.setdefault(file2Hit, {})
                file2HitCounts[file1Hit] = \
                    file2HitCounts.get(file1Hit, 0) + 1

    # report the unmatched reads, then generate the dataframe
    logger.warning("%d of %d reads only in file 1",
                   len(hitmap), readMatchCount)
    logger.warning("%d of %d reads only in file 2",
                   len(readsOnlyIn2), readMatchCount)
    return pandas.DataFrame(counts)
def crossTabulateHits(hitFile1, hitFile2,
                      skipHeader1=True, skipHeader2=True,
                      readCol1=0, readCol2=0,
                      hitCol1=-1, hitCol2=-1,
                      hitSep1=None, hitSep2=None):
    """
    Given two hit tables, cross reference reads to build a 2D DataFrame
    of hit counts.

    The first table is loaded completely into a read -> hits map; the
    second table is then streamed and every read found in both tables
    adds one count for each (file 1 hit, file 2 hit) pair.

    Parameters:
        hitFile1, hitFile2: the two hit tables, handed to parseHits()
        skipHeader1, skipHeader2: header flag handed to parseHits()
        readCol1, readCol2: read-name column handed to parseHits()
        hitCol1, hitCol2: hit column handed to parseHits()
        hitSep1, hitSep2: hit separator handed to parseHits()

    Returns:
        pandas.DataFrame built from a dict of dicts keyed first by the
        hit from file 2 and then by the hit from file 1, i.e. file 2
        hits become the columns and file 1 hits become the index.
    """
    # get a map from reads to hits in file 1
    hitmap = tupleIteratorToMap(
        parseHits(hitFile1, readCol1, hitCol1, skipHeader1, hitSep1))

    # get an iterator over hits in file 2
    crossHits = parseHits(hitFile2, readCol2, hitCol2, skipHeader2, hitSep2)

    counts = {}
    readsOnlyIn2 = []
    readMatchCount = 0
    for read, file2Hits in crossHits:
        # pop() removes the read from the file 1 map, so whatever is left
        # in hitmap after the loop was never seen in file 2. It also means
        # a read name repeated in file 2 is matched only the first time;
        # later occurrences are recorded in readsOnlyIn2.
        try:
            file1Hits = hitmap.pop(read)
        except KeyError:
            readsOnlyIn2.append(read)
            continue
        readMatchCount += 1

        for file1Hit in file1Hits:
            for file2Hit in file2Hits:
                file2HitCounts = counts.setdefault(file2Hit, {})
                file2HitCounts[file1Hit] = \
                    file2HitCounts.get(file1Hit, 0) + 1

    # report the unmatched reads, then generate the dataframe
    logger.warning("%d of %d reads only in file 1",
                   len(hitmap), readMatchCount)
    logger.warning("%d of %d reads only in file 2",
                   len(readsOnlyIn2), readMatchCount)
    return pandas.DataFrame(counts)
def main():
    """
    Set up the command line interface and run the hit count.

    Parses the command line, turns each of the two input hit tables into
    a map with tupleIteratorToMap(parseHits(...)), optionally loads a
    table of multipliers, combines everything with combine_counts() and
    writes the result with print_table().
    """
    parser = argparse.ArgumentParser(description=__doc__)

    # NOTE: registering "-1"/"-2" as option flags makes argparse treat
    # anything that looks like a negative number as an option, so an
    # explicit negative column index has to be written as "--hitCol1=-1"
    # (or "-H-1") rather than "--hitCol1 -1".
    parser.add_argument("-1", "--input_file_1",
                        default=None,
                        type=argparse.FileType('r'),
                        metavar="INPUT_TABLE_1",
                        help="Input table 1")
    parser.add_argument("-2", "--input_file_2",
                        default=None,
                        type=argparse.FileType('r'),
                        metavar="INPUT_TABLE_2",
                        help="Input table 2")
    parser.add_argument("-m", "--multiplier",
                        default=None,
                        metavar="MULTIPLIER_TABLE",
                        help=("Table of values to multiply each sequence. "
                              "EG assembly coverages."))
    parser.add_argument("-T", "--total_reads",
                        default=0,
                        metavar="TOTAL_READS",
                        type=int,
                        help="Total number of reads to expect. (This allows "
                             "the reporting of unknown read count)")
    parser.add_argument("-o", "--outfile",
                        dest="outfile",
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        metavar="OUTFILE",
                        help="Write count table to OUTFILE. "
                             "(Defaults to STDOUT)")
    parser.add_argument("-L", "--long_output",
                        default=False,
                        action="store_true",
                        help="Print one number per row (prefixed by two "
                             "keys) instead of a table with one set of "
                             "keys as column names and one set as row "
                             "names.")
    parser.add_argument("-H", "--hitCol1",
                        dest="hitCol1",
                        type=int,
                        default=-1,
                        help="Index (starting at 0) of column in file 1 "
                             "with hit name, -1 is default meaning all "
                             "columns that are not the read name are "
                             "hit names.",
                        metavar="HITCOL")
    parser.add_argument("-I", "--hitCol2",
                        dest="hitCol2",
                        type=int,
                        default=-1,
                        help="Index (starting at 0) of column in file 2 "
                             "with hit name, -1 is default meaning all "
                             "columns that are not the read name are "
                             "hit names.",
                        metavar="HITCOL")
    parser.add_argument("-S", "--skipFirstRow",
                        action="store_true",
                        default=False,
                        help="hit tables have a header row which needs to "
                             "be skipped")
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # both tables are mandatory even though they are passed as options
    if arguments.input_file_1 is None or arguments.input_file_2 is None:
        parser.error("Please supply two input files")

    # the read name is always taken from column 0 of each table
    logging.info("reading hits from %s", arguments.input_file_1.name)
    hits1 = parseHits(arguments.input_file_1,
                      0,
                      arguments.hitCol1,
                      arguments.skipFirstRow,
                      None)
    logging.info("reading hits from %s", arguments.input_file_2.name)
    hits2 = parseHits(arguments.input_file_2,
                      0,
                      arguments.hitCol2,
                      arguments.skipFirstRow,
                      None)

    hits1 = tupleIteratorToMap(hits1)
    hits2 = tupleIteratorToMap(hits2)

    # multipliers stay None when no multiplier table was given
    if arguments.multiplier is not None:
        multipliers = parseMapFile(arguments.multiplier, valueType=float)
    else:
        multipliers = None

    logging.info("counting hits")
    (table, cols) = combine_counts(hits1, hits2, multipliers,
                                   total_reads=arguments.total_reads)

    # print out hit table
    logging.info("printing table to %s", arguments.outfile.name)
    print_table(arguments.outfile, table, cols,
                is_multiplied=multipliers is not None,
                long_output=arguments.long_output)
def main():
    """
    Set up the command line interface and run the hit count.

    Parses the command line, turns each of the two input hit tables into
    a map with tupleIteratorToMap(parseHits(...)), optionally loads a
    table of multipliers, combines everything with combine_counts() and
    writes the result with print_table().
    """
    parser = argparse.ArgumentParser(description=__doc__)

    # NOTE: registering "-1"/"-2" as option flags makes argparse treat
    # anything that looks like a negative number as an option, so an
    # explicit negative column index has to be written as "--hitCol1=-1"
    # (or "-H-1") rather than "--hitCol1 -1".
    parser.add_argument("-1", "--input_file_1",
                        default=None,
                        type=argparse.FileType('r'),
                        metavar="INPUT_TABLE_1",
                        help="Input table 1")
    parser.add_argument("-2", "--input_file_2",
                        default=None,
                        type=argparse.FileType('r'),
                        metavar="INPUT_TABLE_2",
                        help="Input table 2")
    parser.add_argument("-m", "--multiplier",
                        default=None,
                        metavar="MULTIPLIER_TABLE",
                        help=("Table of values to multiply each sequence. "
                              "EG assembly coverages."))
    parser.add_argument("-T", "--total_reads",
                        default=0,
                        metavar="TOTAL_READS",
                        type=int,
                        help="Total number of reads to expect. (This allows "
                             "the reporting of unknown read count)")
    parser.add_argument("-o", "--outfile",
                        dest="outfile",
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        metavar="OUTFILE",
                        help="Write count table to OUTFILE. "
                             "(Defaults to STDOUT)")
    parser.add_argument("-L", "--long_output",
                        default=False,
                        action="store_true",
                        help="Print one number per row (prefixed by two "
                             "keys) instead of a table with one set of "
                             "keys as column names and one set as row "
                             "names.")
    parser.add_argument("-H", "--hitCol1",
                        dest="hitCol1",
                        type=int,
                        default=-1,
                        help="Index (starting at 0) of column in file 1 "
                             "with hit name, -1 is default meaning all "
                             "columns that are not the read name are "
                             "hit names.",
                        metavar="HITCOL")
    parser.add_argument("-I", "--hitCol2",
                        dest="hitCol2",
                        type=int,
                        default=-1,
                        help="Index (starting at 0) of column in file 2 "
                             "with hit name, -1 is default meaning all "
                             "columns that are not the read name are "
                             "hit names.",
                        metavar="HITCOL")
    parser.add_argument("-S", "--skipFirstRow",
                        action="store_true",
                        default=False,
                        help="hit tables have a header row which needs to "
                             "be skipped")
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # both tables are mandatory even though they are passed as options
    if arguments.input_file_1 is None or arguments.input_file_2 is None:
        parser.error("Please supply two input files")

    # the read name is always taken from column 0 of each table
    logging.info("reading hits from %s", arguments.input_file_1.name)
    hits1 = parseHits(arguments.input_file_1,
                      0,
                      arguments.hitCol1,
                      arguments.skipFirstRow,
                      None)
    logging.info("reading hits from %s", arguments.input_file_2.name)
    hits2 = parseHits(arguments.input_file_2,
                      0,
                      arguments.hitCol2,
                      arguments.skipFirstRow,
                      None)

    hits1 = tupleIteratorToMap(hits1)
    hits2 = tupleIteratorToMap(hits2)

    # multipliers stay None when no multiplier table was given
    if arguments.multiplier is not None:
        multipliers = parseMapFile(arguments.multiplier, valueType=float)
    else:
        multipliers = None

    logging.info("counting hits")
    (table, cols) = combine_counts(hits1, hits2, multipliers,
                                   total_reads=arguments.total_reads)

    # print out hit table
    logging.info("printing table to %s", arguments.outfile.name)
    print_table(arguments.outfile, table, cols,
                is_multiplied=multipliers is not None,
                long_output=arguments.long_output)