Exemplo n.º 1
0
def main(argv=None):
    """Script main: assign reads to UMI groups.

    Parses command line options in sys.argv, unless *argv* is given.

    Reads are taken from the input file (standard input is rejected) and
    bundled by ``umi_methods.get_bundles``. The UMIs of each bundle are
    clustered with ``network.UMIClusterer`` and every resulting cluster
    gets the next consecutive integer id. Depending on the options:

    * ``--output-bam``: every read is written with a ``UG`` tag (the
      group id) and the ``--umi-group-tag`` tag (first UMI of the group).
      Unless ``options.no_sort_output`` is set, reads go to a temporary
      file that is sorted into the final destination with ``pysam.sort``
      and then deleted.
    * ``--group-out``: one tab-separated line per read is written to the
      named file.

    NOTE(review): options such as ``in_sam``, ``out_sam``, ``chrom``,
    ``per_gene``, ``gene_tag``, ``method``, ``threshold`` and
    ``no_sort_output`` are not declared in this function; presumably
    ``U.Start`` adds them -- confirm.

    Raises:
        ValueError: if the input is standard input.
        AssertionError: if an output file is named without
            ``--output-bam``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "group-specific options")

    group.add_option(
        "--group-out",
        dest="tsv",
        type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)

    group.add_option(
        "--output-bam",
        dest="output_bam",
        action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              "[default=%default]"))

    group.add_option(
        "--output-unmapped",
        dest="output_unmapped",
        action="store_true",
        default=False,
        help=("Retain all unmapped reads in output[default=%default]"))

    parser.add_option("--umi-group-tag",
                      dest="umi_group_tag",
                      type="string",
                      help="tag for the outputted umi group",
                      default='BX')

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    # Work out the final destination first ("-" means standard output).
    if options.stdout != sys.stdout:
        final_out_name = options.stdout.name
        options.stdout.close()
        # NOTE(review): assert statements are stripped under ``python -O``;
        # kept as an assert so the exception type seen by callers is
        # unchanged.
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        final_out_name = "-"

    # A temporary file is only needed when alignments are really written
    # AND have to be sorted afterwards. Previously a temporary filename
    # was requested even without --output-bam and then never cleaned up.
    sort_output = options.output_bam and not options.no_sort_output

    if sort_output:
        out_name = U.getTempFilename()
        sorted_out_name = final_out_name
        # output format handed to "samtools sort -O"
        sort_format = "sam" if options.out_sam else "bam"
    else:
        out_name = final_out_name

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "wh" if options.out_sam else "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join([
            "read_id", "contig", "position", "gene", "umi", "umi_count",
            "final_umi", "final_umi_count", "unique_id"
        ]))

    # nInput/nOutput count reads; input_reads/output_reads hold the last
    # progress threshold that has been reported.
    nInput, nOutput, unique_id, input_reads, output_reads = 0, 0, 0, 0, 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            # the gene is looked up through the meta-contig tag from here on
            gene_tag = metatag

        else:
            inreads = infile.fetch(until_eof=options.output_unmapped)

    bundle_iterator = umi_methods.get_bundles(
        options,
        all_reads=True,
        return_read2=True,
        return_unmapped=options.output_unmapped,
        metacontig_contig=metacontig2contig)

    for bundle, key, status in bundle_iterator(inreads):

        # write out read2s and unmapped (if these options are set)
        if status == 'single_read':
            # bundle is just a single read here
            nInput += 1

            if outfile:
                outfile.write(bundle)

            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        # report every threshold crossed since the previous bundle
        while nOutput >= output_reads + 10000:
            output_reads += 10000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        for umi_group in groups:
            # the first UMI of a group is used as the group's label
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(
                            map(str,
                                (read.query_name, read.reference_name,
                                 umi_methods.get_read_position(
                                     read, options.soft_clip_threshold)[1],
                                 gene, umi.decode(), counts[umi],
                                 top_umi.decode(), group_count, unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()
        if sort_output:
            # sort the output
            pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
            os.unlink(out_name)  # delete the tempfile

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join([
        "%s: %s" % (x[0], x[1])
        for x in bundle_iterator.read_events.most_common()
    ]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
Exemplo n.º 2
0
def main(argv=None):
    """Script main: assign reads to UMI groups.

    Parses command line options in sys.argv, unless *argv* is given.

    Reads from the input file (standard input is rejected) are bundled by
    ``umi_methods.get_bundles``; each bundle is clustered with
    ``network.ReadClusterer`` and every resulting group gets the next
    consecutive integer id. With ``--output-bam`` each read is written
    with a ``UG`` tag (the group id) and the ``--umi-group-tag`` tag (the
    first UMI of the group); for ``--paired`` input the writing is
    delegated to ``umi_methods.TwoPassPairWriter``. With ``--group-out``
    one tab-separated line per read is written to the named file.

    Raises:
        ValueError: if the input is standard input.
        AssertionError: if an output file is named without
            ``--output-bam``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--in-sam",
                      dest="in_sam",
                      action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option(
        "-o",
        "--out-sam",
        dest="out_sam",
        action="store_true",
        help="Output alignments in sam format [default=%default]",
        default=False)
    parser.add_option("--umi-separator",
                      dest="umi_sep",
                      type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag",
                      dest="umi_tag",
                      type="string",
                      help="tag containing umi",
                      default='RX')
    parser.add_option("--umi-group-tag",
                      dest="umi_group_tag",
                      type="string",
                      help="tag for the outputted umi group",
                      default='BX')
    parser.add_option("--extract-umi-method",
                      dest="get_umi_method",
                      type="choice",
                      choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset",
                      dest="subset",
                      type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique",
                      dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold",
                      dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold",
                      dest="threshold",
                      type="int",
                      default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom",
                      dest="chrom",
                      type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired",
                      dest="paired",
                      action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("adjacency", "directional", "unique",
                               "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--per-contig",
                      dest="per_contig",
                      action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option(
        "--whole-contig",
        dest="whole_contig",
        action="store_true",
        default=False,
        help=
        "Read whole contig before outputting bundles: guarantees that no reads"
        "are missed, but increases memory usage")
    parser.add_option(
        "--read-length",
        dest="read_length",
        action="store_true",
        default=False,
        help=("use read length in addition to position and UMI"
              "to identify possible duplicates [default=%default]"))
    parser.add_option("--mapping-quality",
                      dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option(
        "--group-out",
        dest="tsv",
        type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)
    parser.add_option(
        "--output-bam",
        dest="output_bam",
        action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              "[default=%default]"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
        # NOTE(review): assert statements are stripped under ``python -O``;
        # kept as an assert so the exception type seen by callers is
        # unchanged.
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        # "-" makes pysam write to standard output
        out_name = "-"

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "w" if options.out_sam else "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
        if options.paired:
            outfile = umi_methods.TwoPassPairWriter(infile, outfile, tags=True)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write(
            "read_id\tcontig\tposition\tumi\tumi_count\tfinal_umi\tfinal_umi_count\tunique_id\n"
        )

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    nInput, nOutput, unique_id = 0, 0, 0

    # Last progress thresholds that have been reported. The read counters
    # advance by a whole bundle at a time, so testing "count % N == 0"
    # would only fire by coincidence.
    input_reads, output_reads = 0, 0

    read_events = collections.Counter()

    for bundle, read_events in umi_methods.get_bundles(
            infile,
            read_events,
            ignore_umi=False,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            chrom=options.chrom,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            whole_contig=options.whole_contig,
            read_length=options.read_length,
            umi_getter=umi_getter,
            all_reads=True):

        nInput += sum(bundle[umi]["count"] for umi in bundle)

        # report every threshold crossed since the previous bundle
        while nOutput >= output_reads + 10000:
            output_reads += 10000
            U.debug("Outputted %i" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.debug("Read %i input reads" % input_reads)

        # set up ReadCluster functor with methods specific to
        # specified options.method
        processor = network.ReadClusterer(options.method)

        bundle, groups, counts = processor(bundle=bundle,
                                           threshold=options.threshold,
                                           stats=True,
                                           deduplicate=False)

        for umi_group in groups:
            # the first UMI of a group is used as the group's label
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        if options.paired:
                            # if paired, we need to supply the tags to
                            # add to the paired read
                            outfile.write(read, unique_id, top_umi)

                        else:
                            # Add the 'UG' tag to the read
                            read.tags += [('UG', unique_id)]
                            read.tags += [(options.umi_group_tag, top_umi)]
                            outfile.write(read)

                    if options.tsv:
                        mapping_outfile.write("%s\n" % "\t".join(
                            map(str, (read.query_name, read.reference_name,
                                      umi_methods.get_read_position(
                                          read, options.soft)[1], umi.decode(),
                                      counts[umi], top_umi.decode(),
                                      group_count, unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info(
        "Reads: %s" %
        ", ".join(["%s: %s" % (x[0], x[1])
                   for x in read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
Exemplo n.º 3
0
def main(argv=None):
    """Script main: assign reads to UMI groups.

    Parses command line options in sys.argv, unless *argv* is given.

    Reads from the input file (standard input is rejected) are bundled by
    ``umi_methods.get_bundles``; the UMIs of each bundle are clustered
    with ``network.UMIClusterer`` and every resulting group gets the next
    consecutive integer id. With ``--output-bam`` each read is written
    with a ``UG`` tag (the group id) and the ``--umi-group-tag`` tag (the
    first UMI of the group). With ``--group-out`` one tab-separated line
    per read is written to the named file.

    Raises:
        ValueError: if the input is standard input, or if ``--per-gene``
            is given without ``--gene-transcript-map``.
        AssertionError: if an output file is named without
            ``--output-bam``.
    """
    # Local import: only needed for the empty read-event fallback below.
    import collections

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--in-sam",
                      dest="in_sam",
                      action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option(
        "-o",
        "--out-sam",
        dest="out_sam",
        action="store_true",
        help="Output alignments in sam format [default=%default]",
        default=False)
    parser.add_option("--umi-separator",
                      dest="umi_sep",
                      type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag",
                      dest="umi_tag",
                      type="string",
                      help="tag containing umi",
                      default='RX')
    parser.add_option("--umi-group-tag",
                      dest="umi_group_tag",
                      type="string",
                      help="tag for the outputted umi group",
                      default='BX')
    parser.add_option("--extract-umi-method",
                      dest="get_umi_method",
                      type="choice",
                      choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset",
                      dest="subset",
                      type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique",
                      dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold",
                      dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold",
                      dest="threshold",
                      type="int",
                      default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom",
                      dest="chrom",
                      type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired",
                      dest="paired",
                      action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("adjacency", "directional", "unique",
                               "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--per-contig",
                      dest="per_contig",
                      action="store_true",
                      default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene",
                      dest="per_gene",
                      action="store_true",
                      default=False,
                      help=("Deduplicate per gene,"
                            "e.g for transcriptome where contig = transcript"
                            "must also provide a transript to gene map with"
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map",
                      dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes (tab separated)",
                      default=None)
    parser.add_option("--gene-tag",
                      dest="gene_tag",
                      type="string",
                      help=("Deduplicate per gene where gene is"
                            "defined by this bam tag [default=%default]"),
                      default=None)
    parser.add_option(
        "--read-length",
        dest="read_length",
        action="store_true",
        default=False,
        help=("use read length in addition to position and UMI"
              "to identify possible duplicates [default=%default]"))
    parser.add_option("--mapping-quality",
                      dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option(
        "--output-unmapped",
        dest="output_unmapped",
        action="store_true",
        default=False,
        help=("Retain all unmapped reads in output[default=%default]"))
    parser.add_option(
        "--group-out",
        dest="tsv",
        type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)
    parser.add_option(
        "--output-bam",
        dest="output_bam",
        action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              "[default=%default]"))
    # The default must be an alternation group. The previous value
    # "^[__|Unassigned]" was a character class, i.e. it matched any tag
    # starting with one of the characters "_|Unasiged" (for instance a
    # gene called "actb") instead of the prefixes "__" or "Unassigned".
    parser.add_option(
        "--skip-tags-regex",
        dest="skip_regex",
        type="string",
        help=("Used with --gene-tag. "
              "Ignore reads where the gene-tag matches this regex"),
        default="^(__|Unassigned)")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
        # NOTE(review): assert statements are stripped under ``python -O``;
        # kept as an assert so the exception type seen by callers is
        # unchanged.
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        # "-" makes pysam write to standard output
        out_name = "-"

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "wh" if options.out_sam else "wb"

    if options.per_gene:
        if not options.gene_transcript_map:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map")

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join([
            "read_id", "contig", "position", "gene", "umi", "umi_count",
            "final_umi", "final_umi_count", "unique_id"
        ]))

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    nInput, nOutput, unique_id = 0, 0, 0

    # Last progress thresholds that have been reported. The read counters
    # advance by a whole bundle at a time, so testing "count % N == 0"
    # would only fire by coincidence.
    input_reads, output_reads = 0, 0

    # Fallback so the footer can still be written when get_bundles yields
    # nothing; otherwise the name would be unbound after the loop.
    read_events = collections.Counter()

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
        gene_tag = options.gene_tag
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            # the gene is looked up through the meta-contig tag from here on
            gene_tag = metatag

        else:
            inreads = infile.fetch(until_eof=options.output_unmapped)
            gene_tag = options.gene_tag

    for bundle, read_events, status in umi_methods.get_bundles(
            inreads,
            ignore_umi=False,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            gene_tag=gene_tag,
            skip_regex=options.skip_regex,
            read_length=options.read_length,
            umi_getter=umi_getter,
            all_reads=True,
            return_read2=True,
            return_unmapped=options.output_unmapped):

        # write out read2s and unmapped if option set
        if status == 'single_read':
            # bundle is just a single read here; outfile is None when
            # --output-bam was not requested, so it has to be guarded
            if outfile:
                outfile.write(bundle)
            nInput += 1
            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        # report every threshold crossed since the previous bundle
        while nOutput >= output_reads + 10000:
            output_reads += 10000
            U.debug("Outputted %i" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.debug("Read %i input reads" % input_reads)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        for umi_group in groups:
            # the first UMI of a group is used as the group's label
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(
                            map(str,
                                (read.query_name, read.reference_name,
                                 umi_methods.get_read_position(
                                     read, options.soft)[1],
                                 gene, umi.decode(), counts[umi],
                                 top_umi.decode(), group_count, unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info(
        "Reads: %s" %
        ", ".join(["%s: %s" % (x[0], x[1])
                   for x in read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
Exemplo n.º 4
0
def main(argv=None):
    """Script main: assign reads to UMI groups.

    Parses command line options in sys.argv, unless *argv* is given.

    Reads are taken from the input file (standard input is rejected) and
    bundled by ``umi_methods.get_bundles``. The UMIs of each bundle are
    clustered with ``network.UMIClusterer`` and every resulting cluster
    gets the next consecutive integer id. Depending on the options:

    * ``--output-bam``: every read is written with a ``UG`` tag (the
      group id) and the ``--umi-group-tag`` tag (first UMI of the group).
      Unless ``options.no_sort_output`` is set, reads go to a temporary
      file that is sorted into the final destination with ``pysam.sort``
      and then deleted.
    * ``--group-out``: one tab-separated line per read is written to the
      named file.

    NOTE(review): options such as ``in_sam``, ``out_sam``, ``chrom``,
    ``per_gene``, ``gene_tag``, ``method``, ``threshold`` and
    ``no_sort_output`` are not declared in this function; presumably
    ``U.Start`` adds them -- confirm.

    Raises:
        ValueError: if the input is standard input.
        AssertionError: if an output file is named without
            ``--output-bam``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "group-specific options")

    group.add_option("--group-out", dest="tsv", type="string",
                     help="Outfile name for file mapping read id to read group",
                     default=None)

    group.add_option("--output-bam", dest="output_bam", action="store_true",
                     default=False,
                     help=("output a bam file with read groups tagged using the UG tag"
                           "[default=%default]"))

    group.add_option("--output-unmapped", dest="output_unmapped", action="store_true",
                     default=False,
                     help=("Retain all unmapped reads in output[default=%default]"))

    parser.add_option("--umi-group-tag", dest="umi_group_tag",
                      type="string", help="tag for the outputted umi group",
                      default='BX')

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    # Resolve the final destination first ("-" means standard output).
    if options.stdout != sys.stdout:
        destination = options.stdout.name
        options.stdout.close()
        # NOTE(review): assert statements are stripped under ``python -O``;
        # kept as an assert so the exception type seen by callers is
        # unchanged.
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        destination = "-"

    # Only ask for a temporary file when alignments are actually written
    # and need sorting. Previously a temporary filename was also requested
    # for runs without --output-bam and was then never removed.
    needs_sorting = options.output_bam and not options.no_sort_output

    if needs_sorting:
        out_name = U.getTempFilename()
        sorted_out_name = destination
        # output format handed to "samtools sort -O"
        sort_format = "sam" if options.out_sam else "bam"
    else:
        out_name = destination

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "wh" if options.out_sam else "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join(
            ["read_id", "contig", "position", "gene", "umi", "umi_count",
             "final_umi", "final_umi_count", "unique_id"]))

    # nInput/nOutput count reads; input_reads/output_reads hold the last
    # progress threshold that has been reported.
    nInput, nOutput, unique_id, input_reads, output_reads = 0, 0, 0, 0, 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig, metatag)
            # the gene is looked up through the meta-contig tag from here on
            gene_tag = metatag

        else:
            inreads = infile.fetch(until_eof=options.output_unmapped)

    bundle_iterator = umi_methods.get_bundles(
        options,
        all_reads=True,
        return_read2=True,
        return_unmapped=options.output_unmapped,
        metacontig_contig=metacontig2contig)

    for bundle, key, status in bundle_iterator(inreads):

        # write out read2s and unmapped (if these options are set)
        if status == 'single_read':
            # bundle is just a single read here
            nInput += 1

            if outfile:
                outfile.write(bundle)

            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        # report every threshold crossed since the previous bundle
        while nOutput >= output_reads + 10000:
            output_reads += 10000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(
            umis,
            counts,
            threshold=options.threshold)

        for umi_group in groups:
            # the first UMI of a group is used as the group's label
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(map(str, (
                            read.query_name, read.reference_name,
                            umi_methods.get_read_position(
                                read, options.soft_clip_threshold)[1],
                            gene,
                            umi.decode(),
                            counts[umi],
                            top_umi.decode(),
                            group_count,
                            unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()
        if needs_sorting:
            # sort the output
            pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
            os.unlink(out_name)  # delete the tempfile

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info(
        "Reads: %s" % ", ".join(["%s: %s" % (x[0], x[1]) for x in
                                 bundle_iterator.read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()