def singleBarcodeGenerator(whitelist_tsv):
    with U.openFile(whitelist_tsv, "r") as inf:
        for line in inf:
            if line.startswith('#'):
                continue
            line = line.strip().split("\t")
            yield line[0]


def pairedBarcodeGenerator(whitelist_tsv, whitelist_tsv2):

    whitelist1 = []
    whitelist2 = []

    with U.openFile(whitelist_tsv, "r") as inf:
        for line in inf:
            if line.startswith('#'):
                continue

            line = line.strip().split("\t")
            whitelist1.append(line[0])

    with U.openFile(whitelist_tsv2, "r") as inf2:
        for line in inf2:
            if line.startswith('#'):
                continue

            line = line.strip().split("\t")
            whitelist2.append(line[0])

    for w1, w2 in itertools.product(whitelist1, whitelist2):
        yield w1 + w2
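
A minimal usage sketch, assuming `itertools` is imported and `U` is umi_tools' Utilities module (the snippets above rely on both without showing the imports); the whitelist file names are hypothetical, with the barcode in the first tab-separated column of each line:

import itertools
import umi_tools.Utilities as U  # assumption: the U used by the generators above

# stream barcodes from a single whitelist
for barcode in singleBarcodeGenerator("whitelist.tsv"):
    print(barcode)

# stream the combinatorial product of two whitelists
for barcode in pairedBarcodeGenerator("whitelist1.tsv", "whitelist2.tsv"):
    print(barcode)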
Example #3
def getMetaContig2contig(gene_transcript_map):
    '''Map each gene (metacontig) to its set of transcripts (contigs), read from a tab-separated gene-transcript map file.'''
    metacontig2contig = collections.defaultdict(set)
    for line in U.openFile(gene_transcript_map, "r"):

        if line.startswith("#"):
            continue

        if len(line.strip()) == 0:
            break

        gene, transcript = line.strip().split("\t")
        metacontig2contig[gene].add(transcript)

    return metacontig2contig
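
A sketch of the expected input, with a hypothetical tab-separated map file (comment lines start with '#'; parsing stops at the first blank line, per the break above):

# gene_transcript_map.tsv (hypothetical contents, tab-separated):
#   gene1   transcript1
#   gene1   transcript2
#   gene2   transcript3
metacontig2contig = getMetaContig2contig("gene_transcript_map.tsv")
# -> defaultdict(set, {'gene1': {'transcript1', 'transcript2'},
#                      'gene2': {'transcript3'}})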
Example #4
def getUserDefinedBarcodes(whitelist_tsv, getErrorCorrection=False):
    cell_whitelist = []

    if getErrorCorrection:
        false_to_true_map = {}
    else:
        false_to_true_map = None

    with U.openFile(whitelist_tsv, "r") as inf:

        for line in inf:
            if line.startswith('#'):
                continue

            line = line.strip().split("\t")
            whitelist_barcode = line[0]
            cell_whitelist.append(whitelist_barcode)

            if getErrorCorrection:
                for error_barcode in line[1].split(","):
                    false_to_true_map[error_barcode] = whitelist_barcode

    return set(cell_whitelist), false_to_true_map
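
A usage sketch: with getErrorCorrection=True the whitelist is expected to carry a second tab-separated column of comma-separated error barcodes, each mapped back to the whitelisted barcode in column one (file contents below are hypothetical):

# whitelist.tsv (hypothetical contents, tab-separated):
#   AAACCC  AAACCG,AAACCT
#   TTTGGG  TTTGGA
cell_whitelist, false_to_true_map = getUserDefinedBarcodes(
    "whitelist.tsv", getErrorCorrection=True)
# cell_whitelist == {'AAACCC', 'TTTGGG'}
# false_to_true_map['AAACCG'] == 'AAACCC'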
Example #5
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "count-specific options")

    parser.add_option("--wide-format-cell-counts",
                      dest="wide_format_cell_counts",
                      action="store_true",
                      default=False,
                      help=("output the cell counts in a wide format "
                            "(rows=genes, columns=cells)"))

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv, add_group_dedup_options=False)

    options.per_gene = True  # hardcodes counting to per-gene only

    U.validateSamOptions(options, group=False)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    infile = pysam.Samfile(in_name, in_mode)

    # write out to tempfile and then sort to stdout
    tmpfilename = U.getTempFilename(dir=options.tmpdir)
    tmpfile = U.openFile(tmpfilename, mode="w")

    nInput, nOutput, input_reads = 0, 0, 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag
        else:
            inreads = infile.fetch()

    bundle_iterator = umi_methods.get_bundles(
        options, only_count_reads=True, metacontig_contig=metacontig2contig)

    for bundle, key, status in bundle_iterator(inreads):
        if status == "single_read":
            continue

        gene, cell = key

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # set up UMIClusterer functor with methods specific to
        # specified options.method

        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        gene_count = len(groups)

        if options.per_cell:
            tmpfile.write("%s\n" % "\t".join(
                (gene, cell.decode(), str(gene_count))))
        else:
            tmpfile.write("%s\n" % "\t".join((gene, str(gene_count))))

        nOutput += gene_count

    tmpfile.close()

    if options.per_cell:

        gene_counts_dict = {}

        with U.openFile(tmpfilename, mode="r") as inf:
            genes = set()
            cells = set()
            for line in inf:
                gene, cell, gene_count = line.strip().split("\t")
                genes.add(gene)
                cells.add(cell)

                if gene not in gene_counts_dict:
                    gene_counts_dict[gene] = {}

                gene_counts_dict[gene][cell] = gene_count

        if options.wide_format_cell_counts:  # write out in wide format

            options.stdout.write("%s\t%s\n" %
                                 ("gene", "\t".join(sorted(cells))))

            for gene in sorted(genes):
                counts = []
                for cell in sorted(cells):
                    if cell in gene_counts_dict[gene]:
                        counts.append(gene_counts_dict[gene][cell])
                    else:
                        counts.append(0)
                options.stdout.write("%s\t%s\n" %
                                     (gene, "\t".join(map(str, counts))))

        else:  # write out in long format
            options.stdout.write("%s\t%s\t%s\n" % ("gene", "cell", "count"))
            for gene in sorted(genes):
                for cell in sorted(list(gene_counts_dict[gene].keys())):
                    options.stdout.write(
                        "%s\t%s\t%s\n" %
                        (gene, cell, gene_counts_dict[gene][cell]))
    else:
        options.stdout.write("%s\t%s\n" % ("gene", "count"))

        with U.openFile(tmpfilename, mode="r") as inf:
            for line in inf:
                options.stdout.write(line)

    os.unlink(tmpfilename)

    # output reads events and benchmark information.
    for event in bundle_iterator.read_events.most_common():
        U.info("%s: %s" % (event[0], event[1]))

    U.info("Number of (post deduplication) reads counted: %i" % nOutput)

    U.Stop()
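
For reference, with --per-cell plus --wide-format-cell-counts this main() writes one row per gene and one column per sorted cell barcode, filling 0 where a cell has no count for a gene (values below are hypothetical):

gene    cellA   cellB
gene1   3       0
gene2   1       2

Without --wide-format-cell-counts the same data is written long-form under the header gene/cell/count; without --per-cell it is a two-column gene/count table.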
Example #6
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--in-sam",
                      dest="in_sam",
                      action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option(
        "-o",
        "--out-sam",
        dest="out_sam",
        action="store_true",
        help="Output alignments in sam format [default=%default]",
        default=False)
    parser.add_option("--umi-separator",
                      dest="umi_sep",
                      type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag",
                      dest="umi_tag",
                      type="string",
                      help="tag containing umi",
                      default='RX')
    parser.add_option("--umi-group-tag",
                      dest="umi_group_tag",
                      type="string",
                      help="tag for the outputted umi group",
                      default='BX')
    parser.add_option("--extract-umi-method",
                      dest="get_umi_method",
                      type="choice",
                      choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset",
                      dest="subset",
                      type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique",
                      dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold",
                      dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold",
                      dest="threshold",
                      type="int",
                      default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom",
                      dest="chrom",
                      type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired",
                      dest="paired",
                      action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("adjacency", "directional", "unique",
                               "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--per-contig",
                      dest="per_contig",
                      action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option(
        "--whole-contig",
        dest="whole_contig",
        action="store_true",
        default=False,
        help=("Read whole contig before outputting bundles: guarantees that "
              "no reads are missed, but increases memory usage"))
    parser.add_option(
        "--read-length",
        dest="read_length",
        action="store_true",
        default=False,
        help=("use read length in addition to position and UMI"
              "to identify possible duplicates [default=%default]"))
    parser.add_option("--mapping-quality",
                      dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option(
        "--group-out",
        dest="tsv",
        type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)
    parser.add_option(
        "--output-bam",
        dest="output_bam",
        action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              "[default=%default]"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "w"
    else:
        out_mode = "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
        if options.paired:
            outfile = umi_methods.TwoPassPairWriter(infile, outfile, tags=True)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write(
            "read_id\tcontig\tposition\tumi\tumi_count\tfinal_umi\tfinal_umi_count\tunique_id\n"
        )

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    nInput, nOutput, unique_id = 0, 0, 0

    read_events = collections.Counter()

    for bundle, read_events in umi_methods.get_bundles(
            infile,
            read_events,
            ignore_umi=False,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            chrom=options.chrom,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            whole_contig=options.whole_contig,
            read_length=options.read_length,
            umi_getter=umi_getter,
            all_reads=True):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        # set up ReadCluster functor with methods specific to
        # specified options.method
        processor = network.ReadClusterer(options.method)

        bundle, groups, counts = processor(bundle=bundle,
                                           threshold=options.threshold,
                                           stats=True,
                                           deduplicate=False)

        for umi_group in groups:
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        if options.paired:
                            # if paired, we need to supply the tags to
                            # add to the paired read
                            outfile.write(read, unique_id, top_umi)

                        else:
                            # Add the 'UG' tag to the read
                            read.tags += [('UG', unique_id)]
                            read.tags += [(options.umi_group_tag, top_umi)]
                            outfile.write(read)

                    if options.tsv:
                        mapping_outfile.write("%s\n" % "\t".join(
                            map(str, (read.query_name, read.reference_name,
                                      umi_methods.get_read_position(
                                          read, options.soft)[1], umi.decode(),
                                      counts[umi], top_umi.decode(),
                                      group_count, unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info(
        "Reads: %s" %
        ", ".join(["%s: %s" % (x[0], x[1])
                   for x in read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
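
For reference, each line of the --group-out TSV written above carries read_id, contig, position, umi, umi_count, final_umi, final_umi_count and unique_id, matching the header written earlier; a hypothetical row:

read1   chr1    15000   ATTG    2       ATTA    5       0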
Example #7
def getUserDefinedBarcodes(whitelist_tsv):
    cell_whitelist = []
    with U.openFile(whitelist_tsv, "r") as inf:
        for line in inf:
            cell_whitelist.append(line.strip())
    return set(cell_whitelist)
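
This is the stripped-down variant of the function in Example #4: one barcode per line, no comment handling and no error-correction map. A usage sketch with a hypothetical file:

# whitelist.tsv holds one barcode per line, e.g. AAACCC
cell_whitelist = getUserDefinedBarcodes("whitelist.tsv")
# -> a set of barcode strings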
Example #8
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--in-sam",
                      dest="in_sam",
                      action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option(
        "-o",
        "--out-sam",
        dest="out_sam",
        action="store_true",
        help="Output alignments in sam format [default=%default]",
        default=False)
    parser.add_option("--ignore-umi",
                      dest="ignore_umi",
                      action="store_true",
                      help="Ignore UMI and dedup"
                      " only on position",
                      default=False)
    parser.add_option("--umi-separator",
                      dest="umi_sep",
                      type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag",
                      dest="umi_tag",
                      type="string",
                      help="tag containing umi",
                      default='RX')
    parser.add_option("--extract-umi-method",
                      dest="get_umi_method",
                      type="choice",
                      choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset",
                      dest="subset",
                      type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique",
                      dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold",
                      dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold",
                      dest="threshold",
                      type="int",
                      default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom",
                      dest="chrom",
                      type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired",
                      dest="paired",
                      action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("adjacency", "directional", "percentile",
                               "unique", "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--output-stats",
                      dest="stats",
                      type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option(
        "--whole-contig",
        dest="whole_contig",
        action="store_true",
        default=False,
        help=("Read whole contig before outputting bundles: guarantees that "
              "no reads are missed, but increases memory usage"))
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method",
                      type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi "
                            "[default=%default]"))
    parser.add_option("--mapping-quality",
                      dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option(
        "--read-length",
        dest="read_length",
        action="store_true",
        default=False,
        help=("use read length in addition to position and UMI"
              "to identify possible duplicates [default=%default]"))
    parser.add_option("--per-contig",
                      dest="per_contig",
                      action="store_true",
                      default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene",
                      dest="per_gene",
                      action="store_true",
                      default=False,
                      help=("Deduplicate per gene,"
                            "e.g for transcriptome where contig = transcript"
                            "must also provide a transript to gene map with"
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map",
                      dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes (tab separated)",
                      default=None)
    parser.add_option("--gene-tag",
                      dest="gene_tag",
                      type="string",
                      help=("Deduplicate per gene where gene is"
                            "defined by this bam tag [default=%default]"),
                      default=None)
    parser.add_option(
        "--skip-tags-regex",
        dest="skip_regex",
        type="string",
        help=("Used with --gene-tag. "
              "Ignore reads where the gene-tag matches this regex"),
        default="^[__|Unassigned]")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    if options.stats:
        if options.ignore_umi:
            raise ValueError("'--output-stats' and '--ignore-umi' options"
                             " cannot be used together")

    if options.per_gene:
        if not options.gene_transcript_map and not options.gene_tag:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map "
                "or --gene-tag")
    try:
        re.compile(options.skip_regex)
    except re.error:
        raise ValueError("skip-regex '%s' is not a "
                         "valid regex" % options.skip_regex)

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = umi_methods.TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" %
                    (options.detection_method, ",".join(
                        [x for x in bam_features if bam_features[x]])))

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = umi_methods.random_read_generator(infile.filename,
                                                    chrom=options.chrom,
                                                    umi_getter=umi_getter)

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag

        else:
            inreads = infile.fetch()
            gene_tag = options.gene_tag

    for bundle, read_events, status in umi_methods.get_bundles(
            inreads,
            ignore_umi=options.ignore_umi,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            gene_tag=options.gene_tag,
            skip_regex=options.skip_regex,
            whole_contig=options.whole_contig,
            read_length=options.read_length,
            detection_method=options.detection_method,
            umi_getter=umi_getter,
            all_reads=False,
            return_read2=False,
            return_unmapped=False):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats
            average_distance = umi_methods.get_average_umi_distance(
                bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = umi_methods.get_average_umi_distance(
                random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:

            # set up ReadCluster functor with methods specific to
            # specified options.method
            processor = network.ReadDeduplicator(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts = processor(bundle=bundle,
                                                threshold=options.threshold)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [umi_getter(x) for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = umi_methods.get_average_umi_distance(
                    post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = umi_methods.get_average_umi_distance(
                    random_umis)
                post_cluster_stats_null.append(average_distance_null)

    outfile.close()

    if options.stats:

        # generate the stats dataframe
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # tally the counts per umi per position
        pre_counts = collections.Counter(stats_pre_df["counts"])
        post_counts = collections.Counter(stats_post_df["counts"])
        counts_index = list(
            set(pre_counts.keys()).union(set(post_counts.keys())))
        counts_index.sort()
        with U.openFile(options.stats + "_per_umi_per_position.tsv",
                        "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
                values = (count, pre_counts[count], post_counts[count])
                outf.write("\t".join(map(str, values)) + "\n")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df,
                          agg_post_df,
                          how='left',
                          left_index=True,
                          right_index=True,
                          sort=True,
                          suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        agg_df = agg_df.fillna(0).astype(int)

        agg_df.index = [x.decode() for x in agg_df.index]
        agg_df.index.name = 'UMI'
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(
            max(
                map(max, [
                    pre_cluster_stats, post_cluster_stats,
                    pre_cluster_stats_null, post_cluster_stats_null
                ])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster, minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        edit_distance_df = pd.DataFrame({
            "unique":
            tallyCounts(pre_cluster_binned, max_ed),
            "unique_null":
            tallyCounts(pre_cluster_null_binned, max_ed),
            options.method:
            tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method:
            tallyCounts(post_cluster_null_binned, max_ed),
            "edit_distance":
            cluster_bins
        })

        # TS - set lowest bin (-1) to "Single_UMI"
        edit_distance_df['edit_distance'][0] = "Single_UMI"

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False,
                                sep="\t")

    # write footer and output benchmark information.

    U.info(
        "%s" %
        ", ".join(["%s: %s" % (x[0], x[1])
                   for x in read_events.most_common()]))
    U.info("Number of reads out: %i" % nOutput)

    U.Stop()
Example #9
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p",
                      "--bc-pattern",
                      dest="pattern",
                      type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2",
                      dest="pattern2",
                      type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime",
                      dest="prime3",
                      action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in",
                      dest="read2_in",
                      type="string",
                      help="file name for read pairs")
    parser.add_option("--extract-method",
                      dest="extract_method",
                      type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--plot-prefix",
                      dest="plot_prefix",
                      type="string",
                      help=("Prefix for plots to visualise the automated "
                            "detection of the number of 'true' cell barcodes"))
    parser.add_option("--subset-reads",
                      dest="subset_reads",
                      type="int",
                      help=("Use the first N reads to automatically identify "
                            "the true cell barcodes. If N is greater than the "
                            "number of reads, all reads will be used"))
    parser.add_option("--error-correct-threshold",
                      dest="error_correct_threshold",
                      type="int",
                      help=("Hamming distance for correction of "
                            "barcodes to whitelist barcodes"))
    parser.add_option("--method",
                      dest="method",
                      choices=["reads", "umis"],
                      help=("Use reads or unique umi counts per cell"))
    parser.add_option("--expect-cells",
                      dest="expect_cells",
                      type="int",
                      help=("Prior expectation on the upper limit on the "
                            "number of cells sequenced"))
    parser.add_option("--set-cell-number",
                      dest="cell_number",
                      type="int",
                      help=("Specify the number of cell barcodes to accept"))
    parser.set_defaults(method="reads",
                        extract_method="string",
                        filter_cell_barcodes=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        cell_number=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser,
                              argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    if options.expect_cells and options.cell_number:
        U.error("Cannot supply both --expect-cells and "
                "--cell-number options")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    if options.pattern2:
        if not options.read2_in:
            U.error("must specify a paired fastq ``--read2-in``")

    if not options.pattern2:
        options.pattern2 = options.pattern

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # check whether the regex contains a umi group(s) and cell groups(s)
    if options.extract_method == "regex":
        if options.pattern:
            for group in options.pattern.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True
        if options.pattern2:
            for group in options.pattern2.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        if options.pattern:
            if "C" in options.pattern:
                extract_cell = True
            if "N" in options.pattern:
                extract_umi = True
        if options.pattern2:
            if "C" in options.pattern2:
                extract_cell = True
            if "N" in options.pattern2:
                extract_umi = True

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" %
                    (options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" (options.pattern,
                                                     options.pattern2))
    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" %
                    (options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" (options.pattern,
                                                      options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    if not options.read2_in:
        for read1 in read1s:

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1)
            if barcode_values is None:
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in izip(read1s, read2s):

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1

            barcode_values = ReadExtractor.getBarcodes(read1, read2)
            if barcode_values is None:
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    U.info("Starting - whitelist determination")

    if options.method == "umis":
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-number option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected >= %s cell barcodes" %
            (len(cell_barcode_counts), options.subset_reads,
             options.cell_number))

    cell_whitelist, true_to_false_map = umi_methods.getCellWhitelist(
        cell_barcode_counts, options.expect_cells, options.cell_number,
        options.error_correct_threshold, options.plot_prefix)

    U.info("Writing out whitelist")
    for barcode in sorted(list(cell_whitelist)):

        if true_to_false_map:
            corrected_barcodes = ",".join(sorted(true_to_false_map[barcode]))
            corrected_barcode_counts = ",".join(
                map(str, [
                    cell_barcode_counts[x]
                    for x in sorted(true_to_false_map[barcode])
                ]))
        else:
            corrected_barcodes, corrected_barcode_counts = "", ""

        options.stdout.write(
            "%s\t%s\t%s\t%s\n" %
            (barcode, corrected_barcodes, cell_barcode_counts[barcode],
             corrected_barcode_counts))

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))

    U.Stop()
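
For reference, the whitelist this main() writes to stdout has four tab-separated columns per accepted barcode: the barcode, its comma-separated corrected (error) barcodes, its count, and the corrected barcodes' counts; a hypothetical line:

AAACCC  AAACCG,AAACCT   120     3,2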
Example #10
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format", default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format", default=False)
    parser.add_option("--ignore-umi", dest="ignore_umi", action="store_true",
                      help="Ignore UMI and dedup only on position",
                      default=False)
    parser.add_option("--subset", dest="subset", type="string",
                      help="Use only a fraction of reads, specified by subset",
                      default=1.1)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                           " one",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                           "read os counted as spliced",
                      default=4)
    parser.add_option("--edit-distance-threshold", dest="threshold",
                      type="int",
                      help="Edit distance theshold at which to join two UMIs"
                           "when clustering", default=1)
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="Use second-in-pair position when deduping")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional-adjacency",
                               "percentile", "unique", "cluster"),
                      default="directional-adjacency",
                      help="method to use for umi deduping")
    parser.add_option("--output-stats", dest="stats", type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option("--further-stats", dest="further_stats",
                      action="store_true", default=False,
                      help="Output further stats")
    parser.add_option("--per-contig", dest="per_contig", action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--whole-contig", dest="whole_contig", action="store_true",
                      default=False,
                      help="Read whole contig before outputting bundles: guarantees that no reads"
                           "are missed, but increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method", type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained",
                      default=0)
    parser.add_option("--read-length", dest="read_length", action="store_true",
                      default=False,
                      help=("use read length in addition to position and UMI"
                            "to identify possible duplicates"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "w"
    else:
        out_mode = "wb"

    if options.stats:
        if options.ignore_umi:
            raise ValueError("'--output-stats' and '--ignore-umi' options"
                             " cannot be used together")

    if options.further_stats:
        if not options.stats:
            raise ValueError("'--further-stats' options requires "
                             "'--output-stats' option")
        if options.method not in ["cluster", "adjacency"]:
            raise ValueError("'--further-stats' only enabled with 'cluster' "
                             "and 'adjacency' methods")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode,
                            template=infile)

    if options.paired:
        outfile = TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" % (
                        options.detection_method, ",".join(
                            [x for x in bam_features if bam_features[x]])))

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = random_read_generator(infile.filename, chrom=options.chrom)

    for bundle in get_bundles(infile,
                              ignore_umi=options.ignore_umi,
                              subset=float(options.subset),
                              quality_threshold=options.mapping_quality,
                              paired=options.paired,
                              chrom=options.chrom,
                              spliced=options.spliced,
                              soft_clip_threshold=options.soft,
                              per_contig=options.per_contig,
                              whole_contig=options.whole_contig,
                              read_length=options.read_length,
                              detection_method=options.detection_method):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats
            average_distance = get_average_umi_distance(bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = get_average_umi_distance(random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:

            # set up ClusterAndReducer functor with methods specific to
            # specified options.method
            processor = ClusterAndReducer(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts, topologies, nodes = processor(
                bundle, options.threshold,
                options.stats, options.further_stats)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [x.qname.split("_")[-1] for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = get_average_umi_distance(post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = get_average_umi_distance(random_umis)
                post_cluster_stats_null.append(average_distance_null)

                if options.further_stats:
                    for c_type, count in topologies.most_common():
                        topology_counts[c_type] += count
                    for c_type, count in nodes.most_common():
                        node_counts[c_type] += count

    outfile.close()

    if options.stats:

        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # generate histograms of counts per UMI at each position
        UMI_counts_df_pre = pd.DataFrame(stats_pre_df.pivot_table(
            columns=stats_pre_df["counts"], values="counts", aggfunc=len))
        UMI_counts_df_post = pd.DataFrame(stats_post_df.pivot_table(
            columns=stats_post_df["counts"], values="counts", aggfunc=len))

        UMI_counts_df_pre.columns = ["instances"]
        UMI_counts_df_post.columns = ["instances"]

        UMI_counts_df = pd.merge(UMI_counts_df_pre, UMI_counts_df_post,
                                 how='left', left_index=True, right_index=True,
                                 sort=True, suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        UMI_counts_df = UMI_counts_df.fillna(0).astype(int)

        UMI_counts_df.to_csv(
            options.stats + "_per_umi_per_position.tsv", sep="\t")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - see comment above regarding missing values
        agg_df = agg_df.fillna(0).astype(int)
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
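            # with right=True, a distance x falls in bin i when
            # bins[i-1] < x <= bins[i]; single-UMI groups carry a distance
            # of -1 and so land in the lowest bin, which is relabelled
            # "Single_UMI" below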
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        edit_distance_df = pd.DataFrame({
            "unique": tallyCounts(pre_cluster_binned, max_ed),
            "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
            options.method: tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method: tallyCounts(post_cluster_null_binned, max_ed),
            "edit_distance": cluster_bins})

        # TS - set lowest bin (-1) to "Single_UMI"
        edit_distance_df.loc[0, 'edit_distance'] = "Single_UMI"

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

        if options.further_stats:
            with U.openFile(options.stats + "_topologies.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join((x, str(y)))
                               for x, y in topology_counts.most_common()]) + "\n")

            with U.openFile(options.stats + "_nodes.tsv", "w") as outf:
                outf.write(
                    "\n".join(["\t".join(map(str, (x, y))) for
                               x, y in node_counts.most_common()]) + "\n")

    # write footer and output benchmark information.
    
    U.info("Number of reads in: %i, Number of reads out: %i" %
           (nInput, nOutput))
    U.Stop()
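
A minimal, self-contained sketch of the edit-distance binning used above, with invented toy distances (the -1 entry stands in for a single-UMI group):

import numpy as np

# toy average-distance values; -1 marks a single-UMI group
distances = [-1, 0.0, 1.5, 2.0, 2.0]
max_ed = 2
cluster_bins = range(-1, max_ed + 2)   # [-1, 0, 1, 2, 3]

binned = np.digitize(distances, cluster_bins, right=True)
counts = np.bincount(binned, minlength=max_ed + 3)

# counts[0] tallies the Single_UMI bin, counts[1] distances in (-1, 0],
# and so on up to the maximum edit distance
print(counts)   # [1 1 0 3 0]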
Example #11
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "group-specific options")

    group.add_option("--group-out", dest="tsv", type="string",
                     help="Outfile name for file mapping read id to read group",
                     default=None)

    group.add_option("--output-bam", dest="output_bam", action="store_true",
                     default=False,
                     help=("output a bam file with read groups tagged using the UG tag"
                           "[default=%default]"))

    group.add_option("--output-unmapped", dest="output_unmapped", action="store_true",
                     default=False,
                     help=("Retain all unmapped reads in output[default=%default]"))

    parser.add_option("--umi-group-tag", dest="umi_group_tag",
                      type="string", help="tag for the outputted umi group",
                      default='BX')

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            out_name = U.getTempFilename()
            sorted_out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename()
            sorted_out_name = "-"

    if not options.no_sort_output:  # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join(
            ["read_id", "contig", "position", "gene", "umi", "umi_count",
             "final_umi", "final_umi_count", "unique_id"]))

    nInput, nOutput, unique_id, input_reads, output_reads = 0, 0, 0, 0, 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig, metatag)
            gene_tag = metatag

        else:
            inreads = infile.fetch(until_eof=options.output_unmapped)

    bundle_iterator = umi_methods.get_bundles(
        options,
        all_reads=True,
        return_read2=True,
        return_unmapped=options.output_unmapped,
        metacontig_contig=metacontig2contig)

    for bundle, key, status in bundle_iterator(inreads):

        # write out read2s and unmapped (if these options are set)
        if status == 'single_read':
            # bundle is just a single read here
            nInput += 1

            if outfile:
                outfile.write(bundle)

            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        while nOutput >= output_reads + 10000:
            output_reads += 10000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(
            umis,
            counts,
            threshold=options.threshold)
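        # each group is a list of UMIs clustered together; the first
        # element is taken as the representative (top) UMI below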

        for umi_group in groups:
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(map(str, (
                            read.query_name, read.reference_name,
                            umi_methods.get_read_position(
                                read, options.soft_clip_threshold)[1],
                            gene,
                            umi.decode(),
                            counts[umi],
                            top_umi.decode(),
                            group_count,
                            unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()
        if not options.no_sort_output:
            # sort the output
            pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
            os.unlink(out_name)  # delete the tempfile

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info(
        "Reads: %s" % ", ".join(["%s: %s" % (x[0], x[1]) for x in
                                 bundle_iterator.read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
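
As a rough usage sketch, the UMIClusterer functor set up above can be exercised directly on toy data. The UMIs, counts and grouping shown here are invented, and the call mirrors the signature used in this script (newer umi_tools releases take only counts and threshold):

from umi_tools import network

# byte-string UMIs and their read counts, as in the BAM-derived bundles
umis = [b"ATAT", b"ATAA", b"CCGG"]
counts = {b"ATAT": 10, b"ATAA": 2, b"CCGG": 8}

processor = network.UMIClusterer("adjacency")
groups = processor(umis, counts, threshold=1)

# expected shape: a list of UMI lists with the representative UMI first,
# e.g. [[b'ATAT', b'ATAA'], [b'CCGG']]
print(groups)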
Example #12
def getUserDefinedBarcodes(whitelist_tsv, whitelist_tsv2=None,
                           getErrorCorrection=False,
                           deriveErrorCorrection=False,
                           threshold=1):
    '''
    whitelist_tsv: tab-separated file with whitelisted barcodes. First
    field should be whitelist barcodes. Second field [optional] should
    be comma-separated barcodes which are to be corrected to the
    barcode in the first field.

    whitelist_tsv2: as above but for read2s

    getErrorCorrection: extract the second field in whitelist_tsv and
    return a map of non-whitelist:whitelist

    deriveErrorCorrection: return a map of non-whitelist:whitelist
    using a simple edit distance threshold
    '''

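    # for each base, the possible substitutions a sequencing error could
    # introduce (including an N call)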
    base2errors = {"A": ["T", "C", "G", "N"],
                   "T": ["A", "C", "G", "N"],
                   "C": ["T", "A", "G", "N"],
                   "G": ["T", "C", "A", "N"]}

    whitelist = []

    if getErrorCorrection or deriveErrorCorrection:
        false_to_true_map = {}
    else:
        false_to_true_map = None

    def singleBarcodeGenerator(whitelist_tsv):
        with U.openFile(whitelist_tsv, "r") as inf:
            for line in inf:
                if line.startswith('#'):
                    continue
                line = line.strip().split("\t")
                yield(line[0])

    def pairedBarcodeGenerator(whitelist_tsv, whitelist_tsv2):

        whitelist1 = []
        whitelist2 = []

        with U.openFile(whitelist_tsv, "r") as inf:
            for line in inf:
                if line.startswith('#'):
                    continue

                line = line.strip().split("\t")
                whitelist1.append(line[0])

        with U.openFile(whitelist_tsv2, "r") as inf2:
            for line in inf2:
                if line.startswith('#'):
                    continue

                line = line.strip().split("\t")
                whitelist2.append(line[0])

        for w1, w2 in itertools.product(whitelist1, whitelist2):
            yield(w1 + w2)

    if deriveErrorCorrection:

        if whitelist_tsv2:
            whitelist_barcodes = pairedBarcodeGenerator(whitelist_tsv, whitelist_tsv2)
        else:
            whitelist_barcodes = singleBarcodeGenerator(whitelist_tsv)

        for whitelist_barcode in whitelist_barcodes:
            whitelist.append(whitelist_barcode)

            # for every possible combination of positions for error(s)
            for positions in itertools.product(
                    range(0, len(whitelist_barcode)), repeat=threshold):

                m_bases = [base2errors[whitelist_barcode[x]] for x in positions]

                # for every possible combination of errors
                for m in itertools.product(*m_bases):
                    error_barcode = list(whitelist_barcode)

                    # add errors
                    for pos, error_base in zip(positions, m):
                        error_barcode[pos] = error_base

                    error_barcode = "".join(error_barcode)

                    # if the error barcode has already been seen, it must be
                    # within the threshold edit distance of more than one
                    # whitelisted barcode
                    if error_barcode in false_to_true_map:
                        # don't report multiple times for the same barcode
                        if false_to_true_map[error_barcode]:
                            U.info("Error barcode %s can be assigned to more than "
                                   "one possible true barcode: %s or %s" % (
                                       error_barcode,
                                       false_to_true_map[error_barcode],
                                       whitelist_barcode))
                        false_to_true_map[error_barcode] = None
                    else:
                        false_to_true_map[error_barcode] = whitelist_barcode

    elif getErrorCorrection:
        assert not whitelist_tsv2, ("Can only extract errors from the whitelist "
                                    "if a single whitelist is given")
        with U.openFile(whitelist_tsv, "r") as inf:

            for line in inf:

                if line.startswith('#'):
                    continue

                line = line.strip().split("\t")
                whitelist_barcode = line[0]
                whitelist.append(whitelist_barcode)

                # getErrorCorrection is necessarily True in this branch
                for error_barcode in line[1].split(","):
                    false_to_true_map[error_barcode] = whitelist_barcode

    else:  # no error correction
        if whitelist_tsv2:
            whitelist_barcodes = pairedBarcodeGenerator(whitelist_tsv, whitelist_tsv2)
        else:
            whitelist_barcodes = singleBarcodeGenerator(whitelist_tsv)

        whitelist = [x for x in whitelist_barcodes]

    return set(whitelist), false_to_true_map
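
To make the deriveErrorCorrection loop above concrete, here is the same logic run standalone for one hypothetical 2 bp barcode with threshold=1:

import itertools

base2errors = {"A": ["T", "C", "G", "N"],
               "T": ["A", "C", "G", "N"],
               "C": ["T", "A", "G", "N"],
               "G": ["T", "C", "A", "N"]}

whitelist_barcode = "AC"
errors = set()

for positions in itertools.product(range(len(whitelist_barcode)), repeat=1):
    m_bases = [base2errors[whitelist_barcode[x]] for x in positions]
    for m in itertools.product(*m_bases):
        error_barcode = list(whitelist_barcode)
        for pos, error_base in zip(positions, m):
            error_barcode[pos] = error_base
        errors.add("".join(error_barcode))

# the eight single-base variants of "AC"
print(sorted(errors))
# ['AA', 'AG', 'AN', 'AT', 'CC', 'GC', 'NC', 'TC']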
Example #13
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p",
                      "--bc-pattern",
                      dest="pattern",
                      type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2",
                      dest="pattern2",
                      type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime",
                      dest="prime3",
                      action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in",
                      dest="read2_in",
                      type="string",
                      help="file name for read pairs")
    parser.add_option("--read2-out",
                      dest="read2_out",
                      type="string",
                      help="file to output processed paired read to")
    parser.add_option(
        "--read2-out-only",
        dest="read2_out_only",
        action="store_true",
        help="Paired reads, only output the second read in the pair")
    parser.add_option("--quality-filter-threshold",
                      dest="quality_filter_threshold",
                      type="int",
                      help=("Remove reads where any UMI base quality score "
                            "falls below this threshold"))
    parser.add_option(
        "--quality-filter-mask",
        dest="quality_filter_mask",
        type="int",
        help=("If a UMI base has a quality below this threshold, "
              "replace the base with 'N'"))
    parser.add_option("--quality-encoding",
                      dest="quality_encoding",
                      type="choice",
                      choices=["phred33", "phred64", "solexa"],
                      help=("Quality score encoding. Choose from 'phred33'"
                            "[33-77] 'phred64' [64-106] or 'solexa' [59-106]"))
    parser.add_option("--extract-method",
                      dest="extract_method",
                      type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--filter-cell-barcode",
                      dest="filter_cell_barcode",
                      action="store_true",
                      help="Filter the cell barcodes")
    parser.add_option("--error-correct-cell",
                      dest="error_correct_cell",
                      action="store_true",
                      help=("Correct errors in the cell barcode"))
    parser.add_option("--error-correct-threshold",
                      dest="error_correct_threshold",
                      type="int",
                      help=("Hamming distance allowed for correction"))
    parser.add_option("--plot-prefix",
                      dest="plot_prefix",
                      type="string",
                      help=("Prefix for plots to visualise the automated "
                            "detection of the number of 'true' cell barcodes"))
    parser.add_option("--output-whitelist",
                      dest="output_whitelist",
                      type="string",
                      help=("Write out the automatically generated whitelist"))
    parser.add_option("--whitelist-tsv",
                      dest="whitelist_tsv",
                      type="string",
                      help=("A whitelist of accepted cell barcodes"))
    parser.add_option("--blacklist-tsv",
                      dest="blacklist_tsv",
                      type="string",
                      help=("A blacklist of accepted cell barcodes"))
    parser.add_option(
        "--cell-barcode-subset",
        dest="cell_barcode_subset",
        type="int",
        help=("Use only the first N reads to automatically "
              "identify the true cell barcodes. If N is greater "
              "than the number of reads, all reads will be used"))
    parser.add_option("--reads-subset",
                      dest="reads_subset",
                      type="int",
                      help=("Only extract from the first N reads. If N is "
                            "greater than the number of reads, all reads will "
                            "be used"))
    parser.add_option(
        "--reconcile-pairs",
        dest="reconcile",
        action="store_true",
        help=("Allow the presences of reads in read2 input that are"
              "not present in read1 input. This allows cell barcode"
              "filtering of read1s without considering read2s"))
    parser.set_defaults(extract_method="string",
                        filter_cell_barcode=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_cell=False,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=False,
                        read2_out_only=False,
                        quality_filter_threshold=None,
                        quality_encoding=None,
                        plot_prefix=None,
                        output_whitelist=None,
                        cell_barcode_subset=50000000,
                        reconcile=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser, argv=argv)

    if options.quality_filter_threshold or options.quality_filter_mask:
        if not options.quality_encoding:
            U.error("must provide a quality encoding (--quality-"
                    "encoding) to filter UMIs by quality (--quality"
                    "-filter-threshold) or mask low quality bases "
                    "with (--quality-filter-mask)")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end")

    if options.pattern2:
        if not options.read2_in:
            U.error("must specify a paired fastq ``--read2-in``")

    if options.read2_in and not options.pattern2:
        options.pattern2 = options.pattern

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # check whether the regex contains a umi group(s) and cell groups(s)
    if options.extract_method == "regex":
        if options.pattern:
            for group in options.pattern.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True
        if options.pattern2:
            for group in options.pattern2.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        if options.pattern:
            if "C" in options.pattern:
                extract_cell = True
            if "N" in options.pattern:
                extract_umi = True
        if options.pattern2:
            if "C" in options.pattern2:
                extract_cell = True
            if "N" in options.pattern2:
                extract_umi = True

    if options.whitelist_tsv:
        if options.blacklist_tsv:
            U.error("Do not supply a blacklist and a whitelist. Just "
                    "remove the blacklist barcodes from the whitelist!")

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" %
                    (options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" (options.pattern,
                                                     options.pattern2))

    if options.stdin == sys.stdin:
        if not options.whitelist_tsv and options.filter_cell_barcode:
            U.error(
                "cannot support reading from stdin if correcting cell barcode")
        read1s = umi_methods.fastqIterate(U.openFile(options.stdin))
    else:
        read1s = umi_methods.fastqIterate(U.openFile(options.stdin.name))

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        options.extract_method, options.pattern, options.pattern2,
        options.prime3, extract_cell, options.quality_encoding,
        options.quality_filter_threshold, options.quality_filter_mask,
        options.filter_cell_barcode)

    if options.filter_cell_barcode:
        if (not options.whitelist_tsv) or options.error_correct_cell:
            cell_barcode_counts = collections.Counter()

            n_reads = 0
            if not options.read2_in:
                for read1 in read1s:
                    n_reads += 1
                    cell_barcode = ReadExtractor.getCellBarcode(read1)
                    if cell_barcode:
                        cell_barcode_counts[cell_barcode] += 1
                    if options.cell_barcode_subset:
                        if (n_reads > options.cell_barcode_subset):
                            break
            else:
                read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
                for read1, read2 in izip(read1s, read2s):
                    n_reads += 1
                    cell_barcode = ReadExtractor.getCellBarcode(read1, read2)
                    if cell_barcode:
                        cell_barcode_counts[cell_barcode] += 1
                    if options.cell_barcode_subset:
                        if (n_reads > options.cell_barcode_subset):
                            break

            if options.blacklist_tsv:
                cell_blacklist, _ = umi_methods.getUserDefinedBarcodes(
                    options.blacklist_tsv)
                for cell in cell_blacklist:
                    del cell_barcode_counts[cell]

            if options.whitelist_tsv:
                cell_whitelist, _ = umi_methods.getUserDefinedBarcodes(
                    options.whitelist_tsv)
                error_correct_mappings = umi_methods.getErrorCorrectMappings(
                    cell_barcode_counts.keys(), cell_whitelist,
                    options.error_correct_threshold)
            else:
                # getCellWhitelist has not been properly defined yet!
                cell_whitelist, error_correct_mappings = umi_methods.getCellWhitelist(
                    cell_barcode_counts, options.error_correct_threshold,
                    options.plot_prefix)

            # re-make the reads1s iterator
            read1s = umi_methods.fastqIterate(U.openFile(options.stdin.name))

        else:
            cell_whitelist, _ = umi_methods.getUserDefinedBarcodes(
                options.whitelist_tsv)
            error_correct_mappings = None, None

        false_to_true_map, true_to_false_map = error_correct_mappings

        if options.output_whitelist:

            with U.openFile(options.output_whitelist, "w") as outf:

                columns = [
                    "barcode", "count", "corrected_barcodes",
                    "corrected_barcode_counts"
                ]
                outf.write("\t".join(columns) + "\n")

                for barcode in sorted(list(cell_whitelist)):

                    if true_to_false_map:
                        corrected_barcodes = ",".join(
                            sorted(true_to_false_map[barcode]))
                        corrected_barcode_counts = ",".join(
                            map(str, [
                                cell_barcode_counts[x]
                                for x in sorted(true_to_false_map[barcode])
                            ]))
                    else:
                        corrected_barcodes, corrected_barcode_counts = "", ""

                    outf.write("%s\t%s\t%s\t%s\n" %
                               (barcode, cell_barcode_counts[barcode],
                                corrected_barcodes, corrected_barcode_counts))

        ReadExtractor.cell_whitelist = cell_whitelist
        ReadExtractor.false_to_true_map = false_to_true_map

    if options.read2_in is None:
        for read in read1s:
            new_read = ReadExtractor(read)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not new_read:
                continue

            options.stdout.write(str(new_read) + "\n")

    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))

        if options.read2_out:
            read2_out = U.openFile(options.read2_out, "w")

        if options.reconcile:
            strict = False
        else:
            strict = True

        for read1, read2 in umi_methods.joinedFastqIterate(
                read1s, read2s, strict):
            reads = ReadExtractor(read1, read2)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not reads:
                continue
            else:
                new_read1, new_read2 = reads

            if not options.read2_out_only:
                options.stdout.write(str(new_read1) + "\n")

            if options.read2_out:
                read2_out.write(str(new_read2) + "\n")

    if options.read2_out:
        read2_out.close()

    for k, v in ReadExtractor.getReadCounts().most_common():
        U.info("%s: %s" % (k, v))

    U.Stop()
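
The string-method checks above boil down to membership tests on the pattern; a small illustrative helper (the helper name and patterns are made up):

def pattern_extracts(pattern):
    """Return (extract_cell, extract_umi) for a string-method pattern,
    where 'C' marks cell-barcode bases and 'N' marks UMI bases."""
    return "C" in pattern, "N" in pattern

# e.g. a 16 bp cell barcode followed by a 10 bp UMI
print(pattern_extracts("C" * 16 + "N" * 10))   # (True, True)
print(pattern_extracts("NNNXXXX"))             # (False, True)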
Example #14
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--in-sam",
                      dest="in_sam",
                      action="store_true",
                      help="Input file is in sam format",
                      default=False)
    parser.add_option("-o",
                      "--out-sam",
                      dest="out_sam",
                      action="store_true",
                      help="Output alignments in sam format",
                      default=False)
    parser.add_option("--ignore-umi",
                      dest="ignore_umi",
                      action="store_true",
                      help="Ignore UMI and dedup only on position",
                      default=False)
    parser.add_option("--subset",
                      dest="subset",
                      type="string",
                      help="Use only a fraction of reads, specified by subset",
                      default=1.1)
    parser.add_option("--spliced-is-unique",
                      dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one",
                      default=False)
    parser.add_option("--soft-clip-threshold",
                      dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      "read os counted as spliced",
                      default=4)
    parser.add_option("--edit-distance-threshold",
                      dest="threshold",
                      type="int",
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering",
                      default=1)
    parser.add_option("--chrom",
                      dest="chrom",
                      type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired",
                      dest="paired",
                      action="store_true",
                      default=False,
                      help="Use second-in-pair position when deduping")
    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("adjacency", "directional-adjacency",
                               "percentile", "unique", "cluster"),
                      default="directional-adjacency",
                      help="method to use for umi deduping")
    parser.add_option("--output-stats",
                      dest="stats",
                      type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option("--further-stats",
                      dest="further_stats",
                      action="store_true",
                      default=False,
                      help="Output further stats")
    parser.add_option("--per-contig",
                      dest="per_contig",
                      action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option(
        "--whole-contig",
        dest="whole_contig",
        action="store_true",
        default=False,
        help=
        "Read whole contig before outputting bundles: guarantees that no reads"
        "are missed, but increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method",
                      type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi"))
    parser.add_option("--mapping-quality",
                      dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained",
                      default=0)
    parser.add_option("--read-length",
                      dest="read_length",
                      action="store_true",
                      default=False,
                      help=("use read length in addition to position and UMI"
                            "to identify possible duplicates"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "w"
    else:
        out_mode = "wb"

    if options.stats:
        if options.ignore_umi:
            raise ValueError("'--output-stats' and '--ignore-umi' options"
                             " cannot be used together")

    if options.further_stats:
        if not options.stats:
            raise ValueError("'--further-stats' options requires "
                             "'--output-stats' option")
        if options.method not in ["cluster", "adjacency"]:
            raise ValueError("'--further-stats' only enabled with 'cluster' "
                             "and 'adjacency' methods")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = TwoPassPairWriter(infile, outfile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" %
                    (options.detection_method, ",".join(
                        [x for x in bam_features if bam_features[x]])))

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = random_read_generator(infile.filename, chrom=options.chrom)

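    # each bundle maps UMI -> {"read": representative read, "count": number
    # of reads} for one position/grouping key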
    for bundle in get_bundles(infile,
                              ignore_umi=options.ignore_umi,
                              subset=float(options.subset),
                              quality_threshold=options.mapping_quality,
                              paired=options.paired,
                              chrom=options.chrom,
                              spliced=options.spliced,
                              soft_clip_threshold=options.soft,
                              per_contig=options.per_contig,
                              whole_contig=options.whole_contig,
                              read_length=options.read_length,
                              detection_method=options.detection_method):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats
            average_distance = get_average_umi_distance(bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = get_average_umi_distance(random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:

            # set up ClusterAndReducer functor with methods specific to
            # specified options.method
            processor = ClusterAndReducer(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts, topologies, nodes = processor(
                bundle, options.threshold, options.stats,
                options.further_stats)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [x.qname.split("_")[-1] for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = get_average_umi_distance(post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = get_average_umi_distance(random_umis)
                post_cluster_stats_null.append(average_distance_null)

                if options.further_stats:
                    for c_type, count in topologies.most_common():
                        topology_counts[c_type] += count
                    for c_type, count in nodes.most_common():
                        node_counts[c_type] += count

    outfile.close()

    if options.stats:

        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # generate histograms of counts per UMI at each position
        UMI_counts_df_pre = pd.DataFrame(
            stats_pre_df.pivot_table(columns=stats_pre_df["counts"],
                                     values="counts",
                                     aggfunc=len))
        UMI_counts_df_post = pd.DataFrame(
            stats_post_df.pivot_table(columns=stats_post_df["counts"],
                                      values="counts",
                                      aggfunc=len))

        UMI_counts_df_pre.columns = ["instances"]
        UMI_counts_df_post.columns = ["instances"]

        UMI_counts_df = pd.merge(UMI_counts_df_pre,
                                 UMI_counts_df_post,
                                 how='left',
                                 left_index=True,
                                 right_index=True,
                                 sort=True,
                                 suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        UMI_counts_df = UMI_counts_df.fillna(0).astype(int)

        UMI_counts_df.to_csv(options.stats + "_per_umi_per_position.tsv",
                             sep="\t")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df,
                          agg_post_df,
                          how='left',
                          left_index=True,
                          right_index=True,
                          sort=True,
                          suffixes=["_pre", "_post"])

        # TS - see comment above regarding missing values
        agg_df = agg_df.fillna(0).astype(int)
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(
            max(
                map(max, [
                    pre_cluster_stats, post_cluster_stats,
                    pre_cluster_stats_null, post_cluster_stats_null
                ])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster, minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        edit_distance_df = pd.DataFrame({
            "unique":
            tallyCounts(pre_cluster_binned, max_ed),
            "unique_null":
            tallyCounts(pre_cluster_null_binned, max_ed),
            options.method:
            tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method:
            tallyCounts(post_cluster_null_binned, max_ed),
            "edit_distance":
            cluster_bins
        })

        # TS - set lowest bin (-1) to "Single_UMI"
        edit_distance_df.loc[0, 'edit_distance'] = "Single_UMI"

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False,
                                sep="\t")

        if options.further_stats:
            with U.openFile(options.stats + "_topologies.tsv", "w") as outf:
                outf.write("\n".join([
                    "\t".join((x, str(y)))
                    for x, y in topology_counts.most_common()
                ]) + "\n")

            with U.openFile(options.stats + "_nodes.tsv", "w") as outf:
                outf.write("\n".join([
                    "\t".join(map(str, (x, y)))
                    for x, y in node_counts.most_common()
                ]) + "\n")

    # write footer and output benchmark information.

    U.info("Number of reads in: %i, Number of reads out: %i" %
           (nInput, nOutput))
    U.Stop()
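
The pivot_table call above builds a histogram of reads-per-UMI; on a toy frame it is equivalent in spirit to a sorted value_counts (data invented):

import pandas as pd

stats_pre_df = pd.DataFrame({"UMI": list("abcde"),
                             "counts": [1, 1, 2, 5, 5]})

# how many UMIs were observed with each per-UMI read count
UMI_counts = stats_pre_df["counts"].value_counts().sort_index()
UMI_counts.name = "instances"
print(UMI_counts)   # count 1 -> 2 instances, 2 -> 1, 5 -> 2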
Example #15
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])

    group = U.OptionGroup(parser, "whitelist-specific options")

    group.add_option("--plot-prefix",
                     dest="plot_prefix", type="string",
                     help=("Prefix for plots to visualise the automated "
                           "detection of the number of 'true' cell barcodes"))
    group.add_option("--subset-reads",
                     dest="subset_reads", type="int",
                     help=("Use the first N reads to automatically identify "
                           "the true cell barcodes. If N is greater than the "
                           "number of reads, all reads will be used. "
                           "Default is 100,000,000"))
    group.add_option("--error-correct-threshold",
                     dest="error_correct_threshold",
                     type="int",
                     help=("Hamming distance for correction of barcodes to "
                           "whitelist barcodes. This value will also be used "
                           "for error detection above the knee if required "
                           "(--ed-above-threshold)"))
    group.add_option("--method",
                     dest="method",
                     choices=["reads", "umis"],
                     help=("Use reads or unique umi counts per cell"))
    group.add_option("--knee-method",
                     dest="knee_method",
                     choices=["distance", "density"],
                     help=("Use distance or density methods for detection of knee"))
    group.add_option("--expect-cells",
                     dest="expect_cells",
                     type="int",
                     help=("Prior expectation on the upper limit on the "
                           "number of cells sequenced"))
    group.add_option("--allow-threshold-error",
                     dest="allow_threshold_error", action="store_true",
                     help=("Don't select a threshold. Will still "
                           "output the plots if requested (--plot-prefix)"))
    group.add_option("--set-cell-number",
                     dest="cell_number",
                     type="int",
                     help=("Specify the number of cell barcodes to accept"))

    parser.add_option("--ed-above-threshold",
                      dest="ed_above_threshold", type="choice",
                      choices=["discard", "correct"],
                      help=("Detect CBs above the threshold which may be "
                            "sequence errors from another CB and either "
                            "'discard' or 'correct'. Default=discard"))
    parser.add_option_group(group)

    parser.set_defaults(method="reads",
                        knee_method="distance",
                        extract_method="string",
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        allow_threshold_error=False,
                        cell_number=False,
                        ed_above_threshold=None,
                        ignore_suffix=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser, argv=argv,
                              add_extract_options=True,
                              add_group_dedup_options=False,
                              add_umi_grouping_options=False,
                              add_sam_options=False)

    if options.filtered_out and options.extract_method != "regex":
        U.error("Reads will not be filtered unless extract method is "
                "set to regex (--extract-method=regex)")

    if options.expect_cells:
        if options.knee_method == "distance":
            U.error("Cannot use --expect-cells with 'distance' knee "
                    "method. Switch to --knee-method=density if you want to "
                    "provide an expectation for the number of "
                    "cells. Alternatively, if you know the number of cell "
                    "barcodes, use --cell-number")
        if options.cell_number:
            U.error("Cannot supply both --expect-cells and "
                    "--cell-number options")

    extract_cell, extract_umi = U.validateExtractOptions(options)

    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" (
                        options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = extract_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    if options.filtered_out:
        filtered_out = U.openFile(options.filtered_out, "w")

    if not options.read2_in:
        for read1 in read1s:

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1)
            if barcode_values is None:
                if options.filtered_out:
                    filtered_out.write(str(read1) + "\n")
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:

        if options.filtered_out2:
            filtered_out2 = U.openFile(options.filtered_out2, "w")

        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in izip(read1s, read2s):

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1

            barcode_values = ReadExtractor.getBarcodes(read1, read2)
            if barcode_values is None:
                if options.filtered_out:
                    filtered_out.write(str(read1) + "\n")
                if options.filtered_out2:
                    filtered_out2.write(str(read2) + "\n")
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    U.info("Starting - whitelist determination")

    if options.method == "umis":
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-barcode option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected>= %s cell barcodes" % (
                len(cell_barcode_counts),
                options.subset_reads,
                options.cell_number))

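    # knee detection: accept the cell barcodes above the selected threshold
    # and, where error correction applies, map each accepted barcode to the
    # barcodes that will be corrected to it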
    cell_whitelist, true_to_false_map = whitelist_methods.getCellWhitelist(
        cell_barcode_counts,
        options.knee_method,
        options.expect_cells,
        options.cell_number,
        options.error_correct_threshold,
        options.plot_prefix)

    if cell_whitelist:
        U.info("Top %s cell barcodes passed the selected threshold" %
               len(cell_whitelist))

    if options.ed_above_threshold:
        cell_whitelist, true_to_false_map = whitelist_methods.errorDetectAboveThreshold(
            cell_barcode_counts,
            cell_whitelist,
            true_to_false_map,
            errors=options.error_correct_threshold,
            resolution_method=options.ed_above_threshold)

    if cell_whitelist:
        U.info("Writing out whitelist")
        total_correct_barcodes = 0
        total_corrected_barcodes = 0
        for barcode in sorted(list(cell_whitelist)):

            total_correct_barcodes += cell_barcode_counts[barcode]

            if true_to_false_map:
                corrected_barcodes = ",".join(
                    sorted(true_to_false_map[barcode]))

                correct_barcode_counts = [cell_barcode_counts[x] for x in
                                          sorted(true_to_false_map[barcode])]
                total_corrected_barcodes += sum(correct_barcode_counts)

                corrected_barcode_counts = ",".join(
                    map(str, correct_barcode_counts))
            else:
                corrected_barcodes, corrected_barcode_counts = "", ""

            options.stdout.write("%s\t%s\t%s\t%s\n" % (
                barcode, corrected_barcodes, cell_barcode_counts[barcode],
                corrected_barcode_counts))
    else:
        msg = ("No local minima was accepted. Recommend checking the plot "
               "output and counts per local minima (requires `--plot-prefix`"
               "option) and then re-running with manually selected threshold "
               "(`--set-cell-number` option)")

        if options.allow_threshold_error:
            U.info(msg)
        else:
            U.error(msg)

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))

    if cell_whitelist:
        U.info("Found %i total reads matching the selected cell barcodes" %
               total_correct_barcodes)
        U.info("Found %i total reads which can be error corrected to the "
               "selected cell barcodes" % total_corrected_barcodes)

    if options.filtered_out:
        filtered_out.close()
    if options.filtered_out2:
        filtered_out2.close()

    U.Stop()
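
The 'umis' counting mode above reduces each cell's tally to the size of its UMI set; a self-contained toy run (reads invented):

import collections

cell_barcode_umis = collections.defaultdict(set)
cell_barcode_counts = collections.Counter()

# (cell, umi) pairs as they might come off the read iterator
for cell, umi in [("AAA", "T"), ("AAA", "T"), ("AAA", "G"), ("CCC", "T")]:
    cell_barcode_umis[cell].add(umi)

for cell in cell_barcode_umis:
    cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

print(cell_barcode_counts)   # Counter({'AAA': 2, 'CCC': 1})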
Example #16
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--split-barcode",
                      dest="split",
                      action="store_true",
                      help="barcode is split across read pair")
    parser.add_option("-p",
                      "--bc-pattern",
                      dest="pattern",
                      type="string",
                      help="Barcode pattern. Ns are random bases X's fixed")
    parser.add_option("--bc-pattern2",
                      dest="pattern2",
                      type="string",
                      help="Barcode pattern. Ns are random bases X's fixed")
    parser.add_option("--read2-in",
                      dest="read2_in",
                      type="string",
                      help="file name for read pairs")
    parser.add_option("--3prime",
                      dest="prime3",
                      action="store_true",
                      help="barcode is on 3' end of read")
    parser.add_option("--read2-out",
                      dest="read2_out",
                      type="string",
                      help="file to output processed paired read to")
    parser.add_option("--quality-filter-threshold",
                      dest="quality_filter_threshold",
                      type="int",
                      help=("Remove reads where any UMI base quality score "
                            "falls below this threshold"))
    parser.add_option("--quality-encoding",
                      dest="quality_encoding",
                      type="choice",
                      choices=["phred33", "phred64", "solexa"],
                      help=("Quality score encoding. Choose from phred33"
                            "[33-77] phred64 [64-106] or solexa [59-106]"))
    parser.add_option("--supress-stats",
                      dest="stats",
                      action="store_false",
                      help="Suppress the writing of stats to the log")

    parser.set_defaults(split=False,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=None,
                        prime3=False,
                        stats=True,
                        quality_filter_threshold=None,
                        quality_encoding=None)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser, argv=argv)

    # check options
    if not options.pattern:
        raise ValueError("must specify a pattern using ``--bc-pattern``")

    if options.split:
        if not options.read2_in:
            raise ValueError("must specify a paired fastq ``--read2-in``")

        if not options.pattern2:
            options.pattern2 = options.pattern

    if options.read2_in:
        if not options.read2_out:
            raise ValueError("must specify an output for the paired end "
                             "``--read2-out``")

    if options.quality_filter_threshold:
        if not options.quality_encoding:
            raise ValueError("must provide a quality encoding to filter UMIs "
                             "by quality ``--quality-encoding``")

    # Initialise the processor
    processor = Extractor(options.pattern, options.pattern2,
                          options.quality_filter_threshold,
                          options.quality_encoding, options.prime3)

    read1s = fastqIterate(options.stdin)

    if options.read2_in is None:

        for read in read1s:
            new_1 = processor(read)
            if new_1:
                options.stdout.write(str(new_1) + "\n")

    else:

        read2s = fastqIterate(U.openFile(options.read2_in))
        read2_out = U.openFile(options.read2_out, "w")

        for read1, read2 in izip(read1s, read2s):
            new_1, new_2 = processor(read1, read2)
            if new_1:
                options.stdout.write(str(new_1) + "\n")
                read2_out.write(str(new_2) + "\n")

    # write footer and output benchmark information.

    if options.stats:

        options.stdlog.write("\t".join(["Barcode", "UMI", "Sample", "Count"]) +
                             "\n")
        for id in processor.bc_count:
            options.stdlog.write("\t".join(id +
                                           (str(processor.bc_count[id]), )) +
                                 "\n")

    U.Stop()
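
A worked illustration of the string `--bc-pattern` semantics described above, assuming 'N' marks a UMI base and 'X' a fixed base that is retained in the read; `split_barcode` is an illustrative helper, not part of the script.

def split_barcode(sequence, pattern="NNNXXXX"):  # pattern is made up
    """Split the start of a read into its UMI and the retained bases."""
    prefix = sequence[:len(pattern)]
    umi = "".join(b for b, p in zip(prefix, pattern) if p == "N")
    retained = "".join(b for b, p in zip(prefix, pattern) if p == "X")
    # the UMI bases are removed; fixed bases plus the remainder stay
    return umi, retained + sequence[len(pattern):]

# e.g. split_barcode("ACGTTTTGGCA") returns ("ACG", "TTTTGGCA")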
Example #17
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--in-sam",
                      dest="in_sam",
                      action="store_true",
                      help="Input file is in sam format [default=%default]",
                      default=False)
    parser.add_option(
        "-o",
        "--out-sam",
        dest="out_sam",
        action="store_true",
        help="Output alignments in sam format [default=%default]",
        default=False)
    parser.add_option("--umi-separator",
                      dest="umi_sep",
                      type="string",
                      help="separator between read id and UMI",
                      default="_")
    parser.add_option("--umi-tag",
                      dest="umi_tag",
                      type="string",
                      help="tag containing umi",
                      default='RX')
    parser.add_option("--umi-group-tag",
                      dest="umi_group_tag",
                      type="string",
                      help="tag for the outputted umi group",
                      default='BX')
    parser.add_option("--extract-umi-method",
                      dest="get_umi_method",
                      type="choice",
                      choices=("read_id", "tag"),
                      default="read_id",
                      help="where is the read UMI encoded? [default=%default]")
    parser.add_option("--subset",
                      dest="subset",
                      type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=None)
    parser.add_option("--spliced-is-unique",
                      dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one [default=%default]",
                      default=False)
    parser.add_option("--soft-clip-threshold",
                      dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      "read is counted as spliced [default=%default]",
                      default=4)
    parser.add_option("--edit-distance-threshold",
                      dest="threshold",
                      type="int",
                      default=1,
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering. [default=%default]")
    parser.add_option("--chrom",
                      dest="chrom",
                      type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired",
                      dest="paired",
                      action="store_true",
                      default=False,
                      help="paired BAM. [default=%default]")
    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("adjacency", "directional", "unique",
                               "cluster"),
                      default="directional",
                      help="method to use for umi deduping [default=%default]")
    parser.add_option("--per-contig",
                      dest="per_contig",
                      action="store_true",
                      default=False,
                      help=("dedup per contig (field 3 in BAM; RNAME),"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--per-gene",
                      dest="per_gene",
                      action="store_true",
                      default=False,
                      help=("Deduplicate per gene,"
                            "e.g for transcriptome where contig = transcript"
                            "must also provide a transript to gene map with"
                            "--gene-transcript-map [default=%default]"))
    parser.add_option("--gene-transcript-map",
                      dest="gene_transcript_map",
                      type="string",
                      help="file mapping transcripts to genes (tab separated)",
                      default=None)
    parser.add_option("--gene-tag",
                      dest="gene_tag",
                      type="string",
                      help=("Deduplicate per gene where gene is"
                            "defined by this bam tag [default=%default]"),
                      default=None)
    parser.add_option(
        "--read-length",
        dest="read_length",
        action="store_true",
        default=False,
        help=("use read length in addition to position and UMI"
              "to identify possible duplicates [default=%default]"))
    parser.add_option("--mapping-quality",
                      dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained"
                      " [default=%default]",
                      default=0)
    parser.add_option(
        "--output-unmapped",
        dest="output_unmapped",
        action="store_true",
        default=False,
        help=("Retain all unmapped reads in output[default=%default]"))
    parser.add_option(
        "--group-out",
        dest="tsv",
        type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)
    parser.add_option(
        "--output-bam",
        dest="output_bam",
        action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              "[default=%default]"))
    parser.add_option(
        "--skip-tags-regex",
        dest="skip_regex",
        type="string",
        help=("Used with --gene-tag. "
              "Ignore reads where the gene-tag matches this regex"),
        default="^[__|Unassigned]")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    if options.per_gene:
        if not options.gene_transcript_map:
            raise ValueError(
                "--per-gene option requires --gene-transcript-map")

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join([
            "read_id", "contig", "position", "gene", "umi", "umi_count",
            "final_umi", "final_umi_count", "unique_id"
        ]))

    # set the method with which to extract umis from reads
    if options.get_umi_method == "read_id":
        umi_getter = partial(umi_methods.get_umi_read_id, sep=options.umi_sep)
    elif options.get_umi_method == "tag":
        umi_getter = partial(umi_methods.get_umi_tag, tag=options.umi_tag)
    else:
        raise ValueError("Unknown umi extraction method")

    nInput, nOutput, unique_id = 0, 0, 0

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
        gene_tag = options.gene_tag
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag

        else:
            inreads = infile.fetch(until_eof=options.output_unmapped)
            gene_tag = options.gene_tag

    for bundle, read_events, status in umi_methods.get_bundles(
            inreads,
            ignore_umi=False,
            subset=options.subset,
            quality_threshold=options.mapping_quality,
            paired=options.paired,
            spliced=options.spliced,
            soft_clip_threshold=options.soft,
            per_contig=options.per_contig,
            gene_tag=gene_tag,
            skip_regex=options.skip_regex,
            read_length=options.read_length,
            umi_getter=umi_getter,
            all_reads=True,
            return_read2=True,
            return_unmapped=options.output_unmapped):

        # write out read2s and unmapped if option set
        if status == 'single_read':
            # bundle is just a single read here
            outfile.write(bundle)
            nInput += 1
            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        if nOutput % 10000 == 0:
            U.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            U.debug("Read %i input reads" % nInput)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        for umi_group in groups:
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(
                            map(str,
                                (read.query_name, read.reference_name,
                                 umi_methods.get_read_position(
                                     read, options.soft)[1],
                                 gene, umi.decode(), counts[umi],
                                 top_umi.decode(), group_count, unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info(
        "Reads: %s" %
        ", ".join(["%s: %s" % (x[0], x[1])
                   for x in read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
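
To make the clustering step above concrete, here is a toy invocation of `network.UMIClusterer` with hand-made byte-string UMIs and counts, assuming the same `processor(umis, counts, threshold)` call signature used in the loop above.

from umi_tools import network  # assumes umi_tools is installed

counts = {b"ATAT": 100, b"ATAA": 10, b"CCGG": 50}
processor = network.UMIClusterer("directional")
groups = processor(list(counts.keys()), counts, threshold=1)
# with the directional method, b"ATAA" (edit distance 1, much lower
# count) should cluster under b"ATAT", giving two groups; the first
# UMI in each group is the most abundant and serves as top_umi above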
Example #18
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])

    group = U.OptionGroup(parser, "extract-specific options")

    # (Experimental option) Retain the UMI in the sequence read
    group.add_option("--retain-umi", dest="retain_umi", action="store_true",
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--read2-out", dest="read2_out", type="string",
                     help="file to output processed paired read to")
    group.add_option("--read2-stdout", dest="read2_stdout",
                     action="store_true",
                     help="Paired reads, send read2 to stdout, discarding read1")
    group.add_option("--quality-filter-threshold",
                     dest="quality_filter_threshold", type="int",
                     help=("Remove reads where any UMI base quality score "
                           "falls below this threshold"))
    group.add_option("--quality-filter-mask",
                     dest="quality_filter_mask", type="int",
                     help=("If a UMI base has a quality below this threshold, "
                           "replace the base with 'N'"))
    group.add_option("--quality-encoding",
                     dest="quality_encoding", type="choice",
                     choices=["phred33", "phred64", "solexa"],
                     help=("Quality score encoding. Choose from 'phred33'"
                           "[33-77] 'phred64' [64-106] or 'solexa' [59-106]"))
    group.add_option("--filter-cell-barcode",
                     dest="filter_cell_barcode",
                     action="store_true",
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--error-correct-cell",
                     dest="error_correct_cell",
                     action="store_true",
                     help=("Correct errors in the cell barcode"))
    group.add_option("--whitelist",
                     dest="whitelist", type="string",
                     help=("A whitelist of accepted cell barcodes"))
    group.add_option("--blacklist",
                     dest="blacklist", type="string",
                     help=("A blacklist of rejected cell barcodes"))
    group.add_option("--filter-umi",
                     dest="filter_umi",
                     action="store_true",
                     #help="Filter the UMIs"
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--umi-whitelist", dest="umi_whitelist",
                     type="string", default=None,
                     #help="A whitelist of accepted UMIs [default=%default]"
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--umi-whitelist-paired", dest="umi_whitelist_paired",
                     type="string", default=None,
                     #help="A whitelist of accepted UMIs for read2[default=%default]"
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--correct-umi-threshold", dest="correct_umi_threshold",
                     type="int", default=0,
                     #help="Correct errors in UMIs to the whitelist(s) provided"
                     #"if within threshold [default=%default]"
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--umi-correct-log", dest="umi_correct_log",
                     type="string", default=None,
                     #help="File logging UMI error correction",
                     help=optparse.SUPPRESS_HELP)
    group.add_option("--subset-reads", "--reads-subset",
                     dest="reads_subset", type="int",
                     help=("Only extract from the first N reads. If N is "
                           "greater than the number of reads, all reads will "
                           "be used"))
    group.add_option("--reconcile-pairs",
                     dest="reconcile", action="store_true",
                     help=("Allow the presences of reads in read2 input that "
                           "are not present in read1 input. This allows cell "
                           "barcode filtering of read1s without "
                           "considering read2s"))
    parser.add_option_group(group)

    group = U.OptionGroup(parser, "[EXPERIMENTAl] barcode extraction options")

    group.add_option("--either-read", dest="either_read", action="store_true",
                     help="UMI may be on either read (see "
                     "--either-read-resolve) for options to resolve cases where"
                     "UMI is on both reads")
    group.add_option("--either-read-resolve",
                     dest="either_read_resolve", type="choice",
                     choices=["discard", "quality"],
                     help=("How to resolve instances where both reads "
                           "contain a UMI but using --either-read."
                           "Choose from 'discard' or 'quality'"
                           "(use highest quality). default=dicard"))

    parser.add_option_group(group)

    parser.set_defaults(extract_method="string",
                        filter_cell_barcode=False,
                        whitelist=None,
                        blacklist=None,
                        error_correct_cell=False,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=False,
                        read2_stdout=False,
                        quality_filter_threshold=None,
                        quality_encoding=None,
                        reconcile=False,
                        either_read=False,
                        either_read_resolve="discard",
                        ignore_suffix=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser, argv=argv,
                              add_extract_options=True,
                              add_group_dedup_options=False,
                              add_umi_grouping_options=False,
                              add_sam_options=False)

    if options.filter_cell_barcode:
        U.info('Use of --whitelist ensures cell barcodes are filtered. '
               '--filter-cell-barcode is no longer required and may be '
               'removed in future versions.')

    if options.whitelist is not None:
        options.filter_cell_barcode = True

    if options.retain_umi and not options.extract_method == "regex":
        U.error("option --retain-umi only works with --extract-method=regex")

    if (options.filtered_out and not options.extract_method == "regex" and
            options.whitelist is None):
        U.error("Reads will not be filtered unless extract method is "
                "set to regex (--extract-method=regex) or cell "
                "barcodes are filtered (--whitelist)")

    if options.quality_filter_threshold or options.quality_filter_mask:
        if not options.quality_encoding:
            U.error("must provide a quality encoding (--quality-"
                    "encoding) to filter UMIs by quality (--quality"
                    "-filter-threshold) or mask low quality bases "
                    "with (--quality-filter-mask)")

    extract_cell, extract_umi = U.validateExtractOptions(options)

    if options.either_read:
        if extract_cell:
            U.error("Option to extract from either read (--either-read) "
                    "is not currently compatible with cell barcode extraction")
        if not options.extract_method == "regex":
            U.error("Option to extract from either read (--either-read)"
                    "requires --extract-method=regex")
        if not options.pattern or not options.pattern2:
            U.error("Option to extract from either read (--either-read)"
                    "requires --bc-pattern=[PATTERN1] and"
                    "--bc-pattern2=[PATTERN2]")

    if options.filter_umi:

        if not options.umi_whitelist:
            U.error("must provide a UMI whitelist (--umi-whitelist) if using "
                    "--filter-umi option")
        if options.pattern2 and not options.umi_whitelist_paired:
            U.error("must provide a UMI whitelist for paired end "
                    "(--umi-whitelist-paired) if using --filter-umi option "
                    "with paired end data")
        if not extract_umi:
            if options.extract_method == "string":
                U.error("barcode pattern(s) do not include any umi bases "
                        "(marked with 'Ns') %s, %s" % (
                            options.pattern, options.pattern2))
            elif options.extract_method == "regex":
                U.error("barcode regex(es) do not include any umi groups "
                        "(starting with 'umi_') %s, %s" (
                            options.pattern, options.pattern2))

    if options.whitelist:

        if not extract_cell:
            if options.extract_method == "string":
                U.error("barcode pattern(s) do not include any cell bases "
                        "(marked with 'Cs') %s, %s" % (
                            options.pattern, options.pattern2))
            elif options.extract_method == "regex":
                U.error("barcode regex(es) do not include any cell groups "
                        "(starting with 'cell_') %s, %s" (
                            options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = extract_methods.ExtractFilterAndUpdate(
        options.extract_method,
        options.pattern,
        options.pattern2,
        options.prime3,
        extract_cell,
        options.quality_encoding,
        options.quality_filter_threshold,
        options.quality_filter_mask,
        options.filter_umi,
        options.filter_cell_barcode,
        options.retain_umi,
        options.either_read,
        options.either_read_resolve)

    if options.filter_umi:
        umi_whitelist, false_to_true_map = whitelist_methods.getUserDefinedBarcodes(
            options.umi_whitelist,
            options.umi_whitelist_paired,
            deriveErrorCorrection=True,
            threshold=options.correct_umi_threshold)

        U.info("Length of whitelist: %i" % len(umi_whitelist))
        U.info("Length of 'correctable' whitelist: %i" % len(false_to_true_map))

        ReadExtractor.umi_whitelist = umi_whitelist
        ReadExtractor.umi_false_to_true_map = false_to_true_map
        ReadExtractor.umi_whitelist_counts = collections.defaultdict(
            lambda: collections.Counter())

    if options.whitelist:
        cell_whitelist, false_to_true_map = whitelist_methods.getUserDefinedBarcodes(
            options.whitelist,
            getErrorCorrection=options.error_correct_cell)

        ReadExtractor.cell_whitelist = cell_whitelist
        ReadExtractor.false_to_true_map = false_to_true_map

    if options.blacklist:
        blacklist = set()
        with U.openFile(options.blacklist, "r") as inf:
            for line in inf:
                blacklist.add(line.strip().split("\t")[0])
        ReadExtractor.cell_blacklist = blacklist

    # variables for progress monitor
    progCount = 0
    displayMax = 100000
    U.info("Starting barcode extraction")

    if options.filtered_out:
        filtered_out = U.openFile(options.filtered_out, "w")

    if options.read2_in is None:
        for read in read1s:

            # incrementing count for monitoring progress
            progCount += 1

            # update the progress display every 100k reads
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))

            new_read = ReadExtractor(read)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                    options.reads_subset):
                    break

            if not new_read:
                if options.filtered_out:
                    filtered_out.write(str(read) + "\n")
                continue

            options.stdout.write(str(new_read) + "\n")

    else:

        if options.filtered_out2:
            filtered_out2 = U.openFile(options.filtered_out2, "w")

        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))

        if options.read2_out:
            read2_out = U.openFile(options.read2_out, "w")

        if options.reconcile:
            strict = False
        else:
            strict = True

        for read1, read2 in umi_methods.joinedFastqIterate(
                read1s, read2s, strict, options.ignore_suffix):

            # incrementing count for monitoring progress
            progCount += 1

            # update the progress display every 100k reads
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))
                sys.stdout.flush()

            reads = ReadExtractor(read1, read2)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                    options.reads_subset):
                    break

            if not reads:
                if options.filtered_out:
                    filtered_out.write(str(read1) + "\n")
                if options.filtered_out2:
                    filtered_out2.write(str(read2) + "\n")
                continue
            else:
                new_read1, new_read2 = reads

            if options.read2_stdout:
                options.stdout.write(str(new_read2) + "\n")
            else:
                options.stdout.write(str(new_read1) + "\n")

                if options.read2_out:
                    read2_out.write(str(new_read2) + "\n")

    if options.read2_out:
        read2_out.close()
    if options.filtered_out:
        filtered_out.close()
    if options.filtered_out2:
        filtered_out2.close()

    for k, v in ReadExtractor.getReadCounts().most_common():
        U.info("%s: %s" % (k, v))

    if options.umi_correct_log:
        with U.openFile(options.umi_correct_log, "w") as outf:
            outf.write("umi\tcount_no_errors\tcount_errors\n")
            for umi, counts in ReadExtractor.umi_whitelist_counts.items():
                outf.write("%s\t%i\t%i\n" % (
                    umi, counts["no_error"], counts["error"]))

    U.Stop()
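
The cell barcode error correction applied by the extractor above reduces to a dictionary lookup against the false-to-true map returned by getUserDefinedBarcodes; a minimal sketch, with a hypothetical helper name.

def correct_cell_barcode(observed, cell_whitelist, false_to_true_map):
    """Return the accepted barcode for an observed one, or None."""
    if observed in cell_whitelist:
        return observed  # exact whitelist match, no correction needed
    if false_to_true_map:  # only built when error correction is enabled
        return false_to_true_map.get(observed)  # None if uncorrectable
    return None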
Example #19
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "group-specific options")

    group.add_option(
        "--group-out",
        dest="tsv",
        type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)

    group.add_option(
        "--output-bam",
        dest="output_bam",
        action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              "[default=%default]"))

    group.add_option(
        "--output-unmapped",
        dest="output_unmapped",
        action="store_true",
        default=False,
        help=("Retain all unmapped reads in output[default=%default]"))

    parser.add_option("--umi-group-tag",
                      dest="umi_group_tag",
                      type="string",
                      help="tag for the outputted umi group",
                      default='BX')

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            out_name = U.getTempFilename()
            sorted_out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename()
            sorted_out_name = "-"

    if not options.no_sort_output:  # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join([
            "read_id", "contig", "position", "gene", "umi", "umi_count",
            "final_umi", "final_umi_count", "unique_id"
        ]))

    nInput, nOutput, unique_id, input_reads, output_reads = 0, 0, 0, 0, 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag

        else:
            inreads = infile.fetch(until_eof=options.output_unmapped)

    bundle_iterator = umi_methods.get_bundles(
        options,
        all_reads=True,
        return_read2=True,
        return_unmapped=options.output_unmapped,
        metacontig_contig=metacontig2contig)

    for bundle, key, status in bundle_iterator(inreads):

        # write out read2s and unmapped (if these options are set)
        if status == 'single_read':
            # bundle is just a single read here
            nInput += 1

            if outfile:
                outfile.write(bundle)

            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        while nOutput >= output_reads + 10000:
            output_reads += 10000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        for umi_group in groups:
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(
                            map(str,
                                (read.query_name, read.reference_name,
                                 umi_methods.get_read_position(
                                     read, options.soft_clip_threshold)[1],
                                 gene, umi.decode(), counts[umi],
                                 top_umi.decode(), group_count, unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()
        if not options.no_sort_output:
            # sort the output
            pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
            os.unlink(out_name)  # delete the tempfile

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join([
        "%s: %s" % (x[0], x[1])
        for x in bundle_iterator.read_events.most_common()
    ]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
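
The final sort step above can be reproduced in isolation; a minimal sketch assuming pysam is installed, with hypothetical file names.

import os
import pysam

# sort the temporary unsorted grouped output into the final file,
# mirroring the pysam.sort call above
pysam.sort("-o", "grouped_sorted.bam", "-O", "bam", "grouped_unsorted.bam")
os.unlink("grouped_unsorted.bam")  # delete the tempfile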
Example #20
def getKneeEstimateDensity(cell_barcode_counts,
                           expect_cells=False,
                           cell_number=False,
                           plotfile_prefix=None):
    ''' estimate the number of "true" cell barcodes using a Gaussian
    density-based method

    input:
         cell_barcode_counts = dict(key = barcode, value = count)
         expect_cells (optional) = define the expected number of cells
         cell_number (optional) = define number of cell barcodes to accept
         plotfile_prefix = (optional) prefix for plots

    returns:
         List of true barcodes
    '''

    # very low abundance cell barcodes are filtered out (< 0.001 *
    # the most abundant)
    threshold = 0.001 * cell_barcode_counts.most_common(1)[0][1]

    counts = sorted(cell_barcode_counts.values(), reverse=True)
    counts_thresh = [x for x in counts if x > threshold]
    log_counts = np.log10(counts_thresh)

    # Gaussian density with hardcoded bandwidth
    density = gaussian_kde(log_counts, bw_method=0.1)

    xx_values = 10000  # how many x values for density plot
    xx = np.linspace(log_counts.min(), log_counts.max(), xx_values)

    local_min = None

    if cell_number:  # we have a prior hard expectation on the number of cells
        threshold = counts[cell_number]

    else:
        local_mins = argrelextrema(density(xx), np.less)[0]
        local_mins_counts = []

        for poss_local_min in local_mins[::-1]:

            passing_threshold = sum([
                y > np.power(10, xx[poss_local_min])
                for x, y in cell_barcode_counts.items()
            ])
            local_mins_counts.append(passing_threshold)

            if local_min is None:  # no local minimum selected yet
                if expect_cells:  # we have a "soft" expectation
                    if (passing_threshold > expect_cells * 0.1
                            and passing_threshold <= expect_cells):
                        local_min = poss_local_min

                else:  # we have no prior expectation
                    # TS: In the absence of any expectation (either hard or
                    # soft), these heuristic thresholds are used to decide
                    # which local minimum to select.
                    # This is very unlikely to be the best way to achieve this!
                    if (poss_local_min >= 0.2 * xx_values and
                        (log_counts.max() - xx[poss_local_min] > 0.5
                         or xx[poss_local_min] < log_counts.max() / 2)):
                        local_min = poss_local_min

        if local_min is not None:
            threshold = np.power(10, xx[local_min])

    if cell_number or local_min is not None:
        final_barcodes = set(
            [x for x, y in cell_barcode_counts.items() if y > threshold])
    else:
        final_barcodes = None

    if plotfile_prefix:

        # colour-blind friendly colours - https://gist.github.com/thriveth/8560036
        CB_color_cycle = [
            '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3',
            '#999999', '#e41a1c', '#dede00'
        ]
        user_line = mlines.Line2D([], [],
                                  color=CB_color_cycle[0],
                                  ls="dashed",
                                  markersize=15,
                                  label='User-defined')
        selected_line = mlines.Line2D([], [],
                                      color=CB_color_cycle[0],
                                      ls="dashed",
                                      markersize=15,
                                      label='Selected')
        rejected_line = mlines.Line2D([], [],
                                      color=CB_color_cycle[3],
                                      ls="dashed",
                                      markersize=15,
                                      label='Rejected')

        # make density plot
        fig = plt.figure()
        fig1 = fig.add_subplot(111)
        fig1.plot(xx, density(xx), 'k')
        fig1.set_xlabel("Count per cell (log10)")
        fig1.set_ylabel("Density")

        if cell_number:
            fig1.axvline(np.log10(threshold),
                         ls="dashed",
                         color=CB_color_cycle[0])
            lgd = fig1.legend(bbox_to_anchor=(1.05, 1),
                              loc=2,
                              borderaxespad=0.,
                              handles=[user_line],
                              title="Cell threshold")

        elif local_min is None:  # no local_min was accepted
            for pos in xx[local_mins]:
                fig1.axvline(x=pos, ls="dashed", color=CB_color_cycle[3])
            lgd = fig1.legend(bbox_to_anchor=(1.05, 1),
                              loc=2,
                              borderaxespad=0.,
                              handles=[selected_line, rejected_line],
                              title="Possible thresholds")
        else:
            for pos in xx[local_mins]:
                if pos == xx[local_min]:  # selected local minima
                    fig1.axvline(x=xx[local_min],
                                 ls="dashed",
                                 color=CB_color_cycle[0])
                else:
                    fig1.axvline(x=pos, ls="dashed", color=CB_color_cycle[3])

            lgd = fig1.legend(bbox_to_anchor=(1.05, 1),
                              loc=2,
                              borderaxespad=0.,
                              handles=[selected_line, rejected_line],
                              title="Possible thresholds")

        fig.savefig("%s_cell_barcode_count_density.png" % plotfile_prefix,
                    bbox_extra_artists=(lgd, ),
                    bbox_inches='tight')

        # make knee plot
        fig = plt.figure()
        fig2 = fig.add_subplot(111)
        fig2.plot(range(0, len(counts)), np.cumsum(counts), c="black")

        xmax = len(counts)
        if local_min is not None:
            # reasonable maximum x-axis value
            xmax = min(len(final_barcodes) * 5, xmax)

        fig2.set_xlim((0 - (0.01 * xmax), xmax))
        fig2.set_xlabel("Rank")
        fig2.set_ylabel("Cumulative count")

        if cell_number:
            fig2.axvline(x=cell_number, ls="dashed", color=CB_color_cycle[0])
            lgd = fig2.legend(bbox_to_anchor=(1.05, 1),
                              loc=2,
                              borderaxespad=0.,
                              handles=[user_line],
                              title="Cell threshold")

        elif local_min is None:  # no local_min was accepted
            for local_mins_count in local_mins_counts:
                fig2.axvline(x=local_mins_count,
                             ls="dashed",
                             color=CB_color_cycle[3])
            lgd = fig2.legend(bbox_to_anchor=(1.05, 1),
                              loc=2,
                              borderaxespad=0.,
                              handles=[selected_line, rejected_line],
                              title="Possible thresholds")

        else:
            for local_mins_count in local_mins_counts:
                if local_mins_count == len(
                        final_barcodes):  # selected local minima
                    fig2.axvline(x=local_mins_count,
                                 ls="dashed",
                                 color=CB_color_cycle[0])
                else:
                    fig2.axvline(x=local_mins_count,
                                 ls="dashed",
                                 color=CB_color_cycle[3])

            lgd = fig2.legend(bbox_to_anchor=(1.05, 1),
                              loc=2,
                              borderaxespad=0.,
                              handles=[selected_line, rejected_line],
                              title="Possible thresholds")

        fig.savefig("%s_cell_barcode_knee.png" % plotfile_prefix,
                    bbox_extra_artists=(lgd, ),
                    bbox_inches='tight')

        if local_min is not None:
            colours_selected = [
                CB_color_cycle[0] for x in range(0, len(final_barcodes))
            ]
            colours_rejected = [
                "black" for x in range(0,
                                       len(counts) - len(final_barcodes))
            ]
            colours = colours_selected + colours_rejected
        else:
            colours = ["black" for x in range(0, len(counts))]

        fig = plt.figure()
        fig3 = fig.add_subplot(111)
        fig3.scatter(x=range(1,
                             len(counts) + 1),
                     y=counts,
                     c=colours,
                     s=10,
                     linewidths=0)
        fig3.loglog()
        fig3.set_xlim(0, len(counts) * 1.25)
        fig3.set_xlabel('Barcode index')
        fig3.set_ylabel('Count')

        if cell_number:
            fig3.axvline(x=cell_number, ls="dashed", color=CB_color_cycle[0])
            lgd = fig3.legend(bbox_to_anchor=(1.05, 1),
                              loc=2,
                              borderaxespad=0.,
                              handles=[user_line],
                              title="Cell threshold")
        elif local_min is None:  # no local_min was accepted
            for local_mins_count in local_mins_counts:
                fig3.axvline(x=local_mins_count,
                             ls="dashed",
                             color=CB_color_cycle[3])
            lgd = fig3.legend(bbox_to_anchor=(1.05, 1),
                              loc=2,
                              borderaxespad=0.,
                              handles=[selected_line, rejected_line],
                              title="Possible thresholds")
        else:
            for local_mins_count in local_mins_counts:
                if local_mins_count == len(
                        final_barcodes):  # selected local minima
                    fig3.axvline(x=local_mins_count,
                                 ls="dashed",
                                 color=CB_color_cycle[0])
                else:
                    fig3.axvline(x=local_mins_count,
                                 ls="dashed",
                                 color=CB_color_cycle[3])

            lgd = fig3.legend(bbox_to_anchor=(1.05, 1),
                              loc=2,
                              borderaxespad=0.,
                              handles=[selected_line, rejected_line],
                              title="Possible thresholds")

        fig.savefig("%s_cell_barcode_counts.png" % plotfile_prefix,
                    bbox_extra_artists=(lgd, ),
                    bbox_inches='tight')

        if not cell_number:
            with U.openFile("%s_cell_thresholds.tsv" % plotfile_prefix,
                            "w") as outf:
                outf.write("count\taction\n")
                for local_mins_count in local_mins_counts:
                    if (local_min is not None and
                            local_mins_count == len(final_barcodes)):
                        threshold_type = "Selected"
                    else:
                        threshold_type = "Rejected"

                    outf.write("%s\t%s\n" % (local_mins_count, threshold_type))

    return final_barcodes
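
A hypothetical invocation of getKneeEstimateDensity with simulated counts (all numbers made up): a small population of high-count "real" cells plus many low-count ambient barcodes, supplied as a collections.Counter since the function calls .most_common().

import collections
import numpy as np

np.random.seed(10)
real = {"CELL%04d" % i: int(c) for i, c in
        enumerate(np.random.lognormal(8, 0.3, 100))}
ambient = {"NOISE%05d" % i: int(c) + 1 for i, c in
           enumerate(np.random.lognormal(2, 0.5, 5000))}
cell_barcode_counts = collections.Counter(real)
cell_barcode_counts.update(ambient)

true_barcodes = getKneeEstimateDensity(cell_barcode_counts)
# if a local minimum passes the heuristics, this should recover roughly
# the 100 high-count barcodes; otherwise the function returns None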
Example #21
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--read2-out", dest="read2_out", type="string",
                      help="file to output processed paired read to")
    parser.add_option("--read2-stdout", dest="read2_stdout",
                      action="store_true",
                      help="Paired reads, send read2 to stdout, discarding read1")
    parser.add_option("--quality-filter-threshold",
                      dest="quality_filter_threshold", type="int",
                      help=("Remove reads where any UMI base quality score "
                            "falls below this threshold"))
    parser.add_option("--quality-filter-mask",
                      dest="quality_filter_mask", type="int",
                      help=("If a UMI base has a quality below this threshold, "
                            "replace the base with 'N'"))
    parser.add_option("--quality-encoding",
                      dest="quality_encoding", type="choice",
                      choices=["phred33", "phred64", "solexa"],
                      help=("Quality score encoding. Choose from 'phred33'"
                            "[33-77] 'phred64' [64-106] or 'solexa' [59-106]"))
    parser.add_option("--extract-method",
                      dest="extract_method", type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--filter-cell-barcode",
                      dest="filter_cell_barcode",
                      action="store_true",
                      help="Filter the cell barcodes")
    parser.add_option("--error-correct-cell",
                      dest="error_correct_cell",
                      action="store_true",
                      help=("Correct errors in the cell barcode"))
    parser.add_option("--whitelist",
                      dest="whitelist", type="string",
                      help=("A whitelist of accepted cell barcodes"))
    parser.add_option("--blacklist",
                      dest="blacklist", type="string",
                      help=("A blacklist of accepted cell barcodes"))
    parser.add_option("--reads-subset",
                      dest="reads_subset", type="int",
                      help=("Only extract from the first N reads. If N is "
                            "greater than the number of reads, all reads will "
                            "be used"))
    parser.add_option("--reconcile-pairs",
                      dest="reconcile", action="store_true",
                      help=("Allow the presences of reads in read2 input that are"
                            "not present in read1 input. This allows cell barcode"
                            "filtering of read1s without considering read2s"))
    parser.set_defaults(extract_method="string",
                        filter_cell_barcode=False,
                        whitelist=None,
                        blacklist=None,
                        error_correct_cell=False,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=False,
                        read2_stdout=False,
                        quality_filter_threshold=None,
                        quality_encoding=None,
                        reconcile=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    if options.quality_filter_threshold or options.quality_filter_mask:
        if not options.quality_encoding:
            U.error("must provide a quality encoding (--quality-"
                    "encoding) to filter UMIs by quality (--quality"
                    "-filter-threshold) or mask low quality bases "
                    "with (--quality-filter-mask)")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern "
                    "if paired-end ")

    if options.pattern2:
        if not options.read2_in:
            U.error("must specify a paired fastq ``--read2-in``")

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # check whether the regex contains umi group(s) and cell group(s)
    if options.extract_method == "regex":
        if options.pattern:
            for group in options.pattern.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True
        if options.pattern2:
            for group in options.pattern2.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        if options.pattern:
            if "C" in options.pattern:
                extract_cell = True
            if "N" in options.pattern:
                extract_umi = True
        if options.pattern2:
            if "C" in options.pattern2:
                extract_cell = True
            if "N" in options.pattern2:
                extract_umi = True

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" (
                        options.pattern, options.pattern2))

    if options.filter_cell_barcode:

        if not options.whitelist:
            U.error("must provide a whitelist (--whitelist) if using "
                    "--filter-cell-barcode option")

        if not extract_cell:
            if options.extract_method == "string":
                U.error("barcode pattern(s) do not include any cell bases "
                        "(marked with 'Cs') %s, %s" % (
                            options.pattern, options.pattern2))
            elif options.extract_method == "regex":
                U.error("barcode regex(es) do not include any cell groups "
                        "(starting with 'cell_') %s, %s" (
                            options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        options.extract_method,
        options.pattern,
        options.pattern2,
        options.prime3,
        extract_cell,
        options.quality_encoding,
        options.quality_filter_threshold,
        options.quality_filter_mask,
        options.filter_cell_barcode)

    if options.filter_cell_barcode:
        cell_whitelist, false_to_true_map = umi_methods.getUserDefinedBarcodes(
            options.whitelist, options.error_correct_cell)

        ReadExtractor.cell_whitelist = cell_whitelist
        ReadExtractor.false_to_true_map = false_to_true_map

    if options.blacklist:
        blacklist = set()
        with U.openFile(options.blacklist, "r") as inf:
            for line in inf:
                blacklist.add(line.strip().split("\t")[0])
        ReadExtractor.cell_blacklist = blacklist

    # variables for progress monitor
    progCount = 0
    displayMax = 100000
    U.info("Starting barcode extraction")

    if options.read2_in is None:
        for read in read1s:

            # incrementing count for monitoring progress
            progCount += 1

            # update the progress display every 100,000 reads
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))

            new_read = ReadExtractor(read)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                    options.reads_subset):
                    break

            if not new_read:
                continue

            options.stdout.write(str(new_read) + "\n")

    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))

        if options.read2_out:
            read2_out = U.openFile(options.read2_out, "w")

        if options.reconcile:
            strict = False
        else:
            strict = True

        for read1, read2 in umi_methods.joinedFastqIterate(
                read1s, read2s, strict):

            # incrementing count for monitoring progress
            progCount += 1

            # update the progress display every 100,000 reads
            if progCount % displayMax == 0:
                U.info("Parsed {} reads".format(progCount))
                sys.stdout.flush()

            reads = ReadExtractor(read1, read2)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                    options.reads_subset):
                    break

            if not reads:
                continue
            else:
                new_read1, new_read2 = reads

            if options.read2_stdout:
                options.stdout.write(str(new_read2) + "\n")
            else:
                options.stdout.write(str(new_read1) + "\n")

                if options.read2_out:
                    read2_out.write(str(new_read2) + "\n")

    if options.read2_out:
        read2_out.close()

    for k, v in ReadExtractor.getReadCounts().most_common():
        U.info("%s: %s" % (k, v))

    U.Stop()
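
# --- illustrative sketch (not part of the original source) ---
# How the cell_/umi_ group detection above works: the third-party `regex`
# module exposes named groups through `groupindex`, just like `re`. The
# pattern here is a hypothetical example, not one taken from the source.
import regex

pattern = regex.compile(r"(?P<cell_1>.{8})(?P<umi_1>.{4})(?P<discard_1>T{3})")
for group in pattern.groupindex:
    if group.startswith("cell_"):
        print(group, "contributes to the cell barcode")
    elif group.startswith("umi_"):
        print(group, "contributes to the UMI")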
Example #22
def getKneeEstimateDistance(cell_barcode_counts,
                            cell_number=False,
                            plotfile_prefix=None):
    ''' estimate the number of "true" cell barcodes via a knee method
    which finds the point with maximum distance

    input:
         cell_barcode_counts = dict(key = barcode, value = count)
         cell_number (optional) = define number of cell barcodes to accept
         plotfile_prefix (optional) = prefix for plots

    returns:
         List of true barcodes
    '''
    def getKneeDistance(values):
        '''
        This function is based on
        https://stackoverflow.com/questions/2018178/finding-the-best-trade-off-point-on-a-curve

        and https://dataplatform.cloud.ibm.com/analytics/notebooks/54d79c2a-f155-40ec-93ec-ed05b58afa39/view?access_token=6d8ec910cf2a1b3901c721fcb94638563cd646fe14400fecbb76cea6aaae2fb1

        The idea is to draw a line from the first to last point on the
        cumulative counts curve and then find the point on the curve
        which is the maximum distance away from this line
        '''

        # get coordinates of all the points
        nPoints = len(values)
        allCoord = np.vstack((range(nPoints), values)).T

        # get the first point
        firstPoint = allCoord[0]
        # get vector between first and last point - this is the line
        lineVec = allCoord[-1] - allCoord[0]
        lineVecNorm = lineVec / np.sqrt(np.sum(lineVec**2))

        # find the distance from each point to the line:
        # vector between all points and first point
        vecFromFirst = allCoord - firstPoint

        # To calculate the distance to the line, we split vecFromFirst into
        # two components: one parallel to the line and one perpendicular.
        # The distance is the norm of the perpendicular component.
        # The parallel component is the projection of vecFromFirst onto the
        # line: taking the scalar product with the unit vector along the
        # line gives the length of the projection, and multiplying that by
        # the unit vector gives vecFromFirstParallel. The perpendicular
        # component is then vecFromFirst - vecFromFirstParallel.

        scalarProduct = np.sum(vecFromFirst *
                               npm.repmat(lineVecNorm, nPoints, 1),
                               axis=1)
        vecFromFirstParallel = np.outer(scalarProduct, lineVecNorm)
        vecToLine = vecFromFirst - vecFromFirstParallel

        # distance to line is the norm of vecToLine
        distToLine = np.sqrt(np.sum(vecToLine**2, axis=1))

        # knee/elbow is the point with max distance value
        idxOfBestPoint = np.argmax(distToLine)

        return (distToLine, idxOfBestPoint)

    counts = [x[1] for x in cell_barcode_counts.most_common()]
    values = list(np.cumsum(counts))

    # We need to perform the distance knee iteratively with reduced
    # number of CBs since it's sensitive to the number of CBs input
    # and overestimates if too many CBs are used
    previous_idxOfBestPoint = 0
    distToLine, idxOfBestPoint = getKneeDistance(values)
    if idxOfBestPoint == 0:
        raise ValueError("Something's gone wrong here!!")

    max_iterations = 100
    iterations = 0
    while idxOfBestPoint - previous_idxOfBestPoint != 0:
        previous_idxOfBestPoint = idxOfBestPoint
        iterations += 1
        if iterations > max_iterations:
            break
        distToLine, idxOfBestPoint = getKneeDistance(values[:idxOfBestPoint *
                                                            3])

    knee_final_barcodes = [
        x[0] for x in cell_barcode_counts.most_common()[:idxOfBestPoint + 1]
    ]

    if cell_number:
        threshold = counts[cell_number]
        final_barcodes = set(
            [x for x, y in cell_barcode_counts.items() if y > threshold])
    else:
        final_barcodes = knee_final_barcodes

    if plotfile_prefix:

        # colour-blind friendly colours - https://gist.github.com/thriveth/8560036
        CB_color_cycle = [
            '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3',
            '#999999', '#e41a1c', '#dede00'
        ]

        user_line = mlines.Line2D([], [],
                                  color=CB_color_cycle[2],
                                  ls="dashed",
                                  markersize=15,
                                  label='User-defined')
        selected_line = mlines.Line2D([], [],
                                      color=CB_color_cycle[0],
                                      ls="dashed",
                                      markersize=15,
                                      label='Knee')

        # plot of the original curve and its corresponding distances
        plt.figure(figsize=(12, 6))
        plt.plot(distToLine, label='Distance', color='r')
        plt.plot(values, label='Cumulative', color='b')
        plt.plot([idxOfBestPoint],
                 values[idxOfBestPoint],
                 marker='o',
                 markersize=8,
                 color="red",
                 label='Knee')

        if cell_number:
            plt.axvline(x=cell_number,
                        ls="dashed",
                        color=CB_color_cycle[2],
                        label="User-defined")

        plt.legend()
        plt.savefig("%s_cell_barcode_knee.png" % plotfile_prefix)

        colours_selected = [CB_color_cycle[0]] * len(final_barcodes)
        colours_rejected = ["black"] * (len(counts) - len(final_barcodes))
        colours = colours_selected + colours_rejected

        fig = plt.figure()
        fig3 = fig.add_subplot(111)
        fig3.scatter(x=range(1,
                             len(counts) + 1),
                     y=counts,
                     c=colours,
                     s=10,
                     linewidths=0)
        fig3.loglog()
        fig3.set_xlim(0, len(counts) * 1.25)
        fig3.set_xlabel('Barcode index')
        fig3.set_ylabel('Count')
        fig3.axvline(x=len(knee_final_barcodes),
                     ls="dashed",
                     color=CB_color_cycle[0])

        if cell_number:
            fig3.axvline(x=cell_number, ls="dashed", color=CB_color_cycle[2])

            lgd = fig3.legend(bbox_to_anchor=(1.05, 1),
                              loc=2,
                              borderaxespad=0.,
                              handles=[selected_line, user_line],
                              title="User threshold")
        else:
            lgd = fig3.legend(bbox_to_anchor=(1.05, 1),
                              loc=2,
                              borderaxespad=0.,
                              handles=[selected_line],
                              title="Knee threshold")

        fig.savefig("%s_cell_barcode_counts.png" % plotfile_prefix,
                    bbox_extra_artists=(lgd, ),
                    bbox_inches='tight')

        if not cell_number:
            with U.openFile("%s_cell_thresholds.tsv" % plotfile_prefix,
                            "w") as outf:
                outf.write("count\n")
                outf.write("%s\n" % idxOfBestPoint)

    return final_barcodes
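
# --- illustrative sketch (not part of the original source) ---
# A minimal, self-contained version of the point-to-line distance used by
# getKneeDistance above, checked on a toy saturating curve. Assumes only
# numpy; the function name and data are made up for illustration.
import numpy as np

def knee_index(values):
    '''index of the point furthest from the chord joining first and last'''
    pts = np.vstack((np.arange(len(values)), values)).T
    unit = (pts[-1] - pts[0]) / np.linalg.norm(pts[-1] - pts[0])
    rel = pts - pts[0]
    # remove the component parallel to the chord; what remains is the
    # perpendicular component, whose norm is the distance to the chord
    parallel = np.outer(rel @ unit, unit)
    return int(np.argmax(np.linalg.norm(rel - parallel, axis=1)))

# steep rise then plateau: the knee should sit near the bend (index 19)
toy = np.concatenate([np.linspace(0, 100, 20), np.linspace(100, 110, 80)])
print(knee_index(toy))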
Example #23
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])
    group = U.OptionGroup(parser, "dedup-specific options")

    group.add_option("--output-stats",
                     dest="stats",
                     type="string",
                     default=False,
                     help="Specify location to output stats")

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options, group=False)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = options.stdout.name
        options.stdout.close()
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = "-"

    if not options.no_sort_output:  # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    if options.stats and options.ignore_umi:
        raise ValueError("'--output-stats' and '--ignore-umi' options"
                         " cannot be used together")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = sam_methods.TwoPassPairWriter(infile, outfile)

    nInput, nOutput, input_reads, output_reads = 0, 0, 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" %
                    (options.detection_method, ",".join(
                        [x for x in bam_features if bam_features[x]])))

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)

    else:
        if options.per_contig and options.gene_transcript_map:
            metacontig2contig = sam_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = sam_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag

        else:
            inreads = infile.fetch()

    # set up ReadDeduplicator functor with methods specific
    # to the specified options.method
    processor = network.ReadDeduplicator(options.method)

    bundle_iterator = sam_methods.get_bundles(
        options, metacontig_contig=metacontig2contig)

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = umi_methods.random_read_generator(
            infile.filename,
            chrom=options.chrom,
            barcode_getter=bundle_iterator.barcode_getter)

    for bundle, key, status in bundle_iterator(inreads):

        nInput += sum([bundle[umi]["count"] for umi in bundle])

        while nOutput >= output_reads + 100000:
            output_reads += 100000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        if options.stats:
            # generate pre-dedup stats
            average_distance = umi_methods.get_average_umi_distance(
                bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = umi_methods.get_average_umi_distance(
                random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts = processor(bundle=bundle,
                                                threshold=options.threshold)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [
                    bundle_iterator.barcode_getter(x)[0] for x in reads
                ]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = umi_methods.get_average_umi_distance(
                    post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = umi_methods.get_average_umi_distance(
                    random_umis)
                post_cluster_stats_null.append(average_distance_null)

    outfile.close()

    if not options.no_sort_output:
        # sort the output
        pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
        os.unlink(out_name)  # delete the tempfile

    if options.stats:

        # generate the stats dataframe
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # tally the counts per umi per position
        pre_counts = collections.Counter(stats_pre_df["counts"])
        post_counts = collections.Counter(stats_post_df["counts"])
        counts_index = list(
            set(pre_counts.keys()).union(set(post_counts.keys())))
        counts_index.sort()
        with U.openFile(options.stats + "_per_umi_per_position.tsv",
                        "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
                values = (count, pre_counts[count], post_counts[count])
                outf.write("\t".join(map(str, values)) + "\n")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df,
                          agg_post_df,
                          how='left',
                          left_index=True,
                          right_index=True,
                          sort=True,
                          suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        agg_df = agg_df.fillna(0).astype(int)

        agg_df.index = [x.decode() for x in agg_df.index]
        agg_df.index.name = 'UMI'
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(
            max(
                map(max, [
                    pre_cluster_stats, post_cluster_stats,
                    pre_cluster_stats_null, post_cluster_stats_null
                ])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster, minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        edit_distance_df = pd.DataFrame(
            {
                "unique":
                tallyCounts(pre_cluster_binned, max_ed),
                "unique_null":
                tallyCounts(pre_cluster_null_binned, max_ed),
                options.method:
                tallyCounts(post_cluster_binned, max_ed),
                "%s_null" % options.method:
                tallyCounts(post_cluster_null_binned, max_ed),
                "edit_distance":
                cluster_bins
            },
            columns=[
                "unique", "unique_null", options.method,
                "%s_null" % options.method, "edit_distance"
            ])

        # TS - set lowest bin (-1) to "Single_UMI"
        edit_distance_df.loc[0, 'edit_distance'] = "Single_UMI"

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False,
                                sep="\t")

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join([
        "%s: %s" % (x[0], x[1])
        for x in bundle_iterator.read_events.most_common()
    ]))

    U.info("Number of reads out: %i" % nOutput)

    if not options.ignore_umi:  # otherwise processor has not been used
        U.info("Total number of positions deduplicated: %i" %
               processor.UMIClusterer.positions)
        if processor.UMIClusterer.positions > 0:
            U.info("Mean number of unique UMIs per position: %.2f" %
                   (float(processor.UMIClusterer.total_umis_per_position) /
                    processor.UMIClusterer.positions))
            U.info("Max. number of unique UMIs per position: %i" %
                   processor.UMIClusterer.max_umis_per_position)
        else:
            U.warn("The BAM did not contain any valid "
                   "reads/read pairs for deduplication")

    U.Stop()
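
# --- illustrative sketch (not part of the original source) ---
# How the edit-distance binning above behaves: the per-position average
# distances are floats, -1 marks single-UMI positions, and np.digitize
# with right=True assigns each value to an integer bin which np.bincount
# then tallies. The distances below are made up for illustration.
import numpy as np

max_ed = 3
cluster_bins = list(range(-1, max_ed + 2))   # [-1, 0, 1, 2, 3, 4]
distances = [-1, -1, 0.0, 1.5, 2.0, 3.2]
binned = np.digitize(distances, cluster_bins, right=True)
print(np.bincount(binned, minlength=max_ed + 3))  # [2 1 0 2 0 1]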
Example #24
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--extract-method",
                      dest="extract_method", type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--plot-prefix",
                      dest="plot_prefix", type="string",
                      help=("Prefix for plots to visualise the automated "
                            "detection of the number of 'true' cell barcodes"))
    parser.add_option("--subset-reads",
                      dest="subset_reads", type="int",
                      help=("Use the first N reads to automatically identify "
                            "the true cell barcodes. If N is greater than the "
                            "number of reads, all reads will be used"))
    parser.add_option("--error-correct-threshold",
                      dest="error_correct_threshold",
                      type="int",
                      help=("Hamming distance for correction of "
                            "barcodes to whitelist barcodes"))
    parser.add_option("--method",
                      dest="method",
                      choices=["reads", "umis"],
                      help=("Use reads or unique umi counts per cell"))
    parser.add_option("--expect-cells",
                      dest="expect_cells",
                      type="int",
                      help=("Prior expectation on the upper limit on the "
                            "number of cells sequenced"))
    parser.add_option("--set-cell-number",
                      dest="cell_number",
                      type="int",
                      help=("Specify the number of cell barcodes to accept"))
    parser.set_defaults(method="reads",
                        extract_method="string",
                        filter_cell_barcodes=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        cell_number=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    if options.expect_cells and options.cell_number:
        U.error("Cannot supply both --expect-cells and "
                "--cell-number options")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    if options.pattern2:
        if not options.read2_in:
            U.error("must specify a paired fastq ``--read2-in``")

    # if a second read file is given without its own pattern,
    # fall back to the read1 pattern
    if options.read2_in and not options.pattern2:
        options.pattern2 = options.pattern

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # check whether the regex contains umi group(s) and cell group(s)
    if options.extract_method == "regex":
        if options.pattern:
            for group in options.pattern.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True
        if options.pattern2:
            for group in options.pattern2.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        if options.pattern:
            if "C" in options.pattern:
                extract_cell = True
            if "N" in options.pattern:
                extract_umi = True
        if options.pattern2:
            if "C" in options.pattern2:
                extract_cell = True
            if "N" in options.pattern2:
                extract_umi = True

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" (
                        options.pattern, options.pattern2))
    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" (
                        options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    if not options.read2_in:
        for read1 in read1s:

            n_reads += 1

            # update the progress display every 100,000 reads
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            barcode_values = ReadExtractor.getBarcodes(read1)
            if barcode_values is None:
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in zip(read1s, read2s):

            n_reads += 1

            # update the progress display every 100,000 reads
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            barcode_values = ReadExtractor.getBarcodes(read1, read2)
            if barcode_values is None:
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    U.info("Starting - whitelist determination")

    if options.method == "umis":
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-barcode option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected>= %s cell barcodes" % (
                len(cell_barcode_counts),
                options.subset_reads,
                options.cell_number))

    cell_whitelist, true_to_false_map = umi_methods.getCellWhitelist(
        cell_barcode_counts,
        options.expect_cells,
        options.cell_number,
        options.error_correct_threshold,
        options.plot_prefix)

    U.info("Writing out whitelist")
    for barcode in sorted(list(cell_whitelist)):

        if true_to_false_map:
            corrected_barcodes = ",".join(
                sorted(true_to_false_map[barcode]))
            corrected_barcode_counts = ",".join(
                map(str, [cell_barcode_counts[x] for x
                          in sorted(true_to_false_map[barcode])]))
        else:
            corrected_barcodes, corrected_barcode_counts = "", ""

        options.stdout.write("%s\t%s\t%s\t%s\n" % (
            barcode, corrected_barcodes, cell_barcode_counts[barcode],
            corrected_barcode_counts))

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))

    U.Stop()
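
# --- illustrative sketch (not part of the original source) ---
# What the "string" extract method above checks for: 'C' marks cell-barcode
# bases and 'N' marks UMI bases at the corresponding read positions. This
# hypothetical splitter illustrates the idea; it is not the original
# implementation.
pattern = "CCCCCCCCNNNN"        # 8 bp cell barcode followed by a 4 bp UMI
read_seq = "ACGTACGTTTGGCCAA"   # made-up read; barcode bases come first

cell = "".join(base for base, p in zip(read_seq, pattern) if p == "C")
umi = "".join(base for base, p in zip(read_seq, pattern) if p == "N")
print(cell, umi)  # -> ACGTACGT TTGG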