Exemplo n.º 1
0
def correct_cells(final_results, umis_per_cell, collapsing_threshold,
                  expected_cells):
    """
    Corrects cell barcodes.

    Asks ``umi_methods.getCellWhitelist`` for a mapping of "real" barcodes
    to the "fake" barcodes that should be corrected to them, then folds each
    fake barcode's results and umi count into its real barcode. Both
    ``final_results`` and ``umis_per_cell`` are modified in place.

    Args:
        final_results (dict): Dict of dict of Counters with mapping results.
        umis_per_cell (Counter): Counter of number of umis per cell.
        collapsing_threshold (int): Forwarded to getCellWhitelist as
            ``error_correct_threshold``.
        expected_cells (int): Number of expected cells.

    Returns:
        final_results (dict): Same as input but with the results of
            corrected barcodes merged into their real barcode.
        umis_per_cell (Counter): Same as input but with the counts of
            corrected barcodes added to their real barcode.
        corrected_barcodes (int): How many barcodes have been corrected.
    """
    print('Correcting cell barcodes')
    corrected_barcodes = 0

    # Only the whitelist determination is best-effort. Keeping the merge
    # outside the try block stops a failure half-way through the merge from
    # being reported as "no local minima" after the inputs were already
    # partially modified.
    try:
        _, true_to_false_map = umi_methods.getCellWhitelist(
            cell_barcode_counts=umis_per_cell,
            expect_cells=expected_cells,
            cell_number=False,
            error_correct_threshold=collapsing_threshold,
            plotfile_prefix=False)
    except Exception:
        print(
            'Could not find a good local minima for correction.\nNo cell barcode correction was done.'
        )
        return (final_results, umis_per_cell, corrected_barcodes)

    if not true_to_false_map:
        return (final_results, umis_per_cell, corrected_barcodes)

    for real_barcode, fake_barcodes in true_to_false_map.items():
        for fake_barcode in fake_barcodes:
            fake_results = final_results.pop(fake_barcode, None)
            if fake_results is not None:
                corrected_barcodes += 1
                # If the real barcode has no entry yet, the fake barcode's
                # results simply become its results; nothing is dropped.
                real_results = final_results.setdefault(
                    real_barcode, fake_results)
                if real_results is not fake_results:
                    for tag, counts in fake_results.items():
                        if tag in real_results:
                            real_results[tag].update(counts)
                        else:
                            real_results[tag] = counts

            # Move the umi count only if the fake barcode actually has one.
            if fake_barcode in umis_per_cell:
                umis_per_cell[real_barcode] += umis_per_cell.pop(fake_barcode)

    return (final_results, umis_per_cell, corrected_barcodes)
Exemplo n.º 2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from ``options.stdin`` (plus ``--read2-in`` when
    supplied), counts reads -- or distinct umis with ``--method=umis`` --
    per cell barcode, hands the counts to ``umi_methods.getCellWhitelist``
    and writes one tab-separated line per whitelisted barcode to
    ``options.stdout``: the barcode, the comma-joined barcodes that map to
    it, the barcode's count and the comma-joined counts of those barcodes.

    Raises:
        ValueError: if ``--set-cell-number`` is larger than the number of
            observed cell barcodes.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p",
                      "--bc-pattern",
                      dest="pattern",
                      type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2",
                      dest="pattern2",
                      type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime",
                      dest="prime3",
                      action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in",
                      dest="read2_in",
                      type="string",
                      help="file name for read pairs")
    parser.add_option("--extract-method",
                      dest="extract_method",
                      type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--plot-prefix",
                      dest="plot_prefix",
                      type="string",
                      help=("Prefix for plots to visualise the automated "
                            "detection of the number of 'true' cell barcodes"))
    parser.add_option("--subset-reads",
                      dest="subset_reads",
                      type="int",
                      help=("Use the first N reads to automatically identify "
                            "the true cell barcodes. If N is greater than the "
                            "number of reads, all reads will be used"))
    parser.add_option("--error-correct-threshold",
                      dest="error_correct_threshold",
                      type="int",
                      help=("Hamming distance for correction of "
                            "barcodes to whitelist barcodes"))
    parser.add_option("--method",
                      dest="method",
                      choices=["reads", "umis"],
                      help=("Use reads or unique umi counts per cell"))
    parser.add_option("--expect-cells",
                      dest="expect_cells",
                      type="int",
                      help=("Prior expectation on the upper limit on the "
                            "number of cells sequenced"))
    parser.add_option("--set-cell-number",
                      dest="cell_number",
                      type="int",
                      help=("Specify the number of cell barcodes to accept"))
    parser.set_defaults(method="reads",
                        extract_method="string",
                        filter_cell_barcodes=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        cell_number=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser,
                              argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    if options.expect_cells and options.cell_number:
        U.error("Cannot supply both --expect-cells and "
                "--set-cell-number options")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    # a pattern for read 2 only makes sense when read 2 is supplied
    if options.pattern2 and not options.read2_in:
        U.error("must specify a paired fastq ``--read2-in``")

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # check whether the regex contains a umi group(s) and cell groups(s)
    if options.extract_method == "regex":
        if options.pattern:
            for group in options.pattern.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True
        if options.pattern2:
            for group in options.pattern2.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        if options.pattern:
            if "C" in options.pattern:
                extract_cell = True
            if "N" in options.pattern:
                extract_umi = True
        if options.pattern2:
            if "C" in options.pattern2:
                extract_cell = True
            if "N" in options.pattern2:
                extract_umi = True

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" %
                    (options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" %
                    (options.pattern, options.pattern2))
    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" %
                    (options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" %
                    (options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    if not options.read2_in:
        for read1 in read1s:

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1)
            if barcode_values is None:
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            # NOTE(review): the single-end cut-off counts reads that matched
            # the barcode pattern (n_cell_barcodes) while the paired-end
            # cut-off below counts all parsed reads (n_reads). Kept as-is;
            # confirm which of the two is intended.
            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in izip(read1s, read2s):

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1

            barcode_values = ReadExtractor.getBarcodes(read1, read2)
            if barcode_values is None:
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    U.info("Starting - whitelist determination")

    # with the umis method the per-cell count is the number of distinct umis
    if options.method == "umis":
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-number option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected>= %s cell barcodes" %
            (len(cell_barcode_counts), options.subset_reads,
             options.cell_number))

    cell_whitelist, true_to_false_map = umi_methods.getCellWhitelist(
        cell_barcode_counts, options.expect_cells, options.cell_number,
        options.error_correct_threshold, options.plot_prefix)

    U.info("Writing out whitelist")
    for barcode in sorted(cell_whitelist):

        if true_to_false_map:
            # barcodes that map to this whitelisted barcode, in sorted order
            # so the barcode and count columns line up
            error_barcodes = sorted(true_to_false_map[barcode])
            corrected_barcodes = ",".join(error_barcodes)
            corrected_barcode_counts = ",".join(
                str(cell_barcode_counts[x]) for x in error_barcodes)
        else:
            corrected_barcodes, corrected_barcode_counts = "", ""

        options.stdout.write(
            "%s\t%s\t%s\t%s\n" %
            (barcode, corrected_barcodes, cell_barcode_counts[barcode],
             corrected_barcode_counts))

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))

    U.Stop()
Exemplo n.º 3
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from ``options.stdin`` (plus ``options.read2_in``
    when set), counts reads -- or distinct umis with ``--method=umis`` --
    per cell barcode and hands the counts to
    ``umi_methods.getCellWhitelist``. When a whitelist is returned, one
    tab-separated line per whitelisted barcode is written to
    ``options.stdout``: the barcode, the comma-joined barcodes that map to
    it, the barcode's count and the comma-joined counts of those barcodes.
    When no whitelist is returned the problem is reported through ``U.info``
    if ``--allow-threshold-error`` was given and through ``U.error``
    otherwise.

    Raises:
        ValueError: if ``--set-cell-number`` is larger than the number of
            observed cell barcodes.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "whitelist-specific options")

    group.add_option("--plot-prefix",
                     dest="plot_prefix",
                     type="string",
                     help=("Prefix for plots to visualise the automated "
                           "detection of the number of 'true' cell barcodes"))
    group.add_option("--subset-reads",
                     dest="subset_reads",
                     type="int",
                     help=("Use the first N reads to automatically identify "
                           "the true cell barcodes. If N is greater than the "
                           "number of reads, all reads will be used. "
                           "Default is 100,000,000"))
    group.add_option("--error-correct-threshold",
                     dest="error_correct_threshold",
                     type="int",
                     help=("Hamming distance for correction of "
                           "barcodes to whitelist barcodes"))
    group.add_option("--method",
                     dest="method",
                     choices=["reads", "umis"],
                     help=("Use reads or unique umi counts per cell"))
    group.add_option("--expect-cells",
                     dest="expect_cells",
                     type="int",
                     help=("Prior expectation on the upper limit on the "
                           "number of cells sequenced"))
    group.add_option("--allow-threshold-error",
                     dest="allow_threshold_error",
                     action="store_true",
                     help=("Don't select a threshold. Will still "
                           "output the plots if requested (--plot-prefix)"))
    group.add_option("--set-cell-number",
                     dest="cell_number",
                     type="int",
                     help=("Specify the number of cell barcodes to accept"))

    parser.add_option_group(group)

    parser.set_defaults(method="reads",
                        extract_method="string",
                        filter_cell_barcodes=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        allow_threshold_error=False,
                        cell_number=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser,
                              argv=argv,
                              add_extract_options=True,
                              add_group_dedup_options=False,
                              add_umi_grouping_options=False,
                              add_sam_options=False)

    if options.expect_cells and options.cell_number:
        U.error("Cannot supply both --expect-cells and "
                "--set-cell-number options")

    extract_cell, extract_umi = U.validateExtractOptions(options)

    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" %
                    (options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" %
                    (options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    if not options.read2_in:
        for read1 in read1s:

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1)
            if barcode_values is None:
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            # NOTE(review): the single-end cut-off counts reads that matched
            # the barcode pattern (n_cell_barcodes) while the paired-end
            # cut-off below counts all parsed reads (n_reads). Kept as-is;
            # confirm which of the two is intended.
            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in izip(read1s, read2s):

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1

            barcode_values = ReadExtractor.getBarcodes(read1, read2)
            if barcode_values is None:
                continue
            else:
                cell, umi, _, _, _, _, _ = barcode_values
                if options.method == "umis":
                    cell_barcode_umis[cell].add(umi)
                else:
                    cell_barcode_counts[cell] += 1
                n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    U.info("Starting - whitelist determination")

    # with the umis method the per-cell count is the number of distinct umis
    if options.method == "umis":
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-number option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected>= %s cell barcodes" %
            (len(cell_barcode_counts), options.subset_reads,
             options.cell_number))

    cell_whitelist, true_to_false_map = umi_methods.getCellWhitelist(
        cell_barcode_counts, options.expect_cells, options.cell_number,
        options.error_correct_threshold, options.plot_prefix)

    if cell_whitelist:
        U.info("Writing out whitelist")
        total_correct_barcodes = 0
        total_corrected_barcodes = 0
        for barcode in sorted(cell_whitelist):

            total_correct_barcodes += cell_barcode_counts[barcode]

            if true_to_false_map:
                # barcodes that map to this whitelisted barcode, in sorted
                # order so the barcode and count columns line up
                error_barcodes = sorted(true_to_false_map[barcode])
                corrected_barcodes = ",".join(error_barcodes)

                correct_barcode_counts = [
                    cell_barcode_counts[x] for x in error_barcodes
                ]
                total_corrected_barcodes += sum(correct_barcode_counts)

                corrected_barcode_counts = ",".join(
                    map(str, correct_barcode_counts))
            else:
                corrected_barcodes, corrected_barcode_counts = "", ""

            options.stdout.write(
                "%s\t%s\t%s\t%s\n" %
                (barcode, corrected_barcodes, cell_barcode_counts[barcode],
                 corrected_barcode_counts))
    else:
        msg = ("No local minima was accepted. Recommend checking the plot "
               "output and counts per local minima (requires `--plot-prefix` "
               "option) and then re-running with manually selected threshold "
               "(`--set-cell-number` option)")

        if options.allow_threshold_error:
            U.info(msg)
        else:
            U.error(msg)

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))
    # the totals below only exist when a whitelist was written out above
    if cell_whitelist:
        U.info("Top %s cell barcodes passed the selected threshold" %
               len(cell_whitelist))
        U.info("Found %i total reads matching the selected cell barcodes" %
               total_correct_barcodes)
        U.info("Found %i total reads which can be error corrected to the "
               "selected cell barcodes" % total_corrected_barcodes)
    U.Stop()
Exemplo n.º 4
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p",
                      "--bc-pattern",
                      dest="pattern",
                      type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2",
                      dest="pattern2",
                      type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime",
                      dest="prime3",
                      action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in",
                      dest="read2_in",
                      type="string",
                      help="file name for read pairs")
    parser.add_option("--read2-out",
                      dest="read2_out",
                      type="string",
                      help="file to output processed paired read to")
    parser.add_option(
        "--read2-out-only",
        dest="read2_out_only",
        action="store_true",
        help="Paired reads, only output the second read in the pair")
    parser.add_option("--quality-filter-threshold",
                      dest="quality_filter_threshold",
                      type="int",
                      help=("Remove reads where any UMI base quality score "
                            "falls below this threshold"))
    parser.add_option(
        "--quality-filter-mask",
        dest="quality_filter_mask",
        type="int",
        help=("If a UMI base has a quality below this threshold, "
              "replace the base with 'N'"))
    parser.add_option("--quality-encoding",
                      dest="quality_encoding",
                      type="choice",
                      choices=["phred33", "phred64", "solexa"],
                      help=("Quality score encoding. Choose from 'phred33'"
                            "[33-77] 'phred64' [64-106] or 'solexa' [59-106]"))
    parser.add_option("--extract-method",
                      dest="extract_method",
                      type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--filter-cell-barcode",
                      dest="filter_cell_barcode",
                      action="store_true",
                      help="Filter the cell barcodes")
    parser.add_option("--error-correct-cell",
                      dest="error_correct_cell",
                      action="store_true",
                      help=("Correct errors in the cell barcode"))
    parser.add_option("--error-correct-threshold",
                      dest="error_correct_threshold",
                      type="int",
                      help=("Hamming distance allowed for correction"))
    parser.add_option("--plot-prefix",
                      dest="plot_prefix",
                      type="string",
                      help=("Prefix for plots to visualise the automated "
                            "detection of the number of 'true' cell barcodes"))
    parser.add_option("--output-whitelist",
                      dest="output_whitelist",
                      type="string",
                      help=("Write out the automatically generated whitelist"))
    parser.add_option("--whitelist-tsv",
                      dest="whitelist_tsv",
                      type="string",
                      help=("A whitelist of accepted cell barcodes"))
    parser.add_option("--blacklist-tsv",
                      dest="blacklist_tsv",
                      type="string",
                      help=("A blacklist of accepted cell barcodes"))
    parser.add_option(
        "--cell-barcode-subset",
        dest="cell_barcode_subset",
        type="int",
        help=("Use only the first N reads to automatically "
              "identify the true cell barcodes. If N is greater "
              "than the number of reads, all reads will be used"))
    parser.add_option("--reads-subset",
                      dest="reads_subset",
                      type="int",
                      help=("Only extract from the first N reads. If N is "
                            "greater than the number of reads, all reads will "
                            "be used"))
    parser.add_option(
        "--reconcile-pairs",
        dest="reconcile",
        action="store_true",
        help=("Allow the presences of reads in read2 input that are"
              "not present in read1 input. This allows cell barcode"
              "filtering of read1s without considering read2s"))
    parser.set_defaults(extract_method="string",
                        filter_cell_barcodes=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_cell=False,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=False,
                        read2_out_only=False,
                        quality_filter_threshold=None,
                        quality_encoding=None,
                        plot_prefix=None,
                        output_whitelist=None,
                        cell_barcode_subset=50000000,
                        reconcile=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser, argv=argv)

    if options.quality_filter_threshold or options.quality_filter_mask:
        if not options.quality_encoding:
            U.error("must provide a quality encoding (--quality-"
                    "encoding) to filter UMIs by quality (--quality"
                    "-filter-threshold) or mask low quality bases "
                    "with (--quality-filter-mask)")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern "
                    "if paired-end ")

    if options.pattern2:
        if not options.read2_in:
            U.error("must specify a paired fastq ``--read2-in``")

        if not options.pattern2:
            options.pattern2 = options.pattern

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.barcode_regex2)
            except regex.Error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.barcode_regex2)

    # check whether the regex contains a umi group(s) and cell groups(s)
    if options.extract_method == "regex":
        if options.pattern:
            for group in options.pattern.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True
        if options.pattern2:
            for group in options.pattern2.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        if options.pattern:
            if "C" in options.pattern:
                extract_cell = True
            if "N" in options.pattern:
                extract_umi = True
        if options.pattern2:
            if "C" in options.pattern2:
                extract_cell = True
            if "N" in options.pattern2:
                extract_umi = True

    if options.whitelist_tsv:
        if options.blacklist_tsv:
            U.error("Do not supply a blacklist and a whitelist. Just "
                    "remove the blacklist barcodes from the whitelist!")

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" %
                    (options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" (options.pattern,
                                                     options.pattern2))

    if options.stdin == sys.stdin:
        if not options.whitelist_tsv and options.filter_cell_barcode:
            U.error(
                "cannot support reading from stdin if correcting cell barcode")
        read1s = umi_methods.fastqIterate(U.openFile(options.stdin))
    else:
        read1s = umi_methods.fastqIterate(U.openFile(options.stdin.name))

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        options.extract_method, options.pattern, options.pattern2,
        options.prime3, extract_cell, options.quality_encoding,
        options.quality_filter_threshold, options.quality_filter_mask,
        options.filter_cell_barcode)

    if options.filter_cell_barcode:
        if (not options.whitelist_tsv) or options.error_correct_cell:
            cell_barcode_counts = collections.Counter()

            n_reads = 0
            if not options.read2_in:
                for read1 in read1s:
                    n_reads += 1
                    cell_barcode = ReadExtractor.getCellBarcode(read1)
                    if cell_barcode:
                        cell_barcode_counts[cell_barcode] += 1
                    if options.cell_barcode_subset:
                        if (n_reads > options.cell_barcode_subset):
                            break
            else:
                read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
                for read1, read2 in izip(read1s, read2s):
                    n_reads += 1
                    cell_barcode = ReadExtractor.getCellBarcode(read1, read2)
                    if cell_barcode:
                        cell_barcode_counts[cell_barcode] += 1
                    if options.cell_barcode_subset:
                        if (n_reads > options.cell_barcode_subset):
                            break

            if options.blacklist_tsv:
                cell_blacklist = umi_methods.getUserDefinedBarcodes(
                    options.blacklist_tsv)
                for cell in cell_blacklist:
                    del cell_barcode_counts[cell]

            if options.whitelist_tsv:
                cell_whitelist = umi_methods.getUserDefinedBarcodes(
                    options.whitelist_tsv)
                error_correct_mappings = umi_methods.getErrorCorrectMappings(
                    cell_barcode_counts.keys(), cell_whitelist,
                    options.error_correct_threshold)
            else:
                # getCellWhitelist has not been properly defined yet!
                cell_whitelist, error_correct_mappings = umi_methods.getCellWhitelist(
                    cell_barcode_counts, options.error_correct_threshold,
                    options.plot_prefix)

            # re-make the reads1s iterator
            read1s = umi_methods.fastqIterate(U.openFile(options.stdin.name))

        else:
            cell_whitelist = umi_methods.getUserDefinedBarcodes(
                options.whitelist_tsv)
            error_correct_mappings = None, None

        false_to_true_map, true_to_false_map = error_correct_mappings

        if options.output_whitelist:

            with U.openFile(options.output_whitelist, "w") as outf:

                columns = [
                    "barcode", "count", "corrected_barcodes",
                    "corrected_barcode_counts"
                ]
                outf.write("\t".join(columns) + "\n")

                for barcode in sorted(list(cell_whitelist)):

                    if true_to_false_map:
                        corrected_barcodes = ",".join(
                            sorted(true_to_false_map[barcode]))
                        corrected_barcode_counts = ",".join(
                            map(str, [
                                cell_barcode_counts[x]
                                for x in sorted(true_to_false_map[barcode])
                            ]))
                    else:
                        corrected_barcodes, corrected_barcode_counts = "", ""

                    outf.write("%s\t%s\t%s\t%s\n" %
                               (barcode, cell_barcode_counts[barcode],
                                corrected_barcodes, corrected_barcode_counts))

        ReadExtractor.cell_whitelist = cell_whitelist
        ReadExtractor.false_to_true_map = false_to_true_map

    if options.read2_in is None:
        for read in read1s:
            new_read = ReadExtractor(read)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not new_read:
                continue

            options.stdout.write(str(new_read) + "\n")

    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))

        if options.read2_out:
            read2_out = U.openFile(options.read2_out, "w")

        if options.reconcile:
            strict = False
        else:
            strict = True

        for read1, read2 in umi_methods.joinedFastqIterate(
                read1s, read2s, strict):
            reads = ReadExtractor(read1, read2)

            if options.reads_subset:
                if (ReadExtractor.read_counts['Input Reads'] >
                        options.reads_subset):
                    break

            if not reads:
                continue
            else:
                new_read1, new_read2 = reads

            if not options.read2_out_only:
                options.stdout.write(str(new_read1) + "\n")

            if options.read2_out:
                read2_out.write(str(new_read2) + "\n")

    if options.read2_out:
        read2_out.close()

    for k, v in ReadExtractor.getReadCounts().most_common():
        U.info("%s: %s" % (k, v))

    U.Stop()
Exemplo n.º 5
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Extracts cell barcodes (and UMIs) from the input fastq(s), counts each
    cell barcode either by reads or by unique UMIs (``--method``), passes
    the counts to ``umi_methods.getCellWhitelist`` and writes the resulting
    whitelist to ``options.stdout`` as four tab-separated columns: barcode,
    corrected barcodes, barcode count, corrected barcode counts.

    Raises:
        ValueError: if ``--set-cell-number`` is larger than the number of
            distinct cell barcodes observed.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern for paired reads")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read.")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--extract-method",
                      dest="extract_method", type="choice",
                      choices=["string", "regex"],
                      help=("How to extract the umi +/- cell barcodes, Choose "
                            "from 'string' or 'regex'"))
    parser.add_option("--plot-prefix",
                      dest="plot_prefix", type="string",
                      help=("Prefix for plots to visualise the automated "
                            "detection of the number of 'true' cell barcodes"))
    parser.add_option("--subset-reads",
                      dest="subset_reads", type="int",
                      help=("Use the first N reads to automatically identify "
                            "the true cell barcodes. If N is greater than the "
                            "number of reads, all reads will be used"))
    parser.add_option("--error-correct-threshold",
                      dest="error_correct_threshold",
                      type="int",
                      help=("Hamming distance for correction of "
                            "barcodes to whitelist barcodes"))
    parser.add_option("--method",
                      dest="method",
                      choices=["reads", "umis"],
                      help=("Use reads or unique umi counts per cell"))
    parser.add_option("--expect-cells",
                      dest="expect_cells",
                      type="int",
                      help=("Prior expectation on the upper limit on the "
                            "number of cells sequenced"))
    parser.add_option("--set-cell-number",
                      dest="cell_number",
                      type="int",
                      help=("Specify the number of cell barcodes to accept"))
    parser.set_defaults(method="reads",
                        extract_method="string",
                        filter_cell_barcodes=False,
                        whitelist_tsv=None,
                        blacklist_tsv=None,
                        error_correct_threshold=1,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        plot_prefix=None,
                        subset_reads=100000000,
                        expect_cells=False,
                        cell_number=False)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = U.Start(parser, argv=argv,
                              add_group_dedup_options=False,
                              add_sam_options=False)

    if options.expect_cells and options.cell_number:
        U.error("Cannot supply both --expect-cells and "
                "--set-cell-number options")

    if not options.pattern and not options.pattern2:
        if not options.read2_in:
            U.error("Must supply --bc-pattern for single-end")
        else:
            U.error("Must supply --bc-pattern and/or --bc-pattern2 "
                    "if paired-end ")

    # a second pattern only makes sense when a second fastq is supplied
    if options.pattern2 and not options.read2_in:
        U.error("must specify a paired fastq ``--read2-in``")

    extract_cell = False
    extract_umi = False

    # If the pattern is a regex we can compile the regex(es) prior to
    # ExtractFilterAndUpdate instantiation
    if options.extract_method == "regex":
        if options.pattern:
            try:
                options.pattern = regex.compile(options.pattern)
            except regex.error:
                U.error("barcode_regex '%s' is not a "
                        "valid regex" % options.pattern)

        if options.pattern2:
            try:
                options.pattern2 = regex.compile(options.pattern2)
            except regex.error:
                U.error("barcode_regex2 '%s' is not a "
                        "valid regex" % options.pattern2)

    # only the pattern(s) actually supplied are inspected below
    supplied_patterns = [pattern for pattern
                         in (options.pattern, options.pattern2) if pattern]

    # check whether the regex contains a umi group(s) and cell groups(s)
    if options.extract_method == "regex":
        for pattern in supplied_patterns:
            for group in pattern.groupindex:
                if group.startswith("cell_"):
                    extract_cell = True
                elif group.startswith("umi_"):
                    extract_umi = True

    # check whether the pattern string contains umi/cell bases
    elif options.extract_method == "string":
        for pattern in supplied_patterns:
            if "C" in pattern:
                extract_cell = True
            if "N" in pattern:
                extract_umi = True

    if not extract_umi:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any umi bases "
                    "(marked with 'Ns') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any umi groups "
                    "(starting with 'umi_') %s, %s" % (
                        options.pattern, options.pattern2))
    if not extract_cell:
        if options.extract_method == "string":
            U.error("barcode pattern(s) do not include any cell bases "
                    "(marked with 'Cs') %s, %s" % (
                        options.pattern, options.pattern2))
        elif options.extract_method == "regex":
            U.error("barcode regex(es) do not include any cell groups "
                    "(starting with 'cell_') %s, %s" % (
                        options.pattern, options.pattern2))

    read1s = umi_methods.fastqIterate(options.stdin)

    # set up read extractor
    ReadExtractor = umi_methods.ExtractFilterAndUpdate(
        method=options.extract_method,
        pattern=options.pattern,
        pattern2=options.pattern2,
        prime3=options.prime3,
        extract_cell=extract_cell)

    cell_barcode_counts = collections.Counter()

    n_reads = 0
    n_cell_barcodes = 0

    # if using the umis method, need to keep a set of umis observed
    if options.method == "umis":
        cell_barcode_umis = collections.defaultdict(set)

    # variables for progress monitor
    displayMax = 100000
    U.info("Starting barcode extraction")

    # NOTE(review): the single-end loop stops once the number of reads
    # *matching the pattern* exceeds --subset-reads, whereas the paired-end
    # loop stops on the number of reads *parsed*. Behaviour is kept as-is;
    # confirm which limit is intended.
    if not options.read2_in:
        for read1 in read1s:

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1
            barcode_values = ReadExtractor.getBarcodes(read1)
            if barcode_values is None:
                continue

            # only the first two of the seven returned values are needed
            cell, umi, _, _, _, _, _ = barcode_values
            if options.method == "umis":
                cell_barcode_umis[cell].add(umi)
            else:
                cell_barcode_counts[cell] += 1
            n_cell_barcodes += 1

            if options.subset_reads:
                if n_cell_barcodes > options.subset_reads:
                    break
    else:
        read2s = umi_methods.fastqIterate(U.openFile(options.read2_in))
        for read1, read2 in izip(read1s, read2s):

            # Update display in every 100kth iteration
            if n_reads % displayMax == 0:
                U.info("Parsed {} reads".format(n_reads))

            n_reads += 1

            barcode_values = ReadExtractor.getBarcodes(read1, read2)
            if barcode_values is None:
                continue

            # only the first two of the seven returned values are needed
            cell, umi, _, _, _, _, _ = barcode_values
            if options.method == "umis":
                cell_barcode_umis[cell].add(umi)
            else:
                cell_barcode_counts[cell] += 1
            n_cell_barcodes += 1

            if options.subset_reads:
                if n_reads > options.subset_reads:
                    break

    U.info("Starting - whitelist determination")

    # with the umis method the count per cell is its number of distinct umis
    if options.method == "umis":
        for cell in cell_barcode_umis:
            cell_barcode_counts[cell] = len(cell_barcode_umis[cell])

    if options.cell_number and options.cell_number > len(cell_barcode_counts):
        raise ValueError(
            "--set-cell-number option specifies more cell barcodes than the "
            "number of observed cell barcodes. This may be because "
            "--subset-reads was set to a value too low to capture reads from "
            "all cells. %s cell barcodes observed from %s parsed reads. "
            "Expected>= %s cell barcodes" % (
                len(cell_barcode_counts),
                options.subset_reads,
                options.cell_number))

    cell_whitelist, true_to_false_map = umi_methods.getCellWhitelist(
        cell_barcode_counts,
        options.expect_cells,
        options.cell_number,
        options.error_correct_threshold,
        options.plot_prefix)

    U.info("Writing out whitelist")
    for barcode in sorted(cell_whitelist):

        if true_to_false_map:
            # sort once so the barcodes and their counts stay aligned
            false_barcodes = sorted(true_to_false_map[barcode])
            corrected_barcodes = ",".join(false_barcodes)
            corrected_barcode_counts = ",".join(
                str(cell_barcode_counts[x]) for x in false_barcodes)
        else:
            corrected_barcodes, corrected_barcode_counts = "", ""

        options.stdout.write("%s\t%s\t%s\t%s\n" % (
            barcode, corrected_barcodes, cell_barcode_counts[barcode],
            corrected_barcode_counts))

    U.info("Parsed %i reads" % n_reads)
    U.info("%i reads matched the barcode pattern" % n_cell_barcodes)
    U.info("Found %i unique cell barcodes" % len(cell_barcode_counts))

    U.Stop()