Example No. 1
def complement(seq):
    """Take the complement of a nucleotide sequence"""
    try:
        c = "".join(__complementaryBasePairs[i] for i in seq)
    except KeyError:
        print_error("error: non-cannonical representation of a nucleotide "
                    "sequence provided. String is {0}".format(seq))

    return c
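
A minimal usage sketch; the base-pair mapping below is an assumption, since the module-level __complementaryBasePairs dict is not shown:

__complementaryBasePairs = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}  # assumed mapping

print(complement("ATGC"))  # "TACG"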
Example No. 2
def translate_quality(quals, encoding=33):
    """
    Translate ASCII characters to quality scores
    """
    valid_range = range(0, 43)
    qscores = [ord(i) - encoding for i in quals]
    for qscore in qscores:
        if qscore not in valid_range:
            print_error("error: wrong quality score encoding provided")
    return qscores
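
A quick check of the translation, which maps each character's code point minus the offset to a score (assuming print_error only fires for out-of-range values):

print(translate_quality("II?"))              # [40, 40, 30] with phred33
print(translate_quality("hh", encoding=64))  # [40, 40] with phred64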
Example No. 3
def parse_commas(args, argname):
    args = [i.lstrip() for i in args.split(",")]

    if not 1 <= len(args) <= 2:
        seq_io.print_error("error: only one or two integer values should be "
                           "provided to {0}".format(argname))

    try:
        arg1 = int(args[0])
        arg2 = int(args[1])
    except ValueError:
        seq_io.print_error("error: input to {0} must be one or more integer "
                           "values in the form INT or INT,INT".format(argname))
    except IndexError:
        arg1 = arg2 = int(args[0])

    return (arg1, arg2)
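
Both accepted input forms, shown with a hypothetical argument name:

print(parse_commas("80,60", "--min-len"))  # (80, 60)
print(parse_commas("80", "--min-len"))     # single value applied to both: (80, 80)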
Example No. 4
def parse_colons(argument):
    try:
        window, score = argument.split(':')
    except ValueError:
        seq_io.print_error("error: the input provided to sliding-window is "
                           "formatted incorrectly. See --help for usage")
    else:
        if score.isdigit():
            score = int(score)
        else:
            seq_io.print_error("error: the quality score threshold provided "
                               "to sliding-window must be an integer value")
        if window.isdigit():
            window = int(window)
        else:
            try:
                window = float(window)
            except ValueError:
                seq_io.print_error("error: the window-size provided to "
                                   "sliding-window must be either an integer "
                                   "value or a fraction")

    return (window, score)
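
The window size may be given as an integer number of bases or as a fraction of the read length:

print(parse_colons("10:20"))   # (10, 20)
print(parse_colons("0.1:20"))  # (0.1, 20)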
Example No. 5
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'fhandle',
        metavar='in1.fastq',
        type=str,
        action=seq_io.Open,
        mode='rb',
        default=sys.stdin,
        help="input reads in fastq format. Can be a file containing either "
        "single-end or forward/interleaved reads if reads are paired-end "
        "[required]")
    input_arg = parser.add_mutually_exclusive_group(required=False)
    input_arg.add_argument('--interleaved',
                           action='store_true',
                           help="input is interleaved paired-end reads")
    input_arg.add_argument('-r',
                           '--reverse',
                           dest='rhandle',
                           metavar='in2.fastq',
                           action=seq_io.Open,
                           mode='rb',
                           help="input reverse reads in fastq format")
    parser.add_argument('-o',
                        '--out',
                        metavar='FILE',
                        dest='out_f',
                        type=str,
                        action=seq_io.Open,
                        mode='wt',
                        default=sys.stdout,
                        help="output trimmed reads [default: stdout]")
    parser.add_argument('-v',
                        '--out-reverse',
                        metavar='FILE',
                        dest='out_r',
                        type=str,
                        action=seq_io.Open,
                        mode='wt',
                        help="output trimmed reverse reads")
    parser.add_argument('-s',
                        '--singles',
                        metavar='FILE',
                        dest='out_s',
                        type=str,
                        action=seq_io.Open,
                        mode='wt',
                        help="output trimmed orphaned reads")
    parser.add_argument(
        '-q',
        '--qual-offset',
        metavar='TYPE',
        dest='offset',
        type=int,
        choices=[33, 64],
        default=33,
        help="ASCII base quality score encoding [default: 33]. Options are "
        "33 (phred33) or 64 (phred64)")
    parser.add_argument(
        '-m',
        '--min-len',
        metavar='LEN [,LEN]',
        dest='minlen',
        type=str,
        help="filter reads shorter than the minimum length threshold [default:"
        " 0]. Different values can be provided for the forward and "
        "reverse reads, respectively, by separating them with a comma "
        "(e.g. 80,60), or a single value can be provided for both")
    trim_args = parser.add_argument_group('trimming options')
    trim_args.add_argument(
        '-O',
        '--trim-order',
        metavar='ORDER',
        dest='trim_order',
        type=str,
        default='ltw',
        help="order that the trimming methods should be applied [default: ltw]"
        ". Available methods are l (leading), t (trailing), and w "
        "(sliding-window)")
    trim_args.add_argument(
        '-W',
        '--sliding-window',
        metavar='FRAME',
        dest='sw',
        type=parse_colons,
        help="trim read ends using a sliding window approach. Input should be "
        "of the form 'window_size:qual_threshold', where 'qual_threshold' "
        "is an integer between 0 and 42 and 'window_size' can either be "
        "length in bases or fraction of total read length")
    trim_args.add_argument(
        '-H',
        '--headcrop',
        metavar='INT [,INT]',
        type=str,
        help="remove exactly the number of bases specified from the start of "
        "the reads [default: 0]. Different values can be provided for "
        "the forward and reverse reads, respectively, by separating them "
        "with a comma (e.g. 2,0), or a single value can be provided for "
        "both. Cropping will always be applied first")
    trim_args.add_argument(
        '-C',
        '--crop',
        metavar='INT [,INT]',
        type=str,
        help="crop reads to the specified position [default: off]. The "
        "value(s) should be less than the maximum read length, otherwise "
        "no cropping will be applied. Different values can be provided "
        "for the forward and reverse reads, respectively, by separating "
        "them with a comma (e.g. 120,115), or a single value can be "
        "provided for both. Cropping will always be applied first")
    trim_args.add_argument(
        '-L',
        '--leading',
        metavar='SCORE',
        dest='lead_score',
        type=int,
        help="trim by removing low quality bases from the start of the read")
    trim_args.add_argument(
        '-T',
        '--trailing',
        metavar='SCORE',
        dest='trail_score',
        type=int,
        help="trim by removing low quality bases from the end of the read")
    trim_args.add_argument(
        '--trunc-n',
        dest='trunc_n',
        action='store_true',
        help="truncate sequence at position of first ambiguous base [default: "
        "off]. Truncation will always be applied last")
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + __version__)
    parser.add_argument(
        '-t',
        '--threads',
        action=CheckThreads,
        type=int,
        default=1,
        help='number of threads to use for trimming [default: 1]')
    args = parser.parse_args()
    all_args = sys.argv[1:]

    seq_io.program_info('qtrim', all_args, __version__)

    # Track program run-time
    start_time = time()

    # Assign variables based on arguments supplied by the user
    crop = parse_commas(args.crop, "crop") if args.crop else (None, None)
    hcrop = parse_commas(args.headcrop, "headcrop") if \
        args.headcrop else (0, 0)
    minlen = parse_commas(args.minlen, "minlen") if args.minlen \
        else (0, 0)
    out_f = args.out_f
    paired = bool(args.interleaved or args.rhandle)
    # 'self' is assumed to be an identity pass-through function defined
    # elsewhere in the module (a no-op when truncation is not requested)
    trunc_n = trim.truncate_by_n if args.trunc_n else self

    # Prepare the iterator based on dataset type
    iterator = seq_io.read_iterator(args.fhandle, args.rhandle, \
                                    args.interleaved, "fastq")

    # Populate list of trimming tasks to perform on reads
    trim_tasks = {
        'l': (trim.trim_leading, args.lead_score),
        't': (trim.trim_trailing, args.trail_score),
        'w': (trim.adaptive_trim, args.sw)
    }

    trim_steps = []
    for task in args.trim_order:
        value = trim_tasks[task][-1]
        if value:
            trim_steps.append(trim_tasks[task])
    if len(trim_steps) < 1 and not (args.crop or args.headcrop):
        seq_io.print_error("error: no trimming steps were specified")

    # Counters for trimming statistics
    discarded = Counter(0)
    passed = Counter(0)

    # Assign variables based on dataset type (paired or single-end)
    if paired:
        print("Processing input as paired-end reads", file=sys.stderr)

        out_s = args.out_s if args.out_s else None
        out_r = out_f if not args.out_r else args.out_r

        output = "\nRecords processed:\t{!s}\nPassed filtering:\t{!s} " \
                 "({:.2%})\n  Reads pairs kept:\t{!s} ({:.2%})\n  Forward " \
                 "only kept:\t{!s} ({:.2%})\n  Reverse only kept:\t{!s} " \
                 "({:.2%})\nRecords discarded:\t{!s} ({:.2%})\n"

        singles1 = Counter(0)
        singles2 = Counter(0)

    else:
        if args.out_s:
            print(
                "warning: argument --singles used with single-end reads"
                "... ignoring\n",
                file=sys.stderr)

        if args.out_r:
            print(
                "warning: argument --out-reverse used when input is "
                "single-end... ignoring\n",
                file=sys.stderr)

        print("Processing input as single-end reads", file=sys.stderr)

        out_s = None
        out_r = None

        output = "\nRecords processed:\t{!s}\nPassed filtering:\t{!s} ({:.2%})" \
                 "\nRecords discarded:\t{!s} ({:.2%})\n"

        singles1 = singles2 = None

    max_read_threads = args.threads - 1 if args.threads > 1 else 1
    read_queue = Queue(
        max_read_threads)  # Bounded queue size keeps memory use in check
    write_queue = Queue(max_read_threads)

    # Initialize processes to handle read trimming and writing
    read_processes = []
    for i in range(max_read_threads):
        read_processes.append(Process(target=trim_reads, args=(read_queue, \
            write_queue, trim_steps, trunc_n, crop, hcrop, args.offset,)))
        read_processes[i].start()

    write_process = Process(target=write_reads, args=(write_queue, out_f, \
        out_r, out_s, minlen, passed, discarded, singles1, singles2,))
    write_process.start()

    # Iterate over reads, populating read queue for trimming
    for processed_total, records in enumerate(iterator):
        read_queue.put(records)

    # Send kill message to the processes responsible for trimming
    for process in read_processes:
        read_queue.put('DONE')

    # Wait for the trimming processes to finish before continuing
    for process in read_processes:
        process.join()

    # Wait for the write queue to drain, then send kill message to the writer
    while not write_queue.empty():
        sleep(1)
    write_queue.put('DONE')

    # Wait for the write process to finish before continuing
    write_process.join()

    # Verify input file non-empty
    try:
        processed_total += 1
    except UnboundLocalError:
        seq_io.print_error("error: no sequences were found to process")

    # Calculate and print output statistics
    p = passed.value()
    d = discarded.value()

    if paired:
        processed_total = processed_total * 2
        s1, s2 = singles1.value(), singles2.value()
        passed_total = (p * 2 + s1 + s2)
        discarded_total = d * 2 + s1 + s2
        pairs = p
        frac_pairs = (pairs * 2) / processed_total
        frac_s1 = s1 / processed_total
        frac_s2 = s2 / processed_total
    else:
        passed_total = p
        discarded_total = d
        processed_total = discarded_total + passed_total
        s1 = s2 = frac_s1 = frac_s2 = pairs = frac_pairs = None

    frac_discarded = discarded_total / processed_total
    frac_passed = passed_total / processed_total
    stats = [processed_total, passed_total, frac_passed] + \
            [i for i in (pairs, frac_pairs, s1, frac_s1, s2, frac_s2)
             if i is not None] + [discarded_total, frac_discarded]
    print(output.format(*tuple(stats)), file=sys.stderr)

    # Calculate and print program run-time
    end_time = time()
    total_time = (end_time - start_time) / 60.0
    print("It took {:.2e} minutes to process {!s} records\n"\
          .format(total_time, processed_total), file=sys.stderr)
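
The read/write pipeline above follows a standard producer/worker/sentinel pattern. A self-contained sketch of that pattern, with illustrative names that are not part of the original program:

from multiprocessing import Process, Queue

def worker(in_q, out_q):
    # Consume until the 'DONE' sentinel arrives, as trim_reads does above
    for item in iter(in_q.get, 'DONE'):
        out_q.put(item * 2)  # stand-in for the real trimming work

if __name__ == '__main__':
    in_q, out_q = Queue(), Queue()
    workers = [Process(target=worker, args=(in_q, out_q)) for _ in range(2)]
    for p in workers:
        p.start()
    for item in range(10):
        in_q.put(item)
    for _ in workers:
        in_q.put('DONE')  # one sentinel per worker
    for p in workers:
        p.join()
    while not out_q.empty():
        print(out_q.get())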
Example No. 6
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'fhandle',
        metavar='in1.fast<q|a>',
        action=seq_io.Open,
        mode='rb',
        help="input reads in fastq or fasta format. Can be a file containing "
        "either single-end or forward/interleaved reads if reads are "
        "paired-end [required]")
    input_arg = parser.add_mutually_exclusive_group(required=False)
    input_arg.add_argument('--interleaved',
                           action='store_true',
                           help="input is interleaved paired-end reads")
    input_arg.add_argument('-r',
                           '--reverse',
                           dest='rhandle',
                           metavar='in2.fast<q|a>',
                           action=seq_io.Open,
                           mode='rb',
                           help="input reverse reads")
    parser.add_argument(
        '-f',
        '--format',
        metavar='FORMAT',
        dest='format',
        default='fastq',
        choices=['fasta', 'fastq'],
        help="sequence file format [default: fastq]. Available options are "
        "'fasta' or 'fastq'")
    parser.add_argument(
        '-b',
        '--barcodes',
        metavar='FILE',
        action=seq_io.Open,
        mode='r',
        help="file containing sample names mapped to template barcode "
        "sequences, in tab-separated format. The first column should "
        "contain sample names and the second column should contain the "
        "appropriate barcodes. An optional third column can be used to "
        "assign barcodes to 'multiplex groups'. The name of a multiplex "
        "group should have first the run id, then the flowcell lane, "
        "separated by a colon (e.g. 432:4). If this argument is unused "
        "and --force is provided instead, the output files will be named "
        "for the barcode and run information found in the headers")
    parser.add_argument(
        '-s',
        '--suffix',
        metavar='STR',
        type=str,
        help="string to append to the end of the file name. The default is to "
        "append the file format (fastq or fasta) and the strand for PE "
        "data (forward, reverse, interleaved)")
    parser.add_argument(
        '-c',
        '--hist',
        metavar='FILE',
        action=seq_io.Open,
        mode='w',
        help="output histogram of barcode counts. This can be used for "
        "graphing the error distribution of a barcode sequence, for "
        "instance")
    parser.add_argument(
        '--no-out',
        dest='no_out',
        action='store_true',
        help="do not write sequences to a file. Only output a histogram of "
        "barcode counts")
    parser.add_argument(
        '--force',
        action='store_true',
        help="create new file for every barcode found in input")
    compress_arg = parser.add_mutually_exclusive_group(required=False)
    compress_arg.add_argument(
        '--gzip',
        action='store_true',
        help="output files should be compressed using the gzip algorithm. The "
        "suffix '.gz'. will be appended to the file names")
    compress_arg.add_argument(
        '--bzip2',
        action='store_true',
        help="output files should be compressed using the bzip2 algorithm. The "
        "suffix '.bz2' will be appended to the file names")
    parser.add_argument(
        '-d',
        '--distance',
        type=int,
        default=0,
        help="maximum hamming distance allowed between sequence barcodes to "
        "be sorted into the same partition. Requires a barcodes file "
        "providing template barcode sequences")
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + __version__)
    args = parser.parse_args()
    all_args = sys.argv[1:]

    seq_io.program_info('demultiplex_by_header', all_args, __version__)

    if args.distance and not args.barcodes:
        parser.error("error: argument -d/--distance cannot be used without "
                     "-b/--barcodes")

    if args.no_out and not args.hist:
        parser.error("error: argument --no-out cannot be used without "
                     "-c/--hist")

    # Track program run-time
    start_time = time()

    # Assign variables based on arguments supplied by the user
    if args.barcodes and args.distance > 0:
        outstats = "Records processed:\t{0}\nBarcode partitions created:\t{1}"\
                   "\nSequence barcodes with -\n  exact match to a template:"\
                   "\t{2}\n  one or more mismatchs:\t{3}\nSequences with "\
                   "unknown barcode:\t{4}\n"
    elif args.barcodes and args.distance == 0:
        outstats = "Records processed:\t\t{0}\nBarcode partitions created:\t{1}"\
                   "\nSequences with unknown barcode:\t{2}\n"
    else:
        outstats = "Records processed:\t\t{0}\nBarcode partitions created:\t{1}"\
                   "\n"

    suffix = args.suffix if args.suffix else args.format
    out_hist = args.hist.write if args.hist else do_nothing
    exact_total = mismatch_total = unknowns = 0

    if args.gzip:
        compression = '.gz'
        algo = GzipFile
    elif args.bzip2:
        compression = '.bz2'
        algo = BZ2File
    else:
        compression = ''
        algo = io.open

    # Prepare the iterator based on dataset type
    iterator = seq_io.read_iterator(args.fhandle, args.rhandle, \
                                    args.interleaved, args.format)

    # Store user-supplied barcodes
    template_barcodes = {}
    names = []
    is_grouped = False
    if args.barcodes:
        barcode_dists = []
        for barcodes_total, line in enumerate(args.barcodes):
            tag = BarcodeEntry()
            group = ''

            # Verify that barcodes file is correctly formatted
            try:
                # Has filename and barcode only
                name, barcode = line.strip().split('\t')
            except ValueError:
                try:
                    # Has filename, barcode, and multiplex group
                    name, barcode, group = line.strip().split('\t')
                    is_grouped = True
                except ValueError:
                    seq_io.print_error("error: barcode mapping file does not "
                                       "appear to be formatted correctly. See "
                                       "the help message for formatting "
                                       "requirements")

            tag.id = name
            tag.barcode = barcode
            tag.group = group

            # Add to dictionary of template barcodes
            try:
                dict_group = template_barcodes[group]
            except KeyError:
                template_barcodes[group] = {barcode: tag}
            else:
                # Verify unique sample names and calculate barcode distances
                for i in dict_group:
                    if name == dict_group[i].id:
                        seq_io.print_error("error: sample name '{}' is being "
                                           "used for more than one template "
                                           "barcode".format(name))

                    barcode_dists.append(
                        (group, hamming_distance(dict_group[i].barcode,
                                                 barcode)))

                if barcode not in dict_group:
                    dict_group[barcode] = tag
                else:
                    seq_io.error("error: template barcode '{}' has been seen "
                                 "more than once in a single multiplex group"\
                                 .format(barcode))

        # Verify non-empty barcodes file
        try:
            barcodes_total += 1
        except UnboundLocalError:
            seq_io.print_error("error: no barcodes were found to process")

        # Template barcode statistics
        out_bstats = "Template barcodes found:\t{!s}\n  # multiplex groups:\t{!s}\n\n"

        num_tem_groups = len(template_barcodes)
        bstats = [barcodes_total, num_tem_groups]

        if is_grouped:
            out_bstats += "  Multiplex Group\t# Barcodes\tMin distance\tMax distance\n"
            for mul_group in template_barcodes:
                out_bstats += "  {!s}\t{!s}\t{!s}\t{!s}\n"

                group_dists = [return_last(i) for i in barcode_dists
                               if i[0] == mul_group]

                bstats += [mul_group, len(template_barcodes[mul_group]), min(group_dists), \
                           max(group_dists)]

        else:
            out_bstats += "Minimum Hamming distance between all barcodes:\t"\
                          "{!s}\nMaximum Hamming distance between all barcodes:"\
                          "\t{!s}\n"

            all_dists = [return_last(i) for i in barcode_dists]

            bstats += [min(all_dists), max(all_dists)]

        print(out_bstats.format(*tuple(bstats)), file=sys.stderr)

    # Demultiplex reads
    outfiles = {}
    sequence_barcodes = []
    for processed_total, record in enumerate(iterator):
        # Prepare output dependent on whether paired or unpaired
        try:
            seq_tag = record.forward.description.split(':')[-1]
            header = record.forward.id
            outf = record.forward.write()
            outr = record.reverse.write()
        except AttributeError:
            seq_tag = record.description.split(':')[-1]
            header = record.id
            outf = record.write()
            outr = None

        # Verify headers are formatted as Casava 1.8. Slicing never raises
        # IndexError, so check the number of fields explicitly
        run_info = tuple(header.split(':')[1:4])  #run id, flowcell, lane
        if len(run_info) != 3:
            seq_io.print_error("error: the format of the sequence headers is "
                               "incompatible with this method. Demultiplexing "
                               "these reads will require a different method "
                               "to be used instead")

        if (not seq_tag.isalpha()) or (len(seq_tag) != 6):
            seq_io.print_error("error: the format of the sequence headers is "
                               "incompatible with this method. Demultiplexing "
                               "these reads will require a different method "
                               "to be used instead")

        file_prefix = "{0}_{1}_{2}_{3}".format(seq_tag, *run_info)

        # Increment barcode occurrences
        index = "{0}:{1}:{2}:{3}".format(seq_tag, *run_info)
        already_seen = False
        for i in sequence_barcodes:
            if index == i.barcode:
                i.increment()
                already_seen = True

        if not already_seen:
            seq_entry = BarcodeEntry()
            seq_entry.barcode = index
            seq_entry.count = 1
            sequence_barcodes.append(seq_entry)

        # Map sequence barcodes to barcodes in the provided barcodes file
        if args.barcodes:
            # Only consider other barcodes within the same multiplex group
            if is_grouped:
                try:
                    relevant_barcodes = template_barcodes[":".join(run_info)]
                except KeyError:
                    seq_io.warning("warning: run information in sequence "
                                   "header '{}' doesn't match any multiplex "
                                   "group in the provided barcodes file\n"\
                                   .format(header))
                    continue

            else:
                # Consider all barcodes within file
                relevant_barcodes = template_barcodes['']

            # Find the template barcode with the smallest hamming distance to
            # the record sequence barcode
            distances = sort_by_last([(i, hamming_distance(seq_tag, i)) \
                                     for i in relevant_barcodes])

            min_tag, min_dist = distances[0]  #minimum is the first element

            if min_dist == 0:
                exact_total += 1
            else:
                mismatch_total += 1

            # Determine if more than one closest match
            if [i[1] for i in distances].count(min_dist) > 1:
                seq_io.print_warning("warning: barcode {0} in sequence "
                                     "{1} is equally similar to more than "
                                     "one template barcode. Unable to "
                                     "determine which partition to assign "
                                     "it to".format(seq_tag, header))
                continue
            else:
                # Assign to template partition if within threshold distance
                if min_dist <= args.distance and args.distance:
                    seq_tag = min_tag

            # Verify sequence tag is in the list of provided barcodes
            try:
                file_prefix = relevant_barcodes[seq_tag].id
            except KeyError:
                unknowns += 1
                if not args.force:
                    seq_io.print_warning("warning: barcode {0} in sequence {1}"
                                         " does not correspond to any of the "
                                         "template barcodes provided. The "
                                         "template ({2}) with the fewest "
                                         "mismatches is {3} nucleotides "
                                         "different. Use --force to write "
                                         "these records anyway".format(seq_tag,\
                                         header, min_tag, min_dist))
                    continue

        # Write record to appropriate output file
        if not args.no_out:
            try:
                outfiles[file_prefix][0](outf)
                outfiles[file_prefix][1](outr)

            except KeyError:
                # Barcode not encountered previously, open new file for writes
                if args.rhandle:
                    handle1 = io.TextIOWrapper(algo("{0}.forward.{1}{2}"\
                        .format(file_prefix, suffix, compression), mode='wb'))
                    handle2 = io.TextIOWrapper(algo("{0}.reverse.{1}{2}"\
                        .format(file_prefix, suffix, compression), mode='wb'))
                    write1, write2 = handle1.write, handle2.write
                elif args.interleaved:
                    handle1 = io.TextIOWrapper(algo("{0}.interleaved.{1}{2}"\
                        .format(file_prefix, suffix, compression), mode='wb'))
                    write1 = write2 = handle1.write
                else:
                    handle1 = io.TextIOWrapper(
                        algo("{0}.{1}{2}".format(file_prefix, suffix,
                                                 compression),
                             mode='wb'))
                    write1 = handle1.write
                    write2 = do_nothing

                outfiles[file_prefix] = (write1, write2)

                # Should be safe to write now
                outfiles[file_prefix][0](outf)
                outfiles[file_prefix][1](outr)

    # Write output histogram and sequence barcode statistics
    houtstats = "Sequence barcodes found:\t{!s}\n  Mean abundance:\t"\
                "{:.2f}\n  Median abundance:\t{:.2f}\n  Abundance SD:\t"\
                "\t{:.2f}\n\n  Barcode\tRun Information\tAbundance\n"

    num_seq_tags = len(sequence_barcodes)
    houtstats += "  {!s}\t{!s}\t{!s}\n" * num_seq_tags

    barcode_abundances = []
    hstats_extra = []
    for seq_bar in sequence_barcodes:
        abund = seq_bar.count
        barcode_abundances.append(abund)
        hstats_extra += [seq_bar.sequence(), seq_bar.run_info(), abund]


    b_mean, b_median, b_sd = (mean(barcode_abundances), \
                              median(barcode_abundances), \
                              stdev(barcode_abundances))

    hstats = [num_seq_tags, b_mean, b_median, b_sd] + hstats_extra
    print(houtstats.format(*tuple(hstats)), file=sys.stderr)

    if args.hist:
        out_hist("#Total:  {}\n#Mean:   {:.2f}\n#Median: {:.2f}\n#STDev:  "
                 "{:.2f}\n".format(num_seq_tags, b_mean, b_median, b_sd))

        for abundance in sorted(set(barcode_abundances)):
            counts = barcode_abundances.count(abundance)
            out_hist("{0}\t{1}\n".format(abundance, counts))

    # Verify input file non-empty
    try:
        processed_total += 1
    except UnboundLocalError:
        seq_io.print_error("error: no sequences were found to process")

    # Calculate and print output statistics
    if not args.no_out:
        partitions_total = len(outfiles)
        # Match the statistics to the template chosen for outstats above
        if args.barcodes and args.distance > 0:
            stats = [processed_total, partitions_total, exact_total,
                     mismatch_total, unknowns]
        elif args.barcodes:
            stats = [processed_total, partitions_total, unknowns]
        else:
            stats = [processed_total, partitions_total]
        print(outstats.format(*tuple(stats)), file=sys.stderr)

    # Calculate and print program run-time
    end_time = time()
    total_time = (end_time - start_time) / 60.0
    print("It took {:.2e} minutes to process {!s} records\n"\
          .format(total_time, processed_total), file=sys.stderr)
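
The demultiplexer relies on a hamming_distance helper that is not shown; a plausible definition, given that template and sequence barcodes are compared as equal-length strings:

def hamming_distance(s1, s2):
    """Count the positions at which two equal-length strings differ"""
    if len(s1) != len(s2):
        raise ValueError("sequences must be of equal length")
    return sum(a != b for a, b in zip(s1, s2))

print(hamming_distance("ACGTAA", "ACGTTA"))  # 1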
Example No. 7
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'fhandle',
        metavar='in1.fast<q|a>',
        type=str,
        action=Open,
        mode='rb',
        default=sys.stdin,
        help="input reads in fastq or fasta format. Can be a file containing "
        "either single-end or forward/interleaved reads if reads are "
        "paired-end [required]")
    input_arg = parser.add_mutually_exclusive_group(required=False)
    input_arg.add_argument('--interleaved',
                           action='store_true',
                           help="input is interleaved paired-end reads")
    input_arg.add_argument('-r',
                           '--reverse',
                           dest='rhandle',
                           metavar='in2.fast<q|a>',
                           action=Open,
                           mode='rb',
                           help="input reverse reads")
    parser.add_argument('-o',
                        '--out',
                        metavar='FILE',
                        dest='out_f',
                        type=str,
                        action=Open,
                        mode='wt',
                        default=sys.stdout,
                        help="output trimmed reads [default: stdout]")
    parser.add_argument('-v',
                        '--out-reverse',
                        metavar='FILE',
                        dest='out_r',
                        type=str,
                        action=Open,
                        mode='wt',
                        help="output reverse reads")
    parser.add_argument(
        '-f',
        '--format',
        metavar='FORMAT',
        dest='format',
        default='fastq',
        choices=['fasta', 'fastq'],
        help="sequence file format [default: fastq]. Available options are "
        "'fasta' or 'fastq'")
    parser.add_argument('-l',
                        '--log',
                        type=str,
                        action=Open,
                        mode='wt',
                        help="output log of replicate types")
    dup_args = parser.add_argument_group('replicate types')
    dup_args.add_argument('--prefix',
                          action='store_true',
                          help="replicate can be a 5' prefix of another read")
    dup_args.add_argument(
        '--rev-comp',
        dest='rev_comp',
        action='store_true',
        help="replicate can be the reverse-complement of another read")
    parser.add_argument(
        '--reduce-memory',
        dest='mem_use',
        action='store_true',
        help="reduce the mount of memory that the program uses. This could "
        "result in a drastic increase in run-time")
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + __version__)
    args = parser.parse_args()
    all_args = sys.argv[1:]

    program_info('filter_replicates', all_args, __version__)

    # Track program run-time
    start_time = time()

    # Assign variables based on arguments supplied by the user
    out_f = args.out_f.write
    logger = args.log.write if args.log else do_nothing
    logger("#Replicate\tTemplate\tType\n")
    # 'self' is assumed to be an identity pass-through function defined
    # elsewhere in the module. The lambdas keep quality strings as str
    # through a compress/decompress round trip (zlib operates on bytes)
    compress = (lambda s: zlib.compress(s.encode())) if args.mem_use else self
    decompress = (lambda b: zlib.decompress(b).decode()) if args.mem_use \
                 else self
    out_format = ">{0} {1}\n{2}\n" if args.format == "fasta" else \
                 "@{0} {1}\n{2}\n+\n{3}\n"
    paired = bool(args.interleaved or args.rhandle)

    # Prepare the iterator based on dataset type
    iterator = read_iterator(args.fhandle, args.rhandle, args.interleaved, \
                             args.format)

    # Assign variables based on dataset type (paired or single-end)
    if paired:
        print("Processing input as paired-end reads\n", file=sys.stderr)
        out_r = out_f if not args.out_r else args.out_r.write
        rc = reverse_complement_paired

    else:
        print("Processing input as single-end reads\n", file=sys.stderr)
        out_r = do_nothing
        rc = reverse_complement

    # Iterate over the reads, storing only the unique records
    uniques = {}
    for records_total, entry in enumerate(iterator):
        try:
            header = (entry.forward.id, entry.reverse.id)
            fdesc, rdesc = (entry.forward.description,
                            entry.reverse.description)
            fseq, rseq = (entry.forward.sequence, entry.reverse.sequence)
        except AttributeError:
            header = (entry.id, '')
            fdesc, rdesc = (entry.description, '')
            fseq, rseq = (entry.sequence, '')
            try:
                qual = compress(entry.quality)
            except AttributeError:
                # Must be fasta format
                qual = None
        else:
            try:
                qual = compress(entry.forward.quality + entry.reverse.quality)
            except AttributeError:
                qual = None

        flen, rlen = len(fseq), len(rseq)
        # Keep the descriptions with each record so they can be written out
        # alongside the sequence later
        record = [i for i in (header, (fdesc, rdesc), flen, fseq + rseq, qual)
                  if i is not None]

        # Use hash of full or prefixed sequence as a key for quick comparisons
        fsub, rsub = ((20, 20) if args.prefix else (flen, rlen))
        key = hashlib.md5((fseq[:fsub] + rseq[:rsub]).encode()).digest()

        # Search if replicate
        search_list = []
        try:
            search_list = uniques[key]
        except KeyError:
            # No match to the database found. Need to check the reverse
            # complement if requested
            if args.rev_comp:
                try:
                    fseq_rc, rseq_rc = rc(fseq, rseq)
                except TypeError:
                    fseq_rc, rseq_rc = rc(fseq), ''

                rckey = hashlib.md5((fseq_rc[:fsub] + rseq_rc[:rsub])\
                                    .encode()).digest()

                try:
                    search_list = uniques[rckey]
                except KeyError:
                    # Not a replicate. Add to the database of uniques
                    uniques[key] = [record]
                    continue
                else:
                    duplicate_key = rckey
                    fquery, rquery = fseq_rc, rseq_rc
                    query_id = header[0]

            # Not a replicate. Add to the database of uniques
            else:
                uniques[key] = [record]
                continue
        else:
            duplicate_key = key
            fquery, rquery = fseq, rseq
            query_id = header[0]

        # Search through list of records with common key to see if the sequence
        # matches one that has been observed before
        duplicate = None
        for index, search_record in enumerate(search_list):
            # Get search sequences by splitting combined sequence on forward
            # length
            search_id = search_record[0][0]
            fsearch, rsearch = split_by_length(search_record[3],
                                               search_record[2])

            # Check replicate status of forward sequence
            fstatus = duplicate_status(fquery, fsearch)
            if fstatus:
                # Check reverse only if forward a duplicate
                rstatus = duplicate_status(rquery, rsearch)
                if rstatus:
                    # Query is an exact match to a DB record
                    if (fstatus == 1 and rstatus == 1):
                        duplicate_type = "exact"
                        duplicate = query_id
                        template = search_id
                        break

                    # Query is a prefix of a DB record
                    elif (fstatus == 1 and rstatus == 3) or \
                         (fstatus == 3 and rstatus == 1) or \
                         (fstatus == 3 and rstatus == 3):
                        duplicate_type = "prefix"
                        duplicate = query_id
                        template = search_id
                        break

                    # A DB record is a prefix of the query
                    elif (fstatus == 1 and rstatus == 2) or \
                         (fstatus == 2 and rstatus == 1) or \
                         (fstatus == 2 and rstatus == 2):
                        duplicate_type = "prefix"
                        duplicate = search_id
                        template = query_id
                        # Replace old DB record with new
                        uniques[duplicate_key][index] = record
                        break

        if duplicate:
            # Add rc to duplicate type if search_list from rev_comp
            duplicate_type = "rev-comp {}".format(duplicate_type) \
                             if key != duplicate_key else duplicate_type

            logger("{}\t{}\t{}\n".format(duplicate, template, duplicate_type))

        else:
            # record is definitely not a duplicate, so add to the list of
            # unique sequences with a common key
            uniques[duplicate_key].append(record)

    # Make sure input file non-empty
    try:
        records_total += 1  #number records processed
    except UnboundLocalError:
        print_error("error: no sequences were found to process.")

    # Write unique records
    uniques_total = 0  #remaining records after dereplication
    for unique_key in uniques:
        for record in uniques[unique_key]:
            uniques_total += 1
            fheader, rheader = record[0]
            fdesc, rdesc = record[1]

            fseq, rseq = split_by_length(record[3], record[2])
            try:
                fqual, rqual = split_by_length(decompress(record[4]),
                                               record[2])
            except IndexError:
                # No stored quality scores means the input was fasta
                fqual = rqual = None

            out_f(out_format.format(*tuple([i for i in (fheader, fdesc, fseq,
                                            fqual) if i is not None])))
            out_r(out_format.format(*tuple([i for i in (rheader, rdesc, rseq,
                                            rqual) if i is not None])))

    # Calculate and print output statistics
    replicates_total = records_total - uniques_total
    print(
        "Records processed:\t{!s}\nUnique reads found:\t{!s} ({:.2%})\nReplicate reads found:\t{!s} "
        "({:.2%})\n".format(records_total, uniques_total,
                            uniques_total / records_total, replicates_total,
                            replicates_total / records_total),
        file=sys.stderr)

    # Calculate and print program run-time
    end_time = time()
    total_time = (end_time - start_time) / 60.0
    print("It took {:.2e} minutes to process {!s} records\n"\
          .format(total_time, records_total), file=sys.stderr)
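
Neither split_by_length nor duplicate_status is shown above; the sketches below are consistent with how they are called, with the status codes inferred from the branch comments (an assumption, not the original implementation):

def split_by_length(combined, flen):
    """Split a concatenated forward+reverse string at the forward length"""
    return combined[:flen], combined[flen:]

def duplicate_status(query, search):
    """Assumed codes: 1 = exact match, 2 = search is a prefix of query,
    3 = query is a prefix of search, 0 = not a duplicate"""
    if query == search:
        return 1
    if query.startswith(search):
        return 2
    if search.startswith(query):
        return 3
    return 0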