def _identity(item):
    """Return *item* unchanged (stand-in for zlib when not compressing)."""
    return item


def _discard_replicate(uniques, descriptions, log, dup_pos, temp_pos,
                       dup_type):
    """Log a replicate/template pair and drop the replicate from the output.

    Args:
        uniques: mapping of record position -> stored record tuple; the
            identifier is read from element 3 of the tuple.
        descriptions: mapping of record position -> (forward, reverse)
            header descriptions, kept in step with ``uniques``.
        log: handle passed through to ``seq_io.logger``.
        dup_pos: position of the record judged to be the replicate.
        temp_pos: position of the record it replicates.
        dup_type: label written to the "Type" column of the log.
    """
    seq_io.logger(log, "{}\t{}\t{}\n".format(uniques[dup_pos][3],
                                             uniques[temp_pos][3], dup_type))
    try:
        del uniques[dup_pos]
    except KeyError:
        seq_io.print_error("error: input file has more than one "
                           "sequence with the same identifier")
        sys.exit(1)
    descriptions.pop(dup_pos, None)


def main():
    """Command-line entry point: remove replicate read pairs.

    Reads paired-end records (split or interleaved), hashes each pair (the
    first 20 bases of each mate when --prefix is set, otherwise the whole
    mates), and asks ``replicate_status`` whether the pair replicates one
    seen earlier. Optionally repeats the lookup with the reverse-complement.
    Surviving pairs are written in input order and a summary is printed to
    stderr.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'f_file', metavar='in1.fast<q|a>',
        help="input forward or interleaved reads [required]")
    input_arg = parser.add_mutually_exclusive_group(required=True)
    input_arg.add_argument(
        '--interleaved', action='store_true',
        help="input is interleaved paired-end reads")
    input_arg.add_argument(
        'r_file', metavar='in2.fast<q|a>', nargs='?',
        help="input reverse reads")
    parser.add_argument(
        '-o', '--out', dest='out_f', metavar='FILE',
        type=seq_io.open_output, default=sys.stdout,
        help="output reads")
    output_arg = parser.add_mutually_exclusive_group(required=False)
    output_arg.add_argument(
        '-v', '--out-reverse', metavar='FILE', dest='out_r',
        type=seq_io.open_output,
        help="output reverse reads")
    output_arg.add_argument(
        '--out-interleaved', dest='out_interleaved', action='store_true',
        help="output interleaved paired-end reads, even if input is split")
    parser.add_argument(
        '-f', '--out-format', metavar='FORMAT', dest='out_format',
        default='fastq', choices=['fasta', 'fastq'],
        help="output file format. Can be fasta or fastq. [default: fastq]")
    parser.add_argument(
        '-l', '--log', metavar='LOG', type=seq_io.open_output,
        help="output log file to keep track of replicates")
    dup_args = parser.add_argument_group('replicate types')
    dup_args.add_argument(
        '--prefix', action='store_true',
        help="replicate can be a 5' prefix of another read")
    dup_args.add_argument(
        '--rev-comp', dest='rev_comp', action='store_true',
        help="replicate can be the reverse-complement of another read")
    parser.add_argument(
        '--reduce-memory', dest='mem_use', action='store_true',
        help="reduce the mount of memory that the program uses. This could "
             "result in a drastic increase in run time.")
    parser.add_argument(
        '--version', action='version',
        version='%(prog)s ' + __version__)
    args = parser.parse_args()
    all_args = sys.argv[1:]

    seq_io.program_info('filter_replicates', all_args, __version__)

    # The reverse reads are a positional argument, not an option, so the
    # message refers to the file rather than to a flag.
    if args.r_file and not (args.out_r or args.out_interleaved):
        parser.error("one of -v/--out-reverse or --out-interleaved is "
                     "required when a reverse reads file is provided")

    f_file = sys.stdin if args.f_file == '-' else args.f_file
    out_f = args.out_f
    iterator = seq_io.get_iterator(f_file, args.r_file, args.interleaved)

    seq_io.logger(args.log, "Replicate\tTemplate\tType\n")

    # Qualities are only zlib-compressed when --reduce-memory is requested;
    # otherwise they are stored as-is.
    compress = zlib.compress if args.mem_use else _identity
    decompress = zlib.decompress if args.mem_use else _identity
    writer = (seq_io.fasta_writer if args.out_format == 'fasta'
              else seq_io.fastq_writer)

    seq_db = {}        # hash digest -> positions of records to compare with
    uniques = {}       # position -> (seqs, forward length, quals, identifier)
    descriptions = {}  # position -> (forward, reverse) header descriptions
    processed = 0

    # Positions are 1-based so that a returned position is always truthy;
    # with 0-based positions the first record could never be recognised as
    # a replicate by the ``if dup_pos`` tests below.
    for processed, (forward, reverse) in enumerate(iterator, 1):
        ident = forward['identifier']
        fseq, rseq = forward['sequence'], reverse['sequence']
        fqual, rqual = forward['quality'], reverse['quality']
        flen, rlen = len(fseq), len(rseq)

        uniques[processed] = (fseq + rseq, flen, compress(fqual + rqual),
                              ident)
        # Descriptions are kept per record; reusing the loop variables at
        # output time would stamp every record with the last read's header.
        descriptions[processed] = (forward['description'],
                                   reverse['description'])

        fsubsize, rsubsize = (20, 20) if args.prefix else (flen, rlen)
        # NOTE(review): hashlib.md5 needs bytes-like input on Python 3;
        # assumes seq_io yields sequences in a suitable type - confirm.
        key = hashlib.md5(fseq[:fsubsize] + rseq[:rsubsize]).digest()

        dup_pos, temp_pos, dup_type = replicate_status(processed, key,
                                                       uniques, seq_db)
        # match to database found, so delete id from database of uniques
        if dup_pos:
            _discard_replicate(uniques, descriptions, args.log, dup_pos,
                               temp_pos, dup_type)
            continue

        # sequence is unique, so check reverse-complement if set
        if args.rev_comp:
            f_rc, r_rc = pairs.reverse_complement_paired(fseq, rseq)
            rckey = hashlib.md5(f_rc[:fsubsize] + r_rc[:rsubsize]).digest()
            dup_pos, temp_pos, dup_type = replicate_status(processed, rckey,
                                                           uniques, seq_db)
            if dup_pos:
                _discard_replicate(uniques, descriptions, args.log, dup_pos,
                                   temp_pos, 'rev-comp ' + dup_type)
                continue

        # record is definitely not a duplicate, so add to database of ids to
        # check a match for
        seq_db.setdefault(key, []).append(processed)

    if not processed:
        seq_io.print_error("error: no sequences were found to process.")
        sys.exit(1)  # in case print_error returns instead of exiting

    # Interleaved output shares the forward handle unless a separate reverse
    # file was requested.
    interleave_out = ((args.interleaved or args.out_interleaved)
                      and not args.out_r)
    out_r = out_f if interleave_out else args.out_r

    for position in sorted(uniques):
        record = uniques[position]
        ident = record[3]
        fdesc, rdesc = descriptions[position]
        fseq, rseq = split_by_length(record[0], record[1])
        fqual, rqual = split_by_length(decompress(record[2]), record[1])
        writer(out_f, {'identifier': ident, 'description': fdesc,
                       'sequence': fseq, 'quality': fqual})
        writer(out_r, {'identifier': ident, 'description': rdesc,
                       'sequence': rseq, 'quality': rqual})

    num_reps = processed - len(uniques)
    print("\nRead Pairs processed:\t{!s}\nReplicates found:\t{!s} "
          "({:.2%})\n".format(processed, num_reps, num_reps / processed),
          file=sys.stderr)
def _open_demultiplexed_outputs(name, suffix, compression, split,
                                interleaved):
    """Open the output handle(s) for one partition.

    Returns a ``(forward_handle, reverse_handle)`` tuple. For interleaved
    data both entries are the same handle; for single-end data the reverse
    handle is None.
    """
    if split:
        forward_handle = seq_io.open_output(
            "{}.forward.{}{}".format(name, suffix, compression))
        reverse_handle = seq_io.open_output(
            "{}.reverse.{}{}".format(name, suffix, compression))
    elif interleaved:
        forward_handle = seq_io.open_output(
            "{}.interleaved.{}{}".format(name, suffix, compression))
        reverse_handle = forward_handle
    else:
        forward_handle = seq_io.open_output(
            "{}.{}{}".format(name, suffix, compression))
        reverse_handle = None
    return forward_handle, reverse_handle


def main():
    """Command-line entry point: partition reads by the barcode in headers.

    The barcode is taken as the text after the last ':' of each forward
    read's description and must be six alphabetic characters. Each barcode
    (or the sample name mapped to it through --barcodes) gets its own output
    file(s), opened on first use. A summary is printed to stderr.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'f_file', metavar='in1.fast<q|a>',
        help="input reads in fastq or fasta format. Can be a file containing "
             "either single-end or forward/interleaved reads if reads are "
             "paired-end [required]")
    input_arg = parser.add_mutually_exclusive_group(required=False)
    input_arg.add_argument(
        '--interleaved', action='store_true',
        help="input is interleaved paired-end reads")
    input_arg.add_argument(
        'r_file', metavar='in2.fast<q|a>', nargs='?',
        help="input reverse reads")
    # The help text previously contained the invalid escape "\q" and was
    # missing a space between "barcode" and "sequences".
    parser.add_argument(
        '-b', '--barcodes', metavar='FILE', type=seq_io.open_input,
        help="file containing sample names mapped to the appropriate barcode "
             "sequences, in tab-separated format, with sample names in the "
             "first column. If this argument is unused, the output files "
             "will be named for each barcode sequence found in the fasta/q "
             "file.")
    parser.add_argument(
        '-s', '--suffix', metavar='STR', type=str,
        help="string to append to the end of the file name. The default is to "
             "append the file format (fastq or fasta) and the strand for PE "
             "data (forward, reverse, interleaved).")
    parser.add_argument(
        '-f', '--out-format', metavar='FORMAT', dest='out_format',
        default='fastq', choices=['fasta', 'fastq'],
        help="output file format. Can be fasta or fastq. [default: fastq]")
    compress_arg = parser.add_mutually_exclusive_group(required=False)
    compress_arg.add_argument(
        '--gzip', action='store_true',
        help="output files should be compressed using the gzip algorithm. The "
             "suffix '.gz'. will be appended to the file names.")
    compress_arg.add_argument(
        '--bzip2', action='store_true',
        help="output files should be compressed using the bzip2 algorithm. "
             "The suffix '.bz2' will be appended to the file names.")
    parser.add_argument(
        '--version', action='version',
        version='%(prog)s ' + __version__)
    args = parser.parse_args()
    all_args = sys.argv[1:]

    seq_io.program_info('demultiplex_headers', all_args, __version__)

    f_file = sys.stdin if args.f_file == '-' else args.f_file
    iterator = seq_io.get_iterator(f_file, args.r_file, args.interleaved)

    writer = (seq_io.fasta_writer if args.out_format == 'fasta'
              else seq_io.fastq_writer)

    if args.gzip:
        compression = '.gz'
    elif args.bzip2:
        compression = '.bz2'
    else:
        compression = ''

    suffix = args.suffix if args.suffix else args.out_format

    # barcode sequence -> sample name
    tags = {}
    if args.barcodes:
        names = set()
        for line in args.barcodes:
            fields = line.strip()
            if not fields:
                # tolerate blank lines, e.g. a trailing newline
                continue
            try:
                name, tag = fields.split('\t')
            except ValueError:
                seq_io.print_error("error: barcode mapping file does not "
                                   "appear to be formatted correctly")
                sys.exit(1)  # in case print_error returns; name is unbound
            if name in names:
                seq_io.print_error("error: the same sample name is used for "
                                   "more than one barcode sequence")
            else:
                names.add(name)
                tags[tag] = name

    paired = bool(args.r_file or args.interleaved)

    # partition name -> (forward handle, reverse handle or None)
    outfiles = {}
    num_records = 0
    try:
        for item in iterator:
            # Paired input yields (forward, reverse) tuples. Single-end
            # input is expected to yield bare records, which must not be
            # unpacked as a pair.
            if paired or isinstance(item, (tuple, list)):
                forward, reverse = item
            else:
                forward, reverse = item, None

            tag = forward['description'].split(':')[-1]
            if (not tag.isalpha()) or (len(tag) != 6):
                seq_io.print_error("error: unable to determine the format of "
                                   "the sequence headers")

            name = tags.get(tag, str(tag))

            if name not in outfiles:
                outfiles[name] = _open_demultiplexed_outputs(
                    name, suffix, compression, args.r_file, args.interleaved)
            forward_handle, reverse_handle = outfiles[name]

            writer(forward_handle, forward)
            if reverse_handle is not None:
                writer(reverse_handle, reverse)
            num_records += 1
    finally:
        # Flush and release every partition file, including on error.
        for forward_handle, reverse_handle in outfiles.values():
            forward_handle.close()
            if (reverse_handle is not None
                    and reverse_handle is not forward_handle):
                reverse_handle.close()

    if not num_records:
        seq_io.print_error("error: no sequences were found to process")
        sys.exit(1)  # in case print_error returns instead of exiting

    num_parts = len(outfiles)
    print("\nRecords processed:\t{!s}\nNumber of partitions:\t{!s}\n".format(
        num_records, num_parts), file=sys.stderr)
def _qtrim_value_pair(values):
    """Split a parsed comma-separated option into (forward, reverse) values.

    A value list that does not hold exactly two items applies its first item
    to both reads.
    """
    try:
        forward_value, reverse_value = values
    except ValueError:
        forward_value = reverse_value = values[0]
    return forward_value, reverse_value


def main():
    """Command-line entry point: quality-trim and length-filter reads.

    Applies head/end cropping and the trimming steps selected by
    --trim-order (leading, trailing, sliding window) to each read, then
    drops reads shorter than --min-len. Paired input keeps pairs together
    and sends orphans to --singles when that file was requested. A summary
    is printed to stderr.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'f_file', metavar='in1.fastq',
        help="input reads in fastq format. Can be a file containing either "
             "single-end or forward/interleaved reads if reads are paired-end "
             "[required]")
    input_arg = parser.add_mutually_exclusive_group(required=False)
    input_arg.add_argument(
        '--interleaved', action='store_true',
        help="input is interleaved paired-end reads")
    input_arg.add_argument(
        '--force', action='store_true',
        help="force process as single-end reads even if input is interleaved "
             "paired-end reads")
    input_arg.add_argument(
        'r_file', metavar='in2.fastq', nargs='?',
        help="input reverse reads in fastq format")
    parser.add_argument(
        '-o', '--out', metavar='FILE', dest='out_f',
        type=seq_io.open_output, default=sys.stdout,
        help="output trimmed reads [required]")
    output_arg = parser.add_mutually_exclusive_group(required=False)
    output_arg.add_argument(
        '-v', '--out-reverse', metavar='FILE', dest='out_r',
        type=seq_io.open_output,
        help="output trimmed reverse reads")
    output_arg.add_argument(
        '--out-interleaved', dest='out_interleaved', action='store_true',
        help="output interleaved paired-end reads, even if input is split")
    parser.add_argument(
        '-s', '--singles', metavar='FILE', dest='out_s',
        type=seq_io.open_output,
        help="output trimmed orphaned reads")
    parser.add_argument(
        '-f', '--out-format', metavar='FORMAT', dest='out_format',
        default='fastq', choices=['fasta', 'fastq'],
        help="output files format (fastq or fasta) [default: fastq]")
    parser.add_argument(
        '-l', '--log', type=seq_io.open_output,
        help="output log file to keep track of trimmed sequences")
    parser.add_argument(
        '-q', '--qual-type', metavar='TYPE', dest='qual_type',
        type=int, default=33, choices=[33, 64],
        help="ASCII base quality score encoding [default: 33]. Options are "
             "33 (for phred33) or 64 (for phred64)")
    parser.add_argument(
        '-m', '--min-len', metavar='LEN', dest='minlen',
        type=get_list, default='0',
        help="filter reads shorter than the minimum length threshold "
             "[default: 0,0]. Different values can be provided for the "
             "forward and reverse reads by separating them with a comma "
             "(e.g. 80,60)")
    trim_args = parser.add_argument_group('trimming options')
    trim_args.add_argument(
        '-O', '--trim-order', metavar='ORDER', dest='trim_order',
        default='ltw',
        help="order of trimming steps [default: ltw (corresponds to leading, "
             "trailing, and sliding-window)]")
    trim_args.add_argument(
        '-W', '--sliding-window', metavar='FRAME', dest='sw',
        type=parse_sw_arg,
        help="trim both 5' and 3' ends of a read using a sliding window "
             "approach. Input should be of the form "
             "'window_size:qual_threshold', where 'qual_threshold' is an "
             "integer between 0 and 42 and 'window_size' can either be "
             "length in bases or fraction of the total read length")
    trim_args.add_argument(
        '-H', '--headcrop', metavar='INT,INT', type=get_list, default='0',
        help="remove exactly the number of bases specified from the start of "
             "the read. Different values can be provided for the forward and "
             "reverse reads by separating them with a comma (e.g. 2,0)")
    trim_args.add_argument(
        '-C', '--crop', metavar='INT,INT', type=get_list, default='0',
        help="remove exactly the number of bases specified from the end of "
             "the read. Different values can be provided for the forward and "
             "reverse reads by separating them with a comma (e.g. 2,0)")
    trim_args.add_argument(
        '-L', '--leading', metavar='SCORE', dest='lead_score', type=int,
        help="trim by removing low quality bases from the start of the read")
    trim_args.add_argument(
        '-T', '--trailing', metavar='SCORE', dest='trail_score', type=int,
        help="trim by removing low quality bases from the end of the read")
    trim_args.add_argument(
        '--trunc-n', dest='trunc_n', action='store_true',
        help="truncate sequence at the position of the first ambiguous base")
    parser.add_argument(
        '--version', action='version',
        version='%(prog)s ' + __version__)
    args = parser.parse_args()
    all_args = sys.argv[1:]

    seq_io.program_info('qtrim', all_args, __version__)

    # Validate the arguments before any input is opened.
    if args.r_file and not (args.out_r or args.out_interleaved):
        parser.error("one of -v/--out-reverse or --out-interleaved is "
                     "required when a reverse reads file is provided")

    trim_tasks = {'l': (trim.trim_leading, args.lead_score),
                  't': (trim.trim_trailing, args.trail_score),
                  'w': (trim.adaptive_trim, args.sw)}

    unknown_steps = sorted(set(args.trim_order) - set(trim_tasks))
    if unknown_steps:
        parser.error("unrecognized step(s) in -O/--trim-order: {}. Valid "
                     "steps are 'l', 't' and 'w'".format(
                         ", ".join(unknown_steps)))

    fcrop, rcrop = _qtrim_value_pair(args.crop)
    fheadcrop, rheadcrop = _qtrim_value_pair(args.headcrop)
    fminlen, rminlen = _qtrim_value_pair(args.minlen)

    # Only steps whose option was actually supplied are applied, in the
    # order requested.
    trim_steps = [trim_tasks[task] for task in args.trim_order
                  if trim_tasks[task][-1]]

    # args.crop and args.headcrop are lists and therefore always truthy, so
    # the individual values have to be inspected to detect a no-op run.
    cropping = any((fcrop, rcrop, fheadcrop, rheadcrop))
    if not trim_steps and not cropping and not args.trunc_n:
        seq_io.print_error("error: no trimming steps were applied")

    f_file = sys.stdin if args.f_file == '-' else args.f_file
    out_f = args.out_f
    iterator = seq_io.get_iterator(f_file, args.r_file, args.interleaved)

    writer = (seq_io.fasta_writer if args.out_format == 'fasta'
              else seq_io.fastq_writer)

    paired = bool(args.interleaved or args.r_file)

    if paired:
        print("\nProcessing input as paired-end reads", file=sys.stderr)
        seq_io.logger(args.log, "Record\tForward length\tForward trimmed "
                      "length\tReverse length\tReverse trimmed length\n")

        out_s = args.out_s
        interleave_out = ((args.interleaved or args.out_interleaved)
                          and not args.out_r)
        out_r = out_f if interleave_out else args.out_r

        pairs_passed = discarded_pairs = fsingles = rsingles = 0
        num_pairs = 0
        for num_pairs, (forward, reverse) in enumerate(iterator, 1):
            identifier = forward['identifier']
            forig = len(forward['sequence'])
            rorig = len(reverse['sequence'])

            forward = apply_trimming(forward, trim_steps, args.qual_type,
                                     fheadcrop, fcrop, args.trunc_n)
            ftrim = len(forward['sequence'])
            reverse = apply_trimming(reverse, trim_steps, args.qual_type,
                                     rheadcrop, rcrop, args.trunc_n)
            rtrim = len(reverse['sequence'])

            forward_ok = ftrim >= fminlen
            reverse_ok = rtrim >= rminlen

            if forward_ok and reverse_ok:
                # both good
                pairs_passed += 1
                writer(out_f, forward)
                writer(out_r, reverse)
            elif forward_ok:
                # forward orphaned, reverse filtered
                fsingles += 1
                if out_s is not None:
                    writer(out_s, forward)
            elif reverse_ok:
                # reverse orphaned, forward filtered
                rsingles += 1
                if out_s is not None:
                    writer(out_s, reverse)
            else:
                # both discarded
                discarded_pairs += 1

            seq_io.logger(args.log, "{}\t{}\t{}\t{}\t{}\n".format(
                identifier, forig, ftrim, rorig, rtrim))

        if not num_pairs:
            seq_io.print_error("error: no sequences were found to process")
            sys.exit(1)  # in case print_error returns instead of exiting

        total = num_pairs * 2
        passed = pairs_passed * 2 + fsingles + rsingles
        print("\nRecords processed:\t{!s} ({!s} pairs)\nPassed filtering:\t"
              "{!s} ({:.2%})\n Paired reads kept:\t{!s} ({:.2%})\n Forward "
              "only kept:\t{!s} ({:.2%})\n Reverse only kept:\t{!s} ({:.2%})"
              "\nRead pairs discarded:\t{!s} ({:.2%})\n".format(
                  total, num_pairs, passed, passed / total, pairs_passed,
                  pairs_passed / num_pairs, fsingles, fsingles / total,
                  rsingles, rsingles / total, discarded_pairs,
                  discarded_pairs / num_pairs), file=sys.stderr)

    else:
        print("\nProcessing input as single-end reads", file=sys.stderr)
        seq_io.logger(args.log, "Record\tLength\tTrimmed length\n")

        if args.out_s:
            print("\nwarning: argument --singles used with single-end reads"
                  "... ignoring\n", file=sys.stderr)

        discarded = 0
        num_records = 0
        first_read = None
        for num_records, record in enumerate(iterator, 1):
            # Two leading records sharing an identifier suggest the file is
            # really interleaved paired-end data.
            if num_records == 1:
                first_read = record['identifier']
            elif num_records == 2:
                if first_read == record['identifier'] and not args.force:
                    seq_io.print_error(
                        "warning: the input fastq appears to "
                        "contain interleaved paired-end reads. Please run with "
                        "the --force flag to proceed with processing the data "
                        "as single-end reads")

            origlen = len(record['sequence'])
            record = apply_trimming(record, trim_steps, args.qual_type,
                                    fheadcrop, fcrop, args.trunc_n)
            trimlen = len(record['sequence'])

            if trimlen >= fminlen:
                writer(out_f, record)
            else:
                discarded += 1

            seq_io.logger(args.log, "{}\t{}\t{}\n".format(
                record['identifier'], origlen, trimlen))

        if not num_records:
            seq_io.print_error("error: no sequences were found to process. Is "
                               "the input properly formatted?")
            sys.exit(1)  # in case print_error returns instead of exiting

        passed = num_records - discarded
        print("\nRecords processed:\t{!s}\nPassed filtering:\t{!s} "
              "({:.2%})\nRecords discarded:\t{!s} ({:.2%})\n".format(
                  num_records, passed, passed / num_records, discarded,
                  discarded / num_records), file=sys.stderr)
def _error_filter_value_pair(values):
    """Split a parsed comma-separated option into (forward, reverse) values.

    Returns (None, None) when the option was not supplied. A value list that
    does not hold exactly two items applies its first item to both reads.
    """
    try:
        forward_value, reverse_value = values
    except ValueError:
        forward_value = reverse_value = values[0]
    except TypeError:
        forward_value = reverse_value = None
    return forward_value, reverse_value


def main():
    """Command-line entry point: filter read pairs by expected error count.

    Each read is optionally cropped, its quality characters are converted to
    scores by subtracting --qual-type, and the selected error function
    estimates its number of errors. A read passes when that estimate does
    not exceed the threshold: --max-errors (absolute, the default mode) or,
    when -u/--uncert is supplied, read length times --uncert. Pairs are
    written together; orphans go to --singles when that file was requested.
    A summary is printed to stderr.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'f_file', metavar='in1.fastq',
        help="input reads file in fastq format [required]. Can be a file "
             "containing either forward of interleaved reads")
    input_arg = parser.add_mutually_exclusive_group(required=True)
    input_arg.add_argument(
        '--interleaved', action='store_true',
        help="input is interleaved paired-end reads")
    input_arg.add_argument(
        'r_file', metavar='in2.fastq', nargs='?',
        help="input reverse reads file in fastq format")
    parser.add_argument(
        '-o', '--out', metavar='FILE', dest='out_f',
        type=seq_io.open_output, default=sys.stdout,
        help="output file for filtered reads [required]")
    output_arg = parser.add_mutually_exclusive_group(required=False)
    output_arg.add_argument(
        '-v', '--out-reverse', metavar='FILE', dest='out_r',
        type=seq_io.open_output,
        help="output file for filtered reverse reads")
    output_arg.add_argument(
        '--out-interleaved', dest='out_interleaved', action='store_true',
        help="output interleaved paired-end reads, even if input is split")
    parser.add_argument(
        '-s', '--singles', metavar='FILE', dest='out_s',
        type=seq_io.open_output,
        help="output file for filtered orphan reads")
    parser.add_argument(
        '-f', '--out-format', metavar='FORMAT', dest='out_format',
        default='fastq', choices=['fasta', 'fastq'],
        help="output files format [default: fastq]. Options are fastq or "
             "fasta")
    parser.add_argument(
        '-l', '--log', type=seq_io.open_output,
        help="output log file")
    parser.add_argument(
        '-q', '--qual-type', metavar='TYPE', dest='qual_type',
        type=int, default=33, choices=[33, 64],
        help="ASCII base quality score encoding [default: 33]. Options are "
             "33 (for phred33) or 64 (for phred64)")
    parser.add_argument(
        '-a', '--alpha', type=float, default=0.005,
        help="probability of underestimating the actual number of errors in a "
             "sequence [default: 0.005]")
    parser.add_argument(
        '-c', '--crop', metavar='LEN,LEN', type=get_list,
        help="trim read to size specified by removing bases from the end of "
             "the read")
    parser.add_argument(
        '-d', '--headcrop', metavar='LEN,LEN', type=get_list,
        help="trim of bases from the start of the read")
    # NOTE(review): --ambig is parsed but nothing below reads args.ambig, and
    # the second value returned by the error functions is unused. The option
    # currently has no effect - confirm the intended behaviour.
    parser.add_argument(
        '--ambig', action='store_true',
        help="remove sequences with ambiguous bases. Default is to treat "
             "ambiguous bases as errors")
    parser.add_argument(
        '-e', '--error-calc', choices=('poisson_binomial', 'poisson'),
        default="poisson_binomial",
        help="method to use for calculating the number of errors expected in "
             "a sequence [default: poisson_binomial]")
    filter_mode = parser.add_mutually_exclusive_group()
    filter_mode.add_argument(
        '-m', '--max-errors', dest='maxerror', type=float, default=1.0,
        help="maximum number of errors allowed in a sequence [default: 1]")
    # No default here: the option must be distinguishable from "not given",
    # otherwise the always-set --max-errors default would shadow it.
    filter_mode.add_argument(
        '-u', '--uncert', type=float, default=None,
        help="maximum divergence of the observed sequence from the template "
             "due to sequencing error, as a fraction of the read length. "
             "Replaces the --max-errors threshold when given")
    parser.add_argument(
        '--version', action='version',
        version='%(prog)s ' + __version__)
    args = parser.parse_args()
    all_args = sys.argv[1:]

    seq_io.program_info('error_filter', all_args, __version__)

    # Validate the arguments before any input is opened.
    if args.r_file and not (args.out_r or args.out_interleaved):
        parser.error("one of -v/--out-reverse or --out-interleaved is "
                     "required when a reverse reads file is provided")

    fcrop, rcrop = _error_filter_value_pair(args.crop)
    fheadcrop, rheadcrop = _error_filter_value_pair(args.headcrop)

    f_file = sys.stdin if args.f_file == '-' else args.f_file
    iterator = seq_io.get_iterator(f_file, args.r_file, args.interleaved)

    out_f = args.out_f
    writer = (seq_io.fasta_writer if args.out_format == 'fasta'
              else seq_io.fastq_writer)
    interleave_out = ((args.interleaved or args.out_interleaved)
                      and not args.out_r)
    out_r = out_f if interleave_out else args.out_r
    out_s = args.out_s

    error_func = {'poisson_binomial': bernoulli.calculate_errors_PB,
                  'poisson': calculate_errors_poisson}
    estimate_errors = error_func[args.error_calc]
    relative_threshold = args.uncert is not None

    pairs_passed = filtered_pairs = fsingles = rsingles = 0
    num_pairs = 0
    for num_pairs, (forward, reverse) in enumerate(iterator, 1):
        forward = crop_string(forward, fcrop, fheadcrop)
        fheader = "{} {}".format(forward['identifier'],
                                 forward['description'])
        fquals = [ord(j) - args.qual_type for j in forward['quality']]
        flen = len(forward['sequence'])
        fee, fNs = estimate_errors(forward['sequence'], fquals, args.alpha)

        reverse = crop_string(reverse, rcrop, rheadcrop)
        rheader = "{} {}".format(reverse['identifier'],
                                 reverse['description'])
        rquals = [ord(j) - args.qual_type for j in reverse['quality']]
        rlen = len(reverse['sequence'])
        ree, rNs = estimate_errors(reverse['sequence'], rquals, args.alpha)

        if relative_threshold:
            fthreshold = flen * args.uncert
            rthreshold = rlen * args.uncert
        else:
            fthreshold = rthreshold = args.maxerror

        forward_ok = fee <= fthreshold
        reverse_ok = ree <= rthreshold

        # Every log entry ends with a newline so that records do not run
        # together in the log file.
        if forward_ok and reverse_ok:
            # both good
            pairs_passed += 1
            writer(out_f, forward)
            writer(out_r, reverse)
        elif forward_ok:
            # forward orphaned, reverse filtered
            fsingles += 1
            if out_s is not None:
                writer(out_s, forward)
            seq_io.logger(args.log, "{}\terrors={!s}\n".format(rheader, ree))
        elif reverse_ok:
            # reverse orphaned, forward filtered
            rsingles += 1
            if out_s is not None:
                writer(out_s, reverse)
            seq_io.logger(args.log, "{}\terrors={!s}\n".format(fheader, fee))
        else:
            # both discarded
            filtered_pairs += 1
            seq_io.logger(args.log, "{}\terrors={!s}\n{}\terrors={!s}\n".format(
                fheader, fee, rheader, ree))

    if not num_pairs:
        seq_io.print_error("error: no sequences were found to process")
        sys.exit(1)  # in case print_error returns instead of exiting

    total = num_pairs * 2
    passed = pairs_passed * 2 + fsingles + rsingles
    print("\nRecords processed:\t{!s} ({!s} pairs)\nPassed filtering:\t"
          "{!s} ({:.2%})\n Paired reads kept:\t{!s} ({:.2%})\n Forward "
          "only kept:\t{!s} ({:.2%})\n Reverse only kept:\t{!s} ({:.2%})"
          "\nRead pairs discarded:\t{!s} ({:.2%})\n".format(
              total, num_pairs, passed, passed / total, pairs_passed,
              pairs_passed / num_pairs, fsingles, fsingles / total,
              rsingles, rsingles / total, filtered_pairs,
              filtered_pairs / num_pairs), file=sys.stderr)