def stipulate(args):
    """
    Assemble the list of read modifiers used to drive cutadapt 2.7.

    This function is meant to be called only one time.

    The adapters in ``args.adapters`` are parsed and checked for duplicates,
    then modifiers are appended in this order: NextSeq quality trimming,
    quality trimming, adapter cutting, N-end trimming and finally whatever
    ``add_unconditional_cutters`` contributes for ``args.cut``. The finished
    list is printed to standard output and returned.
    """
    parser = AdapterParser(
        max_error_rate=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )
    parsed_adapters = parser.parse_multi(args.adapters)
    warn_duplicate_adapters(parsed_adapters)

    steps = []

    nextseq_cutoff = args.nextseq_trim
    if nextseq_cutoff is not None:
        steps.append(NextseqQualityTrimmer(nextseq_cutoff, args.phred64))

    if args.quality_cutoff is not None:
        bounds = parse_cutoffs(args.quality_cutoff)
        steps.append(QualityTrimmer(bounds[0], bounds[1], args.phred64))

    if parsed_adapters:
        steps.append(AdapterCutter(parsed_adapters, args.times, args.action))

    if args.trim_n:
        steps.append(NEndTrimmer())

    # The helper receives the list's append method and adds its cutters itself.
    add_unconditional_cutters(steps.append, args.cut)

    print("modifiers (cutadapt):", steps)
    return steps
def test_nend_trimmer():
    """Check NEndTrimmer against reads with leading, trailing and internal Ns."""
    # (input sequence, expected sequence after trimming); the second case
    # keeps its internal run of Ns, the third is trimmed to nothing.
    cases = [
        ('NNNNAAACCTTGGNNN', 'AAACCTTGG'),
        ('NNNNAAACNNNCTTGGNNN', 'AAACNNNCTTGG'),
        ('NNNNNN', ''),
    ]
    trim = NEndTrimmer()
    for raw, expected in cases:
        read = Sequence('read1', raw, qualities='#' * len(raw))
        wanted = Sequence('read1', expected, qualities='#' * len(expected))
        assert trim(read) == wanted
def modifiers_applying_to_both_ends_if_paired(args) -> Iterator[SingleEndModifier]:
    """
    Lazily yield the modifiers selected by ``args``.

    Modifiers are produced in a fixed order: Shortener, NEndTrimmer,
    LengthTagModifier, one SuffixRemover per entry of ``args.strip_suffix``,
    PrefixSuffixAdder and ZeroCapper. Each one is only yielded when the
    corresponding argument is set.
    """
    max_length = args.length
    if max_length is not None:
        yield Shortener(max_length)

    if args.trim_n:
        yield NEndTrimmer()

    tag = args.length_tag
    if tag:
        yield LengthTagModifier(tag)

    yield from (SuffixRemover(ending) for ending in args.strip_suffix)

    if args.prefix or args.suffix:
        yield PrefixSuffixAdder(args.prefix, args.suffix)

    if args.zero_cap:
        yield ZeroCapper(quality_base=args.quality_base)
def main(cmdlineargs=None, default_outfile=sys.stdout):
    """
    Main function that evaluates command-line parameters and iterates
    over all reads.

    cmdlineargs is the list of command-line arguments; ``sys.argv[1:]`` is
    used when it is None.

    default_outfile is the file to which trimmed reads are sent if the ``-o``
    parameter is not used.

    Invalid parameters or parameter combinations are reported through
    ``parser.error``. While reads are processed, the function exits with
    status 130 on KeyboardInterrupt, with status 1 on a broken pipe (EPIPE)
    and with a "cutadapt: error: ..." message on a format error or
    unexpected end of file. Unless ``--quiet`` is given, a report is printed
    at the end.
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')  # %(levelname)s
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)

    if len(args) == 0:
        parser.error(
            "At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")
    input_filename = args[0]

    # Find out which 'mode' we need to use.
    # Default: single-read trimming (neither -p nor -A/-G/-B/-U given)
    paired = False
    if options.paired_output:
        # Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
        # This exists for backwards compatibility ('legacy mode').
        paired = 'first'
    if options.adapters2 or options.front2 or options.anywhere2 or options.cut2:
        # Full paired-end trimming when both -p and -A/-G/-B/-U given
        # Also the read modifications (such as quality trimming) are applied
        # to second read.
        paired = 'both'

    if paired and len(args) == 1:
        parser.error(
            "When paired-end trimming is enabled via -A/-G/-B/-U or -p, "
            "two input files are required.")
    if paired:
        input_paired_filename = args[1]
        quality_filename = None
    else:
        # In single-end mode a second positional argument is a QUAL file.
        input_paired_filename = None
        if len(args) == 2:
            if args[0].endswith('.qual'):
                parser.error("The QUAL file must be the second argument.")
            quality_filename = args[1]
        else:
            quality_filename = None

    if paired:
        if not options.paired_output:
            parser.error(
                "When paired-end trimming is enabled via -A/-G/-B/-U, "
                "a second output file needs to be specified via -p (--paired-output)."
            )
        if bool(options.untrimmed_output) != bool(
                options.untrimmed_paired_output):
            parser.error(
                "When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options."
            )
    else:
        if options.untrimmed_paired_output:
            parser.error(
                "Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")
        if input_filename.endswith('.qual'):
            parser.error("Need a FASTA file in addition to the QUAL file.")
        if options.format is not None and quality_filename is not None:
            parser.error(
                "If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used."
            )

    if options.format is not None and options.format.lower() not in [
            'fasta', 'fastq', 'sra-fastq'
    ]:
        parser.error(
            "The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # The quality cutoff is either a single value (stored as [0, value]) or
    # two comma-separated values (stored in the order given).
    if options.quality_cutoff is not None:
        cutoffs = options.quality_cutoff.split(',')
        if len(cutoffs) == 1:
            try:
                cutoffs = [0, int(cutoffs[0])]
            except ValueError as e:
                parser.error(
                    "Quality cutoff value not recognized: {0}".format(e))
        elif len(cutoffs) == 2:
            try:
                cutoffs = [int(cutoffs[0]), int(cutoffs[1])]
            except ValueError as e:
                parser.error(
                    "Quality cutoff value not recognized: {0}".format(e))
        else:
            parser.error(
                "Expected one value or two values separated by comma for the quality cutoff"
            )
    else:
        cutoffs = None

    # Build the list of filters/writers that is handed to the read
    # processing functions further below.
    writers = []
    too_short_outfile = None  # too short reads go here
    too_short_filter = None
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_outfile = xopen(options.too_short_output, 'w')
        else:
            too_short_outfile = None
        too_short_filter = TooShortReadFilter(options.minimum_length,
                                              too_short_outfile,
                                              paired == 'both')
        writers.append(too_short_filter)
    too_long_outfile = None  # too long reads go here
    too_long_filter = None
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_outfile = xopen(options.too_long_output, 'w')
        else:
            too_long_outfile = None
        too_long_filter = TooLongReadFilter(options.maximum_length,
                                            too_long_outfile,
                                            check_second=paired == 'both')
        writers.append(too_long_filter)

    if options.max_n != -1:
        writers.append(
            NContentFilter(options.max_n, check_second=paired == 'both'))

    demultiplexer = None
    if options.output is not None and '{name}' in options.output:
        # A '{name}' placeholder in the output path switches on demultiplexing.
        if options.discard_trimmed:
            parser.error("Do not use --discard-trimmed when demultiplexing.")
        if paired:
            parser.error(
                "Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.format(name='unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output, untrimmed)
        writers.append(demultiplexer)
        trimmed_outfile, untrimmed_outfile = None, None
        trimmed_paired_outfile, untrimmed_paired_outfile = None, None
    else:
        trimmed_outfile, untrimmed_outfile = trimmed_and_untrimmed_files(
            default_outfile,
            options.output,
            options.untrimmed_output,
            options.discard_trimmed,
            options.discard_untrimmed)

        trimmed_paired_outfile, untrimmed_paired_outfile = trimmed_and_untrimmed_files(
            None,  # applies when not trimming paired-end data
            options.paired_output,
            options.untrimmed_paired_output,
            options.discard_trimmed,
            options.discard_untrimmed)

        if untrimmed_outfile or untrimmed_paired_outfile:
            writers.append(
                DiscardUntrimmedFilter(untrimmed_outfile,
                                       untrimmed_paired_outfile,
                                       check_second=paired == 'both'))
        writer = DiscardTrimmedFilter(trimmed_outfile,
                                      trimmed_paired_outfile,
                                      check_second=paired == 'both')
        writers.append(writer)
        del writer

    # --maq is a shortcut that switches on several colorspace options at once.
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        parser.error(
            "Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author)."
        )
    if not (0 <= options.error_rate <= 1.):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # The file name options are replaced by the opened file objects.
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    if options.colorspace:
        if options.match_read_wildcards:
            parser.error('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    ADAPTER_CLASS = ColorspaceAdapter if options.colorspace else Adapter
    try:
        # TODO refactor this a bit
        def collect(back, anywhere, front):
            """Create one adapter object per sequence from gather_adapters."""
            adapters = []
            for name, seq, where in gather_adapters(back, anywhere, front):
                if not seq:
                    parser.error("The adapter sequence is empty.")
                adapter = ADAPTER_CLASS(seq,
                                        where,
                                        options.error_rate,
                                        options.overlap,
                                        options.match_read_wildcards,
                                        options.match_adapter_wildcards,
                                        name=name,
                                        indels=options.indels)
                if options.debug:
                    adapter.enable_debug()
                adapters.append(adapter)
            return adapters

        adapters = collect(options.adapters, options.anywhere, options.front)
        adapters2 = collect(options.adapters2, options.anywhere2,
                            options.front2)
    except IOError as e:
        # Only a missing file is reported as a usage error.
        if e.errno == errno.ENOENT:
            parser.error(e)
        raise

    # Refuse to run when no adapter and no other trimming/filtering work
    # was requested.
    if not adapters and not adapters2 and not cutoffs and \
            options.cut == [] and options.cut2 == [] and \
            options.minimum_length == 0 and \
            options.maximum_length == sys.maxsize and \
            quality_filename is None and \
            options.max_n == -1:
        parser.error("You need to provide at least one adapter sequence.")

    try:
        reader = seqio.open(input_filename,
                            file2=input_paired_filename,
                            qualfile=quality_filename,
                            colorspace=options.colorspace,
                            fileformat=options.format)
    except (seqio.UnknownFileType, IOError) as e:
        parser.error(e)

    # Create the processing pipeline consisting of a list of "modifiers".
    modifiers = []
    if options.cut:
        if len(options.cut) > 2:
            parser.error("You cannot remove bases from more than two ends.")
        # Two values with the same sign give a positive product.
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            parser.error("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                modifiers.append(UnconditionalCutter(cut))

    if cutoffs:
        modifiers.append(
            QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
    if adapters:
        adapter_cutter = AdapterCutter(adapters, options.times,
                                       options.wildcard_file,
                                       options.info_file, rest_writer,
                                       options.action)
        modifiers.append(adapter_cutter)
    else:
        adapter_cutter = None

    # Modifiers that apply to both reads of paired-end reads
    modifiers_both = []
    if options.trim_n:
        modifiers_both.append(NEndTrimmer())
    if options.length_tag:
        modifiers_both.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers_both.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers_both.append(PrefixSuffixAdder(options.prefix,
                                                options.suffix))
    if options.double_encode:
        modifiers_both.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        # NOTE(review): PrimerTrimmer itself is appended without being
        # called, unlike the other modifiers - presumably it is directly
        # usable as a modifier; confirm against its definition.
        modifiers_both.append(PrimerTrimmer)
    modifiers.extend(modifiers_both)

    # For paired-end data, create a second processing pipeline.
    # However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
    # be backwards compatible and *no modifications* are done to the second read.
    modifiers2 = []
    if paired == 'both':
        if options.cut2:
            if len(options.cut2) > 2:
                parser.error(
                    "You cannot remove bases from more than two ends.")
            if len(options.cut2) == 2 and options.cut2[0] * options.cut2[1] > 0:
                parser.error(
                    "You cannot remove bases from the same end twice.")
            for cut in options.cut2:
                if cut != 0:
                    modifiers2.append(UnconditionalCutter(cut))

        if cutoffs:
            modifiers2.append(
                QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
        if adapters2:
            adapter_cutter2 = AdapterCutter(adapters2, options.times, None,
                                            None, None, options.action)
            modifiers2.append(adapter_cutter2)
        else:
            adapter_cutter2 = None
        modifiers2.extend(modifiers_both)

    # Due to backwards compatibility, from here on logging output needs to be
    # sent to standard output instead of standard error if the -o option is used.
    if options.output:
        logger.root.handlers = []
        logging.basicConfig(level=logging.INFO,
                            format='%(message)s',
                            stream=sys.stdout)
    logger.info("This is cutadapt %s with Python %s", __version__,
                platform.python_version())
    logger.info("Command line parameters: %s", " ".join(cmdlineargs))
    logger.info(
        "Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
        len(adapters) + len(adapters2),
        's' if len(adapters) + len(adapters2) != 1 else '',
        options.error_rate * 100, {
            False: 'single-end',
            'first': 'paired-end legacy',
            'both': 'paired-end'
        }[paired])

    # time.clock() was removed in Python 3.8; time.perf_counter() is one of
    # its documented replacements and measures elapsed time.
    start_time = time.perf_counter()
    try:
        if paired:
            stats = process_paired_reads(reader, modifiers, modifiers2,
                                         writers)
        else:
            stats = process_single_reads(reader, modifiers, writers)
    except KeyboardInterrupt:
        print("Interrupted", file=sys.stderr)
        sys.exit(130)
    except IOError as e:
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except (seqio.FormatError, EOFError) as e:
        sys.exit("cutadapt: error: {0}".format(e))

    # close open files (each one exactly once; the standard streams are
    # left open)
    for f in (trimmed_outfile, untrimmed_outfile, trimmed_paired_outfile,
              untrimmed_paired_outfile, options.rest_file,
              options.wildcard_file, options.info_file, too_short_outfile,
              too_long_outfile, demultiplexer):
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    elapsed_time = time.perf_counter() - start_time
    if not options.quiet:
        stats.collect((adapters, adapters2), elapsed_time, modifiers,
                      modifiers2, writers)
        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        with redirect_standard_output(stat_file):
            print_report(stats, (adapters, adapters2))
def pipeline_from_parsed_args(options, paired, pair_filter_mode, quality_filename, is_interleaved_output):
    """
    Build the read-processing pipeline described by the parsed options.

    ``paired`` is false for single-end data. Otherwise a PairedEndPipeline
    is created with ``pair_filter_mode``, and the value ``'first'`` asks it
    to modify the first read only.

    Some attributes of ``options`` are normalised in place: ``--maq``
    switches on the colorspace-related options, ``zero_cap`` defaults to the
    colorspace setting, an action of ``'none'`` becomes None and adapter
    wildcards are disabled in colorspace.

    Raises CommandLineError if the options are inconsistent or invalid.

    Return an instance of Pipeline (SingleEndPipeline or PairedEndPipeline)
    """

    def nonzero_cuts(lengths):
        """Validate a list of cut lengths and return its non-zero entries."""
        if len(lengths) > 2:
            raise CommandLineError("You cannot remove bases from more than two ends.")
        # Two values with the same sign give a positive product.
        if len(lengths) == 2 and lengths[0] * lengths[1] > 0:
            raise CommandLineError("You cannot remove bases from the same end twice.")
        return [length for length in lengths if length != 0]

    # --- Consistency checks on input/output options ---
    if paired:
        if not is_interleaved_output:
            if not options.paired_output:
                raise CommandLineError("When paired-end trimming is enabled via -A/-G/-B/-U, "
                    "a second output file needs to be specified via -p (--paired-output).")
            if not options.output:
                raise CommandLineError("When you use -p or --paired-output, you must also "
                    "use the -o option.")
        if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
            raise CommandLineError("When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options.")
        if options.too_short_output and not options.too_short_paired_output:
            raise CommandLineError("When using --too-short-output with paired-end "
                "reads, you also need to use --too-short-paired-output")
        if options.too_long_output and not options.too_long_paired_output:
            raise CommandLineError("When using --too-long-output with paired-end "
                "reads, you also need to use --too-long-paired-output")
    else:
        if options.untrimmed_paired_output:
            raise CommandLineError("Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")
        if quality_filename is not None and options.format is not None:
            raise CommandLineError('If a pair of .fasta and .qual files is given, the -f/--format '
                'parameter cannot be used.')

    file_format = options.format
    if file_format is not None:
        if file_format.lower() not in ['fasta', 'fastq', 'sra-fastq']:
            raise CommandLineError("The input file format must be either 'fasta', 'fastq' or "
                "'sra-fastq' (not '{0}').".format(file_format))

    # --- Colorspace-related settings ---
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if not options.colorspace:
        if options.trim_primer:
            raise CommandLineError("Trimming the primer makes only sense in colorspace.")
        if options.double_encode:
            raise CommandLineError("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        raise CommandLineError("Using --anywhere with colorspace reads is currently not supported "
            "(if you think this may be useful, contact the author).")

    # --- Numeric ranges ---
    if not (0 <= options.error_rate <= 1.):
        raise CommandLineError("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        raise CommandLineError("The overlap must be at least 1.")
    if not (0 <= options.gc_content <= 100):
        raise CommandLineError("GC content must be given as percentage between 0 and 100")

    if options.action == 'none':
        options.action = None

    if options.colorspace:
        if options.match_read_wildcards:
            raise CommandLineError('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    # --- Adapters ---
    parser = AdapterParser(
        colorspace=options.colorspace,
        max_error_rate=options.error_rate,
        min_overlap=options.overlap,
        read_wildcards=options.match_read_wildcards,
        adapter_wildcards=options.match_adapter_wildcards,
        indels=options.indels)
    try:
        adapters = parser.parse_multi(options.adapters, options.anywhere, options.front)
        adapters2 = parser.parse_multi(options.adapters2, options.anywhere2, options.front2)
    except IOError as e:
        # Only a missing file is turned into a command-line error.
        if e.errno == errno.ENOENT:
            raise CommandLineError(e)
        raise
    except ValueError as e:
        raise CommandLineError(e)
    if options.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the processing pipeline.
    # If no second-read adapters were given (via -A/-G/-B/-U), we need to
    # be backwards compatible and *no modifications* are done to the second read.
    pipeline = (
        PairedEndPipeline(pair_filter_mode, modify_first_read_only=paired == 'first')
        if paired
        else SingleEndPipeline()
    )

    # Unconditional cutting comes first, separately for each read.
    if options.cut:
        for length in nonzero_cuts(options.cut):
            pipeline.add1(UnconditionalCutter(length))
    if options.cut2:
        for length in nonzero_cuts(options.cut2):
            pipeline.add2(UnconditionalCutter(length))

    if options.nextseq_trim is not None:
        pipeline.add(NextseqQualityTrimmer(options.nextseq_trim, options.quality_base))
    if options.quality_cutoff is not None:
        bounds = parse_cutoffs(options.quality_cutoff)
        pipeline.add(QualityTrimmer(bounds[0], bounds[1], options.quality_base))

    if adapters:
        pipeline.add1(AdapterCutter(adapters, options.times, options.action))
    if adapters2:
        pipeline.add2(AdapterCutter(adapters2, options.times, options.action))

    # Modifiers that apply to both reads of paired-end reads unless in legacy mode
    if options.length is not None:
        pipeline.add(Shortener(options.length))
    if options.trim_n:
        pipeline.add(NEndTrimmer())
    if options.length_tag:
        pipeline.add(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for ending in options.strip_suffix:
        pipeline.add(SuffixRemover(ending))
    if options.prefix or options.suffix:
        pipeline.add(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        pipeline.add(DoubleEncoder())
    if options.zero_cap:
        pipeline.add(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        pipeline.add(PrimerTrimmer())

    # Set filtering parameters
    # Minimum/maximum length
    for name in 'minimum_length', 'maximum_length':
        raw = getattr(options, name)
        if raw is None:
            continue
        lengths = parse_lengths(raw)
        if paired:
            # A single value applies to both reads of a pair.
            if len(lengths) == 1:
                lengths = (lengths[0], lengths[0])
        elif len(lengths) == 2:
            raise CommandLineError('Two minimum or maximum lengths given for single-end data')
        setattr(pipeline, name, lengths)

    pipeline.max_n = options.max_n
    pipeline.discard_casava = options.discard_casava
    pipeline.discard_trimmed = options.discard_trimmed
    pipeline.discard_untrimmed = options.discard_untrimmed

    return pipeline
def pipeline_from_parsed_args(args, paired, is_interleaved_output):
    """
    Translate parsed command-line arguments into a configured pipeline.

    A PairedEndPipeline is created when ``paired`` is true and a
    SingleEndPipeline otherwise. Modifiers and filtering parameters are then
    attached according to ``args``. ``args.action`` is rewritten to None
    when it is the string ``'none'``, and a warning is logged when the
    ``--format`` option was given.

    Raises CommandLineError if the arguments are inconsistent or invalid.

    Return an instance of Pipeline (SingleEndPipeline or PairedEndPipeline)
    """
    # --- Consistency checks on output options ---
    if paired:
        if not is_interleaved_output:
            if not args.paired_output:
                raise CommandLineError(
                    "When a paired-end trimming option such as -A/-G/-B/-U, "
                    "is used, a second output file needs to be specified via -p (--paired-output)."
                )
            if not args.output:
                raise CommandLineError(
                    "When you use -p or --paired-output, you must also "
                    "use the -o option.")
        if bool(args.untrimmed_output) != bool(args.untrimmed_paired_output):
            raise CommandLineError(
                "When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options."
            )
        if args.too_short_output and not args.too_short_paired_output:
            raise CommandLineError(
                "When using --too-short-output with paired-end "
                "reads, you also need to use --too-short-paired-output")
        if args.too_long_output and not args.too_long_paired_output:
            raise CommandLineError(
                "When using --too-long-output with paired-end "
                "reads, you also need to use --too-long-paired-output")
    elif args.untrimmed_paired_output:
        raise CommandLineError(
            "Option --untrimmed-paired-output can only be used when "
            "trimming paired-end reads (with option -p).")

    if args.format is not None:
        logger.warning(
            "Option --format is deprecated and ignored because the input file format is "
            "always auto-detected")

    # --- Numeric ranges ---
    if not (0 <= args.error_rate < 1.):
        raise CommandLineError(
            "The maximum error rate must be at least 0 and less than 1.")
    if args.overlap < 1:
        raise CommandLineError("The overlap must be at least 1.")
    if not (0 <= args.gc_content <= 100):
        raise CommandLineError(
            "GC content must be given as percentage between 0 and 100")

    if args.action == 'none':
        args.action = None

    # --- Adapters ---
    parser = AdapterParser(
        max_error_rate=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )
    try:
        adapters = parser.parse_multi(args.adapters, args.anywhere,
                                      args.front)
        adapters2 = parser.parse_multi(args.adapters2, args.anywhere2,
                                       args.front2)
    except IOError as e:
        # Only a missing file is turned into a command-line error.
        if e.errno == errno.ENOENT:
            raise CommandLineError(e)
        raise
    except ValueError as e:
        raise CommandLineError(e)
    if args.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the processing pipeline
    if not paired:
        pipeline = SingleEndPipeline()
    else:
        filter_mode = args.pair_filter if args.pair_filter is not None else 'any'
        pipeline = PairedEndPipeline(filter_mode)

    # When adapters are being trimmed only in R1 or R2, override the pair filter mode
    # as using the default of 'any' would regard all read pairs as untrimmed.
    wants_untrimmed_handling = (args.discard_untrimmed
                                or args.untrimmed_output
                                or args.untrimmed_paired_output)
    if paired and not (adapters2 and adapters) and wants_untrimmed_handling:
        pipeline.override_untrimmed_pair_filter = True

    # Unconditional cutting: args.cut applies to R1, args.cut2 to R2.
    for is_second_read, lengths in ((False, args.cut), (True, args.cut2)):
        # lengths is a list
        if not lengths:
            continue
        if len(lengths) > 2:
            raise CommandLineError(
                "You cannot remove bases from more than two ends.")
        # Two values with the same sign give a positive product.
        if len(lengths) == 2 and lengths[0] * lengths[1] > 0:
            raise CommandLineError(
                "You cannot remove bases from the same end twice.")
        for length in lengths:
            if length == 0:
                continue
            if is_second_read:
                assert isinstance(pipeline, PairedEndPipeline)
                pipeline.add(None, UnconditionalCutter(length))
            elif paired:
                pipeline.add(UnconditionalCutter(length), None)
            else:
                pipeline.add(UnconditionalCutter(length))

    # From here on, this callable adds a modifier for both reads when
    # paired and for the single read otherwise.
    add_modifier = pipeline.add_both if paired else pipeline.add

    if args.nextseq_trim is not None:
        add_modifier(
            NextseqQualityTrimmer(args.nextseq_trim, args.quality_base))
    if args.quality_cutoff is not None:
        bounds = parse_cutoffs(args.quality_cutoff)
        add_modifier(QualityTrimmer(bounds[0], bounds[1], args.quality_base))

    if not args.pair_adapters:
        cutter1 = AdapterCutter(adapters, args.times, args.action) if adapters else None
        cutter2 = AdapterCutter(adapters2, args.times, args.action) if adapters2 else None
        if paired:
            if cutter1 or cutter2:
                pipeline.add(cutter1, cutter2)
        elif cutter1:
            pipeline.add(cutter1)
    else:
        if not paired:
            raise CommandLineError(
                "Option --pair-adapters can only be used when trimming "
                "paired-end reads")
        if args.times != 1:
            raise CommandLineError(
                "--pair-adapters cannot be used with --times")
        try:
            paired_cutter = PairedAdapterCutter(adapters, adapters2, args.action)
        except PairedAdapterCutterError as e:
            raise CommandLineError("--pair-adapters: " + str(e))
        pipeline.add_paired_modifier(paired_cutter)

    # Remaining modifiers that apply to both reads of paired-end reads
    if args.length is not None:
        add_modifier(Shortener(args.length))
    if args.trim_n:
        add_modifier(NEndTrimmer())
    if args.length_tag:
        add_modifier(LengthTagModifier(args.length_tag))
    for ending in args.strip_suffix:
        add_modifier(SuffixRemover(ending))
    if args.prefix or args.suffix:
        add_modifier(PrefixSuffixAdder(args.prefix, args.suffix))
    if args.zero_cap:
        add_modifier(ZeroCapper(quality_base=args.quality_base))

    # Set filtering parameters
    # Minimum/maximum length
    for name in 'minimum_length', 'maximum_length':
        raw = getattr(args, name)
        if raw is None:
            continue
        lengths = parse_lengths(raw)
        if paired:
            # A single value applies to both reads of a pair.
            if len(lengths) == 1:
                lengths = (lengths[0], lengths[0])
        elif len(lengths) == 2:
            raise CommandLineError(
                'Two minimum or maximum lengths given for single-end data')
        setattr(pipeline, name, lengths)

    pipeline.max_n = args.max_n
    pipeline.discard_casava = args.discard_casava
    pipeline.discard_trimmed = args.discard_trimmed
    pipeline.discard_untrimmed = args.discard_untrimmed

    return pipeline
def main(cmdlineargs=None, default_outfile=sys.stdout):
    """
    Main function that evaluates command-line parameters and iterates
    over all reads.

    cmdlineargs -- list of command-line arguments; ``sys.argv[1:]`` is used
        when this is None.
    default_outfile -- the file to which trimmed reads are sent if the ``-o``
        parameter is not used.

    Invalid parameter combinations are reported through ``parser.error()``.
    While the pipeline runs, a KeyboardInterrupt exits with status 130, an
    IOError with errno EPIPE exits with status 1, and seqio.FormatError or
    EOFError exit with a "cutadapt: error: ..." message.
    """
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)
    # Setup logging only if there are not already any handlers (can happen when
    # this function is being called externally such as from unit tests)
    if not logging.root.handlers:
        setup_logging(stdout=bool(options.output), quiet=options.quiet)

    if len(args) == 0:
        parser.error(
            "At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")
    input_filename = args[0]
    if input_filename.endswith('.qual'):
        parser.error(
            "If a .qual file is given, it must be the second argument.")

    # Find out which 'mode' we need to use.
    # Default: single-read trimming (neither -p nor -A/-G/-B/-U/--interleaved given)
    paired = False
    if options.paired_output:
        # Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
        # This exists for backwards compatibility ('legacy mode').
        paired = 'first'
    # Any of these options switch off legacy mode
    if (options.adapters2 or options.front2 or options.anywhere2
            or options.cut2 or options.interleaved or options.pair_filter
            or options.too_short_paired_output
            or options.too_long_paired_output):
        # Full paired-end trimming when both -p and -A/-G/-B/-U given
        # Read modifications (such as quality trimming) are applied also to second read.
        paired = 'both'

    if paired and len(args) == 1 and not options.interleaved:
        parser.error("When paired-end trimming is enabled via -A/-G/-B/-U/"
                     "--interleaved or -p, two input files are required.")
    if not paired:
        if options.untrimmed_paired_output:
            parser.error(
                "Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")

    interleaved_input = False
    interleaved_output = False
    if options.interleaved:
        interleaved_input = len(args) == 1
        interleaved_output = not options.paired_output
        if not interleaved_input and not interleaved_output:
            parser.error(
                "When --interleaved is used, you cannot provide both two input files and two output files"
            )

    # Assign input_paired_filename and quality_filename
    input_paired_filename = None
    quality_filename = None
    if paired:
        if not interleaved_input:
            input_paired_filename = args[1]
        if not interleaved_output:
            if not options.paired_output:
                parser.error(
                    "When paired-end trimming is enabled via -A/-G/-B/-U, "
                    "a second output file needs to be specified via -p (--paired-output)."
                )
            if not options.output:
                parser.error(
                    "When you use -p or --paired-output, you must also "
                    "use the -o option.")

        if bool(options.untrimmed_output) != bool(
                options.untrimmed_paired_output):
            parser.error(
                "When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options."
            )
        if options.too_short_output and not options.too_short_paired_output:
            parser.error(
                "When using --too-short-output with paired-end "
                "reads, you also need to use --too-short-paired-output")
        if options.too_long_output and not options.too_long_paired_output:
            parser.error(
                "When using --too-long-output with paired-end "
                "reads, you also need to use --too-long-paired-output")
    elif len(args) == 2:
        # Single-end mode with two positional arguments: the second one is
        # taken as the quality file.
        quality_filename = args[1]
        if options.format is not None:
            parser.error(
                "If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used."
            )

    if options.format is not None and options.format.lower() not in [
            'fasta', 'fastq', 'sra-fastq'
    ]:
        parser.error(
            "The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # Open input file(s)
    try:
        reader = seqio.open(input_filename,
                            file2=input_paired_filename,
                            qualfile=quality_filename,
                            colorspace=options.colorspace,
                            fileformat=options.format,
                            interleaved=interleaved_input)
    except (seqio.UnknownFileType, IOError) as e:
        parser.error(e)

    # Parse -q: either "CUTOFF" (stored as [0, CUTOFF]) or "CUTOFF1,CUTOFF2"
    if options.quality_cutoff is not None:
        cutoffs = options.quality_cutoff.split(',')
        if len(cutoffs) == 1:
            try:
                cutoffs = [0, int(cutoffs[0])]
            except ValueError as e:
                parser.error(
                    "Quality cutoff value not recognized: {0}".format(e))
        elif len(cutoffs) == 2:
            try:
                cutoffs = [int(cutoffs[0]), int(cutoffs[1])]
            except ValueError as e:
                parser.error(
                    "Quality cutoff value not recognized: {0}".format(e))
        else:
            parser.error(
                "Expected one value or two values separated by comma for the quality cutoff"
            )
    else:
        cutoffs = None

    open_writer = functools.partial(seqio.open,
                                    mode='w',
                                    qualities=reader.delivers_qualities,
                                    colorspace=options.colorspace)

    if options.pair_filter is None:
        options.pair_filter = 'any'
    min_affected = 2 if options.pair_filter == 'both' else 1
    if not paired:
        filter_wrapper = Redirector
    elif paired == 'first':
        filter_wrapper = LegacyPairedRedirector
    elif paired == 'both':
        filter_wrapper = functools.partial(PairedRedirector,
                                           min_affected=min_affected)
    filters = []
    # TODO open_files = []
    too_short_writer = None  # too short reads go here
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_writer = open_writer(options.too_short_output,
                                           options.too_short_paired_output)
        filters.append(
            filter_wrapper(too_short_writer,
                           TooShortReadFilter(options.minimum_length)))
    too_long_writer = None  # too long reads go here
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_writer = open_writer(options.too_long_output,
                                          options.too_long_paired_output)
        filters.append(
            filter_wrapper(too_long_writer,
                           TooLongReadFilter(options.maximum_length)))

    if options.max_n != -1:
        filters.append(filter_wrapper(None, NContentFilter(options.max_n)))

    if int(options.discard_trimmed) + int(options.discard_untrimmed) + int(
            options.untrimmed_output is not None) > 1:
        parser.error(
            "Only one of the --discard-trimmed, --discard-untrimmed "
            "and --untrimmed-output options can be used at the same time.")
    demultiplexer = None
    untrimmed_writer = None
    writer = None
    if options.output is not None and '{name}' in options.output:
        if options.discard_trimmed:
            parser.error("Do not use --discard-trimmed when demultiplexing.")
        if paired:
            parser.error(
                "Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.replace('{name}', 'unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output,
                                      untrimmed,
                                      qualities=reader.delivers_qualities,
                                      colorspace=options.colorspace)
        filters.append(demultiplexer)
    else:
        # Set up the remaining filters to deal with --discard-trimmed,
        # --discard-untrimmed and --untrimmed-output. These options
        # are mutually exclusive in order to avoid brain damage.
        if options.discard_trimmed:
            filters.append(filter_wrapper(None, DiscardTrimmedFilter()))
        elif options.discard_untrimmed:
            filters.append(filter_wrapper(None, DiscardUntrimmedFilter()))
        elif options.untrimmed_output:
            untrimmed_writer = open_writer(options.untrimmed_output,
                                           options.untrimmed_paired_output)
            filters.append(
                filter_wrapper(untrimmed_writer, DiscardUntrimmedFilter()))

        # Finally, figure out where the reads that passed all the previous
        # filters should go.
        if options.output is not None:
            writer = open_writer(options.output,
                                 options.paired_output,
                                 interleaved=interleaved_output)
        else:
            writer = open_writer(default_outfile,
                                 interleaved=interleaved_output)
        if not paired:
            filters.append(NoFilter(writer))
        else:
            filters.append(PairedNoFilter(writer))

    # NOTE(review): the reader and open_writer above were created with the
    # value options.colorspace had before --maq switches it on here --
    # confirm that this ordering is intended.
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        parser.error(
            "Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author)."
        )
    if not (0 <= options.error_rate <= 1.):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # The file-name options are replaced by the opened file objects
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    if options.colorspace:
        if options.match_read_wildcards:
            parser.error('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    adapter_parser = AdapterParser(
        colorspace=options.colorspace,
        max_error_rate=options.error_rate,
        min_overlap=options.overlap,
        read_wildcards=options.match_read_wildcards,
        adapter_wildcards=options.match_adapter_wildcards,
        indels=options.indels)

    try:
        adapters = adapter_parser.parse_multi(options.adapters,
                                              options.anywhere, options.front)
        adapters2 = adapter_parser.parse_multi(options.adapters2,
                                               options.anywhere2,
                                               options.front2)
    except IOError as e:
        # Only a missing file is reported as a usage error
        if e.errno == errno.ENOENT:
            parser.error(e)
        raise
    except ValueError as e:
        parser.error(e)
    if options.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the single-end processing pipeline (a list of "modifiers")
    modifiers = []
    if options.cut:
        if len(options.cut) > 2:
            parser.error("You cannot remove bases from more than two ends.")
        # Two values with the same sign would cut the same end twice
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            parser.error("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                modifiers.append(UnconditionalCutter(cut))

    if options.nextseq_trim is not None:
        modifiers.append(
            NextseqQualityTrimmer(options.nextseq_trim, options.quality_base))

    if cutoffs:
        modifiers.append(
            QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
    if adapters:
        adapter_cutter = AdapterCutter(adapters, options.times,
                                       options.wildcard_file,
                                       options.info_file, rest_writer,
                                       options.action)
        modifiers.append(adapter_cutter)

    # Modifiers that apply to both reads of paired-end reads unless in legacy mode
    modifiers_both = []
    if options.length is not None:
        modifiers_both.append(Shortener(options.length))
    if options.trim_n:
        modifiers_both.append(NEndTrimmer())
    if options.length_tag:
        modifiers_both.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers_both.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers_both.append(PrefixSuffixAdder(options.prefix,
                                                options.suffix))
    if options.double_encode:
        modifiers_both.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        modifiers_both.append(PrimerTrimmer)
    modifiers.extend(modifiers_both)

    # For paired-end data, create a second processing pipeline.
    # However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
    # be backwards compatible and *no modifications* are done to the second read.
    modifiers2 = []
    if paired == 'both':
        if options.cut2:
            if len(options.cut2) > 2:
                parser.error(
                    "You cannot remove bases from more than two ends.")
            if len(options.cut2
                   ) == 2 and options.cut2[0] * options.cut2[1] > 0:
                parser.error(
                    "You cannot remove bases from the same end twice.")
            for cut in options.cut2:
                if cut != 0:
                    modifiers2.append(UnconditionalCutter(cut))

        if cutoffs:
            modifiers2.append(
                QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
        if adapters2:
            adapter_cutter2 = AdapterCutter(adapters2, options.times, None,
                                            None, None, options.action)
            modifiers2.append(adapter_cutter2)
        else:
            adapter_cutter2 = None
        modifiers2.extend(modifiers_both)

    if paired:
        pipeline = PairedEndPipeline(reader, modifiers, modifiers2, filters)
    else:
        pipeline = SingleEndPipeline(reader, modifiers, filters)

    logger.info("This is cutadapt %s with Python %s", __version__,
                platform.python_version())
    logger.info("Command line parameters: %s", " ".join(cmdlineargs))
    logger.info(
        "Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
        len(adapters) + len(adapters2),
        's' if len(adapters) + len(adapters2) != 1 else '',
        options.error_rate * 100, {
            False: 'single-end',
            'first': 'paired-end legacy',
            'both': 'paired-end'
        }[paired])

    if paired == 'first' and (modifiers_both or cutoffs):
        logger.warning('\n'.join(
            textwrap.wrap(
                'WARNING: Requested read '
                'modifications are applied only to the first '
                'read since backwards compatibility mode is enabled. '
                'To modify both reads, also use any of the -A/-B/-G/-U options. '
                'Use a dummy adapter sequence when necessary: -A XXX')))

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement intended for measuring elapsed intervals.
    start_time = time.perf_counter()
    try:
        stats = pipeline.run()
    except KeyboardInterrupt:
        print("Interrupted", file=sys.stderr)
        sys.exit(130)
    except IOError as e:
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except (seqio.FormatError, EOFError) as e:
        sys.exit("cutadapt: error: {0}".format(e))

    # close open files (each one exactly once; the standard streams are
    # left open)
    for f in [
            writer, untrimmed_writer, options.rest_file,
            options.wildcard_file, options.info_file, too_short_writer,
            too_long_writer, demultiplexer
    ]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    elapsed_time = time.perf_counter() - start_time
    if not options.quiet:
        stats.collect((adapters, adapters2), elapsed_time, modifiers,
                      modifiers2, filters)
        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        with redirect_standard_output(stat_file):
            print_report(stats, (adapters, adapters2))
def pipeline_from_parsed_args(options, args, default_outfile):
    """
    Set up a processing pipeline from parsed command-line options.

    The combination of options and positional arguments is validated, the
    input reader and all requested output files are opened, and the lists of
    read modifiers and filters are built.

    options -- parsed option values. Several attributes are modified in
        place, for example pair_filter and zero_cap receive defaults, and
        rest_file, info_file and wildcard_file are replaced by opened files.
    args -- positional arguments: one or two input file names.
    default_outfile -- file to which trimmed reads are written when
        options.output is None.

    Returns a SingleEndPipeline or PairedEndPipeline. Every opened file is
    passed to its register_file_to_close() method.

    If there are any problems parsing the arguments, a CommandlineError is
    raised.
    """
    if len(args) == 0:
        raise CommandlineError("At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        raise CommandlineError("Too many parameters.")
    input_filename = args[0]
    if input_filename.endswith('.qual'):
        raise CommandlineError("If a .qual file is given, it must be the second argument.")

    # Find out which 'mode' we need to use.
    # Default: single-read trimming (neither -p nor -A/-G/-B/-U/--interleaved given)
    paired = False
    if options.paired_output:
        # Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
        # This exists for backwards compatibility ('legacy mode').
        paired = 'first'
    # Any of these options switch off legacy mode
    if (options.adapters2 or options.front2 or options.anywhere2 or
            options.cut2 or options.interleaved or options.pair_filter or
            options.too_short_paired_output or options.too_long_paired_output):
        # Full paired-end trimming when both -p and -A/-G/-B/-U given
        # Read modifications (such as quality trimming) are applied also to second read.
        paired = 'both'

    if paired and len(args) == 1 and not options.interleaved:
        raise CommandlineError("When paired-end trimming is enabled via -A/-G/-B/-U/"
            "--interleaved or -p, two input files are required.")
    if not paired:
        if options.untrimmed_paired_output:
            raise CommandlineError("Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")

    # With --interleaved, the input is interleaved when only one input file
    # was given and the output is interleaved when -p was not given.
    interleaved_input = False
    interleaved_output = False
    if options.interleaved:
        interleaved_input = len(args) == 1
        interleaved_output = not options.paired_output
        if not interleaved_input and not interleaved_output:
            raise CommandlineError("When --interleaved is used, you cannot provide both two input files and two output files")

    # Assign input_paired_filename and quality_filename
    input_paired_filename = None
    quality_filename = None
    if paired:
        if not interleaved_input:
            input_paired_filename = args[1]
        if not interleaved_output:
            if not options.paired_output:
                raise CommandlineError("When paired-end trimming is enabled via -A/-G/-B/-U, "
                    "a second output file needs to be specified via -p (--paired-output).")
            if not options.output:
                raise CommandlineError("When you use -p or --paired-output, you must also "
                    "use the -o option.")

        if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
            raise CommandlineError("When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options.")
        if options.too_short_output and not options.too_short_paired_output:
            raise CommandlineError("When using --too-short-output with paired-end "
                "reads, you also need to use --too-short-paired-output")
        if options.too_long_output and not options.too_long_paired_output:
            raise CommandlineError("When using --too-long-output with paired-end "
                "reads, you also need to use --too-long-paired-output")
    elif len(args) == 2:
        # Single-end mode with two positional arguments: the second one is
        # taken as the quality file.
        quality_filename = args[1]
        if options.format is not None:
            raise CommandlineError("If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")

    if options.format is not None and options.format.lower() not in ['fasta', 'fastq', 'sra-fastq']:
        raise CommandlineError("The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # Open input file(s)
    try:
        reader = seqio.open(input_filename, file2=input_paired_filename,
            qualfile=quality_filename, colorspace=options.colorspace,
            fileformat=options.format, interleaved=interleaved_input)
    except (seqio.UnknownFileType, IOError) as e:
        raise CommandlineError(e)

    # Parse the quality cutoff: a single value N is stored as [0, N], two
    # comma-separated values are stored as given.
    if options.quality_cutoff is not None:
        cutoffs = options.quality_cutoff.split(',')
        if len(cutoffs) == 1:
            try:
                cutoffs = [0, int(cutoffs[0])]
            except ValueError as e:
                raise CommandlineError("Quality cutoff value not recognized: {0}".format(e))
        elif len(cutoffs) == 2:
            try:
                cutoffs = [int(cutoffs[0]), int(cutoffs[1])]
            except ValueError as e:
                raise CommandlineError("Quality cutoff value not recognized: {0}".format(e))
        else:
            raise CommandlineError("Expected one value or two values separated by comma for the quality cutoff")
    else:
        cutoffs = None

    # All output files are opened through seqio.open with these shared settings
    open_writer = functools.partial(seqio.open, mode='w',
        qualities=reader.delivers_qualities, colorspace=options.colorspace)

    if options.pair_filter is None:
        options.pair_filter = 'any'
    min_affected = 2 if options.pair_filter == 'both' else 1
    # Choose the wrapper class used for each filter according to the mode
    if not paired:
        filter_wrapper = Redirector
    elif paired == 'first':
        filter_wrapper = LegacyPairedRedirector
    elif paired == 'both':
        filter_wrapper = functools.partial(PairedRedirector, min_affected=min_affected)
    filters = []
    # TODO open_files = []
    too_short_writer = None  # too short reads go here
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_writer = open_writer(options.too_short_output, options.too_short_paired_output)
        # The filter is added even when no output file was requested; the
        # writer is then None (presumably such reads are dropped -- verify
        # against the redirector classes).
        filters.append(filter_wrapper(too_short_writer, TooShortReadFilter(options.minimum_length)))
    too_long_writer = None  # too long reads go here
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_writer = open_writer(options.too_long_output, options.too_long_paired_output)
        filters.append(filter_wrapper(too_long_writer, TooLongReadFilter(options.maximum_length)))

    if options.max_n != -1:
        filters.append(filter_wrapper(None, NContentFilter(options.max_n)))

    if int(options.discard_trimmed) + int(options.discard_untrimmed) + int(options.untrimmed_output is not None) > 1:
        raise CommandlineError("Only one of the --discard-trimmed, --discard-untrimmed "
            "and --untrimmed-output options can be used at the same time.")
    demultiplexer = None
    untrimmed_writer = None
    writer = None
    # A '{name}' placeholder in the output file name selects demultiplexing
    if options.output is not None and '{name}' in options.output:
        if options.discard_trimmed:
            raise CommandlineError("Do not use --discard-trimmed when demultiplexing.")
        if paired:
            raise CommandlineError("Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.replace('{name}', 'unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output, untrimmed,
            qualities=reader.delivers_qualities, colorspace=options.colorspace)
        filters.append(demultiplexer)
    else:
        # Set up the remaining filters to deal with --discard-trimmed,
        # --discard-untrimmed and --untrimmed-output. These options
        # are mutually exclusive in order to avoid brain damage.
        if options.discard_trimmed:
            filters.append(filter_wrapper(None, DiscardTrimmedFilter()))
        elif options.discard_untrimmed:
            filters.append(filter_wrapper(None, DiscardUntrimmedFilter()))
        elif options.untrimmed_output:
            untrimmed_writer = open_writer(options.untrimmed_output,
                options.untrimmed_paired_output)
            filters.append(filter_wrapper(untrimmed_writer, DiscardUntrimmedFilter()))

        # Finally, figure out where the reads that passed all the previous
        # filters should go.
        if options.output is not None:
            writer = open_writer(options.output, options.paired_output, interleaved=interleaved_output)
        else:
            writer = open_writer(default_outfile, interleaved=interleaved_output)
        if not paired:
            filters.append(NoFilter(writer))
        else:
            filters.append(PairedNoFilter(writer))

    # NOTE(review): the reader and open_writer above were created with the
    # value options.colorspace had before --maq switches it on here --
    # confirm that this ordering is intended.
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        raise CommandlineError("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        raise CommandlineError("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        raise CommandlineError("Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author).")
    if not (0 <= options.error_rate <= 1.):
        raise CommandlineError("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        raise CommandlineError("The overlap must be at least 1.")

    # The file-name options are replaced by the opened file objects
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    if options.colorspace:
        if options.match_read_wildcards:
            raise CommandlineError('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    adapter_parser = AdapterParser(
        colorspace=options.colorspace,
        max_error_rate=options.error_rate,
        min_overlap=options.overlap,
        read_wildcards=options.match_read_wildcards,
        adapter_wildcards=options.match_adapter_wildcards,
        indels=options.indels)

    try:
        adapters = adapter_parser.parse_multi(options.adapters, options.anywhere, options.front)
        adapters2 = adapter_parser.parse_multi(options.adapters2, options.anywhere2, options.front2)
    except IOError as e:
        # Only a missing file is reported as a command-line error; any other
        # IOError is re-raised unchanged.
        if e.errno == errno.ENOENT:
            raise CommandlineError(e)
        raise
    except ValueError as e:
        raise CommandlineError(e)
    if options.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the single-end processing pipeline (a list of "modifiers")
    modifiers = []
    if options.cut:
        if len(options.cut) > 2:
            raise CommandlineError("You cannot remove bases from more than two ends.")
        # Two values with the same sign have a positive product
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            raise CommandlineError("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                modifiers.append(UnconditionalCutter(cut))

    if options.nextseq_trim is not None:
        modifiers.append(NextseqQualityTrimmer(options.nextseq_trim, options.quality_base))

    if cutoffs:
        modifiers.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
    if adapters:
        adapter_cutter = AdapterCutter(adapters, options.times,
            options.wildcard_file, options.info_file,
            rest_writer, options.action)
        modifiers.append(adapter_cutter)

    # Modifiers that apply to both reads of paired-end reads unless in legacy mode
    modifiers_both = []
    if options.length is not None:
        modifiers_both.append(Shortener(options.length))
    if options.trim_n:
        modifiers_both.append(NEndTrimmer())
    if options.length_tag:
        modifiers_both.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers_both.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers_both.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers_both.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        # NOTE(review): PrimerTrimmer is appended without being called,
        # unlike the other modifiers -- confirm that this is intended.
        modifiers_both.append(PrimerTrimmer)
    modifiers.extend(modifiers_both)

    # For paired-end data, create a second processing pipeline.
    # However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
    # be backwards compatible and *no modifications* are done to the second read.
    modifiers2 = []
    if paired == 'both':
        if options.cut2:
            if len(options.cut2) > 2:
                raise CommandlineError("You cannot remove bases from more than two ends.")
            if len(options.cut2) == 2 and options.cut2[0] * options.cut2[1] > 0:
                raise CommandlineError("You cannot remove bases from the same end twice.")
            for cut in options.cut2:
                if cut != 0:
                    modifiers2.append(UnconditionalCutter(cut))

        if cutoffs:
            modifiers2.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
        if adapters2:
            # The second-read cutter gets None in place of the wildcard
            # file, info file and rest writer passed for the first read.
            adapter_cutter2 = AdapterCutter(adapters2, options.times,
                None, None, None, options.action)
            modifiers2.append(adapter_cutter2)
        modifiers2.extend(modifiers_both)

    if paired:
        pipeline = PairedEndPipeline(adapters, adapters2, reader, modifiers, modifiers2, filters)
    else:
        pipeline = SingleEndPipeline(adapters, adapters2, reader, modifiers, filters)

    # TODO the following should be done some other way
    pipeline.paired = paired
    pipeline.error_rate = options.error_rate
    # True when modifications were requested while in legacy mode
    pipeline.should_print_warning = paired == 'first' and (modifiers_both or cutoffs)
    # NOTE(review): options.info_file appears twice in this list.
    for f in [writer, untrimmed_writer,
            options.rest_file, options.wildcard_file,
            options.info_file, too_short_writer, too_long_writer,
            options.info_file, demultiplexer]:
        pipeline.register_file_to_close(f)
    return pipeline
def pipeline_from_parsed_args(args, paired, is_interleaved_output):
    """
    Build the read-processing pipeline described by the parsed arguments.

    The arguments are first validated with check_arguments(). Adapters are
    then parsed and a SingleEndPipeline or PairedEndPipeline is created,
    depending on *paired*. Modifiers are added in this order: unconditional
    cutters, NextSeq and quality trimming, adapter cutting, and finally the
    modifiers shared by both reads. Last, the filtering attributes are
    copied from *args* onto the pipeline.

    Note that args.action is rewritten in place: 'none' becomes None.

    Raises CommandLineError if the arguments cannot be turned into a
    pipeline.

    Returns the configured pipeline instance.
    """
    check_arguments(args, paired, is_interleaved_output)
    if args.action == 'none':
        args.action = None

    parser = AdapterParser(
        max_error_rate=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )
    try:
        adapters = parser.parse_multi(args.adapters, args.anywhere, args.front)
        adapters2 = parser.parse_multi(args.adapters2, args.anywhere2, args.front2)
    except IOError as err:
        # Only a missing file counts as a command-line problem
        if err.errno != errno.ENOENT:
            raise
        raise CommandLineError(err)
    except ValueError as err:
        raise CommandLineError(err)

    if args.debug:
        for parsed_adapter in adapters + adapters2:
            parsed_adapter.enable_debug()

    # Create the processing pipeline
    if not paired:
        pipeline = SingleEndPipeline()
    else:
        filter_mode = args.pair_filter
        if filter_mode is None:
            filter_mode = 'any'
        pipeline = PairedEndPipeline(filter_mode)

    # If only one of R1/R2 has adapters, the default pair filter mode 'any'
    # would regard every read pair as untrimmed, so it must be overridden.
    if paired and not (adapters and adapters2):
        if args.discard_untrimmed or args.untrimmed_output or args.untrimmed_paired_output:
            pipeline.override_untrimmed_pair_filter = True

    # Unconditional cutting: index 0 is R1 (args.cut), index 1 is R2 (args.cut2)
    for read_index, lengths_to_cut in enumerate((args.cut, args.cut2)):
        if lengths_to_cut:
            if len(lengths_to_cut) > 2:
                raise CommandLineError("You cannot remove bases from more than two ends.")
            if len(lengths_to_cut) == 2 and lengths_to_cut[0] * lengths_to_cut[1] > 0:
                raise CommandLineError("You cannot remove bases from the same end twice.")
            for length in lengths_to_cut:
                if length != 0:
                    if read_index != 0:
                        assert isinstance(pipeline, PairedEndPipeline)
                        pipeline.add(None, UnconditionalCutter(length))
                    elif paired:
                        pipeline.add(UnconditionalCutter(length), None)
                    else:
                        pipeline.add(UnconditionalCutter(length))

    # From here on, modifiers go to both reads when working on pairs
    if paired:
        pipeline_add = pipeline.add_both
    else:
        pipeline_add = pipeline.add

    if args.nextseq_trim is not None:
        nextseq_trimmer = NextseqQualityTrimmer(args.nextseq_trim, args.quality_base)
        pipeline_add(nextseq_trimmer)
    if args.quality_cutoff is not None:
        parsed_cutoffs = parse_cutoffs(args.quality_cutoff)
        quality_trimmer = QualityTrimmer(
            parsed_cutoffs[0], parsed_cutoffs[1], args.quality_base)
        pipeline_add(quality_trimmer)

    if not args.pair_adapters:
        # Independent adapter trimming of R1 and R2
        r1_cutter = AdapterCutter(adapters, args.times, args.action) if adapters else None
        r2_cutter = AdapterCutter(adapters2, args.times, args.action) if adapters2 else None
        if not paired:
            if r1_cutter:
                pipeline.add(r1_cutter)
        elif r1_cutter or r2_cutter:
            pipeline.add(r1_cutter, r2_cutter)
    else:
        if not paired:
            raise CommandLineError("Option --pair-adapters can only be used when trimming "
                "paired-end reads")
        if args.times != 1:
            raise CommandLineError("--pair-adapters cannot be used with --times")
        try:
            paired_cutter = PairedAdapterCutter(adapters, adapters2, args.action)
        except PairedAdapterCutterError as err:
            raise CommandLineError("--pair-adapters: " + str(err))
        pipeline.add_paired_modifier(paired_cutter)

    def shared_modifiers():
        # Lazily yields the remaining modifiers that apply to both reads of
        # paired-end reads; each one is created right before it is added.
        if args.length is not None:
            yield Shortener(args.length)
        if args.trim_n:
            yield NEndTrimmer()
        if args.length_tag:
            yield LengthTagModifier(args.length_tag)
        for suffix in args.strip_suffix:
            yield SuffixRemover(suffix)
        if args.prefix or args.suffix:
            yield PrefixSuffixAdder(args.prefix, args.suffix)
        if args.zero_cap:
            yield ZeroCapper(quality_base=args.quality_base)

    for modifier in shared_modifiers():
        pipeline_add(modifier)

    # Set filtering parameters
    # Minimum/maximum length
    for length_option in ('minimum_length', 'maximum_length'):
        raw_value = getattr(args, length_option)
        if raw_value is None:
            continue
        limits = parse_lengths(raw_value)
        if paired:
            # A single value applies to both reads
            if len(limits) == 1:
                limits = (limits[0], limits[0])
        elif len(limits) == 2:
            raise CommandLineError('Two minimum or maximum lengths given for single-end data')
        setattr(pipeline, length_option, limits)

    # Remaining filter settings are copied over unchanged
    for option_name in ('max_n', 'discard_casava', 'discard_trimmed', 'discard_untrimmed'):
        setattr(pipeline, option_name, getattr(args, option_name))

    return pipeline