def test_ncontentfilter_paired():
    """Check NContentFilter on read pairs, with and without ``check_second``.

    Each case is ``(first sequence, second sequence, count, expected result)``.
    """
    cases = (
        ('AAA', 'AAA', 0, KEEP),
        ('AAAN', 'AAA', 0, DISCARD),
        ('AAA', 'AANA', 0, DISCARD),
        ('ANAA', 'AANA', 1, KEEP),
    )
    for first_bases, second_bases, max_n, outcome in cases:
        first_only = NContentFilter(count=max_n, check_second=False)
        both_reads = NContentFilter(count=max_n, check_second=True)
        read1 = Sequence(
            'read1', first_bases, qualities='#' * len(first_bases))
        read2 = Sequence(
            'read1', second_bases, qualities='#' * len(second_bases))

        # Without check_second, passing the second read must not change
        # the result obtained for the first read alone.
        paired_result = first_only(read1, read2)
        single_result = first_only(read1)
        assert paired_result == single_result

        # discard entire pair if one of the reads fulfills criteria
        assert both_reads(read1, read2) == outcome
def test_ncontentfilter_paired(seq1, seq2, count, expected):
    """Check NContentFilter wrapped in PairedRedirector for one read pair.

    The arguments are presumably supplied by a parametrize decorator that is
    not part of this block -- confirm against the surrounding file.
    """
    n_filter = NContentFilter(count=count)
    first_mode = PairedRedirector(
        None, n_filter, n_filter, pair_filter_mode='first')
    any_mode = PairedRedirector(
        None, n_filter, n_filter, pair_filter_mode='any')
    read1 = Sequence('read1', seq1, qualities='#' * len(seq1))
    read2 = Sequence('read1', seq2, qualities='#' * len(seq2))

    # In 'first' mode the pair result must equal what the plain filter
    # returns for the first read on its own.
    pair_result = first_mode(read1, read2, [], [])
    single_result = n_filter(read1, [])
    assert pair_result == single_result

    # discard entire pair if one of the reads fulfills criteria
    assert any_mode(read1, read2, [], []) == expected
def test_ncontentfilter():
    """Check NContentFilter on single reads for several ``count`` values."""
    # Each case is (sequence, count, expected filter result); the expected
    # result is one of the KEEP / DISCARD constants.
    cases = (
        ('AAA', 0, KEEP),
        ('AAA', 1, KEEP),
        ('AAACCTTGGN', 1, KEEP),
        ('AAACNNNCTTGGN', 0.5, KEEP),
        ('NNNNNN', 1, DISCARD),
        ('ANAAAA', 1 / 6, KEEP),
        ('ANAAAA', 0, DISCARD),
    )
    for bases, max_n, outcome in cases:
        n_filter = NContentFilter(count=max_n)
        read = Sequence('read1', bases, qualities='#' * len(bases))
        result = n_filter(read)
        assert result == outcome
def test_ncontentfilter_paired():
    """Check NContentFilter behind the legacy and the regular paired redirector.

    Each case is ``(first sequence, second sequence, count, expected result)``.
    """
    cases = (
        ('AAA', 'AAA', 0, KEEP),
        ('AAAN', 'AAA', 0, DISCARD),
        ('AAA', 'AANA', 0, DISCARD),
        ('ANAA', 'AANA', 1, KEEP),
    )
    for first_bases, second_bases, max_n, outcome in cases:
        n_filter = NContentFilter(count=max_n)
        legacy_redirector = LegacyPairedRedirector(None, n_filter)
        pair_redirector = PairedRedirector(None, n_filter)
        read1 = Sequence(
            'read1', first_bases, qualities='#' * len(first_bases))
        read2 = Sequence(
            'read1', second_bases, qualities='#' * len(second_bases))

        # The legacy redirector must give the same answer as applying the
        # filter to the first read alone.
        legacy_result = legacy_redirector(read1, read2)
        single_result = n_filter(read1)
        assert legacy_result == single_result

        # discard entire pair if one of the reads fulfills criteria
        assert pair_redirector(read1, read2) == outcome
def _parse_quality_cutoffs(parser, spec):
    """Turn the quality-cutoff option string into a two-element list of ints.

    ``spec`` is either None (returned unchanged), a single integer or two
    comma-separated integers. A single value ``x`` becomes ``[0, x]``.
    Anything else is reported through ``parser.error()``.
    """
    if spec is None:
        return None
    fields = spec.split(',')
    if len(fields) not in (1, 2):
        parser.error(
            "Expected one value or two values separated by comma for the quality cutoff")
    try:
        values = [int(field) for field in fields]
    except ValueError as e:
        parser.error("Quality cutoff value not recognized: {0}".format(e))
    if len(values) == 1:
        return [0, values[0]]
    return values


def _cut_modifiers(parser, cuts):
    """Validate a list of cut lengths and return one UnconditionalCutter per non-zero entry."""
    if len(cuts) > 2:
        parser.error("You cannot remove bases from more than two ends.")
    # Two values with the same sign would both refer to the same end.
    if len(cuts) == 2 and cuts[0] * cuts[1] > 0:
        parser.error("You cannot remove bases from the same end twice.")
    return [UnconditionalCutter(cut) for cut in cuts if cut != 0]


def main(cmdlineargs=None, default_outfile=sys.stdout):
    """
    Main function that evaluates command-line parameters and iterates
    over all reads.

    default_outfile is the file to which trimmed reads are sent if the ``-o``
    parameter is not used.

    cmdlineargs is the list of command-line arguments; if it is None,
    ``sys.argv[1:]`` is used. Invalid parameters are reported through
    ``parser.error()``. While reads are processed, a KeyboardInterrupt ends
    the program with exit status 130, a broken pipe (EPIPE) with status 1,
    and a seqio.FormatError or EOFError with an error message.
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')  # %(levelname)s
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)

    if len(args) == 0:
        parser.error(
            "At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")
    input_filename = args[0]

    # Find out which 'mode' we need to use.
    # Default: single-read trimming (neither -p nor -A/-G/-B/-U given)
    paired = False
    if options.paired_output:
        # Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
        # This exists for backwards compatibility ('legacy mode').
        paired = 'first'
    if options.adapters2 or options.front2 or options.anywhere2 or options.cut2:
        # Full paired-end trimming when both -p and -A/-G/-B/-U given
        # Also the read modifications (such as quality trimming) are applied
        # to second read.
        paired = 'both'

    if paired and len(args) == 1:
        parser.error(
            "When paired-end trimming is enabled via -A/-G/-B/-U or -p, "
            "two input files are required.")
    if paired:
        input_paired_filename = args[1]
        quality_filename = None
    else:
        input_paired_filename = None
        if len(args) == 2:
            if args[0].endswith('.qual'):
                parser.error("The QUAL file must be the second argument.")
            quality_filename = args[1]
        else:
            quality_filename = None

    if paired:
        if not options.paired_output:
            parser.error(
                "When paired-end trimming is enabled via -A/-G/-B/-U, "
                "a second output file needs to be specified via -p (--paired-output).")
        if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
            parser.error(
                "When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options.")
    else:
        if options.untrimmed_paired_output:
            parser.error(
                "Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")
    # NOTE(review): the original indentation of this file was lost; the two
    # checks below are assumed to apply in every mode, not only to
    # single-end input -- confirm.
    if input_filename.endswith('.qual'):
        parser.error("Need a FASTA file in addition to the QUAL file.")
    if options.format is not None and quality_filename is not None:
        parser.error(
            "If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")

    if options.format is not None and options.format.lower() not in [
            'fasta', 'fastq', 'sra-fastq']:
        parser.error(
            "The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    cutoffs = _parse_quality_cutoffs(parser, options.quality_cutoff)

    check_second = paired == 'both'
    writers = []
    too_short_outfile = None  # too short reads go here
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_outfile = xopen(options.too_short_output, 'w')
        writers.append(TooShortReadFilter(
            options.minimum_length, too_short_outfile, check_second))
    too_long_outfile = None  # too long reads go here
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_outfile = xopen(options.too_long_output, 'w')
        writers.append(TooLongReadFilter(
            options.maximum_length, too_long_outfile,
            check_second=check_second))

    if options.max_n != -1:
        writers.append(NContentFilter(options.max_n, check_second=check_second))

    demultiplexer = None
    if options.output is not None and '{name}' in options.output:
        if options.discard_trimmed:
            parser.error("Do not use --discard-trimmed when demultiplexing.")
        if paired:
            parser.error(
                "Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.format(name='unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output, untrimmed)
        writers.append(demultiplexer)
        trimmed_outfile, untrimmed_outfile = None, None
        trimmed_paired_outfile, untrimmed_paired_outfile = None, None
    else:
        trimmed_outfile, untrimmed_outfile = trimmed_and_untrimmed_files(
            default_outfile,
            options.output,
            options.untrimmed_output,
            options.discard_trimmed,
            options.discard_untrimmed)

        trimmed_paired_outfile, untrimmed_paired_outfile = trimmed_and_untrimmed_files(
            None,  # applies when not trimming paired-end data
            options.paired_output,
            options.untrimmed_paired_output,
            options.discard_trimmed,
            options.discard_untrimmed)

        if untrimmed_outfile or untrimmed_paired_outfile:
            writers.append(DiscardUntrimmedFilter(
                untrimmed_outfile, untrimmed_paired_outfile,
                check_second=check_second))
        writers.append(DiscardTrimmedFilter(
            trimmed_outfile, trimmed_paired_outfile,
            check_second=check_second))

    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        parser.error(
            "Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author).")
    if not (0 <= options.error_rate <= 1.):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # From here on, the *_file options hold open file objects (or None)
    # instead of file names.
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    if options.colorspace:
        if options.match_read_wildcards:
            parser.error('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    adapter_class = ColorspaceAdapter if options.colorspace else Adapter

    # TODO refactor this a bit
    def collect(back, anywhere, front):
        """Build the adapter objects for one read from the three option lists."""
        collected = []
        for name, seq, where in gather_adapters(back, anywhere, front):
            if not seq:
                parser.error("The adapter sequence is empty.")
            adapter = adapter_class(
                seq, where, options.error_rate, options.overlap,
                options.match_read_wildcards,
                options.match_adapter_wildcards,
                name=name, indels=options.indels)
            if options.debug:
                adapter.enable_debug()
            collected.append(adapter)
        return collected

    try:
        adapters = collect(options.adapters, options.anywhere, options.front)
        adapters2 = collect(options.adapters2, options.anywhere2, options.front2)
    except IOError as e:
        # A missing file is a usage error; anything else is unexpected.
        if e.errno == errno.ENOENT:
            parser.error(e)
        raise

    if not adapters and not adapters2 and not cutoffs and \
            options.cut == [] and options.cut2 == [] and \
            options.minimum_length == 0 and \
            options.maximum_length == sys.maxsize and \
            quality_filename is None and \
            options.max_n == -1:
        parser.error("You need to provide at least one adapter sequence.")

    try:
        reader = seqio.open(
            input_filename, file2=input_paired_filename,
            qualfile=quality_filename, colorspace=options.colorspace,
            fileformat=options.format)
    except (seqio.UnknownFileType, IOError) as e:
        parser.error(e)

    # Create the processing pipeline consisting of a list of "modifiers".
    modifiers = []
    if options.cut:
        modifiers.extend(_cut_modifiers(parser, options.cut))
    if cutoffs:
        modifiers.append(
            QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
    if adapters:
        modifiers.append(AdapterCutter(
            adapters, options.times, options.wildcard_file,
            options.info_file, rest_writer, options.action))

    # Modifiers that apply to both reads of paired-end reads
    modifiers_both = []
    if options.trim_n:
        modifiers_both.append(NEndTrimmer())
    if options.length_tag:
        modifiers_both.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers_both.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers_both.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers_both.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        # NOTE(review): appended without being called, unlike the other
        # modifiers -- confirm PrimerTrimmer is meant to be used as-is.
        modifiers_both.append(PrimerTrimmer)
    modifiers.extend(modifiers_both)

    # For paired-end data, create a second processing pipeline.
    # However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
    # be backwards compatible and *no modifications* are done to the second read.
    modifiers2 = []
    if paired == 'both':
        if options.cut2:
            modifiers2.extend(_cut_modifiers(parser, options.cut2))
        if cutoffs:
            modifiers2.append(
                QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
        if adapters2:
            modifiers2.append(AdapterCutter(
                adapters2, options.times, None, None, None, options.action))
        modifiers2.extend(modifiers_both)

    # Due to backwards compatibility, from here on logging output needs to be
    # sent to standard output instead of standard error if the -o option is used.
    if options.output:
        logger.root.handlers = []
        logging.basicConfig(
            level=logging.INFO, format='%(message)s', stream=sys.stdout)
    logger.info("This is cutadapt %s with Python %s", __version__,
                platform.python_version())
    logger.info("Command line parameters: %s", " ".join(cmdlineargs))
    n_adapters = len(adapters) + len(adapters2)
    logger.info(
        "Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
        n_adapters, 's' if n_adapters != 1 else '',
        options.error_rate * 100,
        {
            False: 'single-end',
            'first': 'paired-end legacy',
            'both': 'paired-end'
        }[paired])

    # time.clock() no longer exists as of Python 3.8 (and measured CPU time
    # on Unix); time.time() gives wall-clock time on every version.
    start_time = time.time()
    try:
        if paired:
            stats = process_paired_reads(reader, modifiers, modifiers2, writers)
        else:
            stats = process_single_reads(reader, modifiers, writers)
    except KeyboardInterrupt:
        print("Interrupted", file=sys.stderr)
        sys.exit(130)
    except IOError as e:
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except (seqio.FormatError, EOFError) as e:
        sys.exit("cutadapt: error: {0}".format(e))

    # close open files
    for f in [
            trimmed_outfile, untrimmed_outfile, trimmed_paired_outfile,
            untrimmed_paired_outfile, options.rest_file,
            options.wildcard_file, options.info_file, too_short_outfile,
            too_long_outfile, demultiplexer
    ]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    elapsed_time = time.time() - start_time
    if not options.quiet:
        stats.collect((adapters, adapters2), elapsed_time,
                      modifiers, modifiers2, writers)
        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        with redirect_standard_output(stat_file):
            print_report(stats, (adapters, adapters2))
def test_ncontentfilter(seq, count, expected):
    """Check NContentFilter on a single read built from ``seq``.

    ``expected`` is the value the filter call must return. The arguments are
    presumably supplied by a parametrize decorator that is not part of this
    block -- confirm against the surrounding file.
    """
    read = Sequence('read1', seq, qualities='#' * len(seq))
    n_filter = NContentFilter(count=count)
    result = n_filter(read, [])
    assert result == expected
def _parse_quality_cutoffs(parser, spec):
    """Turn the quality-cutoff option string into a two-element list of ints.

    ``spec`` is either None (returned unchanged), a single integer or two
    comma-separated integers. A single value ``x`` becomes ``[0, x]``.
    Anything else is reported through ``parser.error()``.
    """
    if spec is None:
        return None
    fields = spec.split(',')
    if len(fields) not in (1, 2):
        parser.error(
            "Expected one value or two values separated by comma for the quality cutoff")
    try:
        values = [int(field) for field in fields]
    except ValueError as e:
        parser.error("Quality cutoff value not recognized: {0}".format(e))
    if len(values) == 1:
        return [0, values[0]]
    return values


def _cut_modifiers(parser, cuts):
    """Validate a list of cut lengths and return one UnconditionalCutter per non-zero entry."""
    if len(cuts) > 2:
        parser.error("You cannot remove bases from more than two ends.")
    # Two values with the same sign would both refer to the same end.
    if len(cuts) == 2 and cuts[0] * cuts[1] > 0:
        parser.error("You cannot remove bases from the same end twice.")
    return [UnconditionalCutter(cut) for cut in cuts if cut != 0]


def main(cmdlineargs=None, default_outfile=sys.stdout):
    """
    Main function that evaluates command-line parameters and iterates
    over all reads.

    default_outfile is the file to which trimmed reads are sent if the ``-o``
    parameter is not used.

    cmdlineargs is the list of command-line arguments; if it is None,
    ``sys.argv[1:]`` is used. Invalid parameters are reported through
    ``parser.error()``. While the pipeline runs, a KeyboardInterrupt ends
    the program with exit status 130, a broken pipe (EPIPE) with status 1,
    and a seqio.FormatError or EOFError with an error message.
    """
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)
    # Setup logging only if there are not already any handlers (can happen when
    # this function is being called externally such as from unit tests)
    if not logging.root.handlers:
        setup_logging(stdout=bool(options.output), quiet=options.quiet)

    if len(args) == 0:
        parser.error(
            "At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")
    input_filename = args[0]
    if input_filename.endswith('.qual'):
        parser.error(
            "If a .qual file is given, it must be the second argument.")

    # Find out which 'mode' we need to use.
    # Default: single-read trimming (neither -p nor -A/-G/-B/-U/--interleaved given)
    paired = False
    if options.paired_output:
        # Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
        # This exists for backwards compatibility ('legacy mode').
        paired = 'first'
    # Any of these options switch off legacy mode
    if (options.adapters2 or options.front2 or options.anywhere2 or
            options.cut2 or options.interleaved or options.pair_filter or
            options.too_short_paired_output or options.too_long_paired_output):
        # Full paired-end trimming when both -p and -A/-G/-B/-U given
        # Read modifications (such as quality trimming) are applied also to second read.
        paired = 'both'

    if paired and len(args) == 1 and not options.interleaved:
        parser.error("When paired-end trimming is enabled via -A/-G/-B/-U/"
                     "--interleaved or -p, two input files are required.")
    if not paired:
        if options.untrimmed_paired_output:
            parser.error(
                "Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")

    interleaved_input = False
    interleaved_output = False
    if options.interleaved:
        interleaved_input = len(args) == 1
        interleaved_output = not options.paired_output
        if not interleaved_input and not interleaved_output:
            parser.error(
                "When --interleaved is used, you cannot provide both two input files and two output files")

    # Assign input_paired_filename and quality_filename
    input_paired_filename = None
    quality_filename = None
    if paired:
        if not interleaved_input:
            input_paired_filename = args[1]
        if not interleaved_output:
            if not options.paired_output:
                parser.error(
                    "When paired-end trimming is enabled via -A/-G/-B/-U, "
                    "a second output file needs to be specified via -p (--paired-output).")
            if not options.output:
                parser.error(
                    "When you use -p or --paired-output, you must also "
                    "use the -o option.")
        # NOTE(review): the original indentation of this file was lost; the
        # three checks below are assumed to apply to every paired-end run,
        # including interleaved output -- confirm.
        if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
            parser.error(
                "When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options.")
        if options.too_short_output and not options.too_short_paired_output:
            parser.error(
                "When using --too-short-output with paired-end "
                "reads, you also need to use --too-short-paired-output")
        if options.too_long_output and not options.too_long_paired_output:
            parser.error(
                "When using --too-long-output with paired-end "
                "reads, you also need to use --too-long-paired-output")
    elif len(args) == 2:
        quality_filename = args[1]
        if options.format is not None:
            parser.error(
                "If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")

    if options.format is not None and options.format.lower() not in [
            'fasta', 'fastq', 'sra-fastq']:
        parser.error(
            "The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # Open input file(s)
    try:
        reader = seqio.open(
            input_filename, file2=input_paired_filename,
            qualfile=quality_filename, colorspace=options.colorspace,
            fileformat=options.format, interleaved=interleaved_input)
    except (seqio.UnknownFileType, IOError) as e:
        parser.error(e)

    cutoffs = _parse_quality_cutoffs(parser, options.quality_cutoff)

    # All output files share the input's quality/colorspace settings.
    open_writer = functools.partial(
        seqio.open, mode='w', qualities=reader.delivers_qualities,
        colorspace=options.colorspace)

    if options.pair_filter is None:
        options.pair_filter = 'any'
    min_affected = 2 if options.pair_filter == 'both' else 1
    if not paired:
        filter_wrapper = Redirector
    elif paired == 'first':
        filter_wrapper = LegacyPairedRedirector
    elif paired == 'both':
        filter_wrapper = functools.partial(
            PairedRedirector, min_affected=min_affected)
    filters = []
    # TODO open_files = []
    too_short_writer = None  # too short reads go here
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_writer = open_writer(
                options.too_short_output, options.too_short_paired_output)
        filters.append(filter_wrapper(
            too_short_writer, TooShortReadFilter(options.minimum_length)))
    too_long_writer = None  # too long reads go here
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_writer = open_writer(
                options.too_long_output, options.too_long_paired_output)
        filters.append(filter_wrapper(
            too_long_writer, TooLongReadFilter(options.maximum_length)))

    if options.max_n != -1:
        filters.append(filter_wrapper(None, NContentFilter(options.max_n)))

    if int(options.discard_trimmed) + int(options.discard_untrimmed) + int(
            options.untrimmed_output is not None) > 1:
        parser.error(
            "Only one of the --discard-trimmed, --discard-untrimmed "
            "and --untrimmed-output options can be used at the same time.")
    demultiplexer = None
    untrimmed_writer = None
    writer = None
    if options.output is not None and '{name}' in options.output:
        if options.discard_trimmed:
            parser.error("Do not use --discard-trimmed when demultiplexing.")
        if paired:
            parser.error(
                "Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.replace('{name}', 'unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(
            options.output, untrimmed,
            qualities=reader.delivers_qualities,
            colorspace=options.colorspace)
        filters.append(demultiplexer)
    else:
        # Set up the remaining filters to deal with --discard-trimmed,
        # --discard-untrimmed and --untrimmed-output. These options
        # are mutually exclusive in order to avoid brain damage.
        if options.discard_trimmed:
            filters.append(filter_wrapper(None, DiscardTrimmedFilter()))
        elif options.discard_untrimmed:
            filters.append(filter_wrapper(None, DiscardUntrimmedFilter()))
        elif options.untrimmed_output:
            untrimmed_writer = open_writer(
                options.untrimmed_output, options.untrimmed_paired_output)
            filters.append(
                filter_wrapper(untrimmed_writer, DiscardUntrimmedFilter()))

        # Finally, figure out where the reads that passed all the previous
        # filters should go.
        if options.output is not None:
            writer = open_writer(options.output, options.paired_output,
                                 interleaved=interleaved_output)
        else:
            writer = open_writer(default_outfile,
                                 interleaved=interleaved_output)
        if not paired:
            filters.append(NoFilter(writer))
        else:
            filters.append(PairedNoFilter(writer))

    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        parser.error(
            "Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author).")
    if not (0 <= options.error_rate <= 1.):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # From here on, the *_file options hold open file objects (or None)
    # instead of file names.
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    if options.colorspace:
        if options.match_read_wildcards:
            parser.error('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    adapter_parser = AdapterParser(
        colorspace=options.colorspace,
        max_error_rate=options.error_rate,
        min_overlap=options.overlap,
        read_wildcards=options.match_read_wildcards,
        adapter_wildcards=options.match_adapter_wildcards,
        indels=options.indels)

    try:
        adapters = adapter_parser.parse_multi(
            options.adapters, options.anywhere, options.front)
        adapters2 = adapter_parser.parse_multi(
            options.adapters2, options.anywhere2, options.front2)
    except IOError as e:
        # A missing file is a usage error; anything else is unexpected.
        if e.errno == errno.ENOENT:
            parser.error(e)
        raise
    except ValueError as e:
        parser.error(e)
    if options.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the single-end processing pipeline (a list of "modifiers")
    modifiers = []
    if options.cut:
        modifiers.extend(_cut_modifiers(parser, options.cut))
    if options.nextseq_trim is not None:
        modifiers.append(
            NextseqQualityTrimmer(options.nextseq_trim, options.quality_base))
    if cutoffs:
        modifiers.append(
            QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
    if adapters:
        modifiers.append(AdapterCutter(
            adapters, options.times, options.wildcard_file,
            options.info_file, rest_writer, options.action))

    # Modifiers that apply to both reads of paired-end reads unless in legacy mode
    modifiers_both = []
    if options.length is not None:
        modifiers_both.append(Shortener(options.length))
    if options.trim_n:
        modifiers_both.append(NEndTrimmer())
    if options.length_tag:
        modifiers_both.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers_both.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers_both.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers_both.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        # NOTE(review): appended without being called, unlike the other
        # modifiers -- confirm PrimerTrimmer is meant to be used as-is.
        modifiers_both.append(PrimerTrimmer)
    modifiers.extend(modifiers_both)

    # For paired-end data, create a second processing pipeline.
    # However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
    # be backwards compatible and *no modifications* are done to the second read.
    modifiers2 = []
    if paired == 'both':
        if options.cut2:
            modifiers2.extend(_cut_modifiers(parser, options.cut2))
        if cutoffs:
            modifiers2.append(
                QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
        if adapters2:
            modifiers2.append(AdapterCutter(
                adapters2, options.times, None, None, None, options.action))
        modifiers2.extend(modifiers_both)

    if paired:
        pipeline = PairedEndPipeline(reader, modifiers, modifiers2, filters)
    else:
        pipeline = SingleEndPipeline(reader, modifiers, filters)

    logger.info("This is cutadapt %s with Python %s", __version__,
                platform.python_version())
    logger.info("Command line parameters: %s", " ".join(cmdlineargs))
    n_adapters = len(adapters) + len(adapters2)
    logger.info(
        "Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
        n_adapters, 's' if n_adapters != 1 else '',
        options.error_rate * 100,
        {
            False: 'single-end',
            'first': 'paired-end legacy',
            'both': 'paired-end'
        }[paired])

    if paired == 'first' and (modifiers_both or cutoffs):
        logger.warning('\n'.join(textwrap.wrap(
            'WARNING: Requested read '
            'modifications are applied only to the first '
            'read since backwards compatibility mode is enabled. '
            'To modify both reads, also use any of the -A/-B/-G/-U options. '
            'Use a dummy adapter sequence when necessary: -A XXX')))

    # time.clock() no longer exists as of Python 3.8 (and measured CPU time
    # on Unix); time.time() gives wall-clock time on every version.
    start_time = time.time()
    try:
        stats = pipeline.run()
    except KeyboardInterrupt:
        print("Interrupted", file=sys.stderr)
        sys.exit(130)
    except IOError as e:
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except (seqio.FormatError, EOFError) as e:
        sys.exit("cutadapt: error: {0}".format(e))

    # close open files
    for f in [
            writer, untrimmed_writer, options.rest_file,
            options.wildcard_file, options.info_file, too_short_writer,
            too_long_writer, demultiplexer
    ]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    elapsed_time = time.time() - start_time
    if not options.quiet:
        stats.collect((adapters, adapters2), elapsed_time,
                      modifiers, modifiers2, filters)
        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        with redirect_standard_output(stat_file):
            print_report(stats, (adapters, adapters2))
def _parse_quality_cutoffs(spec):
    """
    Convert the value of the quality cutoff option into a two-element list.

    spec is either None (option not given) or a string containing one integer
    or two comma-separated integers. A single value N is returned as [0, N],
    two values "M,N" as [M, N]. None is returned unchanged.

    Raises CommandlineError if the number of fields is wrong or if a field is
    not an integer.
    """
    if spec is None:
        return None
    fields = spec.split(',')
    if len(fields) not in (1, 2):
        raise CommandlineError("Expected one value or two values separated by comma for the quality cutoff")
    try:
        values = [int(field) for field in fields]
    except ValueError as e:
        raise CommandlineError("Quality cutoff value not recognized: {0}".format(e))
    if len(values) == 1:
        return [0, values[0]]
    return values


def _validate_cuts(cuts):
    """
    Check the list of lengths given for unconditional base removal.

    At most two values are allowed, and if two are given they must not have
    the same sign. An empty list or None is accepted.

    Raises CommandlineError if the values are not acceptable.
    """
    if not cuts:
        return
    if len(cuts) > 2:
        raise CommandlineError("You cannot remove bases from more than two ends.")
    if len(cuts) == 2 and cuts[0] * cuts[1] > 0:
        raise CommandlineError("You cannot remove bases from the same end twice.")


def pipeline_from_parsed_args(options, args, default_outfile):
    """
    Setup a processing pipeline from parsed command-line options.

    options -- object holding the parsed command-line options. Note that it is
        modified: implied settings (--maq, defaults for pair_filter and
        zero_cap, strip_suffix) are filled in, and the rest_file, info_file
        and wildcard_file attributes are replaced by opened file objects.
    args -- positional command-line arguments: one or two input file names.
    default_outfile -- destination for the trimmed reads when no output
        option was given.

    Return a PairedEndPipeline or SingleEndPipeline. All files that were
    opened are registered on the pipeline via register_file_to_close().

    If there are any problems parsing the arguments, a CommandlineError is
    thrown. All checks that depend only on the options are made before any
    file is opened, so those errors do not leave open handles behind and do
    not create output files.
    """
    if len(args) == 0:
        raise CommandlineError("At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        raise CommandlineError("Too many parameters.")
    input_filename = args[0]
    if input_filename.endswith('.qual'):
        raise CommandlineError("If a .qual file is given, it must be the second argument.")

    # Find out which 'mode' we need to use.
    # Default: single-read trimming (neither -p nor -A/-G/-B/-U/--interleaved given)
    paired = False
    if options.paired_output:
        # Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
        # This exists for backwards compatibility ('legacy mode').
        paired = 'first'
    # Any of these options switch off legacy mode
    if (options.adapters2 or options.front2 or options.anywhere2 or
            options.cut2 or options.interleaved or options.pair_filter or
            options.too_short_paired_output or options.too_long_paired_output):
        # Full paired-end trimming when both -p and -A/-G/-B/-U given
        # Read modifications (such as quality trimming) are applied also to second read.
        paired = 'both'

    if paired and len(args) == 1 and not options.interleaved:
        raise CommandlineError("When paired-end trimming is enabled via -A/-G/-B/-U/"
            "--interleaved or -p, two input files are required.")
    if not paired:
        if options.untrimmed_paired_output:
            raise CommandlineError("Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")

    interleaved_input = False
    interleaved_output = False
    if options.interleaved:
        interleaved_input = len(args) == 1
        interleaved_output = not options.paired_output
        if not interleaved_input and not interleaved_output:
            raise CommandlineError("When --interleaved is used, you cannot provide both two input files and two output files")

    # Assign input_paired_filename and quality_filename
    input_paired_filename = None
    quality_filename = None
    if paired:
        if not interleaved_input:
            input_paired_filename = args[1]
        if not interleaved_output:
            if not options.paired_output:
                raise CommandlineError("When paired-end trimming is enabled via -A/-G/-B/-U, "
                    "a second output file needs to be specified via -p (--paired-output).")
            if not options.output:
                raise CommandlineError("When you use -p or --paired-output, you must also "
                    "use the -o option.")

        if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
            raise CommandlineError("When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options.")
        if options.too_short_output and not options.too_short_paired_output:
            raise CommandlineError("When using --too-short-output with paired-end "
                "reads, you also need to use --too-short-paired-output")
        if options.too_long_output and not options.too_long_paired_output:
            raise CommandlineError("When using --too-long-output with paired-end "
                "reads, you also need to use --too-long-paired-output")
    elif len(args) == 2:
        quality_filename = args[1]
        if options.format is not None:
            raise CommandlineError("If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")

    if options.format is not None and options.format.lower() not in ['fasta', 'fastq', 'sra-fastq']:
        raise CommandlineError("The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # --- Checks and implied settings that depend on the options only. ---
    # These must come before any file is opened: a failing check would
    # otherwise leave the already opened reader/writers behind.
    cutoffs = _parse_quality_cutoffs(options.quality_cutoff)

    if int(options.discard_trimmed) + int(options.discard_untrimmed) + int(options.untrimmed_output is not None) > 1:
        raise CommandlineError("Only one of the --discard-trimmed, --discard-untrimmed "
            "and --untrimmed-output options can be used at the same time.")

    demultiplexing = options.output is not None and '{name}' in options.output
    if demultiplexing:
        if options.discard_trimmed:
            raise CommandlineError("Do not use --discard-trimmed when demultiplexing.")
        if paired:
            raise CommandlineError("Demultiplexing not supported for paired-end files, yet.")

    # --maq implies colorspace. This needs to be applied before the reader
    # and the writers are created because they take the colorspace setting
    # as a parameter at creation time.
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        raise CommandlineError("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        raise CommandlineError("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        raise CommandlineError("Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author).")
    if not (0 <= options.error_rate <= 1.):
        raise CommandlineError("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        raise CommandlineError("The overlap must be at least 1.")
    if options.colorspace:
        if options.match_read_wildcards:
            raise CommandlineError('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    _validate_cuts(options.cut)
    if paired == 'both':
        _validate_cuts(options.cut2)

    # Open input file(s)
    try:
        reader = seqio.open(input_filename, file2=input_paired_filename,
            qualfile=quality_filename, colorspace=options.colorspace,
            fileformat=options.format, interleaved=interleaved_input)
    except (seqio.UnknownFileType, IOError) as e:
        raise CommandlineError(e)

    # All output files share these settings
    open_writer = functools.partial(seqio.open, mode='w',
        qualities=reader.delivers_qualities, colorspace=options.colorspace)

    # The default for pair_filter can only be filled in here: above, a value
    # of None is needed to find out whether the option was given at all.
    if options.pair_filter is None:
        options.pair_filter = 'any'
    min_affected = 2 if options.pair_filter == 'both' else 1
    if not paired:
        filter_wrapper = Redirector
    elif paired == 'first':
        filter_wrapper = LegacyPairedRedirector
    elif paired == 'both':
        filter_wrapper = functools.partial(PairedRedirector, min_affected=min_affected)
    filters = []
    # TODO open_files = []
    too_short_writer = None  # too short reads go here
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_writer = open_writer(options.too_short_output, options.too_short_paired_output)
        filters.append(filter_wrapper(too_short_writer, TooShortReadFilter(options.minimum_length)))
    too_long_writer = None  # too long reads go here
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_writer = open_writer(options.too_long_output, options.too_long_paired_output)
        filters.append(filter_wrapper(too_long_writer, TooLongReadFilter(options.maximum_length)))

    if options.max_n != -1:
        filters.append(filter_wrapper(None, NContentFilter(options.max_n)))

    demultiplexer = None
    untrimmed_writer = None
    writer = None
    if demultiplexing:
        untrimmed = options.output.replace('{name}', 'unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output, untrimmed,
            qualities=reader.delivers_qualities, colorspace=options.colorspace)
        filters.append(demultiplexer)
    else:
        # Set up the remaining filters to deal with --discard-trimmed,
        # --discard-untrimmed and --untrimmed-output. These options
        # are mutually exclusive (this was checked above).
        if options.discard_trimmed:
            filters.append(filter_wrapper(None, DiscardTrimmedFilter()))
        elif options.discard_untrimmed:
            filters.append(filter_wrapper(None, DiscardUntrimmedFilter()))
        elif options.untrimmed_output:
            untrimmed_writer = open_writer(options.untrimmed_output, options.untrimmed_paired_output)
            filters.append(filter_wrapper(untrimmed_writer, DiscardUntrimmedFilter()))

        # Finally, figure out where the reads that passed all the previous
        # filters should go.
        if options.output is not None:
            writer = open_writer(options.output, options.paired_output, interleaved=interleaved_output)
        else:
            writer = open_writer(default_outfile, interleaved=interleaved_output)
        if not paired:
            filters.append(NoFilter(writer))
        else:
            filters.append(PairedNoFilter(writer))

    # Auxiliary output files. The option attributes are replaced by the
    # opened file objects.
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    adapter_parser = AdapterParser(
        colorspace=options.colorspace,
        max_error_rate=options.error_rate,
        min_overlap=options.overlap,
        read_wildcards=options.match_read_wildcards,
        adapter_wildcards=options.match_adapter_wildcards,
        indels=options.indels)

    try:
        adapters = adapter_parser.parse_multi(options.adapters, options.anywhere, options.front)
        adapters2 = adapter_parser.parse_multi(options.adapters2, options.anywhere2, options.front2)
    except IOError as e:
        # Only a missing file is reported as a command-line problem
        if e.errno == errno.ENOENT:
            raise CommandlineError(e)
        raise
    except ValueError as e:
        raise CommandlineError(e)
    if options.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the single-end processing pipeline (a list of "modifiers")
    modifiers = []
    if options.cut:
        modifiers.extend(UnconditionalCutter(cut) for cut in options.cut if cut != 0)
    if options.nextseq_trim is not None:
        modifiers.append(NextseqQualityTrimmer(options.nextseq_trim, options.quality_base))
    if cutoffs:
        modifiers.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
    if adapters:
        adapter_cutter = AdapterCutter(adapters, options.times,
            options.wildcard_file, options.info_file,
            rest_writer, options.action)
        modifiers.append(adapter_cutter)

    # Modifiers that apply to both reads of paired-end reads unless in legacy mode
    modifiers_both = []
    if options.length is not None:
        modifiers_both.append(Shortener(options.length))
    if options.trim_n:
        modifiers_both.append(NEndTrimmer())
    if options.length_tag:
        modifiers_both.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers_both.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers_both.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers_both.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        # NOTE(review): unlike all other modifiers, PrimerTrimmer is appended
        # without being instantiated. Confirm that it is a plain function
        # and not a class that needs to be called first.
        modifiers_both.append(PrimerTrimmer)
    modifiers.extend(modifiers_both)

    # For paired-end data, create a second processing pipeline.
    # However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
    # be backwards compatible and *no modifications* are done to the second read.
    modifiers2 = []
    if paired == 'both':
        if options.cut2:
            modifiers2.extend(UnconditionalCutter(cut) for cut in options.cut2 if cut != 0)
        if cutoffs:
            modifiers2.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
        if adapters2:
            adapter_cutter2 = AdapterCutter(adapters2, options.times,
                None, None, None, options.action)
            modifiers2.append(adapter_cutter2)
        modifiers2.extend(modifiers_both)

    if paired:
        pipeline = PairedEndPipeline(adapters, adapters2, reader, modifiers, modifiers2, filters)
    else:
        pipeline = SingleEndPipeline(adapters, adapters2, reader, modifiers, filters)

    # TODO the following should be done some other way
    pipeline.paired = paired
    pipeline.error_rate = options.error_rate
    pipeline.should_print_warning = paired == 'first' and (modifiers_both or cutoffs)
    # Each file is registered once. Entries that are None (file not in use)
    # are passed along as before.
    for f in [writer, untrimmed_writer,
            options.rest_file, options.wildcard_file,
            options.info_file, too_short_writer, too_long_writer,
            demultiplexer]:
        pipeline.register_file_to_close(f)
    return pipeline