def stipulate(args):
    """
    Build and return the list of read modifiers selected by ``args``.

    An ``AdapterParser`` is configured from the error rate, overlap,
    wildcard and indel settings and used to parse ``args.adapters``.
    Modifiers are appended in this order, each only if its option is set:
    NextSeq quality trimmer, quality trimmer, adapter cutter, N-end trimmer.
    Afterwards ``add_unconditional_cutters`` receives the list's ``append``
    method together with ``args.cut``.

    The original author's note says this is required to create iterable
    functions for Cutadapt 2.7 and that it is called only once.

    The finished list is printed to stdout before being returned.
    """
    parser = AdapterParser(
        max_error_rate=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )
    adapters = parser.parse_multi(args.adapters)
    warn_duplicate_adapters(adapters)

    modifiers = []
    register = modifiers.append

    if args.nextseq_trim is not None:
        register(NextseqQualityTrimmer(args.nextseq_trim, args.phred64))

    if args.quality_cutoff is not None:
        quality_cutoffs = parse_cutoffs(args.quality_cutoff)
        register(
            QualityTrimmer(quality_cutoffs[0], quality_cutoffs[1], args.phred64)
        )

    if adapters:
        register(AdapterCutter(adapters, args.times, args.action))

    if args.trim_n:
        register(NEndTrimmer())

    add_unconditional_cutters(register, args.cut)

    print("modifiers (cutadapt):", modifiers)
    return modifiers
def test_parse_file_notation(tmpdir):
    """Check that a ``file:<path>`` specification yields one adapter per FASTA record."""
    fasta_path = str(tmpdir.join('adapters.fasta'))
    with open(fasta_path, 'w') as fasta:
        fasta.write(dedent(""">first_name
            ADAPTER1
            >second_name
            ADAPTER2
            """))

    parser = AdapterParser(
        max_error_rate=0.2,
        min_overlap=4,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=False,
    )
    parsed = list(parser.parse('file:' + fasta_path, cmdline_type='back'))

    assert len(parsed) == 2
    first, second = parsed
    assert first.name == 'first_name'
    assert first.sequence == 'ADAPTER1'
    assert second.name == 'second_name'
    assert second.sequence == 'ADAPTER2'

    # Every adapter must carry the parser-wide settings.
    for adapter in parsed:
        assert adapter.max_error_rate == 0.2
        assert adapter.min_overlap == 4
        assert not adapter.read_wildcards
        assert not adapter.adapter_wildcards
        assert not adapter.indels
def test_parse_with_adapter_sequence_as_a_path(tmp_path):
    """
    Check the error raised when a path is passed where a sequence is expected.

    A sequence containing an invalid character raises ``InvalidCharacter``;
    when the given string names an existing file, the message must mention
    that a file with that name exists.
    """
    parser = AdapterParser()

    with pytest.raises(InvalidCharacter):
        parser._parse("invalid.character", "back")

    # user forgot to write "file:"
    fasta = tmp_path / "afile.fasta"
    fasta.write_text(">abc\nACGT\n")
    with pytest.raises(InvalidCharacter) as excinfo:
        list(parser.parse(str(fasta), "back"))

    message = excinfo.value.args[0]
    assert "A file exists named" in message
def test_anywhere_parameter_front():
    """
    Check the ``;anywhere`` parameter on an adapter parsed as type 'front'.

    The result must be a ``FrontAdapter`` with ``_force_anywhere`` set, and
    cutting the sample read with it must leave an empty sequence.
    """
    parser = AdapterParser(
        max_error_rate=0.2,
        min_overlap=4,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=True,
    )
    parsed = list(parser.parse('CTGAAGTGAAGTACACGGTT;anywhere', 'front'))
    adapter = parsed[0]
    assert isinstance(adapter, FrontAdapter)
    assert adapter._force_anywhere

    # TODO move the rest to a separate test
    read = Sequence('foo1', 'AAAAAAAAAACTGAAGTGAA')
    from cutadapt.modifiers import AdapterCutter

    cutter = AdapterCutter([adapter])
    info = ModificationInfo(read)
    result = cutter(read, info)
    assert result.sequence == ''
def test_anywhere_parameter():
    """
    Check the ``;anywhere`` parameter on an adapter parsed as type 'back'.

    The adapter must remove the suffix, have ``where`` set to
    ``Where.ANYWHERE``, and cutting the sample read must leave an empty
    sequence.
    """
    parser = AdapterParser(
        max_error_rate=0.2,
        min_overlap=4,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=True,
    )
    parsed = list(parser.parse('CTGAAGTGAAGTACACGGTT;anywhere', 'back'))
    adapter = parsed[0]
    assert adapter.remove == 'suffix'
    assert adapter.where is Where.ANYWHERE

    read = Sequence('foo1', 'TGAAGTACACGGTTAAAAAAAAAA')
    from cutadapt.modifiers import AdapterCutter

    cutter = AdapterCutter([adapter])
    result = cutter(read, [])
    assert result.sequence == ''
def test_parse_with_parameters():
    """
    Check that per-adapter ``e=`` and ``o=`` parameters override parser defaults.

    Also covers linked adapters, where each half carries its own error rate,
    with and without spaces around the ``...`` separator.
    """
    parser = AdapterParser(
        max_error_rate=0.2,
        min_overlap=4,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=False,
    )

    front = parser._parse('ACGTACGT; e=0.15', 'front')
    assert front.max_error_rate == 0.15
    assert front.min_overlap == 4

    back = parser._parse('ACGTAAAA; o=5; e=0.11', 'back')
    assert back.max_error_rate == 0.11
    assert back.min_overlap == 5

    linked_specs = (
        'thename=ACG;e=0.15 ... TGT;e=0.17',
        'thename=ACG;e=0.15...TGT;e=0.17',
    )
    for linked_spec in linked_specs:
        linked = parser._parse(linked_spec, 'back')
        assert isinstance(linked, LinkedAdapter)
        assert linked.front_adapter.max_error_rate == 0.15
        assert linked.back_adapter.max_error_rate == 0.17
def test_linked_adapter_parameters():
    """Check that parser-wide settings reach both halves of a linked adapter."""
    # issue #394
    parser = AdapterParser(max_error_rate=0.17, indels=False)
    linked = parser._parse("ACG...TGT")
    assert isinstance(linked, LinkedAdapter)

    front, back = linked.front_adapter, linked.back_adapter
    assert front.max_error_rate == 0.17
    assert back.max_error_rate == 0.17
    assert not front.indels
    assert not back.indels
def test_parse_with_parameters(tmp_path):
    """
    Check adapter types and per-adapter parameters produced by ``_parse``.

    Covers front and back adapters with ``e=``/``o=`` overrides, linked
    adapters with separate error rates per half, and the ``ValueError``
    raised for an unknown command-line type.
    """
    parser = AdapterParser(
        max_errors=0.2,
        min_overlap=4,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=False,
    )

    front = parser._parse('ACGTACGT; e=0.15', 'front')
    assert isinstance(front, FrontAdapter)
    assert front.max_error_rate == 0.15
    assert front.min_overlap == 4

    back = parser._parse('ACGTAAAA; o=5; e=0.11', 'back')
    assert isinstance(back, BackAdapter)
    assert back.max_error_rate == 0.11
    assert back.min_overlap == 5

    linked_specs = (
        'thename=ACG;e=0.15 ... TGT;e=0.17',
        'thename=ACG;e=0.15...TGT;e=0.17',
    )
    for linked_spec in linked_specs:
        linked = parser._parse(linked_spec, 'back')
        assert isinstance(linked, LinkedAdapter)
        assert linked.front_adapter.max_error_rate == 0.15
        assert linked.back_adapter.max_error_rate == 0.17

    with pytest.raises(ValueError) as excinfo:
        parser._parse("A", "invalid-cmdline-type")
    message = excinfo.value.args[0]
    assert "cmdline_type cannot be" in message
def adapters_from_args(args) -> Tuple[List[Adapter], List[Adapter]]:
    """
    Parse ``args.adapters`` and ``args.adapters2`` into two adapter lists.

    The parser is configured from the error rate, overlap, wildcard and
    indel options on ``args``. Both lists are checked with
    ``warn_duplicate_adapters``. When ``args.debug`` equals ``"trace"``,
    debugging is enabled on every parsed adapter.

    Raises:
        CommandLineError: if parsing raises ``FileNotFoundError``,
            ``ValueError`` or ``InvalidCharacter``.
    """
    parser = AdapterParser(
        max_errors=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )

    try:
        parsed = parser.parse_multi(args.adapters)
        parsed2 = parser.parse_multi(args.adapters2)
    except (FileNotFoundError, ValueError, InvalidCharacter) as err:
        raise CommandLineError(err)

    for adapter_list in (parsed, parsed2):
        warn_duplicate_adapters(adapter_list)

    if args.debug == "trace":
        for adapter in parsed + parsed2:
            adapter.enable_debug()

    return parsed, parsed2
def pipeline_from_parsed_args(args, paired, is_interleaved_output):
    """
    Build a processing pipeline from parsed command-line arguments.

    The arguments are validated with ``check_arguments`` first, and an
    ``args.action`` of ``'none'`` is replaced by ``None`` on ``args`` itself.
    Modifiers are registered in this order: unconditional cutters, NextSeq
    and regular quality trimming, adapter cutting, then the remaining
    modifiers (shortening, N trimming, length tag, suffix removal,
    prefix/suffix adding, zero capping). Finally the length limits and
    discard settings are copied onto the pipeline.

    Returns:
        A ``PairedEndPipeline`` if ``paired`` is true, otherwise a
        ``SingleEndPipeline``.

    Raises:
        CommandLineError: if there is a problem with the arguments.
    """
    check_arguments(args, paired, is_interleaved_output)
    if args.action == 'none':
        args.action = None

    parser = AdapterParser(
        max_error_rate=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )
    try:
        adapters = parser.parse_multi(args.adapters, args.anywhere, args.front)
        adapters2 = parser.parse_multi(args.adapters2, args.anywhere2, args.front2)
    except IOError as err:
        # Only a missing file is reported as a command-line problem.
        if err.errno == errno.ENOENT:
            raise CommandLineError(err)
        raise
    except ValueError as err:
        raise CommandLineError(err)

    if args.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the processing pipeline
    if paired:
        if args.pair_filter is None:
            filter_mode = 'any'
        else:
            filter_mode = args.pair_filter
        pipeline = PairedEndPipeline(filter_mode)
    else:
        pipeline = SingleEndPipeline()

    # When adapters are being trimmed only in R1 or R2, override the pair
    # filter mode, because the default of 'any' would regard all read pairs
    # as untrimmed.
    if paired and not (adapters2 and adapters):
        if (args.discard_untrimmed or args.untrimmed_output
                or args.untrimmed_paired_output):
            pipeline.override_untrimmed_pair_filter = True

    # Unconditional cutting: index 0 is args.cut (R1), index 1 is args.cut2 (R2).
    for read_index, cut_lengths in enumerate([args.cut, args.cut2]):
        # cut_lengths is a list
        if not cut_lengths:
            continue
        if len(cut_lengths) > 2:
            raise CommandLineError("You cannot remove bases from more than two ends.")
        if len(cut_lengths) == 2 and cut_lengths[0] * cut_lengths[1] > 0:
            raise CommandLineError("You cannot remove bases from the same end twice.")
        for length in cut_lengths:
            if length == 0:
                continue
            if read_index != 0:
                # R2
                assert isinstance(pipeline, PairedEndPipeline)
                pipeline.add(None, UnconditionalCutter(length))
            elif paired:
                # R1 of a pair
                pipeline.add(UnconditionalCutter(length), None)
            else:
                # R1, single-end
                pipeline.add(UnconditionalCutter(length))

    if paired:
        add_modifier = pipeline.add_both
    else:
        add_modifier = pipeline.add

    if args.nextseq_trim is not None:
        add_modifier(NextseqQualityTrimmer(args.nextseq_trim, args.quality_base))
    if args.quality_cutoff is not None:
        quality_cutoffs = parse_cutoffs(args.quality_cutoff)
        add_modifier(
            QualityTrimmer(quality_cutoffs[0], quality_cutoffs[1], args.quality_base)
        )

    if args.pair_adapters:
        if not paired:
            raise CommandLineError("Option --pair-adapters can only be used when trimming "
                                   "paired-end reads")
        if args.times != 1:
            raise CommandLineError("--pair-adapters cannot be used with --times")
        try:
            paired_cutter = PairedAdapterCutter(adapters, adapters2, args.action)
        except PairedAdapterCutterError as err:
            raise CommandLineError("--pair-adapters: " + str(err))
        pipeline.add_paired_modifier(paired_cutter)
    else:
        cutter1 = AdapterCutter(adapters, args.times, args.action) if adapters else None
        cutter2 = AdapterCutter(adapters2, args.times, args.action) if adapters2 else None
        if paired:
            if cutter1 or cutter2:
                pipeline.add(cutter1, cutter2)
        elif cutter1:
            pipeline.add(cutter1)

    # Remaining modifiers that apply to both reads of paired-end reads
    if args.length is not None:
        add_modifier(Shortener(args.length))
    if args.trim_n:
        add_modifier(NEndTrimmer())
    if args.length_tag:
        add_modifier(LengthTagModifier(args.length_tag))
    for suffix in args.strip_suffix:
        add_modifier(SuffixRemover(suffix))
    if args.prefix or args.suffix:
        add_modifier(PrefixSuffixAdder(args.prefix, args.suffix))
    if args.zero_cap:
        add_modifier(ZeroCapper(quality_base=args.quality_base))

    # Set filtering parameters
    # Minimum/maximum length
    for option in ('minimum_length', 'maximum_length'):
        raw_value = getattr(args, option)
        if raw_value is None:
            continue
        lengths = parse_lengths(raw_value)
        if not paired and len(lengths) == 2:
            raise CommandLineError('Two minimum or maximum lengths given for single-end data')
        if paired and len(lengths) == 1:
            # A single value is used for both reads of a pair.
            lengths = (lengths[0], lengths[0])
        setattr(pipeline, option, lengths)

    pipeline.max_n = args.max_n
    pipeline.discard_casava = args.discard_casava
    pipeline.discard_trimmed = args.discard_trimmed
    pipeline.discard_untrimmed = args.discard_untrimmed

    return pipeline
def test_parse_invalid_adapter_specific_parameter(where, reqopt):
    """Check that ``reqopt`` is rejected on an adapter that is not linked."""
    parser = AdapterParser()
    spec = f"A;{reqopt}"
    with pytest.raises(ValueError) as excinfo:
        parser._parse_not_linked(spec, "name", where)
    message = excinfo.value.args[0]
    assert "can only be used within linked adapters" in message
def test_linked_adapter_name():
    """Check that a linked adapter's statistics report the name from the spec."""
    # issue #414
    parser = AdapterParser()
    linked = parser._parse("the_name=^ACG...TGT")
    assert isinstance(linked, LinkedAdapter)
    statistics = linked.create_statistics()
    assert statistics.name == "the_name"
def test_linked_adapter_front_required_optional(r1, r2, exp1, exp2):
    """
    Check the required flags of a linked adapter parsed as type 'front'.

    ``r1`` and ``r2`` are appended to the front and back sequence of the
    specification; ``exp1`` and ``exp2`` are the expected ``front_required``
    and ``back_required`` values.
    """
    # -g X...Y
    spec = "ACG" + r1 + "...TGT" + r2
    linked = AdapterParser()._parse(spec, "front")
    assert isinstance(linked, LinkedAdapter)
    assert linked.front_required is exp1
    assert linked.back_required is exp2
def test_linked_adapter_back_required_optional(r1, r2, req1, req2):
    """
    Check the required flags of a linked adapter parsed as type 'back'.

    ``r1`` and ``r2`` are appended to the front and back sequence of the
    specification; ``req1`` and ``req2`` are the expected ``front_required``
    and ``back_required`` values.
    """
    # -a X...Y
    spec = "ACG" + r1 + "...TGT" + r2
    linked = AdapterParser()._parse(spec, "back")
    assert isinstance(linked, LinkedAdapter)
    assert linked.front_required is req1
    assert linked.back_required is req2
def test_anchoring_makes_front_linked_adapter_required(seq, req1, req2):
    """
    Check the required flags of the linked adapter obtained from ``seq``.

    ``seq`` is parsed as type 'back'; ``req1`` and ``req2`` are the expected
    ``front_required`` and ``back_required`` values.
    """
    # -a X...Y
    parser = AdapterParser()
    linked = parser._parse(seq, "back")
    assert isinstance(linked, LinkedAdapter)
    assert linked.front_required is req1
    assert linked.back_required is req2
def test_parse_multi():
    """Check that ``parse_multi`` rejects an unknown adapter type with ``ValueError``."""
    parser = AdapterParser()
    type_spec_pairs = [("invalid-type", "A")]
    with pytest.raises(ValueError) as excinfo:
        parser.parse_multi(type_spec_pairs)
    message = excinfo.value.args[0]
    assert "adapter type must be" in message
def pipeline_from_parsed_args(args, paired, is_interleaved_output):
    """
    Build a processing pipeline from parsed command-line arguments.

    The arguments are validated with ``check_arguments`` first, and an
    ``args.action`` of ``'none'`` is replaced by ``None`` on ``args`` itself.
    Modifiers are registered in this order: unconditional cutters, NextSeq
    and regular quality trimming, adapter cutting, then whatever
    ``modifiers_applying_to_both_ends_if_paired`` yields. Finally the length
    limits and discard settings are copied onto the pipeline.

    Returns:
        A ``PairedEndPipeline`` if ``paired`` is true, otherwise a
        ``SingleEndPipeline``.

    Raises:
        CommandLineError: if there is a problem with the arguments.
    """
    check_arguments(args, paired, is_interleaved_output)
    if args.action == 'none':
        args.action = None

    parser = AdapterParser(
        max_error_rate=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )
    try:
        adapters = parser.parse_multi(args.adapters)
        adapters2 = parser.parse_multi(args.adapters2)
    except (FileNotFoundError, ValueError) as err:
        raise CommandLineError(err)

    warn_duplicate_adapters(adapters)
    warn_duplicate_adapters(adapters2)

    if args.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the processing pipeline
    if paired:
        if args.pair_filter is None:
            filter_mode = 'any'
        else:
            filter_mode = args.pair_filter
        pipeline = PairedEndPipeline(filter_mode)
    else:
        pipeline = SingleEndPipeline()

    # When adapters are being trimmed only in R1 or R2, override the pair
    # filter mode, because the default of 'any' would regard all read pairs
    # as untrimmed.
    if paired and not (adapters2 and adapters):
        if (args.discard_untrimmed or args.untrimmed_output
                or args.untrimmed_paired_output):
            pipeline.override_untrimmed_pair_filter = True

    add_unconditional_cutters(pipeline, args.cut, args.cut2, paired)

    if paired:
        add_modifier = pipeline.add_both
    else:
        add_modifier = pipeline.add

    if args.nextseq_trim is not None:
        add_modifier(NextseqQualityTrimmer(args.nextseq_trim, args.quality_base))
    if args.quality_cutoff is not None:
        quality_cutoffs = parse_cutoffs(args.quality_cutoff)
        add_modifier(
            QualityTrimmer(quality_cutoffs[0], quality_cutoffs[1], args.quality_base)
        )

    if args.pair_adapters:
        try:
            paired_cutter = PairedAdapterCutter(adapters, adapters2, args.action)
        except PairedAdapterCutterError as err:
            raise CommandLineError("--pair-adapters: " + str(err))
        pipeline.add_paired_modifier(paired_cutter)
    else:
        cutter1 = AdapterCutter(adapters, args.times, args.action) if adapters else None
        cutter2 = AdapterCutter(adapters2, args.times, args.action) if adapters2 else None
        if paired:
            if cutter1 or cutter2:
                pipeline.add(cutter1, cutter2)
        elif cutter1:
            pipeline.add(cutter1)

    for modifier in modifiers_applying_to_both_ends_if_paired(args):
        add_modifier(modifier)

    # Set filtering parameters
    # Minimum/maximum length
    for option in ('minimum_length', 'maximum_length'):
        raw_value = getattr(args, option)
        if raw_value is None:
            continue
        lengths = parse_lengths(raw_value)
        if not paired and len(lengths) == 2:
            raise CommandLineError(
                'Two minimum or maximum lengths given for single-end data')
        if paired and len(lengths) == 1:
            # A single value is used for both reads of a pair.
            lengths = (lengths[0], lengths[0])
        setattr(pipeline, option, lengths)

    pipeline.max_n = args.max_n
    pipeline.discard_casava = args.discard_casava
    pipeline.discard_trimmed = args.discard_trimmed
    pipeline.discard_untrimmed = args.discard_untrimmed

    return pipeline