Пример #1
0
def stipulate(args):
    """
    Build the list of read modifiers selected by ``args`` (written for Cutadapt 2.7).

    The adapter specifications in ``args.adapters`` are parsed and handed to
    ``warn_duplicate_adapters``. Modifiers are then appended in this order:
    NextSeq quality trimmer, quality trimmer, adapter cutter, N-end trimmer,
    and finally whatever ``add_unconditional_cutters`` registers for
    ``args.cut``. Per the original author's note, this is meant to be called
    only once.

    The finished list is printed to stdout and returned.
    """
    steps = []
    register = steps.append

    parser_options = {
        "max_error_rate": args.error_rate,
        "min_overlap": args.overlap,
        "read_wildcards": args.match_read_wildcards,
        "adapter_wildcards": args.match_adapter_wildcards,
        "indels": args.indels,
    }
    adapters = AdapterParser(**parser_options).parse_multi(args.adapters)
    warn_duplicate_adapters(adapters)

    # Quality-based trimming is registered ahead of adapter removal.
    if args.nextseq_trim is not None:
        nextseq_trimmer = NextseqQualityTrimmer(args.nextseq_trim, args.phred64)
        register(nextseq_trimmer)
    if args.quality_cutoff is not None:
        cutoffs = parse_cutoffs(args.quality_cutoff)
        first_cutoff, second_cutoff = cutoffs[0], cutoffs[1]
        register(QualityTrimmer(first_cutoff, second_cutoff, args.phred64))

    # Only add an adapter cutter when at least one adapter was parsed.
    if adapters:
        register(AdapterCutter(adapters, args.times, args.action))
    if args.trim_n:
        register(NEndTrimmer())
    add_unconditional_cutters(register, args.cut)

    print("modifiers (cutadapt):", steps)
    return steps
Пример #2
0
def test_parse_file_notation(tmpdir):
    """A ``file:`` specification yields one adapter per FASTA record, each
    carrying the settings the parser was constructed with."""
    fasta_path = str(tmpdir.join('adapters.fasta'))
    fasta_text = dedent(""">first_name
            ADAPTER1
            >second_name
            ADAPTER2
            """)
    with open(fasta_path, 'w') as handle:
        handle.write(fasta_text)

    settings = dict(
        max_error_rate=0.2,
        min_overlap=4,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=False,
    )
    parsed = list(
        AdapterParser(**settings).parse('file:' + fasta_path, cmdline_type='back'))

    # Names and sequences come from the FASTA records, in file order.
    assert len(parsed) == 2
    expected_records = (('first_name', 'ADAPTER1'), ('second_name', 'ADAPTER2'))
    for adapter, (name, sequence) in zip(parsed, expected_records):
        assert adapter.name == name
        assert adapter.sequence == sequence

    # Every adapter inherits the parser-wide settings.
    for adapter in parsed:
        assert adapter.max_error_rate == 0.2
        assert adapter.min_overlap == 4
        assert not adapter.read_wildcards
        assert not adapter.adapter_wildcards
        assert not adapter.indels
Пример #3
0
def test_parse_with_adapter_sequence_as_a_path(tmp_path):
    """Passing an existing file path without the ``file:`` prefix raises
    InvalidCharacter with a message mentioning that the file exists."""
    adapter_parser = AdapterParser()
    with pytest.raises(InvalidCharacter):
        adapter_parser._parse("invalid.character", "back")

    # Simulate a user who forgot to write "file:" in front of the path.
    fasta_file = tmp_path / "afile.fasta"
    fasta_file.write_text(">abc\nACGT\n")
    with pytest.raises(InvalidCharacter) as excinfo:
        list(adapter_parser.parse(str(fasta_file), "back"))
    message = excinfo.value.args[0]
    assert "A file exists named" in message
Пример #4
0
def test_anywhere_parameter_front():
    """A front adapter given with ``;anywhere`` is a FrontAdapter with
    ``_force_anywhere`` set, and trimming the sample read leaves it empty."""
    settings = dict(
        max_error_rate=0.2,
        min_overlap=4,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=True,
    )
    parsed = list(AdapterParser(**settings).parse('CTGAAGTGAAGTACACGGTT;anywhere', 'front'))
    front_adapter = parsed[0]
    assert isinstance(front_adapter, FrontAdapter)
    assert front_adapter._force_anywhere

    # TODO move the rest to a separate test
    read = Sequence('foo1', 'AAAAAAAAAACTGAAGTGAA')
    from cutadapt.modifiers import AdapterCutter
    info = ModificationInfo(read)
    result = AdapterCutter([front_adapter])(read, info)
    assert result.sequence == ''
Пример #5
0
def test_anywhere_parameter():
    """A back adapter given with ``;anywhere`` gets ``Where.ANYWHERE`` and
    still removes a suffix; trimming the sample read leaves it empty."""
    settings = dict(
        max_error_rate=0.2,
        min_overlap=4,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=True,
    )
    parsed = list(AdapterParser(**settings).parse('CTGAAGTGAAGTACACGGTT;anywhere', 'back'))
    back_adapter = parsed[0]
    assert back_adapter.remove == 'suffix'
    assert back_adapter.where is Where.ANYWHERE

    read = Sequence('foo1', 'TGAAGTACACGGTTAAAAAAAAAA')
    from cutadapt.modifiers import AdapterCutter
    result = AdapterCutter([back_adapter])(read, [])
    assert result.sequence == ''
Пример #6
0
def test_parse_with_parameters():
    """Per-adapter ``e=``/``o=`` parameters take precedence over the values
    the parser was constructed with, including inside linked adapters."""
    adapter_parser = AdapterParser(
        max_error_rate=0.2,
        min_overlap=4,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=False,
    )

    # Only the error rate is overridden; min_overlap keeps the parser value.
    front = adapter_parser._parse('ACGTACGT; e=0.15', 'front')
    assert front.max_error_rate == 0.15
    assert front.min_overlap == 4

    # Both parameters overridden.
    back = adapter_parser._parse('ACGTAAAA; o=5; e=0.11', 'back')
    assert back.max_error_rate == 0.11
    assert back.min_overlap == 5

    # Linked adapters, written with and without spaces around "...".
    linked_specs = (
        'thename=ACG;e=0.15 ... TGT;e=0.17',
        'thename=ACG;e=0.15...TGT;e=0.17',
    )
    for linked_spec in linked_specs:
        linked = adapter_parser._parse(linked_spec, 'back')
        assert isinstance(linked, LinkedAdapter)
        assert linked.front_adapter.max_error_rate == 0.15
        assert linked.back_adapter.max_error_rate == 0.17
Пример #7
0
def test_linked_adapter_parameters():
    """Parser-wide settings reach both halves of a linked adapter (issue #394)."""
    adapter_parser = AdapterParser(max_error_rate=0.17, indels=False)
    linked = adapter_parser._parse("ACG...TGT")
    assert isinstance(linked, LinkedAdapter)

    halves = (linked.front_adapter, linked.back_adapter)
    for half in halves:
        assert half.max_error_rate == 0.17
    for half in halves:
        assert not half.indels
Пример #8
0
def test_parse_with_parameters(tmp_path):
    """Per-adapter ``e=``/``o=`` parameters take precedence over the parser
    values, and an unknown ``cmdline_type`` is rejected with ValueError."""
    adapter_parser = AdapterParser(
        max_errors=0.2,
        min_overlap=4,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=False,
    )

    # Only the error rate is overridden; min_overlap keeps the parser value.
    front = adapter_parser._parse('ACGTACGT; e=0.15', 'front')
    assert isinstance(front, FrontAdapter)
    assert front.max_error_rate == 0.15
    assert front.min_overlap == 4

    # Both parameters overridden.
    back = adapter_parser._parse('ACGTAAAA; o=5; e=0.11', 'back')
    assert isinstance(back, BackAdapter)
    assert back.max_error_rate == 0.11
    assert back.min_overlap == 5

    # Linked adapters, written with and without spaces around "...".
    linked_specs = (
        'thename=ACG;e=0.15 ... TGT;e=0.17',
        'thename=ACG;e=0.15...TGT;e=0.17',
    )
    for linked_spec in linked_specs:
        linked = adapter_parser._parse(linked_spec, 'back')
        assert isinstance(linked, LinkedAdapter)
        assert linked.front_adapter.max_error_rate == 0.15
        assert linked.back_adapter.max_error_rate == 0.17

    with pytest.raises(ValueError) as excinfo:
        adapter_parser._parse("A", "invalid-cmdline-type")
    message = excinfo.value.args[0]
    assert "cmdline_type cannot be" in message
Пример #9
0
def adapters_from_args(args) -> Tuple[List[Adapter], List[Adapter]]:
    """
    Parse the adapter specifications found in the command-line arguments.

    An AdapterParser is configured from ``args.error_rate``, ``args.overlap``,
    ``args.match_read_wildcards``, ``args.match_adapter_wildcards`` and
    ``args.indels``; it then parses ``args.adapters`` and ``args.adapters2``.
    Both resulting lists are passed to ``warn_duplicate_adapters``. When
    ``args.debug`` equals ``"trace"``, ``enable_debug()`` is called on every
    adapter.

    Return a tuple ``(adapters, adapters2)``.

    Raise CommandLineError if parsing fails with FileNotFoundError,
    ValueError or InvalidCharacter; the original exception is attached as
    the explicit cause.
    """
    adapter_parser = AdapterParser(
        max_errors=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )
    try:
        adapters = adapter_parser.parse_multi(args.adapters)
        adapters2 = adapter_parser.parse_multi(args.adapters2)
    except (FileNotFoundError, ValueError, InvalidCharacter) as e:
        # Chain explicitly so the traceback shows the parse error as the
        # direct cause rather than as an incidental "during handling" context.
        raise CommandLineError(e) from e
    warn_duplicate_adapters(adapters)
    warn_duplicate_adapters(adapters2)
    if args.debug == "trace":
        for adapter in adapters + adapters2:
            adapter.enable_debug()
    return adapters, adapters2
Пример #10
0
def pipeline_from_parsed_args(args, paired, is_interleaved_output):
    """
    Setup a processing pipeline from parsed command-line arguments.

    Modifiers are added to the pipeline in this order: unconditional cutters
    (``args.cut`` / ``args.cut2``), NextSeq and regular quality trimmers,
    adapter cutters (or a single paired adapter cutter when
    ``args.pair_adapters`` is set), and finally the modifiers that are applied
    to both reads of a pair (length, trim-n, length tag, suffix stripping,
    prefix/suffix adding, zero-capping). Afterwards the filtering attributes
    are copied from ``args`` onto the pipeline.

    Note that ``args.action`` is rewritten in place: the string ``'none'``
    becomes ``None``.

    If there are any problems parsing the arguments, a CommandLineError is
    raised. Where the problem originates from another exception, that
    exception is attached as the explicit cause. IOErrors other than
    "file not found" are re-raised unchanged.

    Return an instance of Pipeline (SingleEndPipeline or PairedEndPipeline)
    """
    check_arguments(args, paired, is_interleaved_output)
    if args.action == 'none':
        args.action = None

    adapter_parser = AdapterParser(
        max_error_rate=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )
    try:
        adapters = adapter_parser.parse_multi(args.adapters, args.anywhere, args.front)
        adapters2 = adapter_parser.parse_multi(args.adapters2, args.anywhere2, args.front2)
    except IOError as e:
        # Only a missing file is reported as a command-line problem; any
        # other I/O failure propagates unchanged.
        if e.errno == errno.ENOENT:
            raise CommandLineError(e) from e
        raise
    except ValueError as e:
        raise CommandLineError(e) from e
    if args.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the processing pipeline
    if paired:
        pair_filter_mode = 'any' if args.pair_filter is None else args.pair_filter
        pipeline = PairedEndPipeline(pair_filter_mode)
    else:
        pipeline = SingleEndPipeline()

    # When adapters are being trimmed only in R1 or R2, override the pair filter mode
    # as using the default of 'any' would regard all read pairs as untrimmed.
    if paired and (not adapters2 or not adapters) and (
            args.discard_untrimmed or args.untrimmed_output or args.untrimmed_paired_output):
        pipeline.override_untrimmed_pair_filter = True

    # Index 0 holds the cut lengths for R1 (args.cut), index 1 those for R2 (args.cut2).
    for i, cut_arg in enumerate([args.cut, args.cut2]):
        # cut_arg is a list
        if not cut_arg:
            continue
        if len(cut_arg) > 2:
            raise CommandLineError("You cannot remove bases from more than two ends.")
        # Two values with the same sign have a positive product.
        if len(cut_arg) == 2 and cut_arg[0] * cut_arg[1] > 0:
            raise CommandLineError("You cannot remove bases from the same end twice.")
        for c in cut_arg:
            if c == 0:
                # Cutting zero bases needs no modifier.
                continue
            if i == 0:  # R1
                if paired:
                    pipeline.add(UnconditionalCutter(c), None)
                else:
                    pipeline.add(UnconditionalCutter(c))
            else:
                # R2
                assert isinstance(pipeline, PairedEndPipeline)
                pipeline.add(None, UnconditionalCutter(c))

    # From here on, "add" means: apply the modifier to both reads if paired.
    pipeline_add = pipeline.add_both if paired else pipeline.add

    if args.nextseq_trim is not None:
        pipeline_add(NextseqQualityTrimmer(args.nextseq_trim, args.quality_base))
    if args.quality_cutoff is not None:
        cutoffs = parse_cutoffs(args.quality_cutoff)
        pipeline_add(QualityTrimmer(cutoffs[0], cutoffs[1], args.quality_base))

    if args.pair_adapters:
        if not paired:
            raise CommandLineError("Option --pair-adapters can only be used when trimming "
                "paired-end reads")
        if args.times != 1:
            raise CommandLineError("--pair-adapters cannot be used with --times")
        try:
            cutter = PairedAdapterCutter(adapters, adapters2, args.action)
        except PairedAdapterCutterError as e:
            raise CommandLineError("--pair-adapters: " + str(e)) from e
        pipeline.add_paired_modifier(cutter)
    else:
        adapter_cutter, adapter_cutter2 = None, None
        if adapters:
            adapter_cutter = AdapterCutter(adapters, args.times, args.action)
        if adapters2:
            adapter_cutter2 = AdapterCutter(adapters2, args.times, args.action)
        if paired:
            # Either cutter may be None when only one read has adapters.
            if adapter_cutter or adapter_cutter2:
                pipeline.add(adapter_cutter, adapter_cutter2)
        else:
            if adapter_cutter:
                pipeline.add(adapter_cutter)

    # Remaining modifiers that apply to both reads of paired-end reads
    if args.length is not None:
        pipeline_add(Shortener(args.length))
    if args.trim_n:
        pipeline_add(NEndTrimmer())
    if args.length_tag:
        pipeline_add(LengthTagModifier(args.length_tag))
    for suffix in args.strip_suffix:
        pipeline_add(SuffixRemover(suffix))
    if args.prefix or args.suffix:
        pipeline_add(PrefixSuffixAdder(args.prefix, args.suffix))
    if args.zero_cap:
        pipeline_add(ZeroCapper(quality_base=args.quality_base))

    # Set filtering parameters
    # Minimum/maximum length
    for attr in 'minimum_length', 'maximum_length':
        param = getattr(args, attr)
        if param is not None:
            lengths = parse_lengths(param)
            if not paired and len(lengths) == 2:
                raise CommandLineError('Two minimum or maximum lengths given for single-end data')
            if paired and len(lengths) == 1:
                # A single value applies to both reads of the pair.
                lengths = (lengths[0], lengths[0])
            setattr(pipeline, attr, lengths)
    pipeline.max_n = args.max_n
    pipeline.discard_casava = args.discard_casava
    pipeline.discard_trimmed = args.discard_trimmed
    pipeline.discard_untrimmed = args.discard_untrimmed

    return pipeline
Пример #11
0
def test_parse_invalid_adapter_specific_parameter(where, reqopt):
    """Using the ``reqopt`` parameter on a non-linked adapter raises a
    ValueError explaining that it is only valid within linked adapters."""
    spec = "A;{}".format(reqopt)
    adapter_parser = AdapterParser()
    with pytest.raises(ValueError) as excinfo:
        adapter_parser._parse_not_linked(spec, "name", where)
    message = excinfo.value.args[0]
    assert "can only be used within linked adapters" in message
Пример #12
0
def test_linked_adapter_name():
    """The name given in a linked adapter specification is the name reported
    by the adapter's statistics object (issue #414)."""
    adapter_parser = AdapterParser()
    linked = adapter_parser._parse("the_name=^ACG...TGT")
    assert isinstance(linked, LinkedAdapter)
    statistics = linked.create_statistics()
    assert statistics.name == "the_name"
Пример #13
0
def test_linked_adapter_front_required_optional(r1, r2, exp1, exp2):
    """Check ``front_required``/``back_required`` of a linked adapter parsed
    as a front adapter (``-g X...Y``) for the given per-part suffixes."""
    # -g X...Y
    spec = "".join(("ACG", r1, "...TGT", r2))
    linked = AdapterParser()._parse(spec, "front")
    assert isinstance(linked, LinkedAdapter)
    assert linked.front_required is exp1
    assert linked.back_required is exp2
Пример #14
0
def test_linked_adapter_back_required_optional(r1, r2, req1, req2):
    """Check ``front_required``/``back_required`` of a linked adapter parsed
    as a back adapter (``-a X...Y``) for the given per-part suffixes."""
    # -a X...Y
    spec = "".join(("ACG", r1, "...TGT", r2))
    linked = AdapterParser()._parse(spec, "back")
    assert isinstance(linked, LinkedAdapter)
    assert linked.front_required is req1
    assert linked.back_required is req2
Пример #15
0
def test_anchoring_makes_front_linked_adapter_required(seq, req1, req2):
    """Check ``front_required``/``back_required`` of the linked adapter that
    results from parsing ``seq`` as a back adapter (``-a X...Y``)."""
    # -a X...Y
    adapter_parser = AdapterParser()
    linked = adapter_parser._parse(seq, "back")
    assert isinstance(linked, LinkedAdapter)
    assert linked.front_required is req1
    assert linked.back_required is req2
Пример #16
0
def test_parse_multi():
    """``parse_multi`` rejects an unknown adapter type with a ValueError."""
    adapter_parser = AdapterParser()
    type_spec_pairs = [("invalid-type", "A")]
    with pytest.raises(ValueError) as excinfo:
        adapter_parser.parse_multi(type_spec_pairs)
    message = excinfo.value.args[0]
    assert "adapter type must be" in message
Пример #17
0
def pipeline_from_parsed_args(args, paired, is_interleaved_output):
    """
    Setup a processing pipeline from parsed command-line arguments.

    Modifiers are added to the pipeline in this order: unconditional cutters
    (via ``add_unconditional_cutters``), NextSeq and regular quality trimmers,
    adapter cutters (or a single paired adapter cutter when
    ``args.pair_adapters`` is set), and finally everything yielded by
    ``modifiers_applying_to_both_ends_if_paired``. Afterwards the filtering
    attributes are copied from ``args`` onto the pipeline.

    Note that ``args.action`` is rewritten in place: the string ``'none'``
    becomes ``None``.

    If there are any problems parsing the arguments, a CommandLineError is
    raised. Where the problem originates from another exception, that
    exception is attached as the explicit cause.

    Return an instance of Pipeline (SingleEndPipeline or PairedEndPipeline)
    """
    check_arguments(args, paired, is_interleaved_output)
    if args.action == 'none':
        args.action = None

    adapter_parser = AdapterParser(
        max_error_rate=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )
    try:
        adapters = adapter_parser.parse_multi(args.adapters)
        adapters2 = adapter_parser.parse_multi(args.adapters2)
    except (FileNotFoundError, ValueError) as e:
        # Chain explicitly so the traceback shows the parse error as the
        # direct cause of the command-line error.
        raise CommandLineError(e) from e
    warn_duplicate_adapters(adapters)
    warn_duplicate_adapters(adapters2)
    if args.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the processing pipeline
    if paired:
        pair_filter_mode = 'any' if args.pair_filter is None else args.pair_filter
        pipeline = PairedEndPipeline(pair_filter_mode)
    else:
        pipeline = SingleEndPipeline()

    # When adapters are being trimmed only in R1 or R2, override the pair filter mode
    # as using the default of 'any' would regard all read pairs as untrimmed.
    if paired and (not adapters2 or not adapters) and (
            args.discard_untrimmed
            or args.untrimmed_output
            or args.untrimmed_paired_output):
        pipeline.override_untrimmed_pair_filter = True

    add_unconditional_cutters(pipeline, args.cut, args.cut2, paired)

    # From here on, "add" means: apply the modifier to both reads if paired.
    pipeline_add = pipeline.add_both if paired else pipeline.add

    if args.nextseq_trim is not None:
        pipeline_add(
            NextseqQualityTrimmer(args.nextseq_trim, args.quality_base))
    if args.quality_cutoff is not None:
        cutoffs = parse_cutoffs(args.quality_cutoff)
        pipeline_add(QualityTrimmer(cutoffs[0], cutoffs[1], args.quality_base))

    if args.pair_adapters:
        try:
            cutter = PairedAdapterCutter(adapters, adapters2, args.action)
        except PairedAdapterCutterError as e:
            raise CommandLineError("--pair-adapters: " + str(e)) from e
        pipeline.add_paired_modifier(cutter)
    else:
        adapter_cutter, adapter_cutter2 = None, None
        if adapters:
            adapter_cutter = AdapterCutter(adapters, args.times, args.action)
        if adapters2:
            adapter_cutter2 = AdapterCutter(adapters2, args.times, args.action)
        if paired:
            # Either cutter may be None when only one read has adapters.
            if adapter_cutter or adapter_cutter2:
                pipeline.add(adapter_cutter, adapter_cutter2)
        else:
            if adapter_cutter:
                pipeline.add(adapter_cutter)

    for modifier in modifiers_applying_to_both_ends_if_paired(args):
        pipeline_add(modifier)

    # Set filtering parameters
    # Minimum/maximum length
    for attr in 'minimum_length', 'maximum_length':
        param = getattr(args, attr)
        if param is not None:
            lengths = parse_lengths(param)
            if not paired and len(lengths) == 2:
                raise CommandLineError(
                    'Two minimum or maximum lengths given for single-end data')
            if paired and len(lengths) == 1:
                # A single value applies to both reads of the pair.
                lengths = (lengths[0], lengths[0])
            setattr(pipeline, attr, lengths)
    pipeline.max_n = args.max_n
    pipeline.discard_casava = args.discard_casava
    pipeline.discard_trimmed = args.discard_trimmed
    pipeline.discard_untrimmed = args.discard_untrimmed

    return pipeline