Exemplo n.º 1
0
def test_compare_prefixes():
    """Check compare_prefixes on byte strings, with and without ALLOW_WILDCARD_SEQ1."""
    reference = b'AAAAATTTTTTTTT'

    # One differing character inside the compared prefix: 4 matches, 1 error expected.
    outcome = compare_prefixes(b'AAXAA', reference)
    assert outcome == (0, 5, 0, 5, 4, 1)

    # This wildcard comparison is asserted twice.
    for _ in range(2):
        outcome = compare_prefixes(b'AANAA', b'AACAATTTTTTTTT', ALLOW_WILDCARD_SEQ1)
        assert outcome == (0, 5, 0, 5, 5, 0)

    outcome = compare_prefixes(b'XAAAAA', reference)
    assert outcome == (0, 6, 0, 6, 4, 2)
Exemplo n.º 2
0
def test_compare_prefixes():
	"""Exercise compare_prefixes with the wildcard_ref / wildcard_query keyword flags."""
	full_match = (0, 10, 0, 10, 10, 0)

	outcome = compare_prefixes('AAXAA', 'AAAAATTTTTTTTT')
	assert outcome == (0, 5, 0, 5, 4, 1)
	# This wildcard comparison is asserted twice.
	for _ in range(2):
		outcome = compare_prefixes('AANAA', 'AACAATTTTTTTTT', wildcard_ref=True)
		assert outcome == (0, 5, 0, 5, 5, 0)
	outcome = compare_prefixes('XAAAAA', 'AAAAATTTTTTTTT')
	assert outcome == (0, 6, 0, 6, 4, 2)

	# The first wildcard sequence is compared against every sequence plus a fixed tail,
	# once in each argument order.
	first = WILDCARD_SEQUENCES[0]
	for seq in WILDCARD_SEQUENCES:
		extended = seq + 'GCCAGGGTTGATTCGGCTGATCTGGCCG'
		outcome = compare_prefixes(first, extended, wildcard_query=True)
		assert outcome == full_match, outcome

		outcome = compare_prefixes(extended, first, wildcard_ref=True)
		assert outcome == full_match

	# The inner loop variable is not used, so each sequence is simply checked
	# once per entry of WILDCARD_SEQUENCES.
	for seq in WILDCARD_SEQUENCES:
		for _ in WILDCARD_SEQUENCES:
			extended = seq + 'GCCAGGG'
			outcome = compare_prefixes(seq, extended)
			assert outcome == full_match

			outcome = compare_prefixes(extended, seq, wildcard_ref=True, wildcard_query=True)
			assert outcome == full_match

	# The same result (8 matches, 2 errors) is expected for every flag combination.
	extended = WILDCARD_SEQUENCES[0] + 'GCCAGG'
	for ref_flag, query_flag in ((False, False), (False, True), (True, False), (True, True)):
		outcome = compare_prefixes('CCCXTTXATC', extended, wildcard_ref=ref_flag, wildcard_query=query_flag)
		assert outcome == (0, 10, 0, 10, 8, 2)
Exemplo n.º 3
0
def test_compare_prefixes():
	"""Exercise compare_prefixes with the ALLOW_WILDCARD_SEQ1 / ALLOW_WILDCARD_SEQ2 bit flags."""
	perfect = (0, 10, 0, 10, 10, 0)

	got = compare_prefixes('AAXAA', 'AAAAATTTTTTTTT')
	assert got == (0, 5, 0, 5, 4, 1)
	# This wildcard comparison is asserted twice.
	for _ in range(2):
		got = compare_prefixes('AANAA', 'AACAATTTTTTTTT', ALLOW_WILDCARD_SEQ1)
		assert got == (0, 5, 0, 5, 5, 0)
	got = compare_prefixes('XAAAAA', 'AAAAATTTTTTTTT')
	assert got == (0, 6, 0, 6, 4, 2)

	# Compare the first wildcard sequence with each sequence plus a fixed tail,
	# in both argument orders.
	head = WILDCARD_SEQUENCES[0]
	for seq in WILDCARD_SEQUENCES:
		longer = seq + 'GCCAGGGTTGATTCGGCTGATCTGGCCG'
		got = compare_prefixes(head, longer, degenerate=ALLOW_WILDCARD_SEQ2)
		assert got == perfect, got

		got = compare_prefixes(longer, head, degenerate=ALLOW_WILDCARD_SEQ1)
		assert got == perfect

	# The inner loop variable is not used; it only repeats the checks.
	for seq in WILDCARD_SEQUENCES:
		for _ in WILDCARD_SEQUENCES:
			longer = seq + 'GCCAGGG'
			got = compare_prefixes(seq, longer, degenerate=ALLOW_WILDCARD_SEQ1|ALLOW_WILDCARD_SEQ2)
			assert got == perfect

			got = compare_prefixes(longer, seq, degenerate=ALLOW_WILDCARD_SEQ1|ALLOW_WILDCARD_SEQ2)
			assert got == perfect

	# The same result (8 matches, 2 errors) is expected for every flag value.
	longer = WILDCARD_SEQUENCES[0] + 'GCCAGG'
	flag_values = (0, ALLOW_WILDCARD_SEQ1, ALLOW_WILDCARD_SEQ2, ALLOW_WILDCARD_SEQ1|ALLOW_WILDCARD_SEQ2)
	for flags in flag_values:
		got = compare_prefixes('CCCXTTXATC', longer, degenerate=flags)
		assert got == (0, 10, 0, 10, 8, 2)
def test_compare_prefixes():
    """Run compare_prefixes against fixed sequences and every WILDCARD_SEQUENCES entry."""
    all_ten_match = (0, 10, 0, 10, 10, 0)

    observed = compare_prefixes("AAXAA", "AAAAATTTTTTTTT")
    assert observed == (0, 5, 0, 5, 4, 1)
    # This wildcard comparison is asserted twice.
    for _ in range(2):
        observed = compare_prefixes("AANAA", "AACAATTTTTTTTT", ALLOW_WILDCARD_SEQ1)
        assert observed == (0, 5, 0, 5, 5, 0)
    observed = compare_prefixes("XAAAAA", "AAAAATTTTTTTTT")
    assert observed == (0, 6, 0, 6, 4, 2)

    # First wildcard sequence versus each sequence plus a fixed tail, both argument orders.
    anchor = WILDCARD_SEQUENCES[0]
    for candidate in WILDCARD_SEQUENCES:
        padded = candidate + "GCCAGGGTTGATTCGGCTGATCTGGCCG"
        observed = compare_prefixes(anchor, padded, degenerate=ALLOW_WILDCARD_SEQ2)
        assert observed == all_ten_match, observed

        observed = compare_prefixes(padded, anchor, degenerate=ALLOW_WILDCARD_SEQ1)
        assert observed == all_ten_match

    # The inner loop variable is unused; it only repeats the same checks.
    for candidate in WILDCARD_SEQUENCES:
        for _ in WILDCARD_SEQUENCES:
            padded = candidate + "GCCAGGG"
            observed = compare_prefixes(candidate, padded, degenerate=ALLOW_WILDCARD_SEQ1 | ALLOW_WILDCARD_SEQ2)
            assert observed == all_ten_match

            observed = compare_prefixes(padded, candidate, degenerate=ALLOW_WILDCARD_SEQ1 | ALLOW_WILDCARD_SEQ2)
            assert observed == all_ten_match

    # 8 matches and 2 errors are expected whatever flag value is passed.
    padded = WILDCARD_SEQUENCES[0] + "GCCAGG"
    every_flag = (0, ALLOW_WILDCARD_SEQ1, ALLOW_WILDCARD_SEQ2, ALLOW_WILDCARD_SEQ1 | ALLOW_WILDCARD_SEQ2)
    for flag in every_flag:
        observed = compare_prefixes("CCCXTTXATC", padded, degenerate=flag)
        assert observed == (0, 10, 0, 10, 8, 2)
Exemplo n.º 5
0
	def match_to(self, read, match_class=Match):
		"""
		Look for this adapter in the given read.

		An exact comparison is tried first, but only when adapter wildcards
		are disabled; if it finds nothing, an approximate comparison is made.

		Returns a match_class instance when a match is found, and None when
		nothing satisfies the minimum overlap length and maximum error rate.
		"""
		read_seq = read.sequence.upper()  # temporary copy

		# Exact matching; pos stays -1 when it is skipped or unsuccessful.
		pos = -1
		if not self.adapter_wildcards:
			where = self.where
			if where == PREFIX:
				if read_seq.startswith(self.sequence):
					pos = 0
			elif where == SUFFIX:
				if read_seq.endswith(self.sequence):
					pos = len(read_seq) - len(self.sequence)
			elif where in (BACK, FRONT):
				pos = read_seq.find(self.sequence)
			# TODO BACK_NOT_INTERNAL, FRONT_NOT_INTERNAL

		if pos >= 0:
			# exact hit: the whole adapter matches without errors
			adapter_len = len(self.sequence)
			match_args = (0, adapter_len, pos, pos + adapter_len, adapter_len, 0)
		elif not self.indels and self.where in (PREFIX, SUFFIX):
			# no indels: use the dedicated prefix/suffix comparison
			compare = align.compare_prefixes if self.where == PREFIX else align.compare_suffixes
			alignment = compare(self.sequence, read_seq,
				wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards)
			astart, astop, rstart, rstop, matches, errors = alignment
			overlap = astop - astart
			if overlap >= self.min_overlap and errors / overlap <= self.max_error_rate:
				match_args = alignment
			else:
				match_args = None
		else:
			alignment = self.aligner.locate(read_seq)
			if self._debug:
				print(self.aligner.dpmatrix)  # pragma: no cover
			if alignment is None:
				match_args = None
			else:
				astart, astop, rstart, rstop, matches, errors = alignment
				match_args = (astart, astop, rstart, rstop, matches, errors)

		if match_args is None:
			return None

		# In 'auto' mode this is a guess: an alignment starting at read
		# position 0 (match_args[2] is rstart) is treated as a 5' adapter.
		if self.remove == 'auto':
			remove_before = match_args[2] == 0
		else:
			remove_before = self.remove == 'prefix'
		match = match_class(*match_args, remove_before=remove_before, adapter=self, read=read)

		assert match.length > 0 and match.errors / match.length <= self.max_error_rate, match
		assert match.length >= self.min_overlap
		return match
Exemplo n.º 6
0
	def match(self, read):
		"""
		Search the given read for this adapter and build an AdapterMatch.

		None is returned when the match is shorter than the minimum overlap
		length or when its error rate exceeds the allowed maximum.
		"""
		read_seq = read.sequence.upper()

		# An exact search is only done when adapter wildcards are disabled;
		# pos stays -1 when it is skipped or finds nothing.
		pos = -1
		if not self.match_adapter_wildcards:
			if self.where == PREFIX:
				if read_seq.startswith(self.sequence):
					pos = 0
			else:
				pos = read_seq.find(self.sequence)

		if pos < 0:
			# fall back to approximate matching
			if self.indels:
				alignment = align.globalalign_locate(self.sequence, read_seq,
					self.max_error_rate, self.where, self.wildcard_flags)
			else:
				alignment = align.compare_prefixes(self.sequence, read_seq, self.wildcard_flags)
			# TODO line-based profiling tells me that the following line
			# is slow (takes 30% of match()'s running time)
			match = AdapterMatch(*(alignment + (self._front_flag, self, read)))
		else:
			# exact hit: the whole adapter matches without errors
			adapter_len = len(self.sequence)
			match = AdapterMatch(
				0, adapter_len, pos, pos + adapter_len,
				adapter_len, 0, self._front_flag, self, read)

		# TODO globalalign_locate should be modified to allow the following
		# assertion.
		# assert length == 0 or match.errors / length <= self.max_error_rate
		too_short = match.length < self.min_overlap
		if too_short or match.errors / match.length > self.max_error_rate:
			return None
		return match
Exemplo n.º 7
0
    def match_to(self, read):
        """
        Search the given read for this adapter.

        Returns an AdapterMatch describing the hit, or None when no alignment
        satisfies the minimum overlap length and the maximum error rate.
        """
        read_seq = read.sequence.upper()

        # Exact comparison is only attempted when adapter wildcards are off;
        # pos stays -1 when it is skipped or finds nothing.
        pos = -1
        if not self.adapter_wildcards:
            if self.where == PREFIX:
                if read_seq.startswith(self.sequence):
                    pos = 0
            elif self.where == SUFFIX:
                if read_seq.endswith(self.sequence):
                    pos = len(read_seq) - len(self.sequence)
            else:
                pos = read_seq.find(self.sequence)

        if pos >= 0:
            # exact hit: the whole adapter matches without errors
            adapter_len = len(self.sequence)
            match = AdapterMatch(0, adapter_len, pos, pos + adapter_len,
                                 adapter_len, 0, self._front_flag, self, read)
        elif not self.indels and self.where in (PREFIX, SUFFIX):
            # no indels: use the dedicated prefix/suffix comparison
            if self.where == PREFIX:
                compare = align.compare_prefixes
            else:
                compare = align.compare_suffixes
            alignment = compare(self.sequence, read_seq, self.wildcard_flags)
            astart, astop, rstart, rstop, matches, errors = alignment
            overlap = astop - astart
            if overlap >= self.min_overlap and errors / overlap <= self.max_error_rate:
                match = AdapterMatch(*(alignment + (self._front_flag, self, read)))
            else:
                match = None
        else:
            alignment = self.aligner.locate(read_seq)
            if self.debug:
                print(self.aligner.dpmatrix)
            match = None
            if alignment is not None:
                astart, astop, rstart, rstop, matches, errors = alignment
                match = AdapterMatch(astart, astop, rstart, rstop, matches,
                                     errors, self._front_flag, self, read)

        if match is None:
            return None
        assert match.length > 0 and match.errors / match.length <= self.max_error_rate, match
        assert match.length >= self.min_overlap
        return match
Exemplo n.º 8
0
    def match(self, read):
        """
        Try to match this adapter to the given read.

        An exact comparison is tried first (only when adapter wildcards are
        disabled); otherwise an approximate comparison is made, either with
        the prefix/suffix comparison functions (no indels) or with
        self.aligner.

        Return an AdapterMatch instance, or None if the aligner reports no
        alignment, the aligned adapter part is empty or shorter than the
        minimum overlap length, or the error rate is too high.
        """
        read_seq = read.sequence.upper()

        # try to find an exact match first unless wildcards are allowed;
        # pos stays -1 when the search is skipped or finds nothing
        pos = -1
        if not self.adapter_wildcards:
            if self.where == PREFIX:
                pos = 0 if read_seq.startswith(self.sequence) else -1
            elif self.where == SUFFIX:
                pos = (len(read_seq) - len(self.sequence)) if read_seq.endswith(self.sequence) else -1
            else:
                pos = read_seq.find(self.sequence)

        if pos >= 0:
            # exact hit: the whole adapter matches without errors
            adapter_len = len(self.sequence)
            match = AdapterMatch(
                0,
                adapter_len,
                pos,
                pos + adapter_len,
                adapter_len,
                0,
                self._front_flag,
                self,
                read,
            )
        elif not self.indels:
            # approximate matching without indels
            assert self.where in (PREFIX, SUFFIX)
            if self.where == PREFIX:
                alignment = align.compare_prefixes(self.sequence, read_seq, self.wildcard_flags)
            else:
                alignment = align.compare_suffixes(self.sequence, read_seq, self.wildcard_flags)
            match = AdapterMatch(*(alignment + (self._front_flag, self, read)))
        else:
            alignment = self.aligner.locate(read_seq)
            # Guard against an aligner that reports "nothing found" as None
            # instead of a tuple; unpacking None would raise TypeError.
            if alignment is None:
                return None
            astart, astop, rstart, rstop, matches, errors = alignment
            length = astop - astart
            # length <= 0 is checked first so the division below cannot fail
            # when min_overlap is 0.
            if length <= 0 or length < self.min_overlap or errors / length > self.max_error_rate:
                return None
            return AdapterMatch(astart, astop, rstart, rstop, matches, errors, self._front_flag, self, read)

        # TODO Aligner.locate should be modified to allow asserting the
        # error-rate condition instead of filtering on it here.
        length = match.length
        if length <= 0 or length < self.min_overlap or match.errors / length > self.max_error_rate:
            return None
        return match
Exemplo n.º 9
0
	def match_to(self, read):
		"""
		Look for this adapter in the given read.

		Returns a Match instance when a match is found, and None when
		nothing satisfies the matching criteria (minimum overlap length,
		maximum error rate).
		"""
		read_seq = read.sequence.upper()

		# Exact comparison is only attempted when adapter wildcards are off;
		# pos stays -1 when it is skipped or finds nothing.
		pos = -1
		if not self.adapter_wildcards:
			if self.where == PREFIX:
				if read_seq.startswith(self.sequence):
					pos = 0
			elif self.where == SUFFIX:
				if read_seq.endswith(self.sequence):
					pos = len(read_seq) - len(self.sequence)
			else:
				pos = read_seq.find(self.sequence)

		if pos >= 0:
			# exact hit: the whole adapter matches without errors
			adapter_len = len(self.sequence)
			match = Match(
				0, adapter_len, pos, pos + adapter_len,
				adapter_len, 0, self._front_flag, self, read)
		elif not self.indels and self.where in (PREFIX, SUFFIX):
			# no indels: use the dedicated prefix/suffix comparison
			compare = align.compare_prefixes if self.where == PREFIX else align.compare_suffixes
			alignment = compare(self.sequence, read_seq,
				wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards)
			astart, astop, rstart, rstop, matches, errors = alignment
			overlap = astop - astart
			if not (overlap >= self.min_overlap and errors / overlap <= self.max_error_rate):
				return None
			match = Match(*(alignment + (self._front_flag, self, read)))
		else:
			alignment = self.aligner.locate(read_seq)
			if self.debug:
				print(self.aligner.dpmatrix)  # pragma: no cover
			if alignment is None:
				return None
			astart, astop, rstart, rstop, matches, errors = alignment
			match = Match(astart, astop, rstart, rstop, matches, errors, self._front_flag, self, read)

		if match is None:
			return None
		assert match.length > 0 and match.errors / match.length <= self.max_error_rate, match
		assert match.length >= self.min_overlap
		return match
Exemplo n.º 10
0
def test_compare_prefixes():
    """Exercise compare_prefixes with the wildcard_ref / wildcard_query keyword flags."""
    ten_of_ten = (0, 10, 0, 10, 10, 0)

    actual = compare_prefixes('AAXAA', 'AAAAATTTTTTTTT')
    assert actual == (0, 5, 0, 5, 4, 1)
    # This wildcard comparison is asserted twice.
    for _ in range(2):
        actual = compare_prefixes('AANAA', 'AACAATTTTTTTTT', wildcard_ref=True)
        assert actual == (0, 5, 0, 5, 5, 0)
    actual = compare_prefixes('XAAAAA', 'AAAAATTTTTTTTT')
    assert actual == (0, 6, 0, 6, 4, 2)

    # First wildcard sequence versus each sequence plus a fixed tail,
    # once in each argument order.
    base = WILDCARD_SEQUENCES[0]
    for item in WILDCARD_SEQUENCES:
        with_tail = item + 'GCCAGGGTTGATTCGGCTGATCTGGCCG'
        actual = compare_prefixes(base, with_tail, wildcard_query=True)
        assert actual == ten_of_ten, actual

        actual = compare_prefixes(with_tail, base, wildcard_ref=True)
        assert actual == ten_of_ten

    # The inner loop variable is unused; it only repeats the same checks.
    for item in WILDCARD_SEQUENCES:
        for _ in WILDCARD_SEQUENCES:
            with_tail = item + 'GCCAGGG'
            actual = compare_prefixes(item, with_tail)
            assert actual == ten_of_ten

            actual = compare_prefixes(with_tail, item, wildcard_ref=True, wildcard_query=True)
            assert actual == ten_of_ten

    # 8 matches and 2 errors are expected for every combination of the two flags.
    with_tail = WILDCARD_SEQUENCES[0] + 'GCCAGG'
    flag_pairs = ((False, False), (False, True), (True, False), (True, True))
    for use_ref, use_query in flag_pairs:
        actual = compare_prefixes('CCCXTTXATC', with_tail,
                                  wildcard_ref=use_ref, wildcard_query=use_query)
        assert actual == (0, 10, 0, 10, 8, 2)
Exemplo n.º 11
0
def test_compare_prefixes():
    """Exercise compare_prefixes with the ALLOW_WILDCARD_SEQ1 / ALLOW_WILDCARD_SEQ2 bit flags."""
    complete = (0, 10, 0, 10, 10, 0)

    res = compare_prefixes('AAXAA', 'AAAAATTTTTTTTT')
    assert res == (0, 5, 0, 5, 4, 1)
    # This wildcard comparison is asserted twice.
    for _ in range(2):
        res = compare_prefixes('AANAA', 'AACAATTTTTTTTT', ALLOW_WILDCARD_SEQ1)
        assert res == (0, 5, 0, 5, 5, 0)
    res = compare_prefixes('XAAAAA', 'AAAAATTTTTTTTT')
    assert res == (0, 6, 0, 6, 4, 2)

    # First wildcard sequence versus each sequence plus a fixed tail,
    # once in each argument order.
    lead = WILDCARD_SEQUENCES[0]
    for entry in WILDCARD_SEQUENCES:
        tailed = entry + 'GCCAGGGTTGATTCGGCTGATCTGGCCG'
        res = compare_prefixes(lead, tailed, degenerate=ALLOW_WILDCARD_SEQ2)
        assert res == complete, res

        res = compare_prefixes(tailed, lead, degenerate=ALLOW_WILDCARD_SEQ1)
        assert res == complete

    # The inner loop variable is unused; it only repeats the same checks.
    for entry in WILDCARD_SEQUENCES:
        for _ in WILDCARD_SEQUENCES:
            tailed = entry + 'GCCAGGG'
            res = compare_prefixes(entry, tailed,
                                   degenerate=ALLOW_WILDCARD_SEQ1 | ALLOW_WILDCARD_SEQ2)
            assert res == complete

            res = compare_prefixes(tailed, entry,
                                   degenerate=ALLOW_WILDCARD_SEQ1 | ALLOW_WILDCARD_SEQ2)
            assert res == complete

    # 8 matches and 2 errors are expected whatever flag value is passed.
    tailed = WILDCARD_SEQUENCES[0] + 'GCCAGG'
    all_flags = (0, ALLOW_WILDCARD_SEQ1, ALLOW_WILDCARD_SEQ2,
                 ALLOW_WILDCARD_SEQ1 | ALLOW_WILDCARD_SEQ2)
    for flags in all_flags:
        res = compare_prefixes('CCCXTTXATC', tailed, degenerate=flags)
        assert res == (0, 10, 0, 10, 8, 2)
Exemplo n.º 12
0
def test_compare_prefixes():
	"""Check compare_prefixes on short sequences, with and without ALLOW_WILDCARD_SEQ1."""
	reference = 'AAAAATTTTTTTTT'

	# One differing character inside the compared prefix: 4 matches, 1 error expected.
	outcome = compare_prefixes('AAXAA', reference)
	assert outcome == (0, 5, 0, 5, 4, 1)

	# This wildcard comparison is asserted twice.
	for _ in range(2):
		outcome = compare_prefixes('AANAA', 'AACAATTTTTTTTT', ALLOW_WILDCARD_SEQ1)
		assert outcome == (0, 5, 0, 5, 5, 0)

	outcome = compare_prefixes('XAAAAA', reference)
	assert outcome == (0, 6, 0, 6, 4, 2)
Exemplo n.º 13
0
	def match_to(self, read):
		"""
		Attempt to match this adapter to the given read.

		An exact comparison is tried first (only when adapter wildcards are
		disabled); otherwise an approximate comparison is made, either with
		the prefix/suffix comparison functions (no indels, PREFIX/SUFFIX) or
		with self.aligner.

		When self.partial_trim is greater than zero, the read stop position
		stored in the returned match is reduced by that amount, in every
		branch.

		Return a Match instance if a match was found;
		return None if no match was found given the matching criteria (minimum
		overlap length, maximum error rate).
		"""
		read_seq = read.sequence.upper()

		# try to find an exact match first unless wildcards are allowed;
		# pos stays -1 when the search is skipped or finds nothing
		pos = -1
		if not self.adapter_wildcards:
			if self.where == PREFIX:
				pos = 0 if read_seq.startswith(self.sequence) else -1
			elif self.where == SUFFIX:
				pos = (len(read_seq) - len(self.sequence)) if read_seq.endswith(self.sequence) else -1
			else:
				pos = read_seq.find(self.sequence)

		if pos >= 0:
			# Exact hit. The read stop is relative to pos, so that the
			# partial trim below is applied to the real end of the hit and
			# not to the adapter length alone.
			adapter_len = len(self.sequence)
			alignment = (0, adapter_len, pos, pos + adapter_len, adapter_len, 0)
		elif not self.indels and self.where in (PREFIX, SUFFIX):
			# try approximate matching without indels
			if self.where == PREFIX:
				alignment = align.compare_prefixes(self.sequence, read_seq,
					wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards)
			else:
				alignment = align.compare_suffixes(self.sequence, read_seq,
					wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards)
			astart, astop, rstart, rstop, matches, errors = alignment
			if not (astop - astart >= self.min_overlap and errors / (astop - astart) <= self.max_error_rate):
				alignment = None
		else:
			alignment = self.aligner.locate(read_seq)
			if self.debug:
				print(self.aligner.dpmatrix)  # pragma: no cover

		if alignment is None:
			return None
		astart, astop, rstart, rstop, matches, errors = alignment

		# HACK: trim off only a part of the adapter by moving the read stop
		# position back by partial_trim.
		# NOTE(review): nothing checks that rstop stays >= rstart here.
		if self.partial_trim > 0:
			rstop -= self.partial_trim

		match = Match(astart, astop, rstart, rstop, matches, errors, self._front_flag, self, read)
		assert match.length > 0 and match.errors / match.length <= self.max_error_rate, match
		assert match.length >= self.min_overlap
		return match