示例#1
0
	def __init__(self, sequence, where, remove=None, max_error_rate=0.1, min_overlap=3,
			read_wildcards=False, adapter_wildcards=True, name=None, indels=True):
		"""
		Store the adapter settings and create the aligner that searches for it.

		sequence -- adapter sequence. It is upper-cased, 'U' is replaced with
			'T' and the result is passed through parse_braces().
		where -- passed to the aligner as its flags argument; also the key used
			to look up the default for `remove` in WHERE_TO_REMOVE_MAP.
		remove -- 'prefix', 'suffix', 'auto' or None. None selects
			WHERE_TO_REMOVE_MAP[where].
		max_error_rate -- passed on to the aligner.
		min_overlap -- minimum overlap given to the aligner; capped at the
			length of the adapter sequence.
		read_wildcards -- passed to the aligner as wildcard_query.
		adapter_wildcards -- if true, the sequence is validated against the
			IUPAC alphabet and wildcard matching is enabled for the adapter,
			unless the sequence consists of A, C, G and T only.
		name -- adapter name; generated with _generate_adapter_name() if None.
		indels -- if false, the aligner's indel cost is set to a very high
			value.

		Raises ValueError if the processed sequence is empty, if `remove` is not
		one of the accepted values, or if the sequence contains a non-IUPAC
		character while adapter_wildcards is enabled.
		"""
		self._debug = False
		self.name = _generate_adapter_name() if name is None else name
		self.sequence = parse_braces(sequence.upper().replace('U', 'T'))  # TODO move away
		if not self.sequence:
			raise ValueError('Sequence is empty')
		self.where = where
		# 'auto' is accepted as well, in agreement with the error message below.
		if remove not in (None, 'prefix', 'suffix', 'auto'):
			raise ValueError('remove parameter must be "prefix", "suffix", "auto" or None')
		self.remove = WHERE_TO_REMOVE_MAP[where] if remove is None else remove
		self.max_error_rate = max_error_rate
		# The overlap can never be longer than the adapter itself
		self.min_overlap = min(min_overlap, len(self.sequence))
		iupac = frozenset('XACGTURYSWKMBDHVN')
		# The set comparison is a quick pre-check; the loop only runs to find
		# the first offending character for the error message.
		if adapter_wildcards and not set(self.sequence) <= iupac:
			for c in self.sequence:
				if c not in iupac:
					raise ValueError('Character {!r} in adapter sequence {!r} is '
						'not a valid IUPAC code. Use only characters '
						'XACGTURYSWKMBDHVN.'.format(c, self.sequence))
		# Optimization: Use non-wildcard matching if only ACGT is used
		self.adapter_wildcards = adapter_wildcards and not set(self.sequence) <= set('ACGT')
		self.read_wildcards = read_wildcards

		self.aligner = align.Aligner(self.sequence, self.max_error_rate,
			flags=self.where, wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards)
		self.aligner.min_overlap = self.min_overlap
		self.indels = indels
		if not self.indels:
			# TODO
			# When indels are disallowed, an entirely different algorithm
			# should be used.
			self.aligner.indel_cost = 100000
示例#2
0
    def __init__(self,
                 sequence,
                 where,
                 max_error_rate,
                 min_overlap=3,
                 read_wildcards=False,
                 adapter_wildcards=True,
                 name=None,
                 indels=True):
        """
        Store the adapter settings, reset the statistics and build the aligner.

        sequence -- adapter sequence. It is upper-cased, 'U' is replaced with
            'T' and the result is passed through self.parse_braces().
        where -- one of FRONT, PREFIX, BACK, SUFFIX or ANYWHERE. It selects
            the function bound to self.trimmed and is passed to the aligner
            as its flags argument. Any other value raises KeyError.
        max_error_rate -- passed on to the aligner.
        min_overlap -- assigned to the aligner's min_overlap attribute.
        read_wildcards -- if true, align.ALLOW_WILDCARD_SEQ2 is set in the
            wildcard flags.
        adapter_wildcards -- if true and the sequence contains characters
            other than A, C, G and T, align.ALLOW_WILDCARD_SEQ1 is set in the
            wildcard flags.
        name -- adapter name. If None, the current value of the class-level
            automatic_name counter is used and the counter is incremented.
        indels -- if false, the aligner's indel cost is set to a very high
            value.
        """
        self.debug = False

        # Unnamed adapters are numbered with a counter kept on the class.
        auto_named = name is None
        if auto_named:
            klass = self.__class__
            name = str(klass.automatic_name)
            klass.automatic_name += 1
        self.name = name
        self.name_is_generated = auto_named

        normalized = sequence.upper().replace('U', 'T')
        self.sequence = self.parse_braces(normalized)
        self.where = where
        self.max_error_rate = max_error_rate
        self.min_overlap = min_overlap
        self.indels = indels

        # Wildcard matching on the adapter side is only requested when the
        # sequence contains something other than A, C, G and T.
        self.adapter_wildcards = adapter_wildcards and bool(
            set(self.sequence).difference('ACGT'))
        degenerate = 0
        if read_wildcards:
            degenerate |= align.ALLOW_WILDCARD_SEQ2
        if self.adapter_wildcards:
            degenerate |= align.ALLOW_WILDCARD_SEQ1
        self.wildcard_flags = degenerate

        # trimmed() is bound to the function matching the adapter type
        trim_front = self._trimmed_front
        trim_back = self._trimmed_back
        dispatch = {
            FRONT: trim_front,
            PREFIX: trim_front,
            BACK: trim_back,
            SUFFIX: trim_back,
            ANYWHERE: self._trimmed_anywhere,
        }
        self.trimmed = dispatch[where]
        # None means: guess
        self._front_flag = None if where == ANYWHERE else where not in (BACK, SUFFIX)

        # Statistics about the lengths of removed sequences
        self.lengths_front = defaultdict(int)
        self.lengths_back = defaultdict(int)
        self.errors_front = defaultdict(lambda: defaultdict(int))
        self.errors_back = defaultdict(lambda: defaultdict(int))
        self.adjacent_bases = dict.fromkeys(('A', 'C', 'G', 'T', ''), 0)

        self.aligner = aligner = align.Aligner(
            self.sequence,
            self.max_error_rate,
            flags=self.where,
            degenerate=self.wildcard_flags)
        aligner.min_overlap = self.min_overlap
        if not self.indels:
            # TODO
            # Disallowing indels really calls for an entirely different
            # algorithm; for now their cost is just made prohibitive.
            aligner.indel_cost = 100000
示例#3
0
 def __init__(self,
              sequence,
              where,
              remove=None,
              max_error_rate=0.1,
              min_overlap=3,
              read_wildcards=False,
              adapter_wildcards=True,
              name=None,
              indels=True):
     """
     Store the adapter settings and create the matching aligner object.

     sequence -- adapter sequence. It is upper-cased and 'U' is replaced
         with 'T'.
     where -- key into WHERE_TO_REMOVE_MAP when remove is None; its .value
         is passed to align.Aligner as flags, and it is compared against
         Where.PREFIX to choose between the prefix and suffix comparer.
     remove -- 'prefix', 'suffix', 'auto' or None. None selects
         WHERE_TO_REMOVE_MAP[where].
     max_error_rate -- passed on to the aligner.
     min_overlap -- minimum overlap given to the aligner; capped at the
         length of the adapter sequence.
     read_wildcards -- passed to the aligner as wildcard_query.
     adapter_wildcards -- if true, the sequence is validated against the
         IUPAC alphabet and wildcard matching is enabled for the adapter,
         unless the sequence consists of A, C, G and T only.
     name -- adapter name; generated with _generate_adapter_name() if None.
     indels -- if false, either a PrefixComparer/SuffixComparer is used
         (when self.is_anchored is true) or the indel cost of the regular
         aligner is set to a very high value.

     Raises ValueError if the sequence is empty, if remove is not one of
     the accepted values, or if the sequence contains a non-IUPAC
     character while adapter_wildcards is enabled.
     """
     self._debug = False
     if name is None:
         name = _generate_adapter_name()
     self.name = name

     self.sequence = sequence.upper().replace('U', 'T')
     if not self.sequence:
         raise ValueError('Sequence is empty')
     self.where = where

     accepted_remove = (None, 'prefix', 'suffix', 'auto')
     if remove not in accepted_remove:
         raise ValueError(
             'remove parameter must be "prefix", "suffix", "auto" or None')
     if remove is None:
         remove = WHERE_TO_REMOVE_MAP[where]
     self.remove = remove

     self.max_error_rate = max_error_rate
     self.min_overlap = min(min_overlap, len(self.sequence))

     iupac = frozenset('XACGTURYSWKMBDHVN')
     if adapter_wildcards:
         # Report the first character that is not an IUPAC code, if any
         offending = next(
             (ch for ch in self.sequence if ch not in iupac), None)
         if offending is not None:
             raise ValueError(
                 'Character {!r} in adapter sequence {!r} is '
                 'not a valid IUPAC code. Use only characters '
                 'XACGTURYSWKMBDHVN.'.format(offending, self.sequence))

     # Optimization: plain matching suffices if only ACGT is used
     self.adapter_wildcards = adapter_wildcards and bool(
         set(self.sequence).difference('ACGT'))
     self.read_wildcards = read_wildcards
     self.indels = indels

     # Keyword arguments that both kinds of aligner receive
     shared = dict(
         wildcard_ref=self.adapter_wildcards,
         wildcard_query=self.read_wildcards,
         min_overlap=self.min_overlap,
     )
     if not self.is_anchored or self.indels:
         # TODO
         # Indels are suppressed by making them very expensive, but a
         # different algorithm should be used instead.
         self.aligner = align.Aligner(
             self.sequence,
             self.max_error_rate,
             flags=self.where.value,
             indel_cost=1 if self.indels else 100000,
             **shared)
     else:
         if self.where is Where.PREFIX:
             comparer = align.PrefixComparer
         else:
             comparer = align.SuffixComparer
         self.aligner = comparer(
             self.sequence, self.max_error_rate, **shared)
示例#4
0
    def __init__(self,
                 sequence,
                 where,
                 max_error_rate=0.1,
                 min_overlap=3,
                 read_wildcards=False,
                 adapter_wildcards=True,
                 name=None,
                 indels=True):
        """
        Store the adapter settings, reset the statistics and build the aligner.

        sequence -- adapter sequence. It is upper-cased, 'U' is replaced with
            'T' and the result is passed through parse_braces().
        where -- one of FRONT, PREFIX, BACK, SUFFIX or ANYWHERE. It selects
            the function bound to self.trimmed and is passed to the aligner
            as its flags argument. Any other value raises KeyError.
        max_error_rate -- passed on to the aligner.
        min_overlap -- minimum overlap assigned to the aligner; capped at the
            length of the adapter sequence.
        read_wildcards -- passed to the aligner as wildcard_query.
        adapter_wildcards -- passed to the aligner as wildcard_ref, but only
            if the sequence contains characters other than A, C, G and T.
        name -- adapter name; generated with _generate_adapter_name() if None.
        indels -- if false, the aligner's indel cost is set to a very high
            value.

        Raises ValueError if the processed sequence is empty.
        """
        self.debug = False
        if name is None:
            name = _generate_adapter_name()
        self.name = name

        normalized = sequence.upper().replace('U', 'T')
        self.sequence = parse_braces(normalized)
        if not self.sequence:
            raise ValueError('Sequence is empty')

        self.where = where
        self.max_error_rate = max_error_rate
        self.min_overlap = min(min_overlap, len(self.sequence))
        self.indels = indels
        # Wildcard matching on the adapter side is only requested when the
        # sequence contains something other than A, C, G and T.
        self.adapter_wildcards = adapter_wildcards and bool(
            set(self.sequence).difference('ACGT'))
        self.read_wildcards = read_wildcards

        # trimmed() is bound to the function matching the adapter type
        trim_front = self._trimmed_front
        trim_back = self._trimmed_back
        dispatch = {
            FRONT: trim_front,
            PREFIX: trim_front,
            BACK: trim_back,
            SUFFIX: trim_back,
            ANYWHERE: self._trimmed_anywhere,
        }
        self.trimmed = dispatch[where]
        # None means: guess
        self._front_flag = None if where == ANYWHERE else where not in (BACK, SUFFIX)

        # Statistics about the lengths of removed sequences
        self.lengths_front = defaultdict(int)
        self.lengths_back = defaultdict(int)
        self.errors_front = defaultdict(lambda: defaultdict(int))
        self.errors_back = defaultdict(lambda: defaultdict(int))
        self.adjacent_bases = dict.fromkeys(('A', 'C', 'G', 'T', ''), 0)

        self.aligner = aligner = align.Aligner(
            self.sequence,
            self.max_error_rate,
            flags=self.where,
            wildcard_ref=self.adapter_wildcards,
            wildcard_query=self.read_wildcards)
        aligner.min_overlap = self.min_overlap
        if not self.indels:
            # TODO
            # Disallowing indels really calls for an entirely different
            # algorithm; for now their cost is just made prohibitive.
            aligner.indel_cost = 100000