def __init__(
    self,
    sequence,
    where,
    remove=None,
    max_error_rate=0.1,
    min_overlap=3,
    read_wildcards=False,
    adapter_wildcards=True,
    name=None,
    indels=True,
):
    """
    Validate the adapter parameters and create the aligner that searches for it.

    sequence -- adapter sequence. It is upper-cased, 'U' is replaced by 'T'
        and the result is passed through parse_braces().
    where -- stored as-is, passed to align.Aligner as ``flags`` and, when
        ``remove`` is None, used as the key into WHERE_TO_REMOVE_MAP.
    remove -- 'prefix', 'suffix', 'auto' or None. None selects
        WHERE_TO_REMOVE_MAP[where].
    max_error_rate -- forwarded to align.Aligner.
    min_overlap -- stored clamped to the length of the sequence and set on
        the aligner.
    read_wildcards -- forwarded to align.Aligner as ``wildcard_query``.
    adapter_wildcards -- when true, the sequence must consist of the
        characters XACGTURYSWKMBDHVN; wildcard matching for the adapter is
        only switched on if the sequence contains something other than
        A, C, G and T.
    name -- adapter name; _generate_adapter_name() is used if None.
    indels -- when false, the aligner's indel_cost is set to 100000.

    Raises ValueError if the sequence is empty, if ``remove`` is not one of
    the accepted values, or if ``adapter_wildcards`` is true and the sequence
    contains a character outside XACGTURYSWKMBDHVN.
    """
    self._debug = False
    self.name = _generate_adapter_name() if name is None else name
    self.sequence = parse_braces(sequence.upper().replace('U', 'T'))  # TODO move away
    if not self.sequence:
        raise ValueError('Sequence is empty')
    self.where = where
    # 'auto' is listed as a valid choice in the error message below, so it
    # has to pass this check as well.
    if remove not in (None, 'prefix', 'suffix', 'auto'):
        raise ValueError('remove parameter must be "prefix", "suffix", "auto" or None')
    self.remove = WHERE_TO_REMOVE_MAP[where] if remove is None else remove
    self.max_error_rate = max_error_rate
    self.min_overlap = min(min_overlap, len(self.sequence))

    iupac = frozenset('XACGTURYSWKMBDHVN')
    if adapter_wildcards:
        # Report the first offending character, in sequence order.
        for c in self.sequence:
            if c not in iupac:
                raise ValueError('Character {!r} in adapter sequence {!r} is '
                    'not a valid IUPAC code. Use only characters '
                    'XACGTURYSWKMBDHVN.'.format(c, self.sequence))

    # Optimization: Use non-wildcard matching if only ACGT is used
    self.adapter_wildcards = adapter_wildcards and not set(self.sequence) <= set('ACGT')
    self.read_wildcards = read_wildcards

    self.aligner = align.Aligner(
        self.sequence,
        self.max_error_rate,
        flags=self.where,
        wildcard_ref=self.adapter_wildcards,
        wildcard_query=self.read_wildcards,
    )
    self.aligner.min_overlap = self.min_overlap
    self.indels = indels
    if not self.indels:
        # TODO
        # When indels are disallowed, an entirely different algorithm
        # should be used.
        self.aligner.indel_cost = 100000
def __init__(
    self,
    sequence,
    where,
    max_error_rate,
    min_overlap=3,
    read_wildcards=False,
    adapter_wildcards=True,
    name=None,
    indels=True,
):
    """
    Set up the adapter, its trimming function, its statistics and its aligner.

    sequence -- adapter sequence. It is upper-cased, 'U' is replaced by 'T'
        and the result is passed through self.parse_braces().
    where -- one of FRONT, PREFIX, BACK, SUFFIX, ANYWHERE. Selects the
        function bound to self.trimmed and is passed to align.Aligner as
        ``flags``.
    max_error_rate -- forwarded to align.Aligner.
    min_overlap -- stored clamped to the length of the sequence and set on
        the aligner.
    read_wildcards -- when true, align.ALLOW_WILDCARD_SEQ2 is added to the
        wildcard flags.
    adapter_wildcards -- when true and the sequence contains something other
        than A, C, G and T, align.ALLOW_WILDCARD_SEQ1 is added to the
        wildcard flags.
    name -- adapter name. If None, the class-level counter ``automatic_name``
        supplies the name and is incremented.
    indels -- when false, the aligner's indel_cost is set to 100000.

    Raises ValueError if the sequence is empty and KeyError if ``where`` is
    not one of the five known values.
    """
    self.debug = False
    if name is None:
        self.name = str(self.__class__.automatic_name)
        self.__class__.automatic_name += 1
        self.name_is_generated = True
    else:
        self.name = name
        self.name_is_generated = False

    self.sequence = self.parse_braces(sequence.upper().replace('U', 'T'))
    if not self.sequence:
        # Fail here with a clear message instead of passing an empty adapter
        # on to the aligner.
        raise ValueError('Sequence is empty')
    self.where = where
    self.max_error_rate = max_error_rate
    # Never require more overlapping bases than the adapter itself has.
    # NOTE(review): presumably a larger value would prevent short adapters
    # from ever matching - confirm against the aligner.
    self.min_overlap = min(min_overlap, len(self.sequence))
    self.indels = indels

    self.wildcard_flags = 0
    # Use non-wildcard matching for the adapter if only ACGT is used
    self.adapter_wildcards = adapter_wildcards and not set(self.sequence) <= set('ACGT')
    if read_wildcards:
        self.wildcard_flags |= align.ALLOW_WILDCARD_SEQ2
    if self.adapter_wildcards:
        self.wildcard_flags |= align.ALLOW_WILDCARD_SEQ1

    # redirect trimmed() to appropriate function depending on adapter type
    trimmers = {
        FRONT: self._trimmed_front,
        PREFIX: self._trimmed_front,
        BACK: self._trimmed_back,
        SUFFIX: self._trimmed_back,
        ANYWHERE: self._trimmed_anywhere,
    }
    self.trimmed = trimmers[where]
    if where == ANYWHERE:
        self._front_flag = None  # means: guess
    else:
        self._front_flag = where not in (BACK, SUFFIX)

    # statistics about length of removed sequences
    self.lengths_front = defaultdict(int)
    self.lengths_back = defaultdict(int)
    self.errors_front = defaultdict(lambda: defaultdict(int))
    self.errors_back = defaultdict(lambda: defaultdict(int))
    self.adjacent_bases = {'A': 0, 'C': 0, 'G': 0, 'T': 0, '': 0}

    self.aligner = align.Aligner(
        self.sequence,
        self.max_error_rate,
        flags=self.where,
        degenerate=self.wildcard_flags,
    )
    self.aligner.min_overlap = self.min_overlap
    if not self.indels:
        # TODO
        # When indels are disallowed, an entirely different algorithm
        # should be used.
        self.aligner.indel_cost = 100000
def __init__(
    self,
    sequence,
    where,
    remove=None,
    max_error_rate=0.1,
    min_overlap=3,
    read_wildcards=False,
    adapter_wildcards=True,
    name=None,
    indels=True,
):
    """
    Check the adapter parameters and build the matching aligner object.

    sequence -- adapter sequence; stored upper-cased with 'U' replaced by 'T'.
    where -- stored as-is. Its ``value`` is passed to align.Aligner as
        ``flags``; it is also the key into WHERE_TO_REMOVE_MAP when
        ``remove`` is None.
    remove -- 'prefix', 'suffix', 'auto' or None (None means: take
        WHERE_TO_REMOVE_MAP[where]).
    max_error_rate -- handed to the aligner.
    min_overlap -- stored clamped to the sequence length and handed to the
        aligner.
    read_wildcards -- handed to the aligner as ``wildcard_query``.
    adapter_wildcards -- when true, the sequence is checked against the
        characters XACGTURYSWKMBDHVN; adapter wildcard matching is only
        enabled if the sequence is not made up of A, C, G and T alone.
    name -- adapter name; _generate_adapter_name() provides one if None.
    indels -- whether indels are allowed. If they are not and
        self.is_anchored is true, a PrefixComparer/SuffixComparer is used
        instead of align.Aligner.

    Raises ValueError for an empty sequence, an unknown ``remove`` value or
    (with ``adapter_wildcards``) a character outside XACGTURYSWKMBDHVN.
    """
    self._debug = False
    if name is None:
        name = _generate_adapter_name()
    self.name = name

    normalized = sequence.upper().replace('U', 'T')
    self.sequence = normalized
    if not normalized:
        raise ValueError('Sequence is empty')

    self.where = where
    if remove not in (None, 'prefix', 'suffix', 'auto'):
        raise ValueError(
            'remove parameter must be "prefix", "suffix", "auto" or None')
    if remove is None:
        remove = WHERE_TO_REMOVE_MAP[where]
    self.remove = remove

    self.max_error_rate = max_error_rate
    self.min_overlap = min(min_overlap, len(normalized))

    if adapter_wildcards:
        iupac = frozenset('XACGTURYSWKMBDHVN')
        # First character (in sequence order) that is not an IUPAC code
        invalid = next((ch for ch in normalized if ch not in iupac), None)
        if invalid is not None:
            raise ValueError(
                'Character {!r} in adapter sequence {!r} is '
                'not a valid IUPAC code. Use only characters '
                'XACGTURYSWKMBDHVN.'.format(invalid, normalized))

    # Optimization: plain matching suffices when the adapter is pure ACGT
    self.adapter_wildcards = adapter_wildcards and not set(normalized) <= set('ACGT')
    self.read_wildcards = read_wildcards
    self.indels = indels

    # Keyword arguments that both kinds of aligner receive
    shared_options = dict(
        wildcard_ref=self.adapter_wildcards,
        wildcard_query=self.read_wildcards,
        min_overlap=self.min_overlap,
    )
    if not self.is_anchored or self.indels:
        # TODO
        # Indels are suppressed by setting their cost very high, but a different algorithm
        # should be used instead.
        self.aligner = align.Aligner(
            self.sequence,
            self.max_error_rate,
            flags=self.where.value,
            indel_cost=1 if self.indels else 100000,
            **shared_options
        )
    else:
        if self.where is Where.PREFIX:
            comparer = align.PrefixComparer
        else:
            comparer = align.SuffixComparer
        self.aligner = comparer(self.sequence, self.max_error_rate, **shared_options)
def __init__(
    self,
    sequence,
    where,
    max_error_rate=0.1,
    min_overlap=3,
    read_wildcards=False,
    adapter_wildcards=True,
    name=None,
    indels=True,
):
    """
    Initialise the adapter: parameters, trimming function, statistics, aligner.

    sequence -- adapter sequence. Upper-cased, 'U' replaced by 'T', then run
        through parse_braces(); the result must not be empty.
    where -- one of FRONT, PREFIX, BACK, SUFFIX, ANYWHERE. Decides which
        method self.trimmed points to and is given to align.Aligner as
        ``flags``.
    max_error_rate -- given to align.Aligner.
    min_overlap -- stored clamped to the sequence length and set on the
        aligner.
    read_wildcards -- given to align.Aligner as ``wildcard_query``.
    adapter_wildcards -- adapter wildcard matching is enabled only if this is
        true and the sequence is not made up of A, C, G and T alone.
    name -- adapter name; _generate_adapter_name() provides one if None.
    indels -- when false, the aligner's indel_cost is set to 100000.

    Raises ValueError for an empty sequence and KeyError for an unknown
    ``where``.
    """
    self.debug = False
    if name is None:
        name = _generate_adapter_name()
    self.name = name

    parsed = parse_braces(sequence.upper().replace('U', 'T'))
    self.sequence = parsed
    if not parsed:
        raise ValueError('Sequence is empty')

    self.where = where
    self.max_error_rate = max_error_rate
    self.min_overlap = min(min_overlap, len(parsed))
    self.indels = indels
    self.adapter_wildcards = adapter_wildcards and not set(parsed) <= set('ACGT')
    self.read_wildcards = read_wildcards

    # trimmed() is bound to the routine that fits the adapter type
    trim_front = self._trimmed_front
    trim_back = self._trimmed_back
    dispatch = {
        FRONT: trim_front,
        PREFIX: trim_front,
        BACK: trim_back,
        SUFFIX: trim_back,
        ANYWHERE: self._trimmed_anywhere,
    }
    self.trimmed = dispatch[where]
    # None means: guess
    self._front_flag = None if where == ANYWHERE else where not in (BACK, SUFFIX)

    # statistics about length of removed sequences
    self.lengths_front = defaultdict(int)
    self.lengths_back = defaultdict(int)
    self.errors_front = defaultdict(lambda: defaultdict(int))
    self.errors_back = defaultdict(lambda: defaultdict(int))
    self.adjacent_bases = dict.fromkeys(('A', 'C', 'G', 'T', ''), 0)

    aligner = align.Aligner(
        self.sequence,
        self.max_error_rate,
        flags=self.where,
        wildcard_ref=self.adapter_wildcards,
        wildcard_query=self.read_wildcards,
    )
    self.aligner = aligner
    aligner.min_overlap = self.min_overlap
    if not self.indels:
        # TODO
        # When indels are disallowed, an entirely different algorithm
        # should be used.
        aligner.indel_cost = 100000