class MolType(object):
    """MolType: Handles operations that depend on the sequence type (e.g. DNA).

    The MolType knows how to connect alphabets, sequences, alignments, and so
    forth, and how to disambiguate ambiguous symbols and perform base
    pairing (where appropriate).

    WARNING: Objects passed to a MolType become associated with that MolType,
    i.e. if you pass ProteinSequence to a new MolType you make up, all
    ProteinSequences will now be associated with the new MolType. This may
    not be what you expect. Use preserve_existing_moltypes=True if you
    don't want to reset the moltype.
    """

    def __init__(
        self,
        motifset,
        gap=IUPAC_gap,
        missing=IUPAC_missing,
        gaps=None,
        seq_constructor=None,
        ambiguities=None,
        label=None,
        complements=None,
        pairs=None,
        mw_calculator=None,
        add_lower=False,
        preserve_existing_moltypes=False,
        make_alphabet_group=False,
        array_seq_constructor=None,
        colors=None,
    ):
        """Returns a new MolType object. Note that the parameters are in flux.

        Parameters
        ----------
        motifset
            Alphabet or sequence of items in the default
            alphabet. Does not include degenerates.
        gap
            default gap symbol
        missing
            symbol for missing data
        gaps
            any other symbols that should be treated as gaps (doesn't have
            to include gap or missing; they will be silently added)
        seq_constructor
            Class for constructing sequences. Defaults to ``"".join``.
        ambiguities
            dict of char:tuple, doesn't include gaps (the gap and missing
            symbols are added later).
        label
            text label; returned by gettype() and used by to_rich_dict()
            and get_css_style().
        complements
            dict of symbol:symbol showing how the non-degenerate
            single characters complement each other. Used for constructing
            on the fly the complement table, incl. support for must_pair and
            can_pair.
        pairs
            dict in which keys are pairs of symbols that can pair
            with each other, values are True (must pair) or False (might
            pair). Currently, the meaning of GU pairs as 'weak' is conflated
            with the meaning of degenerate symbol pairs (which might pair
            with each other but don't necessarily, depending on how the
            symbol is resolved). This should be refactored.
        mw_calculator
            f(seq) -> molecular weight.
        add_lower
            if True (default: False) adds the lowercase versions of
            everything into the alphabet. Slated for deletion.
        preserve_existing_moltypes
            if True (default: False), the ``moltype`` attribute of
            seq_constructor is left untouched instead of being set to self.
        make_alphabet_group
            if True, makes an AlphabetGroup relating
            the various alphabets to one another.
        array_seq_constructor
            sequence type for array sequence
        colors
            dict mapping moltype characters to colors for display. When
            falsy, every character maps to "black".

        Note on "degenerates" versus "ambiguities": self.degenerates contains
        _only_ mappings for degenerate symbols, whereas self.ambiguities
        contains mappings for both degenerate and non-degenerate symbols.
        Sometimes you want one, sometimes the other, so both are provided.
        """
        # must remain the first statement: it snapshots the constructor
        # arguments before any other local name exists
        self._serialisable = {k: v for k, v in locals().items() if k != "self"}
        self.gap = gap
        self.missing = missing
        self.gaps = frozenset([gap, missing])
        if gaps:
            self.gaps = self.gaps.union(frozenset(gaps))
        self.label = label

        # set the sequence constructor
        if seq_constructor is None:
            seq_constructor = "".join  # safe default string constructor
        elif not preserve_existing_moltypes:
            seq_constructor.moltype = self
        self._make_seq = seq_constructor

        # set the ambiguities
        ambigs = {
            self.missing: tuple(motifset) + (self.gap,),
            self.gap: (self.gap,),
        }
        if ambiguities:
            ambigs.update(ambiguities)
        for c in motifset:
            ambigs[c] = (c,)
        self.ambiguities = ambigs

        # set complements -- must set before we make the alphabet group
        self.complements = complements or {}

        if make_alphabet_group:
            # note: must use _original_ ambiguities here
            self.alphabets = AlphabetGroup(motifset, ambiguities, moltype=self)
            self.alphabet = self.alphabets.base
        else:
            if isinstance(motifset, Enumeration):
                self.alphabet = motifset
            elif max(len(motif) for motif in motifset) == 1:
                self.alphabet = CharAlphabet(motifset, moltype=self)
            else:
                self.alphabet = Alphabet(motifset, moltype=self)

        # set the other properties
        self.degenerates = ambiguities.copy() if ambiguities else {}
        self.degenerates[self.missing] = "".join(motifset) + self.gap
        self.matches = make_matches(motifset, self.gaps, self.degenerates)
        self.pairs = pairs.copy() if pairs else {}
        self.pairs.update(make_pairs(pairs, motifset, self.gaps, self.degenerates))
        self.mw_calculator = mw_calculator

        # add lowercase characters, if we're doing that
        if add_lower:
            self._add_lowercase()

        # cache various other data that make the calculations faster
        self._make_all()
        self._make_comp_table()

        # a gap can be a true gap char or a degenerate character, typically '?'
        # we therefore want to ensure consistent treatment across the definition
        # of characters as either gap or degenerate
        self.gap_string = "".join(self.gaps)
        strict_gap = "".join(set(self.gap_string) - set(self.degenerates))
        self.strip_degenerate = FunctionWrapper(
            KeepChars(strict_gap + "".join(self.alphabet))
        )
        self.strip_bad = FunctionWrapper(KeepChars("".join(self.All)))
        # `-` binds tighter than `^`: this is alphabet ^ (degenerates - gaps)
        to_keep = set(self.alphabet) ^ set(self.degenerates) - set(self.gaps)
        self.strip_bad_and_gaps = FunctionWrapper(KeepChars("".join(to_keep)))

        # make inverse degenerates from degenerates
        # ensure that lowercase versions also exist if appropriate
        inv_degens = {}
        for key, val in list(self.degenerates.items()):
            inv_degens[frozenset(val)] = key.upper()
            if add_lower:
                inv_degens[frozenset("".join(val).lower())] = key.lower()
        for m in self.alphabet:
            inv_degens[frozenset(m)] = m
            if add_lower:
                inv_degens[frozenset("".join(m).lower())] = m.lower()
        for m in self.gaps:
            inv_degens[frozenset(m)] = m
        self.inverse_degenerates = inv_degens

        # set array type for modeling alphabets
        try:
            self.array_type = self.alphabet.array_type
        except AttributeError:
            self.array_type = None

        # set modeling sequence
        self._make_array_seq = array_seq_constructor

        self._colors = colors or defaultdict(_DefaultValue("black"))

    def __repr__(self):
        """String representation of MolType.

        WARNING: This doesn't allow you to reconstruct the object in its present
        incarnation.
        """
        return "MolType(%s)" % (self.alphabet,)

    def __getnewargs_ex__(self, *args, **kw):
        """Returns (args, kwargs) for pickling: no positional args plus the
        constructor arguments recorded in __init__."""
        data = self.to_rich_dict(for_pickle=True)
        return (), data

    def to_rich_dict(self, for_pickle=False):
        """Returns a dict describing this moltype.

        Parameters
        ----------
        for_pickle
            if True, returns a copy of the constructor arguments recorded by
            __init__. Otherwise returns only the object provenance, the label
            and the version, since reconstruction relies on the label.
        """
        data = self._serialisable.copy()
        if not for_pickle:  # we rely on reconstruction from label
            data = dict(type=get_object_provenance(self), moltype=self.label)
            data["version"] = __version__
        return data

    def to_json(self):
        """returns result of json formatted string"""
        data = self.to_rich_dict(for_pickle=False)
        return json.dumps(data)

    def to_regex(self, seq):
        """returns a regex pattern with ambiguities expanded to a character set

        Raises
        ------
        ValueError if seq is not valid for this moltype.
        """
        if not self.is_valid(seq):
            raise ValueError(f"'{seq}' is invalid for this moltype")

        degen_indices = self.get_degenerate_positions(sequence=seq, include_gap=False)
        seq = list(seq)  # seq can now be modified
        for index in degen_indices:
            expanded = self.ambiguities[seq[index]]
            seq[index] = f"[{''.join(expanded)}]"
        return "".join(seq)

    def gettype(self):
        """Returns type, e.g. 'dna', 'rna', 'protein'. Delete?"""
        return self.label

    def make_seq(self, seq, name=None, **kwargs):
        """Returns sequence of correct type."""
        return self._make_seq(seq, name, **kwargs)

    def make_array_seq(self, seq, name=None, **kwargs):
        """
        creates an array sequence

        Parameters
        ----------
        seq
            characters or array
        name : str
        kwargs
            keyword arguments for the ArraySequence constructor. An
            ``alphabet`` entry overrides the default alphabet, which is
            ``self.alphabets.degen_gapped`` when an alphabet group exists and
            ``self.alphabet`` otherwise.

        Returns
        -------
        ArraySequence
        """
        alphabet = kwargs.pop("alphabet", None)
        if alphabet is None and hasattr(self, "alphabets"):
            alphabet = self.alphabets.degen_gapped
        elif alphabet is None:
            alphabet = self.alphabet
        return self._make_array_seq(seq, alphabet=alphabet, name=name, **kwargs)

    def verify_sequence(self, seq, gaps_allowed=True, wildcards_allowed=True):
        """Checks whether sequence is valid on the default alphabet.

        Has special-case handling for gaps and wild-cards. This mechanism is
        probably useful to have in parallel with the validation routines that
        check specifically whether the sequence has gaps, degenerate symbols,
        etc., or that explicitly take an alphabet as input.

        Raises
        ------
        AlphabetError carrying the first offending motif.
        """
        # NOTE(review): __init__ already puts self.gap and self.missing into
        # self.ambiguities, so switching the two flags off does not exclude
        # those two symbols -- confirm whether that is intended.
        alpha = frozenset(self.ambiguities)
        if gaps_allowed:
            alpha = alpha.union(self.gaps)
        if wildcards_allowed:
            alpha = alpha.union(self.missing)
        try:
            nonalpha = re.compile("[^%s]" % re.escape("".join(alpha)))
            badchar = nonalpha.search(seq)
            if badchar:
                motif = badchar.group()
                raise AlphabetError(motif)
        except TypeError:  # not alphabetic sequence: try slow method
            for motif in seq:
                if motif not in alpha:
                    raise AlphabetError(motif)

    def is_ambiguity(self, querymotif):
        """Return True if querymotif is an ambiguity character in alphabet.

        Parameters
        ----------
        querymotif
            the motif being queried.

        Raises
        ------
        KeyError if querymotif is not a key of self.ambiguities.
        """
        return len(self.ambiguities[querymotif]) > 1

    def _what_ambiguity(self, motifs):
        """The code that represents all of 'motifs', and minimal others.

        Does this duplicate DegenerateFromSequence directly?
        """
        most_specific = len(self.alphabet) + 1
        result = self.missing
        for code, motifs2 in self.ambiguities.items():
            covers_all = all(c in motifs2 for c in motifs)
            # strict `<` keeps the first code found among equally small ones
            if covers_all and len(motifs2) < most_specific:
                most_specific = len(motifs2)
                result = code
        return result

    def what_ambiguity(self, motifs):
        """The code that represents all of 'motifs', and minimal others.

        Results are cached per frozenset of motifs on the instance.

        Does this duplicate DegenerateFromSequence directly?
        """
        if not hasattr(self, "_reverse_ambiguities"):
            self._reverse_ambiguities = {}
        motifs = frozenset(motifs)
        if motifs not in self._reverse_ambiguities:
            self._reverse_ambiguities[motifs] = self._what_ambiguity(motifs)
        return self._reverse_ambiguities[motifs]

    def _add_lowercase(self):
        """Adds lowercase versions of keys and vals to each internal dict."""
        for name in [
            "alphabet",
            "degenerates",
            "gaps",
            "complements",
            "pairs",
            "matches",
        ]:
            curr = getattr(self, name)
            # temp hack to get around re-ordering
            if isinstance(curr, Alphabet):
                curr = tuple(curr)
            new = add_lowercase(curr)
            setattr(self, name, new)

    def _make_all(self):
        """Sets self.All, which contains all the symbols self knows about.

        Note that the value of items in self.All will be the string containing
        the possibly degenerate set of symbols that the items expand to.
        """
        # later groups overwrite earlier ones: alphabet, degenerates, gaps
        symbols = {i: i for i in self.alphabet}
        symbols.update(self.degenerates)
        for i in self.gaps:
            symbols[i] = i
        self.All = symbols

    def _make_comp_table(self):
        """Sets self.ComplementTable, which maps items onto their complements.

        Note: self.ComplementTable is only set if self.complements exists.
        """
        if self.complements:
            self.ComplementTable = maketrans(
                "".join(self.complements.keys()),
                "".join(self.complements.values()),
            )

    def complement(self, item):
        """Returns complement of item, using data from self.complements.

        Always tries to return same type as item: if item looks like a dict,
        will return list of keys.

        Raises
        ------
        TypeError if this moltype has no complements.
        """
        if not self.complements:
            raise TypeError(
                "Tried to complement sequence using alphabet without complements."
            )
        try:
            return item.translate(self.ComplementTable)
        except (AttributeError, TypeError):
            item = iterable(item)
            get = self.complements.get
            return item.__class__([get(i, i) for i in item])

    def rc(self, item):
        """Returns reverse complement of item w/ data from self.complements.

        Always returns same type as input.
        """
        comp = list(self.complement(item))
        comp.reverse()
        if isinstance(item, str):
            return item.__class__("".join(comp))
        return item.__class__(comp)

    def strand_symmetric_motifs(self, motif_length=1):
        """returns ordered pairs of strand complementary motifs

        Raises
        ------
        TypeError if this moltype has no pairs.
        """
        if not self.pairs:
            raise TypeError("moltype must be DNA or RNA")

        motif_set = self.alphabet.get_word_alphabet(word_length=motif_length)
        return {tuple(sorted([m, self.complement(m)])) for m in motif_set}

    def __contains__(self, item):
        """A MolType contains every character it knows about."""
        return item in self.All

    def __iter__(self):
        """A MolType iterates only over the characters in its Alphabet.."""
        return iter(self.alphabet)

    def is_gap(self, char):
        """Returns True if char is a gap."""
        return char in self.gaps

    def is_gapped(self, sequence):
        """Returns True if sequence contains gaps."""
        return self.first_gap(sequence) is not None

    def is_degenerate(self, sequence):
        """Returns True if sequence contains degenerate characters."""
        return self.first_degenerate(sequence) is not None

    def is_valid(self, sequence):
        """Returns True if sequence contains no items that are not in self.

        Any ordinary exception raised while scanning counts as invalid.
        """
        try:
            return self.first_invalid(sequence) is None
        except Exception:
            # deliberately broad, but no longer traps KeyboardInterrupt
            return False

    def is_strict(self, sequence):
        """Returns True if sequence contains only items in self.alphabet.

        Any ordinary exception raised while scanning counts as not strict.
        """
        try:
            return (len(sequence) == 0) or (self.first_non_strict(sequence) is None)
        except Exception:
            # deliberately broad, but no longer traps KeyboardInterrupt
            return False

    def valid_on_alphabet(self, sequence, alphabet=None):
        """Returns True if sequence contains only items in alphabet.

        alphabet can actually be anything that implements __contains__.
        Defaults to self.alphabet if not supplied.
        """
        if alphabet is None:
            alphabet = self.alphabet
        # NOTE(review): this is True when first_index_in_set finds an index;
        # confirm against that helper's definition that this matches the
        # "contains only items in alphabet" contract stated above.
        return first_index_in_set(sequence, alphabet) is not None

    def first_not_in_alphabet(self, sequence, alphabet=None):
        """Returns index of first item not in alphabet, or None.

        Defaults to self.alphabet if alphabet not supplied.
        """
        if alphabet is None:
            alphabet = self.alphabet
        # NOTE(review): the result is whatever first_index_in_set returns;
        # confirm it reports the first item *not* in alphabet.
        return first_index_in_set(sequence, alphabet)

    def first_gap(self, sequence):
        """Returns the index of the first gap in the sequence, or None."""
        gap = self.gaps
        for i, s in enumerate(sequence):
            if s in gap:
                return i
        return None

    def first_degenerate(self, sequence):
        """Returns the index of first degenerate symbol in sequence, or None."""
        degen = self.degenerates
        for i, s in enumerate(sequence):
            if s in degen:
                return i
        return None

    def first_invalid(self, sequence):
        """Returns the index of first invalid symbol in sequence, or None."""
        known = self.All
        for i, s in enumerate(sequence):
            if s not in known:
                return i
        return None

    def first_non_strict(self, sequence):
        """Returns the index of first non-strict symbol in sequence, or None."""
        monomers = self.alphabet
        for i, s in enumerate(sequence):
            if s not in monomers:
                return i
        return None

    def disambiguate(self, sequence, method="strip"):
        """Returns a non-degenerate sequence from a degenerate one.

        method can be 'strip' (deletes any characters not in monomers or gaps)
        or 'random'(assigns the possibilities at random, using equal
        frequencies).

        Raises
        ------
        NotImplementedError for any other method.
        """
        if method == "strip":
            try:
                return sequence.__class__(self.strip_degenerate(sequence))
            except Exception:
                # fall back to per-item filtering if the stripper fails
                ambi = self.degenerates
                return sequence.__class__([x for x in sequence if x not in ambi])

        if method == "random":
            degen = self.degenerates
            result = [choice(degen[i]) if i in degen else i for i in sequence]
            if isinstance(sequence, str):
                return sequence.__class__("".join(result))
            return sequence.__class__(result)

        raise NotImplementedError("Got unknown method %s" % method)

    def degap(self, sequence):
        """Deletes all gap characters from sequence."""
        try:
            trans = {ord(g): None for g in self.gaps}
            return sequence.__class__(sequence.translate(trans))
        except AttributeError:
            # no .translate(): filter item by item instead
            gap = self.gaps
            return sequence.__class__([x for x in sequence if x not in gap])

    def gap_indices(self, sequence):
        """Returns list of indices of all gaps in the sequence, or []."""
        gaps = self.gaps
        return [i for i, s in enumerate(sequence) if s in gaps]

    def gap_vector(self, sequence):
        """Returns list of bool indicating gap or non-gap in sequence."""
        return list(map(self.is_gap, sequence))

    def gap_maps(self, sequence):
        """Returns tuple containing dicts mapping between gapped and ungapped.

        First element is a dict such that d[ungapped_coord] = gapped_coord.
        Second element is a dict such that d[gapped_coord] = ungapped_coord.

        Note that the dicts will be invalid if the sequence changes after the
        dicts are made.

        The gaps themselves are not in the dictionary, so use d.get() or test
        'if pos in d' to avoid KeyErrors if looking up all elements in a gapped
        sequence.
        """
        # the local names describe the VALUES each dict holds: `gapped` is
        # keyed by ungapped coordinate, `ungapped` by gapped coordinate
        ungapped = {}
        gapped = {}
        num_gaps = 0
        for i, is_gap in enumerate(self.gap_vector(sequence)):
            if is_gap:
                num_gaps += 1
            else:
                ungapped[i] = i - num_gaps
                gapped[i - num_gaps] = i
        return gapped, ungapped

    def count_gaps(self, sequence):
        """Counts the gaps in the specified sequence."""
        gaps = self.gaps
        return sum(1 for s in sequence if s in gaps)

    def get_degenerate_positions(self, sequence, include_gap=True):
        """returns indices matching degenerate characters

        Parameters
        ----------
        sequence
            the items to scan
        include_gap
            if True (default), self.gap also counts as degenerate
        """
        degen = list(self.degenerates)
        if include_gap:
            degen.append(self.gap)

        return [i for i, c in enumerate(sequence) if c in degen]

    def count_degenerate(self, sequence):
        """Counts the degenerate bases in the specified sequence."""
        degen = self.degenerates
        return sum(1 for s in sequence if s in degen)

    def possibilities(self, sequence):
        """Counts number of possible sequences matching the sequence.

        Uses self.degenerates to decide how many possibilities there are at
        each position in the sequence.
        """
        degen = self.degenerates
        count = 1
        for s in sequence:
            if s in degen:
                count *= len(degen[s])
        return count

    def mw(self, sequence, method="random", delta=None):
        """Returns the molecular weight of the sequence.

        If the sequence is ambiguous, uses method (random or strip) to
        disambiguate the sequence.

        if delta is present, uses it instead of the standard weight adjustment.
        """
        if not sequence:
            return 0
        try:
            return self.mw_calculator(sequence, delta)
        except KeyError:  # assume sequence was ambiguous
            return self.mw_calculator(self.disambiguate(sequence, method), delta)

    def can_match(self, first, second):
        """Returns True if every pos in 1st could match same pos in 2nd.

        Truncates at length of shorter sequence.
        gaps are only allowed to match other gaps.
        """
        m = self.matches
        return all(pair in m for pair in zip(first, second))

    def can_mismatch(self, first, second):
        """Returns True if any position in 1st could cause a mismatch with 2nd.

        Truncates at length of shorter sequence.
        gaps are always counted as matches.
        """
        m = self.matches
        if not first or not second:
            return False

        return any(not m.get(pair, None) for pair in zip(first, second))

    def must_match(self, first, second):
        """Returns True if all positions in 1st must match positions in second."""
        return not self.can_mismatch(first, second)

    def can_pair(self, first, second):
        """Returns True if first and second could pair.

        Pairing occurs in reverse order, i.e. last position of second with
        first position of first, etc.

        Truncates at length of shorter sequence.
        gaps are only allowed to pair with other gaps, and are counted as 'weak'
        (same category as GU and degenerate pairs).

        NOTE: second must be able to be reverse
        """
        p = self.pairs
        sec = list(second)
        sec.reverse()
        return all(pair in p for pair in zip(first, sec))

    def can_mispair(self, first, second):
        """Returns True if any position in 1st could mispair with 2nd.

        Pairing occurs in reverse order, i.e. last position of second with
        first position of first, etc.

        Truncates at length of shorter sequence.
        gaps are always counted as possible mispairs, as are weak pairs like GU.
        """
        p = self.pairs
        if not first or not second:
            return False

        sec = list(second)
        sec.reverse()
        return any(not p.get(pair, None) for pair in zip(first, sec))

    def must_pair(self, first, second):
        """Returns True if all positions in 1st must pair with second.

        Pairing occurs in reverse order, i.e. last position of second with
        first position of first, etc.
        """
        return not self.can_mispair(first, second)

    def degenerate_from_seq(self, sequence):
        """Returns least degenerate symbol corresponding to chars in sequence.

        First tries to look up in self.inverse_degenerates. Then disambiguates
        and tries to look up in self.inverse_degenerates. Then tries converting
        the case (tries uppercase before lowercase). Raises TypeError if
        conversion fails.

        A symbol that is not in self.All raises KeyError during the
        disambiguation step.
        """
        symbols = frozenset(sequence)
        # check if symbols are already known
        inv_degens = self.inverse_degenerates
        result = inv_degens.get(symbols, None)
        if result:
            return result
        # then, try converting the symbols
        degens = self.All
        converted = set()
        for sym in symbols:
            for char in degens[sym]:
                converted.add(char)
        symbols = frozenset(converted)
        result = inv_degens.get(symbols, None)
        if result:
            return result
        # then, try converting case
        symbols = frozenset([s.upper() for s in symbols])
        result = inv_degens.get(symbols, None)
        if result:
            return result
        symbols = frozenset([s.lower() for s in symbols])
        result = inv_degens.get(symbols, None)
        if result:
            return result
        # finally, try to find the minimal subset containing the symbols
        symbols = frozenset([s.upper() for s in symbols])
        lengths = {}
        for i in inv_degens:
            if symbols.issubset(i):
                # among candidates of equal size the last one seen wins
                lengths[len(i)] = i
        if lengths:  # found at least some matches
            return inv_degens[lengths[min(lengths)]]

        # if we got here, nothing worked
        raise TypeError("Cannot find degenerate char for symbols: %s" % symbols)

    def get_css_style(self, colors=None, font_size=12, font_family="Lucida Console"):
        """returns string of CSS classes and {character: <CSS class name>, ...}

        Parameters
        ----------
        colors
            mapping of character to color, looked up by item access.
            Defaults to self._colors.
        font_size
            in points
        font_family
            name of a monospace font

        """
        colors = colors or self._colors
        # !important required to stop some browsers over-riding the style sheet ...!!
        template = (
            '.%s_%s{font-family: "%s",monospace !important; '
            "font-size: %dpt !important; color: %s; }"
        )
        label = self.label or ""
        styles = _style_defaults[label].copy()
        styles.update(
            {c: "_".join([c, label]) for c in list(self.alphabet) + ["terminal_ambig"]}
        )

        css = [
            template % (char, label, font_family, font_size, colors[char])
            for char in list(styles) + ["ambig"]
        ]

        return css, styles
def __init__(
    self,
    motifset,
    gap=IUPAC_gap,
    missing=IUPAC_missing,
    gaps=None,
    seq_constructor=None,
    ambiguities=None,
    label=None,
    complements=None,
    pairs=None,
    mw_calculator=None,
    add_lower=False,
    preserve_existing_moltypes=False,
    make_alphabet_group=False,
    array_seq_constructor=None,
    colors=None,
):
    """Returns a new MolType object. Note that the parameters are in flux.

    Parameters
    ----------
    motifset
        Alphabet or sequence of items in the default
        alphabet. Does not include degenerates.
    gap
        default gap symbol
    missing
        symbol for missing data
    gaps
        any other symbols that should be treated as gaps (doesn't have
        to include gap or missing; they will be silently added)
    seq_constructor
        Class for constructing sequences. Defaults to ``"".join``.
    ambiguities
        dict of char:tuple, doesn't include gaps (the gap and missing
        symbols are added later).
    label
        text label, stored as self.label.
    complements
        dict of symbol:symbol showing how the non-degenerate
        single characters complement each other. Used for constructing
        on the fly the complement table, incl. support for must_pair and
        can_pair.
    pairs
        dict in which keys are pairs of symbols that can pair
        with each other, values are True (must pair) or False (might
        pair). Currently, the meaning of GU pairs as 'weak' is conflated
        with the meaning of degenerate symbol pairs (which might pair
        with each other but don't necessarily, depending on how the
        symbol is resolved). This should be refactored.
    mw_calculator
        f(seq) -> molecular weight.
    add_lower
        if True (default: False) adds the lowercase versions of
        everything into the alphabet. Slated for deletion.
    preserve_existing_moltypes
        if True (default: False), the ``moltype`` attribute of
        seq_constructor is left untouched instead of being set to self.
    make_alphabet_group
        if True, makes an AlphabetGroup relating
        the various alphabets to one another.
    array_seq_constructor
        sequence type for array sequence
    colors
        dict mapping moltype characters to colors for display. When
        falsy, every character maps to "black".

    Note on "degenerates" versus "ambiguities": self.degenerates contains
    _only_ mappings for degenerate symbols, whereas self.ambiguities
    contains mappings for both degenerate and non-degenerate symbols.
    Sometimes you want one, sometimes the other, so both are provided.
    """
    # must remain the first statement: it snapshots the constructor
    # arguments before any other local name exists
    self._serialisable = {k: v for k, v in locals().items() if k != "self"}
    self.gap = gap
    self.missing = missing
    # gap and missing are always treated as gaps; extra symbols are optional
    self.gaps = frozenset([gap, missing])
    if gaps:
        self.gaps = self.gaps.union(frozenset(gaps))
    self.label = label

    # set the sequence constructor
    if seq_constructor is None:
        seq_constructor = "".join  # safe default string constructor
    elif not preserve_existing_moltypes:
        # re-associates the supplied constructor with this moltype
        seq_constructor.moltype = self
    self._make_seq = seq_constructor

    # set the ambiguities: the missing symbol expands to every motif plus
    # the gap, the gap expands to itself, each motif expands to itself
    ambigs = {
        self.missing: tuple(motifset) + (self.gap, ),
        self.gap: (self.gap, )
    }
    if ambiguities:
        ambigs.update(ambiguities)
    for c in motifset:
        ambigs[c] = (c, )
    self.ambiguities = ambigs

    # set complements -- must set before we make the alphabet group
    self.complements = complements or {}

    if make_alphabet_group:
        # note: must use _original_ ambiguities here
        self.alphabets = AlphabetGroup(motifset, ambiguities, moltype=self)
        self.alphabet = self.alphabets.base
    else:
        if isinstance(motifset, Enumeration):
            # already an Enumeration: used as the alphabet unchanged
            self.alphabet = motifset
        elif max(len(motif) for motif in motifset) == 1:
            # every motif is a single character
            self.alphabet = CharAlphabet(motifset, moltype=self)
        else:
            self.alphabet = Alphabet(motifset, moltype=self)

    # set the other properties
    # degenerates starts as a copy of the caller's ambiguities so that
    # adding the missing symbol below does not mutate the argument
    self.degenerates = ambiguities and ambiguities.copy() or {}
    self.degenerates[self.missing] = "".join(motifset) + self.gap
    self.matches = make_matches(motifset, self.gaps, self.degenerates)
    self.pairs = pairs and pairs.copy() or {}
    self.pairs.update(
        make_pairs(pairs, motifset, self.gaps, self.degenerates))
    self.mw_calculator = mw_calculator

    # add lowercase characters, if we're doing that
    # (runs before the caches below are built from these attributes)
    if add_lower:
        self._add_lowercase()

    # cache various other data that make the calculations faster
    self._make_all()
    self._make_comp_table()

    # a gap can be a true gap char or a degenerate character, typically '?'
    # we therefore want to ensure consistent treatment across the definition
    # of characters as either gap or degenerate
    self.gap_string = "".join(self.gaps)
    # gap characters that are not also keys of self.degenerates
    strict_gap = "".join(set(self.gap_string) - set(self.degenerates))
    self.strip_degenerate = FunctionWrapper(
        KeepChars(strict_gap + "".join(self.alphabet)))
    self.strip_bad = FunctionWrapper(KeepChars("".join(self.All)))
    # `-` binds tighter than `^`: this is alphabet ^ (degenerates - gaps)
    to_keep = set(self.alphabet) ^ set(self.degenerates) - set(self.gaps)
    self.strip_bad_and_gaps = FunctionWrapper(KeepChars("".join(to_keep)))

    # make inverse degenerates from degenerates
    # ensure that lowercase versions also exist if appropriate
    inv_degens = {}
    for key, val in list(self.degenerates.items()):
        inv_degens[frozenset(val)] = key.upper()
        if add_lower:
            inv_degens[frozenset("".join(val).lower())] = key.lower()
    # alphabet members and gaps are written afterwards, so they replace any
    # degenerate entry that has the same frozenset key
    for m in self.alphabet:
        inv_degens[frozenset(m)] = m
        if add_lower:
            inv_degens[frozenset("".join(m).lower())] = m.lower()
    for m in self.gaps:
        inv_degens[frozenset(m)] = m
    self.inverse_degenerates = inv_degens

    # set array type for modeling alphabets
    try:
        self.array_type = self.alphabet.array_type
    except AttributeError:
        # the alphabet object does not define array_type
        self.array_type = None

    # set modeling sequence
    self._make_array_seq = array_seq_constructor

    # fall back to a mapping that yields "black" for any character
    self._colors = colors or defaultdict(_DefaultValue("black"))
def _with(self, motifs):
    """Returns the result of Alphabet._with(self, motifs), recast to this
    instance's class and carrying over this instance's ``_gc``.
    """
    derived = Alphabet._with(self, motifs)
    # recast first, then copy the attribute across
    derived.__class__ = type(self)
    derived._gc = self._gc
    return derived