示例#1
0
文件: moltype.py 项目: mr-c/cogent3
class MolType(object):
    """MolType: Handles operations that depend on the sequence type (e.g. DNA).

    The MolType knows how to connect alphabets, sequences, alignments, and so
    forth, and how to disambiguate ambiguous symbols and perform base
    pairing (where appropriate).

    WARNING: Objects passed to a MolType become associated with that MolType,
    i.e. if you pass ProteinSequence to a new MolType you make up, all
    ProteinSequences will now be associated with the new MolType. This may
    not be what you expect. Use preserve_existing_moltypes=True if you
    don't want to reset the moltype.
    """
    def __init__(
        self,
        motifset,
        gap=IUPAC_gap,
        missing=IUPAC_missing,
        gaps=None,
        seq_constructor=None,
        ambiguities=None,
        label=None,
        complements=None,
        pairs=None,
        mw_calculator=None,
        add_lower=False,
        preserve_existing_moltypes=False,
        make_alphabet_group=False,
        array_seq_constructor=None,
        colors=None,
    ):
        """Returns a new MolType object. Note that the parameters are in flux.

        Parameters
        ----------
        motifset
            Alphabet or sequence of items in the default
            alphabet. Does not include degenerates.
        gap
            default gap symbol
        missing
            symbol for missing data
        gaps
            any other symbols that should be treated as gaps (doesn't have
            to include gap or missing; they will be silently added)
        seq_constructor
            Class for constructing sequences. If omitted, ``"".join`` is
            used. Unless preserve_existing_moltypes is True, its ``moltype``
            attribute is set to this instance.
        ambiguities
            dict of char:tuple, doesn't include gaps (the gap and missing
            symbols are added by the constructor).
        label
            text label, returned by gettype() and used for serialisation
            and CSS class names.
        complements
            dict of symbol:symbol showing how the non-degenerate
            single characters complement each other. Used for constructing
            on the fly the complement table, incl. support for must_pair and
            can_pair.
        pairs
            dict in which keys are pairs of symbols that can pair
            with each other, values are True (must pair) or False (might
            pair). Currently, the meaning of GU pairs as 'weak' is conflated
            with the meaning of degenerate symbol pairs (which might pair
            with each other but don't necessarily, depending on how the
            symbol is resolved). This should be refactored.
        mw_calculator
            f(seq, delta) -> molecular weight.
        add_lower
            if True (default: False) adds the lowercase versions of
            everything into the alphabet. Slated for deletion.
        preserve_existing_moltypes
            if True (default: False), does not
            set the MolType of seq_constructor to self.
        make_alphabet_group
            if True, makes an AlphabetGroup relating
            the various alphabets to one another.
        array_seq_constructor
            sequence type for array sequence
        colors
            dict mapping moltype characters to colors for display

        Note on "degenerates" versus "ambiguities": self.degenerates contains
        _only_ mappings for degenerate symbols, whereas self.ambiguities
        contains mappings for both degenerate and non-degenerate symbols.
        Sometimes you want one, sometimes the other, so both are provided.
        """
        # NOTE: this must remain the first statement, so that locals() holds
        # only the constructor arguments (used for pickling).
        self._serialisable = {k: v for k, v in locals().items() if k != "self"}
        self.gap = gap
        self.missing = missing
        core_gaps = frozenset([gap, missing])
        self.gaps = core_gaps.union(frozenset(gaps)) if gaps else core_gaps
        self.label = label

        # set the sequence constructor
        if seq_constructor is None:
            seq_constructor = "".join  # safe default string constructor
        elif not preserve_existing_moltypes:
            seq_constructor.moltype = self
        self._make_seq = seq_constructor

        # set the ambiguities: user supplied entries may override the
        # gap/missing defaults, plain motifs always map onto themselves
        ambigs = {
            self.missing: tuple(motifset) + (self.gap, ),
            self.gap: (self.gap, ),
        }
        if ambiguities:
            ambigs.update(ambiguities)
        for motif in motifset:
            ambigs[motif] = (motif, )
        self.ambiguities = ambigs

        # set complements -- must set before we make the alphabet group
        self.complements = complements or {}

        if make_alphabet_group:  # note: must use _original_ ambiguities here
            self.alphabets = AlphabetGroup(motifset, ambiguities, moltype=self)
            self.alphabet = self.alphabets.base
        elif isinstance(motifset, Enumeration):
            self.alphabet = motifset
        elif max(len(motif) for motif in motifset) == 1:
            self.alphabet = CharAlphabet(motifset, moltype=self)
        else:
            self.alphabet = Alphabet(motifset, moltype=self)

        # set the other properties
        self.degenerates = ambiguities.copy() if ambiguities else {}
        self.degenerates[self.missing] = "".join(motifset) + self.gap
        self.matches = make_matches(motifset, self.gaps, self.degenerates)
        self.pairs = pairs.copy() if pairs else {}
        self.pairs.update(
            make_pairs(pairs, motifset, self.gaps, self.degenerates))
        self.mw_calculator = mw_calculator

        # add lowercase characters, if we're doing that
        if add_lower:
            self._add_lowercase()
        # cache various other data that make the calculations faster
        self._make_all()
        self._make_comp_table()
        # a gap can be a true gap char or a degenerate character, typically '?'
        # we therefore want to ensure consistent treatment across the definition
        # of characters as either gap or degenerate
        self.gap_string = "".join(self.gaps)
        strict_gap = "".join(set(self.gap_string) - set(self.degenerates))
        self.strip_degenerate = FunctionWrapper(
            KeepChars(strict_gap + "".join(self.alphabet)))
        self.strip_bad = FunctionWrapper(KeepChars("".join(self.All)))
        # '-' binds tighter than '^'; the parentheses only make the existing
        # evaluation order explicit
        to_keep = set(self.alphabet) ^ (set(self.degenerates) - set(self.gaps))
        self.strip_bad_and_gaps = FunctionWrapper(KeepChars("".join(to_keep)))

        # make inverse degenerates from degenerates
        # ensure that lowercase versions also exist if appropriate
        # (later assignments deliberately override earlier ones)
        inv_degens = {}
        for key, val in self.degenerates.items():
            inv_degens[frozenset(val)] = key.upper()
            if add_lower:
                inv_degens[frozenset("".join(val).lower())] = key.lower()
        for motif in self.alphabet:
            inv_degens[frozenset(motif)] = motif
            if add_lower:
                inv_degens[frozenset("".join(motif).lower())] = motif.lower()
        for gap_char in self.gaps:
            inv_degens[frozenset(gap_char)] = gap_char
        self.inverse_degenerates = inv_degens

        # set array type for modeling alphabets
        try:
            self.array_type = self.alphabet.array_type
        except AttributeError:
            self.array_type = None

        # set modeling sequence
        self._make_array_seq = array_seq_constructor

        self._colors = colors or defaultdict(_DefaultValue("black"))

    def __repr__(self):
        """String representation of MolType.

        WARNING: This doesn't allow you to reconstruct the object in its present
        incarnation.
        """
        return "MolType(%s)" % (self.alphabet, )

    def __getnewargs_ex__(self, *args, **kw):
        """Returns (args, kwargs) for pickling: no positional arguments and
        the constructor arguments captured at creation as keywords."""
        data = self.to_rich_dict(for_pickle=True)
        return (), data

    def to_rich_dict(self, for_pickle=False):
        """Returns a dict describing this moltype.

        Parameters
        ----------
        for_pickle
            if True, returns a copy of the constructor arguments. Otherwise
            returns only the object provenance, the label and the version,
            since reconstruction relies on the label.
        """
        if for_pickle:
            return self._serialisable.copy()

        # we rely on reconstruction from label
        data = dict(type=get_object_provenance(self), moltype=self.label)
        data["version"] = __version__
        return data

    def to_json(self):
        """Returns the non-pickle rich dict as a json formatted string."""
        data = self.to_rich_dict(for_pickle=False)
        return json.dumps(data)

    def to_regex(self, seq):
        """Returns a regex pattern with ambiguities expanded to a character set.

        Raises
        ------
        ValueError
            if seq is not valid for this moltype
        """
        if not self.is_valid(seq):
            raise ValueError(f"'{seq}' is invalid for this moltype")

        degen_indices = self.get_degenerate_positions(sequence=seq,
                                                      include_gap=False)
        seq = list(seq)  # seq can now be modified
        for index in degen_indices:
            expanded = self.ambiguities[seq[index]]
            seq[index] = f"[{''.join(expanded)}]"
        return "".join(seq)

    def gettype(self):
        """Returns type, e.g. 'dna', 'rna', 'protein'. Delete?"""
        return self.label

    def make_seq(self, seq, name=None, **kwargs):
        """Returns sequence of correct type."""
        return self._make_seq(seq, name, **kwargs)

    def make_array_seq(self, seq, name=None, **kwargs):
        """
        creates an array sequence

        Parameters
        ----------
        seq
            characters or array
        name : str
        kwargs
            keyword arguments for the ArraySequence constructor. If
            'alphabet' is not among them (or is None), the degen_gapped
            alphabet is used when an alphabet group exists, otherwise
            self.alphabet.

        Returns
        -------
        ArraySequence
        """
        alphabet = kwargs.pop("alphabet", None)
        if alphabet is None:
            if hasattr(self, "alphabets"):
                alphabet = self.alphabets.degen_gapped
            else:
                alphabet = self.alphabet
        return self._make_array_seq(seq,
                                    alphabet=alphabet,
                                    name=name,
                                    **kwargs)

    def verify_sequence(self, seq, gaps_allowed=True, wildcards_allowed=True):
        """Checks whether sequence is valid on the default alphabet.

        Has special-case handling for gaps and wild-cards. This mechanism is
        probably useful to have in parallel with the validation routines that
        check specifically whether the sequence has gaps, degenerate symbols,
        etc., or that explicitly take an alphabet as input.

        Raises
        ------
        AlphabetError
            with the first offending motif, if one is found
        """
        alpha = frozenset(self.ambiguities)
        if gaps_allowed:
            alpha = alpha.union(self.gaps)
        if wildcards_allowed:
            alpha = alpha.union(self.missing)
        try:
            nonalpha = re.compile("[^%s]" % re.escape("".join(alpha)))
            badchar = nonalpha.search(seq)
            if badchar:
                motif = badchar.group()
                raise AlphabetError(motif)
        except TypeError:  # not alphabetic sequence: try slow method
            for motif in seq:
                if motif not in alpha:
                    raise AlphabetError(motif)

    def is_ambiguity(self, querymotif):
        """Return True if querymotif is an ambiguity character in alphabet.

        Parameters
        ----------
        querymotif
            the motif being queried.

        """

        return len(self.ambiguities[querymotif]) > 1

    def _what_ambiguity(self, motifs):
        """The code that represents all of 'motifs', and minimal others.

        Falls back to self.missing when no smaller code covers all motifs.

        Does this duplicate DegenerateFromSequence directly?
        """
        most_specific = len(self.alphabet) + 1
        result = self.missing
        for code, expansion in self.ambiguities.items():
            covers_all = all(c in expansion for c in motifs)
            # strict '<' keeps the first code found among equally small ones
            if covers_all and len(expansion) < most_specific:
                most_specific = len(expansion)
                result = code
        return result

    def what_ambiguity(self, motifs):
        """The code that represents all of 'motifs', and minimal others.

        Results are cached per frozenset of motifs on the instance.

        Does this duplicate DegenerateFromSequence directly?
        """
        if not hasattr(self, "_reverse_ambiguities"):
            self._reverse_ambiguities = {}
        motifs = frozenset(motifs)
        if motifs not in self._reverse_ambiguities:
            self._reverse_ambiguities[motifs] = self._what_ambiguity(motifs)
        return self._reverse_ambiguities[motifs]

    def _add_lowercase(self):
        """Adds lowercase versions of keys and vals to each internal dict."""
        for name in [
                "alphabet",
                "degenerates",
                "gaps",
                "complements",
                "pairs",
                "matches",
        ]:
            curr = getattr(self, name)
            # temp hack to get around re-ordering
            if isinstance(curr, Alphabet):
                curr = tuple(curr)
            new = add_lowercase(curr)
            setattr(self, name, new)

    def _make_all(self):
        """Sets self.All, which contains all the symbols self knows about.

        Note that the value of items in self.All will be the string containing
        the possibly degenerate set of symbols that the items expand to.
        """
        symbols = {}
        for motif in self.alphabet:
            symbols[motif] = motif
        for key, val in self.degenerates.items():
            symbols[key] = val
        for gap_char in self.gaps:
            symbols[gap_char] = gap_char
        self.All = symbols

    def _make_comp_table(self):
        """Sets self.ComplementTable, which maps items onto their complements.

        Note: self.ComplementTable is only set if self.complements exists.
        """
        if self.complements:
            self.ComplementTable = maketrans(
                "".join(list(self.complements.keys())),
                "".join(list(self.complements.values())),
            )

    def complement(self, item):
        """Returns complement of item, using data from self.complements.

        Always tries to return same type as item: if item looks like a dict,
        will return list of keys.

        Raises
        ------
        TypeError
            if this moltype has no complements
        """
        if not self.complements:
            raise TypeError(
                "Tried to complement sequence using alphabet without complements."
            )
        try:
            return item.translate(self.ComplementTable)
        except (AttributeError, TypeError):
            # no usable translate(): complement item by item, leaving
            # symbols without a complement unchanged
            item = iterable(item)
            get = self.complements.get
            return item.__class__([get(i, i) for i in item])

    def rc(self, item):
        """Returns reverse complement of item w/ data from self.complements.

        Always returns same type as input.
        """
        comp = list(self.complement(item))
        comp.reverse()
        if isinstance(item, str):
            return item.__class__("".join(comp))
        return item.__class__(comp)

    def strand_symmetric_motifs(self, motif_length=1):
        """Returns the set of sorted (motif, complement) pairs.

        Raises
        ------
        TypeError
            if this moltype has no pairs
        """
        if not self.pairs:
            raise TypeError("moltype must be DNA or RNA")

        motif_set = self.alphabet.get_word_alphabet(word_length=motif_length)
        return {
            tuple(sorted([motif, self.complement(motif)]))
            for motif in motif_set
        }

    def __contains__(self, item):
        """A MolType contains every character it knows about."""
        return item in self.All

    def __iter__(self):
        """A MolType iterates only over the characters in its Alphabet.."""
        return iter(self.alphabet)

    def is_gap(self, char):
        """Returns True if char is a gap."""
        return char in self.gaps

    def is_gapped(self, sequence):
        """Returns True if sequence contains gaps."""
        return self.first_gap(sequence) is not None

    def is_degenerate(self, sequence):
        """Returns True if sequence contains degenerate characters."""
        return self.first_degenerate(sequence) is not None

    def is_valid(self, sequence):
        """Returns True if sequence contains no items that are not in self.

        Returns False if the sequence cannot be checked at all (e.g. it is
        not iterable).
        """
        try:
            return self.first_invalid(sequence) is None
        except Exception:  # do not swallow KeyboardInterrupt / SystemExit
            return False

    def is_strict(self, sequence):
        """Returns True if sequence contains only items in self.alphabet.

        Returns False if the sequence cannot be checked at all.
        """
        try:
            return (len(sequence)
                    == 0) or (self.first_non_strict(sequence) is None)
        except Exception:  # do not swallow KeyboardInterrupt / SystemExit
            return False

    def valid_on_alphabet(self, sequence, alphabet=None):
        """Returns True if sequence contains only items in alphabet.

        alphabet can actually be anything that implements __contains__.
        Defaults to self.alphabet if not supplied.
        """
        if alphabet is None:
            alphabet = self.alphabet
        return all(item in alphabet for item in sequence)

    def first_not_in_alphabet(self, sequence, alphabet=None):
        """Returns index of first item not in alphabet, or None.

        Defaults to self.alphabet if alphabet not supplied.
        """
        if alphabet is None:
            alphabet = self.alphabet
        for index, item in enumerate(sequence):
            if item not in alphabet:
                return index
        return None

    def first_gap(self, sequence):
        """Returns the index of the first gap in the sequence, or None."""
        gap = self.gaps
        for i, s in enumerate(sequence):
            if s in gap:
                return i
        return None

    def first_degenerate(self, sequence):
        """Returns the index of first degenerate symbol in sequence, or None."""
        degen = self.degenerates
        for i, s in enumerate(sequence):
            if s in degen:
                return i
        return None

    def first_invalid(self, sequence):
        """Returns the index of first invalid symbol in sequence, or None."""
        known = self.All
        for i, s in enumerate(sequence):
            if s not in known:
                return i
        return None

    def first_non_strict(self, sequence):
        """Returns the index of first non-strict symbol in sequence, or None."""
        monomers = self.alphabet
        for i, s in enumerate(sequence):
            if s not in monomers:
                return i
        return None

    def disambiguate(self, sequence, method="strip"):
        """Returns a non-degenerate sequence from a degenerate one.

        method can be 'strip' (deletes any characters not in monomers or gaps)
        or 'random'(assigns the possibilities at random, using equal
        frequencies).

        Raises
        ------
        NotImplementedError
            for any other method
        """
        if method == "strip":
            try:
                return sequence.__class__(self.strip_degenerate(sequence))
            except Exception:
                # fast path failed: filter out the degenerate symbols
                # item by item
                ambi = self.degenerates
                return sequence.__class__(
                    [item for item in sequence if item not in ambi])

        elif method == "random":
            degen = self.degenerates
            result = [
                choice(degen[item]) if item in degen else item
                for item in sequence
            ]
            if isinstance(sequence, str):
                return sequence.__class__("".join(result))
            return sequence.__class__(result)
        else:
            raise NotImplementedError("Got unknown method %s" % method)

    def degap(self, sequence):
        """Deletes all gap characters from sequence."""
        try:
            trans = {ord(gap_char): None for gap_char in self.gaps}
            return sequence.__class__(sequence.translate(trans))
        except AttributeError:
            # sequence has no translate(): filter item by item
            gaps = self.gaps
            return sequence.__class__(
                [item for item in sequence if item not in gaps])

    def gap_indices(self, sequence):
        """Returns list of indices of all gaps in the sequence, or []."""
        gaps = self.gaps
        return [i for i, s in enumerate(sequence) if s in gaps]

    def gap_vector(self, sequence):
        """Returns list of bool indicating gap or non-gap in sequence."""
        return list(map(self.is_gap, sequence))

    def gap_maps(self, sequence):
        """Returns tuple containing dicts mapping between gapped and ungapped.

        First element is a dict such that d[ungapped_coord] = gapped_coord.
        Second element is a dict such that d[gapped_coord] = ungapped_coord.

        Note that the dicts will be invalid if the sequence changes after the
        dicts are made.

        The gaps themselves are not in the dictionary, so use d.get() or test
        'if pos in d' to avoid KeyErrors if looking up all elements in a gapped
        sequence.
        """
        ungapped = {}
        gapped = {}
        num_gaps = 0
        for i, is_gap in enumerate(self.gap_vector(sequence)):
            if is_gap:
                num_gaps += 1
            else:
                ungapped[i] = i - num_gaps
                gapped[i - num_gaps] = i
        return gapped, ungapped

    def count_gaps(self, sequence):
        """Counts the gaps in the specified sequence."""
        gaps = self.gaps
        return sum(1 for s in sequence if s in gaps)

    def get_degenerate_positions(self, sequence, include_gap=True):
        """Returns indices matching degenerate characters.

        Parameters
        ----------
        sequence
            the sequence to scan
        include_gap
            if True, self.gap also counts as degenerate
        """
        degen = list(self.degenerates)
        if include_gap:
            degen.append(self.gap)

        return [i for i, c in enumerate(sequence) if c in degen]

    def count_degenerate(self, sequence):
        """Counts the degenerate bases in the specified sequence."""
        degen = self.degenerates
        return sum(1 for s in sequence if s in degen)

    def possibilities(self, sequence):
        """Counts number of possible sequences matching the sequence.

        Uses self.degenerates to decide how many possibilites there are at
        each position in the sequence.
        """
        degen = self.degenerates
        count = 1
        for s in sequence:
            if s in degen:
                count *= len(degen[s])
        return count

    def mw(self, sequence, method="random", delta=None):
        """Returns the molecular weight of the sequence.

        If the sequence is ambiguous, uses method (random or strip) to
        disambiguate the sequence.

        if delta is present, uses it instead of the standard weight adjustment.

        An empty sequence has weight 0.
        """
        if not sequence:
            return 0
        try:
            return self.mw_calculator(sequence, delta)
        except KeyError:  # assume sequence was ambiguous
            return self.mw_calculator(self.disambiguate(sequence, method),
                                      delta)

    def can_match(self, first, second):
        """Returns True if every pos in 1st could match same pos in 2nd.

        Truncates at length of shorter sequence.
        gaps are only allowed to match other gaps.
        """
        m = self.matches
        return all(pair in m for pair in zip(first, second))

    def can_mismatch(self, first, second):
        """Returns True if any position in 1st could cause a mismatch with 2nd.

        Truncates at length of shorter sequence.
        gaps are always counted as matches.
        """
        m = self.matches
        if not first or not second:
            return False

        return any(not m.get(pair, None) for pair in zip(first, second))

    def must_match(self, first, second):
        """Returns True if all positions in 1st must match positions in second."""
        return not self.can_mismatch(first, second)

    def can_pair(self, first, second):
        """Returns True if first and second could pair.

        Pairing occurs in reverse order, i.e. last position of second with
        first position of first, etc.

        Truncates at length of shorter sequence.
        gaps are only allowed to pair with other gaps, and are counted as 'weak'
        (same category as GU and degenerate pairs).

        NOTE: second must be able to be reverse
        """
        p = self.pairs
        sec = list(second)
        sec.reverse()
        return all(pair in p for pair in zip(first, sec))

    def can_mispair(self, first, second):
        """Returns True if any position in 1st could mispair with 2nd.

        Pairing occurs in reverse order, i.e. last position of second with
        first position of first, etc.

        Truncates at length of shorter sequence.
        gaps are always counted as possible mispairs, as are weak pairs like GU.
        """
        p = self.pairs
        if not first or not second:
            return False

        sec = list(second)
        sec.reverse()
        return any(not p.get(pair, None) for pair in zip(first, sec))

    def must_pair(self, first, second):
        """Returns True if all positions in 1st must pair with second.

        Pairing occurs in reverse order, i.e. last position of second with
        first position of first, etc.
        """
        return not self.can_mispair(first, second)

    def degenerate_from_seq(self, sequence):
        """Returns least degenerate symbol corresponding to chars in sequence.

        First tries to look up in self.inverse_degenerates. Then disambiguates
        and tries to look up in self.inverse_degenerates. Then tries converting
        the case (tries uppercase before lowercase). Finally looks for the
        smallest known symbol set containing the symbols.

        Raises
        ------
        TypeError
            if conversion fails
        KeyError
            if sequence holds a symbol that is not a key of self.All
        """
        symbols = frozenset(sequence)
        # check if symbols are already known
        inv_degens = self.inverse_degenerates
        result = inv_degens.get(symbols, None)
        if result:
            return result
        # then, try converting the symbols
        degens = self.All
        converted = set()
        for sym in symbols:
            for char in degens[sym]:
                converted.add(char)
        symbols = frozenset(converted)
        result = inv_degens.get(symbols, None)
        if result:
            return result
        # then, try converting case
        symbols = frozenset([s.upper() for s in symbols])
        result = inv_degens.get(symbols, None)
        if result:
            return result
        symbols = frozenset([s.lower() for s in symbols])
        result = inv_degens.get(symbols, None)
        if result:
            return result
        # finally, try to find the minimal subset containing the symbols
        symbols = frozenset([s.upper() for s in symbols])
        smallest = None
        for candidate in inv_degens:
            if not symbols.issubset(candidate):
                continue
            # '<=' so that, among equally small sets, the last one wins
            if smallest is None or len(candidate) <= len(smallest):
                smallest = candidate
        if smallest is not None:  # found at least some matches
            return inv_degens[smallest]

        # if we got here, nothing worked
        raise TypeError("Cannot find degenerate char for symbols: %s" %
                        symbols)

    def get_css_style(self,
                      colors=None,
                      font_size=12,
                      font_family="Lucida Console"):
        """Returns list of CSS classes and {character: <CSS class name>, ...}

        Parameters
        ----------
        colors
            mapping of character to color, defaults to the colors given
            to the constructor
        font_size
            in points
        font_family
            name of a monospace font

        """
        colors = colors or self._colors
        # !important required to stop some browsers over-riding the style sheet ...!!
        template = ('.%s_%s{font-family: "%s",monospace !important; '
                    "font-size: %dpt !important; color: %s; }")
        label = self.label or ""
        styles = _style_defaults[label].copy()
        styles.update({
            c: "_".join([c, label])
            for c in list(self.alphabet) + ["terminal_ambig"]
        })

        css = [
            template % (char, label, font_family, font_size, colors[char])
            for char in list(styles) + ["ambig"]
        ]

        return css, styles
示例#2
0
文件: moltype.py 项目: mr-c/cogent3
    def __init__(
        self,
        motifset,
        gap=IUPAC_gap,
        missing=IUPAC_missing,
        gaps=None,
        seq_constructor=None,
        ambiguities=None,
        label=None,
        complements=None,
        pairs=None,
        mw_calculator=None,
        add_lower=False,
        preserve_existing_moltypes=False,
        make_alphabet_group=False,
        array_seq_constructor=None,
        colors=None,
    ):
        """Builds a new MolType. Note that the parameters are in flux.

        Parameters
        ----------
        motifset
            Alphabet or sequence of the items making up the default
            alphabet. Degenerate symbols are not part of it.
        gap
            the default gap symbol
        missing
            the symbol standing for missing data
        gaps
            further symbols to be treated as gaps; gap and missing are
            always included and need not be listed
        seq_constructor
            class used for constructing sequences; ``"".join`` is used
            when omitted
        ambiguities
            dict of char:tuple, without the gap symbols (gap and missing
            are added by the constructor)
        label
            text label of the moltype
        complements
            dict of symbol:symbol stating how the non-degenerate single
            characters complement each other. The complement table is
            built from it.
        pairs
            dict whose keys are pairs of symbols that can pair with each
            other and whose values are True (must pair) or False (might
            pair). The 'weak' meaning of GU pairs is currently conflated
            with that of degenerate symbol pairs (which may or may not
            pair, depending on how the symbol is resolved). This should
            be refactored.
        mw_calculator
            f(seq) -> molecular weight.
        add_lower
            if True (default: False), lowercase versions of everything
            are added to the alphabet. Slated for deletion.
        preserve_existing_moltypes
            if True (default: False), the moltype attribute of
            seq_constructor is left alone instead of being set to self.
        make_alphabet_group
            if True, an AlphabetGroup relating the various alphabets to
            one another is created.
        array_seq_constructor
            sequence type for array sequence
        colors
            dict mapping moltype characters to colors for display

        Note on "degenerates" versus "ambiguities": self.degenerates holds
        mappings for degenerate symbols _only_, while self.ambiguities holds
        mappings for degenerate and non-degenerate symbols alike. Both are
        provided because each is useful in different situations.
        """
        # NOTE: keep this as the very first statement, locals() must hold
        # nothing but the constructor arguments at this point
        self._serialisable = {k: v for k, v in locals().items() if k != "self"}
        self.gap = gap
        self.missing = missing
        self.label = label
        base_gaps = frozenset([gap, missing])
        self.gaps = base_gaps.union(frozenset(gaps)) if gaps else base_gaps

        # sequence constructor, plain string joining being the safe default
        if seq_constructor is not None:
            if not preserve_existing_moltypes:
                seq_constructor.moltype = self
            self._make_seq = seq_constructor
        else:
            self._make_seq = "".join

        # ambiguities: defaults first, then user entries, then every motif
        # mapped onto itself
        expansions = {
            self.missing: tuple(motifset) + (self.gap, ),
            self.gap: (self.gap, ),
        }
        if ambiguities:
            expansions.update(ambiguities)
        expansions.update((motif, (motif, )) for motif in motifset)
        self.ambiguities = expansions

        # complements have to exist before the alphabet group is made
        self.complements = complements or {}

        if make_alphabet_group:
            # the _original_ ambiguities must be used here
            self.alphabets = AlphabetGroup(motifset, ambiguities, moltype=self)
            self.alphabet = self.alphabets.base
        elif isinstance(motifset, Enumeration):
            self.alphabet = motifset
        elif max(len(motif) for motif in motifset) == 1:
            self.alphabet = CharAlphabet(motifset, moltype=self)
        else:
            self.alphabet = Alphabet(motifset, moltype=self)

        # remaining properties
        self.degenerates = ambiguities.copy() if ambiguities else {}
        self.degenerates[self.missing] = "".join(motifset) + self.gap
        self.matches = make_matches(motifset, self.gaps, self.degenerates)
        self.pairs = pairs.copy() if pairs else {}
        derived_pairs = make_pairs(pairs, motifset, self.gaps,
                                   self.degenerates)
        self.pairs.update(derived_pairs)
        self.mw_calculator = mw_calculator

        # lowercase characters are added on request only
        if add_lower:
            self._add_lowercase()
        # caches that speed up later calculations
        self._make_all()
        self._make_comp_table()
        # a gap is either a true gap char or a degenerate character
        # (typically '?'), so characters defined as gap and as degenerate
        # need to be treated consistently
        self.gap_string = "".join(self.gaps)
        non_degenerate_gaps = set(self.gap_string) - set(self.degenerates)
        strict_chars = "".join(non_degenerate_gaps) + "".join(self.alphabet)
        self.strip_degenerate = FunctionWrapper(KeepChars(strict_chars))
        self.strip_bad = FunctionWrapper(KeepChars("".join(self.All)))
        ungapped_degenerates = set(self.degenerates) - set(self.gaps)
        to_keep = set(self.alphabet) ^ ungapped_degenerates
        self.strip_bad_and_gaps = FunctionWrapper(KeepChars("".join(to_keep)))

        # inverse degenerates, with lowercase versions where appropriate;
        # later entries override earlier ones
        inverse = {}
        for symbol, expansion in self.degenerates.items():
            inverse[frozenset(expansion)] = symbol.upper()
            if add_lower:
                inverse[frozenset("".join(expansion).lower())] = symbol.lower()
        for motif in self.alphabet:
            inverse[frozenset(motif)] = motif
            if add_lower:
                inverse[frozenset("".join(motif).lower())] = motif.lower()
        for gap_char in self.gaps:
            inverse[frozenset(gap_char)] = gap_char
        self.inverse_degenerates = inverse

        # array type for modeling alphabets, None when the alphabet has none
        self.array_type = getattr(self.alphabet, "array_type", None)

        # modeling sequence
        self._make_array_seq = array_seq_constructor

        self._colors = colors or defaultdict(_DefaultValue("black"))
示例#3
0
文件: moltype.py 项目: mr-c/cogent3
 def _with(self, motifs):
     """Returns what Alphabet._with builds for motifs, re-classed to the
     type of self and carrying over self._gc."""
     derived = Alphabet._with(self, motifs)
     # the base implementation result is re-classed in place
     derived.__class__ = type(self)
     derived._gc = self._gc
     return derived