def indelizeCodons(self, aas, code="Standard"):
    """
    Return a copy of *self* where the codons corresponding to the
    amino-acids listed in *aas* are replaced by indels ("---").

    *aas* is an iterable of amino-acid symbols. They are converted to lower
    case before being compared with the amino-acids of the genetic code, so
    the values of *code* are expected to be in lower case.

    *code* is the Biopython name of the genetic code under which the codons
    have to be interpreted, or a dictionary converting from codons to
    amino-acids. Default is to use the standard genetic code. The possible
    names are the keys of `CodonTable.unambiguous_dna_by_name`.

    An AssertionError is raised if *code* is neither a string nor a dictionary.
    """
    # `str` is used rather than `types.StringType`: both are the same object
    # in Python 2, and only `str` exists in Python 3.
    if isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg
    # Ensure the amino-acids are in lowercase.
    aas = set(aa.lower() for aa in aas)
    # Build a conversion dictionary.
    # The keys are the codons, the values their "indelized" replacements:
    # "---" for the codons of the selected amino-acids, the codon itself otherwise.
    cod2indels = dict(
        (codon, "---" if aa in aas else codon) for (codon, aa) in code.items())
    # Make a copy of self.
    ali = self.dupe()
    for seq in ali.sequences:
        # Recode the sequence using the conversion dictionary built previously.
        # recode_sequence is defined in Codon_utils.py
        seq.sequence = recode_sequence(seq.sequence, cod2indels, code=code)
    return ali
def degenerateByCodonColumn(self, code="Standard", restrict_to=[]):
    """
    Return a copy of *self* where codons coding for the same amino-acid in a
    given column of the matrix are replaced by their union (i.e. degenerated),
    contrary to `degenerate` which does this regardless of the codons present,
    just using the degeneracy observed in the genetic code.

    If *restrict_to* is not empty, only those codons that code amino-acids
    listed in *restrict_to* will be degenerated. The amino-acids are converted
    to lower case before being used.

    *code* is the Biopython name of the genetic code under which degeneracy
    has to be interpreted or a dictionary converting from codons to
    amino-acids. Default is to use the standard genetic code. The possible
    names are the keys of `CodonTable.unambiguous_dna_by_name`.

    An AssertionError is raised if *code* is neither a string nor a dictionary.
    """
    if isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg
    # Ensure the amino-acids are in lowercase.
    restrict_to = set(aa.lower() for aa in restrict_to)
    # The matrix is rebuilt column-wise,
    # and then the sequences are rebuilt from these columns of codons.
    new_slices = [
        degenerate_codon_slice(cod_slice, restrict_to)
        for cod_slice in iter_codon_slices(self, code)]
    # Make a copy of self.
    ali = self.dupe()
    # Loop over the sequences: the i-th element of each codon column belongs
    # to the i-th sequence. The index must therefore run over the sequences
    # and not over the sites (the former `range(self.nChar)` bound).
    for i, seq in enumerate(ali.sequences):
        seq.sequence = CAT([str(cod_slice[i]) for cod_slice in new_slices])
    return ali
def degenerateByCodonColumn(self, code="Standard", restrict_to=[]):
    """
    Return a copy of *self* where codons coding for the same amino-acid in a
    given column of the matrix are replaced by their union (i.e. degenerated),
    contrary to `degenerate` which does this regardless of the codons present,
    just using the degeneracy observed in the genetic code.

    If *restrict_to* is not empty, only those codons that code amino-acids
    listed in *restrict_to* will be degenerated. The amino-acids are converted
    to lower case before being used.

    *code* is the Biopython name of the genetic code under which degeneracy
    has to be interpreted or a dictionary converting from codons to
    amino-acids. Default is to use the standard genetic code. The possible
    names are the keys of `CodonTable.unambiguous_dna_by_name`.

    An AssertionError is raised if *code* is neither a string nor a dictionary.
    """
    # `str` is used rather than `types.StringType`: both are the same object
    # in Python 2, and only `str` exists in Python 3.
    if isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg
    # Ensure the amino-acids are in lowercase.
    restrict_to = set(aa.lower() for aa in restrict_to)
    # The matrix is rebuilt column-wise,
    # and then the sequences are rebuilt from these columns of codons.
    new_slices = [
        degenerate_codon_slice(cod_slice, restrict_to)
        for cod_slice in iter_codon_slices(self, code)]
    # Make a copy of self.
    ali = self.dupe()
    # Loop over the sequences: the i-th element of each codon column belongs
    # to the i-th sequence. The index must therefore run over the sequences
    # and not over the sites (the former `range(self.nChar)` bound).
    for i, seq in enumerate(ali.sequences):
        seq.sequence = CAT([str(cod_slice[i]) for cod_slice in new_slices])
    return ali
def indelizeCodons(self, aas, code="Standard"):
    """
    Return a copy of *self* where the codons corresponding to the
    amino-acids listed in *aas* are replaced by indels ("---").

    *aas* is an iterable of amino-acid symbols. They are converted to lower
    case before being compared with the amino-acids of the genetic code, so
    the values of *code* are expected to be in lower case.

    *code* is the Biopython name of the genetic code under which the codons
    have to be interpreted, or a dictionary converting from codons to
    amino-acids. Default is to use the standard genetic code. The possible
    names are the keys of `CodonTable.unambiguous_dna_by_name`.

    An AssertionError is raised if *code* is neither a string nor a dictionary.
    """
    if isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg
    # Ensure the amino-acids are in lowercase.
    aas = set(aa.lower() for aa in aas)
    # Build a conversion dictionary.
    # The keys are the codons, the values their "indelized" replacements:
    # "---" for the codons of the selected amino-acids, the codon itself otherwise.
    cod2indels = dict(
        (codon, "---" if aa in aas else codon) for (codon, aa) in code.items())
    # Make a copy of self.
    ali = self.dupe()
    for seq in ali.sequences:
        # Recode the sequence using the conversion dictionary built previously.
        # recode_sequence is defined in Codon_utils.py
        seq.sequence = recode_sequence(seq.sequence, cod2indels, code=code)
    return ali
def getConstantAAMask(self, transl_table=1, code=None, restrict_to=[]):
    """
    Build a mask flagging the triplets of sites where only one amino-acid
    is coded.

    Each codon column contributes "111" to the mask when
    `codon_slice_is_constant_aa` reports it as constant, and "000" otherwise.
    The mask is intended to be used for submatrix extraction with
    :meth:`Alignment.getSubsetUsingMask` (with the option *inverse=True*, to
    get the non-constant sites).

    If *restrict_to* is not empty, only those sites that are constant and code
    for one of the amino-acids whose one-letter code is in *restrict_to* will
    be considered constant.

    The matrix is expected to start at a first codon position and stop at a
    third codon position.

    *transl_table* is an integer used to determine under which genetic code
    the codons are to be interpreted. The default value of 1 corresponds to
    the standard genetic code. Other values can be found in p4.GeneticCode.py

    Alternatively, the genetic code can be provided through the parameter
    *code*, in which case the value of *transl_table* is ignored.
    The parameter *code* can take two types of values:

    1) A string naming the code to use, as defined in Biopython's
       `CodonTable.unambiguous_dna_by_name.keys()`
    2) A dictionary whose keys are codons, and whose values are the
       corresponding amino-acids. All triplets present in the matrix should
       also be present in the dictionary, except triplets of indels. Codons
       and the corresponding amino-acids are expected to be in lower case.

    An AssertionError is raised if *code* is neither None, a string nor a
    dictionary.
    """
    # NOTE(review): context label, not used further in this function.
    gm = ["Alignment.getConstantAAMask()"]

    # Resolve the genetic code into a codon -> amino-acid dictionary.
    if code is None:
        code = Code(transl_table).code
    elif isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    else:
        assert isinstance(code, dict), (
            "code must be a dictionary, or a string naming the code in Biopython.")

    # One three-character chunk per codon column.
    chunks = []
    for codon_column in self.iter_codon_slices(code):
        if codon_slice_is_constant_aa(codon_column, restrict_to):
            chunks.append("111")
        else:
            chunks.append("000")
    return CAT(chunks)
def getConstantAAMask(self, transl_table=1, code=None, restrict_to=[]):
    """
    Return a mask corresponding to the triplets of sites where only one
    amino-acid is coded.

    Each codon column contributes "111" to the mask when
    `codon_slice_is_constant_aa` reports it as constant, and "000" otherwise.
    This is intended to be used for submatrix extraction using
    :meth:`Alignment.getSubsetUsingMask` (with the option *inverse=True*, to
    get the non-constant sites).

    If *restrict_to* is not empty, only those sites that are constant and code
    for one of the amino-acids the one-letter code of which is in *restrict_to*
    will be considered constant.

    The matrix is expected to start at a first codon position and stop at a
    third codon position.

    *transl_table* is an integer used to determine under which genetic code
    the codons are to be interpreted. The default value of 1 corresponds to
    the standard genetic code. Other values can be found in p4.GeneticCode.py

    Alternatively, the genetic code can be provided through the parameter
    *code*. If such a code is provided, the value of *transl_table* is ignored.
    The parameter *code* can take two types of values:

    1) It can be a string naming the code to use, as defined in Biopython's
       `CodonTable.unambiguous_dna_by_name.keys()`
    2) It can be a dictionary whose keys are codons, and the values are the
       corresponding amino-acids. All triplets present in the matrix should
       also be present in the dictionary, except triplets of indels. Codons
       and the corresponding amino-acids are expected to be in lower case.

    An AssertionError is raised if *code* is neither None, a string nor a
    dictionary.
    """
    if code is None:
        code = Code(transl_table).code
    # `str` is used rather than `types.StringType`: both are the same object
    # in Python 2, and only `str` exists in Python 3.
    elif isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg

    # One three-character chunk per codon column.
    return CAT([
        "111" if codon_slice_is_constant_aa(c_slice, restrict_to) else "000"
        for c_slice in self.iter_codon_slices(code)])
def degenerate(self, code="Standard", positions=[1, 2, 3], restrict_to=[], ignore=[], sub_code=None):
    """
    Return a copy of *self* where the codons are replaced with degenerate
    versions.

    If *restrict_to* is not empty, only those codons that code amino-acids
    listed in *restrict_to* will be degenerated.
    If *ignore* is not empty, those codons that code amino-acids listed in
    *ignore* will not be degenerated.

    *positions* determines which codon positions are degenerated (it is passed
    on to `recode_sequence`). By default, the whole codons are degenerated
    (if there is degeneracy of course).

    *code* is the Biopython name of the genetic code under which degeneracy
    has to be interpreted or a dictionary converting from codons to
    amino-acids (all in lower case). Default is to use the standard genetic
    code. The possible names are the keys of
    `CodonTable.unambiguous_dna_by_name`.

    *sub_code*, if provided, should be a dictionary associating codons to
    (sub-)amino-acids (all in lower case). For the purpose of defining
    degeneracy groups, the codons present in *sub_code* will be considered as
    coding for the amino-acid defined there instead of the one defined by
    *code*. This can be used for instance to keep two distinct types of serine
    codons, with degeneracy only within each type. The codons still count as
    coding their original amino-acid with respect to the *restrict_to* and
    *ignore* options. The dictionary given by the caller is not modified (a
    copy is used). A warning is issued if one of the sub-amino-acids is
    identical to an amino-acid of *code*.

    An AssertionError is raised if *code* is neither a string nor a
    dictionary, or if *sub_code* is neither None nor a dictionary.
    """
    # `str` is used rather than `types.StringType`: both are the same object
    # in Python 2, and only `str` exists in Python 3.
    if isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg
    # codons belonging to different sub-groups of codons for one amino-acid
    # can be considered as coding different amino-acids
    # (sub-amino-acids of the "normal" amino-acid, for instance two types of serine)
    if sub_code is None:
        sub_code = {}
    else:
        assert isinstance(sub_code, dict), "sub_code must be a dictionary."
        # Work on a copy, otherwise there are side effects: the content of
        # sub_code would be modified in the calling context.
        sub_code = copy.copy(sub_code)
        if any(sub_aa in code.values() for sub_aa in sub_code.values()):
            msg = CAT(["Note that at least one sub-aminoacid provided in sub_code ",
                       "is identical to an amino-acid provided by the chosen genetic code.\n",
                       "The sub-amino-acids are:\n%s\n" % ", ".join(
                           [str(aa) for aa in sub_code.values()])])
            warnings.warn(msg)
    # Ensure the amino-acids are in lowercase.
    restrict_to = set(aa.lower() for aa in restrict_to)
    ignored_aas = set(aa.lower() for aa in ignore)
    # Find the groups of synonymous codons.
    # The keys are (sub-)amino-acids, the values are lists of codons that
    # code the (sub-)amino-acid.
    aas_codons = {}
    for codon, aa in code.items():
        # Codons absent from sub_code keep their normal amino-acid:
        # sub_aa is then the same as aa.
        sub_aa = sub_code.setdefault(codon, aa)
        # Only consider codons that are compatible with the restriction
        # rules; these rules apply to the original amino-acid, not sub_aa.
        if (not restrict_to or aa.lower() in restrict_to) and aa.lower() not in ignored_aas:
            aas_codons.setdefault(sub_aa, []).append(codon)
    # Build a conversion dictionary.
    # The keys are the codons, the values their degenerate replacements.
    cod2degen = {}
    for codons in aas_codons.values():
        # Compute degeneracy values at the 3 positions.
        # The degenerate value at a position is the binary union
        # of the values of the nucleotides found at that position.
        # nuc2val and reduce_by_or are defined in code_utils.py
        degen1 = reduce_by_or([nuc2val[cod[0]] for cod in codons])
        degen2 = reduce_by_or([nuc2val[cod[1]] for cod in codons])
        degen3 = reduce_by_or([nuc2val[cod[2]] for cod in codons])
        # Compute the string representation of the resulting degenerate codon.
        # val2nuc is defined in code_utils.py
        degenerate_codon = (val2nuc[degen1] + val2nuc[degen2] + val2nuc[degen3]).lower()
        # Associate this representation to all the synonymous codons it represents.
        for cod in codons:
            cod2degen[cod.lower()] = degenerate_codon
    # If restrict_to is not empty, it is likely that not all codons
    # are present in cod2degen, but the code_utils.recode_sequence function
    # will just keep those codons as is.
    # Make a copy of self.
    ali = self.dupe()
    for seq in ali.sequences:
        # Recode the sequence using the conversion dictionary built previously.
        # recode_sequence is defined in Codon_utils.py
        seq.sequence = recode_sequence(seq.sequence, cod2degen, positions, code=code)
    return ali
def getDegenerateSiteMaskForPos(self, pos, transl_table=1, code=None, restrict_to=[], ignore=[]):
    """
    Return a mask corresponding to the sites where degeneracy has been
    observed if they correspond to a *pos*-th codon position.

    For each codon column reported as degenerate at position *pos* by
    `codon_position_is_degenerate`, the mask has "1" at the *pos*-th site of
    the triplet and "0" at the two others; the other columns contribute "000".
    *pos* is compared with 1, 2 and 3: any other value yields a mask made
    only of "0".

    This is intended to be used for submatrix extraction using
    :meth:`Alignment.getSubsetUsingMask` (with the option *inverse=True*, to
    get the degeneracy-free sites).

    If *restrict_to* is not empty, only those amino-acids the one-letter code
    of which is in *restrict_to* are considered.

    If *ignore* is not empty, only those amino-acids the one-letter code of
    which is not in *ignore* are considered.

    The matrix is expected to start at a first codon position and stop at a
    third codon position.

    *transl_table* is an integer used to determine under which genetic code
    the codons are to be interpreted. The default value of 1 corresponds to
    the standard genetic code. Other values can be found in p4.GeneticCode.py

    Alternatively, the genetic code can be provided through the parameter
    *code*. If such a code is provided, the value of *transl_table* is ignored.
    The parameter *code* can take two types of values:

    1) It can be a string naming the code to use, as defined in Biopython's
       `CodonTable.unambiguous_dna_by_name.keys()`
    2) It can be a dictionary whose keys are codons, and the values are the
       corresponding amino-acids. All triplets present in the matrix should
       also be present in the dictionary, except triplets of indels. Codons
       and the corresponding amino-acids are expected to be in lower case.

    An AssertionError is raised if *code* is neither None, a string nor a
    dictionary.
    """
    if code is None:
        code = Code(transl_table).code
    # `str` is used rather than `types.StringType`: both are the same object
    # in Python 2, and only `str` exists in Python 3.
    elif isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg

    # Mask portion of a selected triplet: it only depends on pos, so it is
    # computed once instead of once per degenerate column.
    selected_mask = CAT(["1" if i == pos else "0" for i in (1, 2, 3)])
    # Iterate over the slices, find the triplets that will be included in the mask
    # (those where degeneracy occurs), generate the corresponding mask portions,
    # and join the mask portions to make the matrix mask.
    return CAT([
        selected_mask
        if codon_position_is_degenerate(cod_slice, pos, restrict_to, ignore)
        else "000"
        for cod_slice in self.iter_codon_slices(code)])
def getDegenerateCodonsMask(self, transl_table=1, code=None, restrict_to=[], ignore=[]):
    """
    Return a mask corresponding to the triplets of sites where degeneracy
    has been observed.

    Each codon column contributes "111" to the mask when
    `codon_slice_is_degenerate` reports it as degenerate, and "000" otherwise.
    This is intended to be used for submatrix extraction using
    :meth:`Alignment.getSubsetUsingMask` (with the option *inverse=True*, to
    get the sites with no degenerate codons).

    If *restrict_to* is not empty, only those amino-acids the one-letter code
    of which is in *restrict_to* are considered.

    If *ignore* is not empty, only those amino-acids the one-letter code of
    which is not in *ignore* are considered.

    The matrix is expected to start at a first codon position and stop at a
    third codon position.

    *transl_table* is an integer used to determine under which genetic code
    the codons are to be interpreted. The default value of 1 corresponds to
    the standard genetic code. Other values can be found in p4.GeneticCode.py

    Alternatively, the genetic code can be provided through the parameter
    *code*. If such a code is provided, the value of *transl_table* is ignored.
    The parameter *code* can take two types of values:

    1) It can be a string naming the code to use, as defined in Biopython's
       `CodonTable.unambiguous_dna_by_name.keys()`
    2) It can be a dictionary whose keys are codons, and the values are the
       corresponding amino-acids. All triplets present in the matrix should
       also be present in the dictionary, except triplets of indels. Codons
       and the corresponding amino-acids are expected to be in lower case.

    An AssertionError is raised if *code* is neither None, a string nor a
    dictionary.
    """
    if code is None:
        code = Code(transl_table).code
    # `str` is used rather than `types.StringType`: both are the same object
    # in Python 2, and only `str` exists in Python 3.
    elif isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg

    # One three-character chunk per codon column.
    return CAT([
        "111" if codon_slice_is_degenerate(c_slice, restrict_to, ignore) else "000"
        for c_slice in self.iter_codon_slices(code)])
def pseudoTranslate(self, transl_table=1, out_type="standard", code=None):
    """Returns a pseudo protein alignment from *self*, a DNA alignment.
    The result is of datatype standard instead of protein, which allows
    the use of special recodings, like distinguishing between two types
    of serines, like in :meth:`Alignment.recode23aa()`.

    *self* is translated using :attribute:`Code(transl_table).code`.

    Alternatively, the genetic code can be provided through the parameter *code*.
    If such a code is provided, the value of *transl_table* is ignored, and
    *out_type* is forced to "standard".
    The parameter *code* can take two types of values:

    1) It can be a string naming the code to use, as defined in Biopython's
       `CodonTable.unambiguous_dna_by_name.keys()`
    2) It can be a dictionary whose keys are codons, and the values are
       the corresponding amino-acids. All triplets present in the matrix should
       also be present in the dictionary, except triplets of indels. Codons
       and the corresponding amino-acids are expected to be in lower case.

    It may be possible to use a code based on another codon length than 3,
    but this has not been tested as of June 2012.

    At the moment, we can only do translations where the sequences are phased
    with the coding frame, ie the first sequence position is the first position
    of the codon, and the last sequence position should be a last codon position.

    The default behaviour is to use translation table 1, that is the standard genetic code.
    Other available translation tables, this week::

        if transl_table == 1: # standard
        elif transl_table == 2: # vertebrate mito
        elif transl_table == 4: # Mold, Protozoan,
                                # and Coelenterate Mitochondrial Code
                                # and the Mycoplasma/Spiroplasma Code
        elif transl_table == 5: # invertebrate mito
        elif transl_table == 9: # echinoderm mito

        and now 6, 10, 11, 12, 13, 14, 21.

    (These are found in p4.GeneticCode.py or in :class:`Code`)

    *transl_table* may also be provided as text consisting in blank-separated elements.
    Each element consists in n characters, where n is the number of defined codons.
    The first element lists the coded (pseudo-)amino-acids.
    The second element describes whether a codon can be a start codon ('M') or not ('-').
    The other elements correspond to the (pseudo-)nucleotides at the successive codon positions.
    Example::

        FFJJZZZZYY**CC*WBBBBPPPPHHQQUUUUIIIMTTTTNNKKXXOOVVVVAAAADDEEGGGG
        ---M---------------M------------MMMM---------------M------------
        TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
        TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
        TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG

    A P4Error is raised if *self* is not a DNA alignment, if a provided
    *code* is empty, if the length of *self* is not a multiple of the codon
    length, or if a codon is incomplete (partial indel) or unknown.
    """

    gm = ['p4.alignment_recoding.pseudoTranslate()']
    if self.dataType != 'dna':
        gm.append("Self should be a DNA alignment")
        raise P4Error(gm)

    if code is None:
        code = Code(transl_table, in_type="dna", out_type=out_type).code
        codelength = Code(transl_table).codelength
    else:
        # `str` is used rather than `types.StringType`: both are the same
        # object in Python 2, and only `str` exists in Python 3.
        if isinstance(code, str):
            code = getBiopythonCode(code)  # defined in code_utils.py
        if not code:
            gm.append("The genetic code should not be empty")
            raise P4Error(gm)
        # We assume that the "codons" have all the same length,
        # and we look at the first codon in the dictionary to know this length.
        # (`code.keys()[0]` does not work in Python 3, where keys() is a view.)
        codelength = len(next(iter(code)))
        # We use standard type, because, depending on the code used to make the translation,
        # we may get something that contains symbols not corresponding to normal amino-acids.
        out_type = "standard"

    if self.length % codelength != 0:
        gm.append("The length of self should be a multiple of %i" % codelength)
        raise P4Error(gm)

    ali = self.dupe()
    ali.dataType = out_type
    # Floor division: the length must remain an integer in Python 3 too
    # (it is used below to size the translated sequences).
    ali.length = self.length // codelength
    ali.symbols = CAT(sorted(set(code.values())))
    ali.equates = {}
    ali.dim = len(ali.symbols)
    ali.nexusSets = None
    ali.parts = []
    ali.excludeDelete = None
    for seq in ali.sequences:
        seq.dataType = out_type

    full_indel = '-' * codelength
    for i in range(len(self.sequences)):
        # the original sequence
        dnaSeq = self.sequences[i].sequence
        # the future pseudo-translation, built as a list of symbols
        pseudoProtSeq = []
        for j in range(ali.length):
            theCodon = dnaSeq[(j * codelength):((j + 1) * codelength)]
            if theCodon in code:
                pseudoProtSeq.append(code[theCodon])
            elif theCodon == full_indel:
                # full indel
                pseudoProtSeq.append('-')
            elif theCodon.count('-'):
                # partial indel
                gm.append("    seq %i, position %4i, dnaSeq %4i, codon '%s' is incomplete" % (
                    i, j, (j*codelength), theCodon))
                raise P4Error(gm)
            else:
                # Should we use a CodonTranslationError (defined in code_utils.py) here ?
                gm.append("    seq %i position %4i, dnaSeq %4i, codon '%s' is not a known codon" % (
                    i, j, (j*codelength), theCodon))
                raise P4Error(gm)
        # Convert from list to string.
        ali.sequences[i].sequence = CAT(pseudoProtSeq)
    return ali
def degenerate(self,
               code="Standard",
               positions=[1, 2, 3],
               restrict_to=[],
               ignore=[],
               sub_code=None):
    """
    Return a copy of *self* where the codons are replaced with degenerate versions.

    If *restrict_to* is not empty, only those codons that code amino-acids listed
    in *restrict_to* will be degenerated.

    If *ignore* is not empty, those codons that code amino-acids listed in *ignore*
    will not be degenerated.

    *positions* determines which codon positions are degenerated; it is handed
    over unchanged to `recode_sequence`. By default, the whole codons are
    degenerated (if there is degeneracy of course).

    *code* is the Biopython name of the genetic code under which degeneracy has
    to be interpreted, or a dictionary converting from codons to amino-acids
    (all in lower case). Default is to use the standard genetic code. The valid
    names are the keys of Biopython's `CodonTable.unambiguous_dna_by_name`.

    *sub_code*, if provided, should be a dictionary associating codons to
    (sub-)amino-acids (all in lower case). For the purpose of defining
    degeneracy groups, the codons present in *sub_code* will be considered as
    coding for the amino-acid defined there instead of the one defined by
    *code*. This can be used for instance to keep two distinct types of serine
    codons, with degeneracy only within each type. The codons still count as
    coding their original amino-acid with respect to the *restrict_to* and
    *ignore* options. The dictionary passed by the caller is not modified.

    A warning is issued when a sub-amino-acid of *sub_code* is identical to an
    amino-acid of the chosen genetic code.
    """
    if isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg
    # Codons belonging to different sub-groups of codons for one amino-acid
    # can be considered as coding different amino-acids
    # (sub-amino-acids of the "normal" amino-acid, for instance two types of serine).
    if sub_code is None:
        sub_code = {}
    else:
        assert isinstance(sub_code, dict), "sub_code must be a dictionary."
        # Work on a shallow copy: the mapping is completed below with the
        # codons it lacks, and the caller's dictionary must not be altered.
        sub_code = copy.copy(sub_code)
        if any(sub_aa in code.values() for sub_aa in sub_code.values()):
            msg = CAT([
                "Note that at least one sub-aminoacid provided in sub_code ",
                "is identical to an amino-acid provided by the chosen genetic code.\n",
                "The sub-amino-acids are:\n%s\n" %
                ", ".join([str(aa) for aa in sub_code.values()])
            ])
            warnings.warn(msg)
    # Ensure the amino-acids are in lowercase.
    restrict_to = {aa.lower() for aa in restrict_to}
    ignored_aas = {aa.lower() for aa in ignore}
    # Find the groups of synonymous codons.
    # The keys are (sub-)amino-acids, the values are the lists of codons
    # that code the (sub-)amino-acid.
    aas_codons = {}
    for codon, aa in code.items():
        # Codons absent from sub_code keep their normal amino-acid as sub_aa.
        sub_aa = sub_code.setdefault(codon, aa)
        # The restriction and ignore rules apply to the original amino-acid,
        # not to the sub-amino-acid.
        aa_lower = aa.lower()
        if restrict_to and aa_lower not in restrict_to:
            continue
        if aa_lower in ignored_aas:
            continue
        aas_codons.setdefault(sub_aa, []).append(codon)
    # Build a conversion dictionary.
    # The keys are the codons, the values their degenerate replacements.
    cod2degen = {}
    for codons in aas_codons.values():
        # The degenerate value at a position is the binary union of the values
        # of the nucleotides found at that position among the synonymous codons.
        # nuc2val, val2nuc and reduce_by_or are defined in code_utils.py
        degenerate_codon = "".join(
            val2nuc[reduce_by_or([nuc2val[cod[i]] for cod in codons])]
            for i in range(3)).lower()
        # Associate this representation to all the synonymous codons it represents.
        for cod in codons:
            cod2degen[cod.lower()] = degenerate_codon
    # Codons filtered out by restrict_to / ignore are absent from cod2degen;
    # according to the original notes, recode_sequence keeps such codons as is.
    # Make a copy of self.
    ali = self.dupe()
    for seq in ali.sequences:
        # Recode the sequence using the conversion dictionary built previously.
        seq.sequence = recode_sequence(seq.sequence,
                                       cod2degen,
                                       positions,
                                       code=code)
    return ali
def getDegenerateSiteMaskForPos(self,
                                pos,
                                transl_table=1,
                                code=None,
                                restrict_to=[],
                                ignore=[]):
    """
    Build a site mask flagging the *pos*-th codon position of every triplet
    of sites where degeneracy has been observed.

    The mask is the concatenation (through `CAT`) of one three-character
    portion per codon slice of *self*: "000" when the slice is not reported
    as degenerate by `codon_position_is_degenerate`, otherwise a portion
    holding "1" at codon position *pos* (counted 1, 2, 3) and "0" elsewhere.

    This is intended to be used for submatrix extraction using
    :meth:`Alignment.getSubsetUsingMask` (with the option *inverse=True*, to
    get the degeneracy-free sites).

    If *restrict_to* is not empty, only those amino-acids whose one-letter
    code is in *restrict_to* are considered.

    If *ignore* is not empty, only those amino-acids whose one-letter code
    is not in *ignore* are considered.

    The matrix is expected to start at a first codon position and stop at a
    third codon position.

    *transl_table* is an integer used to determine under which genetic code
    the codons are to be interpreted. The default value of 1 corresponds to
    the standard genetic code. Other values can be found in p4.GeneticCode.py

    Alternatively, the genetic code can be provided through the parameter
    *code*, in which case the value of *transl_table* is ignored. *code* can
    take two types of values:

    1) a string naming the code to use, as defined in Biopython's
       `CodonTable.unambiguous_dna_by_name.keys()`;
    2) a dictionary whose keys are codons and whose values are the
       corresponding amino-acids. All triplets present in the matrix should
       also be present in the dictionary, except triplets of indels. Codons
       and the corresponding amino-acids are expected to be in lower case.
    """

    # Error-context label; not used further in this function.
    gm = ["Alignment.getDegenerateSiteMaskForPos()"]

    # Resolve the genetic code into a codon -> amino-acid dictionary.
    if code is None:
        code = Code(transl_table).code
    elif isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg

    # Mask portion used for every triplet where degeneracy occurs:
    # only the requested codon position is switched on.
    flagged_portion = CAT(["1" if i == pos else "0" for i in [1, 2, 3]])

    # Walk along the codon slices, emit the mask portion matching each one,
    # then join the portions to make the matrix mask.
    portions = []
    for cod_slice in self.iter_codon_slices(code):
        if codon_position_is_degenerate(cod_slice, pos, restrict_to, ignore):
            portions.append(flagged_portion)
        else:
            portions.append("000")
    return CAT(portions)
def getDegenerateCodonsMask(self,
                            transl_table=1,
                            code=None,
                            restrict_to=[],
                            ignore=[]):
    """
    Build a site mask flagging the triplets of sites where degeneracy has
    been observed.

    The mask is the concatenation (through `CAT`) of one three-character
    portion per codon slice of *self*: "111" when
    `codon_slice_is_degenerate` reports the slice as degenerate, "000"
    otherwise.

    This is intended to be used for submatrix extraction using
    :meth:`Alignment.getSubsetUsingMask` (with the option *inverse=True*, to
    get the sites with no degenerate codons).

    If *restrict_to* is not empty, only those amino-acids whose one-letter
    code is in *restrict_to* are considered.

    If *ignore* is not empty, only those amino-acids whose one-letter code
    is not in *ignore* are considered.

    The matrix is expected to start at a first codon position and stop at a
    third codon position.

    *transl_table* is an integer used to determine under which genetic code
    the codons are to be interpreted. The default value of 1 corresponds to
    the standard genetic code. Other values can be found in p4.GeneticCode.py

    Alternatively, the genetic code can be provided through the parameter
    *code*, in which case the value of *transl_table* is ignored. *code* can
    take two types of values:

    1) a string naming the code to use, as defined in Biopython's
       `CodonTable.unambiguous_dna_by_name.keys()`;
    2) a dictionary whose keys are codons and whose values are the
       corresponding amino-acids. All triplets present in the matrix should
       also be present in the dictionary, except triplets of indels. Codons
       and the corresponding amino-acids are expected to be in lower case.
    """

    # Error-context label; not used further in this function.
    gm = ["Alignment.getDegenerateCodonsMask()"]

    # Resolve the genetic code into a codon -> amino-acid dictionary.
    if code is None:
        code = Code(transl_table).code
    elif isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg

    # One mask portion per codon slice, in alignment order.
    portions = [
        "111" if codon_slice_is_degenerate(cod_slice, restrict_to, ignore)
        else "000"
        for cod_slice in self.iter_codon_slices(code)
    ]
    return CAT(portions)
def pseudoTranslate(self, transl_table=1, out_type="standard", code=None):
    """Return a pseudo protein alignment from *self*, a DNA alignment.

    The result is of datatype standard instead of protein, which allows
    the use of special recodings, like distinguishing between two types
    of serines, like in :meth:`Alignment.recode23aa()`.

    *self* is translated using :attr:`Code(transl_table).code`.

    Alternatively, the genetic code can be provided through the parameter *code*.
    If such a code is provided, the value of *transl_table* is ignored and
    *out_type* is forced to "standard".
    The parameter *code* can take two types of values:

    1) It can be a string naming the code to use, as defined in Biopython's
       `CodonTable.unambiguous_dna_by_name.keys()`
    2) It can be a dictionary *code* whose keys are codons, and the values are
       the corresponding amino-acids. All triplets present in the matrix should
       also be present in the code dictionary, except triplets of indels. Codons
       and the corresponding amino-acids are expected to be in lower case.
       It may be possible to use a code based on another codon length than 3,
       but this has not been tested as of June 2012.

    At the moment, we can only do translations where the sequences are phased
    with the coding frame, ie the first sequence position is the first position
    of the codon, and the last sequence position should be a last codon position.

    The default behaviour is to use translation table 1, that is the standard genetic code.
    Other available translation tables, this week::

        if transl_table == 1: # standard
        elif transl_table == 2: # vertebrate mito
        elif transl_table == 4: # Mold, Protozoan,
                                # and Coelenterate Mitochondrial Code
                                # and the Mycoplasma/Spiroplasma Code
        elif transl_table == 5: # invertebrate mito
        elif transl_table == 9: # echinoderm mito

        and now 6, 10, 11, 12, 13, 14, 21.

    (These are found in p4.GeneticCode.py or in :class:`Code`)

    *transl_table* may also be provided as text consisting in blank-separated elements.
    Each element consists in n characters, where n is the number of defined codons.
    The first element lists the coded (pseudo-)amino-acids.
    The second element describes whether a codon can be a start codon ('M') or not ('-').
    The other elements correspond to the (pseudo-)nucleotides at the successive codon positions.
    Example::

        FFJJZZZZYY**CC*WBBBBPPPPHHQQUUUUIIIMTTTTNNKKXXOOVVVVAAAADDEEGGGG
        ---M---------------M------------MMMM---------------M------------
        TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
        TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
        TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG

    Raises P4Error if *self* is not a DNA alignment, if *code* is empty, if
    the length of *self* is not a multiple of the codon length, or if a
    codon is incomplete (partial indel) or unknown to the code.
    """

    gm = ['p4.alignment_recoding.pseudoTranslate()']
    if self.dataType != 'dna':
        gm.append("Self should be a DNA alignment")
        raise P4Error(gm)

    if code is None:
        code = Code(transl_table, in_type="dna", out_type=out_type).code
        codelength = Code(transl_table).codelength
    else:
        if isinstance(code, str):
            code = getBiopythonCode(code)  # defined in code_utils.py
        if not code:
            gm.append("The genetic code should contain at least one codon")
            raise P4Error(gm)
        # We assume that the "codons" have all the same length,
        # and we look at one codon of the dictionary to know this length.
        # next(iter(...)) is used because dictionary views cannot be
        # indexed under Python 3.
        codelength = len(next(iter(code)))
        # We use standard type, because, depending on the code used to make the translation,
        # we may get something that contains symbols not corresponding to normal amino-acids.
        out_type = "standard"

    if self.length % codelength != 0:
        gm.append("The length of self should be a multiple of %i" % codelength)
        raise P4Error(gm)

    ali = self.dupe()
    ali.dataType = out_type
    # Floor division: the length must stay an integer under Python 3
    # (divisibility has been checked just above).
    ali.length = self.length // codelength
    ali.symbols = CAT(sorted(set(code.values())))
    ali.equates = {}
    ali.dim = len(ali.symbols)
    ali.nexusSets = None
    ali.parts = []
    ali.excludeDelete = None
    for seq in ali.sequences:
        # Initialize an all-gap sequence, as a list so that it can be
        # filled in position by position.
        seq.sequence = ['-'] * ali.length
        seq.dataType = out_type

    full_indel = '-' * codelength
    for i, dna in enumerate(self.sequences):
        # the original sequence
        dnaSeq = dna.sequence
        # the future pseudo-translation
        pseudoProtSeq = ali.sequences[i].sequence
        for j in range(ali.length):
            start = j * codelength
            theCodon = dnaSeq[start:start + codelength]
            if theCodon in code:
                pseudoProtSeq[j] = code[theCodon]
            elif theCodon == full_indel:
                # full indel
                pseudoProtSeq[j] = '-'
            elif '-' in theCodon:
                # partial indel
                gm.append(
                    "    seq %i, position %4i, dnaSeq %4i, codon '%s' is incomplete"
                    % (i, j, start, theCodon))
                raise P4Error(gm)
            else:
                # Should we use a CodonTranslationError (defined in code_utils.py) here ?
                gm.append(
                    "    seq %i position %4i, dnaSeq %4i, codon '%s' is not a known codon"
                    % (i, j, start, theCodon))
                raise P4Error(gm)

    for seq in ali.sequences:
        # Convert from list to string.
        seq.sequence = CAT(seq.sequence)
    return ali