def indelizeCodons(self, aas, code="Standard"):
    """
    Return a copy of *self* where selected codons are replaced by indels.

    Every codon that codes for one of the amino-acids listed in *aas* is
    replaced by ``---``; the other codons of *code* are mapped to
    themselves.

    *aas* is an iterable of amino-acid symbols. They are converted to
    lower case before being compared with the amino-acids found in
    *code*, so a *code* dictionary is expected to give its amino-acids
    in lower case.

    *code* is the Biopython name of the genetic code under which the
    codons have to be interpreted, or a dictionary converting from codons
    to amino-acids. Default is to use the standard genetic code. The
    possible names are the keys of
    ``CodonTable.unambiguous_dna_by_name``.

    The returned object is obtained with ``self.dupe()`` and each of its
    sequences is recoded with ``recode_sequence``.

    An ``AssertionError`` is raised if *code* is neither a string nor a
    dictionary.
    """
    # "str" rather than "types.StringType": the two are the same object on
    # Python 2, and only "str" exists on Python 3.
    if isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg
    # Ensure the amino-acids are in lowercase.
    aas = {aa.lower() for aa in aas}
    # Build a conversion dictionary.
    # The keys are the codons, the values their "indelized" replacements.
    cod2indels = {
        codon: ("---" if aa in aas else codon)
        for codon, aa in code.items()
    }
    # Make a copy of self.
    ali = self.dupe()
    for seq in ali.sequences:
        # Recode the sequence using the conversion dictionary built previously.
        # recode_sequence is defined in Codon_utils.py
        seq.sequence = recode_sequence(seq.sequence, cod2indels, code=code)
    return ali
def indelizeCodons(self, aas, code="Standard"):
    """
    Return a copy of *self* where selected codons are replaced by indels.

    Every codon that codes for one of the amino-acids listed in *aas* is
    replaced by ``---``; the other codons of *code* are mapped to
    themselves.

    *aas* is an iterable of amino-acid symbols. They are converted to
    lower case before being compared with the amino-acids found in
    *code*, so a *code* dictionary is expected to give its amino-acids
    in lower case.

    *code* is the Biopython name of the genetic code under which the
    codons have to be interpreted, or a dictionary converting from codons
    to amino-acids. Default is to use the standard genetic code. The
    possible names are the keys of
    ``CodonTable.unambiguous_dna_by_name``.

    The returned object is obtained with ``self.dupe()`` and each of its
    sequences is recoded with ``recode_sequence``.

    An ``AssertionError`` is raised if *code* is neither a string nor a
    dictionary.
    """
    if isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg
    # Ensure the amino-acids are in lowercase.
    aas = {aa.lower() for aa in aas}
    # Build a conversion dictionary.
    # The keys are the codons, the values their "indelized" replacements.
    cod2indels = {
        codon: ("---" if aa in aas else codon)
        for codon, aa in code.items()
    }
    # Make a copy of self.
    ali = self.dupe()
    for seq in ali.sequences:
        # Recode the sequence using the conversion dictionary built previously.
        # recode_sequence is defined in Codon_utils.py
        seq.sequence = recode_sequence(seq.sequence, cod2indels, code=code)
    return ali
def degenerate(self, code="Standard", positions=[1, 2, 3],
               restrict_to=[], ignore=[], sub_code=None):
    """
    Return a copy of *self* where the codons are replaced with degenerate
    versions.

    If *restrict_to* is not empty, only those codons that code amino-acids
    listed in *restrict_to* will be degenerated.

    If *ignore* is not empty, those codons that code amino-acids listed in
    *ignore* will not be degenerated.

    The amino-acids given in *restrict_to* and *ignore* are compared in
    lower case with the amino-acids of *code*.

    *positions* is passed on to ``recode_sequence`` and determines which
    codon positions are degenerated. By default, the whole codons are
    degenerated (if there is degeneracy of course).

    *code* is the Biopython name of the genetic code under which
    degeneracy has to be interpreted, or a dictionary converting from
    codons to amino-acids (all in lower case). Default is to use the
    standard genetic code. The possible names are the keys of
    ``CodonTable.unambiguous_dna_by_name``.

    *sub_code*, if provided, should be a dictionary associating codons to
    (sub-)amino-acids (all in lower case). For the purpose of defining
    degeneracy groups, the codons present in *sub_code* will be considered
    as coding for the amino-acid defined there instead of the one defined
    by *code*. This can be used for instance to keep two distinct types of
    serine codons, with degeneracy only within each type. The codons still
    count as coding their original amino-acid with respect to the
    *restrict_to* and *ignore* options. *sub_code* is only read, never
    modified. A warning is issued if one of its values is also an
    amino-acid of *code*.

    The returned object is obtained with ``self.dupe()`` and each of its
    sequences is recoded with ``recode_sequence``.

    An ``AssertionError`` is raised if *code* is neither a string nor a
    dictionary, or if *sub_code* is neither ``None`` nor a dictionary.
    """
    # "str" rather than "types.StringType": the two are the same object on
    # Python 2, and only "str" exists on Python 3.
    if isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg
    # Codons belonging to different sub-groups of codons for one amino-acid
    # can be considered as coding different amino-acids
    # (sub-amino-acids of the "normal" amino-acid, for instance two types of serine).
    if sub_code is None:
        sub_code = {}
    else:
        assert isinstance(sub_code, dict), "sub_code must be a dictionary."
        # Collect the amino-acids of the code once instead of scanning
        # code.values() again for every sub-amino-acid.
        code_aas = set(code.values())
        if any(sub_aa in code_aas for sub_aa in sub_code.values()):
            msg = CAT([
                "Note that at least one sub-aminoacid provided in sub_code ",
                "is identical to an amino-acid provided by the chosen genetic code.\n",
                "The sub-amino-acids are:\n%s\n" % ", ".join(
                    [str(aa) for aa in sub_code.values()])])
            warnings.warn(msg)
    # Ensure the amino-acids are in lowercase.
    restrict_to = {aa.lower() for aa in restrict_to}
    ignored_aas = {aa.lower() for aa in ignore}
    # Find the groups of synonymous codons.
    # The keys are (sub-)amino-acids, the values are lists of codons
    # that code the (sub-)amino-acid.
    aas_codons = {}
    for codon, aa in code.items():
        aa_lower = aa.lower()
        # Only consider codons that are compatible with the restriction
        # rule, if there is one, and that are not explicitly ignored.
        if restrict_to and aa_lower not in restrict_to:
            continue
        if aa_lower in ignored_aas:
            continue
        # Codons absent from sub_code keep the amino-acid given by code.
        # Using .get() leaves the caller's dictionary untouched.
        sub_aa = sub_code.get(codon, aa)
        aas_codons.setdefault(sub_aa, []).append(codon)
    # Build a conversion dictionary.
    # The keys are the codons, the values their degenerate replacements.
    cod2degen = {}
    for codons in aas_codons.values():
        # Compute degeneracy values at the 3 positions.
        # The degenerate value at a position is the binary union
        # of the values of the nucleotides found at that position.
        # nuc2val and reduce_by_or are defined in code_utils.py
        degen1, degen2, degen3 = [
            reduce_by_or([nuc2val[cod[pos]] for cod in codons])
            for pos in range(3)
        ]
        # Compute the string representation of the resulting degenerate codon.
        # val2nuc is defined in code_utils.py
        degenerate_codon = (
            val2nuc[degen1] + val2nuc[degen2] + val2nuc[degen3]).lower()
        # Associate this representation to all the synonymous codons it represents.
        for cod in codons:
            cod2degen[cod.lower()] = degenerate_codon
    # If restrict_to is not empty, it is likely that not all codons
    # are present in cod2degen, but the code_utils.recode_sequence function
    # will just keep those codons as is.
    # Make a copy of self.
    ali = self.dupe()
    for seq in ali.sequences:
        # Recode the sequence using the conversion dictionary built previously.
        # recode_sequence is defined in Codon_utils.py
        seq.sequence = recode_sequence(seq.sequence, cod2degen, positions, code=code)
    return ali
def degenerate(self, code="Standard", positions=[1, 2, 3],
               restrict_to=[], ignore=[], sub_code=None):
    """
    Return a copy of *self* where the codons are replaced with degenerate
    versions.

    If *restrict_to* is not empty, only those codons that code amino-acids
    listed in *restrict_to* will be degenerated.

    If *ignore* is not empty, those codons that code amino-acids listed in
    *ignore* will not be degenerated.

    The amino-acids given in *restrict_to* and *ignore* are compared in
    lower case with the amino-acids of *code*.

    *positions* is passed on to ``recode_sequence`` and determines which
    codon positions are degenerated. By default, the whole codons are
    degenerated (if there is degeneracy of course).

    *code* is the Biopython name of the genetic code under which
    degeneracy has to be interpreted, or a dictionary converting from
    codons to amino-acids (all in lower case). Default is to use the
    standard genetic code. The possible names are the keys of
    ``CodonTable.unambiguous_dna_by_name``.

    *sub_code*, if provided, should be a dictionary associating codons to
    (sub-)amino-acids (all in lower case). For the purpose of defining
    degeneracy groups, the codons present in *sub_code* will be considered
    as coding for the amino-acid defined there instead of the one defined
    by *code*. This can be used for instance to keep two distinct types of
    serine codons, with degeneracy only within each type. The codons still
    count as coding their original amino-acid with respect to the
    *restrict_to* and *ignore* options. *sub_code* is only read, never
    modified. A warning is issued if one of its values is also an
    amino-acid of *code*.

    The returned object is obtained with ``self.dupe()`` and each of its
    sequences is recoded with ``recode_sequence``.

    An ``AssertionError`` is raised if *code* is neither a string nor a
    dictionary, or if *sub_code* is neither ``None`` nor a dictionary.
    """
    if isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg
    # Codons belonging to different sub-groups of codons for one amino-acid
    # can be considered as coding different amino-acids
    # (sub-amino-acids of the "normal" amino-acid, for instance two types of serine).
    if sub_code is None:
        sub_code = {}
    else:
        assert isinstance(sub_code, dict), "sub_code must be a dictionary."
        # Collect the amino-acids of the code once instead of scanning
        # code.values() again for every sub-amino-acid.
        code_aas = set(code.values())
        if any(sub_aa in code_aas for sub_aa in sub_code.values()):
            msg = CAT([
                "Note that at least one sub-aminoacid provided in sub_code ",
                "is identical to an amino-acid provided by the chosen genetic code.\n",
                "The sub-amino-acids are:\n%s\n" % ", ".join(
                    [str(aa) for aa in sub_code.values()])])
            warnings.warn(msg)
    # Ensure the amino-acids are in lowercase.
    restrict_to = {aa.lower() for aa in restrict_to}
    ignored_aas = {aa.lower() for aa in ignore}
    # Find the groups of synonymous codons.
    # The keys are (sub-)amino-acids, the values are lists of codons
    # that code the (sub-)amino-acid.
    aas_codons = {}
    for codon, aa in code.items():
        aa_lower = aa.lower()
        # Only consider codons that are compatible with the restriction
        # rule, if there is one, and that are not explicitly ignored.
        if restrict_to and aa_lower not in restrict_to:
            continue
        if aa_lower in ignored_aas:
            continue
        # Codons absent from sub_code keep the amino-acid given by code.
        # Using .get() leaves the caller's dictionary untouched.
        sub_aa = sub_code.get(codon, aa)
        aas_codons.setdefault(sub_aa, []).append(codon)
    # Build a conversion dictionary.
    # The keys are the codons, the values their degenerate replacements.
    cod2degen = {}
    for codons in aas_codons.values():
        # Compute degeneracy values at the 3 positions.
        # The degenerate value at a position is the binary union
        # of the values of the nucleotides found at that position.
        # nuc2val and reduce_by_or are defined in code_utils.py
        degen1, degen2, degen3 = [
            reduce_by_or([nuc2val[cod[pos]] for cod in codons])
            for pos in range(3)
        ]
        # Compute the string representation of the resulting degenerate codon.
        # val2nuc is defined in code_utils.py
        degenerate_codon = (
            val2nuc[degen1] + val2nuc[degen2] + val2nuc[degen3]).lower()
        # Associate this representation to all the synonymous codons it represents.
        for cod in codons:
            cod2degen[cod.lower()] = degenerate_codon
    # If restrict_to is not empty, it is likely that not all codons
    # are present in cod2degen, but the code_utils.recode_sequence function
    # will just keep those codons as is.
    # Make a copy of self.
    ali = self.dupe()
    for seq in ali.sequences:
        # Recode the sequence using the conversion dictionary built previously.
        # recode_sequence is defined in Codon_utils.py
        seq.sequence = recode_sequence(seq.sequence, cod2degen, positions, code=code)
    return ali