def getConstantAAMask(self, transl_table=1, code=None, restrict_to=[]):
    """Return a mask flagging the triplets of sites coding a single amino-acid.

    The mask is a string made of one three-character portion per codon
    slice yielded by ``self.iter_codon_slices(code)``: "111" when
    ``codon_slice_is_constant_aa`` accepts the slice, "000" otherwise.

    This is intended to be used for submatrix extraction using
    :meth:`Alignment.getSubsetUsingMask` (with the option *inverse=True*,
    to get the non-constant sites).

    If *restrict_to* is not empty, only those sites that are constant and
    code for one of the amino-acids the one-letter code of which is in
    *restrict_to* will be considered constant.

    The matrix is expected to start at a first codon position and stop at
    a third codon position.

    *transl_table* is an integer used to determine under which genetic
    code the codons are to be interpreted. The default value of 1
    corresponds to the standard genetic code. Other values can be found in
    p4.GeneticCode.py

    Alternatively, the genetic code can be provided through the parameter
    *code*. If such a code is provided, the value of *transl_table* is
    ignored. The parameter *code* can take two types of values:

    1) It can be a string naming the code to use, as defined in
       Biopython's `CodonTable.unambiguous_dna_by_name.keys()`
    2) It can be a dictionary whose keys are codons, and the values are
       the corresponding amino-acids. All triplets present in the matrix
       should also be present in the code dictionary, except triplets of
       indels. Codons and the corresponding amino-acids are expected to be
       in lower case.
    """
    gm = ["Alignment.getConstantAAMask()"]
    # Resolve *code* into a codon -> amino-acid dictionary.
    if isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    elif code is None:
        code = Code(transl_table).code
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg
    # One three-character mask portion per codon slice.
    portions = []
    for codon_slice in self.iter_codon_slices(code):
        if codon_slice_is_constant_aa(codon_slice, restrict_to):
            portions.append("111")
        else:
            portions.append("000")
    return CAT(portions)
def getDegenerateSitesMask(self, transl_table=1, code=None, all_3rd_positions=False):
    """Return a mask corresponding to sites contributing to codon degeneracy.

    This is intended to be used for submatrix extraction using the
    noLRSall3 method, using :meth:`Alignment.getSubsetUsingMask` (with the
    option *inverse=True*, to get the degeneracy-free sites).

    The mask is a string of "0" and "1" holding three characters per
    complete triplet of sites. Within a triplet, a position is flagged
    with "1" when, for at least one amino-acid, the codons observed for
    that amino-acid at this triplet differ at that position. Sites located
    after the last complete triplet are not represented in the mask.

    If *all_3rd_positions* is set to True, then the mask includes all 3rd
    codon positions regardless of their effective contribution to codon
    degeneracy (this holds for triplets showing no degeneracy as well).

    The matrix is expected to start at a first codon position and stop at
    a third codon position.

    *transl_table* is an integer used to determine under which genetic
    code the codons are to be interpreted. The default value of 1
    corresponds to the standard genetic code. Other values can be found in
    p4.GeneticCode.py

    Alternatively, the genetic code can be provided directly, using a
    dictionary *code* whose keys are codons, and the values are the
    corresponding amino-acids. All triplets present in the matrix should
    also be present in the code dictionary, except triplets of indels.
    Codons and the corresponding amino-acids are expected to be in lower
    case (the codons read from the matrix are converted to lower case
    before being looked up). If such a code is provided, the value of
    *transl_table* is ignored.

    A triplet of indels ('---') is recorded as the pseudo amino-acid '-'.
    A codon absent from the code but containing 'n' is recorded as 'x'.
    Any other codon absent from the code raises a P4Error.

    The name of this method noLRSall3 comes from its effect in the case of
    the standard genetic code: it discards the sites participating in
    first position degeneracy for leucine (L) and arginine (R), first and
    second position degeneracy for serine (S), as well as all third codon
    positions where degeneracy is observed (or all of them if
    *all_3rd_positions* is True). Depending on the genetic code used, the
    type of amino-acid affected could be different.

    The goal of the submatrix extraction using the produced mask is to
    remove the sites that could have been affected by composition bias:
    mutations within a set of synonymous codons are more likely to favour
    the codons that conform to the general nucleotide composition.
    However, one could argue that this bias is less likely to have played
    when the observed codons differ by more than one nucleotide and at
    least a non-synonymous mutation has to occur to bridge the gap. With
    the standard genetic code, this occurs for serine codons. Indeed, the
    minimal mutation paths connecting the serine AGY and TCN codon
    categories are
    AGY (serine) <-> TGY (cysteine) <-> TCY (serine)
    and
    AGY (serine) <-> ACY (threonine) <-> TCY (serine)

    The current implementation (as of june 2012) does not check that a
    mutational path between synonymous codons exists, that consists only
    in synonymous point mutations. This may be considered as a bug,
    because you may not want AGY and TCN (or other similar cases that
    could occur with different genetic codes) to be considered as a single
    degeneracy continuum.
    """
    gm = ["Alignment.getDegenerateSitesMask()"]
    if code is None:
        # Use the generalized Code class defined in code_utils.py
        code = Code(transl_table).code
    # Floor division: range() requires an integer (true division would
    # produce a float under Python 3).
    n_codons = self.length // 3
    # Mask portions are collected in a list and joined once at the end.
    mask_parts = []
    # Loop over the successive triplets of sites.
    for c in range(n_codons):
        first_site = 3 * c
        # 3 alignment slices. One for each codon position.
        slices = [self.sequenceSlice(first_site + offset) for offset in (0, 1, 2)]
        # The different codons found for the current triplet of sites.
        # These are not Codon instances, this probably doesn't deal
        # properly with ambiguity codes.
        codons = set(("%s%s%s" % nnn).lower()
                     for nnn in zip(slices[0], slices[1], slices[2]))
        # Record the amino-acids coded at the 3 nucleotides site,
        # and the codons used for each of them.
        aas_codons = {}
        for codon in codons:
            # Determine the corresponding amino-acid.
            if codon == '---':
                aa = '-'
            elif codon in code:
                aa = code[codon]
            elif 'n' in codon:
                # This is a simplification. Some "degenerate" codons
                # can still code an unambiguous amino-acid.
                aa = 'x'
            else:
                gm.append("Codon %s is not defined in the chosen code "
                          "or translation table." % codon)
                gm.append("%s" % str(code))
                raise P4Error(gm)
            # Record the codon used for the aa.
            aas_codons.setdefault(aa, []).append(codon)
        # Determine which positions in the triplet are degenerate.
        # The 3rd position starts flagged when all 3rd positions are
        # requested, so that triplets without any degeneracy are covered too.
        codon_mask = [False, False, bool(all_3rd_positions)]
        # Loop over the codons recorded for each amino-acid.
        for used_codons in aas_codons.values():
            if len(used_codons) > 1:
                # Several codons have been found at this triplet for this
                # amino-acid. A position is degenerate if more than one
                # nucleotide is present there among the used codons.
                for pos in (0, 1, 2):
                    if not codon_mask[pos]:
                        codon_mask[pos] = len(set(cod[pos] for cod in used_codons)) > 1
                if all(codon_mask):
                    # All positions of the triplet have been found to
                    # contribute to some codon degeneracy.
                    # There is no need to search further.
                    break
        # Append the codon mask to the mask.
        mask_parts.append("".join("1" if flagged else "0" for flagged in codon_mask))
    return "".join(mask_parts)
def pseudoTranslate(self, transl_table=1, out_type="standard", code=None):
    """Returns a pseudo protein alignment from *self*, a DNA alignment.

    The result is of datatype standard instead of protein, which allows
    the use of special recodings, like distinguishing between two types of
    serines, like in :meth:`Alignment.recode23aa()`.

    *self* is translated using :attribute:`Code(transl_table).code`.

    Alternatively, the genetic code can be provided through the parameter
    *code*. If such a code is provided, the value of *transl_table* is
    ignored and the datatype of the result is forced to "standard",
    whatever the value of *out_type*. The parameter *code* can take two
    types of values:

    1) It can be a string naming the code to use, as defined in
       Biopython's `CodonTable.unambiguous_dna_by_name.keys()`
    2) It can be a dictionary whose keys are codons, and the values are
       the corresponding amino-acids. All triplets present in the matrix
       should also be present in the code dictionary, except triplets of
       indels. Codons and the corresponding amino-acids are expected to be
       in lower case.

    It may be possible to use a code based on another codon length as 3,
    but this has not been tested as of June 2012. The codon length is
    taken from the first codon found in the code dictionary.

    At the moment, we can only do translations where the sequences are
    phased with the coding frame, ie the first sequence position is the
    first position of the codon, and the last sequence position should be
    a last codon position.

    The default behaviour is to use translation table 1, that is the
    standard genetic code. Other available translation tables, this
    week::

        if transl_table == 1: # standard
        elif transl_table == 2: # vertebrate mito
        elif transl_table == 4: # Mold, Protozoan,
                                # and Coelenterate Mitochondrial Code
                                # and the Mycoplasma/Spiroplasma Code
        elif transl_table == 5: # invertebrate mito
        elif transl_table == 9: # echinoderm mito

    and now 6, 10, 11, 12, 13, 14, 21.

    (These are found in p4.GeneticCode.py or in :class:`Code`)

    *transl_table* may also be provided as text consisting in
    blank-separated elements. Each element consists in n characters,
    where n is the number of defined codons. The first element lists the
    coded (pseudo-)amino-acids. The second element describes whether a
    codon can be a start codon ('M') or not ('-'). The other elements
    correspond to the (pseudo-)nucleotides at the successive codon
    positions. Example::

        FFJJZZZZYY**CC*WBBBBPPPPHHQQUUUUIIIMTTTTNNKKXXOOVVVVAAAADDEEGGGG
        ---M---------------M------------MMMM---------------M------------
        TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
        TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
        TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG

    A P4Error is raised if *self* is not a DNA alignment, if its length
    is not a multiple of the codon length, if the provided code is empty,
    or if a codon is incomplete (partial indel) or unknown.
    """
    gm = ['p4.alignment_recoding.pseudoTranslate()']
    if self.dataType != 'dna':
        gm.append("Self should be a DNA alignment")
        raise P4Error(gm)

    if code is None:
        code = Code(transl_table, in_type="dna", out_type=out_type).code
        codelength = Code(transl_table).codelength
    else:
        if isinstance(code, str):
            code = getBiopythonCode(code)  # defined in code_utils.py
        if not code:
            gm.append("The code dictionary is empty")
            raise P4Error(gm)
        # We assume that the "codons" have all the same length,
        # and we look at the first codon in the dictionary to know this length.
        # (next(iter(...)) works whether keys() is a list or a view.)
        codelength = len(next(iter(code)))
        # We use standard type, because, depending on the code used to make
        # the translation, we may get something that contains symbols not
        # corresponding to normal amino-acids.
        out_type = "standard"

    if self.length % codelength != 0:
        gm.append("The length of self should be a multiple of %i" % codelength)
        raise P4Error(gm)

    ali = self.dupe()
    ali.dataType = out_type
    # Floor division keeps the length an integer under Python 3 as well.
    ali.length = self.length // codelength
    ali.symbols = "".join(sorted(set(code.values())))
    ali.equates = {}
    ali.dim = len(ali.symbols)
    ali.nexusSets = None
    ali.parts = []
    ali.excludeDelete = None

    full_indel = '-' * codelength
    for i in range(len(self.sequences)):
        # the original sequence
        dnaSeq = self.sequences[i].sequence
        # the future pseudo-translation
        pseudoProtSeq = []
        for j in range(ali.length):
            start = j * codelength
            theCodon = dnaSeq[start:start + codelength]
            if theCodon in code:
                pseudoProtSeq.append(code[theCodon])
            elif theCodon == full_indel:
                # full indel
                pseudoProtSeq.append('-')
            elif '-' in theCodon:
                # partial indel
                gm.append(" seq %i, position %4i, dnaSeq %4i, codon '%s' is incomplete" % (
                    i, j, start, theCodon))
                raise P4Error(gm)
            else:
                # Should we use a CodonTranslationError (defined in code_utils.py) here ?
                gm.append(" seq %i position %4i, dnaSeq %4i, codon '%s' is not a known codon" % (
                    i, j, start, theCodon))
                raise P4Error(gm)
        translated = ali.sequences[i]
        translated.sequence = "".join(pseudoProtSeq)
        translated.dataType = out_type
    return ali
def getDegenerateSiteMaskForPos(self, pos, transl_table=1, code=None, restrict_to=[], ignore=[]):
    """Return a mask flagging the *pos*-th codon position of degenerate triplets.

    The mask is a string made of one three-character portion per codon
    slice yielded by ``self.iter_codon_slices(code)``. When
    ``codon_position_is_degenerate`` reports degeneracy for the slice at
    position *pos*, the portion has a "1" at the *pos*-th position (1, 2
    or 3) and "0" elsewhere; otherwise the portion is "000".

    This is intended to be used for submatrix extraction using
    :meth:`Alignment.getSubsetUsingMask` (with the option *inverse=True*,
    to get the degeneracy-free sites).

    If *restrict_to* is not empty, only those amino-acids the one-letter
    code of which is in *restrict_to* are considered.

    If *ignore* is not empty, only those amino-acids the one-letter code
    of which is not in *ignore* are considered.

    The matrix is expected to start at a first codon position and stop at
    a third codon position.

    *transl_table* is an integer used to determine under which genetic
    code the codons are to be interpreted. The default value of 1
    corresponds to the standard genetic code. Other values can be found in
    p4.GeneticCode.py

    Alternatively, the genetic code can be provided through the parameter
    *code*. If such a code is provided, the value of *transl_table* is
    ignored. The parameter *code* can take two types of values:

    1) It can be a string naming the code to use, as defined in
       Biopython's `CodonTable.unambiguous_dna_by_name.keys()`
    2) It can be a dictionary whose keys are codons, and the values are
       the corresponding amino-acids. All triplets present in the matrix
       should also be present in the code dictionary, except triplets of
       indels. Codons and the corresponding amino-acids are expected to be
       in lower case.
    """
    gm = ["Alignment.getDegenerateSiteMaskForPos()"]
    # Resolve *code* into a codon -> amino-acid dictionary.
    if isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    elif code is None:
        code = Code(transl_table).code
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg

    # Mask portion used for the triplets where degeneracy occurs:
    # only the pos-th codon position is flagged.
    flagged_portion = CAT(["1" if i == pos else "0" for i in (1, 2, 3)])

    # Find, for every codon slice, whether degeneracy occurs at *pos*.
    selections = []
    for cod_slice in self.iter_codon_slices(code):
        selections.append(
            codon_position_is_degenerate(cod_slice, pos, restrict_to, ignore))

    # Generate the corresponding mask portions and join them
    # to make the matrix mask.
    portions = []
    for selected in selections:
        if selected:
            portions.append(flagged_portion)
        else:
            portions.append("000")
    return CAT(portions)
def getDegenerateSitesMask(self, transl_table=1, code=None, all_3rd_positions=False):
    """Return a mask corresponding to sites contributing to codon degeneracy.

    This is intended to be used for submatrix extraction using the
    noLRSall3 method, using :meth:`Alignment.getSubsetUsingMask` (with the
    option *inverse=True*, to get the degeneracy-free sites).

    The mask is a string of "0" and "1" holding three characters per
    complete triplet of sites. Within a triplet, a position is flagged
    with "1" when, for at least one amino-acid, the codons observed for
    that amino-acid at this triplet differ at that position. Sites located
    after the last complete triplet are not represented in the mask.

    If *all_3rd_positions* is set to True, then the mask includes all 3rd
    codon positions regardless of their effective contribution to codon
    degeneracy (this holds for triplets showing no degeneracy as well).

    The matrix is expected to start at a first codon position and stop at
    a third codon position.

    *transl_table* is an integer used to determine under which genetic
    code the codons are to be interpreted. The default value of 1
    corresponds to the standard genetic code. Other values can be found in
    p4.GeneticCode.py

    Alternatively, the genetic code can be provided directly, using a
    dictionary *code* whose keys are codons, and the values are the
    corresponding amino-acids. All triplets present in the matrix should
    also be present in the code dictionary, except triplets of indels.
    Codons and the corresponding amino-acids are expected to be in lower
    case (the codons read from the matrix are converted to lower case
    before being looked up). If such a code is provided, the value of
    *transl_table* is ignored.

    A triplet of indels ('---') is recorded as the pseudo amino-acid '-'.
    A codon absent from the code but containing 'n' is recorded as 'x'.
    Any other codon absent from the code raises a P4Error.

    The name of this method noLRSall3 comes from its effect in the case of
    the standard genetic code: it discards the sites participating in
    first position degeneracy for leucine (L) and arginine (R), first and
    second position degeneracy for serine (S), as well as all third codon
    positions where degeneracy is observed (or all of them if
    *all_3rd_positions* is True). Depending on the genetic code used, the
    type of amino-acid affected could be different.

    The goal of the submatrix extraction using the produced mask is to
    remove the sites that could have been affected by composition bias:
    mutations within a set of synonymous codons are more likely to favour
    the codons that conform to the general nucleotide composition.
    However, one could argue that this bias is less likely to have played
    when the observed codons differ by more than one nucleotide and at
    least a non-synonymous mutation has to occur to bridge the gap. With
    the standard genetic code, this occurs for serine codons. Indeed, the
    minimal mutation paths connecting the serine AGY and TCN codon
    categories are
    AGY (serine) <-> TGY (cysteine) <-> TCY (serine)
    and
    AGY (serine) <-> ACY (threonine) <-> TCY (serine)

    The current implementation (as of june 2012) does not check that a
    mutational path between synonymous codons exists, that consists only
    in synonymous point mutations. This may be considered as a bug,
    because you may not want AGY and TCN (or other similar cases that
    could occur with different genetic codes) to be considered as a single
    degeneracy continuum.
    """
    gm = ["Alignment.getDegenerateSitesMask()"]
    if code is None:
        # Use the generalized Code class defined in code_utils.py
        code = Code(transl_table).code
    # Floor division: range() requires an integer (true division would
    # produce a float under Python 3).
    n_codons = self.length // 3
    # Mask portions are collected in a list and joined once at the end.
    mask_parts = []
    # Loop over the successive triplets of sites.
    for c in range(n_codons):
        first_site = 3 * c
        # 3 alignment slices. One for each codon position.
        slices = [self.sequenceSlice(first_site + offset) for offset in (0, 1, 2)]
        # The different codons found for the current triplet of sites.
        # These are not Codon instances, this probably doesn't deal
        # properly with ambiguity codes.
        codons = set(("%s%s%s" % nnn).lower()
                     for nnn in zip(slices[0], slices[1], slices[2]))
        # Record the amino-acids coded at the 3 nucleotides site,
        # and the codons used for each of them.
        aas_codons = {}
        for codon in codons:
            # Determine the corresponding amino-acid.
            if codon == '---':
                aa = '-'
            elif codon in code:
                aa = code[codon]
            elif 'n' in codon:
                # This is a simplification. Some "degenerate" codons
                # can still code an unambiguous amino-acid.
                aa = 'x'
            else:
                gm.append("Codon %s is not defined in the chosen code "
                          "or translation table." % codon)
                gm.append("%s" % str(code))
                raise P4Error(gm)
            # Record the codon used for the aa.
            aas_codons.setdefault(aa, []).append(codon)
        # Determine which positions in the triplet are degenerate.
        # The 3rd position starts flagged when all 3rd positions are
        # requested, so that triplets without any degeneracy are covered too.
        codon_mask = [False, False, bool(all_3rd_positions)]
        # Loop over the codons recorded for each amino-acid.
        for used_codons in aas_codons.values():
            if len(used_codons) > 1:
                # Several codons have been found at this triplet for this
                # amino-acid. A position is degenerate if more than one
                # nucleotide is present there among the used codons.
                for pos in (0, 1, 2):
                    if not codon_mask[pos]:
                        codon_mask[pos] = len(set(cod[pos] for cod in used_codons)) > 1
                if all(codon_mask):
                    # All positions of the triplet have been found to
                    # contribute to some codon degeneracy.
                    # There is no need to search further.
                    break
        # Append the codon mask to the mask.
        mask_parts.append("".join("1" if flagged else "0" for flagged in codon_mask))
    return "".join(mask_parts)
def getDegenerateCodonsMask(self, transl_table=1, code=None, restrict_to=[], ignore=[]):
    """Return a mask flagging the triplets of sites where degeneracy is observed.

    The mask is a string made of one three-character portion per codon
    slice yielded by ``self.iter_codon_slices(code)``: "111" when
    ``codon_slice_is_degenerate`` reports the slice as degenerate, "000"
    otherwise.

    This is intended to be used for submatrix extraction using
    :meth:`Alignment.getSubsetUsingMask` (with the option *inverse=True*,
    to get the sites with no degenerate codons).

    If *restrict_to* is not empty, only those amino-acids the one-letter
    code of which is in *restrict_to* are considered.

    If *ignore* is not empty, only those amino-acids the one-letter code
    of which is not in *ignore* are considered.

    The matrix is expected to start at a first codon position and stop at
    a third codon position.

    *transl_table* is an integer used to determine under which genetic
    code the codons are to be interpreted. The default value of 1
    corresponds to the standard genetic code. Other values can be found in
    p4.GeneticCode.py

    Alternatively, the genetic code can be provided through the parameter
    *code*. If such a code is provided, the value of *transl_table* is
    ignored. The parameter *code* can take two types of values:

    1) It can be a string naming the code to use, as defined in
       Biopython's `CodonTable.unambiguous_dna_by_name.keys()`
    2) It can be a dictionary whose keys are codons, and the values are
       the corresponding amino-acids. All triplets present in the matrix
       should also be present in the code dictionary, except triplets of
       indels. Codons and the corresponding amino-acids are expected to be
       in lower case.
    """
    gm = ["Alignment.getDegenerateCodonsMask()"]
    # Resolve *code* into a codon -> amino-acid dictionary.
    if isinstance(code, str):
        code = getBiopythonCode(code)  # defined in code_utils.py
    elif code is None:
        code = Code(transl_table).code
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg
    # One three-character mask portion per codon slice, in alignment order.
    portions = [
        "111" if codon_slice_is_degenerate(c_slice, restrict_to, ignore) else "000"
        for c_slice in self.iter_codon_slices(code)
    ]
    return CAT(portions)
def pseudoTranslate(self, transl_table=1, out_type="standard", code=None):
    """Returns a pseudo protein alignment from *self*, a DNA alignment.

    The result is of datatype standard instead of protein, which allows
    the use of special recodings, like distinguishing between two types of
    serines, like in :meth:`Alignment.recode23aa()`.

    *self* is translated using :attribute:`Code(transl_table).code`.

    Alternatively, the genetic code can be provided through the parameter
    *code*. If such a code is provided, the value of *transl_table* is
    ignored and the datatype of the result is forced to "standard",
    whatever the value of *out_type*. The parameter *code* can take two
    types of values:

    1) It can be a string naming the code to use, as defined in
       Biopython's `CodonTable.unambiguous_dna_by_name.keys()`
    2) It can be a dictionary whose keys are codons, and the values are
       the corresponding amino-acids. All triplets present in the matrix
       should also be present in the code dictionary, except triplets of
       indels. Codons and the corresponding amino-acids are expected to be
       in lower case.

    It may be possible to use a code based on another codon length as 3,
    but this has not been tested as of June 2012. The codon length is
    taken from the first codon found in the code dictionary.

    At the moment, we can only do translations where the sequences are
    phased with the coding frame, ie the first sequence position is the
    first position of the codon, and the last sequence position should be
    a last codon position.

    The default behaviour is to use translation table 1, that is the
    standard genetic code. Other available translation tables, this
    week::

        if transl_table == 1: # standard
        elif transl_table == 2: # vertebrate mito
        elif transl_table == 4: # Mold, Protozoan,
                                # and Coelenterate Mitochondrial Code
                                # and the Mycoplasma/Spiroplasma Code
        elif transl_table == 5: # invertebrate mito
        elif transl_table == 9: # echinoderm mito

    and now 6, 10, 11, 12, 13, 14, 21.

    (These are found in p4.GeneticCode.py or in :class:`Code`)

    *transl_table* may also be provided as text consisting in
    blank-separated elements. Each element consists in n characters,
    where n is the number of defined codons. The first element lists the
    coded (pseudo-)amino-acids. The second element describes whether a
    codon can be a start codon ('M') or not ('-'). The other elements
    correspond to the (pseudo-)nucleotides at the successive codon
    positions. Example::

        FFJJZZZZYY**CC*WBBBBPPPPHHQQUUUUIIIMTTTTNNKKXXOOVVVVAAAADDEEGGGG
        ---M---------------M------------MMMM---------------M------------
        TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
        TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
        TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG

    A P4Error is raised if *self* is not a DNA alignment, if its length
    is not a multiple of the codon length, if the provided code is empty,
    or if a codon is incomplete (partial indel) or unknown.
    """
    gm = ['p4.alignment_recoding.pseudoTranslate()']
    if self.dataType != 'dna':
        gm.append("Self should be a DNA alignment")
        raise P4Error(gm)

    if code is None:
        code = Code(transl_table, in_type="dna", out_type=out_type).code
        codelength = Code(transl_table).codelength
    else:
        if isinstance(code, str):
            code = getBiopythonCode(code)  # defined in code_utils.py
        if not code:
            gm.append("The code dictionary is empty")
            raise P4Error(gm)
        # We assume that the "codons" have all the same length,
        # and we look at the first codon in the dictionary to know this length.
        # (dict views are not subscriptable under Python 3, hence next(iter(...)).)
        codelength = len(next(iter(code)))
        # We use standard type, because, depending on the code used to make
        # the translation, we may get something that contains symbols not
        # corresponding to normal amino-acids.
        out_type = "standard"

    if self.length % codelength != 0:
        gm.append("The length of self should be a multiple of %i" % codelength)
        raise P4Error(gm)

    ali = self.dupe()
    ali.dataType = out_type
    # Floor division keeps the length an integer under Python 3 as well.
    ali.length = self.length // codelength
    ali.symbols = "".join(sorted(set(code.values())))
    ali.equates = {}
    ali.dim = len(ali.symbols)
    ali.nexusSets = None
    ali.parts = []
    ali.excludeDelete = None

    full_indel = '-' * codelength
    for i in range(len(self.sequences)):
        # the original sequence
        dnaSeq = self.sequences[i].sequence
        # the future pseudo-translation
        pseudoProtSeq = []
        for j in range(ali.length):
            start = j * codelength
            theCodon = dnaSeq[start:start + codelength]
            if theCodon in code:
                pseudoProtSeq.append(code[theCodon])
            elif theCodon == full_indel:
                # full indel
                pseudoProtSeq.append('-')
            elif '-' in theCodon:
                # partial indel
                gm.append(
                    " seq %i, position %4i, dnaSeq %4i, codon '%s' is incomplete"
                    % (i, j, start, theCodon))
                raise P4Error(gm)
            else:
                # Should we use a CodonTranslationError (defined in code_utils.py) here ?
                gm.append(
                    " seq %i position %4i, dnaSeq %4i, codon '%s' is not a known codon"
                    % (i, j, start, theCodon))
                raise P4Error(gm)
        translated = ali.sequences[i]
        translated.sequence = "".join(pseudoProtSeq)
        translated.dataType = out_type
    return ali