def disambiguateaa(input, output):
    """Replace IUPAC ambiguous amino acids with unambiguous ones

    Specifically, make the following replacements:

    B => DN
    X => ACDEFGHIKLMNPQRSTVWY
    Z => EQ
    J => LI
    U => C (selenocysteine)
    O => K (pyrrolysine)

    If there are multiple possible replacements, this operation will output a
    sequence for each possible option. Use caution with sequences that are
    highly ambiguous (e.g., with many Xs), as in this case a single sequence
    could lead to an explosion in the output.

    When a record expands to more than one sequence, each output record is
    named "<name>|disambig_<k>", where <k> is the 1-based index of the
    expansion, zero-padded to the width of the total count. A record with a
    single expansion keeps its original name.

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.
    """
    # NOTE(review): `input` is handed straight to readfq() and `output` to
    # print(file=...), so both must already be usable as streams when this
    # body runs; presumably a CLI decorator outside this view opens the
    # paths -- confirm.
    for name, ambig, _qual in readfq(input):
        n = num_disambiguated_iupac_aa(ambig)
        # Pad width is the number of decimal digits in n. Counting characters
        # avoids the float round-trip of floor(log10(n)) + 1 and does not
        # raise when n is 0.
        digits = len(str(n))
        for i, unambig in enumerate(disambiguate_iupac_aa(ambig), start=1):
            # Build the record name directly with an f-string rather than
            # embedding `name` in a str.format template: names containing
            # "{" or "}" would otherwise be parsed as replacement fields.
            if n > 1:
                record_name = f"{name}|disambig_{i:0{digits}d}"
            else:
                record_name = name
            print(f">{record_name}\n{unambig}", file=output)
def test_O(self):
    """A single O in the sequence disambiguates to exactly one result, K."""
    expected = {"AAKAA"}
    observed = set(map(str, disambiguate_iupac_aa(Seq("AAOAA"))))
    assert observed == expected
def test_adjacent_ambig(self):
    """Two adjacent J residues expand to all four L/I combinations."""
    expected = {"AALLAA", "AALIAA", "AAILAA", "AAIIAA"}
    # Stringify each result before collecting it: hashing a Seq directly
    # triggers a Biopython warning.
    observed = {str(seq) for seq in disambiguate_iupac_aa(Seq("AAJJAA"))}
    assert len(observed) == 4
    assert observed == expected
def test_Z(self):
    """A single Z in the sequence disambiguates to E and Q."""
    expected = {"AAEAA", "AAQAA"}
    observed = set(map(str, disambiguate_iupac_aa(Seq("AAZAA"))))
    assert observed == expected
def test_J(self):
    """A single J in the sequence disambiguates to L and I."""
    expected = {"AALAA", "AAIAA"}
    observed = set(map(str, disambiguate_iupac_aa(Seq("AAJAA"))))
    assert observed == expected
def test_X(self):
    """A single X expands to one sequence per item of all_aa_protein_seq."""
    # One expected string per element of all_aa_protein_seq, substituted
    # into the middle position.
    expected = set(map("AA{}AA".format, all_aa_protein_seq))
    observed = set(map(str, disambiguate_iupac_aa(Seq("AAXAA"))))
    assert observed == expected
def test_B(self):
    """A single B in the sequence disambiguates to D and N."""
    expected = {"AADAA", "AANAA"}
    observed = set(map(str, disambiguate_iupac_aa(Seq("AABAA"))))
    assert observed == expected
def test_unambig(self):
    """Disambiguating all_aa_protein_seq yields that one sequence unchanged."""
    results = [*disambiguate_iupac_aa(all_aa_protein_seq)]
    assert len(results) == 1
    only_result = results[0]
    assert only_result == all_aa_protein_seq
def test_adjacent_ambig(self):
    """Two adjacent J residues expand to all four L/I combinations."""
    ambig = Seq("AAJJAA", protein)
    # Convert each result to str before building the set: the expected values
    # are plain strings, and this avoids hashing Seq objects and relying on
    # Seq-to-str equality, in line with the other tests here.
    proteins = {str(p) for p in disambiguate_iupac_aa(ambig)}
    assert len(proteins) == 4
    assert proteins == {"AALLAA", "AALIAA", "AAILAA", "AAIIAA"}
def test_U(self):
    """A single U in the sequence disambiguates to exactly one result, C."""
    expected = {"AACAA"}
    observed = set(map(str, disambiguate_iupac_aa(Seq("AAUAA", protein))))
    assert observed == expected