# Re-code each input sequence with randomly chosen codons whose relative
# adaptiveness is at least options.min_rel_adapt, optionally avoiding the
# codon that the original sequence uses at each position.
gc = translate.geneticCode(rna=False)
codons = {}
opt_codon_dict = {gc[c]: c for c in opt_codons}
# Tryptophan and methionine are set explicitly to their single codons.
opt_codon_dict['W'] = 'TGG'
opt_codon_dict['M'] = 'ATG'
opt_headers = []
opt_seqs = []
# optimize the codon sequences
# NOTE(review): `id` shadows the builtin; the name is kept because code
# outside this section may read it.
for (id, seq) in seqs:
    orig_codons = list(translate.codons(seq))
    prot_seq = translate.translate(seq)
    if prot_seq is not None:
        # Candidate codons per amino acid: those meeting the threshold.
        for aa in translate.AAs():
            codons[aa] = [
                c for c in translate.getCodonsForAA(aa, rna=False)
                if relad_dict[c] >= options.min_rel_adapt
            ]
        chosen_codons = []
        for (aai, aa) in enumerate(prot_seq):
            codons_to_choose_from = codons[aa]
            # If avoiding codons and we have a choice, eliminate the avoided
            # codon.  Build a new list rather than calling .remove() on
            # codons[aa]: that list is shared by every position carrying
            # this amino acid, so removing in place would progressively
            # shrink the choices for all later positions.
            if options.avoid_sequence and len(codons_to_choose_from) > 1:
                codons_to_choose_from = [
                    c for c in codons_to_choose_from
                    if c != orig_codons[aai]
                ]
            chosen_codons.append(random.choice(codons_to_choose_from))
        opt_seq = ''.join(chosen_codons)
        # Sanity check: the re-coded gene must encode the same protein.
        assert translate.translate(opt_seq) == prot_seq
def test_run(self):
    """Check additivity of ln odds ratios across reference codons.

    Tests whether ln odds X/Y + ln odds Y/Z = ln odds X/Z, focusing on
    alanine (GCN): the score for GCA->GCC plus the score for GCC->GCT
    must equal the score for GCA->GCT to within 1e-6.  Input is a seeded
    random protein and a mutated copy, each randomly reverse-translated.
    """
    random.seed(111)
    gc = translate.geneticCode(rna=False)

    # Build two dictionaries mapping every codon to a reference codon for
    # the same amino acid: the first and the last synonymous codon when
    # ordered by reversed spelling.
    reference_codon_dict1 = {}
    reference_codon_dict2 = {}
    for codon in translate.AADNACodons():
        aa = gc[codon]
        aa_codons = translate.getCodonsForAA(aa, rna=False)
        # Sort in alphabetical order by reverse.
        aa_codons.sort(key=lambda x: x[::-1])
        reference_codon_dict1[codon] = aa_codons[0]
        reference_codon_dict2[codon] = aa_codons[-1]
    # Pin the references for GCA so the chain GCA->GCC->GCT is exercised.
    reference_codon_dict1['GCA'] = 'GCC'
    reference_codon_dict2['GCA'] = 'GCT'

    # Synthetic data; statement order matters because every helper below
    # draws from the seeded random generator.
    species = ['x', 'y']
    prot1 = randomProtein(1000)
    prot2 = mutProtein(prot1)
    prots = {"x": prot1, "y": prot2}
    algenes = {
        s: translate.randomReverseTranslate(prots[s], bad_codon='---')
        for s in species
    }

    gene_codon_tables1 = cai.getAkashi2x2TablesForORFRefCodon(
        cai.conservedAA, reference_codon_dict1,
        algenes['x'], prots['x'], [algenes['y']], [prots['y']],
        pseudocount=0, n_terminal_start=0)
    gene_codon_tables2 = cai.getAkashi2x2TablesForORFRefCodon(
        cai.conservedAA, reference_codon_dict2,
        algenes['x'], prots['x'], [algenes['y']], [prots['y']],
        pseudocount=0, n_terminal_start=0)

    eps = 1e-6
    # Only GCA is checked, since only its references were pinned above.
    for codon in ['GCA']:
        ref_codon1 = reference_codon_dict1[codon]
        ref_codon2 = reference_codon_dict2[codon]
        # Both references must be synonymous with the codon under test.
        self.assertEqual(gc[codon], gc[ref_codon1])
        self.assertEqual(gc[codon], gc[ref_codon2])

        # codon to ref_codon1
        mh_res1 = stats.MantelHaenszelOddsRatioVariance(
            gene_codon_tables1[codon])
        sc_1_to_r1 = -mh_res1.ln_odds_ratio
        # ref_codon1 to ref_codon2
        mh_res2 = stats.MantelHaenszelOddsRatioVariance(
            gene_codon_tables2[ref_codon1])
        sc_r1_to_r2 = -mh_res2.ln_odds_ratio
        # codon to ref_codon2
        mh_res3 = stats.MantelHaenszelOddsRatioVariance(
            gene_codon_tables2[codon])
        sc_1_to_r2 = -mh_res3.ln_odds_ratio

        # prediction from additivity
        pred_sc_1_to_r2 = sc_1_to_r1 + sc_r1_to_r2
        self.assertLess(abs(sc_1_to_r2 - pred_sc_1_to_r2), eps)
data_outs = util.OutStreams()
# Start up output: write to the requested file, or to stdout if none given.
if options.out_fname is not None:
    # open() replaces the Python-2-only file() builtin.  The handle stays
    # open because data_outs keeps writing to it.
    outf = open(options.out_fname, 'w')
    data_outs.addStream(outf)
else:
    data_outs.addStream(sys.stdout)

# Read the input sequences and compute the per-codon SYN scores.
formatFxn = biofile.getIDFunction(options.format)
cdna_dict = biofile.readFASTADict(in_fname, formatFxn)
calc = Calculator()
calc.initializeFromSequences(cdna_dict.values(), options.pseudocount)
syn_dict = calc.getCodonSYNScores()

# For each degenerate amino acid pick the codon with the highest score;
# ties are broken by the codon string itself (tuple comparison).
syn_opt_codons = []
for aa in translate.degenerateAAs():
    codons = translate.getCodonsForAA(aa, rna=False)
    best_syn_codon = max((syn_dict[c], c) for c in codons)[1]
    syn_opt_codons.append(best_syn_codon)

# Summary header.
data_outs.write("# Read {0}\n#{1:d} sequences, {2:d} codons, {3:d} nucleotides\n".format(
    in_fname,
    len(cdna_dict),
    int(sum(calc.codon_freq.values())),
    int(sum(calc.nucleotide_freq.values()))))
data_outs.write("# syn_scores = {0!s}\n".format(syn_dict))
data_outs.write("# SYN opt codons = {0!s}\n".format(sorted(syn_opt_codons)))
data_outs.write("{0!s}".format(calc))

if options.score_dict_fname is not None:
    # Pickles are byte streams: write in binary mode, and close the handle
    # deterministically so the data is flushed to disk.
    with open(options.score_dict_fname, 'wb') as score_dict_file:
        pickle.dump(syn_dict, score_dict_file)

if options.score_fname is not None:
    # Left open on purpose: the per-ORF rows are presumably written by the
    # code that follows this section -- TODO confirm it closes the handle.
    outf = open(options.score_fname, 'w')
    outf.write("orf\tsyn\n")
    orfs = cdna_dict.keys()
    n_written = 0
def test_run(self):
    """Check additivity of ln odds ratios across reference codons.

    Tests whether ln odds X/Y + ln odds Y/Z = ln odds X/Z, focusing on
    alanine (GCN): the score for GCA->GCC plus the score for GCC->GCT
    must equal the score for GCA->GCT to within 1e-6.  Input is a seeded
    random protein and a mutated copy, each randomly reverse-translated.
    """
    random.seed(111)
    gc = translate.geneticCode(rna=False)

    # Build two dictionaries mapping every codon to a reference codon for
    # the same amino acid: the first and the last synonymous codon when
    # ordered by reversed spelling.
    reference_codon_dict1 = {}
    reference_codon_dict2 = {}
    for codon in translate.AADNACodons():
        aa = gc[codon]
        aa_codons = translate.getCodonsForAA(aa, rna=False)
        # Sort in alphabetical order by reverse.
        aa_codons.sort(key=lambda x: x[::-1])
        reference_codon_dict1[codon] = aa_codons[0]
        reference_codon_dict2[codon] = aa_codons[-1]
    # Pin the references for GCA so the chain GCA->GCC->GCT is exercised.
    reference_codon_dict1['GCA'] = 'GCC'
    reference_codon_dict2['GCA'] = 'GCT'

    # Synthetic data; statement order matters because every helper below
    # draws from the seeded random generator.
    species = ['x', 'y']
    prot1 = randomProtein(1000)
    prot2 = mutProtein(prot1)
    prots = {"x": prot1, "y": prot2}
    algenes = {
        s: translate.randomReverseTranslate(prots[s], bad_codon='---')
        for s in species
    }

    gene_codon_tables1 = cai.getAkashi2x2TablesForORFRefCodon(
        cai.conservedAA, reference_codon_dict1,
        algenes['x'], prots['x'], [algenes['y']], [prots['y']],
        pseudocount=0, n_terminal_start=0)
    gene_codon_tables2 = cai.getAkashi2x2TablesForORFRefCodon(
        cai.conservedAA, reference_codon_dict2,
        algenes['x'], prots['x'], [algenes['y']], [prots['y']],
        pseudocount=0, n_terminal_start=0)

    eps = 1e-6
    # Only GCA is checked, since only its references were pinned above.
    for codon in ['GCA']:
        ref_codon1 = reference_codon_dict1[codon]
        ref_codon2 = reference_codon_dict2[codon]
        # Both references must be synonymous with the codon under test.
        self.assertEqual(gc[codon], gc[ref_codon1])
        self.assertEqual(gc[codon], gc[ref_codon2])

        # codon to ref_codon1
        mh_res1 = stats.MantelHaenszelOddsRatioVariance(
            gene_codon_tables1[codon])
        sc_1_to_r1 = -mh_res1.ln_odds_ratio
        # ref_codon1 to ref_codon2
        mh_res2 = stats.MantelHaenszelOddsRatioVariance(
            gene_codon_tables2[ref_codon1])
        sc_r1_to_r2 = -mh_res2.ln_odds_ratio
        # codon to ref_codon2
        mh_res3 = stats.MantelHaenszelOddsRatioVariance(
            gene_codon_tables2[codon])
        sc_1_to_r2 = -mh_res3.ln_odds_ratio

        # prediction from additivity
        pred_sc_1_to_r2 = sc_1_to_r1 + sc_r1_to_r2
        self.assertLess(abs(sc_1_to_r2 - pred_sc_1_to_r2), eps)
# Re-code each input sequence with randomly chosen codons whose relative
# adaptiveness is at least options.min_rel_adapt, optionally avoiding the
# codon that the original sequence uses at each position, and report
# Fop / CAI / GC for every re-coded sequence.
info_outs.write("# Optimizing sequences...\n")
gc = translate.geneticCode(rna=False)
codons = {}
opt_codon_dict = {gc[c]: c for c in opt_codons}
# Tryptophan and methionine are set explicitly to their single codons.
opt_codon_dict['W'] = 'TGG'
opt_codon_dict['M'] = 'ATG'
opt_headers = []
opt_seqs = []
# optimize the codon sequences
# NOTE(review): `id` shadows the builtin; the name is kept because code
# outside this section may read it.
for (id, seq) in seqs:
    orig_codons = list(translate.codons(seq))
    prot_seq = translate.translate(seq)
    if prot_seq is not None:
        # Candidate codons per amino acid: those meeting the threshold.
        for aa in translate.AAs():
            codons[aa] = [
                c for c in translate.getCodonsForAA(aa, rna=False)
                if relad_dict[c] >= options.min_rel_adapt
            ]
        chosen_codons = []
        for (aai, aa) in enumerate(prot_seq):
            codons_to_choose_from = codons[aa]
            # If avoiding codons and we have a choice, eliminate the avoided
            # codon.  Build a new list rather than calling .remove() on
            # codons[aa]: that list is shared by every position carrying
            # this amino acid, so removing in place would progressively
            # shrink the choices for all later positions.
            if options.avoid_sequence and len(codons_to_choose_from) > 1:
                codons_to_choose_from = [
                    c for c in codons_to_choose_from
                    if c != orig_codons[aai]
                ]
            chosen_codons.append(random.choice(codons_to_choose_from))
        opt_seq = ''.join(chosen_codons)
        # Sanity check: the re-coded gene must encode the same protein.
        assert translate.translate(opt_seq) == prot_seq
        header_line = "{0} Fop = {1:.4f}, CAI = {2:.4f}, GC = {3:.2f}".format(
            id, cai.getFop(opt_seq, opt_codons), cai_fxn(opt_seq),
            cai.getGC(opt_seq))
        info_outs.write("# Optimized {}\n".format(header_line))
        opt_headers.append(header_line)