def calculate_codon_properties(cds_dict, args):
    """Compute RSCU (and optionally CAI) per protein-coding gene and write the results.

    Args:
        cds_dict: Mapping whose keys are indexable records and whose values are
            the corresponding coding sequences. It is first passed through
            ``filter_ambiguous`` together with the gene list below.
            (presumably keys are tuples with at least five fields, since
            element ``[4]`` is read when CAI is requested -- TODO confirm)
        args: Options object; ``args.CAI`` and ``args.GeneCode`` are read here
            and the whole object is forwarded to ``write_codon_properties``.

    Returns:
        None. One ``write_codon_properties`` call is made for every
        (key, gene) pair where the gene name is contained in the key.
    """
    print("\n\nPerforming codon analyses now\n")
    pcg = ['cox1', 'cox3', 'atp6', 'atp8', 'nad4l', 'nad4', 'nad6', 'nad1',
           'nad5', 'cob', 'nad2', 'nad3', 'cox2']
    cds_dict = filter_ambiguous(cds_dict, pcg)

    # NOTE(review): `reference` is created once and never cleared, so it keeps
    # growing across keys/genes (and ends up containing sequences that the
    # `other_tpl != tupl` test excluded earlier). Kept as-is to preserve the
    # current CAI values -- confirm whether a per-gene reference was intended.
    reference = []

    total = len(cds_dict)
    for i, tupl in enumerate(cds_dict, start=1):
        # Progress indicator, rewritten in place on the same terminal line.
        print(' ' + str(round(i / total * 100, 2)) + '%', end="\r")

        for gene in pcg:
            if gene not in tupl:
                continue

            genetic_code = int(args.GeneCode)
            cai_value = "NA"
            # Drop the last three characters of the sequence
            # (presumably the terminal stop codon -- verify against caller).
            sequence = cds_dict[tupl][:-3]

            # NOTE(review): the flag is compared to True and then also compared
            # against field [4] of each key; both comparisons are kept verbatim.
            if args.CAI == True:
                # Iterating the keys directly yields the same record the
                # original re-fetched through a hand-maintained index into
                # list(cds_dict.keys()).
                reference.extend(
                    cds_dict[other_tpl]
                    for other_tpl in cds_dict
                    if other_tpl != tupl and other_tpl[4] == args.CAI
                )
                cai_value = CAI(sequence, reference=reference,
                                genetic_code=genetic_code)

            rscu_values = RSCU([sequence], genetic_code=genetic_code)
            write_codon_properties(file=tupl, cai=cai_value,
                                   rscu=rscu_values, args=args)
def test_cai():
    """CAI returns the same score for each accepted form of reference data."""
    # The three ways of supplying reference data must agree with each other.
    via_reference = CAI("AAC", reference=["AAC"])
    via_rscus = CAI("AAC", RSCUs=RSCU(["AAC"]))
    via_weights = CAI("AAC", weights=relative_adaptiveness(sequences=["AAC"]))
    assert via_reference == via_rscus == via_weights == 1.0

    # Other query sequences scored against the same single-codon reference.
    for query in ("AAT", "AATAAT", "AAT" * 100):
        assert CAI(query, reference=["AAC"]) == 0.5
def test_bad_args():
    """CAI raises TypeError when reference data is missing or given twice."""
    # No reference data at all.
    with pytest.raises(TypeError):
        CAI("AAC")

    # Both a reference list and an RSCU table supplied together.
    with pytest.raises(TypeError):
        conflicting = {"reference": ["AAC"], "RSCUs": RSCU(["AAC"])}
        CAI("AAC", **conflicting)
def test_stop_codon():
    """Stop-codon-only inputs yield the same RSCU table as the blank input.

    Stop codons do not have RSCUs, so they should not contribute anything.
    """
    # NOTE(review): the third input is a one-space string, not "" as the
    # original "empty string" wording suggests -- confirm this is intended.
    taa_table, tag_table, blank_table = (
        RSCU([text]) for text in ("TAA", "TAG", " ")
    )
    assert taa_table == tag_table == blank_table
def test_multiple_seqs():
    """RSCU of several sequences equals RSCU of their concatenation."""
    cases = (
        ["AAC", "ATC"],
        ["AAC", "ATC", "AACGATACGGCACGT"],
    )
    for parts in cases:
        joined = "".join(parts)
        assert RSCU(parts) == RSCU([joined])
def test_seq():
    """RSCU gives the same result for Bio.Seq objects as for plain strings."""
    from Bio.Seq import Seq

    for bases in ("AGC", "AACGATACGGCACGT"):
        assert RSCU([Seq(bases)]) == RSCU([bases])
def test_str_arg():
    """RSCU raises ValueError when handed a bare string instead of a list."""
    pytest.raises(ValueError, RSCU, "AAA")
def test_sum():
    """The RSCU values add up to the number of codons in the table."""
    tolerance = 0.0001
    for seq in ("AAC", "AACGATACGGCACGT"):
        rscu_total = sum(RSCU([seq]).values())
        # NOTE(review): the table length is always taken from RSCU(["AAC"]),
        # including for the longer sequence -- presumably every table has the
        # same number of entries; confirm.
        table_size = len(RSCU(["AAC"]))
        assert abs(rscu_total - table_size) < tolerance
def test_rscu():
    """RSCU(["AAC"]) is 1.0 for every codon except the AAC/AAT pair."""
    # The three triplets that the expected table does not contain.
    absent = ("TAA", "TAG", "TGA")
    expected = {
        first + second + third: 1.0
        for first in "ACGT"
        for second in "ACGT"
        for third in "ACGT"
        if first + second + third not in absent
    }
    # Only the observed codon and its partner deviate from 1.0.
    expected["AAC"] = 1 / (0.5 * (1 + 0.5))
    expected["AAT"] = 0.5 / (0.5 * (1 + 0.5))

    assert RSCU(["AAC"]) == expected
seq_list = [] counter = 0 n_count = 0 seq_object = SeqIO.parse(input_path, "fasta") for seqs in seq_object: seq_id = seqs.id seq = str(seqs.seq) seq_list.append(seq) counter += 1 nn = len(seq) n_count += nn print("\n" + str(counter) + " genes imported containing " + str(n_count) + " nucleotides") print("\nCalculating RSCU for imported genes\n") try: RSCU_list = RSCU(seq_list) except: print("\nEXCEPTION: RSCU could not be caluclated for imported genes") print("\nParsing RSCU for codon optimization") #To do: Create RSCU parser function for k, v in codon_table_11.items(): for k2, v2 in v.items(): if k2 in RSCU_list: codon_table_11[k][k2] = RSCU_list[k2] print("\nOptimizing codons for input gene list") #Read gene fasta sequence and initiate optimizer problem = DnaOptimizationProblem(
def test_bad_args():
    """relative_adaptiveness raises TypeError with no input or with both inputs."""
    # Called with no arguments at all.
    pytest.raises(TypeError, relative_adaptiveness)

    # Called with both reference sequences and an RSCU table.
    with pytest.raises(TypeError):
        both_inputs = {"sequences": ["AAC"], "RSCUs": RSCU(["AAC"])}
        relative_adaptiveness(**both_inputs)
def test_arg_equivalence():
    """Reference sequences and their RSCU table produce identical weights."""
    from_sequences = relative_adaptiveness(sequences=["AAC"])
    from_rscus = relative_adaptiveness(RSCUs=RSCU(["AAC"]))
    assert from_sequences == from_rscus
def test_rscu():
    """Check the full RSCU table produced for the single codon AAC."""
    # Every codon listed here is expected to score exactly 1.0.
    unit_codons = """
        AAA AAG ACA ACC ACG ACT AGA AGC AGG AGT ATA ATC ATG ATT
        CAA CAC CAG CAT CCA CCC CCG CCT CGA CGC CGG CGT CTA CTC CTG CTT
        GAA GAC GAG GAT GCA GCC GCG GCT GGA GGC GGG GGT GTA GTC GTG GTT
        TAC TAT TCA TCC TCG TCT TGC TGG TGT TTA TTC TTG TTT
    """.split()
    expected = dict.fromkeys(unit_codons, 1.0)
    # The observed codon and its partner are the only non-unit entries.
    expected.update({
        'AAC': 1 / (0.5 * (1 + 0.5)),
        'AAT': 0.5 / (0.5 * (1 + 0.5)),
    })

    assert RSCU(["AAC"]) == expected
return ss.pearsonr(x, y)[0] elif "spearman" in corelationFunction: return ss.spearmanr(x, y, nan_policy="omit")[0] elif "kendall" in corelationFunction: return ss.kendalltau(x, y, nan_policy="omit")[0] geneDict = FSB.findSequenceByID(targetFastaFile, idType="raw") print("Adult Set:") seqList = [] for gene in geneDict: seq = geneDict[gene] seqList.append(seq) adultRSCU = RSCU(seqList) correlationList = [] for gene in geneDict: seq = geneDict[gene] rscu = RSCU([seq]) gene_profile = RSCU_To_List(rscu) correlationList.append( testCorelation(gene_profile, RSCU_To_List(adultRSCU), "spearman")) #print(correlationList) plt.figure() plt.xlabel("Correlation") plt.xlim(0, 1) plt.hist(correlationList, bins=20) plt.show()
from Bio import SeqIO
from Bio.Seq import Seq
from CAI import RSCU
from matplotlib import pyplot as plt
import numpy as np

# Read KRas.gb, compute the RSCU table of a CDS feature, split that table into
# parallel codon/value lists and set up the matplotlib figure.

# Stays None if no CDS feature is ever found, so the failure can be reported
# explicitly instead of surfacing later as a NameError.
codon_pre = None

# NOTE(review): the `break` below only leaves the feature loop, so when the
# file holds several records the table from the last record with a CDS wins.
for seq_record in SeqIO.parse("KRas.gb", "genbank"):
    for f in seq_record.features:
        rscu_list = []
        if f.type == 'CDS':
            feature_seq = f.location.extract(seq_record).seq
            coding_result = Seq(str(feature_seq))
            # RSCU() needs a list as its input, so wrap the single CDS in one.
            rscu_list.append(coding_result)
            codon_pre = RSCU(rscu_list)
        # Stop scanning features as soon as the RSCU input is non-empty.
        if rscu_list:
            break

if codon_pre is None:
    raise SystemExit("No CDS feature found in KRas.gb")

# Split the table into parallel lists of codons and their RSCU values.
code = []
value = []
for k, values in codon_pre.items():
    code.append(k)
    value.append(values)

# Select the non-interactive Agg backend before creating the figure.
plt.switch_backend('Agg')
plt.figure(figsize=(20, 6), dpi=80)
width = 0.8