def test_codon_usage_custom(self):
    """Test Codon Adaptation Index (CAI) using FASTA file for background.

    Each single-part CDS feature of the GenBank record is sliced out of the
    record sequence, checked against its annotated translation and collected.
    The collected sequences are written to a temporary FASTA file, a CAI
    index is generated from that file, and a short test gene is scored with
    it.  The temporary file is removed even when a step fails.
    """
    # We need a FASTA file of CDS sequences to count the codon usage...
    dna_fasta_filename = "fasta.tmp"
    dna_genbank_filename = "GenBank/NC_005816.gb"
    record = SeqIO.read(dna_genbank_filename, "genbank")
    records = []
    for feature in record.features:
        if feature.type != "CDS" or len(feature.location.parts) != 1:
            continue
        start = feature.location.start.position
        end = feature.location.end.position
        table = int(feature.qualifiers["transl_table"][0])
        if feature.strand == -1:
            seq = record.seq[start:end].reverse_complement()
        else:
            seq = record.seq[start:end]
        # Double check we have the CDS sequence expected.  The first codon
        # is skipped and replaced by a literal "M" before comparing.
        # TODO - Use any cds_start option if/when added to deal with the met
        a = "M" + str(seq[3:].translate(table))
        b = feature.qualifiers["translation"][0] + "*"
        self.assertEqual(a, b, "%r vs %r" % (a, b))
        records.append(
            SeqRecord(
                seq,
                id=feature.qualifiers["protein_id"][0],
                description=feature.qualifiers["product"][0],
            )
        )

    try:
        with open(dna_fasta_filename, "w") as handle:
            SeqIO.write(records, handle, "fasta")

        CAI = CodonAdaptationIndex()
        # Note - this needs a FASTA file containing non-ambiguous DNA coding
        # sequences - which should each be a whole number of codons.
        CAI.generate_index(dna_fasta_filename)
        # Now check codon usage index (CAI) using this species
        self.assertEqual(
            record.annotations["source"],
            "Yersinia pestis biovar Microtus str. 91001",
        )
        value = CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG")
        self.assertAlmostEqual(value, 0.67213, places=5)
    finally:
        # Clean up on failure as well, so a failing assertion does not leave
        # the temporary file behind in the working directory.
        if os.path.exists(dna_fasta_filename):
            os.remove(dna_fasta_filename)
def calc_cai(sequence, genbank, cai_freqs=CAI_FREQS):
    """Return the CAI of the concatenated CDS features of a sequence.

    Args:
        sequence: Sequence handed to each CDS feature's ``extract`` method.
        genbank: Object whose ``features`` are scanned for entries whose
            ``type`` equals "cds" (case-insensitive).
        cai_freqs: Mapping from codon to frequency; it is indexed with every
            codon listed in ``SynonymousCodons``.

    Returns:
        The result of ``CodonAdaptationIndex.cai_for_gene`` for the
        concatenated, processed coding sequences.
    """
    # Scale each codon's frequency by the largest frequency found among its
    # synonymous codons.
    relative_adaptiveness = {}
    for synonymous in SynonymousCodons.values():
        family = list(synonymous)
        family_freqs = np.array([cai_freqs[codon] for codon in family])
        scaled = family_freqs / max(family_freqs)
        relative_adaptiveness.update(zip(family, scaled))

    cai_table = CodonAdaptationIndex()
    cai_table.set_cai_index(relative_adaptiveness)

    # Collect the coding features, then append each processed ORF in order.
    coding_features = [
        feature for feature in genbank.features
        if feature.type.lower() == "cds"
    ]
    cds_seq = ""
    for feature in coding_features:
        # Upper-case and map RNA "U" to DNA "T" before further processing.
        dna_text = str(feature.extract(sequence)).upper().replace("U", "T")
        cds_seq += proc_sequence(dna_text)

    return cai_table.cai_for_gene(cds_seq)
def test_codon_usage_custom(self):
    """Test Codon Adaptation Index (CAI) using FASTA file for background.

    CDS features without sub-features are sliced out of the GenBank record,
    verified against their annotated translation and written to a temporary
    FASTA file.  A CAI index is generated from that file and used to score a
    short test gene.  The temporary file is deleted whether or not the
    checks pass.
    """
    # We need a FASTA file of CDS sequences to count the codon usage...
    dna_fasta_filename = "fasta.tmp"
    dna_genbank_filename = "GenBank/NC_005816.gb"
    record = SeqIO.read(dna_genbank_filename, "genbank")
    records = []
    for feature in record.features:
        if feature.type == "CDS" and not feature.sub_features:
            start = feature.location.start.position
            end = feature.location.end.position
            table = int(feature.qualifiers["transl_table"][0])
            if feature.strand == -1:
                seq = record.seq[start:end].reverse_complement()
            else:
                seq = record.seq[start:end]
            # Double check we have the CDS sequence expected.  The first
            # codon is skipped and a literal "M" is used in its place.
            # TODO - Use any cds_start option if/when added to deal with the met
            a = "M" + str(seq[3:].translate(table))
            b = feature.qualifiers["translation"][0] + "*"
            self.assertEqual(a, b, "%r vs %r" % (a, b))
            records.append(
                SeqRecord(
                    seq,
                    id=feature.qualifiers["protein_id"][0],
                    description=feature.qualifiers["product"][0],
                )
            )

    try:
        with open(dna_fasta_filename, "w") as handle:
            SeqIO.write(records, handle, "fasta")

        CAI = CodonAdaptationIndex()
        # Note - this needs a FASTA file containing non-ambiguous DNA coding
        # sequences - which should each be a whole number of codons.
        CAI.generate_index(dna_fasta_filename)
        # Now check codon usage index (CAI) using this species
        self.assertEqual(
            record.annotations["source"],
            "Yersinia pestis biovar Microtus str. 91001",
        )
        self.assertEqual(
            "%0.5f" % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG"),
            "0.67213",
        )
    finally:
        # Remove the temporary file even if an assertion above failed, so
        # the working directory is not left with a stale "fasta.tmp".
        if os.path.exists(dna_fasta_filename):
            os.remove(dna_fasta_filename)
seq_records = list(SeqIO.parse(open(dna_fasta_filename),"fasta")) assert len(seq_records)==3 for tuple_record, seq_record in zip(tuple_records, seq_records): assert tuple_record == (seq_record.description, seq_record.seq.tostring()) print "%s has GC%% of %0.1f" % (seq_record.name, GC(seq_record.seq)) ############## # CodonUsage # ############## print print "Codon Adaption Index (CAI)" CAI = CodonAdaptationIndex() # Note - this needs a whole number of codons, and a DNA seq AS A STRING. print "Example CAI %0.5f using E. coli (default)" \ % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG") #We need a FASTA file of CDS sequences to count the codon usage... dna_fasta_filename = "fasta.tmp" dna_genbank_filename = "GenBank/NC_005816.gb" record = SeqIO.read(open(dna_genbank_filename), "genbank") records = [] for feature in record.features: if feature.type == "CDS" \ and not feature.sub_features: start = feature.location.start.position end = feature.location.end.position table = int(feature.qualifiers["transl_table"][0]) if feature.strand == -1: seq = record.seq[start:end].reverse_complement() else:
Pro CCG 15778 5.40 0.12 Pro CCA 51993 17.79 0.41 Pro CCT 39685 13.58 0.31 Pro CCC 20139 6.89 0.16''' import csv index = {} for aa, cn, n1,n2, f in csv.reader([x for x in cai_table.splitlines() if x.strip()], delimiter='\t'): index[cn] = float(n2) from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex from Bio.SeqUtils.CodonUsageIndices import SharpEcoliIndex from pprint import pprint cai = CodonAdaptationIndex() cai.set_cai_index(index) from Bio import SeqIO genes = list(SeqIO.parse("genes.fasta", "fasta")) from Bio.SeqUtils import GC for g in genes: print g.id, " ",cai.cai_for_gene( g.seq.tostring() )#, GC(g.seq.tostring())
seq_records = list(SeqIO.parse(open(dna_fasta_filename),"fasta")) assert len(seq_records)==3 for tuple_record, seq_record in zip(tuple_records, seq_records) : assert tuple_record == (seq_record.description, seq_record.seq.tostring()) print "%s has GC%% of %0.1f" % (seq_record.name, GC(seq_record.seq)) ############## # CodonUsage # ############## print print "Codon Adaption Index (CAI)" CAI = CodonAdaptationIndex() # Note - this needs a whole number of codons, and a DNA seq AS A STRING. print "Example CAI %0.5f using E. coli (default)" \ % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG") #We need a FASTA file of CDS sequences to count the codon usage... dna_fasta_filename = "fasta.tmp" dna_genbank_filename = "GenBank/NC_005816.gb" record = SeqIO.read(open(dna_genbank_filename), "genbank") records = [] for feature in record.features : if feature.type == "CDS" \ and not feature.sub_features : start = feature.location.start.position end = feature.location.end.position table = int(feature.qualifiers["transl_table"][0]) if feature.strand == -1 : seq = record.seq[start:end].reverse_complement() else :
def test_codon_usage_ecoli(self):
    """Check the CAI of a short gene using the default E. coli data."""
    gene = "ATGCGTATCGATCGCGATACGATTAGGCGGATG"
    index = CodonAdaptationIndex()
    # Compare as text formatted to five decimal places.
    formatted = "%0.5f" % index.cai_for_gene(gene)
    self.assertEqual(formatted, "0.09978")
def test_codon_usage_ecoli(self):
    """Score a fixed gene with a default-constructed CAI (E. coli data)."""
    expected = "0.09978"
    # The score is rendered with five decimals so it can be compared as text.
    observed = "%0.5f" % CodonAdaptationIndex().cai_for_gene(
        "ATGCGTATCGATCGCGATACGATTAGGCGGATG"
    )
    self.assertEqual(observed, expected)
def test_codon_usage_ecoli(self):
    """Verify the CAI of a fixed gene under the default E. coli data."""
    gene = "ATGCGTATCGATCGCGATACGATTAGGCGGATG"
    # A default-constructed index is used; the score must match to 5 places.
    self.assertAlmostEqual(
        CodonAdaptationIndex().cai_for_gene(gene), 0.09978, places=5
    )
for codon in CodonsDict.keys(): DNA_codon = codon.replace("T","U") i = cai_table.find( DNA_codon ) index[codon] = float(cai_table[i+4:i+8])/100 from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex from Bio.SeqUtils.CodonUsageIndices import SharpEcoliIndex from pprint import pprint cai = CodonAdaptationIndex() cai.set_cai_index(index) print cai.cai_for_gene("ATGACTGAATTCAAGGCCGGTTCTGCTAAGAAAGGTGCTACACTTTTCAAGACTAGATGTCTACAATGCCACACCGTGGAAAAGGGTGGCCCACATAAGGTTGGTCCAAACTTGCATGGTATCTTTGGCAGACACTCTGGTCAAGCTGAAGGGTATTCGTACACAGATGCCAATATCAAGAAAAACGTGTTGTGGGACGAAAATAACATGTCAGAGTACTTGACTAACCCAAAGAAATATATTCCTGGTACCAAGATGGCCTTTGGTGGGTTGAAGAAGGAAAAAGACAGAAACGACTTAATTACCTACTTGAAAAAAGCCTGTGAGTAA") # http://www.genscript.com/cgi-bin/tools/rare_codon_analysis # cai = 0.79 ''' UUU 26.1(170666) UCU 23.5(153557) UAU 18.8(122728) UGU 8.1( 52903) UUC 18.4(120510) UCC 14.2( 92923) UAC 14.8( 96596) UGC 4.8( 31095) UUA 26.2(170884) UCA 18.7(122028) UAA 1.1( 6913) UGA 0.7( 4447) UUG 27.2(177573) UCG 8.6( 55951) UAG 0.5( 3312) UGG 10.4( 67789) CUU 12.3( 80076) CCU 13.5( 88263) CAU 13.6( 89007) CGU 6.4( 41791) CUC 5.4( 35545) CCC 6.8( 44309) CAC 7.8( 50785) CGC 2.6( 16993)
for record in SeqIO.parse(input_file, "fasta"): subdat = [record.id] if is_it_an_orf(str(record.seq)): orf_nt = str(record.seq) orf_aa = str(record.seq.translate()).replace("*", "") if trim: orf_aa = orf_aa[1:] orf_nt = orf_nt[3:] length = len(orf_nt.upper()) mw = Analyze(orf_aa).molecular_weight() pI = Analyze(orf_aa).isoelectric_point() aroma = Analyze(orf_aa).aromaticity() hydrophobe = Analyze(orf_aa).gravy() instability = Analyze(orf_aa).instability_index() cai = CAI.cai_for_gene(orf_nt.upper()) mp = mt.Tm_GC(orf_nt) A = orf_nt.upper().count("A") T = orf_nt.upper().count("T") C = orf_nt.upper().count("C") G = orf_nt.upper().count("G") CpG = orf_nt.upper().count("CG") + orf_nt.upper().count( "GC") # a forward GpC is a reverse CpG stop = stopz[orf_nt.upper()[-3:]] subdat.extend([ length, mw, mp, pI, aroma, hydrophobe, instability, cai, A, T, C, G, CpG, stop ]) nuWreck = record.translate()