def setUpClass(cls):
    """Build the fixtures shared by every test in the class.

    Stores two protein strings with an identical CRC64 checksum on the
    class, plus a CodonAdaptationIndex generated from
    CodonUsage/HighlyExpressedGenes.txt as ``cls.X``.
    """
    # Codon adaptation index built from the highly expressed genes file.
    index_source = os.path.join("CodonUsage", "HighlyExpressedGenes.txt")
    adaptation_index = CodonAdaptationIndex()
    adaptation_index.generate_index(index_source)
    cls.X = adaptation_index

    # Example of crc64 collision from Sebastian Bassi using the
    # immunoglobulin lambda light chain variable region from Homo sapiens
    # Both sequences share the same CRC64 checksum: 44CAAD88706CC153
    # The two chains start with the same 60 residues and differ only in
    # their second segment.
    shared_prefix = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGKAPKLMIYEGSKRPSGV"
    cls.str_light_chain_one = (
        shared_prefix + "SNRFSGSKSGNTASLTISGLQAEDEADYYCSSYAGSSTLVFGGGTKLTVL"
    )
    cls.str_light_chain_two = (
        shared_prefix + "SNRFSGSKSGNTASLTISGLQAEDEADYYCCSYAGSSTWVFGGGTKLTVL"
    )
def test_codon_usage_custom(self):
    """Test Codon Adaptation Index (CAI) using FASTA file for background.

    Single-part CDS features are sliced out of the GenBank record
    GenBank/NC_005816.gb, checked against their annotated translation,
    written to a temporary FASTA file and used to generate a codon usage
    index, which is then applied to a short test gene.

    The temporary FASTA file is removed even when an assertion or the
    index generation fails, so a failing run does not leave "fasta.tmp"
    behind.
    """
    # We need a FASTA file of CDS sequences to count the codon usage...
    dna_fasta_filename = "fasta.tmp"
    dna_genbank_filename = "GenBank/NC_005816.gb"
    record = SeqIO.read(dna_genbank_filename, "genbank")
    records = []
    for feature in record.features:
        # Only CDS features whose location is a single part are used.
        if feature.type == "CDS" and len(feature.location.parts) == 1:
            start = feature.location.start.position
            end = feature.location.end.position
            table = int(feature.qualifiers["transl_table"][0])
            if feature.strand == -1:
                seq = record.seq[start:end].reverse_complement()
            else:
                seq = record.seq[start:end]
            # Double check we have the CDS sequence expected
            # TODO - Use any cds_start option if/when added to deal with the met
            # The first codon is skipped and replaced by a literal "M";
            # "*" is appended to the annotation to match the translated
            # stop codon.
            a = "M" + str(seq[3:].translate(table))
            b = feature.qualifiers["translation"][0] + "*"
            self.assertEqual(a, b, "%r vs %r" % (a, b))
            records.append(
                SeqRecord(
                    seq,
                    id=feature.qualifiers["protein_id"][0],
                    description=feature.qualifiers["product"][0],
                )
            )

    try:
        with open(dna_fasta_filename, "w") as handle:
            SeqIO.write(records, handle, "fasta")

        CAI = CodonAdaptationIndex()
        # Note - this needs a FASTA file which containing non-ambiguous DNA coding
        # sequences - which should each be a whole number of codons.
        CAI.generate_index(dna_fasta_filename)
        # Now check codon usage index (CAI) using this species
        self.assertEqual(
            record.annotations["source"], "Yersinia pestis biovar Microtus str. 91001"
        )
        value = CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG")
        self.assertAlmostEqual(value, 0.67213, places=5)
    finally:
        # Always clean up; the file may not exist if opening it failed.
        if os.path.exists(dna_fasta_filename):
            os.remove(dna_fasta_filename)
def calc_cai(sequence, genbank, cai_freqs=CAI_FREQS):
    """Return the CAI for a given genome.

    Args:
        sequence: the sequence handed to each feature's ``extract``.
        genbank: object whose ``features`` are scanned for CDS entries
            (feature type compared case-insensitively against "cds").
        cai_freqs: mapping from codon to frequency, indexed by every codon
            listed in ``SynonymousCodons``.

    Returns:
        The value of ``CodonAdaptationIndex.cai_for_gene`` for the
        concatenation of all processed CDS sequences.
    """
    # Scale each codon's frequency by the largest frequency found among
    # its synonymous codons.
    weights = {}
    for family in SynonymousCodons.values():
        family = list(family)
        raw = np.array([cai_freqs[codon] for codon in family])
        weights.update(zip(family, raw / max(raw)))

    adaptation_index = CodonAdaptationIndex()
    adaptation_index.set_cai_index(weights)

    # Concatenate every CDS, upper-cased and with U replaced by T.
    cds_seq = ""
    for feature in genbank.features:
        if feature.type.lower() != "cds":
            continue
        extracted = str(feature.extract(sequence)).upper().replace("U", "T")
        cds_seq += proc_sequence(extracted)

    return adaptation_index.cai_for_gene(cds_seq)
def test_codon_usage_custom(self):
    """Test Codon Adaptation Index (CAI) using FASTA file for background.

    CDS features without sub-features are sliced out of the GenBank record
    GenBank/NC_005816.gb, checked against their annotated translation,
    written to a temporary FASTA file and used to generate a codon usage
    index, which is then applied to a short test gene.

    The temporary FASTA file is removed even when an assertion or the
    index generation fails, so a failing run does not leave "fasta.tmp"
    behind.
    """
    # We need a FASTA file of CDS sequences to count the codon usage...
    dna_fasta_filename = "fasta.tmp"
    dna_genbank_filename = "GenBank/NC_005816.gb"
    record = SeqIO.read(dna_genbank_filename, "genbank")
    records = []
    for feature in record.features:
        if feature.type == "CDS" and not feature.sub_features:
            start = feature.location.start.position
            end = feature.location.end.position
            table = int(feature.qualifiers["transl_table"][0])
            if feature.strand == -1:
                seq = record.seq[start:end].reverse_complement()
            else:
                seq = record.seq[start:end]
            # Double check we have the CDS sequence expected
            # TODO - Use any cds_start option if/when added to deal with the met
            # The first codon is skipped and replaced by a literal "M";
            # "*" is appended to the annotation to match the translated
            # stop codon.
            a = "M" + str(seq[3:].translate(table))
            b = feature.qualifiers["translation"][0] + "*"
            self.assertEqual(a, b, "%r vs %r" % (a, b))
            records.append(SeqRecord(seq, id=feature.qualifiers["protein_id"][0],
                                     description=feature.qualifiers["product"][0]))

    try:
        with open(dna_fasta_filename, "w") as handle:
            SeqIO.write(records, handle, "fasta")

        CAI = CodonAdaptationIndex()
        # Note - this needs a FASTA file which containing non-ambiguous DNA coding
        # sequences - which should each be a whole number of codons.
        CAI.generate_index(dna_fasta_filename)
        # Now check codon usage index (CAI) using this species
        self.assertEqual(record.annotations["source"],
                         "Yersinia pestis biovar Microtus str. 91001")
        self.assertEqual("%0.5f" % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG"),
                         "0.67213")
    finally:
        # Always clean up; the file may not exist if opening it failed.
        if os.path.exists(dna_fasta_filename):
            os.remove(dna_fasta_filename)
# Cross-check the quick FASTA reader against SeqIO on the same file: both
# must yield three records.
tuple_records = quick_FASTA_reader(dna_fasta_filename)
assert len(tuple_records)==3
seq_records = list(SeqIO.parse(open(dna_fasta_filename),"fasta"))
assert len(seq_records)==3
for tuple_record, seq_record in zip(tuple_records, seq_records):
    # Each quick-reader tuple must equal (description, sequence string)
    # taken from the matching SeqIO record.
    assert tuple_record == (seq_record.description, seq_record.seq.tostring())
    print "%s has GC%% of %0.1f" % (seq_record.name, GC(seq_record.seq))

##############
# CodonUsage #
##############

print
print "Codon Adaption Index (CAI)"
CAI = CodonAdaptationIndex()
# Note - this needs a whole number of codons, and a DNA seq AS A STRING.
print "Example CAI %0.5f using E. coli (default)" \
      % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG")

# We need a FASTA file of CDS sequences to count the codon usage...
# Note: dna_fasta_filename is rebound here to a temporary file name.
dna_fasta_filename = "fasta.tmp"
dna_genbank_filename = "GenBank/NC_005816.gb"
record = SeqIO.read(open(dna_genbank_filename), "genbank")
records = []
for feature in record.features:
    # Only CDS features without sub-features are considered.
    if feature.type == "CDS" \
    and not feature.sub_features:
        start = feature.location.start.position
        end = feature.location.end.position
        # Translation table number taken from the feature's qualifiers.
        table = int(feature.qualifiers["transl_table"][0])
Pro CCG 15778 5.40 0.12 Pro CCA 51993 17.79 0.41 Pro CCT 39685 13.58 0.31 Pro CCC 20139 6.89 0.16''' import csv index = {} for aa, cn, n1,n2, f in csv.reader([x for x in cai_table.splitlines() if x.strip()], delimiter='\t'): index[cn] = float(n2) from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex from Bio.SeqUtils.CodonUsageIndices import SharpEcoliIndex from pprint import pprint cai = CodonAdaptationIndex() cai.set_cai_index(index) from Bio import SeqIO genes = list(SeqIO.parse("genes.fasta", "fasta")) from Bio.SeqUtils import GC for g in genes: print g.id, " ",cai.cai_for_gene( g.seq.tostring() )#, GC(g.seq.tostring())
# Cross-check the quick FASTA reader against SeqIO on the same file: both
# must yield three records.
tuple_records = quick_FASTA_reader(dna_fasta_filename)
assert len(tuple_records)==3
seq_records = list(SeqIO.parse(open(dna_fasta_filename),"fasta"))
assert len(seq_records)==3
for tuple_record, seq_record in zip(tuple_records, seq_records) :
    # Each quick-reader tuple must equal (description, sequence string)
    # taken from the matching SeqIO record.
    assert tuple_record == (seq_record.description, seq_record.seq.tostring())
    print "%s has GC%% of %0.1f" % (seq_record.name, GC(seq_record.seq))

##############
# CodonUsage #
##############

print
print "Codon Adaption Index (CAI)"
CAI = CodonAdaptationIndex()
# Note - this needs a whole number of codons, and a DNA seq AS A STRING.
print "Example CAI %0.5f using E. coli (default)" \
      % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG")

# We need a FASTA file of CDS sequences to count the codon usage...
# Note: dna_fasta_filename is rebound here to a temporary file name.
dna_fasta_filename = "fasta.tmp"
dna_genbank_filename = "GenBank/NC_005816.gb"
record = SeqIO.read(open(dna_genbank_filename), "genbank")
records = []
for feature in record.features :
    # Only CDS features without sub-features are considered.
    if feature.type == "CDS" \
    and not feature.sub_features :
        start = feature.location.start.position
        end = feature.location.end.position
        # Translation table number taken from the feature's qualifiers.
        table = int(feature.qualifiers["transl_table"][0])
def test_codon_usage_ecoli(self):
    """Test Codon Adaptation Index (CAI) using default E. coli data.

    A freshly constructed CodonAdaptationIndex is applied to a fixed
    33-base gene and the result, formatted to five decimal places, is
    compared with the expected string.
    """
    gene = "ATGCGTATCGATCGCGATACGATTAGGCGGATG"
    default_index = CodonAdaptationIndex()
    formatted = "%0.5f" % default_index.cai_for_gene(gene)
    self.assertEqual(formatted, "0.09978")
def test_codon_usage_ecoli(self):
    """Test Codon Adaptation Index (CAI) using default E. coli data.

    A freshly constructed CodonAdaptationIndex is applied to a fixed
    33-base gene and the result, formatted to five decimal places, is
    compared with the expected string.
    """
    gene = "ATGCGTATCGATCGCGATACGATTAGGCGGATG"
    default_index = CodonAdaptationIndex()
    formatted = "%0.5f" % default_index.cai_for_gene(gene)
    self.assertEqual(formatted, "0.09978")
def test_codon_usage_ecoli(self):
    """Test Codon Adaptation Index (CAI) using default E. coli data.

    A freshly constructed CodonAdaptationIndex is applied to a fixed
    33-base gene and the result must match the expected value to five
    decimal places.
    """
    gene = "ATGCGTATCGATCGCGATACGATTAGGCGGATG"
    observed = CodonAdaptationIndex().cai_for_gene(gene)
    self.assertAlmostEqual(observed, 0.09978, places=5)
def __init__(self, verbose=False):
    """Initialise the base CodonAdaptationIndex and record the verbosity flag.

    Args:
        verbose: value stored unchanged on ``self.verbose``; defaults to
            False.
    """
    # Run the base-class initialiser explicitly before adding our own state.
    CodonAdaptationIndex.__init__(self)
    self.verbose = verbose
''' index={} for codon in CodonsDict.keys(): DNA_codon = codon.replace("T","U") i = cai_table.find( DNA_codon ) index[codon] = float(cai_table[i+4:i+8])/100 from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex from Bio.SeqUtils.CodonUsageIndices import SharpEcoliIndex from pprint import pprint cai = CodonAdaptationIndex() cai.set_cai_index(index) print cai.cai_for_gene("ATGACTGAATTCAAGGCCGGTTCTGCTAAGAAAGGTGCTACACTTTTCAAGACTAGATGTCTACAATGCCACACCGTGGAAAAGGGTGGCCCACATAAGGTTGGTCCAAACTTGCATGGTATCTTTGGCAGACACTCTGGTCAAGCTGAAGGGTATTCGTACACAGATGCCAATATCAAGAAAAACGTGTTGTGGGACGAAAATAACATGTCAGAGTACTTGACTAACCCAAAGAAATATATTCCTGGTACCAAGATGGCCTTTGGTGGGTTGAAGAAGGAAAAAGACAGAAACGACTTAATTACCTACTTGAAAAAAGCCTGTGAGTAA") # http://www.genscript.com/cgi-bin/tools/rare_codon_analysis # cai = 0.79 ''' UUU 26.1(170666) UCU 23.5(153557) UAU 18.8(122728) UGU 8.1( 52903) UUC 18.4(120510) UCC 14.2( 92923) UAC 14.8( 96596) UGC 4.8( 31095) UUA 26.2(170884) UCA 18.7(122028) UAA 1.1( 6913) UGA 0.7( 4447)
#### import os import sys from Bio import SeqIO from Bio import Entrez from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio import SeqUtils from Bio.SeqUtils import CodonUsage from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex, SynonymousCodons, CodonsDict from types import MethodType Entrez.email = "*****@*****.**" #define new CAI class CAI = CodonAdaptationIndex() def _count_codons2(self, fasta_file): with open(fasta_file, 'r') as handle: # make the codon dictionary local self.codon_count = CodonsDict.copy() # iterate over sequence and count all the codons in the FastaFile. for cur_record in SeqIO.parse(handle, "fasta"): # make sure the sequence is lower case if str(cur_record.seq).islower(): dna_sequence = str(cur_record.seq).upper() else: dna_sequence = str(cur_record.seq)
'TGG': 1.00,
'CGT': 0.08, 'CGC': 0.18, 'CGA': 0.11, 'CGG': 0.20,
'AGT': 0.15, 'AGC': 0.24, 'AGA': 0.21, 'AGG': 0.21,
'GGT': 0.16, 'GGC': 0.34, 'GGA': 0.25, 'GGG': 0.25
}

# Module-level CAI calculator loaded with the codon index table above.
CAI = CodonAdaptationIndex()
CAI.set_cai_index(Codon_Index)


def is_it_an_orf(s):
    """Return True if *s* has the outer shape of an open reading frame.

    The check is case-insensitive and requires all of the following:
    a length that is a multiple of three, a leading "ATG", and a trailing
    "TGA", "TAG" or "TAA".  Codons between the first and last are not
    inspected.
    """
    is_it = True
    is_it = is_it and len(s) % 3 == 0
    is_it = is_it and s.upper().startswith("ATG")
    is_it = is_it and (s.upper().endswith("TGA") or s.upper().endswith("TAG") or s.upper().endswith("TAA"))
    return (is_it)


# Stop codons mapped to their names.
stopz = {"TGA": "opal", "TAG": "amber", "TAA": "ochre"}

parser = argparse.ArgumentParser()