示例#1
0
    def test_codon_usage_custom(self):
        """Test Codon Adaptation Index (CAI) using FASTA file for background.

        Every single-part CDS feature of the GenBank record is sliced out,
        checked against its annotated translation and written to a temporary
        FASTA file.  The codon usage index generated from that file is then
        used to score a short test gene.  The temporary file is removed even
        when one of the later steps fails.
        """
        # We need a FASTA file of CDS sequences to count the codon usage...
        dna_fasta_filename = "fasta.tmp"
        dna_genbank_filename = "GenBank/NC_005816.gb"
        record = SeqIO.read(dna_genbank_filename, "genbank")
        records = []
        for feature in record.features:
            # Only simple (single-part) CDS features are used.
            if feature.type != "CDS" or len(feature.location.parts) != 1:
                continue
            start = feature.location.start.position
            end = feature.location.end.position
            table = int(feature.qualifiers["transl_table"][0])
            if feature.strand == -1:
                seq = record.seq[start:end].reverse_complement()
            else:
                seq = record.seq[start:end]
            # Double check we have the CDS sequence expected
            # TODO - Use any cds_start option if/when added to deal with the met
            # The first codon is skipped and "M" is prepended instead; the
            # annotated translation has no stop symbol, so "*" is appended.
            a = "M" + str(seq[3:].translate(table))
            b = feature.qualifiers["translation"][0] + "*"
            self.assertEqual(a, b, "%r vs %r" % (a, b))
            records.append(
                SeqRecord(
                    seq,
                    id=feature.qualifiers["protein_id"][0],
                    description=feature.qualifiers["product"][0],
                )
            )

        try:
            with open(dna_fasta_filename, "w") as handle:
                SeqIO.write(records, handle, "fasta")

            CAI = CodonAdaptationIndex()
            # Note - this needs a FASTA file containing non-ambiguous DNA
            # coding sequences - which should each be a whole number of codons.
            CAI.generate_index(dna_fasta_filename)
            # Now check codon usage index (CAI) using this species
            self.assertEqual(
                record.annotations["source"],
                "Yersinia pestis biovar Microtus str. 91001",
            )
            value = CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG")
            self.assertAlmostEqual(value, 0.67213, places=5)
        finally:
            # Clean up the temporary file even if a step above failed, so a
            # failing run does not leave "fasta.tmp" behind.
            if os.path.exists(dna_fasta_filename):
                os.remove(dna_fasta_filename)
def calc_cai(sequence, genbank, cai_freqs=CAI_FREQS):
    """Return the CAI of all CDS features of a record, scored as one gene.

    Args:
        sequence: sequence object that each CDS feature is extracted from.
        genbank: object exposing a ``features`` list; entries whose ``type``
            is "cds" (case-insensitive) are used.
        cai_freqs: mapping from codon to frequency, used to build the index.

    Returns:
        The value of ``cai_for_gene`` for the concatenation of all processed
        CDS sequences.
    """
    # Build the index: inside every synonymous codon family, each frequency
    # is divided by the largest frequency of that family.
    relative_weights = {}
    for family in SynonymousCodons.values():
        family = list(family)
        freqs = np.array([cai_freqs[codon] for codon in family])
        scaled = freqs / max(freqs)
        relative_weights.update(zip(family, scaled))

    index_table = CodonAdaptationIndex()
    index_table.set_cai_index(relative_weights)

    # Extract every CDS, normalise it to upper-case DNA letters, run it
    # through proc_sequence and join the pieces into one long sequence.
    coding_features = [
        feat for feat in genbank.features if feat.type.lower() == "cds"
    ]
    cds_seq = "".join(
        proc_sequence(str(feat.extract(sequence)).upper().replace("U", "T"))
        for feat in coding_features
    )

    return index_table.cai_for_gene(cds_seq)
示例#3
0
    def test_codon_usage_custom(self):
        """Test Codon Adaptation Index (CAI) using FASTA file for background.

        CDS features without sub-features are sliced out of the GenBank
        record, checked against their annotated translation and written to a
        temporary FASTA file.  The codon usage index generated from that file
        is then used to score a short test gene.  The temporary file is
        removed even when one of the later steps fails.
        """
        # We need a FASTA file of CDS sequences to count the codon usage...
        dna_fasta_filename = "fasta.tmp"
        dna_genbank_filename = "GenBank/NC_005816.gb"
        record = SeqIO.read(dna_genbank_filename, "genbank")
        records = []
        for feature in record.features:
            # NOTE(review): `sub_features` looks like an older SeqFeature
            # attribute - confirm the Biopython version this test targets.
            if feature.type == "CDS" and not feature.sub_features:
                start = feature.location.start.position
                end = feature.location.end.position
                table = int(feature.qualifiers["transl_table"][0])
                if feature.strand == -1:
                    seq = record.seq[start:end].reverse_complement()
                else:
                    seq = record.seq[start:end]
                # Double check we have the CDS sequence expected
                # TODO - Use any cds_start option if/when added to deal with the met
                a = "M" + str(seq[3:].translate(table))
                b = feature.qualifiers["translation"][0] + "*"
                self.assertEqual(a, b, "%r vs %r" % (a, b))
                records.append(SeqRecord(seq, id=feature.qualifiers["protein_id"][0],
                                        description=feature.qualifiers["product"][0]))

        try:
            with open(dna_fasta_filename, "w") as handle:
                SeqIO.write(records, handle, "fasta")

            CAI = CodonAdaptationIndex()
            # Note - this needs a FASTA file containing non-ambiguous DNA
            # coding sequences - which should each be a whole number of codons.
            CAI.generate_index(dna_fasta_filename)
            # Now check codon usage index (CAI) using this species
            self.assertEqual(record.annotations["source"],
                             "Yersinia pestis biovar Microtus str. 91001")
            self.assertEqual("%0.5f" % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG"),
                             "0.67213")
        finally:
            # Clean up the temporary file even if a step above failed, so a
            # failing run does not leave "fasta.tmp" behind.
            if os.path.exists(dna_fasta_filename):
                os.remove(dna_fasta_filename)
示例#4
0
# Read the FASTA file back in and check that it holds exactly three records.
# NOTE(review): the handle returned by open() is never explicitly closed.
seq_records = list(SeqIO.parse(open(dna_fasta_filename),"fasta"))
assert len(seq_records)==3
# Each parsed record must equal the matching (description, sequence) tuple.
for tuple_record, seq_record in zip(tuple_records, seq_records):
    assert tuple_record == (seq_record.description, seq_record.seq.tostring())
    # Print the record name with the result of GC() to one decimal place.
    print "%s has GC%% of %0.1f" % (seq_record.name, GC(seq_record.seq))

##############
# CodonUsage #
##############

print
print "Codon Adaption Index (CAI)"
# A freshly constructed index object is used as-is for the first example.
CAI = CodonAdaptationIndex()
# Note - this needs a whole number of codons, and a DNA seq AS A STRING.
print "Example CAI %0.5f using E. coli (default)" \
      % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG")

# We need a FASTA file of CDS sequences to count the codon usage...
# The same name is reused for the FASTA file that will hold those sequences.
dna_fasta_filename = "fasta.tmp"
dna_genbank_filename = "GenBank/NC_005816.gb"
# NOTE(review): this open() handle is never explicitly closed either.
record = SeqIO.read(open(dna_genbank_filename), "genbank")
# Collects the records built by the loop that follows.
records = []
for feature in record.features:
    if feature.type == "CDS" \
    and not feature.sub_features:
        start = feature.location.start.position
        end = feature.location.end.position
        table = int(feature.qualifiers["transl_table"][0])
        if feature.strand == -1:
            seq = record.seq[start:end].reverse_complement()
        else:
示例#5
0
	
Pro	CCG	15778	5.40	0.12
Pro	CCA	51993	17.79	0.41
Pro	CCT	39685	13.58	0.31
Pro	CCC	20139	6.89	0.16'''
import csv

# Build a codon -> value mapping from the tab-separated rows of cai_table,
# skipping blank lines.  Each row is unpacked into five columns; the second
# column is the codon and the fourth column is stored as the value.
# NOTE(review): the fourth column is used rather than the fifth - confirm
# that this is the column intended as the index weight.
index = {}
for aa, cn, n1,n2, f in csv.reader([x for x in cai_table.splitlines() if x.strip()], delimiter='\t'):
    index[cn] = float(n2)

from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex
from Bio.SeqUtils.CodonUsageIndices import SharpEcoliIndex
from pprint import pprint


# Replace the calculator's index with the table-derived one built above.
cai = CodonAdaptationIndex()

cai.set_cai_index(index)

from Bio import SeqIO

# Load every record of genes.fasta into memory.
genes = list(SeqIO.parse("genes.fasta", "fasta"))

from Bio.SeqUtils import GC

# Print each gene id followed by its CAI under the custom index.
for g in genes:
    print g.id, "         ",cai.cai_for_gene( g.seq.tostring() )#, GC(g.seq.tostring())


示例#6
0
# Read the FASTA file back in and check that it holds exactly three records.
# NOTE(review): the handle returned by open() is never explicitly closed.
seq_records = list(SeqIO.parse(open(dna_fasta_filename),"fasta"))
assert len(seq_records)==3
# Each parsed record must equal the matching (description, sequence) tuple.
for tuple_record, seq_record in zip(tuple_records, seq_records) :
    assert tuple_record == (seq_record.description, seq_record.seq.tostring())
    # Print the record name with the result of GC() to one decimal place.
    print "%s has GC%% of %0.1f" % (seq_record.name, GC(seq_record.seq))

##############
# CodonUsage #
##############

print
print "Codon Adaption Index (CAI)"
# A freshly constructed index object is used as-is for the first example.
CAI = CodonAdaptationIndex()
# Note - this needs a whole number of codons, and a DNA seq AS A STRING.
print "Example CAI %0.5f using E. coli (default)" \
      % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG")

# We need a FASTA file of CDS sequences to count the codon usage...
# The same name is reused for the FASTA file that will hold those sequences.
dna_fasta_filename = "fasta.tmp"
dna_genbank_filename = "GenBank/NC_005816.gb"
# NOTE(review): this open() handle is never explicitly closed either.
record = SeqIO.read(open(dna_genbank_filename), "genbank")
# Collects the records built by the loop that follows.
records = []
for feature in record.features :
    if feature.type == "CDS" \
    and not feature.sub_features :
        start = feature.location.start.position
        end = feature.location.end.position
        table = int(feature.qualifiers["transl_table"][0])
        if feature.strand == -1 :
            seq = record.seq[start:end].reverse_complement()
        else :
示例#7
0
 def test_codon_usage_ecoli(self):
     """Check the CAI of a short gene using the default E. coli data."""
     gene = "ATGCGTATCGATCGCGATACGATTAGGCGGATG"
     calculator = CodonAdaptationIndex()
     # The score is compared as text formatted to five decimal places.
     formatted = "%0.5f" % calculator.cai_for_gene(gene)
     self.assertEqual(formatted, "0.09978")
示例#8
0
 def test_codon_usage_ecoli(self):
     """Verify the CAI score of a reference gene with default E. coli data."""
     scorer = CodonAdaptationIndex()
     score = scorer.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG")
     # Expected value is held as text with five decimal places.
     expected = "0.09978"
     self.assertEqual("%0.5f" % score, expected)
示例#9
0
 def test_codon_usage_ecoli(self):
     """Verify the CAI score of a reference gene with default E. coli data."""
     calculator = CodonAdaptationIndex()
     # The score must agree with the expected value to five decimal places.
     self.assertAlmostEqual(
         calculator.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG"),
         0.09978,
         places=5,
     )
示例#10
0
# Fill `index` with one value per codon, read out of the text in cai_table.
for codon in CodonsDict.keys():
    # The table is searched with "U" in place of "T" (despite the variable
    # name, this is the U-spelled form of the codon).
    DNA_codon = codon.replace("T","U")
    i = cai_table.find( DNA_codon )    
    # Take the four characters starting four positions after the match,
    # parse them as a number and divide by 100.
    # NOTE(review): find() returns -1 when the codon is absent, which would
    # silently slice the wrong text - confirm every codon is in the table.
    index[codon] = float(cai_table[i+4:i+8])/100
  

from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex
from Bio.SeqUtils.CodonUsageIndices import SharpEcoliIndex
from pprint import pprint


# Replace the calculator's index with the table-derived one built above.
cai = CodonAdaptationIndex()

cai.set_cai_index(index)

# Print the CAI of one hard-coded coding sequence under the custom index.
print cai.cai_for_gene("ATGACTGAATTCAAGGCCGGTTCTGCTAAGAAAGGTGCTACACTTTTCAAGACTAGATGTCTACAATGCCACACCGTGGAAAAGGGTGGCCCACATAAGGTTGGTCCAAACTTGCATGGTATCTTTGGCAGACACTCTGGTCAAGCTGAAGGGTATTCGTACACAGATGCCAATATCAAGAAAAACGTGTTGTGGGACGAAAATAACATGTCAGAGTACTTGACTAACCCAAAGAAATATATTCCTGGTACCAAGATGGCCTTTGGTGGGTTGAAGAAGGAAAAAGACAGAAACGACTTAATTACCTACTTGAAAAAAGCCTGTGAGTAA")



# http://www.genscript.com/cgi-bin/tools/rare_codon_analysis
# cai = 0.79


'''
UUU 26.1(170666)  UCU 23.5(153557)  UAU 18.8(122728)  UGU  8.1( 52903)
UUC 18.4(120510)  UCC 14.2( 92923)  UAC 14.8( 96596)  UGC  4.8( 31095)
UUA 26.2(170884)  UCA 18.7(122028)  UAA  1.1(  6913)  UGA  0.7(  4447)
UUG 27.2(177573)  UCG  8.6( 55951)  UAG  0.5(  3312)  UGG 10.4( 67789)

CUU 12.3( 80076)  CCU 13.5( 88263)  CAU 13.6( 89007)  CGU  6.4( 41791)
CUC  5.4( 35545)  CCC  6.8( 44309)  CAC  7.8( 50785)  CGC  2.6( 16993)
for record in SeqIO.parse(input_file, "fasta"):
    subdat = [record.id]
    if is_it_an_orf(str(record.seq)):
        orf_nt = str(record.seq)
        orf_aa = str(record.seq.translate()).replace("*", "")
        if trim:
            orf_aa = orf_aa[1:]
            orf_nt = orf_nt[3:]

        length = len(orf_nt.upper())
        mw = Analyze(orf_aa).molecular_weight()
        pI = Analyze(orf_aa).isoelectric_point()
        aroma = Analyze(orf_aa).aromaticity()
        hydrophobe = Analyze(orf_aa).gravy()
        instability = Analyze(orf_aa).instability_index()
        cai = CAI.cai_for_gene(orf_nt.upper())
        mp = mt.Tm_GC(orf_nt)
        A = orf_nt.upper().count("A")
        T = orf_nt.upper().count("T")
        C = orf_nt.upper().count("C")
        G = orf_nt.upper().count("G")
        CpG = orf_nt.upper().count("CG") + orf_nt.upper().count(
            "GC")  # a forward GpC is a reverse CpG

        stop = stopz[orf_nt.upper()[-3:]]

        subdat.extend([
            length, mw, mp, pI, aroma, hydrophobe, instability, cai, A, T, C,
            G, CpG, stop
        ])
        nuWreck = record.translate()