def handle_codon_usage(genes):
    """ Generate codon usage histograms

        For each of the codons AAA, GAA and CAA the per-gene usage values
        are binned with a bin width of 0.01. The resulting counts and edges
        are written to `results/gene_codon_usages.json`, after which
        `plotting/codon_usage_histogram_maker.R` is run via `Rscript`.

        genes -- handed unchanged to `DNAAnalyzer.get_gene_codon_usages`

        Raises `subprocess.CalledProcessError` if the R script exits with a
        non-zero status.
    """
    def extract(marker, data):
        """ Extract marked codon usage from each gene, skipping `None` values
        """
        return [
            codu[marker] for codu in data.values()
            if codu[marker] is not None
        ]

    print('Computing codon statistics')
    dnana = DNAAnalyzer(strict=False)
    data = dnana.get_gene_codon_usages(genes)

    plot_data = []
    bin_width = 0.01
    for marker in ['AAA', 'GAA', 'CAA']:
        usage = extract(marker, data)
        counts, edges = do_binning(usage, bin_width)
        plot_data.append({'marker': marker, 'counts': counts, 'edges': edges})

    # Close (and thereby flush) the file explicitly before the external
    # process is started instead of relying on garbage collection.
    with open('results/gene_codon_usages.json', 'w') as fd:
        json.dump(plot_data, fd)

    print('Plotting')
    # output of the R script is discarded on purpose
    subprocess.check_call(
        ['Rscript', 'plotting/codon_usage_histogram_maker.R'],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL)
def setUp(self):
    """ Prepare a fresh analyzer, one plain sequence and four gene records
    """
    gene_sequences = ('AAAAAAAAG', 'AAAAAGAAA', 'TTTTTCTTT', 'TTCTTTTTC')

    self.dnana = DNAAnalyzer()
    self.seq = 'AAAAAGAAA'
    self.genes = [SeqRecord(Seq(raw)) for raw in gene_sequences]
def get_codu_per_group(groups):
    """ Compute codon usage per group

        groups -- mapping of group label to the genes of that group

        Returns a dict mapping each label to the result of
        `DNAAnalyzer.get_avg_codon_usage` for that group's genes.
    """
    analyzer = DNAAnalyzer(strict=False)
    return {
        label: analyzer.get_avg_codon_usage(members)
        for label, members in groups.items()
    }
def generate_codon_usage_summary(groups, out_fname):
    """ Save codon usage tables per group

        For every group the label is written on its own line, followed by
        the pretty-printed average codon usage and an empty line.

        groups    -- mapping of group label to the genes of that group
        out_fname -- path of the text file to (over)write
    """
    analyzer = DNAAnalyzer(strict=False)

    with open(out_fname, 'w') as out_file:
        for label, members in groups.items():
            avg_usage = analyzer.get_avg_codon_usage(members)

            out_file.write(label + '\n')
            pprint.pprint(avg_usage, stream=out_file)
            out_file.write('\n')
def main():
    """ Generate overview

        Parses the FASTA file named by the first command line argument,
        computes the average codon usage over all parsed genes and writes it
        to `results/plain_codon_usage_table.txt` via `output_data`.
    """
    fasta_path = sys.argv[1]
    genes = FastaParser(fasta_path).parse()

    avg_usage = DNAAnalyzer(strict=False).get_avg_codon_usage(genes)

    with open('results/plain_codon_usage_table.txt', 'w') as table_file:
        output_data(avg_usage, table_file)
def get_codu(genes, group):
    """ Compute codon usage for all genes or only for certain expression
        group if file is given

        An expression file is only considered when the script was started
        with exactly two command line arguments; it is then read from the
        second one.

        Returns a tuple `(codon usage, name of the selected group)`.
    """
    if len(sys.argv) == 3:
        exprs = extract_expression_levels(sys.argv[2])
    else:
        exprs = None

    if exprs is None:
        # no expression data: treat all genes as one group
        groups = {'all': genes}
        select = 'all'
    else:
        groups = group_expression_levels(genes, exprs)
        select = group

    analyzer = DNAAnalyzer(strict=False)
    return analyzer.get_avg_codon_usage(groups[select]), select
def store_low_CAA_genes(genes):
    """ Create list of genes where CAA usage < 0.9

        The matching genes are written to `results/low_CAA_genes.csv` with
        the columns ID, name, and for each of LYS (AAA/AAG), GLU (GAA/GAG)
        and GLN (CAA/CAG) a relative frequency plus the codon usage of the
        first codon of the pair.

        The relative frequency is the per-gene value
        (codon count / sequence length) divided by the sum of the two
        codons' entries in `DNAAnalyzer.get_codon_freqs(genes)`.

        genes -- handed to `DNAAnalyzer.get_gene_codon_usages` and
                 `DNAAnalyzer.get_codon_freqs`
    """
    # compute codon usage
    print('Computing codon statistics')
    dnana = DNAAnalyzer(strict=False)
    data = dnana.get_gene_codon_usages(genes)

    def compute_norm(codon_counts, seq_len, *codons):
        """ Compute occurrence frequency of the given codons, scaled to a
            sequence length of 1000
        """
        aa_num = sum(codon_counts[codon] for codon in codons)
        return aa_num * 1000 / seq_len

    avg_codon_freqs = dnana.get_codon_freqs(genes)
    # these sums do not depend on the gene, so compute them only once
    avg_lys = avg_codon_freqs['AAA'] + avg_codon_freqs['AAG']
    avg_glu = avg_codon_freqs['GAA'] + avg_codon_freqs['GAG']
    avg_gln = avg_codon_freqs['CAA'] + avg_codon_freqs['CAG']
    print(' LYS freq: %f\n' % avg_lys +
          ' GLU freq: %f\n' % avg_glu +
          ' GLN freq: %f' % avg_gln)

    # filter for genes
    low_CAA_genes = []
    for gene, codu in data.items():
        if codu['CAA'] is None or codu['CAA'] >= 0.9:
            continue

        # count the codons of a gene once instead of once per amino acid
        codon_counts = dnana._count_codons(str(gene.seq))
        seq_len = len(gene.seq)

        lys_freq = (
            compute_norm(codon_counts, seq_len, 'AAA', 'AAG') / 1000) / avg_lys
        glu_freq = (
            compute_norm(codon_counts, seq_len, 'GAA', 'GAG') / 1000) / avg_glu
        gln_freq = (
            compute_norm(codon_counts, seq_len, 'CAA', 'CAG') / 1000) / avg_gln

        low_CAA_genes.append(
            (gene.id, extract_gene_name(gene), lys_freq, codu['AAA'],
             glu_freq, codu['GAA'], gln_freq, codu['CAA']))

    # store results
    # newline='' is required by the csv module; without it extra blank rows
    # appear on platforms that translate '\n' on write
    with open('results/low_CAA_genes.csv', 'w', newline='') as fd:
        wrtr = csv.writer(fd)
        wrtr.writerow([
            'ID', 'name', 'LYS rel freq', 'CU: AAA', 'GLU rel freq',
            'CU: GAA', 'GLN rel freq', 'CU: CAA'
        ])
        wrtr.writerows(low_CAA_genes)
class TestCodonUsage(TestCase):
    """ Checks codon counting, codon usage and codon frequencies as computed
        by `DNAAnalyzer`
    """

    def setUp(self):
        """ Prepare a fresh analyzer, one plain sequence and four gene records
        """
        gene_sequences = ('AAAAAAAAG', 'AAAAAGAAA', 'TTTTTCTTT', 'TTCTTTTTC')

        self.dnana = DNAAnalyzer()
        self.seq = 'AAAAAGAAA'
        self.genes = [SeqRecord(Seq(raw)) for raw in gene_sequences]

    def test_codon_counter(self):
        """ Absolute codon counts of a single sequence """
        counts = self.dnana._count_codons(self.seq)

        expected_counts = [('AAA', 2), ('AAG', 1), ('AAT', 0), ('AAC', 0)]
        for codon, expected in expected_counts:
            self.assertEqual(counts[codon], expected)

    def test_codon_usage(self):
        """ Codon usage of a single sequence; absent codons yield `None` """
        usage = self.dnana.get_codon_usage(self.seq)

        for codon, expected in [('AAA', 0.6666), ('AAG', 0.3333)]:
            self.assertEqual(round(usage[codon], 3), round(expected, 3))
        for codon in ('AAT', 'AAC'):
            self.assertEqual(usage[codon], None)

    def test_average_codon_usage(self):
        """ Codon usage averaged over all gene records """
        avg_usage = self.dnana.get_avg_codon_usage(self.genes)

        for codon, expected in [('AAA', 0.6666), ('AAG', 0.3333)]:
            self.assertEqual(round(avg_usage[codon], 3), round(expected, 3))
        for codon in ('AAT', 'AAC'):
            self.assertEqual(avg_usage[codon], None)
        for codon, expected in [('TTT', 0.5), ('TTC', 0.5)]:
            self.assertEqual(round(avg_usage[codon], 3), round(expected, 3))

    def test_codon_frequencies(self):
        """ Codon frequencies over all gene records """
        freqs = self.dnana.get_codon_freqs(self.genes)

        expected_freqs = [
            ('AAA', 0.333), ('AAG', 0.167), ('TTT', 0.25), ('TTC', 0.25)
        ]
        for codon, expected in expected_freqs:
            self.assertEqual(round(freqs[codon], 3), expected)
class LysineAbundanceFilter(BaseFilter):
    """ Let a gene pass only if its number of lysine codons (AAA and AAG),
        scaled to a sequence length of 1000, exceeds 76.6
    """

    def __init__(self):
        analyzer = DNAAnalyzer(strict=False)
        self.dnaa = analyzer

    def apply(self, record):
        """ Return True if the record's scaled lysine codon count is above
            76.6, False otherwise
        """
        codon_counts = self.dnaa._count_codons(str(record.seq))
        lysine_total = codon_counts['AAA'] + codon_counts['AAG']
        return lysine_total * 1000 / len(record.seq) > 76.6
def find_special_AAA_freqs(genes):
    """ Print sequence length plus AAA and AAG codon counts for a fixed list
        of gene ids

        Ids from the list that are not found in `genes` are skipped silently.
    """
    # alternative id selections, kept for reference
    #id_filter = ['DDB0305421|DDB_G0276433', 'DDB0347990|DDB_G0289359', 'DDB0347948|DDB_G0270662', 'DDB0349097|DDB_G0279651', 'DDB0306784|DDB_G0293038', 'DDB0218505|DDB_G0283527', 'DDB0348150|DDB_G0285779', 'DDB0347690|DDB_G0286087'] # AAA=0
    #id_filter = ['DDB0230164|DDB_G0293360', 'DDB0186263|DDB_G0284929', 'DDB0232396|DDB_G0282423', 'DDB0238636|DDB_G0269008', 'DDB0234236|DDB_G0289721', 'DDB0229439|DDB_G0270122'] # AAA=1
    id_filter = [
        'DDB0348668|DDB_G0276223',
        'DDB0307442|DDB_G0269954',
        'DDB0307413|DDB_G0269350',
        'DDB0308362|DDB_G0269090',
        'DDB0216219|DDB_G0269132',
    ]  # long AAA=1

    def get_record(gene_id):
        """ Return the first gene with the given id, or None """
        return next((gene for gene in genes if gene.id == gene_id), None)

    dnaa = DNAAnalyzer()
    for gid in id_filter:
        rec = get_record(gid)
        if rec is None:
            continue

        codon_counts = None
        print(rec.id)
        print(' ', 'gene length:', len(rec.seq))
        codon_counts = dnaa._count_codons(str(rec.seq))
        print(' ', 'AAA:', codon_counts['AAA'])
        print(' ', 'AAG:', codon_counts['AAG'])
        print()
def group_genes(Classifier, genes, fname_out):
    """ Group genes given in filename and save results elsewhere

        The genes are preprocessed and grouped using `Classifier`. Within
        each group the post-annotation filters are applied; for the genes
        that remain the cumulative codon usage is computed. Groups without
        remaining genes are left out. The result is stored as JSON list of
        `{'group': ..., 'cumulative_codon_usage': ...}` entries.

        Classifier -- provides `preprocess(genes)` and `RESULTS_DIR` and is
                      passed to `GeneGrouper`
        genes      -- genes to group
        fname_out  -- file name of the JSON output inside
                      `Classifier.RESULTS_DIR`
    """
    gegro = GeneGrouper(Classifier)
    genes = Classifier.preprocess(genes)
    groups = gegro.group(genes)

    results = []
    filter_stats = collections.defaultdict(int)
    dnana = DNAAnalyzer(strict=False)
    # `members`/`kept` avoid shadowing this function's own name and
    # rebinding the `genes` parameter inside the loop
    for group_name, members in groups.items():
        # apply post-annotation filters
        filters = parse_filters(post_annotation=True)

        kept = []
        for gene in members:
            skip = False
            # deliberately no early exit: every rejecting filter is counted
            for f in filters:
                if not f.skip and not f().apply(gene):
                    filter_stats[f.__name__] += 1
                    skip = True
            if not skip:
                kept.append(gene)

        if not kept:
            continue

        # compute codon usage
        cum_codu = dnana.get_cum_codon_usage(kept)
        results.append({
            'group': group_name,
            'cumulative_codon_usage': cum_codu
        })

    if filter_stats:
        print('Post-Annotation filters:')
        for k, v in filter_stats.items():
            print(' ', k, '->', v)

    # use a context manager so the output file is closed deterministically
    out_path = os.path.join(Classifier.RESULTS_DIR, fname_out)
    with open(out_path, 'w') as fd:
        json.dump(results, fd)
def stretch_codu_histogram(genes):
    """ Generate 2D histogram of stretch length versus codon usage

        For each of the codon pairs CAA/CAG, AAA/AAG and AAT/AAC the
        stretches found in all genes are binned by length (in codons) and
        codon usage of the pair's first codon. The bins are written to
        `results/longest_stretches.json` and `plotting/stretch_histogram2.R`
        is run via `Rscript` afterwards.
    """
    dnana = DNAAnalyzer()

    def get_stretches(gene, codons):
        """ Find stretches in ORF of given gene and codon usage """
        alternatives = '|'.join('(?:%s)' % codon for codon in codons)
        pattern = re.compile(r'((?:' + alternatives + ')+)')
        # NOTE(review): `overlapped` is not a keyword of the stdlib `re`
        # module -- presumably `re` is the third-party `regex` package here
        matches = pattern.finditer(str(gene.seq), overlapped=True)

        def first_codon_usage(gene, stretch):
            """ Usage of the first codon of the pair inside one stretch """
            return dnana.get_codon_usage(stretch.group())[codons[0]]

        return parse_stretches(gene, matches, first_codon_usage)

    histograms = []
    for codon_pair in (('CAA', 'CAG'), ('AAA', 'AAG'), ('AAT', 'AAC')):
        lengths = []
        usages = []
        for gene in genes:
            found, codu = get_stretches(gene, codon_pair)
            # stretch length is measured in codons, not bases
            lengths.extend([len(item) / 3. for item in found])
            usages.extend(codu)

        # make 2D-Histogram
        coords = do_2d_binning(lengths, usages, 1, 0.01, max(lengths), 1)
        histograms.append({'codon': ','.join(codon_pair), 'data': coords})

    with open('results/longest_stretches.json', 'w') as out_file:
        json.dump(histograms, out_file)

    print('Plotting')
    subprocess.check_call(
        ['Rscript', 'plotting/stretch_histogram2.R'],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL)
def __init__(self):
    """ Create the `DNAAnalyzer` (with `strict=False`) used by this object
    """
    analyzer = DNAAnalyzer(strict=False)
    self.dnaa = analyzer