def find_longest_A_stretch(genes, threshold=8):
    """
    Find the longest run of consecutive 'A's in every gene and dump the
    results as JSON into the current working directory.

    Two files are written:
      * 'longest_A_stretches.json': one entry (gene id, gene name, gene
        length, stretch, stretch length) per gene whose longest stretch is
        strictly longer than `threshold`, longest stretches first
      * 'all_A_stretch_lengths.json': lengths of all stretches (two or more
        consecutive 'A's) found in any gene

    genes -- iterable of records providing `.id` and `.seq`
    threshold -- a gene is only reported if its longest stretch is longer
                 than this many characters

    Returns nothing.
    """
    regex = re.compile("(A+A)")

    data = []
    A_lengths = []
    for gene in genes:
        stretches = regex.findall(str(gene.seq))
        if not stretches:
            # gene without any 'AA': max() of an empty list would raise
            # ValueError and abort the whole scan
            continue

        A_lengths.extend(len(s) for s in stretches)

        longest_stretch = max(stretches, key=len)
        if len(longest_stretch) > threshold:
            data.append({
                'gene_id': gene.id,
                'gene_name': utils.extract_gene_name(gene),
                'gene_len': len(gene.seq),
                'stretch': longest_stretch,
                'stretch_len': len(longest_stretch)
            })

    # stretches consist of 'A' only, so ordering by the string orders by
    # length; sort ascending and flip to put the longest first
    data.sort(key=operator.itemgetter('stretch'))
    data.reverse()

    with open('longest_A_stretches.json', 'w') as fd:
        json.dump(data, fd)

    with open('all_A_stretch_lengths.json', 'w') as fd:
        json.dump(A_lengths, fd)
def test_gene_name_extraction(self):
    """ extract_gene_name yields 'argE' for a description containing 'gene: argE' """
    description = 'DDB0191165|DDB_G0267380 |DNA coding sequence|gene: argE on chromosome: 1 position 414980 to 416538'
    record = SeqRecord(Seq('AGTC'), description=description)

    extracted = utils.extract_gene_name(record)

    self.assertEqual(extracted, 'argE')
def store_low_CAA_genes(genes):
    """
    Create list of genes where CAA usage < 0.9 and store it in
    'results/low_CAA_genes.csv'.

    For every selected gene the row holds its id and name, and for each of
    LYS (AAA/AAG), GLU (GAA/GAG) and GLN (CAA/CAG) the gene's codon
    frequency relative to the value reported by `get_codon_freqs(genes)`,
    followed by the gene's codon usage of AAA, GAA and CAA respectively.

    genes -- collection of records handed to DNAAnalyzer

    Returns nothing; the summary of the reference frequencies is printed.
    """
    # compute codon usage
    print('Computing codon statistics')
    dnana = DNAAnalyzer(strict=False)
    data = dnana.get_gene_codon_usages(genes)

    def compute_norm(codon_counts, seq_len, *codons):
        """ Compute normalized occurrence frequency of aa (per 1000 sequence positions) """
        aa_num = sum(codon_counts[codon] for codon in codons)
        return aa_num * 1000 / seq_len

    # reference frequency per amino acid (sum over its two codons); these do
    # not depend on the individual gene, so compute them once
    avg_codon_freqs = dnana.get_codon_freqs(genes)
    avg_lys = avg_codon_freqs['AAA'] + avg_codon_freqs['AAG']
    avg_glu = avg_codon_freqs['GAA'] + avg_codon_freqs['GAG']
    avg_gln = avg_codon_freqs['CAA'] + avg_codon_freqs['CAG']

    print(' LYS freq: %f\n' % avg_lys +
          ' GLU freq: %f\n' % avg_glu +
          ' GLN freq: %f' % avg_gln)

    # filter for genes
    low_CAA_genes = []
    for gene, codu in data.items():
        if codu['CAA'] is not None and codu['CAA'] < 0.9:
            # count codons once per gene instead of once per amino acid
            codon_counts = dnana._count_codons(str(gene.seq))
            seq_len = len(gene.seq)

            # the per-1000 scaling is undone again before relating the value
            # to the reference; arithmetic kept as is so stored numbers match
            lys_freq = (compute_norm(codon_counts, seq_len, 'AAA', 'AAG') / 1000) / avg_lys
            glu_freq = (compute_norm(codon_counts, seq_len, 'GAA', 'GAG') / 1000) / avg_glu
            gln_freq = (compute_norm(codon_counts, seq_len, 'CAA', 'CAG') / 1000) / avg_gln

            low_CAA_genes.append(
                (gene.id, extract_gene_name(gene),
                 lys_freq, codu['AAA'],
                 glu_freq, codu['GAA'],
                 gln_freq, codu['CAA']))

    # store results
    # newline='' lets the csv module control line endings (otherwise blank
    # rows appear on platforms that translate '\n')
    with open('results/low_CAA_genes.csv', 'w', newline='') as fd:
        wrtr = csv.writer(fd)
        wrtr.writerow([
            'ID', 'name',
            'LYS rel freq', 'CU: AAA',
            'GLU rel freq', 'CU: GAA',
            'GLN rel freq', 'CU: CAA'
        ])
        wrtr.writerows(low_CAA_genes)
def get_direct_annotation(self, record):
    """
    Collect the GO term annotations listed in the first result row of the
    amigo query for the record's gene name.

    Returns the lower-cased link texts of all links in that row which
    point to an amigo GO term page.
    """
    term_prefix = 'http://amigo.geneontology.org/amigo/term/GO:'

    gene_name = utils.extract_gene_name(record)
    soup = self._query_amigo(gene_name)

    result_table = soup.find(
        'table', attrs={'class': 'bbop-js-search-pane-results-table'})
    result_rows = result_table.find('tbody').find_all('tr')

    # only check first row
    first_row_links = result_rows[0].find_all('a')
    return [
        link.text.lower()
        for link in first_row_links
        if link['href'].startswith(term_prefix)
    ]
def get_groupname(self, record):
    """
    Assemble the list of group names a record belongs to.

    Every record is in 'all'. 'rte' is added if the gene name ends with
    '_RTE', and each keyword of `self.keywords` is added if it occurs
    (case-insensitively) in the record's manual annotations. If nothing
    besides 'all' applies, 'other' is added.
    """
    groups = ['all']

    if extract_gene_name(record).endswith('_RTE'):
        groups.append('rte')

    # lower-case the joined annotations once for case-insensitive matching
    manual_annos = ' | '.join(record.annotations['manual']).lower()
    groups.extend(kw for kw in self.keywords if kw.lower() in manual_annos)

    if len(groups) == 1:
        groups.append('other')

    return groups
def apply(self, record):
    """ Return True unless the record's gene name ends with '_RTE' """
    if extract_gene_name(record).endswith('_RTE'):
        return False
    return True
def apply(self, record):
    """ Return True unless the record's gene name starts with 'DDB_' """
    if extract_gene_name(record).startswith('DDB_'):
        return False
    return True