def recodesite(
    input,
    output,
    site,
    clip_left,
    clip_right,
    codon_table,
    codon_usage,
    sampler,
    codon_freq_threshold,
    amber_only,
):
    """Recode a DNA sequence to remove a particular site (e.g., restriction site)

    The site needs to be recognized by Biopython, or it will be treated as a DNA
    sequence. The clipping options should determine the boundaries of the coding
    sequence, which will correspond to the part of the sequence that is
    "recodable".

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.
    """
    # NOTE(review): the docstring wording above is intentionally unchanged in
    # case it is surfaced as command-line help text; parameter notes live here.
    #
    # input                 -- handed to SeqIO.parse(..., "fasta").
    # output                -- handed to print_fasta for every recoded record.
    # site                  -- iterable of sites; each goes through site2dna.
    # clip_left, clip_right -- number of leading / trailing positions excluded
    #                          from the recodable region of every record.
    # codon_table, codon_usage
    #                       -- accepted but currently NOT read: the weighted
    #                          sampler always uses ecoli_codon_usage and
    #                          standard_dna_table (see the TODO below).
    # sampler               -- "weighted" or "uniform"; anything else raises
    #                          ValueError.
    # codon_freq_threshold, amber_only
    #                       -- only consulted when sampler == "weighted".

    # Build the sampler first so that an invalid choice fails immediately,
    # before any input is read or any output is written.
    if sampler == "weighted":
        usage = ecoli_codon_usage
        if codon_freq_threshold is not None:
            # TODO: this is hardcoded in and there's a leaky abstraction here
            table = standard_dna_table
            usage = zero_low_freq_codons(usage, table, codon_freq_threshold)
        if amber_only:
            usage = zero_non_amber_stops(usage)
        codon_sampler = FreqWeightedCodonSampler(usage=usage)
    elif sampler == "uniform":
        codon_sampler = UniformCodonSampler()
    else:
        # Previously this fell through with codon_sampler unbound, which only
        # surfaced as an UnboundLocalError once the first record was processed.
        raise ValueError(
            f"unknown sampler {sampler!r}; expected 'weighted' or 'uniform'"
        )

    sites = [site2dna(s) for s in site]
    # sites is now a list[Bio.Seq.Seq]

    # The left boundary does not depend on the record, so compute it once.
    cds_start = clip_left
    for seqrecord in SeqIO.parse(input, "fasta"):
        id_ = seqrecord.id
        # The right boundary is measured from the end of each record.
        cds_end = len(seqrecord) - clip_right
        seq = recode_sites_from_cds(
            seqrecord.seq, sites, codon_sampler, cds_start, cds_end
        )
        # Emit with an empty description so only the id is carried over.
        print_fasta(SeqRecord(seq, id_, description=""), output)
def test_with_two_sites_in_cds(self):
    """Recode a sequence against two sites at once and check the invariants.

    The input literal contains both an ``AAGCTT`` and a ``GAATTC`` motif
    (presumably matching the ``self.HindIII`` / ``self.EcoRI`` fixtures, which
    are defined outside this method -- confirm against the test class setup).
    After recoding, neither site may be found, the sequence must differ from
    the input but keep its length, everything outside
    ``[cds_start:cds_end]`` must be untouched, and the translation of that
    region must be unchanged.
    """
    original = Seq("GAGATCCGGTCAAGCTTGAATTCAACGCAAGTTGTTAT")
    targets = [self.EcoRI, self.HindIII]

    recoded = recode_sites_from_cds(
        original,
        targets,
        self.codon_sampler,
        self.cds_start,
        self.cds_end,
    )

    def protein_of(sequence):
        # Translate only the region between the configured CDS boundaries.
        region = sequence[self.cds_start:self.cds_end]
        return region.translate(table=self.codon_sampler.table)

    protein_before = protein_of(original)
    protein_after = protein_of(recoded)

    # Neither site may be found in the result.
    for target in targets:
        assert recoded.find(target) == -1

    # Something changed, but the overall length did not.
    assert recoded != original
    assert len(recoded) == len(original)

    # Flanking regions outside the CDS are preserved verbatim.
    assert recoded[:self.cds_start] == original[:self.cds_start]
    assert recoded[self.cds_end:] == original[self.cds_end:]

    # The translation of the CDS region is unchanged.
    assert protein_after == protein_before