def recodesite(
    input,
    output,
    site,
    clip_left,
    clip_right,
    codon_table,
    codon_usage,
    sampler,
    codon_freq_threshold,
    amber_only,
):
    """Recode a DNA sequence to remove a particular site (e.g., restriction site)

    The site needs to be recognized by Biopython, or it will be treated as a
    DNA sequence. The clipping options should determine the boundaries of the
    coding sequence, which will correspond to the part of the sequence that is
    "recodable".

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.
    """
    # NOTE(review): codon_table and codon_usage are accepted but never read in
    # this body; the weighted sampler always starts from ecoli_codon_usage and,
    # when thresholding, from standard_dna_table.
    if sampler == "weighted":
        usage = ecoli_codon_usage
        if codon_freq_threshold is not None:
            # TODO: this is hardcoded in and there's a leaky abstraction here
            table = standard_dna_table
            usage = zero_low_freq_codons(usage, table, codon_freq_threshold)
        if amber_only:
            # presumably restricts stop codons to amber only -- confirm against
            # zero_non_amber_stops
            usage = zero_non_amber_stops(usage)
        codon_sampler = FreqWeightedCodonSampler(usage=usage)
    elif sampler == "uniform":
        codon_sampler = UniformCodonSampler()
    else:
        # Fail up front with a clear message instead of hitting an unbound
        # codon_sampler once the first record is being processed.
        raise ValueError(
            f"unknown sampler {sampler!r}; expected 'weighted' or 'uniform'"
        )

    # `site` is iterated, so it is expected to be a collection of site names
    # and/or literal DNA strings.
    sites = [site2dna(s) for s in site]
    # sites is now a list[Bio.Seq.Seq]

    for seqrecord in SeqIO.parse(input, "fasta"):
        id_ = seqrecord.id
        # The recodable region is whatever remains after clipping both ends.
        cds_start = clip_left
        cds_end = len(seqrecord) - clip_right
        seq = recode_sites_from_cds(
            seqrecord.seq, sites, codon_sampler, cds_start, cds_end
        )
        # Empty description so only the record id is passed on as header text.
        print_fasta(SeqRecord(seq, id_, description=""), output)
def findsite(input, site, clip_left, clip_right):
    """Find locations of a site in DNA sequences

    If a sequence matches the specified site, write out its name and location.
    Only the first match in each sequence is reported.

    Used as a diagnostic to confirm that a particular DNA site (e.g.,
    restriction enzyme) is absent from a set of sequences. Because there may
    be adaptor sequences that contain such a site by design, the clipping
    option allows the search to be restricted. Note that a site is searched if
    it overlaps with the valid region even by one base (i.e., a site can match
    if it is mostly outside the clipped region, as long as it overlaps the
    target search region).

    INPUT is a path to fasta file or "-" to specify STDIN.
    """
    query = str(site2dna(site))
    # A match overlapping the clipped region by a single base can start up to
    # len(query) - 1 bases before the region and end the same distance after.
    flank = max(len(query) - 1, 0)
    for name, seq, _qual in readfq(input):
        region_start = clip_left
        region_end = len(seq) - clip_right
        if region_end <= region_start:
            # Clipping removed the whole sequence; nothing can overlap it.
            continue
        window_start = max(region_start - flank, 0)
        window_end = min(region_end + flank, len(seq))
        # find() with bounds returns an index into the full, unclipped seq.
        idx = seq.find(query, window_start, window_end)
        if idx >= 0:
            print(f"{name}|{site}|{idx}", file=sys.stdout)
def test_manual_seq(self):
    """A literal DNA string given to site2dna compares equal to itself."""
    literal_site = "AGGCG"
    converted = site2dna(literal_site)
    assert converted == literal_site
def test_bad_site(self):
    """site2dna("foo") must raise ValueError."""
    bogus_site = "foo"
    with raises(ValueError):
        site2dna(bogus_site)
def test_site2dna_enzyme(self):
    """The enzyme name EcoRI resolves to the sequence GAATTC."""
    enzyme_name = "EcoRI"
    expected_site = "GAATTC"
    assert site2dna(enzyme_name) == expected_site