def make_restriction_part(part_length, left_overhang, right_overhang,
                          enzyme, forbidden_enzymes,
                          assembly_enzyme='BsmBI'):
    """Build an annotated random part flanked by assembly-enzyme sites.

    The core of the part is ``left_overhang + random DNA + right_overhang``.
    The random center is then altered so that the pattern of ``enzyme`` is
    enforced inside it (via ``dc.EnforcePatternOccurence``) while the sites
    of ``forbidden_enzymes`` and of ``assembly_enzyme`` are avoided. The
    overhangs themselves are protected with ``dc.AvoidChanges``.

    Parameters
    ----------
    part_length
      Length of the random center region (overhangs excluded).
    left_overhang, right_overhang
      Sequences placed on each side of the center and kept unchanged.
    enzyme
      Name of the enzyme whose site is enforced in the center region.
    forbidden_enzymes
      Iterable of enzyme names whose sites must be avoided. Any iterable is
      accepted (list, tuple, set...), not only lists.
    assembly_enzyme
      Name of the enzyme whose site is avoided in the core and used to build
      the flanks. It is looked up in ``Restriction.__dict__``.

    Returns
    -------
    record
      ``flank + core + reverse_complement(flank)`` where ``flank`` is the
      assembly enzyme site followed by ``'A'``. The overhangs, the enforced
      enzyme site and the flanks are annotated.
    """
    l_left = len(left_overhang)
    l_right = len(right_overhang)
    core_end = l_left + part_length
    left_overhang_location = (0, l_left)
    right_overhang_location = (core_end, core_end + l_right)
    center_location = (l_left, core_end)

    core_sequence = (
        left_overhang
        + dc.random_dna_sequence(part_length)
        + right_overhang
    )
    enforce_enzyme = dc.EnforcePatternOccurence(
        enzyme=enzyme, location=center_location
    )
    # list(...) so that tuples, sets or generators of enzyme names work too;
    # a bare ``forbidden_enzymes + [...]`` only works for lists.
    avoided_enzymes = list(forbidden_enzymes) + [assembly_enzyme]
    problem = dc.DnaOptimizationProblem(
        sequence=core_sequence,
        constraints=[
            dc.AvoidChanges(left_overhang_location),
            dc.AvoidChanges(right_overhang_location),
        ] + [enforce_enzyme] + [
            dc.AvoidPattern(enzyme=enzyme_name)
            for enzyme_name in avoided_enzymes
        ]
    )
    problem.resolve_constraints()

    core_sequence = dc.sequence_to_biopython_record(problem.sequence)
    for loc in [left_overhang_location, right_overhang_location]:
        dc.annotate_record(core_sequence, loc, 'overhang')
    # Annotate the first match reported for the enforced enzyme site.
    site_location = enforce_enzyme.evaluate(problem).data['matches'][0]
    dc.annotate_record(core_sequence, site_location.to_tuple(), enzyme)

    assembly_site = Restriction.__dict__[assembly_enzyme].site
    flank = dc.sequence_to_biopython_record(assembly_site + 'A')
    dc.annotate_record(flank, label='flank')
    return flank + core_sequence + flank.reverse_complement()
def test_circular_sequence_basic():
    """Resolve a circular problem on several random sequences.

    Each sequence starts with "CTC", ends with "CGT" and contains "CGTCTC"
    in the middle; the constraints must fail before resolution and pass
    after it.
    """
    np.random.seed(123)
    # The feature is not battle-tested yet, so the same scenario is run on
    # a handful of different random sequences.
    for _ in range(4):
        segments = [
            "CTC",
            dc.random_dna_sequence(100),
            "CGTCTC",
            dc.random_dna_sequence(100),
            "CGT",
        ]
        dna_sequence = "".join(segments)
        no_bsmbi = dc.AvoidPattern("BsmBI_site")
        gc_content = dc.EnforceGCContent(
            mini=0.4, maxi=0.6, location=(150, 250), window=50
        )
        unique_kmers = dc.UniquifyAllKmers(k=9, location=(10, 100))
        problem = dc.CircularDnaOptimizationProblem(
            sequence=dna_sequence,
            constraints=[no_bsmbi, gc_content, unique_kmers],
            logger=None,
        )
        assert not problem.all_constraints_pass()
        problem.resolve_constraints()
        assert problem.all_constraints_pass()
def test_circular_sequence_optimize_with_report(tmpdir):
    """Check ``optimize_with_report`` on a CircularDnaOptimizationProblem.

    The constraints must fail before the call and pass after it, and the
    resulting record sequence must differ from the initial sequence.
    """
    np.random.seed(123)
    segments = [
        "CTC",
        dc.random_dna_sequence(100),
        "CGTCTC",
        dc.random_dna_sequence(100),
        "CGT",
    ]
    dna_sequence = "".join(segments)
    no_bsmbi = dc.AvoidPattern("BsmBI_site")
    gc_content = dc.EnforceGCContent(
        mini=0.4, maxi=0.6, location=(150, 250), window=50
    )
    unique_kmers = dc.UniquifyAllKmers(k=9, location=(10, 100))
    problem = dc.CircularDnaOptimizationProblem(
        sequence=dna_sequence,
        constraints=[no_bsmbi, gc_content, unique_kmers],
        logger=None,
    )

    # The report goes to a fresh, empty folder inside pytest's tmpdir.
    target = os.path.join(str(tmpdir), "circular_with_solution")
    os.mkdir(target)
    assert os.listdir(target) == []

    assert not problem.all_constraints_pass()
    # The three returned values are unpacked but not inspected by this test.
    _success, _message, _data = problem.optimize_with_report(target)
    assert problem.all_constraints_pass()

    optimized_record = problem.to_record()
    assert str(optimized_record.seq) != dna_sequence
def test_whole_sequence_change_objective_20_going_down():
    """EnforceChanges(amount=20) brings the number of edits down to 20.

    Resolving the "ATA" constraint on a 40-nucleotide "AT" repeat is
    expected to leave at least 24 edits; optimizing the objective should
    then reduce the count to exactly 20.
    """
    np.random.seed(123)
    avoid_ata = dc.AvoidPattern("ATA")
    change_objective = dc.EnforceChanges(amount=20)
    problem = dc.DnaOptimizationProblem(
        sequence="AT" * 20,
        constraints=[avoid_ata],
        objectives=[change_objective],
    )
    problem.mutations_per_iteration = 2

    problem.resolve_constraints()
    assert problem.number_of_edits() >= 24

    problem.optimize()
    assert problem.number_of_edits() == 20
def test_constraints_reports():
    """Check the breaches report on the records of tests/data/10_emma_genbanks.

    The annotated records must carry 157 features in total and the PDF data
    must be between 70000 and 72000 long.
    """
    genbank_dir = os.path.join("tests", "data", "10_emma_genbanks")
    records = []
    for filename in os.listdir(genbank_dir):
        path = os.path.join(genbank_dir, filename)
        records.append(dc.load_record(path, name=filename))

    # Constraints to be checked on each record.
    avoided_patterns = (
        "BsaI_site",
        "BsmBI_site",
        "BbsI_site",
        "8x1mer",
        "5x3mer",
        "9x2mer",
    )
    constraints = [dc.AvoidPattern(pattern) for pattern in avoided_patterns]
    constraints.append(dc.AvoidHairpins(stem_size=20, hairpin_window=200))
    constraints.append(dc.EnforceGCContent(mini=0.3, maxi=0.7, window=100))

    # Build the breaches table, then the annotated records and the PDF.
    dataframe = cr.constraints_breaches_dataframe(constraints, records)
    records = cr.records_from_breaches_dataframe(dataframe, records)
    total_features = sum(len(record.features) for record in records)
    assert total_features == 157

    pdf_data = cr.breaches_records_to_pdf(records)
    assert 70000 < len(pdf_data) < 72000
def test_insert_and_erase_pattern():
    """Raise the occurrences of a pattern from 0 to 5, then lower them to 2.

    The translation of the sequence is enforced throughout, and the score
    of the occurrence constraint is checked before each resolution.
    """
    numpy.random.seed(123)
    protein = dc.random_protein_sequence(100)
    pattern = "ATGC"

    def occurrence_problem(dna, occurences):
        """Problem enforcing a pattern count while keeping the translation."""
        return dc.DnaOptimizationProblem(
            sequence=dna,
            constraints=[
                dc.EnforcePatternOccurence(pattern, occurences=occurences),
                dc.EnforceTranslation(),
            ],
            logger=None,
        )

    # Start from a sequence coding for the protein with no pattern in it.
    sequence = dc.random_compatible_dna_sequence(
        sequence_length=300,
        constraints=[
            dc.EnforceTranslation(translation=protein),
            dc.AvoidPattern(pattern),
        ],
        logger=None,
    )

    # Increase the pattern occurrences from 0 to 5.
    problem = occurrence_problem(sequence, 5)
    assert problem.constraints[0].evaluate(problem).score == -5
    problem.resolve_constraints()
    assert problem.all_constraints_pass()
    sequence = problem.sequence

    # Decrease the pattern occurrences from 5 to 2.
    problem = occurrence_problem(sequence, 2)
    assert problem.constraints[0].evaluate(problem).score == -3
    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def work(self):
    """Evaluate manufacturability constraints on the uploaded records.

    Reads ``self.data.files`` and ``self.data.include_genbanks``, computes
    the constraint breaches of every record, and returns a dict with three
    downloadable entries, each having ``data``, ``name`` and ``mimetype``:

    - ``pdf_report``: the PDF produced by ``cr.breaches_records_to_pdf``.
    - ``records``: a zip archive, containing one ``<record id>.gb`` file per
      annotated record only when ``data.include_genbanks`` is truthy.
    - ``spreadsheet``: the breaches dataframe exported with ``to_excel``.
    """
    data = self.data
    self.logger(message="Generating report...")
    records = records_from_data_files(data.files)
    constraints = [
        dc.AvoidPattern("BsaI_site"),
        dc.AvoidPattern("BsmBI_site"),
        dc.AvoidPattern("BbsI_site"),
        dc.AvoidPattern("SapI_site"),
        dc.AvoidPattern("8x1mer"),
        dc.AvoidPattern("5x3mer"),
        dc.AvoidPattern("9x2mer"),
        dc.AvoidHairpins(stem_size=20, hairpin_window=200),
        dc.EnforceGCContent(mini=0.3, maxi=0.7, window=100),
        dc.EnforceGCContent(mini=0.1, maxi=0.9, window=100),
        dc.UniquifyAllKmers(k=15),
    ]

    # Spreadsheet: the raw breaches table, written to an in-memory buffer.
    dataframe = cr.constraints_breaches_dataframe(constraints, records)
    spreadsheet_io = BytesIO()
    dataframe.to_excel(spreadsheet_io)

    # Annotated records, optionally written as Genbank files in a zip.
    records = cr.records_from_breaches_dataframe(dataframe, records)
    zipped_records = flametree.file_tree("@memory")
    if data.include_genbanks:
        for record in records:
            target = zipped_records._file("%s.gb" % record.id)
            write_record(record, target)

    # PDF report, also written to an in-memory buffer.
    pdf_io = BytesIO()
    cr.breaches_records_to_pdf(records, pdf_io, logger=self.logger)

    return {
        "pdf_report": {
            "data": data_to_html_data(
                pdf_io.getvalue(),
                "pdf",
                filename="manufacturability_report.pdf",
            ),
            "name": "manufacturability_report.pdf",
            "mimetype": "application/pdf",
        },
        "records": {
            "data": data_to_html_data(
                zipped_records._close(),
                "zip",
                filename="manufacturability_annotated_records.zip",
            ),
            "name": "manufacturability_annotated_records.zip",
            "mimetype": "application/zip",
        },
        "spreadsheet": {
            "data": data_to_html_data(
                spreadsheet_io.getvalue(),
                "xlsx",
                filename="manufacturability_report.xlsx",
            ),
            "name": "manufacturability_report.xlsx",
            # A media type needs its "application/" type prefix to be valid.
            "mimetype": (
                "application/"
                "vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            ),
        },
    }
problem: - The sequence is designed to have a cross-origin BsmBI site that will need to be removed, because the location-less specification ``AvoidPattern`` is interpreted as applying to the full circle. - The specification ``EnforceGCContent`` is cross-origin since its location is 1500-2500, and the sequence is ~2000bp long. """ import dnachisel as dc dna_sequence = "CTC%sCGTCTC%sCGT" % ( dc.random_dna_sequence(1000), dc.random_dna_sequence(1000), ) constraints = [ dc.AvoidPattern("BsmBI_site"), dc.EnforceGCContent(mini=0.4, maxi=0.6, location=(1500, 2500), window=50), dc.UniquifyAllKmers(k=9, location=(10, 1000)), ] problem = dc.CircularDnaOptimizationProblem( sequence=dna_sequence, constraints=constraints ) print("BEFORE OPTIMIZATION:\n\n", problem.constraints_text_summary()) problem.resolve_constraints() print("AFTER OPTIMIZATION:\n\n", problem.constraints_text_summary())
from copy import deepcopy from collections import OrderedDict import dnachisel as dc import pandas import seaborn as sns sns.set() # DEFINE HOW OPTIMIZATION PROBLEMS ARE CREATED specifications = { "~keep": dc.AvoidChanges(), "~no(CG)": dc.AvoidPattern("CG"), "~codon_optimize": dc.CodonOptimize(species="e_coli"), "~unique_kmers": dc.UniquifyAllKmers(20), "~gc(39%)": dc.EnforceGCContent(target=0.39, window=200), } class_to_label = { spec.__class__: label for label, spec in specifications.items() } sequence = dc.load_record("record.gb") def create_problem(boost_profile): location = dc.Location(1000, 9247) objectives = [] for spec_name, boost in boost_profile.items(): spec = specifications[spec_name] spec = spec.copy_with_changes(boost=boost, location=location) objectives.append(spec) return dc.DnaOptimizationProblem(
"""Report the constraint breaches of the records in the genbanks/ folder.

Writes the breaches table to "breaches.xlsx" and the plots to
"breaches_plots.pdf".
"""
import os

import dnachisel as dc
import dnachisel.reports.constraints_reports as cr

# Load every record found in the genbanks/ folder.
records = [
    dc.load_record(os.path.join("genbanks", filename), name=filename)
    for filename in os.listdir("genbanks")
]

# Constraints to be checked on each record.
constraints = [
    dc.AvoidPattern(pattern)
    for pattern in (
        "BsaI_site",
        "BsmBI_site",
        "BbsI_site",
        "8x1mer",
        "5x3mer",
        "9x2mer",
    )
] + [
    dc.AvoidHairpins(stem_size=20, hairpin_window=200),
    dc.EnforceGCContent(mini=0.3, maxi=0.7, window=100),
]

# Spreadsheet of the breaches.
dataframe = cr.constraints_breaches_dataframe(constraints, records)
dataframe.to_excel("breaches.xlsx")

# Plots of the breaches, built from the annotated records.
records = cr.records_from_breaches_dataframe(dataframe, records)
cr.breaches_records_to_pdf(records, "breaches_plots.pdf")
regex = "(CCCTTT){3}C{3}"  # optimal pattern for i-motif formation

# Embed the motif between two random 50-nucleotide flanks.
query_seq = "".join(
    [
        dnachisel.random_dna_sequence(length=50),
        i_motif,
        dnachisel.random_dna_sequence(length=50),
    ]
)
print(query_seq)

seq = Bio.Seq.Seq(query_seq)

# Position of the first occurrence.
print(seq.find(i_motif))

# (start, end) tuples of all occurrences.
matches = [m.span() for m in re.finditer(i_motif, str(seq))]
print(seq[slice(*matches[0])])

# Find the regex with DNA Chisel.
problem = dnachisel.DnaOptimizationProblem(
    sequence=query_seq,
    constraints=[dnachisel.AvoidPattern(pattern=regex)],
)
print(problem.constraints_text_summary())

compact_regex = "(C{3}T{3}){3}C{3}"  # variant of the same regex
problem = dnachisel.DnaOptimizationProblem(
    sequence=query_seq,
    constraints=[dnachisel.AvoidPattern(pattern=compact_regex)],
)
print(problem.constraints_text_summary())