def test_EnforceTranslation_bacterial_valine():
    """Check that ``start_codon="keep"`` leaves the first codon untouched.

    For each tested start codon, a coding sequence is built by prepending the
    codon to a reverse-translated protein, then optimized under
    ``EnforceTranslation`` with ``EnforceChanges`` as the objective. The
    translation and the first codon must be the expected ones afterwards.
    """
    table_name = "Bacterial"
    protein = "LLTMMVTTTTVMVL"
    protein_sequence = reverse_translate(protein, table=table_name)

    # Pairs of (first codon in the input, first codon expected in the output).
    start_codon_cases = [
        ("ATG", "ATG"),  # methionine stays the only methionine codon
        ("GTG", "GTG"),  # valine-start-codon stays the only valine-start-codon
    ]
    for first_codon_before, first_codon_after in start_codon_cases:
        cds_constraint = EnforceTranslation(
            genetic_table=table_name, start_codon="keep"
        )
        problem = DnaOptimizationProblem(
            sequence=first_codon_before + protein_sequence,
            constraints=[cds_constraint],
            objectives=[EnforceChanges()],
            logger=None,
        )
        # The constraint's stored translation starts with "M" in both cases.
        assert problem.constraints[0].translation == "MLLTMMVTTTTVMVL"

        problem.optimize()

        protein_after = translate(
            problem.sequence, table_name, assume_start_codon=True
        )
        assert protein_after == "M" + protein
        assert problem.sequence[:3] == first_codon_after
def verify_constraints(self, sequence):
    """Return True iff `sequence` passes all `self.sequence_constraints`.

    Entries of `self.sequence_constraints` that are DNA Chisel
    `Specification` instances are evaluated together through a
    `DnaOptimizationProblem`; every other entry is called directly as
    ``constraint(sequence)``.

    The list of DNA Chisel constraints is computed on the first call and
    cached on the instance as `self.dnachisel_constraints`.

    Raises:
        ImportError: if DNA Chisel constraints are present while
            `DNACHISEL_AVAILABLE` is falsy.
    """
    checks = self.sequence_constraints

    # Lazily cache the DNA Chisel specifications on the instance.
    if not hasattr(self, "dnachisel_constraints"):
        self.dnachisel_constraints = [
            candidate
            for candidate in self.sequence_constraints
            if isinstance(candidate, Specification)
        ]

    if self.dnachisel_constraints != []:
        if not DNACHISEL_AVAILABLE:
            raise ImportError(
                "Spotted DNA Chisel constraints, while "
                "DNA Chisel is not installed."
            )
        # We provide an empty mutation space so it won't be recomputed
        # (which would take time and is useless here!)
        problem = DnaOptimizationProblem(
            sequence, self.dnachisel_constraints, mutation_space=[]
        )

        def dnachisel_check(seq):
            # `seq` is ignored: the problem already holds `sequence`.
            return problem.all_constraints_pass()

        plain_checks = [
            candidate
            for candidate in checks
            if not isinstance(candidate, Specification)
        ]
        checks = plain_checks + [dnachisel_check]

    return all(check(sequence) for check in checks)
def test_no_solution_error_frozen_region():
    """Resolution must raise when a pattern breach lies in a frozen sequence.

    The whole sequence is locked by ``AvoidChanges()`` while
    ``AvoidPattern(enzyme='BsmBI')`` is also requested, so
    ``resolve_constraints`` is expected to raise ``NoSolutionError``.
    """
    locked_sequence = "AAAAATCGTCTCTTTT"
    specs = [AvoidChanges(), AvoidPattern(enzyme='BsmBI')]
    problem = DnaOptimizationProblem(
        sequence=locked_sequence, constraints=specs
    )

    with pytest.raises(NoSolutionError) as excinfo:
        problem.resolve_constraints()

    message = str(excinfo.value)
    assert 'Constraint breach in frozen region' in message
def test_no_solution_error_random_search():
    """``NoSolutionError`` must mention the failed random search.

    The first 10 bases of an all-T sequence are frozen and a minimum GC
    content of 0.8 is required, so ``resolve_constraints`` is expected to
    fail.
    """
    all_t_sequence = "TTTTTTTTTTTTTTTTTTTTTTTTTTTT"
    specs = [AvoidChanges((0, 10)), EnforceGCContent(mini=0.8)]
    problem = DnaOptimizationProblem(
        sequence=all_t_sequence, constraints=specs
    )

    with pytest.raises(NoSolutionError) as excinfo:
        problem.resolve_constraints()

    message = str(excinfo.value)
    assert 'Random search did not' in message
def test_AvoidNonUniqueSegments_as_objective():
    """Optimizing with ``AvoidNonUniqueSegments(8)`` as objective must pass it."""
    numpy.random.seed(123)
    start_sequence = random_dna_sequence(1000, seed=123)
    uniqueness_objective = AvoidNonUniqueSegments(8)
    problem = DnaOptimizationProblem(
        sequence=start_sequence, objectives=[uniqueness_objective]
    )

    problem.optimize()

    evaluation = problem.objectives[0].evaluate(problem)
    assert evaluation.passes
def test_no_solution_error_exhaustive_search():
    """``NoSolutionError`` must mention the failed exhaustive search.

    Four of the seven T bases are frozen and a minimum GC content of 0.8 is
    required, so ``resolve_constraints`` is expected to fail.
    """
    short_sequence = "TTTTTTT"
    specs = [AvoidChanges((0, 4)), EnforceGCContent(mini=0.8)]
    problem = DnaOptimizationProblem(
        sequence=short_sequence, constraints=specs
    )

    with pytest.raises(NoSolutionError) as excinfo:
        problem.resolve_constraints()

    message = str(excinfo.value)
    assert 'Exhaustive search failed' in message
def test_no_solution_error_frozen_region():
    """Resolution must raise when the breach lies in an unmutable region.

    The whole sequence is locked by ``AvoidChanges()`` while
    ``AvoidPattern('BsmBI_site')`` is also requested, so
    ``resolve_constraints`` is expected to raise ``NoSolutionError``.
    """
    locked_sequence = "AAAAATCGTCTCTTTT"
    specs = [AvoidChanges(), AvoidPattern('BsmBI_site')]
    problem = DnaOptimizationProblem(
        sequence=locked_sequence, constraints=specs
    )

    with pytest.raises(NoSolutionError) as excinfo:
        problem.resolve_constraints()

    message = str(excinfo.value)
    assert 'region that cannot be mutated' in message
def test_constraints_text_summary():
    """The text summary must report exactly one failed constraint evaluation."""
    gc_constraint = EnforceGCContent(mini=0.4, maxi=0.6)
    pattern_constraint = AvoidPattern('ATT')
    problem = DnaOptimizationProblem(
        sequence="ATTGCCATATGCGC",
        constraints=[gc_constraint, pattern_constraint],
    )

    summary = problem.constraints_text_summary()

    assert 'FAILURE: 1 constraints evaluations failed' in summary
def test_UniquifyAllKmers_as_objective():
    """Optimizing with ``UniquifyAllKmers(8)`` as objective must pass it."""
    numpy.random.seed(123)
    start_sequence = random_dna_sequence(1000, seed=123)
    kmer_objective = UniquifyAllKmers(8)
    problem = DnaOptimizationProblem(
        sequence=start_sequence,
        objectives=[kmer_objective],
        logger=None,
    )

    problem.optimize()

    evaluation = problem.objectives[0].evaluate(problem)
    assert evaluation.passes
def test_random_compatible_dna_sequence():
    """A generated compatible sequence must satisfy the given constraints."""
    gc_window_constraint = EnforceGCContent(mini=0.4, maxi=0.6, window=50)
    no_atc_constraint = AvoidPattern('ATC')
    constraints = [gc_window_constraint, no_atc_constraint]

    seq = random_compatible_dna_sequence(1000, constraints=constraints)
    problem = DnaOptimizationProblem(sequence=seq, constraints=constraints)

    # Direct check on the string, then the check through the problem itself.
    assert "ATC" not in seq
    assert problem.all_constraints_pass()
def test_optimize_with_report(tmpdir):
    """``optimize_with_report`` must succeed and write files to the target.

    Args:
        tmpdir: temporary directory fixture; a ``with_solution`` subfolder is
            created inside it and used as the report target.
    """
    start_sequence = random_dna_sequence(10000, seed=123)
    problem = DnaOptimizationProblem(
        sequence=start_sequence,
        constraints=[AvoidPattern('BsmBI_site')],
    )

    target = os.path.join(str(tmpdir), 'with_solution')
    os.mkdir(target)
    assert not os.listdir(target)  # folder starts empty

    success, _message, _data = problem.optimize_with_report(target)

    assert success
    assert os.listdir(target)  # the report wrote at least one entry
def test_optimize_with_report_no_solution(tmpdir):
    """A failed ``optimize_with_report`` must still write files to the target.

    Args:
        tmpdir: temporary directory fixture; a ``no_solution`` subfolder is
            created inside it and used as the report target.
    """
    start_sequence = random_dna_sequence(10000, seed=123)
    # The pattern must be avoided while no base may change.
    specs = [AvoidPattern("BsmBI_site"), AvoidChanges()]
    problem = DnaOptimizationProblem(
        sequence=start_sequence, constraints=specs, logger=None
    )

    target = os.path.join(str(tmpdir), "no_solution")
    os.mkdir(target)
    assert not os.listdir(target)  # folder starts empty

    success, _message, _data = problem.optimize_with_report(target)

    assert not success
    assert os.listdir(target)  # a report is produced even on failure
def test_AvoidRareCodons_as_constraint():
    """Resolving ``AvoidRareCodons`` as a constraint keeps the translation.

    A short coding sequence is constrained by ``EnforceTranslation`` and
    ``AvoidRareCodons(0.11, "e_coli")``. After the constraints are resolved,
    all constraints must pass, exactly 4 nucleotides must have been edited,
    and the translated protein must be unchanged.
    """
    numpy.random.seed(123)

    sequence = "ATG" "TTT" "ATA" "CCA" "CTT" "TAG"
    problem = DnaOptimizationProblem(
        sequence=sequence,
        constraints=[EnforceTranslation(), AvoidRareCodons(0.11, "e_coli")],
    )

    # The solver has to run before the checks below: the edit count is
    # measured against the input sequence, so it can only be non-zero once
    # the constraints have been resolved.
    problem.resolve_constraints()

    assert problem.all_constraints_pass()
    assert problem.sequence_edits_as_array().sum() == 4
    assert translate(problem.sequence) == translate(sequence)
def test_AllowPrimer():
    """An ``AllowPrimer`` constraint on a poly-A sequence must be resolvable."""
    primers = ["ATTGCGCCAAACT", "TAATCCACCCTAAT", "ATTCACACTTCAA"]
    primer_constraint = AllowPrimer(
        tmin=50,
        tmax=60,
        max_homology_length=5,
        location=(10, 30),
        avoid_heterodim_with=primers,
    )
    problem = DnaOptimizationProblem(
        sequence=40 * "A", constraints=[primer_constraint]
    )

    problem.resolve_constraints()

    assert problem.all_constraints_pass()
def test_avoid_changes_with_indices_as_constraint():
    """Only positions outside the protected indices may be edited.

    15 positions of a 50-nucleotide sequence are protected through
    ``AvoidChanges(indices=...)`` while ``EnforceChanges`` is the objective;
    the number of edits after optimization must be ``50 - 15``.
    """
    numpy.random.seed(123)

    # 2 isolated positions, a run of 10, and a run of 3: 15 indices in total.
    indices = [10, 20, *range(30, 40), 44, 45, 46]
    sequence = random_dna_sequence(50)
    problem = DnaOptimizationProblem(
        sequence=sequence,
        constraints=[AvoidChanges(indices=indices)],
        objectives=[EnforceChanges()],
        logger=None,
    )

    problem.optimize()

    expected_edits = 50 - len(indices)
    assert problem.number_of_edits() == expected_edits
def work(self):
    """Build a record from the request data, optimize it, return a report.

    The record comes either from the first record of the uploaded file or,
    when ``data.editFeatures`` is truthy, from ``data.sequence`` annotated
    with the features in ``data.editedFeatures``.

    Returns:
        dict with keys ``'zip_file'`` (report archive as HTML data, with name
        and mimetype), ``'success'`` and ``'summary'``.
    """
    data = self.data
    self.logger(message='Initializing...')

    if not data.editFeatures:
        records, fmt = records_from_data_file(data.file)
        record = records[0]
    else:
        record = sequence_to_biopython_record(data.sequence.upper())
        # Annotate in order of (start, end).
        features = list(data.editedFeatures.values())
        features.sort(key=lambda feat: (feat.start, feat.end))
        for feature in features:
            annotate_record(
                record,
                feature_type="misc_feature",
                location=(feature.start, feature.end),
                label=feature.label,
            )

    problem = DnaOptimizationProblem.from_record(record)
    problem.max_random_iters = 1000
    problem.logger = self.logger

    success, summary, zip_data = optimization_with_report(
        target="@memory", problem=problem, project_name=record.id
    )

    zip_file = {
        'data': data_to_html_data(zip_data, 'zip'),
        'name': 'optimization_report.zip',
        'mimetype': 'application/zip',
    }
    return {'zip_file': zip_file, 'success': success, 'summary': summary}
def work(self):
    """Build a record from the request data, optimize it, return a report.

    The record comes either from the uploaded file or, when
    ``data.editFeatures`` is truthy, from ``data.sequence`` annotated with
    the features in ``data.editedFeatures``.

    Returns:
        dict with keys ``"zip_file"`` (report archive as HTML data, with name
        and mimetype), ``"success"`` and ``"summary"``.
    """
    data = self.data
    self.logger(message="Initializing...")

    if not data.editFeatures:
        record = records_from_data_files([data.file])[0]
    else:
        record = sequence_to_biopython_record(data.sequence.upper())
        # Annotate in order of (start, end).
        features = list(data.editedFeatures.values())
        features.sort(key=lambda feat: (feat.start, feat.end))
        for feature in features:
            annotate_record(
                record,
                feature_type="misc_feature",
                location=(feature.start, feature.end),
                label=feature.label,
            )

    problem = DnaOptimizationProblem.from_record(record, logger=self.logger)
    problem.optimization_stagnation_tolerance = 30

    success, summary, zip_data = problem.optimize_with_report(
        target="@memory", project_name=record.id
    )

    zip_file = {
        "data": data_to_html_data(zip_data, "zip"),
        "name": "optimization_report.zip",
        "mimetype": "application/zip",
    }
    return {"zip_file": zip_file, "success": success, "summary": summary}
def load_template(filename, insert, destination):
    """Load a GenBank vector, place an insert in it, collect its specifications.

    Args:
        filename: path or handle of the GenBank file, passed to ``SeqIO.read``.
        insert: insert forwarded to ``insert_into_vector``.
        destination: destination forwarded to ``insert_into_vector``.

    Returns:
        Tuple ``(vector, objectives, constraints)``: the record returned by
        ``insert_into_vector`` and new lists holding the objectives and
        constraints that ``DnaOptimizationProblem.from_record`` read from it.
    """
    vector = SeqIO.read(filename, "genbank")
    # The insert location is returned by the helper but not used here.
    vector, _insert_location = insert_into_vector(vector, destination, insert)

    problem = DnaOptimizationProblem.from_record(vector)
    constraints = list(problem.constraints)
    objectives = list(problem.objectives)

    # NOTE: earlier, now-disabled code added EnforceTranslation on the insert
    # location and AvoidChanges on the vector parts upstream and downstream of
    # it, with the remark that dnachisel had not implemented MultiLocation
    # yet. It also noted that the returned sequence should be a SeqRecord.
    return vector, objectives, constraints
def test_rca_example():
    """Test a Genbank with ~harmonize_rca feature."""
    genbank_path = os.path.join(
        "tests", "tests_from_genbanks", "genbanks", "rca_example.gb"
    )
    problem = DnaOptimizationProblem.from_record(genbank_path)

    expected_repr = "[HarmonizeRCA[0-105(+)](e_coli -> h_sapiens)]"
    assert str(problem.objectives) == expected_repr

    harmonize_objective = problem.objectives[0]
    assert harmonize_objective.original_species == "e_coli"
    assert harmonize_objective.species == "h_sapiens"

    # Only checks that the optimization runs through.
    problem.optimize()
def test_AvoidRareCodons_as_constraint_reversed():
    """Resolving ``AvoidRareCodons`` works on a reverse-strand location.

    The coding sequence is reverse-complemented and both constraints are
    given a location with strand ``-1``. After the constraints are resolved,
    all constraints must pass, exactly 4 nucleotides must have been edited,
    and the reverse complement of the result must translate to the same
    protein as the original coding sequence.
    """
    numpy.random.seed(123)

    sequence = "ATG" "TTT" "ATA" "CCA" "CTT" "TAG"
    rev_sequence = reverse_complement(sequence)
    location = (0, len(sequence), -1)
    problem = DnaOptimizationProblem(
        sequence=rev_sequence,
        constraints=[
            EnforceTranslation(location=location),
            AvoidRareCodons(0.11, "e_coli", location=location),
        ],
    )

    # The solver has to run before the checks below: the edit count is
    # measured against the input sequence, so it can only be non-zero once
    # the constraints have been resolved.
    problem.resolve_constraints()

    assert problem.all_constraints_pass()
    assert problem.sequence_edits_as_array().sum() == 4
    new_sequence = reverse_complement(problem.sequence)
    assert translate(new_sequence) == translate(sequence)
def test_avoid_matches_with_list():
    """``AvoidMatches`` built from a list of sequences finds and fixes matches.

    The problem sequence embeds two 6bp patterns that each also occur in one
    of the avoided sequences; the constraint must first report 2 locations
    and then be resolvable.
    """
    pattern_1 = "CGTCTC"
    pattern_2 = "TGCACA"
    spacer = 10 * "A"
    sequence = spacer + pattern_1 + 20 * "A" + pattern_2 + spacer

    flank = 10 * "G"
    avoided_seqs = [flank + pattern + flank for pattern in (pattern_1, pattern_2)]

    constraint = AvoidMatches(sequences=avoided_seqs, match_length=6)
    problem = DnaOptimizationProblem(
        sequence=sequence, constraints=[constraint], logger=None
    )

    initial_evaluation = constraint.evaluate(problem)
    assert len(initial_evaluation.locations) == 2

    problem.resolve_constraints()
    assert problem.all_constraints_pass()

    # Clean up the constraint's temporary directory.
    constraint.remove_temp_directory()
def test_EnforceTranlationError():
    """Providing a location that is not multiple of 3 raises an error"""
    numpy.random.seed(1234)
    protein = random_protein_sequence(50, seed=123)
    sequence = reverse_translate(protein)

    with pytest.raises(ValueError) as excinfo:
        # Construction itself is expected to raise; the instance is not used.
        DnaOptimizationProblem(
            sequence=sequence,
            constraints=[EnforceTranslation(location=(0, 16))],
        )

    message = str(excinfo.value)
    assert "Location 0-16(+) has length 16" in message
def compute_forbidden_patterns_locations(self, record):
    """Return an array where ``arr[i] == 1`` means that i is surrounded by a
    user-forbidden pattern.

    The forbidden patterns are built in this method: one
    ``homopolymer_pattern(c, 5)`` per nucleotide in ``'ATGC'`` and one
    ``repeated_kmers(k, n)`` for ``(k, n)`` in ``(4, 2)``, ``(3, 3)``,
    ``(2, 4)``. Breach locations of failing evaluations are merged with
    ``group_overlapping_segments`` and turned into an array of length
    ``len(record)`` by ``segments_to_array``.
    """
    homopolymer_specs = [
        AvoidPattern(homopolymer_pattern(base, 5)) for base in 'ATGC'
    ]
    repeat_specs = [
        AvoidPattern(repeated_kmers(size, count))
        for size, count in ((4, 2), (3, 3), (2, 4))
    ]
    problem = DnaOptimizationProblem(
        sequence=record, constraints=homopolymer_specs + repeat_specs
    )

    # Collect (start, end) of every breach reported by a failing evaluation.
    breach_spans = []
    for evaluation in problem.constraints_evaluations():
        for feature in evaluation.locations_to_biopython_features():
            if not evaluation.passes:
                span = (feature.location.start, feature.location.end)
                breach_spans.append(span)

    merged_spans = group_overlapping_segments(breach_spans)
    return segments_to_array(merged_spans, len(record))
def test_record_with_multispec_feature():
    """A feature label joining three specs with ``&`` yields three constraints."""
    record = sequence_to_biopython_record(random_dna_sequence(100))
    annotate_record(record, label="@gc(40-60%/20bp) & @no(BsaI_site) & @keep")

    problem = DnaOptimizationProblem.from_record(record)

    assert len(problem.constraints) == 3
    # Only the first two constraints are inspected further.
    first, second, _third = problem.constraints
    assert first.mini == 0.4
    assert second.pattern.name == "BsaI"
def test_optimization_with_report_no_solution(tmpdir):
    """A failed ``optimization_with_report`` must still write to the target.

    Args:
        tmpdir: temporary directory fixture; a ``no_solution`` subfolder is
            created inside it and used as the report target.
    """
    start_sequence = random_dna_sequence(10000, seed=123)
    # The pattern must be avoided while no base may change.
    specs = [AvoidPattern(enzyme='BsmBI'), AvoidChanges()]
    problem = DnaOptimizationProblem(
        sequence=start_sequence, constraints=specs
    )

    target = os.path.join(str(tmpdir), 'no_solution')
    os.mkdir(target)
    assert not os.listdir(target)  # folder starts empty

    success, _message, _data = optimization_with_report(target, problem)

    assert not success
    assert os.listdir(target)  # a report is produced even on failure
def test_cuba_example_1():
    """Resolve then optimize the problem defined in ``cuba_example_1.gbk``.

    The constraints must fail initially and pass after resolution; the
    summed objective score must be below -100 before optimization and above
    -0.1 after it.
    """
    genbank_path = os.path.join(
        "tests", "tests_from_genbanks", "genbanks", "cuba_example_1.gbk"
    )
    problem = DnaOptimizationProblem.from_record(genbank_path)

    # Constraints: failing at first, satisfied once resolved.
    assert not problem.all_constraints_pass()
    problem.resolve_constraints()
    assert problem.all_constraints_pass()

    # Objectives: poor score before optimization, near zero afterwards.
    score_before = problem.objective_scores_sum()
    assert score_before < -100
    problem.optimize()
    score_after = problem.objective_scores_sum()
    assert score_after > -0.1
def test_cuba_example_1():
    """Resolve then optimize the problem defined in ``cuba_example_1.gbk``.

    The record is loaded with ``load_record`` first. The constraints must
    fail initially and pass after resolution; the summed objective score
    must be below -100 before optimization and above -0.1 after it.
    """
    genbank_path = os.path.join(
        'tests', 'tests_from_genbanks', 'genbanks', 'cuba_example_1.gbk'
    )
    problem = DnaOptimizationProblem.from_record(load_record(genbank_path))

    # Constraints: failing at first, satisfied once resolved.
    assert not problem.all_constraints_pass()
    problem.resolve_constraints()
    assert problem.all_constraints_pass()

    # Objectives: poor score before optimization, near zero afterwards.
    score_before = problem.objective_scores_sum()
    assert score_before < -100
    problem.optimize()
    score_after = problem.objective_scores_sum()
    assert score_after > -0.1
def test_avoid_change_as_objectives_basics():
    """A higher ``AvoidChanges`` boost must lead to fewer sequence changes.

    The same seeded sequence is optimized for windowed GC content against a
    passive ``AvoidChanges`` objective with boosts 0, 0.1, 0.2 and 1. The
    number of differences must be above 40 without boost, strictly decrease
    with the boost, and be 0 for the highest boost.
    """
    numpy.random.seed(123)

    results = []
    for boost in (0, 0.1, 0.2, 1):
        sequence = random_dna_sequence(1000, seed=123)
        gc_objective = EnforceGCContent(
            mini=0.45, maxi=0.55, window=80
        ).copy_with_changes(locations_span=300)
        keep_objective = AvoidChanges(boost=boost).as_passive_objective()
        problem = DnaOptimizationProblem(
            sequence=sequence, objectives=[gc_objective, keep_objective]
        )
        problem.optimize()
        results.append(
            sequences_differences(problem.sequence, problem.sequence_before)
        )

    assert results[0] > 40
    assert results[0] > results[1] > results[2] > results[3]
    assert results[-1] == 0
def test_pattern_and_reverse():
    """Removing a pattern in both orientations must take fewer than 70 edits.

    The sequence concatenates 25 copies of ``CGTCTC`` and 40 copies of
    ``GAGACG``; ``AvoidPattern('BsmBI_site')`` is the constraint and
    ``AvoidChanges`` the objective.
    """
    bsmbi = "CGTCTC"
    bsmbi_rev = "GAGACG"
    sequence = "".join(
        [10 * bsmbi, 25 * bsmbi_rev, 15 * bsmbi, 15 * bsmbi_rev]
    )
    problem = DnaOptimizationProblem(
        sequence,
        constraints=[AvoidPattern('BsmBI_site')],
        objectives=[AvoidChanges()],
    )

    problem.resolve_constraints()
    problem.optimize()

    total_edits = sum(problem.sequence_edits_as_array())
    assert total_edits < 70
def test_EnforceRegionsCompatibility():
    """Four 4bp regions must end up pairwise different by at least 2bp.

    ``EnforceRegionsCompatibility`` is combined with a windowed GC content
    constraint on a seeded random sequence. Both constraints must fail
    initially and pass after resolution; the pairwise condition is then
    re-checked directly on the final sequence.
    """
    numpy.random.seed(123)

    def compatibility_condition(location1, location2, problem):
        """Return True if the two regions differ at 2 or more positions."""
        seq1 = location1.extract_sequence(problem.sequence)
        seq2 = location2.extract_sequence(problem.sequence)
        return sequences_differences(seq1, seq2) >= 2

    locations = [(0, 4), (50, 54), (100, 104), (150, 154)]
    problem = DnaOptimizationProblem(
        sequence=random_dna_sequence(200, seed=123),
        constraints=[
            EnforceRegionsCompatibility(
                locations=locations,
                compatibility_condition=compatibility_condition,
                condition_label="2bp difference",
            ),
            EnforceGCContent(mini=0.4, maxi=0.6, window=40),
        ],
        logger=None,
    )
    assert not any([e.passes for e in problem.constraints_evaluations()])
    problem.resolve_constraints()
    assert problem.all_constraints_pass()

    seq = problem.sequence
    # ``all`` is required here: asserting the bare list of booleans would
    # always succeed, because a non-empty list is truthy whatever it holds.
    assert all(
        sequences_differences(seq[s1:e1], seq[s2:e2]) >= 2
        for (s1, e1), (s2, e2) in itertools.combinations(locations, 2)
    )