def test_avoid_repeated_small_kmers():
    """Check that AvoidPattern(RepeatedKmerPattern(3, 3)) gets resolved.

    The constraint must fail on the initial sequence and pass once
    ``resolve_constraints()`` has run.
    """
    initial_sequence = (
        "AGAAGAAGAAGAAGAAGATTTTTTTTTTTTTGGAGGAGGAGGACCCCCCCCCCCCGAGG"
    )
    no_repeated_kmers = AvoidPattern(RepeatedKmerPattern(3, 3))
    optimization = DnaOptimizationProblem(
        sequence=initial_sequence,
        constraints=[no_repeated_kmers],
    )

    # The starting sequence must violate the constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_AvoidStopCodons():
    """Check that AvoidStopCodons fails on "ATTGCCATCTAA" and can be resolved."""
    numpy.random.seed(123)

    initial_sequence = "ATTGCCATCTAA"
    optimization = DnaOptimizationProblem(
        sequence=initial_sequence,
        constraints=[AvoidStopCodons()],
    )

    # The starting sequence must violate the constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_AvoidNonUniqueSegments_from_polyAs():
    """Check AvoidNonUniqueSegments(3, location=(10, 30)) on a 40-A sequence.

    The constraint must fail on the initial sequence and pass once
    ``resolve_constraints()`` has run.
    """
    poly_a = 40 * "A"
    unique_segments = AvoidNonUniqueSegments(3, location=(10, 30))
    optimization = DnaOptimizationProblem(
        sequence=poly_a,
        constraints=[unique_segments],
    )

    # The starting sequence must violate the constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_avoid_pattern_basics():
    """Check AvoidPattern(enzyme="BsaI") on a seeded 10000-nt random sequence.

    The constraint must fail on the initial sequence and pass once
    ``resolve_constraints()`` has run.
    """
    numpy.random.seed(123)

    initial_sequence = random_dna_sequence(10000, seed=123)
    no_bsai = AvoidPattern(enzyme="BsaI")
    optimization = DnaOptimizationProblem(
        sequence=initial_sequence,
        constraints=[no_bsai],
    )

    # The starting sequence must violate the constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_AvoidNonuniqueSegments_as_constraint():
    """Check AvoidNonuniqueSegments(8) on a seeded 1000-nt random sequence.

    The constraint must fail on the initial sequence and pass once
    ``resolve_constraints()`` has run.
    """
    numpy.random.seed(123)

    initial_sequence = random_dna_sequence(1000, seed=123)
    optimization = DnaOptimizationProblem(
        sequence=initial_sequence,
        constraints=[AvoidNonuniqueSegments(8)],
    )

    # The starting sequence must violate the constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_avoid_pattern_overlapping_locations():
    """Check AvoidPattern("NAN") on an alternating AG sequence.

    After resolution all constraints must pass and no "A" may remain
    anywhere except possibly at the first or last position.
    """
    numpy.random.seed(123)

    alternating_ag = "AGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAG"
    optimization = DnaOptimizationProblem(
        sequence=alternating_ag,
        constraints=[AvoidPattern("NAN")],
    )

    # The starting sequence must violate the constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()

    # Only the two extremities are allowed to still hold an "A".
    inner_region = optimization.sequence[1:-1]
    assert "A" not in inner_region
def test_UniquifyAllKmers_as_constraint():
    """Check UniquifyAllKmers(8) on a seeded 1000-nt random sequence.

    The constraint must fail on the initial sequence and pass once
    ``resolve_constraints()`` has run.
    """
    numpy.random.seed(123)

    initial_sequence = random_dna_sequence(1000, seed=123)
    optimization = DnaOptimizationProblem(
        sequence=initial_sequence,
        constraints=[UniquifyAllKmers(8)],
        logger=None,
    )

    # The starting sequence must violate the constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_UniquifyAllKmers_from_polyAs():
    """Check UniquifyAllKmers(3, location=(10, 30)) on a 40-A sequence.

    The constraint must fail on the initial sequence and pass once
    ``resolve_constraints()`` has run.
    """
    poly_a = 40 * "A"
    unique_kmers = UniquifyAllKmers(3, location=(10, 30))
    optimization = DnaOptimizationProblem(
        sequence=poly_a,
        constraints=[unique_kmers],
        logger=None,
    )

    # The starting sequence must violate the constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_EnforceTranlation():
    """Check AvoidPattern("AAA") combined with EnforceTranslation().

    The sequence is the reverse translation of a seeded random 50-residue
    protein. The constraints must fail initially and pass after resolution.
    """
    numpy.random.seed(1234)

    protein = random_protein_sequence(50, seed=123)
    coding_sequence = reverse_translate(protein)
    optimization = DnaOptimizationProblem(
        sequence=coding_sequence,
        constraints=[AvoidPattern("AAA"), EnforceTranslation()],
    )

    # The starting sequence must violate at least one constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_UniquifyAllKmers_from_polyAs_uncached():
    """Same poly-A scenario, with the constraint's ``use_cache`` set to False.

    Per the original author's note, disabling the cache makes the
    constraint go through another function, get_kmer_extractor.
    """
    uncached = UniquifyAllKmers(3, location=(10, 30))
    uncached.use_cache = False

    optimization = DnaOptimizationProblem(
        sequence=40 * "A",
        constraints=[uncached],
        logger=None,
    )

    # The starting sequence must violate the constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_AvoidStopCodons():
    """Check AvoidStopCodons on a sequence built from six explicit codons.

    After resolution all constraints must pass and the translation of the
    resulting sequence must not contain "*".
    """
    numpy.random.seed(123)

    codons = ["ATT", "TAG", "GCC", "TGA", "ATC", "TAA"]
    initial_sequence = "".join(codons)
    optimization = DnaOptimizationProblem(
        sequence=initial_sequence,
        constraints=[AvoidStopCodons()],
        logger=None,
    )

    # The starting sequence must violate the constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()

    protein = translate(optimization.sequence)
    assert "*" not in protein
def test_basics():
    """Check AvoidPattern(enzyme="BsaI") with EnforceTerminalGCContent.

    The sequence is a seeded 10000-nt random sequence drawn with custom
    nucleotide probabilities. The constraints must fail initially and pass
    after resolution.
    """
    numpy.random.seed(123)

    nucleotide_probas = {"A": 0.2, "T": 0.2, "G": 0.3, "C": 0.3}
    initial_sequence = random_dna_sequence(
        10000, probas=nucleotide_probas, seed=123
    )
    specifications = [
        AvoidPattern(enzyme="BsaI"),
        EnforceTerminalGCContent(mini=0.2, maxi=0.4, window_size=50),
    ]
    optimization = DnaOptimizationProblem(
        sequence=initial_sequence,
        constraints=specifications,
    )

    # The starting sequence must violate at least one constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_codon_optimize_as_hard_constraint():
    """Check CodonOptimize used as a constraint next to EnforceTranslation.

    Both specifications target Location(1000, 1300) of a seeded 2000-nt
    random sequence. They must fail initially and pass after resolution.
    """
    numpy.random.seed(123)

    initial_sequence = random_dna_sequence(2000, seed=123)
    specifications = [
        EnforceTranslation(location=Location(1000, 1300)),
        CodonOptimize(location=Location(1000, 1300), species="e_coli"),
    ]
    optimization = DnaOptimizationProblem(
        sequence=initial_sequence,
        constraints=specifications,
    )

    # The starting sequence must violate at least one constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_AvoidPattern_with_regular_expression():
    """Check AvoidPattern(r"GGT(.*)GAT") together with EnforceTranslation().

    The constraints must fail on the initial sequence and pass once
    ``resolve_constraints()`` has run.
    """
    fragments = (
        "ATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTG",
        "GTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGCGCGGC",
        "GAGGGCGAGGGCGATGCCACCAACGGCAAGCTGACCCTGAAGTTCATC",
    )
    initial_sequence = "".join(fragments)

    specifications = [EnforceTranslation(), AvoidPattern(r"GGT(.*)GAT")]
    optimization = DnaOptimizationProblem(
        sequence=initial_sequence,
        constraints=specifications,
        logger=None,
    )

    # The starting sequence must violate at least one constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_avoid_blast_matches():
    """Check AvoidBlastMatches built from an explicit list of sequences.

    The constraint must fail initially, report exactly 10 locations when
    evaluated, and pass once ``resolve_constraints()`` has run.
    """
    sequences_to_avoid = [
        "GTCCTCATGCGAAAGCTACGATCGCCAACCCTGT",
        "ACCCACCTCGTTACGTCCACGGCACGAGGAATGATCTCGAGTTGCTTT",
    ]
    no_blast_match = AvoidBlastMatches(
        sequences=sequences_to_avoid, min_align_length=8
    )

    # NOTE(review): `sequence` is not assigned inside this function; it is
    # presumably a module-level name defined elsewhere -- confirm.
    optimization = DnaOptimizationProblem(
        sequence=sequence,
        constraints=[no_blast_match],
    )

    # The starting sequence must violate the constraint.
    assert not optimization.all_constraints_pass()

    evaluation = no_blast_match.evaluate(optimization)
    assert len(evaluation.locations) == 10

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_EnforceTranlationReversed():
    """Check EnforceTranslation with a (start, end, -1) location.

    The problem sequence is the reverse complement of a reverse-translated
    random protein, combined with AvoidPattern("AGC"). The constraints must
    fail initially and pass after resolution.
    """
    numpy.random.seed(1234)

    protein = random_protein_sequence(50, seed=123)
    sequence = reverse_translate(protein)
    rev_sequence = reverse_complement(sequence)

    specifications = [
        AvoidPattern("AGC"),
        EnforceTranslation(location=(0, len(sequence), -1)),
    ]
    optimization = DnaOptimizationProblem(
        sequence=rev_sequence,
        constraints=specifications,
    )

    # The starting sequence must violate at least one constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_EnforceGCContents():
    """Check AvoidPattern(enzyme="BsaI") with a windowed EnforceGCContent.

    An EnforceGCContent(target=0.4) objective is also attached to the
    problem, but only constraint resolution is exercised here. The
    constraints must fail initially and pass after resolution.
    """
    numpy.random.seed(123)

    initial_sequence = random_dna_sequence(10000, seed=123)
    hard_constraints = [
        AvoidPattern(enzyme="BsaI"),
        EnforceGCContent(mini=0.3, maxi=0.7, window=50),
    ]
    soft_objectives = [EnforceGCContent(target=0.4)]
    optimization = DnaOptimizationProblem(
        sequence=initial_sequence,
        constraints=hard_constraints,
        objectives=soft_objectives,
    )

    # The starting sequence must violate at least one constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_basics():
    """Check AvoidPattern("BsaI_site") with EnforceTerminalGCContent.

    The sequence is a seeded 10000-nt random sequence drawn with custom
    nucleotide probabilities. The constraints must fail initially and pass
    after resolution.
    """
    numpy.random.seed(123)

    nucleotide_probas = {"A": 0.2, "T": 0.2, "G": 0.3, "C": 0.3}
    initial_sequence = random_dna_sequence(
        10000, probas=nucleotide_probas, seed=123
    )
    specifications = [
        AvoidPattern("BsaI_site"),
        EnforceTerminalGCContent(mini=0.2, maxi=0.4, window_size=50),
    ]
    optimization = DnaOptimizationProblem(
        sequence=initial_sequence,
        constraints=specifications,
        logger=None,
    )

    # The starting sequence must violate at least one constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_avoid_phage_blast_matches():
    """Check AvoidBlastMatches against a BLAST database from GenomeCollection.

    The database path is obtained for taxid "697289" (nucleotide type).
    The constraint must fail on a seeded 30-nt random sequence and pass
    after resolution.
    """
    PHAGE_TAXID = "697289"
    genomes = GenomeCollection()
    blastdb = genomes.get_taxid_blastdb_path(PHAGE_TAXID, db_type="nucl")

    initial_sequence = random_dna_sequence(30, seed=123)
    no_phage_match = AvoidBlastMatches(
        blast_db=blastdb, min_align_length=10, word_size=7
    )
    optimization = DnaOptimizationProblem(
        sequence=initial_sequence,
        constraints=[no_phage_match],
        logger=None,
    )

    # The starting sequence must violate the constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_AvoidChanges_with_max_edits():
    """Check AvoidChanges(max_edits=2) combined with pattern constraints.

    On "ATATATATATA", the problem must avoid "ATATA" while enforcing six
    "A" and four "T" occurrences over location (0, 11, 1). The constraints
    must fail initially and pass after resolution.
    """
    numpy.random.seed(1)

    whole_sequence = (0, 11, 1)
    specifications = [
        AvoidChanges(max_edits=2),
        AvoidPattern("ATATA"),
        EnforcePatternOccurence("A", occurences=6, location=whole_sequence),
        EnforcePatternOccurence("T", occurences=4, location=whole_sequence),
    ]
    optimization = DnaOptimizationProblem(
        sequence="ATATATATATA",
        constraints=specifications,
        logger=None,
    )

    # The starting sequence must violate at least one constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_EnforceRegionsCompatibility():
    """Check EnforceRegionsCompatibility together with EnforceGCContent.

    Four 4-nt regions of a seeded 200-nt random sequence must pairwise
    differ by at least 2 (as measured by ``sequences_differences``), while
    a windowed GC-content constraint also holds. Initially no constraint
    passes; after resolution all of them do, and the pairwise condition is
    re-checked directly on the final sequence.
    """
    numpy.random.seed(123)

    def compatibility_condition(location1, location2, problem):
        """Return True if the two regions differ by at least 2."""
        seq1 = location1.extract_sequence(problem.sequence)
        seq2 = location2.extract_sequence(problem.sequence)
        return sequences_differences(seq1, seq2) >= 2

    locations = [(0, 4), (50, 54), (100, 104), (150, 154)]
    problem = DnaOptimizationProblem(
        sequence=random_dna_sequence(200, seed=123),
        constraints=[
            EnforceRegionsCompatibility(
                locations=locations,
                compatibility_condition=compatibility_condition,
                condition_label="2bp difference",
            ),
            EnforceGCContent(mini=0.4, maxi=0.6, window=40),
        ],
        logger=None,
    )

    # Every constraint is expected to fail on the starting sequence.
    assert not any([e.passes for e in problem.constraints_evaluations()])

    problem.resolve_constraints()
    assert problem.all_constraints_pass()

    # Independent verification on the final sequence. The results must be
    # reduced with all(): asserting the bare list would always succeed,
    # because a non-empty list is truthy whatever it contains.
    seq = problem.sequence
    assert all(
        sequences_differences(seq[s1:e1], seq[s2:e2]) >= 2
        for (s1, e1), (s2, e2) in itertools.combinations(locations, 2)
    )
def verify_constraints(self, sequence):
    """Tell whether `sequence` satisfies every `self.sequence_constraints`.

    Plain constraints are callables applied to `sequence`. Constraints that
    are DNA Chisel `Specification` instances are collected once into
    `self.dnachisel_constraints` (the attribute is reused on later calls)
    and evaluated together through a `DnaOptimizationProblem`.

    Raises ImportError if DNA Chisel constraints are present while
    `DNACHISEL_AVAILABLE` is false.
    """
    checks = self.sequence_constraints

    # Collect the DNA Chisel specifications only on the first call.
    if not hasattr(self, "dnachisel_constraints"):
        self.dnachisel_constraints = [
            candidate
            for candidate in self.sequence_constraints
            if isinstance(candidate, Specification)
        ]

    if self.dnachisel_constraints != []:
        if not DNACHISEL_AVAILABLE:
            raise ImportError(
                "Spotted DNA Chisel constraints, while "
                "DNA Chisel is not installed."
            )

        # An empty mutation space is passed so that it is not recomputed,
        # which would cost time and serve no purpose here.
        problem = DnaOptimizationProblem(
            sequence, self.dnachisel_constraints, mutation_space=[]
        )

        def dnachisel_check(seq):
            # The problem already holds the sequence; `seq` is unused.
            return problem.all_constraints_pass()

        plain_checks = [
            candidate
            for candidate in checks
            if not isinstance(candidate, Specification)
        ]
        checks = plain_checks + [dnachisel_check]

    return all(check(sequence) for check in checks)
def test_avoid_hairpins_on_extremities():
    """Check AvoidHairpins(stem_size=3, hairpin_window=8) at sequence ends.

    The first evaluation must report locations "[0-6, 32-39]", and all
    constraints must pass after resolution.

    See https://github.com/Edinburgh-Genome-Foundry/DnaChisel/issues/37
    """
    initial_sequence = "attcaatgggggggggggggggggggggggggtagccta"
    no_hairpins = AvoidHairpins(stem_size=3, hairpin_window=8)
    optimization = DnaOptimizationProblem(
        sequence=initial_sequence,
        constraints=[no_hairpins],
    )

    all_evaluations = optimization.constraints_evaluations()
    first_evaluation = all_evaluations.evaluations[0]
    assert str(first_evaluation.locations) == "[0-6, 32-39]"

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_random_compatible_dna_sequence():
    """Check that random_compatible_dna_sequence honours its constraints.

    The generated 1000-nt sequence must not contain "ATC" and must pass
    all constraints when wrapped in a DnaOptimizationProblem.
    """
    specifications = [
        EnforceGCContent(mini=0.4, maxi=0.6, window=50),
        AvoidPattern("ATC"),
    ]
    generated = random_compatible_dna_sequence(
        1000, constraints=specifications
    )
    optimization = DnaOptimizationProblem(
        sequence=generated,
        constraints=specifications,
    )

    assert "ATC" not in generated
    assert optimization.all_constraints_pass()
def test_avoid_hairpin_basics():
    """Check AvoidHairpins() on a sequence seeded with reverse complements.

    Ten random 30-mers are each embedded along with their reverse
    complement, separated and flanked by random 50-mers. The constraint
    must fail on that sequence and pass after resolution.
    """
    numpy.random.seed(123)

    stems = [random_dna_sequence(30) for _ in range(10)]

    # The random calls are kept in the original left-to-right order so the
    # seeded random stream yields the same sequence.
    pieces = []
    for stem in stems:
        pieces.extend(
            (
                random_dna_sequence(50),
                stem,
                random_dna_sequence(50),
                reverse_complement(stem),
                random_dna_sequence(50),
            )
        )
    full_sequence = "".join(pieces)

    optimization = DnaOptimizationProblem(
        full_sequence, constraints=[AvoidHairpins()]
    )

    # The starting sequence must violate the constraint.
    assert not optimization.all_constraints_pass()

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_AvoidRareCodons_as_constraint():
    """Check AvoidRareCodons(0.11, "e_coli") used as a hard constraint.

    Together with EnforceTranslation(), resolving the constraints is
    expected to make exactly 4 nucleotide edits while leaving the
    translation of the sequence unchanged.
    """
    numpy.random.seed(123)

    sequence = "ATG" "TTT" "ATA" "CCA" "CTT" "TAG"
    problem = DnaOptimizationProblem(
        sequence=sequence,
        constraints=[EnforceTranslation(), AvoidRareCodons(0.11, "e_coli")],
    )

    # The original asserted that constraints pass and that 4 edits were
    # made without ever resolving the problem. A freshly built problem has
    # no edits, so the resolution step has to run before those checks.
    assert not problem.all_constraints_pass()
    problem.resolve_constraints()

    assert problem.all_constraints_pass()
    assert problem.sequence_edits_as_array().sum() == 4
    assert translate(problem.sequence) == translate(sequence)
def test_AvoidPattern_with_jaspar_motifs():
    """Check AvoidPattern built from motifs read out of JASPAR_CONTENT.

    The constraint evaluations must initially report 2 locations in total,
    and all constraints must pass after resolution.
    """
    jaspar_handle = StringIO(JASPAR_CONTENT)
    motif_patterns = MotifPssmPattern.list_from_file(
        jaspar_handle, file_format="jaspar", relative_threshold=0.9
    )

    no_motifs = [AvoidPattern(motif) for motif in motif_patterns]
    optimization = DnaOptimizationProblem(
        sequence="GGGGGGGGGGTGCGTGATTAAAGGGGG",
        constraints=no_motifs,
    )

    breach_locations = optimization.constraints_evaluations().all_locations()
    assert len(breach_locations) == 2

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_EnforceSequence():
    """Check EnforceSequence, then EnforcePatternOccurence on strand -1.

    Part 1: for the symbols "W" and "S", a 15-nt stretch of that symbol is
    enforced at positions 50-65 of 25 * "ATGC" while "ATGC", "AAA" and
    "GGG" are avoided. After resolution, every nucleotide of the stretch
    must belong to the nucleotides paired with the symbol ("AT" or "GC").

    Part 2: two occurrences of "CTG" are enforced on strand -1. The
    constraint must fail initially and pass after resolution.
    """
    numpy.random.seed(1234)

    for symbol, nucleotides in [("W", "AT"), ("S", "GC")]:
        n_nucleotides = 15
        start = 50
        end = start + n_nucleotides
        specifications = [
            AvoidPattern("ATGC"),
            AvoidPattern("AAA"),
            AvoidPattern("GGG"),
            EnforceSequence(n_nucleotides * symbol, location=(start, end)),
        ]
        problem = DnaOptimizationProblem(
            sequence=25 * "ATGC",
            constraints=specifications,
        )
        problem.max_random_iters = 10000
        problem.resolve_constraints()

        enforced_region = problem.sequence[start:end]
        assert all(n in nucleotides for n in enforced_region)

    # Test -1 strand:
    seq = "ATG" + "CAG" + "AGCAAGGTGCTGCT"
    two_ctg_on_minus_strand = EnforcePatternOccurence(
        pattern="CTG",  # CAG on strand +1
        occurences=2,
        strand=-1,
        location=Location(start=0, end=50),
    )
    problem = DnaOptimizationProblem(
        sequence=seq,
        constraints=[two_ctg_on_minus_strand],
    )

    # The starting sequence must violate the constraint.
    assert not problem.all_constraints_pass()

    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_AllowPrimer():
    """Check that an AllowPrimer constraint can be resolved on a 40-A run.

    The constraint targets location (10, 30) with tmin=50, tmax=60 and
    max_homology_length=5, and receives three primers through
    ``avoid_heterodim_with``.
    """
    primers = ["ATTGCGCCAAACT", "TAATCCACCCTAAT", "ATTCACACTTCAA"]
    primer_friendly = AllowPrimer(
        tmin=50,
        tmax=60,
        max_homology_length=5,
        location=(10, 30),
        avoid_heterodim_with=primers,
    )
    optimization = DnaOptimizationProblem(
        sequence=40 * "A",
        constraints=[primer_friendly],
    )

    optimization.resolve_constraints()
    assert optimization.all_constraints_pass()
def test_enforce_pattern_basics():
    """Check EnforcePatternOccurence("ANANANANTT") across several seeds.

    For each seed, a 5000-nt random sequence gets two EnforceTranslation
    regions plus the pattern-occurrence constraint. The constraints must
    fail initially and pass after resolution.
    """
    numpy.random.seed(123)

    # Original author's notes on what each seed covers:
    #   2 and 3: no occurrence of the pattern where 1 is wanted
    #   123456: the pattern is over-represented (4 times instead of 1)
    for seed in (2, 3, 123456):
        sequence = random_dna_sequence(5000, seed=seed)
        specifications = [
            EnforceTranslation(location=Location(1000, 2500)),
            EnforceTranslation(location=Location(3000, 4500)),
            EnforcePatternOccurence(
                "ANANANANTT", location=Location(1100, 2150)
            ),
        ]
        optimization = DnaOptimizationProblem(
            sequence=sequence,
            constraints=specifications,
            logger=None,
        )

        # The starting sequence must violate at least one constraint.
        assert not optimization.all_constraints_pass()

        optimization.resolve_constraints()
        assert optimization.all_constraints_pass()