def experiment_1(seed=123):
    """Run the two-stage GC-content optimization behind test_determinism.py.

    A random protein is reverse-translated, then the DNA is optimized twice
    under an ``EnforceTranslation`` constraint: first towards a GC content
    target of 1, then back towards a target of 0.5.

    Parameters
    ----------
    seed
      Value passed to ``np.random.seed`` before anything else happens.

    Returns
    -------
    The sequence of the problem after the second optimization.
    """
    np.random.seed(seed)
    current_sequence = dc.reverse_translate(dc.random_protein_sequence(50))

    # First pass maximizes the GC content, second pass brings it back to 50%.
    for gc_target in (1, 0.5):
        problem = dc.DnaOptimizationProblem(
            sequence=current_sequence,
            constraints=[dc.EnforceTranslation()],
            objectives=[dc.EnforceGCContent(target=gc_target)],
            logger=None,
        )
        problem.optimize()
        current_sequence = problem.sequence

    return current_sequence
def test_EnforceTranslation_bacterial_valine():
    """With start_codon="keep", the leading ATG/GTG codon must survive."""
    table_name = "Bacterial"
    protein = "LLTMMVTTTTVMVL"
    body_dna = reverse_translate(protein, table=table_name)

    # Each case is (codon placed before the CDS, codon expected afterwards).
    start_codon_cases = [
        ("ATG", "ATG"),  # methionine stays the only methionine codon
        ("GTG", "GTG"),  # valine-start-codon stays the only valine-start-codon
    ]
    for codon_before, codon_after in start_codon_cases:
        keep_start = EnforceTranslation(
            genetic_table="Bacterial", start_codon="keep"
        )
        problem = DnaOptimizationProblem(
            sequence=codon_before + body_dna,
            constraints=[keep_start],
            objectives=[EnforceChanges()],
            logger=None,
        )
        assert problem.constraints[0].translation == "MLLTMMVTTTTVMVL"

        problem.optimize()

        translated = translate(
            problem.sequence, table_name, assume_start_codon=True
        )
        assert translated == "M" + protein
        assert problem.sequence[:3] == codon_after
def test_EnforceTranlation():
    """AvoidPattern("AAA") fails initially and passes once constraints are resolved."""
    numpy.random.seed(1234)
    protein = random_protein_sequence(50, seed=123)
    specs = [AvoidPattern("AAA"), EnforceTranslation()]
    problem = DnaOptimizationProblem(
        sequence=reverse_translate(protein),
        constraints=specs,
    )

    passes_before = problem.all_constraints_pass()
    assert not passes_before

    problem.resolve_constraints()

    passes_after = problem.all_constraints_pass()
    assert passes_after
def test_EnforceTranlationError():
    """Providing a location that is not multiple of 3 raises an error"""
    numpy.random.seed(1234)
    protein = random_protein_sequence(50, seed=123)
    dna = reverse_translate(protein)
    # Location 0-16 spans 16 nucleotides, which is not a whole number of codons.
    bad_constraint_location = (0, 16)

    with pytest.raises(ValueError) as err:
        DnaOptimizationProblem(
            sequence=dna,
            constraints=[EnforceTranslation(location=bad_constraint_location)],
        )

    # Checked after the ``with`` block: statements following the raising call
    # inside the block would never execute.
    error_text = str(err.value)
    assert "Location 0-16(+) has length 16" in error_text
def test_codon_optimize_harmonized_short_sequence():
    """Harmonized optimization lifts the score of a short D/K-only CDS."""
    initial_dna = reverse_translate("DDDKKKKKK")
    objective = CodonOptimize(species='b_subtilis', mode='harmonized')
    problem = DnaOptimizationProblem(
        sequence=initial_dna,
        constraints=[EnforceTranslation()],
        objectives=[objective],
    )

    score_before = problem.objective_scores_sum()
    assert score_before < -7

    problem.optimize()

    score_after = problem.objective_scores_sum()
    assert -1 < score_after
def test_codon_optimize_harmonized():
    """Harmonized e_coli optimization improves the score of a seeded CDS."""
    numpy.random.seed(123)
    initial_dna = reverse_translate(random_protein_sequence(500, seed=123))
    objective = CodonOptimize(species='e_coli', mode='harmonized')
    problem = DnaOptimizationProblem(
        sequence=initial_dna,
        constraints=[EnforceTranslation()],
        objectives=[objective],
    )

    # The un-optimized score is expected inside a known window.
    score_before = problem.objective_scores_sum()
    assert (-700 < score_before < -600)

    problem.optimize()

    score_after = problem.objective_scores_sum()
    assert (-350 < score_after)
def test_codon_optimize_bestcodon():
    """Default CodonOptimize for e_coli reaches a score of exactly 0."""
    numpy.random.seed(123)
    initial_dna = reverse_translate(random_protein_sequence(3000, seed=123))
    problem = DnaOptimizationProblem(
        sequence=initial_dna,
        constraints=[EnforceTranslation()],
        objectives=[CodonOptimize(species='e_coli')],
    )

    score_before = problem.objective_scores_sum()
    assert score_before < 0

    problem.optimize()

    score_after = problem.objective_scores_sum()
    assert score_after == 0
def test_create_new_sequence(self):
    """The sequence returned by the optimizer must still encode the target protein."""
    expected_protein = 'MAAATCAGAGAAAAC'
    naive_dna = reverse_translate(expected_protein)

    # No codon usage table and no previously proposed sequences.
    optimized_dna = self.optimize.create_new_sequence(naive_dna, None, [])

    translated = Seq(optimized_dna).translate()
    self.assertEqual(translated, expected_protein)
def test_maximal_protein_sequence_change():
    """EnforceChanges under EnforceTranslation yields 238 edits, same protein."""
    np.random.seed(123)
    protein = dc.random_protein_sequence(200)
    initial_dna = dc.reverse_translate(protein)
    problem = dc.DnaOptimizationProblem(
        sequence=initial_dna,
        constraints=[dc.EnforceTranslation()],
        objectives=[dc.EnforceChanges()],
    )

    problem.resolve_constraints()
    problem.optimize()

    edit_count = problem.number_of_edits()
    assert edit_count == 238
    # The encoded protein must be unchanged by the edits.
    assert dc.translate(problem.sequence) == protein
def test_EnforceTranslation_error_location_smaller_than_translation():
    """A translation longer than the sequence can encode raises a ValueError.

    The DNA is reverse-translated from a protein generated with length
    argument 15, while the enforced translation is generated with length
    argument 30; the error message is expected to start with "Window size".
    """
    numpy.random.seed(1234)
    short_dna = reverse_translate(random_protein_sequence(15, seed=123))
    long_translation = random_protein_sequence(30, seed=111)

    with pytest.raises(ValueError) as err:
        _ = DnaOptimizationProblem(
            sequence=short_dna,
            constraints=[EnforceTranslation(translation=long_translation)],
            logger=None,
        )

    error_text = str(err.value)
    assert error_text.startswith("Window size")
def test_EnforceTranlationReversed():
    """EnforceTranslation on the -1 strand still lets AvoidPattern be resolved."""
    numpy.random.seed(1234)
    forward_dna = reverse_translate(random_protein_sequence(50, seed=123))
    reversed_dna = reverse_complement(forward_dna)

    # The coding region covers the whole sequence, read on the reverse strand.
    cds_location = (0, len(forward_dna), -1)
    problem = DnaOptimizationProblem(
        sequence=reversed_dna,
        constraints=[
            AvoidPattern("AGC"),
            EnforceTranslation(location=cds_location),
        ],
    )

    passes_before = problem.all_constraints_pass()
    assert not passes_before

    problem.resolve_constraints()

    passes_after = problem.all_constraints_pass()
    assert passes_after
def test_codon_optimize_harmonize_rca_short_sequence():
    """The harmonize_rca method (e_coli -> h_sapiens) improves the score."""
    initial_dna = reverse_translate(random_protein_sequence(500, seed=123))
    objective = CodonOptimize(
        species="h_sapiens",
        original_species="e_coli",
        method="harmonize_rca",
    )
    problem = DnaOptimizationProblem(
        sequence=initial_dna,
        constraints=[EnforceTranslation()],
        objectives=[objective],
        logger=None,
    )

    score_before = problem.objective_scores_sum()
    assert score_before < -123

    problem.optimize()

    score_after = problem.objective_scores_sum()
    assert -74 < score_after
def experiment_2(seed=123):
    """Codon-optimize a random CDS for e_coli under a windowed GC constraint.

    Parameters
    ----------
    seed
      Value passed to ``np.random.seed`` before the sequence is generated.

    Returns
    -------
    The problem's sequence after constraint resolution and optimization.
    """
    np.random.seed(seed)
    protein = dc.random_protein_sequence(1000)
    hard_constraints = [
        dc.EnforceTranslation(),
        dc.EnforceGCContent(mini=0.4, maxi=0.6, window=50),
    ]
    problem = dc.DnaOptimizationProblem(
        sequence=dc.reverse_translate(protein),
        constraints=hard_constraints,
        objectives=[dc.CodonOptimize(species="e_coli")],
        logger=None,
    )

    # Constraints are satisfied first, then the objective is optimized.
    problem.resolve_constraints()
    problem.optimize()
    return problem.sequence
def test_codon_optimize_match_usage():
    """The match_codon_usage method improves the e_coli score of a seeded CDS."""
    numpy.random.seed(123)
    initial_dna = reverse_translate(random_protein_sequence(500, seed=123))
    objective = CodonOptimize(species="e_coli", method="match_codon_usage")
    problem = DnaOptimizationProblem(
        sequence=initial_dna,
        constraints=[EnforceTranslation()],
        objectives=[objective],
        logger=None,
    )

    # The un-optimized score is expected inside a known window.
    score_before = problem.objective_scores_sum()
    assert -600 < score_before < -550

    problem.optimize()

    score_after = problem.objective_scores_sum()
    assert -350 < score_after
def test_codon_optimize_match_usage_short_sequence():
    """match_codon_usage on a short D/K-only CDS gives a known final sequence."""
    numpy.random.seed(123)
    initial_dna = reverse_translate("DDDKKKKKK")
    objective = CodonOptimize(
        species="b_subtilis", method="match_codon_usage"
    )
    problem = DnaOptimizationProblem(
        sequence=initial_dna,
        constraints=[EnforceTranslation()],
        objectives=[objective],
        logger=None,
    )

    score_before = problem.objective_scores_sum()
    assert score_before < -5.5

    problem.optimize()

    assert -0.6 < problem.objective_scores_sum()
    # Printed so the final score shows up in captured output.
    print(problem.objective_scores_sum())
    expected_dna = "GATGATGACAAGAAAAAGAAAAAAAAA"
    assert problem.sequence == expected_dna
def execute(self, params : dict):
    """Produce ``num_results`` codon-optimized DNA sequences for a protein.

    Reads ``params['aa_sequence']`` (an object exposing ``get_sequence()``)
    and ``params['num_results']``.  Each run is handed the sequences proposed
    by the earlier runs through ``existing_sequences``.

    Returns a ``Collection`` wrapping one ``NucleotideSequence`` per run.
    """
    protein_sequence = params['aa_sequence'].get_sequence()
    # No codon usage table is supplied to the optimizer.
    codon_usage_table = None
    num_results = params['num_results']
    naive_target_sequence = reverse_translate(protein_sequence)

    proposed_sequences : List[str] = []
    for run_index in range(num_results):
        print('Optimization run %s of %s' % (run_index, num_results))
        proposed_sequences.append(
            self.create_new_sequence(
                naive_target_sequence=naive_target_sequence,
                codon_usage_table=codon_usage_table,
                existing_sequences=proposed_sequences,
            )
        )

    wrapped_sequences = []
    for dna in proposed_sequences:
        wrapped_sequences.append(NucleotideSequence(dna))
    return Collection(wrapped_sequences)
"""Example of use of the AvoidPattern specification"""

from dnachisel import (
    DnaOptimizationProblem,
    random_protein_sequence,
    reverse_translate,
    CodonOptimize,
    EnforceTranslation,
    AvoidPattern,
    EnforceGCContent,
)

# Start from a naive reverse translation of a seeded random protein.
protein = random_protein_sequence(1000, seed=123)
sequence = reverse_translate(protein)

# Hard constraints: keep the protein, avoid BsmBI sites, bound windowed GC.
hard_constraints = [
    EnforceTranslation(),
    AvoidPattern("BsmBI_site"),
    EnforceGCContent(mini=0.4, maxi=0.6, window=60),
]
soft_objectives = [CodonOptimize(species="s_cerevisiae")]

problem = DnaOptimizationProblem(
    sequence=sequence,
    constraints=hard_constraints,
    objectives=soft_objectives,
)

print("\nBefore optimization:\n")
print(problem.constraints_text_summary())
print(problem.objectives_text_summary())

# Resolve the constraints first, then optimize the objective.
problem.resolve_constraints(final_check=True)
problem.optimize()
def domesticate(
    self,
    dna_sequence=None,
    protein_sequence=None,
    is_cds="default",
    codon_optimization=None,
    extra_constraints=(),
    extra_objectives=(),
    final_record_target=None,
    edit=False,
    barcode="",
    barcode_spacer="AA",
    report_target=None,
):
    """Domesticate a sequence.

    The sequence is placed between ``self.left_flank`` and
    ``self.right_flank``, a DnaChisel problem is built from the domesticator's
    own specifications plus the extra ones, and the problem is solved unless
    all constraints already pass and there is nothing to optimize.

    Parameters
    ----------

    dna_sequence
      The DNA sequence string to domesticate. A ``SeqRecord`` is also
      accepted, in which case the specifications read from the record by
      ``DnaOptimizationProblem.from_record`` are shifted by the length of
      the left flank and added to the extra constraints and objectives.

    protein_sequence
      Amino-acid sequence of the protein, which will be converted into
      a DNA sequence string. When provided, it replaces ``dna_sequence``
      and forces ``is_cds`` to True.

    is_cds
      If True, sequence edits are restricted to synonymous mutations.
      The value "default" means "use ``self.cds_by_default``".

    codon_optimization
      Either None for no codon optimization or the name of an organism
      supported by DnaChisel.

    extra_constraints
      List of extra constraints to apply to the domesticated sequences.
      Each constraint is either a DnaChisel constraint or a function
      (dna_sequence => DnaChisel constraint).

    extra_objectives
      List of extra optimization objectives to apply to the domesticated
      sequences. Each objective is either a DnaChisel objective or a
      function (dna_sequence => DnaChisel objective).

    final_record_target
      Path to the file where to write the final genbank.

    edit
      Turn to True to allow sequence edits (if it is False and not all
      constraints are originally satisfied, a failed domestication result,
      i.e. with attribute ``success`` set to False, will be returned).

    report_target
      Target for the sequence optimization report (a folder path, or a zip
      path).

    barcode
      A sequence of DNA that will be added to the left of the sequence once
      the domestication is done.
      NOTE(review): not referenced in this method's body - presumably
      handled by a caller; confirm.

    barcode_spacer
      Nucleotides to be added between the barcode and the enzyme (optional,
      the idea here is that they will make sure to avoid the creation of
      unwanted cutting sites).
      NOTE(review): not referenced in this method's body either; confirm.

    Returns
    -------

    result
      A ``DomesticationResult`` built from (sequence before optimization,
      final_record, edits_record, report_data, success, message).
    """
    if is_cds == "default":
        is_cds = self.cds_by_default

    if isinstance(dna_sequence, SeqRecord):
        problem = DnaOptimizationProblem.from_record(dna_sequence)
        # Record-relative locations must be shifted to account for the left
        # flank that gets prepended to the sequence below.
        for spec in problem.constraints + problem.objectives:
            spec.location += len(self.left_flank)
        extra_constraints = list(extra_constraints) + problem.constraints
        # Build from extra_objectives, not extra_constraints: otherwise the
        # caller's objectives are dropped and every constraint is scored as
        # an objective as well.
        extra_objectives = list(extra_objectives) + problem.objectives

    if protein_sequence is not None:
        is_cds = True
        dna_sequence = reverse_translate(protein_sequence)

    # Specifications may be given as factories taking the DNA sequence.
    constraints = [
        c(dna_sequence) if hasattr(c, "__call__") else c
        for c in list(extra_constraints) + self.constraints
    ]
    # Location of the part itself inside the flanked sequence.
    location = Location(
        len(self.left_flank), len(self.left_flank) + len(dna_sequence)
    )
    if is_cds:
        constraints.append(EnforceTranslation(location=location))

    objectives = [
        o(dna_sequence) if hasattr(o, "__call__") else o
        for o in list(extra_objectives) + self.objectives
    ]
    if codon_optimization:
        objectives.append(
            CodonOptimize(species=codon_optimization, location=location)
        )
    if self.minimize_edits:
        objectives.append(AvoidChanges())

    extended_sequence = self.left_flank + dna_sequence + self.right_flank

    # A non-coding part that may not be edited is frozen entirely.
    if (not is_cds) and (not edit):
        constraints.append(AvoidChanges())

    problem = DnaOptimizationProblem(
        extended_sequence,
        constraints=constraints,
        objectives=objectives,
        logger=self.logger,
    )
    all_constraints_pass = problem.all_constraints_pass()
    # The AvoidChanges objective added for minimize_edits does not count as
    # a "real" objective when deciding whether any work is needed.
    no_objectives = (len(problem.objectives) - self.minimize_edits) == 0

    report_data = None
    optimization_successful = True
    message = ""
    if not (all_constraints_pass and no_objectives):
        problem.n_mutations = self.simultaneous_mutations
        if report_target is not None:
            (success, message, report_data) = problem.optimize_with_report(
                target=report_target, project_name=self.name
            )
            optimization_successful = success
        else:
            # Best effort: a solver failure is reported through the result
            # object rather than propagated to the caller.
            try:
                problem.resolve_constraints()
                problem.optimize()
            except Exception as err:
                message = str(err)
                optimization_successful = False

    final_record = problem.to_record(
        with_original_features=True,
        with_original_spec_features=False,
        with_constraints=False,
        with_objectives=False,
    )
    edits_record = problem.to_record(
        with_original_features=True,
        with_original_spec_features=False,
        with_constraints=False,
        with_objectives=False,
        with_sequence_edits=True,
    )
    if final_record_target is not None:
        SeqIO.write(final_record, final_record_target, "genbank")

    return DomesticationResult(
        problem.sequence_before,
        final_record,
        edits_record,
        report_data,
        optimization_successful,
        message,
    )
all_9mers = [sequence[i:i + 9] for i in range(len(sequence) - 9)] number_of_non_unique_9mers = sum([ count for ninemer, count in Counter(all_9mers).items() if count > 1 ]) score = -(9.0 * number_of_non_unique_9mers) / len(sequence) return SpecEvaluation(self, problem, score=score, locations=[Location(0, len(sequence))], message="Score: %.02f (%d non-unique ninemers)" % (score, number_of_non_unique_9mers)) def __str__(self): """String representation.""" return "MinimizeNinemersScore" sequence = reverse_translate(random_protein_sequence(300)) problem = DnaOptimizationProblem(sequence=sequence, constraints=[EnforceTranslation()], objectives=[MinimizeNinemersScore()]) print("\n=== Status before optimization ===") print(problem.objectives_text_summary()) problem.optimize() print("\n=== Status after optimization ===") print(problem.objectives_text_summary()) print(problem.constraints_text_summary(failed_only=True))
def load_inserts(inputs):
    """Build reverse-translated DNA records from files and raw sequences.

    Each entry of ``inputs`` is handled according to what it is:

    - an existing ``.fasta`` file: every record in it is kept, with its
      sequence replaced by the reverse translation of the original one;
    - an existing ``.pdb`` file: one record per entry yielded by the
      "pdb-atom" parser, renamed ``<file stem>_<chain>``;
    - an existing file with any other extension: the program exits with an
      "extension not recognized" message;
    - anything else: the entry itself is reverse-translated and wrapped in a
      new record named ``unknown_seq<N>``, N counting such entries from 1.

    An earlier variant driven by an explicit input mode (including raw-DNA
    modes) used to live here as commented-out code; it was removed and is
    available from version control.

    Parameters
    ----------
    inputs
      Iterable of file paths and/or sequence strings.

    Returns
    -------
    A list of records, in input order.

    Raises
    ------
    SystemExit
      If an existing file has an extension other than ``.fasta``/``.pdb``.
    """
    # Local import: ``exit()`` is an interactive helper injected by ``site``
    # and is not guaranteed to exist; ``sys.exit`` raises the same SystemExit.
    import sys

    rec_counter = 1
    inserts = []
    for this_input in inputs:
        if not os.path.isfile(this_input):
            # Not a file on disk: treat the entry as a literal sequence.
            name = "unknown_seq%d" % rec_counter
            record = SeqRecord(
                Seq(reverse_translate(this_input), IUPAC.unambiguous_dna),
                id=name,
                name=name,
                description="domesticator-optimized DNA sequence",
            )
            rec_counter += 1
            inserts.append(record)
            continue

        ext = os.path.splitext(this_input)[1]
        if ext == '.fasta':
            for record in SeqIO.parse(this_input, 'fasta'):
                record.seq = Seq(
                    reverse_translate(record.seq), IUPAC.unambiguous_dna
                )
                inserts.append(record)
        elif ext == '.pdb':
            # The file stem is the same for every chain; compute it once.
            stem = os.path.splitext(os.path.basename(this_input))[0]
            for record in SeqIO.parse(this_input, "pdb-atom"):
                name = stem + "_" + record.annotations['chain']
                record.seq = Seq(
                    reverse_translate(record.seq), IUPAC.unambiguous_dna
                )
                record.id = name
                record.name = name
                inserts.append(record)
        else:
            sys.exit("extension not recognized: " + ext)
    return inserts
codon_table = codon_table_11 else: print("\ngenetic codes other than 11 (Bacterial, Archaeal) not supported") #Import target gene #To do: refactor to process multiple input sequences gene_object = SeqIO.parse(fasta_path, "fasta") for dna_seq in gene_object: dna_id = dna_seq.id print("\nImporting target gene " + dna_id) dna = str(dna_seq.seq) gene = dna if protein_flag: gene = reverse_translate(gene) if input_path and taxid: print("\ngene list and taxonomic ID both provided. Defaulting to gene list") #Import gene list for RSCU calculation if input_path: print("\nImporting genes for RSCU calculation") seq_list = [] counter = 0 n_count = 0 seq_object = SeqIO.parse(input_path, "fasta") for seqs in seq_object: seq_id = seqs.id seq = str(seqs.seq) seq_list.append(seq)