def __call__(self, dna_sequence: str) -> tuple:
    """
    Scores a DNA sequence by codon adaptation and by GC content.

    The sequence is read codon by codon (steps of three characters). For
    every codon the relative adaptiveness ``w`` is the usage of that codon
    divided by the highest usage among the codons returned by
    ``self.get_codons`` for the same amino acid. The CAI score is the
    geometric mean of all ``w`` values.

    The GC score is the absolute distance between the middle of
    ``self.gc_range`` and ``GC(dna_sequence)``, divided by 100
    (presumably ``GC`` returns a percentage -- confirm against its source).

    :param dna_sequence: DNA sequence to be scored
    :return: tuple ``(CAI_score, GC_score)``
    :raises ValueError: if the sequence does not contain a single codon
    """
    import math  # local import keeps this block self-contained

    get_aa = DegenerateTriplet()  # used to translate a codon into its amino acid

    # -- calculation of CAI --
    # The geometric mean is accumulated in log space: multiplying hundreds of
    # values below 1 underflows to 0.0 and would zero the score of long genes.
    log_w_sum = 0.0
    n_codons = 0
    has_zero_weight = False
    for i in range(0, len(dna_sequence) - 1, 3):
        codon = str(dna_sequence[i:i + 3])  # specifying given codon
        aa = get_aa.degenerate_codon_to_aminos(
            codon, self.codonUsage.table.forward_table)[0]  # corresponding amino acid
        all_codons = self.get_codons(aa, self.threshold)  # codon -> usage for this amino acid
        c_max = max(all_codons.values())  # maximal codon usage for the amino acid
        c_current = all_codons[codon]  # usage value of the codon itself
        w = c_current / c_max  # Relative Adaptiveness of the codon
        n_codons += 1
        if w > 0:
            log_w_sum += math.log(w)
        else:
            # a single zero weight makes the whole product (and the mean) zero
            has_zero_weight = True

    if n_codons == 0:
        raise ValueError("dna_sequence must contain at least one codon")

    # the root is taken over the number of codons that were actually scored
    CAI_score = 0.0 if has_zero_weight else math.exp(log_w_sum / n_codons)

    # -- calculation of GC content ratio --
    # true division: floor division would shift the target by half a percent
    # whenever the two bounds add up to an odd number
    gc_desired = (self.gc_range[0] + self.gc_range[1]) / 2
    gc_sequence = GC(dna_sequence)
    GC_score = abs(gc_desired - gc_sequence) / 100

    return (CAI_score, GC_score)  # final score function
def __init__(self, config, is_dna_sequence, is_mutations_as_codons):
    """
    Stores the configuration and flags and selects the codon usage data
    for the organism named in ``config.organism``.
    """
    self.config = config
    self.is_dna_sequence = is_dna_sequence
    self.is_mutations_as_codons = is_mutations_as_codons
    self.wild_dna_sequence = ""
    self.temp_calculator = config.temperature_config.create_calculator()
    self.gene = None

    # problem 1 specific
    # for future
    self.tm_distances = []
    self.avoided_motifs = config.avoided_motifs
    self.get_aa = DegenerateTriplet()

    # Only the usage table depends on the organism branch; the CodonUsage
    # object is built the same way for every organism.
    organism_name = config.organism
    if organism_name == 'e-coli':
        self.usage_table = UsageTable().ecoli_usage
    elif organism_name == 'yeast':
        self.usage_table = UsageTable().yeast_usage
    else:
        # any other organism is looked up by its name
        self.usage_table = Organism(organism_name).codon_table
    self.codonUsage = CodonUsage(organism_name)
def protein_from_dna(self, dna_sequence):
    """
    Translates a DNA sequence into a string of amino acids, one per codon,
    using the forward table of ``self.e_coli``.
    """
    translator = DegenerateTriplet()
    codon_starts = range(0, len(dna_sequence) - 1, 3)
    # the first amino acid reported for a codon is the one that is kept
    return ''.join(
        translator.degenerate_codon_to_aminos(
            str(dna_sequence[pos:pos + 3]),
            self.e_coli.table.forward_table)[0]
        for pos in codon_starts)
def mutation_coverage(self) -> float:
    """
    Returns a ratio (in [0,1]) between the number of requested aminos that
    are generated by the solution primers and the number of aminos
    requested in total (the old amino of every site is counted as
    requested as well).
    """
    # aminos requested on every mutation site, in the order of self.mutations
    requested_per_site = []
    for mut in self.mutations:
        wanted = {AminoAcid(a) for a in mut.new_aminos}
        wanted.add(AminoAcid(mut.old_amino))
        requested_per_site.append(wanted)

    # maps the offset of a site to its index in self.mutations
    site_index = {}
    for idx, mut in enumerate(self.mutations):
        site_index[Offset(mut.position)] = idx

    covered_per_site: List[Set[AminoAcid]] = [set() for _ in self.mutations]
    for site_set, primers in self.primers.items():
        ordered_sites = sorted(site_set)
        for primer in primers:
            # the n-th codon of a primer belongs to the n-th site (sorted)
            for codon_idx, codon in enumerate(primer.spec.codons):
                aminos = DegenerateTriplet.degenerate_codon_to_aminos(
                    codon, self.usages.table.forward_table)
                site = ordered_sites[codon_idx]
                covered_per_site[site_index[site]].update(aminos)

    requested_total = 0
    covered_total = 0
    for wanted, covered in zip(requested_per_site, covered_per_site):
        requested_total += len(wanted)
        covered_total += len(wanted & covered)
    return covered_total / requested_total
def test_mutations_on_sites(self):
    """
    Assures that on every site, the generated mutations coincide with user's input
    """
    pas_seq, config, is_mutations_as_codons, mutations, _, solution, _ = self.generate_example()
    # list of mutations is generated from the input
    mutations_list = parse_input_mutations(is_mutations_as_codons, mutations)
    # full sequence, offset value
    sequence, goi_offset = pas_seq.get_full_sequence_with_offset()
    generator = OligoGenerator(config, is_mutations_as_codons, config.organism)
    codon_usage = CodonUsage("e-coli")

    for frag in solution.get_fragments():
        frag_start = frag.get_start()
        oligos_set = generator(frag.get_sequence(solution.gene.sequence),
                               mutations, frag, goi_offset, 250)
        # filtering out mutations on this fragment
        mutations_on_fragment = mutations_on_fragments(
            frag_start, frag.get_end(), mutations_list, goi_offset)
        # get list of mutations for every mutation site
        mutations_on_site = self.get_mutations_on_sites(mutations_on_fragment)

        for site, mutations_i in mutations_on_site.items():
            # for every mutation site get the wild codon on this position
            wild_type_codon = self.get_codon_on_position(
                site, frag.get_sequence(sequence), goi_offset, frag_start)

            # distinct codons found on this site across all oligos
            mutated_codons_on_site = {
                self.get_codon_on_position(site, oligo.sequence, goi_offset,
                                           frag_start)
                for oligo in oligos_set
            }
            # The wild type codon may be carried by several oligos; discard()
            # drops it completely (list.remove only removed one occurrence)
            # and does not need an exception handler when it is absent.
            mutated_codons_on_site.discard(wild_type_codon)

            # amino acids created on this particular mutation site
            created_mutations = set()
            for codon in mutated_codons_on_site:
                created_mutations.update(
                    DegenerateTriplet.degenerate_codon_to_aminos(
                        codon, codon_usage.table.forward_table))

            with self.subTest(i=site):
                self.assertEqual(created_mutations, set(mutations_i))
def test_degenerate_union_decode(self):
    """
    Checks that the degenerate set cover of a codon list decodes back to
    exactly the aminos of the original codons.
    """
    forward_table = e_coli.table.forward_table
    codons = ["TTT", "TTA", "ATT", "GTT", "ACT", "AAT", "GAT", "GGG"]
    original_aminos = {forward_table[codon] for codon in codons}

    parsed_codons = [
        DegenerateTripletWithAminos.parse_from_codon_string(
            codon, forward_table) for codon in codons
    ]
    degenerate_codons = \
        DegenerateTripletWithAminos.set_cover_with_degenerate_code(
            parsed_codons)

    decoded_amino_sets = []
    for deg_codon in degenerate_codons:
        decoded = DegenerateTriplet.degenerate_codon_to_aminos(
            str(deg_codon), forward_table)
        decoded_amino_sets.append(set(decoded))

    # Before checking the decoding we just quickly check if the
    # set cover is disjoint.
    for first, second in itertools.combinations(decoded_amino_sets, 2):
        self.assertEqual(len(first & second), 0)

    self.assertEqual(original_aminos, set.union(*decoded_amino_sets))
def test_degenerate_codon_to_aminos(self):
    """
    Checks the aminos decoded from plain and degenerate codons.
    """
    forward_table_of = lambda: e_coli.table.forward_table
    expected_by_codon = (
        ("AAA", ["K"]),
        ("KAT", ["Y", "D"]),
        ("BGG", ["W", "R", "G"]),
    )
    for codon, expected in expected_by_codon:
        decoded = DegenerateTriplet.degenerate_codon_to_aminos(
            codon, forward_table_of())
        # order of the decoded aminos is irrelevant
        self.assertEqual(set(expected), set(decoded))
def __init__(self,
             config: PASConfig,
             is_mutations_as_codons,
             organism='e-coli'):
    """
    Initializing a class instance: reads the scoring parameters from the
    config and selects usage and translation tables for the organism.
    """
    standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
    self.dna_by_name = standard_table
    (self.threshold_usage, self.gc_range,
     self.use_degeneracy) = parse_input_config(config)
    # this instance is needed to later get the list of amino acids
    # generated by a given codon
    self.get_aa = DegenerateTriplet()

    if organism in ('e-coli', 'yeast'):
        # built-in organisms share the standard translation table
        built_in_usage = UsageTable()
        if organism == 'e-coli':
            self.usage_table = built_in_usage.ecoli_usage
        else:
            self.usage_table = built_in_usage.yeast_usage
        self.fw_table = standard_table.forward_table
    else:
        # other by name organism is chosen
        org = Organism(organism)
        self.usage_table = org.codon_table
        self.fw_table = org.translation_table.forward_table

    self.codonUsage = CodonUsage(organism)
    # initializing scoring instance
    self.scoring = TranslationScoring(self.threshold_usage, self.gc_range,
                                      self.codonUsage, self.usage_table)

    self.get_motifs = Motifs()
    # getting list of avoided motifs
    self.avoided_motifs = self.get_motifs(config.avoided_motifs)
    self.degeneracy = Degeneracy(config, organism)
    self.is_mutations_as_codons = is_mutations_as_codons
def test_non_degenerate_triplets(self):
    """
    Checks the expansion of degenerate codons into plain codons.
    """
    expected_expansions = {
        "AAA": ["AAA"],
        "NAA": ["AAA", "CAA", "TAA", "GAA"],
        "KAK": ["GAG", "GAT", "TAG", "TAT"],
        "WSY": ["AGC", "AGT", "TGC", "TGT", "ACC", "ACT", "TCC", "TCT"],
    }
    for degenerate_codon, expected in expected_expansions.items():
        expanded = DegenerateTriplet.get_all_non_degenerate_codons(
            degenerate_codon)
        # only the set of codons matters, not their order
        self.assertEqual(set(expanded), set(expected))
def degenerate_codon_to_aminos(codon: str, codonUsage) -> List:
    """
    Converts a degenerate codon string to a list of aminos generated
    by that codon.

    :param codon: codon of exactly three characters, possibly degenerate
    :param codonUsage: either an object exposing ``table.forward_table`` or
        the forward table (codon -> amino mapping) itself
    :return: distinct aminos in the order in which they are first produced;
        codons missing from the forward table are skipped
    """
    assert len(codon) == 3

    # Accept the forward table directly as well: previously the
    # AttributeError raised for a plain mapping was silently swallowed and
    # an empty list was returned.
    if hasattr(codonUsage, "table"):
        forward_table = codonUsage.table.forward_table
    else:
        forward_table = codonUsage

    coded_aminos = []
    for c in DegenerateTriplet.get_all_non_degenerate_codons(codon):
        try:
            coded_aminos.append(forward_table[c])
        except KeyError:
            # codon without an entry in the forward table -- nothing to add
            # (presumably stop codons; confirm against the table used)
            continue

    # dict.fromkeys removes duplicates and, unlike set(), keeps a stable order
    return list(dict.fromkeys(coded_aminos))
class OligoGenerator(object):
    """
    Function object for oligos generation for the fragment.
    """
    threshold_usage: float  # threshold for the codon frequency
    gc_range: List[int]  # desired GC content range
    organism: str  # organism chosen to use as codon frequency reference
    dna: str  # initial dna fragment
    mutations: List  # list of mutations, including mutation sites and probabilities
    aminos_with_probabilities: Dict  # dictionary of mutations with their probabilities grouped by mutation sites
    aminos_with_codons: Dict  # dictionary of mutations with corresponding codons grouped by mutation sites

    # upper bound on the number of accepted candidate solutions per call
    _MAX_CANDIDATE_SOLUTIONS = 100

    def __init__(self,
                 config: PASConfig,
                 is_mutations_as_codons,
                 organism='e-coli'):
        """
        Initializing a class instance: reads the scoring parameters from the
        config and selects usage and translation tables for the organism.
        """
        self.dna_by_name = CodonTable.unambiguous_dna_by_name["Standard"]
        self.threshold_usage, self.gc_range, self.use_degeneracy = parse_input_config(
            config)
        # this instance is needed to later get the list of amino acids
        # generated by a given codon
        self.get_aa = DegenerateTriplet()

        if organism == 'e-coli':
            # e_coli organism is chosen
            self.usage_table = UsageTable().ecoli_usage
            self.fw_table = CodonTable.unambiguous_dna_by_name[
                "Standard"].forward_table
        elif organism == 'yeast':
            # yeast organism is chosen
            self.usage_table = UsageTable().yeast_usage
            self.fw_table = CodonTable.unambiguous_dna_by_name[
                "Standard"].forward_table
        else:
            # other by name organism is chosen
            org = Organism(organism)
            self.usage_table = org.codon_table
            self.fw_table = org.translation_table.forward_table

        # built identically for every organism
        self.codonUsage = CodonUsage(organism)
        self.scoring = TranslationScoring(
            self.threshold_usage, self.gc_range, self.codonUsage,
            self.usage_table)  # initializing scoring instance

        self.get_motifs = Motifs()
        self.avoided_motifs = self.get_motifs(
            config.avoided_motifs)  # getting list of avoided motifs
        self.degeneracy = Degeneracy(config, organism)
        self.is_mutations_as_codons = is_mutations_as_codons

    def generate_solution(self, dna: str, mutations_list: List[tuple], start,
                          goi_offset) -> List[PASOligo]:
        """
        Main logic of solution's generation is implemented here:
        1. For a given mutation site generate random codons
        2. For these codons solve set cover problem
        3. Check if aminos from same degenerate codon share same probability
            3.1 If yes:
                4. In the list of aminos with their probabilities keep only
                   one amino from the ones sharing probabilities, and multiply
                   its probability by the number of covered aminos
            3.2 If no: Leave codons from p.1
        5. Proceed to next mutation site and repeat pp 1. - 4.
        6. Generate combinations with concentrations for different sites with
           the help of cartesian multiplication
        7. For every combination replace aminos on mutation sites with the
           previously selected codons (degenerate or normal ones)

        Every item of ``mutations_list`` is read as
        ``(site, amino or codon, probability)``.
        """
        import math  # local import keeps this block self-contained

        # get a list of mutation sites for a given fragment
        mutations_sites = Mutations.list_of_mutation_sites(mutations_list)
        mutations_on_site_with_prob = []
        chosen_codons_on_sites = []

        for site in mutations_sites:
            aminos_with_codons = {}  # amino -> codon chosen for it
            aminos_with_probabilities = {}  # amino -> requested probability
            for item in mutations_list:
                if item[0] != site:
                    continue
                if self.is_mutations_as_codons:
                    # getting amino for a codon chosen by the user
                    am = self.get_aa.degenerate_codon_to_aminos(
                        item[1], self.fw_table)[0]
                    aminos_with_codons[am] = item[1]
                    aminos_with_probabilities[am] = item[2]
                else:
                    # the user supplied an amino: pick a random codon for it
                    aminos_with_codons[item[1]] = Codons.return_random_codon(
                        self.codonUsage, self.threshold_usage,
                        self.usage_table, item[1])
                    aminos_with_probabilities[item[1]] = item[2]

            # checking if we need to take into account wild type codon;
            # a tolerance is used because summed float probabilities rarely
            # hit 1 exactly (e.g. 0.1 + 0.2 + 0.7)
            sum_of_probabilities = sum(aminos_with_probabilities.values())
            if not math.isclose(sum_of_probabilities, 1, abs_tol=1e-9):
                wild_type_prob = 1 - sum_of_probabilities
                wild_type_codon = Codons.get_wild_type_codon(
                    site, dna, start, goi_offset)
                wild_type_amino = self.get_aa.degenerate_codon_to_aminos(
                    wild_type_codon, self.fw_table)[0]
                # creating wild type record with corresponding codon
                aminos_with_codons[wild_type_amino] = wild_type_codon
                # creating wild type record with corresponding probability
                aminos_with_probabilities[wild_type_amino] = wild_type_prob

            if self.use_degeneracy:
                candidates_for_set_cover = find_candidates_for_set_cover(
                    aminos_with_probabilities)
                for candidate in candidates_for_set_cover:
                    set_cover = Codons.solve_set_cover(candidate,
                                                       self.degeneracy)
                    if len(aminos_with_codons) > len(set_cover):
                        # if the degeneracy problem is solved successfully -
                        # modify aminos_with_codons and
                        # aminos_with_probabilities dictionaries to reflect
                        # the result (recalculating the probabilities as well)
                        modify_lists(set_cover, aminos_with_probabilities,
                                     aminos_with_codons, self.get_aa,
                                     self.fw_table)

            # generate the final list of mutations on sites
            mutations_on_site_with_prob.append(aminos_with_probabilities)
            # generate the final list of corresponding codons
            chosen_codons_on_sites.append(aminos_with_codons)

        # find all combinations of mutations for a given fragment and
        # calculate the concentrations
        mutations_combinations_with_probabilitites = Mutations.generate_mutation_combinations(
            mutations_on_site_with_prob)
        # generate oligos from the combinations
        return generate_oligos_from_combinations(
            mutations_combinations_with_probabilitites,
            chosen_codons_on_sites, dna, mutations_sites, start, goi_offset)

    def _contains_avoided_motif(self, solution) -> bool:
        """
        Tells whether any oligo of the solution matches an avoided motif.
        """
        return any(
            motif.search(oligo.sequence) for oligo in solution
            for motif in self.avoided_motifs)

    def __call__(self, dna: str, mutations, frag, goi_offset,
                 niter: int) -> List[PASOligo]:
        """
        Generates up to niter solutions and chooses the one with minimal
        number of oligos.

        Solutions whose oligos contain an avoided motif are rejected; the
        search stops early once enough accepted solutions were collected.

        :raises PASNoSolutionException: if none of the generated solutions
            is free of avoided motifs
        """
        start = frag.get_start()
        end = frag.get_end()
        mutations_list = parse_input_mutations(self.is_mutations_as_codons,
                                               mutations)
        mutations_list = mutations_on_fragments(start, end, mutations_list,
                                                goi_offset)

        if len(mutations_list) == 0:
            # nothing to mutate on this fragment: a single unchanged oligo
            return [PASOligo(sequence=dna, ratio=1)]

        solutions = []
        attempts = 0
        while len(solutions) < self._MAX_CANDIDATE_SOLUTIONS and attempts < niter:
            attempts += 1
            solution = self.generate_solution(dna, mutations_list, start,
                                              goi_offset)
            # the motifs are searched in the generated oligos: the input dna
            # never changes between attempts, so checking it could not
            # distinguish one attempt from another
            if not self._contains_avoided_motif(solution):
                solutions.append(solution)

        # fail only when nothing usable was found; a partially filled list
        # of accepted solutions is still a valid result
        if not solutions:
            raise PASNoSolutionException(
                'Not possible to avoid specified combination of motifs!')

        # choose the solution with minimal number of oligos
        best_solution: List[PASOligo] = min(solutions, key=len)
        return best_solution
class Output:
    """
    Function object which turns a solution into the list of per-fragment
    results (fragment, overlap, mutations and oligos).
    """

    def __init__(self, config, is_dna_sequence, is_mutations_as_codons):
        """
        Stores the configuration and flags and selects the codon usage data
        for the organism named in ``config.organism``.
        """
        self.config = config
        self.is_dna_sequence = is_dna_sequence
        self.is_mutations_as_codons = is_mutations_as_codons
        self.wild_dna_sequence = ""
        self.temp_calculator = config.temperature_config.create_calculator()
        self.gene = None

        # problem 1 specific
        # for future
        self.tm_distances = []
        self.avoided_motifs = config.avoided_motifs
        self.get_aa = DegenerateTriplet()

        if config.organism == 'e-coli':
            # e_coli organism is chosen
            self.usage_table = UsageTable().ecoli_usage
        elif config.organism == 'yeast':
            # yeast organism is chosen
            self.usage_table = UsageTable().yeast_usage
        else:
            # other by name organism is chosen
            self.usage_table = Organism(config.organism).codon_table
        # built identically for every organism
        self.codonUsage = CodonUsage(config.organism)

    def combine_mutations_list(self,
                               fragment: PASFragment,
                               oligos_group,
                               mutation_sites_on_fragment: List,
                               mutations_on_fragment: List[PASMutationSite],
                               sequence=None,
                               goi_offset=None) -> List:
        """
        Combines a list of mutations on a fragment with additional details
        needed for frontend.

        One record is created for every requested mutation, plus one wild
        type record for every site whose requested frequencies sum to less
        than 1. The records are sorted with ``sort_func``.
        """
        forward_table = self.codonUsage.table.forward_table
        frag_start = fragment.get_start()
        list_of_mutations = []

        # Create mutated oligos
        for mut_site in mutations_on_fragment:
            for mutt in mut_site.mutations:
                if self.is_mutations_as_codons:
                    # the user gave a codon: report the amino it codes
                    mutation = self.get_aa.degenerate_codon_to_aminos(
                        str(mutt.mutation), forward_table)[0]
                else:
                    mutation = str(mutt.mutation)
                position = mut_site.position
                frequency = float(mutt.frequency)
                wild_type_codon = Codons.get_wild_type_codon(
                    position, sequence, frag_start, goi_offset)
                wild_type_amino = self.get_aa.degenerate_codon_to_aminos(
                    wild_type_codon, forward_table)[0]

                # extract mutated codons from the changed oligos; when
                # several oligos code the amino, the last one is reported
                mutated_codon = ""
                for oligo in oligos_group:
                    codon_on_position = get_codon_on_position(
                        position, oligo.sequence, frag_start, goi_offset)
                    amino_on_position = self.get_aa.degenerate_codon_to_aminos(
                        codon_on_position, forward_table)
                    if mutation in amino_on_position:
                        mutated_codon = codon_on_position

                list_of_mutations.append(
                    PASMutationFormatted(position=position,
                                         mutated_amino=mutation,
                                         wild_type_amino=wild_type_amino,
                                         wild_type_codon=wild_type_codon,
                                         mutated_codon=mutated_codon,
                                         frequency=frequency,
                                         wild_type=False))

        # Adding wild type mutations
        for site in mutation_sites_on_fragment:
            # every mutation requested on the site counts, not only the
            # first one of each entry -- otherwise the wild type frequency
            # is overestimated for sites with several mutations
            frequencies = [
                mutt.frequency for mut in mutations_on_fragment
                if mut.position == site for mutt in mut.mutations
            ]
            total_frequency = np.sum(frequencies)
            if total_frequency < 1:
                wild_type_codon = Codons.get_wild_type_codon(
                    site, sequence, frag_start, goi_offset)
                wild_type_amino = self.get_aa.degenerate_codon_to_aminos(
                    wild_type_codon, forward_table)[0]
                list_of_mutations.append(
                    PASMutationFormatted(position=site,
                                         mutated_amino=wild_type_amino,
                                         wild_type_amino=wild_type_amino,
                                         wild_type_codon=wild_type_codon,
                                         mutated_codon=wild_type_codon,
                                         frequency=1 - total_frequency,
                                         wild_type=True))

        list_of_mutations.sort(key=sort_func)
        return list_of_mutations

    def __call__(self, best_solution: PASSolution,
                 mutations: List[PASMutationSite],
                 sequences: PASSequences) -> List[PASResult]:
        """
        Returns list of results, one per fragment of the solution. An empty
        list is returned for a solution without fragments.
        """
        # materialized once so that every fragment can be paired with its
        # successor in purpose to calculate overlaps
        fragments = list(best_solution.get_fragments())
        results = []
        if not fragments:
            return results

        goi_offset = sequences.get_goi_offset()

        # sorted list of all mutations sites
        mutation_sites = sorted({mut.position for mut in mutations})

        # the generator depends only on the configuration, so a single
        # instance serves all fragments
        generator = OligoGenerator(self.config, self.is_mutations_as_codons,
                                   self.config.organism)

        # creating the output values for every fragment
        for i, frag_current in enumerate(fragments):
            # getting oligos for a fragment, and fragment parameters
            fragment_sequence = frag_current.get_sequence(
                best_solution.gene.sequence)
            oligos_group = generator(fragment_sequence, mutations,
                                     frag_current, goi_offset, 250)

            # getting list of mutations on a fragment and prepare it in a
            # desired json format; a site belongs to the fragment when its
            # whole codon lies inside the fragment
            mutation_sites_on_fragment = [
                site for site in mutation_sites
                if ((goi_offset +
                     (site - 1) * 3) >= frag_current.get_start() and
                    (goi_offset +
                     (site - 1) * 3 + 2) <= frag_current.get_end())
            ]
            mutations_on_fragment = [
                mut for mut in mutations
                if mut.position in mutation_sites_on_fragment
            ]
            mutations_on_fragment_formatted = self.combine_mutations_list(
                frag_current, oligos_group, mutation_sites_on_fragment,
                mutations_on_fragment, fragment_sequence, goi_offset)
            list_oligos = combine_oligos_list(oligos_group,
                                              mutations_on_fragment_formatted,
                                              mutation_sites_on_fragment,
                                              goi_offset, frag_current)

            # getting overlap and its parameters
            if i + 1 < len(fragments):
                frag_next = fragments[i + 1]
                overlap = frag_current.get_overlap_seq(
                    frag_next, sequences.get_full_sequence())
                overlap_Tm = best_solution.temp_calculator(overlap)
                overlap_GC = GC(overlap)
                overlap_length = len(overlap)
            else:
                # the last fragment has no successor: no overlap info.
                # Only this case is handled; errors raised while computing
                # an existing overlap are no longer hidden.
                overlap = overlap_Tm = overlap_GC = overlap_length = None

            # every second fragment (odd index) should be reverse complement
            # of the original sub-sequence; doing it here because previous
            # code requires fragment in original forward direction
            if i % 2 == 1:
                for oligo in list_oligos:
                    oligo.make_reverse_complement()
                fragment_sequence = reverse_complement(fragment_sequence)

            # combining the results together
            results.append(
                PASResult(fragment=fragment_sequence,
                          start=frag_current.get_start(),
                          end=frag_current.get_end(),
                          length=frag_current.get_length(),
                          overlap=overlap,
                          overlap_Tm=overlap_Tm,
                          overlap_GC=overlap_GC,
                          overlap_length=overlap_length,
                          mutations=mutations_on_fragment_formatted,
                          oligos=list_oligos))

        return results
def create_new_output(self, input_data: QCLMInput, solution: QCLMSolution) \
        -> QCLMOutput:
    """
    Parse QCLM solution and create output object which can be automatically
    translated to json.

    One ``QCLMMutationOutput`` holding a single primer is produced for every
    primer of the solution.
    """
    import logging  # local import keeps this block self-contained

    logger = logging.getLogger(__name__)

    sites_boundaries = compute_starts(solution)
    results: List[QCLMMutationOutput] = []
    parsed_mutations = input_data.parse_mutations(self.goi_offset)

    for site_set, scored_primers in solution.primers.items():
        site_sequence = sorted(site_set)
        mutated_dna_sequence_with_primer_sites = \
            DNASequenceForMutagenesis(self.sequence, site_sequence)

        # The mutations of a site set do not depend on the primer, so they
        # are selected and sorted once per site set instead of per primer.
        sorted_primer_mutations: List[MutationSite] = sorted(
            (parsed_mutation for parsed_mutation in parsed_mutations
             if parsed_mutation.get_start() in site_sequence),
            key=lambda mut: mut.position)

        # noinspection PyUnusedLocal
        primer: ScoredPrimer
        for primer in scored_primers:
            primer_sequence = primer.spec.get_sequence(
                mutated_dna_sequence_with_primer_sites)

            # the n-th codon of the primer is paired with the n-th mutation
            # (sorted by position)
            user_mutation_strings: List[str] = []
            for mutation, codon in zip(sorted_primer_mutations,
                                       primer.spec.codons):
                coded_aminos = DegenerateTriplet.degenerate_codon_to_aminos(
                    codon, self.usages.table.forward_table)
                user_mutation_strings.append(
                    mutation.user_string_with_aminos(coded_aminos))

            # check if we have overlap with any primers.
            overlap_with_next = bool(
                check_for_overlap(sites_boundaries, site_set,
                                  primer.spec.offset,
                                  primer.spec.offset + primer.spec.length))
            if overlap_with_next:
                # reported through logging instead of print so that the
                # diagnostic does not end up on stdout
                logger.debug("We have an overlap")

            results.append(QCLMMutationOutput(
                result_found=True,
                mutations=user_mutation_strings,
                primers=[PrimerOutput(
                    sequence=primer_sequence,
                    start=primer.spec.offset,
                    length=primer.spec.length,
                    temperature=round(primer.tm, ndigits=2),
                    gc_content=round(GC(primer_sequence), ndigits=2),
                    degenerate_codons=list(primer.spec.codons),
                    overlap_with_following=overlap_with_next
                )]
            ))

    return QCLMOutput(
        results=results,
        full_sequence=self.sequence,
        goi_offset=self.goi_offset,
        input_data=input_data,
    )