def generate_charts(ga: pyeasyga.GeneticAlgorithm):
    """Run ``ga`` generation by generation and plot its fitness history.

    The first generation is created, then up to ``ga.generations - 1``
    further generations are produced. The run stops early as soon as the
    first individual of the current generation has a fitness of exactly 0.
    The elapsed wall-clock time (in milliseconds) is printed, and a chart
    showing the first individual's fitness ("Max") and the population
    average ("Avg") per generation is displayed.

    Args:
        ga: The configured genetic algorithm instance to drive.
    """
    fitness_history = []
    leader_history = []

    def snapshot():
        # record the whole population plus its first member's fitness
        population = ga.current_generation
        fitness_history.append([member.fitness for member in population])
        leader_history.append(population[0].fitness)

    started = time_ns()
    ga.create_first_generation()
    snapshot()

    remaining = ga.generations - 1
    while remaining > 0 and ga.current_generation[0].fitness != 0:
        ga.create_next_generation()
        snapshot()
        remaining -= 1

    elapsed_ms = (time_ns() - started) / 1000000
    print(f'\nin {elapsed_ms}ms')

    averages = [np.average(fitnesses) for fitnesses in fitness_history]

    _, ax = plt.subplots()
    ax.plot(leader_history, label="Max")
    ax.plot(averages, label="Avg")
    plt.xlabel("Generations")
    plt.ylabel("Value")
    ax.legend(loc='lower right')
    plt.show()
def generate(target_params,
             insert_aa_seq,
             population_size=100,
             mutation_probability=0.3,
             max_gens_since_improvement=50,
             genetic_code=11,
             verbose=False):
    '''Generate a sequence matching a target k-mer (and codon) usage.

    A genetic algorithm without crossover is run over synonymous
    re-codings of ``insert_aa_seq``; the fitness being minimised is the
    Jensen-Shannon divergence between the target frequency vector and the
    candidate's frequency vector.

    Args:
        target_params (dict): The frequencies to optimise towards, keyed
            by k (and optionally ``"codons"``), each value being a dict
            of frequencies.
        insert_aa_seq (str): The amino acid sequence to encode. ``"*"``
            is back-translated as a stop codon; residues missing from the
            codon table are skipped.
        population_size (int, optional): Population size for the genetic
            algorithm. Defaults to 100.
        mutation_probability (float, optional): Mutation probability
            handed to the genetic algorithm. Defaults to 0.3.
        max_gens_since_improvement (int, optional): Stop after this many
            consecutive generations without a better best fitness.
            Defaults to 50.
        genetic_code (int, optional): Id of the codon table to use.
            Defaults to 11.
        verbose (bool, optional): Whether to print per-generation
            progress. Defaults to False.

    Returns:
        str: The best DNA sequence found.
    '''
    codon_table = Bio.Data.CodonTable.unambiguous_dna_by_id[genetic_code]

    # back translate to an initial seq
    initial_codons = []
    for aa in insert_aa_seq:
        try:
            initial_codons.append(codon_table.back_table[aa])
        except KeyError:
            # stop codons are keyed by None in the back table; any other
            # unknown residue is skipped, as before
            if aa == "*":
                initial_codons.append(codon_table.back_table[None])
    insert = "".join(initial_codons)

    # create the genetic algorithm instance
    ga = GeneticAlgorithm(dna_to_vector(insert),
                          crossover_probability=0,
                          maximise_fitness=False,
                          population_size=population_size,
                          mutation_probability=mutation_probability)

    # get the target values of k
    k = list(target_params.keys())
    k_mer_sizes = [x for x in k if x != "codons"]

    # generate the target vector from the input dict
    target = np.array([])
    for _k in sorted(k_mer_sizes):
        target = np.concatenate((target, [
            x[1]
            for x in sorted(target_params[_k].items(), key=lambda x: x[0])
        ]))
    if "codons" in k:
        target = np.concatenate((target, [
            x[1] for x in sorted(target_params["codons"].items(),
                                 key=lambda x: x[0])
        ]))

    def vector(seq):
        # frequency vector of a DNA string, laid out like ``target``
        output = k_mer_frequencies(seq,
                                   k_mer_sizes,
                                   include_missing=True,
                                   vector=True)
        if "codons" in k:
            output = np.concatenate((output, [
                x[1] for x in sorted(codon_frequencies(
                    seq, genetic_code).items(),
                                     key=lambda x: x[0])
            ]))
        return output

    def fitness(individual, data):
        seq = vector_to_dna(individual)
        return jensen_shannon_divergence([
            dit.ScalarDistribution(target),
            dit.ScalarDistribution(vector(seq))
        ])

    ga.fitness_function = fitness

    synonymous_codons = _synonymous_codons(genetic_codes[genetic_code])

    # Mutations only swap synonymous codons, so whether any position can
    # be mutated at all is fixed by the insert. Without this check the
    # rejection loop in ``mutate`` would never terminate for sequences
    # whose codons all lack alternatives.
    has_mutable_codon = any(
        len(synonymous_codons[insert[i:i + 3]]) != 1
        for i in range(0, len(insert), 3))

    def mutate(individual):
        if not has_mutable_codon:
            return individual
        while True:
            # choose a random codon (each codon spans 6 vector entries)
            codon_idx = np.random.randint(len(individual) // 6) * 6

            # figure out which codon it is
            codon = vector_to_dna(individual[codon_idx:codon_idx + 6])

            # ensure that mutations actually change the sequence
            if len(synonymous_codons[codon]) != 1:
                break

        # choose a new one at random for the AA
        new_codon = dna_to_vector(
            np.random.choice(
                [x for x in synonymous_codons[codon] if x != codon]))

        # replace it in the individual
        individual[codon_idx:codon_idx + 6] = new_codon
        return individual

    ga.mutate_function = mutate

    def create_individual(seed_data):
        # re-code every codon that has an alternative with a random
        # synonymous codon different from the seed's
        individual = vector_to_dna(seed_data)
        new_codons = []
        for i in range(0, len(individual), 3):
            codon = individual[i:i + 3]
            if len(synonymous_codons[codon]) == 1:
                new_codons.append(codon)
                continue
            new_codons.append(
                np.random.choice(
                    [x for x in synonymous_codons[codon] if x != codon]))
        return dna_to_vector("".join(new_codons))

    ga.create_individual = create_individual

    # set up for GA run
    ga.create_first_generation()
    gens_since_improvement = 0
    best_indv_fitness = ga.best_individual()[0]
    counter = 1

    # run the GA
    while gens_since_improvement < max_gens_since_improvement:
        ga.create_next_generation()
        current_best = ga.best_individual()[0]
        if current_best < best_indv_fitness:
            best_indv_fitness = current_best
            gens_since_improvement = 0
        else:
            gens_since_improvement += 1
        if verbose:
            print(
                "Gen: %s\tSince Improvement: %s/%s\tFitness: %s".expandtabs(15)
                % (counter, gens_since_improvement,
                   max_gens_since_improvement, current_best),
                end="\r")
        counter += 1

    if verbose:
        print()

    return vector_to_dna(ga.best_individual()[1])
def generate(target_params,
             insert_aa_seq,
             population_size=100,
             mutation_probability=0.3,
             crossover_probability=0.8,
             max_gens_since_improvement=50,
             genetic_code=11,
             verbose=False):
    '''Generate a sequence matching :math:`k`-mer usage.

    A genetic algorithm is run over synonymous re-codings of
    ``insert_aa_seq``; the fitness being minimised is the Jensen-Shannon
    divergence between the target frequency vector and the candidate's
    frequency vector. Pressing Ctrl-C during optimisation stops early and
    returns the best sequence found so far.

    Args:
        target_params (dict): The parameters to optimize towards. Should
            be of the format {:math:`k_n`: {:math:`k_{n1}`: 0.2,
            :math:`k_{n2}`: 0.3,...}...}
        insert_aa_seq (str): The amino acid sequence for the optimized
            sequence. ``"*"`` is back-translated as a stop codon;
            residues missing from the codon table are skipped.
        population_size (int, optional): The size of the population for
            the genetic algorithm. Defaults to 100.
        mutation_probability (float, optional): The likelihood of
            changing each member of each generation. Defaults to 0.3.
        crossover_probability (float, optional): The likelihood of each
            member of the population undergoing crossover. Defaults to
            0.8.
        max_gens_since_improvement (int, optional): The number of
            generations of no improvement after which to stop
            optimization. Defaults to 50.
        genetic_code (int, optional): The NCBI translation table id to
            use. Defaults to 11 (the bacterial, archaeal and plant
            plastid code; the standard code is table 1).
        verbose (bool, optional): Whether to print the generation number,
            generations since improvement, and fitness. Defaults to
            False.

    Returns:
        str: The generated sequence.
    '''
    codon_table = Bio.Data.CodonTable.unambiguous_dna_by_id[genetic_code]

    # back translate to an initial seq
    initial_codons = []
    for aa in insert_aa_seq:
        try:
            initial_codons.append(codon_table.back_table[aa])
        except KeyError:
            # stop codons are keyed by None in the back table; any other
            # unknown residue is skipped, as before
            if aa == "*":
                initial_codons.append(codon_table.back_table[None])
    insert = "".join(initial_codons)

    # create the genetic algorithm instance
    ga = GeneticAlgorithm(dna_to_vector(insert),
                          crossover_probability=crossover_probability,
                          maximise_fitness=False,
                          population_size=population_size,
                          mutation_probability=mutation_probability)

    # get the target values of k
    k = list(target_params.keys())
    k_mer_sizes = [x for x in k if x != "codons"]

    # generate the target vector from the input dict
    target = np.array([])
    for _k in sorted(k_mer_sizes):
        target = np.concatenate((target, [
            x[1]
            for x in sorted(target_params[_k].items(), key=lambda x: x[0])
        ]))
    if "codons" in k:
        target = np.concatenate((target, [
            x[1] for x in sorted(target_params["codons"].items(),
                                 key=lambda x: x[0])
        ]))

    def vector(seq):
        # frequency vector of a DNA string, laid out like ``target``
        output = k_mer_frequencies(seq,
                                   k_mer_sizes,
                                   include_missing=True,
                                   vector=True)
        if "codons" in k:
            output = np.concatenate((output, [
                x[1] for x in sorted(codon_frequencies(
                    seq, genetic_code).items(),
                                     key=lambda x: x[0])
            ]))
        return output

    def fitness(individual, data):
        seq = vector_to_dna(individual)
        return jensen_shannon_divergence([
            dit.ScalarDistribution(target),
            dit.ScalarDistribution(vector(seq))
        ])

    ga.fitness_function = fitness

    synonymous_codons = _synonymous_codons(genetic_codes[genetic_code])

    # Mutation and crossover only ever exchange synonymous codons, so
    # whether any position can be mutated at all is fixed by the insert.
    # Without this check the rejection loop in ``mutate`` would never
    # terminate for sequences whose codons all lack alternatives.
    has_mutable_codon = any(
        len(synonymous_codons[insert[i:i + 3]]) != 1
        for i in range(0, len(insert), 3))

    def mutate(individual):
        if not has_mutable_codon:
            return individual
        while True:
            # choose a random codon (each codon spans 6 vector entries)
            codon_idx = np.random.randint(len(individual) // 6) * 6

            # figure out which codon it is
            codon = vector_to_dna(individual[codon_idx:codon_idx + 6])

            # ensure that mutations actually change the sequence
            if len(synonymous_codons[codon]) != 1:
                break

        # choose a new one at random for the AA
        new_codon = dna_to_vector(
            np.random.choice(
                [x for x in synonymous_codons[codon] if x != codon]))

        # replace it in the individual
        individual[codon_idx:codon_idx + 6] = new_codon
        return individual

    ga.mutate_function = mutate

    def crossover(parent_1, parent_2):
        parent_1, parent_2 = list(parent_1), list(parent_2)

        # integer division: random.randrange rejects float bounds on
        # Python 3.12+
        n_codons = len(parent_1) // 6
        if n_codons < 2:
            # no interior codon boundary to cut at
            return parent_1, parent_2

        # cut on a codon boundary so the encoded protein is preserved
        index = random.randrange(1, n_codons) * 6
        child_1 = parent_1[:index] + parent_2[index:]
        child_2 = parent_2[:index] + parent_1[index:]
        return child_1, child_2

    ga.crossover_function = crossover

    def create_individual(seed_data):
        # re-code every codon that has an alternative with a random
        # synonymous codon different from the seed's
        individual = vector_to_dna(seed_data)
        new_codons = []
        for i in range(0, len(individual), 3):
            codon = individual[i:i + 3]
            if len(synonymous_codons[codon]) == 1:
                new_codons.append(codon)
                continue
            new_codons.append(
                np.random.choice(
                    [x for x in synonymous_codons[codon] if x != codon]))
        return dna_to_vector("".join(new_codons))

    ga.create_individual = create_individual

    # set up for GA run
    ga.create_first_generation()
    gens_since_improvement = 0
    best_indv_fitness = ga.best_individual()[0]
    counter = 1

    # run the GA
    try:
        while gens_since_improvement < max_gens_since_improvement:
            ga.create_next_generation()
            current_best = ga.best_individual()[0]
            if current_best < best_indv_fitness:
                best_indv_fitness = current_best
                gens_since_improvement = 0
            else:
                gens_since_improvement += 1
            if verbose:
                print(
                    "Gen: %s\tSince Improvement: %s/%s\tFitness: %s"
                    .expandtabs(15)
                    % (counter, gens_since_improvement,
                       max_gens_since_improvement, current_best),
                    end="\r")
            counter += 1
    except KeyboardInterrupt:
        print("\nStopping early...")

    if verbose:
        print()

    best_seq = vector_to_dna(ga.best_individual()[1])

    # internal sanity check: the optimised sequence must still encode the
    # same protein as the initial back-translation
    assert Seq(best_seq).translate(genetic_code) == Seq(insert).translate(genetic_code)
    return best_seq