def propose_sequences(
    self, measured_sequences: pd.DataFrame
) -> Tuple[np.ndarray, np.ndarray]:
    """Propose up to `sequences_batch_size` random mutants for evaluation.

    Parents are drawn with `self.rng.choice` from the ``"sequence"`` column of
    `measured_sequences` and mutated with `s_utils.generate_random_mutant`
    (called with ``self.mu / len(seq)`` as its rate argument). Mutants that
    were already measured are discarded. Exactly `self.model_queries_per_batch`
    distinct candidates are collected and scored with `self.model.get_fitness`.

    Args:
        measured_sequences: Data frame with a ``"sequence"`` column holding
            the sequences measured so far.

    Returns:
        A ``(sequences, predictions)`` pair of arrays. When `self.elitist` is
        truthy these are the `self.sequences_batch_size` candidates with the
        highest predicted fitness, best first (fewer if fewer candidates
        exist). Otherwise they are `self.sequences_batch_size` candidates
        sampled by index with replacement.
    """
    old_sequences = measured_sequences["sequence"]
    old_sequence_set = set(old_sequences)

    new_seqs = set()
    # Strict `<` so that we never score more than `model_queries_per_batch`
    # candidates (a `<=` here would overshoot the budget by one).
    while len(new_seqs) < self.model_queries_per_batch:
        seq = self.rng.choice(old_sequences)
        new_seq = s_utils.generate_random_mutant(
            seq, self.mu / len(seq), alphabet=self.alphabet
        )

        if new_seq not in old_sequence_set:
            new_seqs.add(new_seq)

    new_seqs = np.array(list(new_seqs))
    preds = self.model.get_fitness(new_seqs)

    if self.elitist:
        # Descending order, then keep the first `sequences_batch_size`.
        # (`[:-k:-1]` would only yield k - 1 indices.)
        idxs = np.argsort(preds)[::-1][: self.sequences_batch_size]
    else:
        idxs = self.rng.integers(0, len(new_seqs), size=self.sequences_batch_size)

    return new_seqs[idxs], preds[idxs]
def propose_sequences(
    self, measured_sequences: pd.DataFrame
) -> Tuple[np.ndarray, np.ndarray]:
    """Propose top `sequences_batch_size` sequences for evaluation.

    An initial population of `self.population_size` sequences is selected
    from `measured_sequences` via `self._choose_parents`. While the model
    query budget allows, parents are picked from the population, mutated with
    `s_utils.generate_random_mutant`, scored with `self.model.get_fitness`,
    and the lowest-scoring population members are replaced by the new
    children. Every newly scored child is remembered.

    Args:
        measured_sequences: Data frame with ``"sequence"`` and
            ``"true_score"`` columns.

    Returns:
        A ``(sequences, predictions)`` pair of arrays holding the (at most)
        `self.sequences_batch_size` generated sequences with the highest
        predicted fitness, best first.
    """
    # Set the torch seed by generating a random integer from the pre-seeded self.rng
    torch.manual_seed(self.rng.integers(-(2**31), 2**31))

    measured_sequence_set = set(measured_sequences["sequence"])

    # Create initial population by choosing parents from `measured_sequences`
    initial_pop_inds = self._choose_parents(
        measured_sequences["true_score"].to_numpy(),
        self.population_size,
    )
    pop = measured_sequences["sequence"].to_numpy()[initial_pop_inds]
    scores = measured_sequences["true_score"].to_numpy()[initial_pop_inds]

    sequences = {}
    initial_cost = self.model.cost
    while (
        self.model.cost - initial_cost + self.population_size
        < self.model_queries_per_batch
    ):
        # Create "children" by recombining parents selected from population
        # according to self.parent_selection_strategy and
        # self.recombination_strategy
        num_children = int(self.children_proportion * self.population_size)
        parents = pop[self._choose_parents(scores, num_children)]

        # Single-point mutation of children (for now)
        children = []
        for seq in parents:
            child = s_utils.generate_random_mutant(seq, 1 / len(seq), self.alphabet)

            # Only keep children that are neither measured nor already proposed
            if child not in measured_sequence_set and child not in sequences:
                children.append(child)

        if len(children) == 0:
            continue

        children = np.array(children)
        child_scores = self.model.get_fitness(children)

        # Now kick out the worst samples and replace them with the new children
        argsorted_scores = np.argsort(scores)
        pop[argsorted_scores[: len(children)]] = children
        scores[argsorted_scores[: len(children)]] = child_scores

        sequences.update(zip(children, child_scores))

    # We propose the top `self.sequences_batch_size`
    # new sequences we have generated
    new_seqs = np.array(list(sequences.keys()))
    preds = np.array(list(sequences.values()))
    # Descending order, then keep the first `sequences_batch_size`.
    # (`[:-k:-1]` would only yield k - 1 indices.)
    sorted_order = np.argsort(preds)[::-1][: self.sequences_batch_size]

    return new_seqs[sorted_order], preds[sorted_order]
def _extend_samples(self, samples, weights, min_samples=100):
    """Pad a sample pool with random mutants until it has enough members.

    While fewer than `min_samples` distinct sequences are known, a random
    existing sample is mutated with `s_utils.generate_random_mutant` (using
    `self.mutation_rate` and `self.alphabet`); each previously unseen mutant
    is appended with weight 1. Mutants may themselves be picked as the seed
    of later mutations.

    Args:
        samples: Iterable of sequences to start from. Must be non-empty when
            padding is required.
        weights: Iterable of weights, one per entry of `samples`.
        min_samples: Minimum number of distinct sequences required before
            the pool is returned. Defaults to 100.

    Returns:
        A ``(samples, weights)`` pair of numpy arrays containing the original
        entries followed by any generated mutants and their weights.
    """
    # generate random seqs around the input seq if the sample size is too small
    samples = list(samples)
    weights = list(weights)
    sequences = set(samples)
    while len(sequences) < min_samples:
        sample = random.choice(samples)
        sample = s_utils.generate_random_mutant(
            sample, self.mutation_rate, alphabet=self.alphabet
        )

        if sample not in sequences:
            samples.append(sample)
            weights.append(1)
            sequences.add(sample)

    return np.array(samples), np.array(weights)
def propose_sequences(
    self, measured_sequences: pd.DataFrame
) -> Tuple[np.ndarray, np.ndarray]:
    """Propose top `sequences_batch_size` sequences for evaluation.

    Parents are the measured sequences whose ``"true_score"`` is within the
    `self.threshold` fraction of the best score, tiled/truncated to
    `self.sequences_batch_size` entries. While the model query budget allows,
    the parents are recombined `self.rho` times and then, in chunks of
    `self.eval_batch_size`, used as roots of mutation rollouts: each live
    node produces one novel mutant, and a rollout continues from that mutant
    only while its predicted fitness is at least that of its root.

    Args:
        measured_sequences: Data frame with ``"sequence"`` and
            ``"true_score"`` columns.

    Returns:
        A ``(sequences, predictions)`` pair of arrays holding the (at most)
        `self.sequences_batch_size` generated sequences with the highest
        predicted fitness, best first.
    """
    measured_sequence_set = set(measured_sequences["sequence"])

    # Select every sequence scoring within `self.threshold` of the top fitness;
    # `np.sign` keeps the comparison meaningful for a negative top fitness.
    top_fitness = measured_sequences["true_score"].max()
    top_inds = measured_sequences["true_score"] >= top_fitness * (
        1 - np.sign(top_fitness) * self.threshold
    )

    parents = np.resize(
        measured_sequences["sequence"][top_inds].to_numpy(),
        self.sequences_batch_size,
    )

    sequences = {}
    previous_model_cost = self.model.cost
    while self.model.cost - previous_model_cost < self.model_queries_per_batch:
        # generate recombinant mutants
        for _ in range(self.rho):
            parents = self._recombine_population(parents)

        for start in range(0, len(parents), self.eval_batch_size):
            # Here we do rollouts from each parent (root of rollout tree)
            roots = parents[start : start + self.eval_batch_size]
            root_fitnesses = self.model.get_fitness(roots)

            nodes = list(enumerate(roots))

            while (
                len(nodes) > 0
                and self.model.cost - previous_model_cost + self.eval_batch_size
                < self.model_queries_per_batch
            ):
                child_idxs = []
                children = []
                while len(children) < len(nodes):
                    # NOTE: the `- 1` means nodes are visited starting from
                    # the last one (index -1, 0, 1, ...); every node is still
                    # visited exactly once. Kept as-is so that seeded runs
                    # remain reproducible.
                    idx, node = nodes[len(children) - 1]

                    child = s_utils.generate_random_mutant(
                        node,
                        self.mu * 1 / len(node),
                        self.alphabet,
                    )

                    # Stop when we generate new child that has never been seen
                    # before
                    if (
                        child not in measured_sequence_set
                        and child not in sequences
                    ):
                        child_idxs.append(idx)
                        children.append(child)

                # Stop the rollout once the child has worse predicted
                # fitness than the root of the rollout tree.
                # Otherwise, set node = child and add child to the list
                # of sequences to propose.
                fitnesses = self.model.get_fitness(children)
                sequences.update(zip(children, fitnesses))

                nodes = []
                for idx, child, fitness in zip(child_idxs, children, fitnesses):
                    if fitness >= root_fitnesses[idx]:
                        nodes.append((idx, child))

    # We propose the top `self.sequences_batch_size` new sequences we have generated
    new_seqs = np.array(list(sequences.keys()))
    preds = np.array(list(sequences.values()))
    # Descending order, then keep the first `sequences_batch_size`.
    # (`[:-k:-1]` would only yield k - 1 indices.)
    sorted_order = np.argsort(preds)[::-1][: self.sequences_batch_size]

    return new_seqs[sorted_order], preds[sorted_order]
def propose_sequences(
    self, measured_sequences_data: pd.DataFrame
) -> Tuple[np.ndarray, np.ndarray]:
    """Propose top `sequences_batch_size` sequences for evaluation.

    On the first round (max ``"round"`` equal to 0) random mutants of
    `self.starting_sequence` are proposed. On later rounds the generator is
    trained on the last round's sequences at or above the `self.Q`
    percentile (padded by `self._extend_samples`), then repeatedly sampled,
    scored with `self.model.get_fitness`, re-weighted and retrained until the
    model query budget is spent.

    Args:
        measured_sequences_data: Data frame with ``"round"``, ``"sequence"``
            and ``"true_score"`` columns.

    Returns:
        A ``(sequences, predictions)`` pair of arrays. On the first round
        these are `self.sequences_batch_size` random mutants and their
        predicted fitness. Otherwise they are the (at most)
        `self.sequences_batch_size` generated sequences with the highest
        predicted fitness, best first.
    """
    # If we are on the first round, our model has no data yet, so the
    # best policy is to propose random sequences in a small neighborhood.
    last_round = measured_sequences_data["round"].max()
    if last_round == 0:
        sequences = set()
        while len(sequences) < self.sequences_batch_size:
            sequences.add(
                s_utils.generate_random_mutant(
                    self.starting_sequence,
                    2 / len(self.starting_sequence),
                    self.alphabet,
                )
            )

        sequences = np.array(list(sequences))
        return sequences, self.model.get_fitness(sequences)

    last_round_sequences = measured_sequences_data[
        measured_sequences_data["round"] == last_round
    ]

    # gamma is our threshold (the self.Q-th percentile of sequences from last round)
    # we will pick all of last round's sequences with fitness above the Qth
    # percentile
    gamma = np.percentile(last_round_sequences["true_score"], 100 * self.Q)
    initial_batch = last_round_sequences["sequence"][
        last_round_sequences["true_score"] >= gamma
    ].to_numpy()
    initial_weights = np.ones(len(initial_batch))

    # Running pool of every training sample and its weight
    all_samples, all_weights = self._extend_samples(initial_batch, initial_weights)

    # this will be the current state of the generator
    self.generator.train_model(all_samples, all_weights)

    # save the weights of the initial vae and save it as vae_0:
    # there are issues with keras model saving and loading,
    # so we have to recompile it
    generator_0 = VAE(
        seq_length=self.generator.seq_length,
        alphabet=self.generator.alphabet,
        batch_size=self.generator.batch_size,
        latent_dim=self.generator.latent_dim,
        intermediate_dim=self.generator.intermediate_dim,
        epochs=self.generator.epochs,
        epsilon_std=self.generator.epsilon_std,
        beta=self.generator.beta,
        validation_split=self.generator.validation_split,
        verbose=self.generator.verbose,
    )
    original_weights = self.generator.vae.get_weights()
    generator_0.vae.set_weights(original_weights)
    vae_0 = generator_0.vae

    sequences = {}
    previous_model_cost = self.model.cost
    while self.model.cost - previous_model_cost < self.model_queries_per_batch:
        # generate new samples using the generator (second argument is a list of all
        # existing measured and proposed seqs)
        proposals = self.generator.generate(
            self.cycle_batch_size,
            all_samples,
            all_weights,
        )
        # Progress output: model queries spent so far and proposals this cycle
        print(self.model.cost - previous_model_cost, len(proposals))

        # calculate the scores of the new samples using the model
        scores = self.model.get_fitness(proposals)

        # set a new fitness threshold if the new percentile is
        # higher than the current
        gamma = max(np.percentile(scores, self.Q * 100), gamma)

        # cbas and dbas mostly the same except cbas also does an importance
        # sampling step
        if self.algo == "cbas":
            # calculate the weights for the proposed batch
            log_probs_0 = self.generator.calculate_log_probability(
                proposals, vae=vae_0
            )
            log_probs_t = self.generator.calculate_log_probability(proposals)

            weights = np.exp(log_probs_0 - log_probs_t)
            weights = np.nan_to_num(weights)

        # Otherwise, `self.algo == "dbas"`
        else:
            weights = np.ones(len(proposals))

        # Proposals scoring below the threshold do not contribute to training
        weights[scores < gamma] = 0

        # add proposed samples to the total sample pool
        all_samples = np.append(all_samples, proposals)
        all_weights = np.append(all_weights, weights)

        # update the generator
        self.generator.train_model(all_samples, all_weights)

        sequences.update(zip(proposals, scores))

    # We propose the top `self.sequences_batch_size` new sequences we have generated
    new_seqs = np.array(list(sequences.keys()))
    preds = np.array(list(sequences.values()))
    # Descending order, then keep the first `sequences_batch_size`.
    # (`[:-k:-1]` would only yield k - 1 indices.)
    sorted_order = np.argsort(preds)[::-1][: self.sequences_batch_size]

    return new_seqs[sorted_order], preds[sorted_order]