class PEPG:
    '''Extension of PEPG with bells and whistles.'''

    def __init__(
            self,
            num_params,  # number of model parameters
            sigma_init=0.10,  # initial standard deviation
            sigma_alpha=0.20,  # learning rate for std
            sigma_decay=0.999,  # anneal standard deviation
            sigma_limit=0.01,  # stop annealing if less than this
            sigma_max_change=0.2,  # clips adaptive sigma to 20%
            learning_rate=0.01,  # learning rate for the mean
            learning_rate_decay=0.9999,  # annealing the learning rate
            learning_rate_limit=0.01,  # stop annealing learning rate
            elite_ratio=0,  # if > 0 then ignore learning_rate
            pop_size=256,  # population size
            average_baseline=True,  # set baseline to average
            weight_decay=0.01,  # weight decay coefficient
            rank_fitness=True,  # use rank rather than fitness
            forget_best=True):  # don't keep the historical best solution
        self.num_params = num_params
        self.sigma_init = sigma_init
        self.sigma_alpha = sigma_alpha
        self.sigma_decay = sigma_decay
        self.sigma_limit = sigma_limit
        self.sigma_max_change = sigma_max_change
        self.learning_rate = learning_rate
        self.learning_rate_decay = learning_rate_decay
        self.learning_rate_limit = learning_rate_limit
        self.pop_size = pop_size
        self.average_baseline = average_baseline
        if self.average_baseline:
            assert (self.pop_size % 2 == 0), "Population size must be even"
            self.pop_size = int(self.pop_size / 2)
        else:
            assert (self.pop_size & 1), "Population size must be odd"
            self.pop_size = int((self.pop_size - 1) / 2)

        # option to use greedy es method to select next mu,
        # rather than using drift param
        self.elite_ratio = elite_ratio
        self.elite_pop_size = int(self.pop_size * self.elite_ratio)
        self.use_elite = False
        if self.elite_pop_size > 0:
            self.use_elite = True

        self.forget_best = forget_best
        self.batch_reward = np.zeros(self.pop_size * 2)
        self.mu = np.zeros(self.num_params)
        self.sigma = np.ones(self.num_params) * self.sigma_init
        self.curr_best_mu = np.zeros(self.num_params)
        self.best_mu = np.zeros(self.num_params)
        self.best_reward = 0
        self.first_iteration = True
        self.weight_decay = weight_decay
        self.rank_fitness = rank_fitness
        if self.rank_fitness:
            self.forget_best = True  # always forget the best one if we rank

        # choose optimizer
        self.optimizer = Adam(self, learning_rate)

    def rms_stdev(self):
        sigma = self.sigma
        return np.mean(np.sqrt(sigma * sigma))

    def ask(self):
        '''Returns a list of parameters.'''
        # antithetic sampling
        self.epsilon = np.random.randn(self.pop_size, self.num_params)
        self.epsilon *= self.sigma.reshape(1, self.num_params)
        self.epsilon_full = np.concatenate([self.epsilon, -self.epsilon])
        if self.average_baseline:
            epsilon = self.epsilon_full
        else:
            # first population member is mu, then positive epsilon,
            # then negative epsilon
            epsilon = np.concatenate(
                [np.zeros((1, self.num_params)), self.epsilon_full])
        solutions = self.mu.reshape(1, self.num_params) + epsilon
        self.solutions = solutions
        return solutions

    def tell(self, scores):
        # input must be a numpy float array with one reward per solution
        assert (len(scores) == self.solutions.shape[0]
                ), "Inconsistent reward_table size reported."

        reward_table = np.array(scores)
        if self.rank_fitness:
            reward_table = compute_centered_ranks(reward_table)
        if self.weight_decay > 0:
            l2_decay = compute_weight_decay(self.weight_decay, self.solutions)
            reward_table += l2_decay

        reward_offset = 1
        if self.average_baseline:
            b = np.mean(reward_table)
            reward_offset = 0
        else:
            b = reward_table[0]  # baseline

        reward = reward_table[reward_offset:]
        if self.use_elite:
            idx = np.argsort(reward)[::-1][0:self.elite_pop_size]
        else:
            idx = np.argsort(reward)[::-1]

        best_reward = reward[idx[0]]
        if (best_reward > b or self.average_baseline):
            best_mu = self.mu + self.epsilon_full[idx[0]]
            best_reward = reward[idx[0]]
        else:
            best_mu = self.mu
            best_reward = b

        self.curr_best_reward = best_reward
        self.curr_best_mu = best_mu

        if self.first_iteration:
            self.sigma = np.ones(self.num_params) * self.sigma_init
            self.first_iteration = False
            self.best_reward = self.curr_best_reward
            self.best_mu = best_mu
        else:
            if self.forget_best or (self.curr_best_reward > self.best_reward):
                self.best_mu = best_mu
                self.best_reward = self.curr_best_reward

        # short hand
        epsilon = self.epsilon
        sigma = self.sigma

        # update the mean
        # move mean to the average of the best idx means
        if self.use_elite:
            self.mu += self.epsilon_full[idx].mean(axis=0)
        else:
            rT = (reward[:self.pop_size] - reward[self.pop_size:])
            change_mu = np.dot(rT, epsilon)
            self.optimizer.stepsize = self.learning_rate
            # adam, rmsprop, momentum, etc.
            update_ratio = self.optimizer.update(-change_mu)
            # self.mu += (change_mu * self.learning_rate)  # normal SGD method

        # adaptive sigma
        # normalization
        if (self.sigma_alpha > 0):
            stdev_reward = 1.0
            if not self.rank_fitness:
                stdev_reward = reward.std()
            S = epsilon * epsilon - (sigma * sigma).reshape(1, self.num_params)
            S /= sigma.reshape(1, self.num_params)
            reward_avg = (reward[:self.pop_size] + reward[self.pop_size:]) / 2.0
            rS = reward_avg - b
            delta_sigma = (np.dot(rS, S)) / (2 * self.pop_size * stdev_reward)

            # adjust sigma according to the adaptive sigma calculation
            # for stability, don't let sigma move more than
            # sigma_max_change (20% by default) of its original value
            change_sigma = self.sigma_alpha * delta_sigma
            change_sigma = np.minimum(change_sigma,
                                      self.sigma_max_change * self.sigma)
            change_sigma = np.maximum(change_sigma,
                                      -self.sigma_max_change * self.sigma)
            self.sigma += change_sigma

        if (self.sigma_decay < 1):
            self.sigma[self.sigma > self.sigma_limit] *= self.sigma_decay

        if (self.learning_rate_decay < 1
                and self.learning_rate > self.learning_rate_limit):
            self.learning_rate *= self.learning_rate_decay

    def current_param(self):
        return self.curr_best_mu

    def set_mu(self, mu):
        self.mu = np.array(mu)

    def best_param(self):
        return self.best_mu

    def result(self):
        # return best params so far, along with historically
        # best reward, current reward, sigma
        return (self.best_mu, self.best_reward, self.curr_best_reward,
                self.sigma)
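
# A minimal usage sketch of the PEPG ask/tell interface above, assuming numpy
# is available as np (as the class itself requires) and using a toy quadratic
# objective in place of a real fitness function; `toy_fitness` and the helper
# below are illustrative names, not part of any existing API.
def _pepg_usage_example(num_params=10, generations=20):
    def toy_fitness(params):
        # reward is higher the closer the parameters are to zero
        return -np.sum(params * params)

    es = PEPG(num_params, pop_size=256)
    for _ in range(generations):
        solutions = es.ask()  # one row per candidate (antithetic pairs)
        rewards = np.array([toy_fitness(s) for s in solutions])
        es.tell(rewards)
    best_params, best_reward, curr_reward, sigma = es.result()
    return best_params, best_reward
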
class OpenES:
    """Basic version of OpenAI Evolution Strategies."""

    def __init__(
            self,
            num_params,  # number of model parameters
            mu_init=None,  # initial mean
            sigma_init=1,  # initial standard deviation
            sigma_decay=0.999,  # anneal standard deviation
            sigma_limit=0.01,  # stop annealing if less than this
            learning_rate=0.01,  # learning rate for the mean
            learning_rate_decay=0.9999,  # annealing the learning rate
            learning_rate_limit=0.001,  # stop annealing learning rate
            pop_size=256,  # population size
            antithetic=False,  # whether to use antithetic sampling
            weight_decay=0.01,  # weight decay coefficient
            rank_fitness=True,  # use rank rather than fitness
            forget_best=True):  # forget historical best

        # misc
        self.num_params = num_params
        self.first_iteration = True

        # distribution parameters
        if mu_init is None:
            self.mu = np.zeros(self.num_params)
        else:
            self.mu = np.array(mu_init)
        self.sigma_decay = sigma_decay
        self.sigma = sigma_init
        self.sigma_init = sigma_init
        self.sigma_limit = sigma_limit

        # optimization stuff
        self.learning_rate = learning_rate
        self.learning_rate_decay = learning_rate_decay
        self.learning_rate_limit = learning_rate_limit
        self.optimizer = Adam(self, learning_rate)

        # sampling stuff
        self.pop_size = pop_size
        self.antithetic = antithetic
        if self.antithetic:
            assert (self.pop_size % 2 == 0), "Population size must be even"
        self.forget_best = forget_best

        self.weight_decay = weight_decay
        self.rank_fitness = rank_fitness
        if self.rank_fitness:
            self.forget_best = True

    def ask(self, pop_size):
        """Returns a list of candidate parameters."""
        if self.antithetic:
            epsilon_half = np.random.randn(pop_size // 2, self.num_params)
            epsilon = np.concatenate([epsilon_half, -epsilon_half])
        else:
            epsilon = np.random.randn(pop_size, self.num_params)
        return self.mu.reshape(1, self.num_params) + epsilon * self.sigma

    def tell(self, solutions, scores):
        """Updates the distribution."""
        assert (len(scores) == self.pop_size
                ), "Inconsistent reward_table size reported."

        reward = np.array(scores)
        if self.rank_fitness:
            reward = compute_centered_ranks(reward)
        if self.weight_decay > 0:
            l2_decay = compute_weight_decay(self.weight_decay, solutions)
            reward += l2_decay

        # TBD check if ok
        epsilon = (solutions - self.mu.reshape(1, self.num_params)) / self.sigma

        # standardize the rewards to have a gaussian distribution
        normalized_reward = (reward - np.mean(reward)) / np.std(reward)
        change_mu = 1. / (self.pop_size * self.sigma) * \
            np.dot(epsilon.T, normalized_reward)

        # updating stuff
        idx = np.argsort(reward)[::-1]
        best_reward = reward[idx[0]]
        best_mu = solutions[idx[0]]
        self.curr_best_reward = best_reward
        self.curr_best_mu = best_mu

        if self.first_iteration:
            self.first_iteration = False
            self.best_reward = self.curr_best_reward
            self.best_mu = best_mu
        else:
            if self.forget_best or (self.curr_best_reward > self.best_reward):
                self.best_mu = best_mu
                self.best_reward = self.curr_best_reward

        # optimization step
        self.optimizer.stepsize = self.learning_rate
        self.optimizer.update(-change_mu)

        # anneal sigma and the learning rate
        if (self.sigma > self.sigma_limit):
            self.sigma *= self.sigma_decay
        if (self.learning_rate > self.learning_rate_limit):
            self.learning_rate *= self.learning_rate_decay

    def get_distrib_params(self):
        """Returns the parameters of the distribution: the mean and sigma."""
        return self.mu, self.sigma

    def result(self):
        """Returns best params so far, best score, current score and sigma."""
        return (self.best_mu, self.best_reward, self.curr_best_reward,
                self.sigma)

    def rms_stdev(self):
        sigma = self.sigma
        return np.mean(np.sqrt(sigma * sigma))
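
# A comparable sketch for OpenES, again with a toy objective and hypothetical
# helper names; note that ask() here takes the population size explicitly and
# tell() receives both the sampled solutions and their scores.
def _openes_usage_example(num_params=10, generations=20):
    def toy_fitness(params):
        return -np.sum(params * params)

    es = OpenES(num_params, pop_size=64)
    for _ in range(generations):
        solutions = es.ask(es.pop_size)
        rewards = np.array([toy_fitness(s) for s in solutions])
        es.tell(solutions, rewards)
    best_params, best_reward, curr_reward, sigma = es.result()
    return best_params, best_reward
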
class Parameters:
    def __init__(self, population_size=1, sigma=0, alpha=0, filename=''):
        self.population_size = population_size
        self.sigma = sigma
        self.alpha = alpha
        self.optimizer = Adam()
        if filename:
            npz = np.load(filename)
            self.F1 = Param(npz['arr_0'], population_size, sigma)
            self.F2 = Param(npz['arr_1'], population_size, sigma)
            self.F3 = Param(npz['arr_2'], population_size, sigma)
            self.F4 = Param(npz['arr_3'], population_size, sigma)
            self.F5 = Param(npz['arr_4'], population_size, sigma)
            self.F6 = Param(npz['arr_5'], population_size, sigma)
            self.g3 = Param(npz['arr_6'], population_size, sigma)
            self.b3 = Param(npz['arr_7'], population_size, sigma)
            self.g4 = Param(npz['arr_8'], population_size, sigma)
            self.b4 = Param(npz['arr_9'], population_size, sigma)
            self.g5 = Param(npz['arr_10'], population_size, sigma)
            self.b5 = Param(npz['arr_11'], population_size, sigma)
            self.g6 = Param(npz['arr_12'], population_size, sigma)
            self.b6 = Param(npz['arr_13'], population_size, sigma)
            self.Wx0 = Param(npz['arr_14'], population_size, sigma)
            self.bx0 = Param(npz['arr_15'], population_size, sigma)
            self.Wx1 = Param(npz['arr_16'], population_size, sigma)
            self.bx1 = Param(npz['arr_17'], population_size, sigma)
            self.Wx2 = Param(npz['arr_18'], population_size, sigma)
            self.bx2 = Param(npz['arr_19'], population_size, sigma)
            self.Wv = Param(npz['arr_20'], population_size, sigma)
            self.bv = Param(npz['arr_21'], population_size, sigma)
            self.lg0 = Param(npz['arr_22'], population_size, sigma)
            self.lb0 = Param(npz['arr_23'], population_size, sigma)
            self.lg1 = Param(npz['arr_24'], population_size, sigma)
            self.lb1 = Param(npz['arr_25'], population_size, sigma)
            self.lg2 = Param(npz['arr_26'], population_size, sigma)
            self.lb2 = Param(npz['arr_27'], population_size, sigma)
        else:
            # filter weight is whdo
            # w = width
            # h = height
            # d = depth (in channels)
            # o = out depth (out channels)?
            self.F1 = Param(
                tf.random.normal([F_size, F_size, 3, NF1_out],
                                 stddev=m.sqrt(2 / F_size)),
                population_size, sigma)
            self.F2 = Param(
                tf.random.normal([F_size, F_size, NF1_out, NF2_out],
                                 stddev=m.sqrt(2 / F_size)),
                population_size, sigma)
            self.F3 = Param(
                tf.random.normal([F_size, F_size, NF2_out, NF3_out],
                                 stddev=m.sqrt(2 / F_size)),
                population_size, sigma)
            self.g3 = Param(tf.ones((NF3_out, 1)), population_size, sigma)
            self.b3 = Param(tf.zeros((NF3_out, 1)), population_size, sigma)
            self.F4 = Param(
                tf.random.normal([F_size, F_size, NF3_out, NF4_out],
                                 stddev=m.sqrt(2 / F_size)),
                population_size, sigma)
            self.g4 = Param(tf.ones((NF4_out, 1)), population_size, sigma)
            self.b4 = Param(tf.zeros((NF4_out, 1)), population_size, sigma)
            self.F5 = Param(
                tf.random.normal([F_size, F_size, NF4_out, NF5_out],
                                 stddev=m.sqrt(2 / F_size)),
                population_size, sigma)
            self.g5 = Param(tf.ones((NF5_out, 1)), population_size, sigma)
            self.b5 = Param(tf.zeros((NF5_out, 1)), population_size, sigma)
            self.F6 = Param(
                tf.random.normal([F_size, F_size, NF5_out, NF6_out],
                                 stddev=m.sqrt(2 / F_size)),
                population_size, sigma)
            self.g6 = Param(tf.ones((NF6_out, 1)), population_size, sigma)
            self.b6 = Param(tf.zeros((NF6_out, 1)), population_size, sigma)
            self.lg0 = Param(tf.ones((H_size, 1)), population_size, sigma)
            self.lb0 = Param(tf.zeros((H_size, 1)), population_size, sigma)
            self.lg1 = Param(tf.ones((H_size, 1)), population_size, sigma)
            self.lb1 = Param(tf.zeros((H_size, 1)), population_size, sigma)
            self.lg2 = Param(tf.ones((H_size, 1)), population_size, sigma)
            self.lb2 = Param(tf.zeros((H_size, 1)), population_size, sigma)
            self.Wx0 = Param(tf.random.normal([H_size * 4, z_size]),
                             population_size, sigma)
            self.bx0 = Param(tf.zeros([H_size * 4, 1]), population_size, sigma)
            self.Wx1 = Param(tf.random.normal([H_size * 4, H_size * 2]),
                             population_size, sigma)
            self.bx1 = Param(tf.zeros([H_size * 4, 1]), population_size, sigma)
            self.Wx2 = Param(tf.random.normal([H_size * 4, H_size * 2]),
                             population_size, sigma)
            self.bx2 = Param(tf.zeros([H_size * 4, 1]), population_size, sigma)
            self.Wv = Param(tf.random.normal([Y_size, H_size]),
                            population_size, sigma)
            self.bv = Param(tf.zeros([Y_size, 1]), population_size, sigma)

    def all(self):
        return [self.F1, self.F2, self.F3, self.F4, self.F5, self.F6,
                self.g3, self.b3, self.g4, self.b4, self.g5, self.b5,
                self.g6, self.b6,
                self.Wx0, self.bx0, self.Wx1, self.bx1, self.Wx2, self.bx2,
                self.Wv, self.bv,
                self.lg0, self.lb0, self.lg1, self.lb1, self.lg2, self.lb2]

    # return reference to current tensors
    def current(self):
        return [param.current for param in self.all()]

    def set_current_population_member(self, i):
        for param in self.all():
            param.set_current_population_member(i)

    def update_nes(self, reward, reward_mean, reward_std):
        reward = (reward - reward_mean) / (reward_std + .00001)
        grads = []
        means = []
        for param in self.all():
            grads += [
                param.get_grad(reward) *
                (self.alpha / (self.population_size * self.sigma))
            ]
            means += [param.mean]
        self.optimizer.update(means, grads)
        for param in self.all():
            param.gen_pop_about_mean(self.sigma)

    def mutate(self, param, i):
        x = param.population[i]
        if random.randint(1, 4) == 1:
            jitter = tf.random.normal(x.shape, stddev=self.sigma)
            return x + jitter
        else:
            return x

    def mate(self, param, i, j):
        if random.randint(1, 4) == 1:
            return self.mutate(param, i)
        else:
            return self.mutate(param, j)

    def update_ga(self, rewards):
        # sort parameters by rewards
        top_reward_indices = rewards.argsort()[-PASS_THROUGH:]
        top_reward_indices = top_reward_indices[::-1]
        for param in self.all():
            # sort population
            for i, j in enumerate(top_reward_indices):
                param.population[i] = param.population[j]
            # generate new population
            for k in range(PASS_THROUGH, self.population_size):
                param.population[k] = self.mate(param, random.randint(0, 9),
                                                random.randint(0, 9))
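
# A hedged sketch of how the Parameters population above is presumably driven.
# It assumes a user-supplied `rollout` function (hypothetical) that evaluates
# the currently selected population member and returns a scalar reward, and it
# relies on the Param/Adam classes and global sizes defined elsewhere in this
# file.
def _parameters_training_step(params, rollout, use_ga=False):
    rewards = np.zeros(params.population_size)
    for i in range(params.population_size):
        params.set_current_population_member(i)  # activate member i's tensors
        rewards[i] = rollout(params.current())
    if use_ga:
        # keep the top PASS_THROUGH members and refill the rest by mate/mutate
        params.update_ga(rewards)
    else:
        # NES-style update of the parameter means, then resample the population
        params.update_nes(rewards, rewards.mean(), rewards.std())
    return rewards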