def optimize(self):
    """Move the center along the rank-based gradient estimate using Adam.

    Reads ``self.samplefitness`` (one entry per symmetric offspring, the
    pair of sample ``k`` being stored at positions ``2k`` and ``2k + 1``),
    ``self.samples`` and ``self.center``.  Updates ``self.avgfit``,
    ``self.bfit``, ``self.bestsol``, the Adam moments ``self.m`` /
    ``self.v``, ``self.center`` and ``self.avecenter``.
    """
    pop_size = self.batchSize * 2
    last = pop_size - 1

    # ascending sort: the best offspring ends up in the last position
    sorted_fitness, order = ascendent_sort(self.samplefitness)
    self.avgfit = np.average(sorted_fitness)
    self.bfit = sorted_fitness[last]

    # regenerate the genotype of the best offspring: even ids are
    # center + noise, odd ids are center - noise
    best_offspring = order[last]
    best_noise = self.samples[int(best_offspring / 2)] * self.noiseStdDev
    if best_offspring % 2:
        self.bestsol = self.center - best_noise
    else:
        self.bestsol = self.center + best_noise

    # only the process with rank 0 stores the best solution found so far
    if self.rank == 0:
        self.updateBest(self.bfit, self.bestsol)

    # utilities: fitness ranks rescaled to the range [-0.5, 0.5]
    utilities = zeros(pop_size)
    for position in range(pop_size):
        utilities[order[position]] = position
    utilities /= last
    utilities -= 0.5

    # merge the utilities of the two symmetric offspring of each sample
    weights = utilities[0::2] - utilities[1::2]

    # gradient = weights . samples, accumulated over blocks of at most
    # 500 samples
    gradient = 0.0
    for first in range(0, self.batchSize, 500):
        stop = min(first + 500, self.batchSize)
        gradient += dot(weights[first:stop], self.samples[first:stop, :])
    gradient /= pop_size  # normalize the gradient for the popsize

    if self.wdecay == 1:
        descent = -gradient + 0.005 * self.center  # apply weight decay
    else:
        descent = -gradient

    # adam stochastic optimizer
    bias_corrected_step = (
        self.stepsize
        * sqrt(1.0 - self.beta2**self.cgen)
        / (1.0 - self.beta1**self.cgen)
    )
    self.m = self.beta1 * self.m + (1.0 - self.beta1) * descent
    self.v = self.beta2 * self.v + (1.0 - self.beta2) * (descent * descent)
    center_shift = -bias_corrected_step * self.m / (sqrt(self.v) + self.epsilon)
    self.center += center_shift  # move the center along the momentum vectors

    self.avecenter = np.average(np.absolute(self.center))
def evaluate(self):
    """Evaluate the symmetric offspring of one generation, then post-evaluate the best.

    For every sample two candidates are tested, ``center + noise`` and
    ``center - noise``; their rewards are stored in ``self.samplefitness``
    at positions ``2 * b`` and ``2 * b + 1``.  The best offspring is
    rebuilt into ``self.bestsol``, reported through ``self.updateBest`` and
    then post-evaluated for ``self.policy.nttrials`` episodes, whose
    average reward is reported through ``self.updateBestg``.

    Side effects: advances ``self.cgen``, replaces ``self.rs`` and
    ``self.samples``, accumulates ``self.steps`` and updates the
    normalization counters.
    """
    # Set the seed for the current generation
    # (master and workers have the same seed)
    cseed = self.seed + self.cgen * self.batchSize
    self.rs = np.random.RandomState(cseed)
    self.samples = self.rs.randn(self.batchSize, self.nparams)
    self.cgen += 1

    # evaluate samples
    for b in range(self.batchSize):
        perturbation = self.samples[b, :] * self.noiseStdDev
        for bb in range(2):
            if bb == 0:
                candidate = self.center + perturbation
            else:
                candidate = self.center - perturbation
            self.policy.set_trainable_flat(candidate)
            # normalization data is collected during the post-evaluation
            # of the best sample of the previous generation
            self.policy.nn.normphase(0)
            eval_rews, eval_length = self.policy.rollout(
                self.policy.ntrials,
                seed=(self.seed + (self.cgen * self.batchSize) + b))
            self.samplefitness[b * 2 + bb] = eval_rews
            self.steps += eval_length

    # ascending sort: the best offspring is in the last position
    last = (self.batchSize * 2) - 1
    fitness, self.index = ascendent_sort(self.samplefitness)
    self.avgfit = np.average(fitness)

    # regenerate the genotype of the best offspring
    self.bfit = fitness[last]
    bidx = self.index[last]
    best_noise = self.samples[int(bidx / 2)] * self.noiseStdDev
    if (bidx % 2) == 0:
        self.bestsol = self.center + best_noise
    else:
        self.bestsol = self.center - best_noise

    # Stored if it is the best obtained so far
    self.updateBest(self.bfit, self.bestsol)

    # postevaluate best sample of the last generation
    # in openaiesp.py this is done the next generation, move this section
    # before the section "evaluate samples" to produce identical results
    gfit = 0
    if self.bestsol is not None:
        self.policy.set_trainable_flat(self.bestsol)
        self.tnormepisodes += self.inormepisodes
        for t in range(self.policy.nttrials):
            if self.policy.normalize == 1 and self.normepisodes < self.tnormepisodes:
                # we collect normalization data
                self.policy.nn.normphase(1)
                self.normepisodes += 1
                self.normalizationdatacollected = True
            else:
                self.policy.nn.normphase(0)
            eval_rews, eval_length = self.policy.rollout(
                1, seed=(self.seed + 100000 + t))
            gfit += eval_rews
            self.steps += eval_length
        # With no post-evaluation trials there is nothing to average or
        # report; the unguarded division raised ZeroDivisionError.
        if self.policy.nttrials > 0:
            gfit /= self.policy.nttrials
            self.updateBestg(gfit, self.bestsol)
def run(self, maxsteps):
    """Run the evolutionary process on two behaviors until ``maxsteps`` steps are used.

    Each generation the symmetric offspring (``center +/- noise``) are
    evaluated twice, once per behavior (the behavior id is passed to
    ``rollout`` as ``timestep_limit``).  A rank-based gradient is computed
    for each behavior, the two gradients are summed and the center is
    moved with Adam.  The offspring with the best fitness averaged over
    the two behaviors is post-evaluated for generalization when
    ``self.policy.nttrials > 0``.

    Args:
        maxsteps: evaluation-step budget; the main loop stops once the
            number of consumed steps reaches it.
    """
    start_time = time.time()

    # initialize the solution center
    center = self.policy.get_trainable_flat()
    # Extract the number of parameters
    nparams = self.policy.nparams
    # setting parameters
    batchSize = self.batchSize
    if batchSize == 0:
        # 4 + floor(3 * log(N))
        batchSize = int(4 + math.floor(3 * math.log(nparams)))
    popsize = batchSize * 2
    last = popsize - 1

    ceval = 0  # current evaluation
    cgen = 0  # current generation
    # Parameters for Adam policy
    m = zeros(nparams)
    v = zeros(nparams)
    epsilon = 1e-08  # To avoid numerical issues with division by zero...
    beta1 = 0.9
    beta2 = 0.999

    # RandomState used to extract the samples
    rs = np.random.RandomState(self.seed)

    fitbestsample = [0, 0]

    print(
        "Salimans2: seed %d maxmsteps %d batchSize %d stepsize %lf noiseStdDev %lf wdecay %d sameEnvCond %d nparams %d"
        % (self.seed, maxsteps / 1000000, batchSize, self.stepsize,
           self.noiseStdDev, self.wdecay, self.sameenvcond, nparams))

    def save_progress():
        """Save best, bestg and stat, then rewrite the one-line summary file."""
        self.save(cgen, ceval, centroidfit, center, fitness[last],
                  (time.time() - start_time))
        fname = self.filedir + "/S" + str(self.seed) + ".fit"
        # the context manager closes the file even if the write fails
        with open(fname, "w") as fp:
            fp.write(
                'Seed %d gen %d msteps %d bestfit %.2f bestgfit %.2f bestsam %.2f (%.2f %.2f) avg %.2f weightsize %.2f gradientcorr %.2f \n'
                % (self.seed, cgen, ceval / 1000000, self.bestfit,
                   self.bestgfit, fitness2[last], fitbestsample[0],
                   fitbestsample[1], np.average(fitness2),
                   np.average(np.absolute(center)), corr[0]))

    # main loop
    elapsed = 0
    while ceval < maxsteps:
        cgen += 1

        # Extract half samples from Gaussian distribution with mean 0.0
        # and standard deviation 1.0
        samples = rs.randn(batchSize, nparams)
        # fitness on the current behavior, and fitness2 accumulating the
        # average over the two behaviors
        fitness = zeros(popsize)
        fitness2 = zeros(popsize)
        # If normalize=1 we update the normalization vectors
        if self.policy.normalize == 1:
            self.policy.nn.updateNormalizationVectors()
        # Reset environmental seed every generation
        self.env.seed(self.policy.get_seed + cgen)
        self.policy.nn.seed(self.policy.get_seed + cgen)

        # Evaluate offspring 2 times (on behavior 1 and 2)
        g1 = 0.0
        g2 = 0.0
        for beh in range(2):
            for b in range(batchSize):
                for bb in range(2):
                    if bb == 0:
                        candidate = center + samples[b, :] * self.noiseStdDev
                    else:
                        candidate = center - samples[b, :] * self.noiseStdDev
                    # Set policy parameters
                    self.policy.set_trainable_flat(candidate)
                    # Evaluate the offspring
                    # NOTE(review): the seed uses self.batchSize, which is 0
                    # when the default population size is computed above;
                    # confirm whether the local batchSize was intended.
                    eval_rews, eval_length, _rews1, _rews2 = self.policy.rollout(
                        self.policy.ntrials,
                        seed=(self.seed + (cgen * self.batchSize) + b),
                        timestep_limit=beh)
                    # store the fitness
                    fitness[b * 2 + bb] = eval_rews
                    fitness2[b * 2 + bb] += (eval_rews / 2.0)
                    # Update the number of evaluations
                    ceval += eval_length

            # Sort by fitness (ascending: the best is in the last position)
            fitness, index = ascendent_sort(fitness)
            fitbestsample[beh] = fitness[last]
            # Now we must compute the symmetric weights in the range [-0.5,0.5]
            utilities = zeros(popsize)
            for i in range(popsize):
                utilities[index[i]] = i
            utilities /= last
            utilities -= 0.5
            # Now we assign the weights to the samples (pos - neg)
            weights = utilities[0::2] - utilities[1::2]

            # weights * samples, accumulated over blocks of at most 500 samples
            grad = 0.0
            for first in range(0, batchSize, 500):
                stop = min(first + 500, batchSize)
                grad += dot(weights[first:stop], samples[first:stop, :])
            grad /= popsize
            if beh == 0:
                g1 = grad
            else:
                g2 = grad

        # sum the gradient computed on behavior 1 and 2
        glob = g1 + g2
        # Weight decay
        if self.wdecay == 1:
            globalg = -glob + 0.005 * center
        else:
            globalg = -glob

        # Sort by using the fitness averaged over the two behaviors
        fitness2, index = ascendent_sort(fitness2)

        centroidfit = 0  # the centroid is not evaluated by this variant
        if self.policy.nttrials > 0:
            bestsamid = index[last]
            bestid = int(bestsamid / 2)
            if (bestsamid % 2) == 0:
                candidate = center + samples[bestid] * self.noiseStdDev
            else:
                candidate = center - samples[bestid] * self.noiseStdDev
            # Update data if the current offspring is better than current
            # best.  fitness2 is sorted here, so the fitness of the best
            # sample is the last element, not the element at its
            # pre-sort id.
            self.updateBest(fitness2[last], candidate)
            # post-evaluate the best sample to compute the generalization
            self.env.seed(self.policy.get_seed + 100000)
            self.policy.nn.seed(self.policy.get_seed + 100000)
            self.policy.set_trainable_flat(candidate)
            eval_rews, eval_length, _rews1, _rews2 = self.policy.rollout(
                self.policy.nttrials, timestep_limit=2, post_eval=True)
            gfit = eval_rews
            ceval += eval_length
            # eventually store the new best generalization individual
            self.updateBestg(gfit, candidate)

        # ADAM policy
        # Compute how much the center moves
        a = self.stepsize * sqrt(1.0 - beta2**cgen) / (1.0 - beta1**cgen)
        m = beta1 * m + (1.0 - beta1) * globalg
        v = beta2 * v + (1.0 - beta2) * (globalg * globalg)
        dCenter = -a * m / (sqrt(v) + epsilon)
        # update center
        center += dCenter

        # Compute the elapsed time (i.e., how much time the generation lasted)
        elapsed = (time.time() - start_time)

        # Update information
        self.updateInfo(cgen, ceval, fitness, center, centroidfit,
                        fitness[last], elapsed, maxsteps)

        # correlation between the gradients of the two behaviors
        corr = stats.pearsonr(g1, g2)

        # progress is printed as a real percentage (the fraction was
        # previously printed under the '%' label)
        print(
            'Seed %d (%.1f%%) gen %d msteps %d bestfit %.2f bestgfit %.2f bestsam %.2f (%.1f %.1f) avg %.2f weightsize %.2f gradientcorr %.2f'
            % (self.seed, ceval / float(maxsteps) * 100, cgen,
               ceval / 1000000, self.bestfit, self.bestgfit, fitness2[last],
               fitbestsample[0], fitbestsample[1], np.average(fitness2),
               np.average(np.absolute(center)), corr[0]))

        # Periodically save best, bestg, stat and the summary statistics
        if self.saveeachg > 0 and cgen > 0:
            if (cgen % self.saveeachg) == 0:
                save_progress()

    # final save of best, bestg, stat and the summary statistics
    save_progress()

    # print simulation time
    end_time = time.time()
    print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
def runphase(self, sind, nparams):
    """Evolve the selected individual ``sind`` for 20 iterations against the selected competitors.

    Every iteration the current genotype ``self.selp[sind]`` is evaluated
    against the ``self.selsize`` competitors in ``self.selcomp`` and its
    average reward is printed.  Symmetric offspring are then evaluated the
    same way and the genotype is moved with an Adam-like update whose
    moments are kept in ``self.selm[sind]`` / ``self.selv[sind]``.  After
    the last iteration the genotype is evaluated once more and the result
    is printed.

    Args:
        sind: index of the evolving individual in ``self.selp``.
        nparams: number of parameters of the evolving individual.
    """
    epsilon = 1e-08
    beta1 = 0.9
    beta2 = 0.999
    popsize = self.batchSize * 2

    def average_reward(genotype, sanity_check=False):
        """Return the mean reward of ``genotype`` against the selected competitors.

        When ``self.evopop`` is 0 the genotype is placed first in the
        joined parameter vector and the reward is used as is; otherwise it
        is placed second and the reward is ``1.0 - eval_rews``.  With
        ``sanity_check`` set, each reward is compared with the matching
        entry of ``self.fmatrix`` and a warning is printed on mismatch.
        """
        total = 0
        for k in range(self.selsize):
            if self.evopop == 0:
                self.policy.set_trainable_flat(
                    np.concatenate((genotype, self.selcomp[k])))
                eval_rews, _ = self.policy.rollout(1, timestep_limit=1000)
                if sanity_check and eval_rews != self.fmatrix[self.seli[sind], self.selc[k]]:
                    print("warning: sanity check failed")
                total += eval_rews
            else:
                self.policy.set_trainable_flat(
                    np.concatenate((self.selcomp[k], genotype)))
                eval_rews, _ = self.policy.rollout(1, timestep_limit=1000)
                if sanity_check and eval_rews != self.fmatrix[self.selc[k], self.seli[sind]]:
                    print("warning: sanity check failed")
                total += (1.0 - eval_rews)
        return total / float(self.selsize)

    for it in range(20):
        # evaluate the centroid (with a sanity check on the first iteration)
        ave_rews = average_reward(self.selp[sind], sanity_check=(it == 0))
        if it == 0:
            print("evopop %d ind %2d : " % (self.evopop, self.seli[sind]), end='')
        print("%.2f " % (ave_rews), end='')

        # Extract half samples from Gaussian distribution with mean 0.0
        # and standard deviation 1.0
        samples = self.rs.randn(self.batchSize, nparams)
        fitness = zeros(popsize)
        # Evaluate offspring
        for b in range(self.batchSize):
            perturbation = samples[b, :] * self.noiseStdDev
            for bb in range(2):
                # the candidate buffer is filled in one vectorized
                # assignment instead of one Python iteration per gene
                if bb == 0:
                    self.candidate[:nparams] = self.selp[sind][:nparams] + perturbation
                else:
                    self.candidate[:nparams] = self.selp[sind][:nparams] - perturbation
                fitness[b * 2 + bb] = average_reward(self.candidate)

        # Sort by fitness (ascending)
        fitness, index = ascendent_sort(fitness)
        # Now we must compute the symmetric weights in the range [-0.5,0.5]
        utilities = zeros(popsize)
        for i in range(popsize):
            utilities[index[i]] = i
        utilities /= (popsize - 1)
        utilities -= 0.5
        # Now we assign the weights to the samples (pos - neg)
        weights = utilities[0::2] - utilities[1::2]

        # Compute the gradient (weights * samples) over blocks of at most
        # 500 samples
        g = 0.0
        for first in range(0, self.batchSize, 500):
            stop = min(first + 500, self.batchSize)
            g += dot(weights[first:stop], samples[first:stop, :])
        # Normalization over the number of samples
        g /= popsize
        # Weight decay
        if self.wdecay == 1:
            globalg = -g + 0.005 * self.selp[sind]
        else:
            globalg = -g

        # ADAM stochastic optimizer (bias correction is not implemented)
        a = self.stepsize
        self.selm[sind] = beta1 * self.selm[sind] + (1.0 - beta1) * globalg
        self.selv[sind] = beta2 * self.selv[sind] + (1.0 - beta2) * (globalg * globalg)
        dCenter = -a * self.selm[sind] / (sqrt(self.selv[sind]) + epsilon)
        # update center
        self.selp[sind] += dCenter

    # evaluate the evolving individual at the end of the evolution phase
    ave_rews = average_reward(self.selp[sind])
    print("%.2f" % (ave_rews))
def run(self, maxsteps):
    """Run the evolutionary process while training ``net_2`` on centroid rollouts.

    Each generation the symmetric offspring (``center +/- noise``) and the
    centroid are evaluated, the centroid or the best offspring is
    post-evaluated for generalization when ``self.policy.nttrials > 0``,
    and the center is moved with Adam along the rank-based gradient.
    The observations and ``net_1`` outputs returned by the centroid
    rollout are kept in a bounded buffer on which ``net_2`` is trained for
    ``epochs`` epochs every generation.

    When ``self.fromgeneration > 0`` the centroid, the Adam moments, the
    statistics and (if normalization is on) the normalization vector are
    reloaded from the files written by a previous run.

    Args:
        maxsteps: evaluation-step budget; the main loop stops once the
            number of consumed steps reaches it.
    """
    start_time = time.time()

    ##Osipov########################################
    # Hyperparameters for our network
    number_of_inputs = 3
    number_of_hiddens = 50
    number_of_outputs = 5
    batch_size_train = 32
    lr = 0.001  # Learning rate
    epochs = 100
    # initialize two networks with xavier initialization
    net_1 = NET_1(number_of_inputs, number_of_outputs, number_of_hiddens)
    net_2 = NET_2(number_of_inputs, number_of_outputs, number_of_hiddens)
    # fix the parameters of net_1
    for param in net_1.parameters():
        param.requires_grad = False
    # Loss for backpropagation
    criterion = nn.MSELoss()
    # Adam optimizer
    optimizer = optim.Adam(net_2.parameters(), lr=lr)
    ##Osipov########################################

    # initialize the solution center
    center = self.policy.get_trainable_flat()
    # Extract the number of parameters
    nparams = self.policy.nparams
    # setting parameters
    batchSize = self.batchSize
    if batchSize == 0:
        # 4 + floor(3 * log(N))
        batchSize = int(4 + math.floor(3 * math.log(nparams)))
    popsize = batchSize * 2
    last = popsize - 1

    ceval = 0  # current evaluation
    cgen = 0  # current generation
    # Parameters for Adam policy
    m = zeros(nparams)
    v = zeros(nparams)
    epsilon = 1e-08  # To avoid numerical issues with division by zero...
    beta1 = 0.9
    beta2 = 0.999

    # RandomState used to extract the samples
    rs = np.random.RandomState(self.seed)

    print("Salimans: seed %d maxmsteps %d batchSize %d stepsize %lf noiseStdDev %lf wdecay %d sameEnvCond %d nparams %d" % (self.seed, maxsteps / 1000000, batchSize, self.stepsize, self.noiseStdDev, self.wdecay, self.sameenvcond, nparams))

    # Resume from the files saved at generation self.fromgeneration
    if self.fromgeneration > 0:
        cgen = self.fromgeneration
        # restore the centroid: it was previously loaded and discarded, so
        # the run resumed with restored Adam moments but a fresh center
        center = np.load("S%dG%d.npy" % (self.seed, cgen))
        m = np.load("S%dG%dm.npy" % (self.seed, cgen))
        v = np.load("S%dG%dv.npy" % (self.seed, cgen))
        self.stat = np.load("statS%d.npy" % (self.seed))
        if self.policy.normalize == 1:
            # load the normalization file (the stat file was loaded here
            # by mistake)
            self.policy.normvector = np.load("S%dG%dn.npy" % (self.seed, cgen))
            self.policy.nn.setNormalizationVectors()

    # main loop
    elapsed = 0
    ##Osipov#########################
    Max_observations = 10000
    train_set = []
    labels_list = []
    ##Osipov#########################
    while ceval < maxsteps:
        cgen += 1

        # Extract half samples from Gaussian distribution with mean 0.0
        # and standard deviation 1.0
        samples = rs.randn(batchSize, nparams)
        fitness = zeros(popsize)
        # If normalize=1 we update the normalization vectors
        if self.policy.normalize == 1:
            self.policy.nn.updateNormalizationVectors()
        # Reset environmental seed every generation
        self.env.seed(self.policy.get_seed + cgen)
        self.policy.nn.seed(self.policy.get_seed + cgen)

        # Evaluate offspring
        for b in range(batchSize):
            for bb in range(2):
                if bb == 0:
                    candidate = center + samples[b, :] * self.noiseStdDev
                else:
                    candidate = center - samples[b, :] * self.noiseStdDev
                # Set policy parameters
                self.policy.set_trainable_flat(candidate)
                # Samples of the same generation experience the same
                # environmental conditions
                if self.sameenvcond == 1:
                    self.env.seed(self.policy.get_seed + cgen)
                    self.policy.nn.seed(self.policy.get_seed + cgen)
                # Evaluate the offspring
                eval_rews, eval_length, observations, outputs_net_1 = self.policy.rollout(net_1, net_2, self.policy.ntrials, timestep_limit=1000)
                # Get the fitness
                fitness[b * 2 + bb] = eval_rews
                # Update the number of evaluations
                ceval += eval_length
                # Update data if the current offspring is better than
                # current best
                self.updateBest(fitness[b * 2 + bb], candidate)

        # Sort by fitness (ascending: the best is in the last position)
        fitness, index = ascendent_sort(fitness)
        # Now we must compute the symmetric weights in the range [-0.5,0.5]
        utilities = zeros(popsize)
        for i in range(popsize):
            utilities[index[i]] = i
        utilities /= last
        utilities -= 0.5
        # Now we assign the weights to the samples (pos - neg)
        weights = utilities[0::2] - utilities[1::2]

        # Evaluate the centroid
        if self.sameenvcond == 1:
            self.env.seed(self.policy.get_seed + cgen)
            self.policy.nn.seed(self.policy.get_seed + cgen)
        self.policy.set_trainable_flat(center)
        ##Osipov###################################
        eval_rews, eval_length, observations, outputs_net_1 = self.policy.rollout(net_1, net_2, self.policy.ntrials, timestep_limit=1000)
        # bounded buffer: once Max_observations is reached, drop as many
        # of the oldest entries as are about to be appended
        if len(train_set) >= Max_observations:
            train_set = train_set[len(observations):]
            labels_list = labels_list[len(outputs_net_1):]
        train_set += observations
        labels_list += outputs_net_1
        dataset = BehaviourDataset(train_set, labels_list)
        train_loader = DataLoader(dataset, batch_size=batch_size_train, shuffle=False, num_workers=4)
        for epoch in range(1, epochs + 1):
            print('Train Epoch: ', epoch)
            train(net_2, optimizer, train_loader, criterion)
        ##Osipov#####################################
        centroidfit = eval_rews
        ceval += eval_length
        # Update data if the centroid is better than current best
        self.updateBest(centroidfit, center)

        # Evaluate generalization
        if self.policy.nttrials > 0:
            if centroidfit > fitness[last]:
                # the centroid is tested for generalization
                candidate = np.copy(center)
            else:
                # the best sample is tested for generalization
                bestsamid = index[last]
                bestid = int(bestsamid / 2)
                if (bestsamid % 2) == 0:
                    candidate = center + samples[bestid] * self.noiseStdDev
                else:
                    candidate = center - samples[bestid] * self.noiseStdDev
            self.env.seed(self.policy.get_seed + 100000)
            self.policy.nn.seed(self.policy.get_seed + 100000)
            self.policy.set_trainable_flat(candidate)
            eval_rews, eval_length, observations, outputs_net_1 = self.policy.rollout(net_1, net_2, self.policy.nttrials, timestep_limit=1000)
            gfit = eval_rews
            ceval += eval_length
            # eventually store the new best generalization individual
            self.updateBestg(gfit, candidate)

        # Compute the gradient (weights * samples) over blocks of at most
        # 500 samples
        g = 0.0
        for first in range(0, batchSize, 500):
            stop = min(first + 500, batchSize)
            g += dot(weights[first:stop], samples[first:stop, :])
        # Normalization over the number of samples
        g /= popsize
        # Weight decay
        if self.wdecay == 1:
            globalg = -g + 0.005 * center
        else:
            globalg = -g

        # ADAM policy
        # Compute how much the center moves
        a = self.stepsize * sqrt(1.0 - beta2 ** cgen) / (1.0 - beta1 ** cgen)
        m = beta1 * m + (1.0 - beta1) * globalg
        v = beta2 * v + (1.0 - beta2) * (globalg * globalg)
        dCenter = -a * m / (sqrt(v) + epsilon)
        # update center
        center += dCenter

        # Compute the elapsed time (i.e., how much time the generation lasted)
        elapsed = (time.time() - start_time)

        # Update information
        self.updateInfo(cgen, ceval, fitness, center, centroidfit, fitness[last], elapsed, maxsteps)

        # Save centroid and associated vectors
        if self.saveeachg > 0 and cgen > 0:
            if (cgen % self.saveeachg) == 0:
                np.save("S%dG%d.npy" % (self.seed, cgen), center)
                np.save("S%dG%dm.npy" % (self.seed, cgen), m)
                np.save("S%dG%dv.npy" % (self.seed, cgen), v)
                if self.policy.normalize == 1:
                    np.save("S%dG%dn.npy" % (self.seed, cgen), self.policy.normvector)

    # save data
    self.save(cgen, ceval, centroidfit, center, fitness[last], (time.time() - start_time))

    # print simulation time
    end_time = time.time()
    print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
def run(self, maxsteps):
    """Run the evolutionary process until ``maxsteps`` evaluation steps are used.

    Each generation the symmetric offspring (``center +/- noise``) and the
    centroid are evaluated, the centroid or the best offspring is
    post-evaluated for generalization when ``self.policy.nttrials > 0``,
    and the center is moved with Adam along the rank-based gradient.
    When ``self.policy.strategy == 'symmetric'`` one of the two robot
    behaviors is switched on at random before each pair of offspring.

    When ``self.fromgeneration > 0`` the centroid, the Adam moments, the
    statistics and (if normalization is on) the normalization vector are
    reloaded from the files written by a previous run.

    Args:
        maxsteps: evaluation-step budget; the main loop stops once the
            number of consumed steps reaches it.
    """
    start_time = time.time()

    # initialize the solution center
    center = self.policy.get_trainable_flat()
    # Extract the number of parameters
    nparams = self.policy.nparams
    # setting parameters
    batchSize = self.batchSize
    if batchSize == 0:
        # 4 + floor(3 * log(N))
        batchSize = int(4 + math.floor(3 * math.log(nparams)))
    popsize = batchSize * 2
    last = popsize - 1

    ceval = 0  # current evaluation
    cgen = 0  # current generation
    # Parameters for Adam policy
    m = zeros(nparams)
    v = zeros(nparams)
    epsilon = 1e-08  # To avoid numerical issues with division by zero...
    beta1 = 0.9
    beta2 = 0.999

    # RandomState used to extract the samples
    rs = np.random.RandomState(self.seed)

    print("Salimans: seed %d maxmsteps %d batchSize %d stepsize %lf noiseStdDev %lf wdecay %d sameEnvCond %d nparams %d" % (self.seed, maxsteps / 1000000, batchSize, self.stepsize, self.noiseStdDev, self.wdecay, self.sameenvcond, nparams))

    # Resume from the files saved at generation self.fromgeneration
    if self.fromgeneration > 0:
        cgen = self.fromgeneration
        # restore the centroid: it was previously loaded and discarded, so
        # the run resumed with restored Adam moments but a fresh center
        center = np.load("S%dG%d.npy" % (self.seed, cgen))
        m = np.load("S%dG%dm.npy" % (self.seed, cgen))
        v = np.load("S%dG%dv.npy" % (self.seed, cgen))
        self.stat = np.load("statS%d.npy" % (self.seed))
        if self.policy.normalize == 1:
            # load the normalization file (the stat file was loaded here
            # by mistake)
            self.policy.normvector = np.load("S%dG%dn.npy" % (self.seed, cgen))
            self.policy.nn.setNormalizationVectors()

    # main loop
    elapsed = 0
    while ceval < maxsteps:
        cgen += 1

        # Extract half samples from Gaussian distribution with mean 0.0
        # and standard deviation 1.0
        samples = rs.randn(batchSize, nparams)
        fitness = zeros(popsize)
        # If normalize=1 we update the normalization vectors
        if self.policy.normalize == 1:
            self.policy.nn.updateNormalizationVectors()
        # Reset environmental seed every generation
        self.env.seed(self.policy.get_seed + cgen)
        self.policy.nn.seed(self.policy.get_seed + cgen)

        # Evaluate offspring
        for b in range(batchSize):
            if self.policy.strategy == 'symmetric':
                # the two offspring of a sample share the same behavior
                # NOTE(review): this draw uses the global numpy generator,
                # not rs — confirm it is seeded elsewhere if runs must be
                # reproducible.
                rand = np.random.uniform(0, 1)
                if rand < 0.5:
                    self.env.robot.behavior1 = 5.0
                    self.env.robot.behavior2 = 0.0
                else:
                    self.env.robot.behavior1 = 0.0
                    self.env.robot.behavior2 = 5.0
            for bb in range(2):
                if bb == 0:
                    candidate = center + samples[b, :] * self.noiseStdDev
                else:
                    candidate = center - samples[b, :] * self.noiseStdDev
                # Set policy parameters
                self.policy.set_trainable_flat(candidate)
                # Samples of the same generation experience the same
                # environmental conditions
                if self.sameenvcond == 1:
                    self.env.seed(self.policy.get_seed + cgen)
                    self.policy.nn.seed(self.policy.get_seed + cgen)
                # Evaluate the offspring
                eval_rews, eval_length = self.policy.rollout(self.policy.ntrials, timestep_limit=1000)
                # Get the fitness
                fitness[b * 2 + bb] = eval_rews
                # Update the number of evaluations
                ceval += eval_length
                # Update data if the current offspring is better than
                # current best
                self.updateBest(fitness[b * 2 + bb], candidate)

        # Sort by fitness (ascending: the best is in the last position)
        fitness, index = ascendent_sort(fitness)
        # Now we must compute the symmetric weights in the range [-0.5,0.5]
        utilities = zeros(popsize)
        for i in range(popsize):
            utilities[index[i]] = i
        utilities /= last
        utilities -= 0.5
        # Now we assign the weights to the samples (pos - neg)
        weights = utilities[0::2] - utilities[1::2]

        # Evaluate the centroid
        if self.sameenvcond == 1:
            self.env.seed(self.policy.get_seed + cgen)
            self.policy.nn.seed(self.policy.get_seed + cgen)
        self.policy.set_trainable_flat(center)
        eval_rews, eval_length = self.policy.rollout(self.policy.ntrials, timestep_limit=1000)
        centroidfit = eval_rews
        ceval += eval_length
        # Update data if the centroid is better than current best
        self.updateBest(centroidfit, center)

        # Evaluate generalization
        if self.policy.nttrials > 0:
            if centroidfit > fitness[last]:
                # the centroid is tested for generalization
                candidate = np.copy(center)
            else:
                # the best sample is tested for generalization
                bestsamid = index[last]
                bestid = int(bestsamid / 2)
                if (bestsamid % 2) == 0:
                    candidate = center + samples[bestid] * self.noiseStdDev
                else:
                    candidate = center - samples[bestid] * self.noiseStdDev
            self.env.seed(self.policy.get_seed + 100000)
            self.policy.nn.seed(self.policy.get_seed + 100000)
            self.policy.set_trainable_flat(candidate)
            eval_rews, eval_length = self.policy.rollout(self.policy.nttrials, timestep_limit=1000, post_eval=True)
            gfit = eval_rews
            ceval += eval_length
            # eventually store the new best generalization individual
            self.updateBestg(gfit, candidate)

        # Compute the gradient (weights * samples) over blocks of at most
        # 500 samples
        g = 0.0
        for first in range(0, batchSize, 500):
            stop = min(first + 500, batchSize)
            g += dot(weights[first:stop], samples[first:stop, :])
        # Normalization over the number of samples
        g /= popsize
        # Weight decay
        if self.wdecay == 1:
            globalg = -g + 0.005 * center
        else:
            globalg = -g

        # ADAM policy
        # Compute how much the center moves
        a = self.stepsize * sqrt(1.0 - beta2 ** cgen) / (1.0 - beta1 ** cgen)
        m = beta1 * m + (1.0 - beta1) * globalg
        v = beta2 * v + (1.0 - beta2) * (globalg * globalg)
        dCenter = -a * m / (sqrt(v) + epsilon)
        # update center
        center += dCenter

        # Compute the elapsed time (i.e., how much time the generation lasted)
        elapsed = (time.time() - start_time)

        # Update information
        self.updateInfo(cgen, ceval, fitness, center, centroidfit, fitness[last], elapsed, maxsteps)

        # Save centroid and associated vectors
        if self.saveeachg > 0 and cgen > 0:
            if (cgen % self.saveeachg) == 0:
                np.save("S%dG%d.npy" % (self.seed, cgen), center)
                np.save("S%dG%dm.npy" % (self.seed, cgen), m)
                np.save("S%dG%dv.npy" % (self.seed, cgen), v)
                if self.policy.normalize == 1:
                    np.save("S%dG%dn.npy" % (self.seed, cgen), self.policy.normvector)

    # save data
    self.save(cgen, ceval, centroidfit, center, fitness[last], (time.time() - start_time))

    # print simulation time
    end_time = time.time()
    print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
def run(self, maxsteps):
    """Run the sNES evolutionary process until ``maxsteps`` evaluation steps are used.

    Every generation ``self.batchSize`` offspring are drawn around
    ``self.center`` with per-parameter standard deviations, evaluated and
    ranked; the center and the standard deviations are then updated from
    the rank-weighted samples.  Optionally the centroid is evaluated
    (``self.evalCenter``) and the centroid or the best offspring is
    post-evaluated for generalization (``self.policy.generalize``).

    Side effects: overwrites ``self.center`` and ``self.stepsize``, sets
    ``self.batchSize`` when it is 0, and seeds the global numpy generator.

    Args:
        maxsteps: evaluation-step budget; the main loop stops once the
            number of consumed steps reaches it.
    """
    start_time = time.time()

    # the search starts from the current policy parameters
    self.center = self.policy.get_trainable_flat()
    nparams = self.policy.nparams

    # learning rates
    # NOTE(review): centerLearningRate is never used and the center step
    # below is not scaled by the standard deviations — confirm whether
    # that is intended.
    centerLearningRate = 1.0
    covLearningRate = 0.6 * (3 + log(nparams)) / 3.0 / sqrt(nparams)

    if self.batchSize == 0:
        # default population size: 4 + floor(3 * log(N)), N = number of parameters
        self.batchSize = int(4 + floor(3 * log(nparams)))
    if "Tf" in type(self.policy).__name__:
        # Tf policies must be told how many rollout calls to expect
        self.policy.updaten(self.batchSize)

    initVar = 1.0
    # number of parents/points for recombination
    mu = int(floor(self.batchSize / 2))
    self.stepsize = 1.0 / mu

    # recombination weights: zero for the worst offspring, linearly
    # increasing for the mu best ones, then normalized
    weights = zeros(self.batchSize)
    level = self.stepsize
    for slot in range(self.batchSize - mu, self.batchSize):
        weights[slot] = level
        level += self.stepsize
    weights /= sum(weights)

    # per-parameter standard deviations
    _sigmas = ones(nparams) * initVar

    ceval = 0  # evaluation steps used so far
    cgen = 0  # generations completed so far

    # the samples are drawn from the global numpy generator
    np.random.seed(self.seed)

    print(
        "sNES: seed %d maxmsteps %d batchSize %d stepsize %.2f sameEnvCond %d nparams %d"
        % (self.seed, maxsteps / 1000000, self.batchSize, self.stepsize,
           self.sameenvcond, nparams))

    # Set evolution mode
    self.policy.runEvo()

    elapsed = 0
    while ceval < maxsteps:
        cgen += 1
        generation_seed = self.policy.get_seed + cgen

        # standard-normal samples, one row per offspring
        samples = np.random.randn(self.batchSize, nparams)
        by_param = samples.transpose()
        # offspring = center + sigma * sample (broadcast over the rows)
        offspring = self.center.reshape(1, nparams) + _sigmas.reshape(1, nparams) * samples

        fitness = zeros(self.batchSize)
        # If normalize=1 we update the normalization vectors
        if self.policy.normalize == 1:
            self.policy.nn.updateNormalizationVectors()
        # Reset environmental seed every generation
        self.policy.setSeed(generation_seed)
        # Set generalization flag to False
        self.policy.doGeneralization(False)

        # Evaluate offspring
        for k in range(self.batchSize):
            self.policy.set_trainable_flat(offspring[k])
            # Samples of the same generation experience the same
            # environmental conditions
            if self.sameenvcond == 1:
                self.policy.setSeed(generation_seed)
            reward, length = self.policy.rollout(timestep_limit=1000)
            fitness[k] = reward
            ceval += length
            # Update data if the current offspring is better than current best
            self.updateBest(fitness[k], offspring[k])

        # Rank the offspring (ascending) and reorder the samples accordingly
        fitness, index = ascendent_sort(fitness)
        by_param = by_param[:, index]
        top = self.batchSize - 1

        # Update center
        self.center += dot(weights, by_param.transpose())

        # Update variances
        squared_minus_one = by_param * by_param - 1.0
        cov_gradient = dot(weights, squared_minus_one.transpose())
        _sigmas = _sigmas * exp(0.5 * covLearningRate * cov_gradient)

        centroidfit = -999999999.0
        if self.evalCenter != 0:
            # Evaluate the centroid
            self.policy.set_trainable_flat(self.center)
            if self.sameenvcond == 1:
                self.policy.setSeed(generation_seed)
            reward, length = self.policy.rollout(timestep_limit=1000)
            centroidfit = reward
            ceval += length
            # Update data if the centroid is better than current best
            self.updateBest(centroidfit, self.center)

        # Now perform generalization
        if self.policy.generalize:
            # the centroid is tested when it beats the best offspring,
            # otherwise the best offspring is tested
            if centroidfit > fitness[top]:
                candidate = np.copy(self.center)
            else:
                candidate = np.copy(offspring[index[top]])
            # Parameters must be updated by the algorithm!!
            self.policy.set_trainable_flat(candidate)
            self.policy.setSeed(self.policy.get_seed + 1000000)
            self.policy.doGeneralization(True)
            reward, length = self.policy.rollout(timestep_limit=1000)
            ceval += length
            # Update data if the candidate is better than current best
            # generalizing individual
            self.updateBestg(reward, candidate)

        # Compute the elapsed time (i.e., how much time the generation lasted)
        elapsed = (time.time() - start_time)

        # Update information
        self.updateInfo(cgen, ceval, fitness, self.center, centroidfit,
                        fitness[top], elapsed, maxsteps)

    # save data
    self.save(cgen, ceval, centroidfit, self.center,
              fitness[self.batchSize - 1], (time.time() - start_time))

    # print simulation time
    end_time = time.time()
    print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
def run(self, maxsteps):
    """Evolve the policy parameters until `maxsteps` evaluation steps are used.

    Each generation draws `batchSize` Gaussian perturbations, evaluates the
    mirrored pair (center + noise, center - noise) of every perturbation,
    turns the fitness ranking into utilities in [-0.5, 0.5], estimates a
    gradient from the utility difference of each pair and moves
    `self.center` with an Adam update (optionally with weight decay).
    The start-up banner labels the method "Salimans".

    Args:
        maxsteps: evaluation-step budget; the loop stops as soon as the
            accumulated episode lengths reach this value.

    Side effects:
        Overwrites `self.center` (and `self.batchSize` when it was 0),
        reseeds the global NumPy RNG with `self.seed`, and reports through
        `self.updateBest`, `self.updateBestg`, `self.updateInfo` and
        `self.save`.

    NOTE(review): the source this was restored from had lost its
    indentation. The generalization test is placed at loop level and the
    final `save`/timing print after the loop -- confirm against history.
    """
    start_time = time.time()

    # Start the search from the policy's current parameter vector.
    self.center = self.policy.get_trainable_flat()
    nparams = self.policy.nparams

    # batchSize == 0 selects the default size 4 + floor(3 * ln(nparams)).
    if self.batchSize == 0:
        self.batchSize = int(4 + math.floor(3 * math.log(nparams)))
    # Every perturbation is evaluated twice (mirrored), hence 2 * batchSize.
    popsize = self.batchSize * 2

    ceval = 0  # evaluation steps consumed so far
    cgen = 0  # current generation

    # Adam state and hyper-parameters.
    m = zeros(nparams)
    v = zeros(nparams)
    epsilon = 1e-08  # avoids division by zero in the Adam step
    beta1 = 0.9
    beta2 = 0.999
    # The gradient is accumulated over slices of at most this many samples;
    # the slice size is kept so the summation order (and therefore the
    # floating-point result) stays unchanged.
    chunk = 500

    # Seed the global NumPy generator used to draw the perturbations.
    np.random.seed(self.seed)
    print(
        "Salimans: seed %d maxmsteps %d batchSize %d stepsize %lf "
        "noiseStdDev %lf wdecay %d sameEnvCond %d nparams %d"
        % (self.seed, maxsteps / 1000000, self.batchSize, self.stepsize,
           self.noiseStdDev, self.wdecay, self.sameenvcond, nparams))

    # Set evolution mode
    self.policy.runEvo()

    while ceval < maxsteps:
        cgen += 1

        # Draw batchSize standard-normal perturbations and interleave each
        # one with its mirror image: row 2*i is +samples[i], row 2*i + 1 is
        # -samples[i].
        samples = np.random.randn(self.batchSize, nparams)
        symmSamples = zeros((popsize, nparams))
        symmSamples[0::2] = samples
        symmSamples[1::2] = -samples

        # Offspring = center + scaled perturbation (center row broadcast).
        offspring = (self.center.reshape(1, nparams)
                     + self.noiseStdDev * symmSamples)

        fitness = zeros(popsize)

        # If normalize=1 we update the normalization vectors
        if self.policy.normalize == 1:
            self.policy.nn.updateNormalizationVectors()
        # Reset the environmental seed every generation.
        self.policy.setSeed(self.policy.get_seed + cgen)
        self.policy.doGeneralization(False)

        for k in range(popsize):
            self.policy.set_trainable_flat(offspring[k])
            # With sameenvcond, every sample of a generation is evaluated
            # under the same seed.
            if self.sameenvcond == 1:
                self.policy.setSeed(self.policy.get_seed + cgen)
            eval_rews, eval_length = self.policy.rollout(timestep_limit=1000)
            fitness[k] = eval_rews
            ceval += eval_length
            # Update data if the current offspring is better than current best
            self.updateBest(fitness[k], offspring[k])

        # ascendent_sort is defined outside this block; the code below
        # treats the last entry of its result as the best sample and
        # index[i] as the original position of the i-th ranked sample.
        fitness, index = ascendent_sort(fitness)

        # Rank-based utilities rescaled to [-0.5, 0.5]: the sample ranked i
        # receives i / (popsize - 1) - 0.5.
        utilities = zeros(popsize)
        utilities[index] = np.arange(popsize)
        utilities /= (popsize - 1)
        utilities -= 0.5

        # One weight per perturbation: utility of the positive sample minus
        # utility of its mirrored counterpart.
        weights = utilities[0::2] - utilities[1::2]

        # Gradient estimate: weights . samples, accumulated chunk by chunk.
        g = 0.0
        for first in range(0, self.batchSize, chunk):
            g += dot(weights[first:first + chunk],
                     samples[first:first + chunk, :])
        # Normalization over the number of samples
        g /= popsize

        # Ascent direction is g, so the loss gradient is -g; weight decay
        # adds a pull of 0.005 * center.
        if self.wdecay == 1:
            globalg = -g + 0.005 * self.center
        else:
            globalg = -g

        # Adam step with bias-corrected step size.
        a = self.stepsize * sqrt(1.0 - beta2**cgen) / (1.0 - beta1**cgen)
        m = beta1 * m + (1.0 - beta1) * globalg
        v = beta2 * v + (1.0 - beta2) * (globalg * globalg)
        dCenter = -a * m / (sqrt(v) + epsilon)
        self.center += dCenter

        # Sentinel used when the centroid is not evaluated.
        centroidfit = -999999999.0
        if self.evalCenter != 0:
            # Evaluate the centroid
            self.policy.set_trainable_flat(self.center)
            if self.sameenvcond == 1:
                self.policy.setSeed(self.policy.get_seed + cgen)
            eval_rews, eval_length = self.policy.rollout(timestep_limit=1000)
            centroidfit = eval_rews
            ceval += eval_length
            # Update data if the centroid is better than current best
            self.updateBest(centroidfit, self.center)

        if self.policy.generalize:
            # The better of centroid and best offspring takes the
            # generalization test.
            if centroidfit > fitness[popsize - 1]:
                candidate = np.copy(self.center)
            else:
                bestsamid = index[popsize - 1]
                candidate = np.copy(offspring[bestsamid])
            # Parameters must be updated by the algorithm!!
            self.policy.set_trainable_flat(candidate)
            # Fixed seed offset for the generalization rollout.
            self.policy.setSeed(self.policy.get_seed + 1000000)
            self.policy.doGeneralization(True)
            eval_rews, eval_length = self.policy.rollout(timestep_limit=1000)
            gfit = eval_rews
            ceval += eval_length
            # Update data if the candidate is better than current best
            # generalizing individual
            self.updateBestg(gfit, candidate)

        # Time elapsed since the start of the run.
        elapsed = time.time() - start_time

        # Update information
        self.updateInfo(cgen, ceval, fitness, self.center, centroidfit,
                        fitness[popsize - 1], elapsed, maxsteps)

    # Save the final data. When maxsteps <= 0 no generation ran, so there
    # is no fitness/centroid to save; skipping avoids an UnboundLocalError.
    if cgen > 0:
        self.save(cgen, ceval, centroidfit, self.center,
                  fitness[popsize - 1], (time.time() - start_time))

    # print simulation time
    end_time = time.time()
    print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))