def run(self, maxsteps):

    start_time = time.time()

    # initialize the solution center
    self.center = self.policy.get_trainable_flat()

    # Extract the number of parameters
    nparams = self.policy.nparams
    # setting parameters
    centerLearningRate = 1.0
    covLearningRate = 0.5 * min(1.0 / nparams, 0.25)  # from MATLAB
    # covLearningRate = 0.6*(3+log(ngenes))/ngenes/sqrt(ngenes)
    if self.batchSize == 0:
        # Use default value: 4 + floor(3 * log(N)), where N is the number of parameters
        self.batchSize = int(4 + floor(3 * log(nparams)))  # population size, offspring number
    if "Tf" in type(self.policy).__name__:
        # Update the number of rollout calls in policy
        self.policy.updaten(self.batchSize)
    mu = int(floor(self.batchSize / 2))  # number of parents/points for recombination
    weights = log(mu + 1) - log(array(range(1, mu + 1)))  # use array
    weights /= sum(weights)  # normalize recombination weights array
    # initialize covariance and identity matrix
    _A = zeros((nparams, nparams))  # square root of covariance matrix
    _I = eye(nparams)  # Identity matrix

    ceval = 0  # current evaluation
    cgen = 0   # current generation

    # RandomState for perturbing the performed actions (used only for samples, not for centroid)
    np.random.seed(self.seed)

    print("xNES: seed %d maxmsteps %d batchSize %d sameEnvCond %d nparams %d" %
          (self.seed, maxsteps / 1000000, self.batchSize, self.sameenvcond, nparams))

    # Set evolution mode
    self.policy.runEvo()

    # main loop
    elapsed = 0
    while ceval < maxsteps:

        cgen += 1

        # Compute the exponential of the covariance matrix
        _expA = expm(_A)

        # Extract half samples from Gaussian distribution with mean 0.0 and standard deviation 1.0
        samples = np.random.randn(nparams, self.batchSize)

        # Generate offspring
        offspring = tile(self.center.reshape(nparams, 1), (1, self.batchSize)) + _expA.dot(samples)

        # Evaluate offspring
        fitness = zeros(self.batchSize)

        # If normalize=1 we update the normalization vectors
        if self.policy.normalize == 1:
            self.policy.nn.updateNormalizationVectors()

        # Reset environmental seed every generation
        self.policy.setSeed(self.policy.get_seed + cgen)

        # Set generalization flag to False
        self.policy.doGeneralization(False)

        # Evaluate offspring
        for k in range(self.batchSize):
            # Set policy parameters (corresponding to the current offspring)
            self.policy.set_trainable_flat(offspring[:, k])
            # Samples of the same generation experience the same environmental conditions
            if self.sameenvcond == 1:
                self.policy.setSeed(self.policy.get_seed + cgen)
            # Evaluate the offspring
            eval_rews, eval_length = self.policy.rollout(timestep_limit=1000)
            # Get the fitness
            fitness[k] = eval_rews
            # Update the number of evaluations
            ceval += eval_length
            # Update data if the current offspring is better than current best
            self.updateBest(fitness[k], offspring[:, k])

        # Sort by fitness and compute weighted mean into center
        fitness, index = descendent_sort(fitness)

        # Utilities
        utilities = zeros(self.batchSize)
        uT = zeros((self.batchSize, 1))
        for i in range(mu):
            utilities[index[i]] = weights[i]
            uT[index[i], 0] = weights[i]

        # Compute gradients
        U = zeros((nparams, self.batchSize))
        for i in range(nparams):
            for j in range(self.batchSize):
                U[i][j] = utilities[j]
        us = zeros((nparams, self.batchSize))
        for i in range(nparams):
            for j in range(self.batchSize):
                us[i][j] = U[i][j] * samples[i][j]
        G = us.dot(samples.transpose()) - sum(utilities) * _I
        dCenter = centerLearningRate * _expA.dot(samples.dot(uT))
        deltaCenter = zeros(nparams)
        for g in range(nparams):
            deltaCenter[g] = dCenter[g, 0]
        dA = covLearningRate * G

        # Update
        self.center += deltaCenter
        _A += dA

        centroidfit = -999999999.0
        if self.evalCenter != 0:
            # Evaluate the centroid
            self.policy.set_trainable_flat(self.center)
            if self.sameenvcond == 1:
                self.policy.setSeed(self.policy.get_seed + cgen)
            eval_rews, eval_length = self.policy.rollout(timestep_limit=1000)
            centroidfit = eval_rews
            ceval += eval_length
            # Update data if the centroid is better than current best
            self.updateBest(centroidfit, self.center)

        # Now perform generalization
        if self.policy.generalize:
            candidate = None
            if centroidfit > fitness[0]:
                # Centroid undergoes generalization test
                candidate = np.copy(self.center)
            else:
                # Best sample undergoes generalization test
                bestsamid = index[0]
                candidate = np.copy(offspring[:, bestsamid])
            # Set the seed
            self.policy.set_trainable_flat(candidate)  # Parameters must be updated by the algorithm!!
            self.policy.setSeed(self.policy.get_seed + 1000000)
            self.policy.doGeneralization(True)
            eval_rews, eval_length = self.policy.rollout(timestep_limit=1000)
            gfit = eval_rews
            ceval += eval_length
            # Update data if the candidate is better than current best generalizing individual
            self.updateBestg(gfit, candidate)

        # Compute the average value in the covariance matrix
        covSize = 0.0
        for g in range(nparams):
            for gg in range(nparams):
                covSize += abs(_A[g, gg])
        covSize /= nparams
        if covSize >= 100.0:
            # Reset variables when covariance matrix diverges
            print("Reset xNES: covsize %.2f" % covSize)
            _A = zeros((nparams, nparams))

        # Compute the elapsed time (i.e., how much time the generation lasted)
        elapsed = (time.time() - start_time)

        # Update information
        self.updateInfo(cgen, ceval, fitness, self.center, centroidfit, fitness[0], elapsed, maxsteps)

    # save data
    self.save(cgen, ceval, centroidfit, self.center, fitness[0], (time.time() - start_time))

    # print simulation time
    end_time = time.time()
    print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
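# Hedged sketch (not part of the original files): descendent_sort is called by all of
# the run methods in this section but is defined elsewhere. Judging from how its
# return values are consumed (a fitness vector sorted in descending order plus the
# original indices of the sorted individuals), a minimal NumPy version could look
# like the following; this is an assumption about the helper, not its actual code.
import numpy as np


def descendent_sort(fitness):
    # indices of the individuals sorted from best (highest fitness) to worst
    index = np.argsort(-np.asarray(fitness))
    # sorted fitness values (descending) together with the original indices
    return np.asarray(fitness)[index], index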
def run(self, maxsteps):

    start_time = time.time()  # start time
    nparams = self.policy.nparams  # number of parameters
    popsize = self.batchSize  # popsize
    ceval = 0  # current evaluation
    cgen = 0   # current generation
    rg = np.random.RandomState(self.seed)  # create a random generator and initialize the seed
    pop = rg.randn(popsize, nparams)  # population
    fitness = zeros(popsize)  # fitness
    fitness_beh = zeros((popsize, 3))
    self.stat = np.arange(0, dtype=np.float64)  # initialize vector containing performance across generations

    assert (popsize % 2) == 0, "the size of the population should be even"

    # initialize the population
    for i in range(popsize):
        pop[i] = self.policy.get_trainable_flat()

    print("SSS: seed %d maxmsteps %d popSize %d noiseStdDev %lf crossoverrate %lf nparams %d" %
          (self.seed, maxsteps / 1000000, popsize, self.noiseStdDev, self.crossoverrate, nparams))

    # main loop
    elapsed = 0
    while (ceval < maxsteps):

        cgen += 1

        # If normalize=1 we update the normalization vectors
        if (self.policy.normalize == 1):
            self.policy.nn.updateNormalizationVectors()

        self.env.seed(self.policy.get_seed + cgen)        # set the environment seed, it changes every generation
        self.policy.nn.seed(self.policy.get_seed + cgen)  # set the policy seed, it changes every generation

        # Evaluate the population
        for i in range(popsize):
            self.policy.set_trainable_flat(pop[i])  # set policy parameters
            eval_rews, eval_length, rews1, rews2 = self.policy.rollout(self.policy.ntrials, timestep_limit=1000)  # evaluate the individual
            fitness[i] = eval_rews  # store fitness
            fitness_beh[i] = np.array([i, rews1, rews2])
            ceval += eval_length  # Update the number of evaluations
            self.updateBest(fitness[i], pop[i])  # Update data if the current offspring is better than current best

        fitness, index = descendent_sort(fitness)  # create an index with the ID of the individuals sorted by fitness
        bfit = fitness[index[0]]
        self.updateBest(bfit, pop[index[0]])  # eventually update the genotype/fitness of the best individual so far

        # PARETO-FRONT
        pareto_front_idx = []
        front_len = []
        halfpopsize = int(popsize / 2)
        # dominated = fitness_beh.copy()
        count = 0
        # while len(dominated) > 0:
        #     current_level = []
        current_idx = []
        for i in range(len(fitness_beh)):
            res = ~(fitness_beh[i] > fitness_beh)
            res = np.delete(res, i, axis=0)
            if not (np.any(np.all(res, axis=1))):
                # current_level.append(dominated[i])
                pareto_front_idx.append(int(fitness_beh[i, 0]))
                current_idx.append(i)
                count += 1
            # if len(pareto_front_idx) == halfpopsize:
            #     break
        print("Number of genotypes in the pareto-front: %.2f" % (count))
        # pareto_front_idx.append(current_level)
        # front_len.append(len(current_idx))
        dominated = np.array(np.delete(fitness_beh[:, 0], current_idx, axis=0), dtype=np.int64)

        childrensize = popsize - count
        parent = np.random.choice(pareto_front_idx, size=childrensize, replace=True)
        cross_prob = np.random.uniform(low=0.0, high=1.0, size=childrensize)
        for i in range(childrensize):
            # crossover of the first parent and a randomly selected second parent among the first pareto-front
            if cross_prob[i] < self.crossoverrate:
                parent_1 = pop[parent[i]]
                idx_p2 = np.random.choice(pareto_front_idx, size=2, replace=False)
                if idx_p2[0] != parent[i]:
                    parent_2 = pop[idx_p2[0]]
                else:
                    parent_2 = pop[idx_p2[1]]
                cutting_points = np.random.choice(np.arange(0, nparams, 1), size=2, replace=False)
                min_point = cutting_points.min()
                max_point = cutting_points.max()
                # Sections A and C of the first parent with section B of the second parent
                if np.random.uniform(low=0.0, high=1.0) < 0.5:
                    pop[dominated[i], :min_point] = parent_1[:min_point]
                    pop[dominated[i], min_point:max_point] = parent_2[min_point:max_point]
                    pop[dominated[i], max_point:] = parent_1[max_point:]
                # Sections A and C of the second parent with section B of the first parent
                else:
                    pop[dominated[i], :min_point] = parent_2[:min_point]
                    pop[dominated[i], min_point:max_point] = parent_1[min_point:max_point]
                    pop[dominated[i], max_point:] = parent_2[max_point:]
                pop[dominated[i]] += (rg.randn(nparams) * self.noiseStdDev)
            else:
                pop[dominated[i]] = pop[parent[i]] + (rg.randn(1, nparams) * self.noiseStdDev)

        # Postevaluate the best individual
        self.env.seed(self.policy.get_seed + 100000)        # set the environmental seed, always the same for the same seed
        self.policy.nn.seed(self.policy.get_seed + 100000)  # set the policy seed, always the same for the same seed
        self.policy.set_trainable_flat(pop[index[0]])       # set the parameters of the policy
        eval_rews, eval_length, _, _ = self.policy.rollout(self.policy.ntrials, timestep_limit=1000, post_eval=True)
        bgfit = eval_rews
        ceval += eval_length
        self.updateBestg(bgfit, pop[index[0]])  # eventually update the genotype/fitness of the best post-evaluated individual

        # display info
        print('Seed %d (%.1f%%) gen %d msteps %d bestfit %.2f bestgfit %.2f cbestfit %.2f cbestgfit %.2f avgfit %.2f weightsize %.2f' %
              (self.seed, ceval / float(maxsteps) * 100, cgen, ceval / 1000000, self.bestfit, self.bestgfit, bfit, bgfit,
               np.average(fitness), np.average(np.absolute(pop[index[0]]))))

        # store data throughout generations
        self.stat = np.append(self.stat, [ceval, self.bestfit, self.bestgfit, bfit, bgfit, np.average(fitness)])

        # save data
        if ((time.time() - self.last_save_time) > (self.policy.saveeach * 60)):
            self.save(ceval, cgen, maxsteps, bfit, bgfit, np.average(fitness), np.average(np.absolute(pop[index[0]])))
            self.last_save_time = time.time()

    self.save(ceval, cgen, maxsteps, bfit, bgfit, np.average(fitness), np.average(np.absolute(pop[index[0]])))

    end_time = time.time()
    print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
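# Hedged restatement (hypothetical helper, not part of the original file): the inline
# Pareto-front test above keeps individual i when no other row of the score array is
# greater than or equal to it on every column. The standalone function below expresses
# the same weak-dominance check over a generic per-individual score array.
import numpy as np


def pareto_front_indices(scores):
    """Return row indices that are not weakly dominated by any other row."""
    front = []
    for i in range(scores.shape[0]):
        others = np.delete(scores, i, axis=0)
        # row i is excluded if some other row is at least as good on every column
        if not np.any(np.all(others >= scores[i], axis=1)):
            front.append(i)
    return front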
def run(self, maxsteps):

    start_time = time.time()

    # initialize the solution center
    self.center = self.policy.get_trainable_flat()

    # Extract the number of parameters
    nparams = self.policy.nparams
    # setting parameters
    if self.batchSize == 0:
        # Use default value: 4 + floor(3 * log(N)), where N is the number of parameters
        self.batchSize = int(4 + floor(3 * log(nparams)))  # population size, offspring number
    if "Tf" in type(self.policy).__name__:
        # Update the number of rollout calls in policy (the initial value has been set based on configuration file)
        self.policy.updaten(self.batchSize)
    mu = int(floor(self.batchSize / 2))  # number of parents/points for recombination
    weights = log(mu + 1) - log(array(range(1, mu + 1)))  # use array
    weights /= sum(weights)  # normalize recombination weights array
    muEff = sum(weights)**2 / sum(power(weights, 2))  # variance-effective size of mu
    cumCov = 4 / float(nparams + 4)  # time constant for cumulation for covariance matrix
    cumStep = (muEff + 2) / (nparams + muEff + 3)  # time constant for cumulation for stepsize control
    muCov = muEff  # size of mu used for calculating learning rate covLearningRate
    covLearningRate = ((1 / muCov) * 2 / (nparams + 1.4)**2 + (1 - 1 / muCov) *       # learning rate for
                       ((2 * muEff - 1) / ((nparams + 2)**2 + 2 * muEff)))            # covariance matrix
    dampings = 1 + 2 * max(0, sqrt((muEff - 1) / (nparams + 1)) - 1) + cumStep
    # damping for stepsize, usually close to 1 (former damp == dampings/cumStep)

    # Initialize dynamic (internal) strategy parameters and constants
    covPath = zeros(nparams)
    stepPath = zeros(nparams)  # evolution paths for C and stepsize
    B = eye(nparams, nparams)  # B defines the coordinate system
    D = eye(nparams, nparams)  # diagonal matrix D defines the scaling
    C = dot(dot(B, D), dot(B, D).T)  # covariance matrix
    chiN = nparams**0.5 * (1 - 1. / (4. * nparams) + 1 / (21. * nparams**2))
    # expectation of ||N(0,I)|| == norm(randn(nparams, 1))

    self.stepsize = 0.5

    ceval = 0  # current evaluation
    cgen = 0   # current generation

    # RandomState for perturbing the performed actions (used only for samples, not for centroid)
    np.random.seed(self.seed)

    print("CMA-ES: seed %d maxmsteps %d batchSize %d stepsize %.2f sameEnvCond %d nparams %d" %
          (self.seed, maxsteps / 1000000, self.batchSize, self.stepsize, self.sameenvcond, nparams))

    # Set evolution mode
    self.policy.runEvo()

    """
    updateCovMatRate = 1
    if not self.updateCovEveryGen:
        updateCovMatRate = 0.1 / covLearningRate / nparams
        decPart = math.modf(updateCovMatRate)[0]
        if decPart >= 0.5:
            updateCovMatRate = ceil(updateCovMatRate)
        else:
            updateCovMatRate = floor(updateCovMatRate)
    updateCovMatRate = int(updateCovMatRate)
    """

    # main loop
    elapsed = 0
    while ceval < maxsteps:

        cgen += 1

        # Extract half samples from Gaussian distribution with mean 0.0 and standard deviation 1.0
        samples = np.random.randn(nparams, self.batchSize)

        # Generate offspring
        offspring = tile(self.center.reshape(nparams, 1), (1, self.batchSize)) + self.stepsize * dot(dot(B, D), samples)

        # Evaluate offspring
        fitness = zeros(self.batchSize)

        # If normalize=1 we update the normalization vectors
        if self.policy.normalize == 1:
            self.policy.nn.updateNormalizationVectors()

        # Reset environmental seed every generation
        self.policy.setSeed(self.policy.get_seed + cgen)

        # Set generalization flag to False
        self.policy.doGeneralization(False)

        # Evaluate offspring
        for k in range(self.batchSize):
            # Set policy parameters (corresponding to the current offspring)
            self.policy.set_trainable_flat(offspring[:, k])
            # Samples of the same generation experience the same environmental conditions
            if self.sameenvcond == 1:
                self.policy.setSeed(self.policy.get_seed + cgen)
            # Evaluate the offspring
            eval_rews, eval_length = self.policy.rollout(timestep_limit=1000)
            # Get the fitness
            fitness[k] = eval_rews
            # Update the number of evaluations
            ceval += eval_length
            # Update data if the current offspring is better than current best
            self.updateBest(fitness[k], offspring[:, k])

        # Sort by fitness and compute weighted mean into center
        fitness, index = descendent_sort(fitness)
        # Re-organize samples according to indices
        samples = samples[:, index]
        # Do the same for offspring
        offspring = offspring[:, index]
        # Select best <mu> samples and offspring for computing new center and cumulation paths
        samsel = samples[:, range(mu)]
        offsel = offspring[:, range(mu)]
        offmut = offsel - tile(self.center.reshape(nparams, 1), (1, mu))

        samplemean = dot(samsel, weights)
        self.center = dot(offsel, weights)

        # Cumulation: Update evolution paths
        stepPath = (1 - cumStep) * stepPath \
            + sqrt(cumStep * (2 - cumStep) * muEff) * dot(B, samplemean)              # Eq. (4)
        hsig = norm(stepPath) / sqrt(1 - (1 - cumStep) ** (2 * ceval / float(self.batchSize))) / chiN \
            < 1.4 + 2. / (nparams + 1)
        covPath = (1 - cumCov) * covPath + hsig * \
            sqrt(cumCov * (2 - cumCov) * muEff) * dot(dot(B, D), samplemean)          # Eq. (2)

        # Adapt covariance matrix C
        C = ((1 - covLearningRate) * C                                # regard old matrix   Eq. (3)
             + covLearningRate * (1 / muCov) * (
                 outer(covPath, covPath)                              # plus rank one update
                 + (1 - hsig) * cumCov * (2 - cumCov) * C)
             + covLearningRate * (1 - 1 / muCov)                      # plus rank mu update
             * dot(dot(offmut, diag(weights)), offmut.T))

        # Adapt step size
        self.stepsize *= exp((cumStep / dampings) * (norm(stepPath) / chiN - 1))      # Eq. (5)

        # Update B and D from C
        # This is O(n^3). When strategy internal CPU-time is critical, the
        # next three lines should be executed only every (alpha/covLearningRate/N)-th
        # iteration, where alpha is e.g. between 0.1 and 10
        C = (C + C.T) / 2  # enforce symmetry
        Ev, B = eig(C)     # eigen decomposition, B == normalized eigenvectors
        Ev = real(Ev)      # enforce real value
        D = diag(sqrt(Ev))  # diag(ravel(sqrt(Ev)))  # D contains standard deviations now
        B = real(B)

        centroidfit = -999999999.0
        if self.evalCenter != 0:
            # Evaluate the centroid
            self.policy.set_trainable_flat(self.center)
            if self.sameenvcond == 1:
                self.policy.setSeed(self.policy.get_seed + cgen)
            eval_rews, eval_length = self.policy.rollout(timestep_limit=1000)
            centroidfit = eval_rews
            ceval += eval_length
            # Update data if the centroid is better than current best
            self.updateBest(centroidfit, self.center)

        # Now perform generalization
        if self.policy.generalize:
            candidate = None
            if centroidfit > fitness[0]:
                # Centroid undergoes generalization test
                candidate = np.copy(self.center)
            else:
                # Best sample undergoes generalization test
                bestsamid = index[0]
                candidate = np.copy(offspring[:, bestsamid])
            # Set the seed
            self.policy.set_trainable_flat(candidate)  # Parameters must be updated by the algorithm!!
            self.policy.setSeed(self.policy.get_seed + 1000000)
            self.policy.doGeneralization(True)
            eval_rews, eval_length = self.policy.rollout(timestep_limit=1000)
            gfit = eval_rews
            ceval += eval_length
            # Update data if the candidate is better than current best generalizing individual
            self.updateBestg(gfit, candidate)

        # Compute the average value in the covariance matrix
        covSize = 0.0
        for g in range(nparams):
            for gg in range(nparams):
                covSize += abs(C[g, gg])
        covSize /= nparams
        if self.stepsize >= 10.0 or covSize >= 100.0 or (self.stepsize >= 5.0 and covSize >= 20.0):
            # Reset variables when either stepsize or covariance matrix diverges
            print("Reset CMAES: stepsize %.2f covsize %.2f" % (self.stepsize, covSize))
            covPath = zeros(nparams)
            stepPath = zeros(nparams)
            B = eye(nparams, nparams)
            D = eye(nparams, nparams)
            C = dot(dot(B, D), dot(B, D).T)
            self.stepsize = 0.5

        # Compute the elapsed time (i.e., how much time the generation lasted)
        elapsed = (time.time() - start_time)

        # Update information
        self.updateInfo(cgen, ceval, fitness, self.center, centroidfit, fitness[0], elapsed, maxsteps)

    # save data
    self.save(cgen, ceval, centroidfit, self.center, fitness[0], (time.time() - start_time))

    # print simulation time
    end_time = time.time()
    print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
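# Hedged sanity-check sketch (standalone, not part of the class above): offspring are
# drawn as center + stepsize * B.D.z with z ~ N(0, I), so their spread around the
# center has covariance stepsize**2 * C, where C = (B.D)(B.D)^T. The snippet below
# illustrates that relation numerically with made-up values for B, D and stepsize.
import numpy as np

rng = np.random.default_rng(0)
nparams, batch = 4, 100000
B = np.eye(nparams)                         # coordinate system (here: identity)
D = np.diag([1.0, 0.5, 2.0, 1.5])           # scaling along each axis
C = (B @ D) @ (B @ D).T                     # covariance matrix implied by B and D
stepsize = 0.5

z = rng.standard_normal((nparams, batch))
perturbations = stepsize * (B @ D @ z)      # same transform used to build offspring
empirical_cov = np.cov(perturbations)       # approaches stepsize**2 * C for a large batch
print(np.allclose(empirical_cov, stepsize**2 * C, atol=0.05))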
def run(self):

    self.loadhyperparameters()  # initialize hyperparameters
    start_time = time.time()  # start time
    nparams = self.policy.nparams  # number of parameters
    ceval = 0  # current evaluation
    cgen = 0   # current generation
    rg = np.random.RandomState(self.seed)  # create a random generator and initialize the seed
    pop = rg.randn(self.popsize, nparams)  # population
    fitness = zeros(self.popsize)  # fitness
    self.stat = np.arange(0, dtype=np.float64)  # initialize vector containing performance across generations

    assert (self.popsize % 2) == 0, "the size of the population should be even"

    # initialize the population
    for i in range(self.popsize):
        pop[i] = self.policy.get_trainable_flat()

    print("SSS: seed %d maxmsteps %d popSize %d noiseStdDev %lf nparams %d" %
          (self.seed, self.maxsteps / 1000000, self.popsize, self.mutation, nparams))

    # main loop
    elapsed = 0
    while (ceval < self.maxsteps):

        cgen += 1

        # If normalize=1 we update the normalization vectors
        if (self.policy.normalize == 1):
            self.policy.nn.updateNormalizationVectors()

        self.env.seed(self.policy.get_seed + cgen)        # set the environment seed, it changes every generation
        self.policy.nn.seed(self.policy.get_seed + cgen)  # set the policy seed, it changes every generation

        # Evaluate the population
        for i in range(self.popsize):
            self.policy.set_trainable_flat(pop[i])  # set policy parameters
            eval_rews, eval_length = self.policy.rollout(self.policy.ntrials)  # evaluate the individual
            fitness[i] = eval_rews  # store fitness
            ceval += eval_length  # Update the number of evaluations
            self.updateBest(fitness[i], pop[i])  # Update data if the current offspring is better than current best

        fitness, index = descendent_sort(fitness)  # create an index with the ID of the individuals sorted by fitness
        bfit = fitness[index[0]]
        self.updateBest(bfit, pop[index[0]])  # eventually update the genotype/fitness of the best individual so far

        # Postevaluate the best individual
        self.env.seed(self.policy.get_seed + 100000)        # set the environmental seed, always the same for the same seed
        self.policy.nn.seed(self.policy.get_seed + 100000)  # set the policy seed, always the same for the same seed
        self.policy.set_trainable_flat(pop[index[0]])       # set the parameters of the policy
        eval_rews, eval_length = self.policy.rollout(self.policy.ntrials)
        bgfit = eval_rews
        ceval += eval_length
        self.updateBestg(bgfit, pop[index[0]])  # eventually update the genotype/fitness of the best post-evaluated individual

        # replace the worst half of the population with a mutated copy of the first half of the population
        halfpopsize = int(self.popsize / 2)
        for i in range(halfpopsize):
            pop[index[i + halfpopsize]] = pop[index[i]] + (rg.randn(1, nparams) * self.mutation)

        # display info
        print('Seed %d (%.1f%%) gen %d msteps %d bestfit %.2f bestgfit %.2f cbestfit %.2f cbestgfit %.2f avgfit %.2f weightsize %.2f' %
              (self.seed, ceval / float(self.maxsteps) * 100, cgen, ceval / 1000000, self.bestfit, self.bestgfit, bfit, bgfit,
               np.average(fitness), np.average(np.absolute(pop[index[0]]))))

        # store data throughout generations
        self.stat = np.append(self.stat, [ceval, self.bestfit, self.bestgfit, bfit, bgfit, np.average(fitness)])

        # save data
        if ((time.time() - self.last_save_time) > (self.saveeach * 60)):
            self.save(ceval, cgen, bfit, bgfit, np.average(fitness), np.average(np.absolute(pop[index[0]])))
            self.last_save_time = time.time()

    self.save(ceval, cgen, bfit, bgfit, np.average(fitness), np.average(np.absolute(pop[index[0]])))

    end_time = time.time()
    print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
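# Hedged usage note: in the two SSS variants above, self.stat grows as a flat float64
# vector with six values appended per generation, in the order ceval, bestfit,
# bestgfit, bfit, bgfit, avgfit. A small hypothetical helper (not part of the original
# files) can reshape it into one row per generation for inspection or plotting.
import numpy as np


def stat_as_table(stat):
    """Reshape the flat per-generation statistics vector into a (generations, 6) table."""
    # columns: ceval, bestfit, bestgfit, bfit, bgfit, avgfit
    return np.asarray(stat).reshape(-1, 6)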
def run(self, maxsteps):

    start_time = time.time()

    # initialize the solution center (here the centroid is used to generate
    # random individuals)
    self.center = self.policy.get_trainable_flat()

    # Extract the number of parameters
    nparams = self.policy.nparams
    # setting parameters
    ceval = 0  # current evaluation
    cgen = 0   # current generation

    # RandomState for perturbing the performed actions (used only for samples, not for centroid)
    np.random.seed(self.seed)

    print("SSS: seed %d maxmsteps %d batchSize %d sameEnvCond %d nparams %d" %
          (self.seed, maxsteps / 1000000, self.batchSize, self.sameenvcond, nparams))

    # Set evolution mode
    self.policy.runEvo()

    # Population
    self.pop = tile(self.center.reshape(1, nparams), (self.batchSize, 1))
    # Apply random variations to solution center
    for i in range(self.batchSize):
        for j in range(nparams):
            self.pop[i, j] += np.random.random() * 0.2 - 0.1
    # Allocate offspring
    offspring = np.zeros((self.batchSize, nparams))

    # Here centroid is useless
    centroidfit = -999999999.0

    # main loop
    elapsed = 0
    while ceval < maxsteps:

        cgen += 1

        fitness = zeros(self.batchSize * 2)

        # If normalize=1 we update the normalization vectors
        if self.policy.normalize == 1:
            self.policy.nn.updateNormalizationVectors()

        # Reset environmental seed every generation
        self.policy.setSeed(self.policy.get_seed + cgen)

        # Set generalization flag to False
        self.policy.doGeneralization(False)

        # Evaluate parents and offspring
        for k in range(self.batchSize):
            # Set policy parameters (corresponding to the current parent)
            self.policy.set_trainable_flat(self.pop[k])
            # Samples of the same generation experience the same environmental conditions
            if self.sameenvcond == 1:
                self.policy.setSeed(self.policy.get_seed + cgen)
            # Evaluate the parents
            eval_rews, eval_length = self.policy.rollout(timestep_limit=1000)
            # Get the fitness
            fitness[k] = eval_rews
            # Update the number of evaluations
            ceval += eval_length
            # Update data if the current parent is better than current best
            self.updateBest(fitness[k], self.pop[k])

            # Generate the offspring
            for j in range(nparams):
                offspring[k, j] = self.pop[k, j]
                if np.random.uniform(low=0.0, high=1.0) < 0.03:
                    # Extract a random number to perform either weight
                    # replacement or weight perturbation
                    if np.random.uniform(low=0.0, high=1.0) < 0.5:
                        # Weight replacement
                        offspring[k, j] = np.random.random() * (self.policy.wrange * 2.0) - self.policy.wrange
                    else:
                        # Weight perturbation
                        offspring[k, j] += np.random.random() * 0.2 - 0.1
            self.policy.set_trainable_flat(offspring[k])
            # Samples of the same generation experience the same environmental conditions
            if self.sameenvcond == 1:
                self.policy.setSeed(self.policy.get_seed + cgen)
            # Evaluate the offspring
            eval_rews, eval_length = self.policy.rollout(timestep_limit=1000)
            # Get the fitness
            fitness[self.batchSize + k] = eval_rews
            # Update the number of evaluations
            ceval += eval_length
            # Update data if the current offspring is better than current best
            self.updateBest(fitness[self.batchSize + k], offspring[k])

        # Selection
        parentfit = np.copy(fitness[0:self.batchSize])
        # Add noise to parent fitness
        """
        for i in range(self.batchSize):
            noise = np.random.random() * self.noiseStdDev * 2.0 - self.noiseStdDev
            parentfit[i] += (noise * parentfit[i])
        """
        offspringfit = np.copy(fitness[self.batchSize:(self.batchSize * 2)])
        # Add noise to offspring fitness
        """
        for i in range(self.batchSize):
            noise = np.random.random() * self.noiseStdDev * 2.0 - self.noiseStdDev
            offspringfit[i] += (noise * offspringfit[i])
        """
        # Sort parents and offspring based on fitness (descending mode)
        parentfit, parentidx = descendent_sort(parentfit)
        offspringfit, offspringidx = descendent_sort(offspringfit)

        # Population index
        k = 0
        # Parent index
        p = 0
        # Offspring index
        o = 0
        while k < self.batchSize:
            if parentfit[p] > offspringfit[o]:
                p += 1
            else:
                # Offspring replaces worst parent
                wp = parentidx[self.batchSize - 1 - o]
                bo = offspringidx[o]
                self.pop[wp] = np.copy(offspring[bo])
                fitness[wp] = fitness[self.batchSize + bo]
                o += 1
            k += 1

        # Get the best individual (of the current generation)
        bfit = np.copy(fitness[0:self.batchSize])
        bfit, bidx = descendent_sort(bfit)
        bidx = bidx[0]

        # Now perform generalization
        if self.policy.generalize:
            candidate = np.copy(self.pop[bidx])
            # Set the seed
            self.policy.set_trainable_flat(candidate)  # Parameters must be updated by the algorithm!!
            self.policy.setSeed(self.policy.get_seed + 1000000)
            self.policy.doGeneralization(True)
            eval_rews, eval_length = self.policy.rollout(timestep_limit=1000)
            gfit = eval_rews
            ceval += eval_length
            # Update data if the candidate is better than current best generalizing individual
            self.updateBestg(gfit, candidate)

        # Compute the elapsed time (i.e., how much time the generation lasted)
        elapsed = (time.time() - start_time)

        # Update information
        self.updateInfo(cgen, ceval, fitness[0:self.batchSize], self.center, centroidfit, fitness[bidx], elapsed, maxsteps)

    # save data
    self.save(cgen, ceval, centroidfit, self.center, fitness[bidx], (time.time() - start_time))

    # print simulation time
    end_time = time.time()
    print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
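# Hedged standalone restatement of the mutation operator used when generating offspring
# above: each weight mutates with probability 0.03, and a mutation either replaces the
# weight with a value drawn uniformly in [-wrange, wrange) or perturbs it with uniform
# noise in [-0.1, 0.1). The helper name and its use of numpy's global random state are
# illustrative only, not part of the original file.
import numpy as np


def mutate(genotype, wrange, rate=0.03):
    child = np.copy(genotype)
    for j in range(child.size):
        if np.random.uniform(low=0.0, high=1.0) < rate:
            if np.random.uniform(low=0.0, high=1.0) < 0.5:
                # weight replacement: new value uniform in [-wrange, wrange)
                child[j] = np.random.random() * (wrange * 2.0) - wrange
            else:
                # weight perturbation: add uniform noise in [-0.1, 0.1)
                child[j] += np.random.random() * 0.2 - 0.1
    return child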