Example #1
    def run(self, maxsteps):

        start_time = time.time()

        # initialize the solution center
        self.center = self.policy.get_trainable_flat()

        # Extract the number of parameters
        nparams = self.policy.nparams
        # setting parameters
        centerLearningRate = 1.0
        covLearningRate = 0.5 * min(
            1.0 / nparams, 0.25
        )  # from MATLAB # covLearningRate = 0.6*(3+log(ngenes))/ngenes/sqrt(ngenes)
        if self.batchSize == 0:
            # Use default value: 4 + floor(3 * log(N)), where N is the number of parameters
            self.batchSize = int(
                4 +
                floor(3 * log(nparams)))  # population size, offspring number
            if "Tf" in type(self.policy).__name__:
                # Update the number of rollout calls in policy
                self.policy.updaten(self.batchSize)
        mu = int(floor(self.batchSize /
                       2))  # number of parents/points for recombination
        weights = log(mu + 1) - log(array(range(1, mu + 1)))  # use array
        weights /= sum(weights)  # normalize recombination weights array
        # initialize covariance and identity matrix
        _A = zeros((nparams, nparams))  # square root of covariance matrix
        _I = eye(nparams)  # Identity matrix

        ceval = 0  # current evaluation
        cgen = 0  # current generation

        # RandomState for perturbing the performed actions (used only for samples, not for centroid)
        np.random.seed(self.seed)

        print(
            "xNES: seed %d maxmsteps %d batchSize %d sameEnvCond %d nparams %d"
            % (self.seed, maxsteps / 1000000, self.batchSize, self.sameenvcond,
               nparams))

        # Set evolution mode
        self.policy.runEvo()

        # main loop
        elapsed = 0
        while ceval < maxsteps:
            cgen += 1

            # Compute the exponential of the covariance matrix
            _expA = expm(_A)
            # Draw samples from a Gaussian distribution with mean 0.0 and standard deviation 1.0
            samples = np.random.randn(nparams, self.batchSize)
            # Generate offspring
            offspring = tile(self.center.reshape(nparams, 1),
                             (1, self.batchSize)) + _expA.dot(samples)
            # Evaluate offspring
            fitness = zeros(self.batchSize)
            # If normalize=1 we update the normalization vectors
            if self.policy.normalize == 1:
                self.policy.nn.updateNormalizationVectors()
            # Reset environmental seed every generation
            self.policy.setSeed(self.policy.get_seed + cgen)
            # Set generalization flag to False
            self.policy.doGeneralization(False)
            # Evaluate offspring
            for k in range(self.batchSize):
                # Set policy parameters (corresponding to the current offspring)
                self.policy.set_trainable_flat(offspring[:, k])
                # Samples of the same generation experience the same environmental conditions
                if self.sameenvcond == 1:
                    self.policy.setSeed(self.policy.get_seed + cgen)
                # Evaluate the offspring
                eval_rews, eval_length = self.policy.rollout(
                    timestep_limit=1000)
                # Get the fitness
                fitness[k] = eval_rews
                # Update the number of evaluations
                ceval += eval_length
                # Update data if the current offspring is better than current best
                self.updateBest(fitness[k], offspring[:, k])

            # Sort by fitness and compute weighted mean into center
            fitness, index = descendent_sort(fitness)
            # Utilities
            utilities = zeros(self.batchSize)
            uT = zeros((self.batchSize, 1))
            for i in range(mu):
                utilities[index[i]] = weights[i]
                uT[index[i], 0] = weights[i]

            # Compute gradients
            U = zeros((nparams, self.batchSize))
            for i in range(nparams):
                for j in range(self.batchSize):
                    U[i][j] = utilities[j]

            us = zeros((nparams, self.batchSize))
            for i in range(nparams):
                for j in range(self.batchSize):
                    us[i][j] = U[i][j] * samples[i][j]
            G = us.dot(samples.transpose()) - sum(utilities) * _I
            dCenter = centerLearningRate * _expA.dot(samples.dot(uT))
            deltaCenter = zeros(nparams)
            for g in range(nparams):
                deltaCenter[g] = dCenter[g, 0]
            dA = covLearningRate * G

            # Update
            self.center += deltaCenter
            _A += dA

            centroidfit = -999999999.0
            if self.evalCenter != 0:
                # Evaluate the centroid
                self.policy.set_trainable_flat(self.center)
                if self.sameenvcond == 1:
                    self.policy.setSeed(self.policy.get_seed + cgen)
                eval_rews, eval_length = self.policy.rollout(
                    timestep_limit=1000)
                centroidfit = eval_rews
                ceval += eval_length
                # Update data if the centroid is better than current best
                self.updateBest(centroidfit, self.center)

            # Now perform generalization
            if self.policy.generalize:
                candidate = None
                if centroidfit > fitness[0]:
                    # Centroid undergoes generalization test
                    candidate = np.copy(self.center)
                else:
                    # Best sample undergoes generalization test
                    bestsamid = index[0]
                    candidate = np.copy(offspring[:, bestsamid])
                # Set the seed
                self.policy.set_trainable_flat(
                    candidate)  # Parameters must be updated by the algorithm!!
                self.policy.setSeed(self.policy.get_seed + 1000000)
                self.policy.doGeneralization(True)
                eval_rews, eval_length = self.policy.rollout(
                    timestep_limit=1000)
                gfit = eval_rews
                ceval += eval_length
                # Update data if the candidate is better than current best generalizing individual
                self.updateBestg(gfit, candidate)

            # Compute a size measure of the covariance matrix (sum of absolute entries divided by nparams)
            covSize = 0.0
            for g in range(nparams):
                for gg in range(nparams):
                    covSize += abs(_A[g, gg])
            covSize /= nparams
            if covSize >= 100.0:
                # Reset variables when covariance matrix diverges
                print("Reset xNES: covsize %.2f" % covSize)
                _A = zeros((nparams, nparams))

            # Compute the elapsed time since the start of the run
            elapsed = (time.time() - start_time)

            # Update information
            self.updateInfo(cgen, ceval, fitness, self.center, centroidfit,
                            fitness[0], elapsed, maxsteps)

        # save data
        self.save(cgen, ceval, centroidfit, self.center, fitness[0],
                  (time.time() - start_time))

        # print simulation time
        end_time = time.time()
        print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
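
This example and the ones below call a descendent_sort() helper that is not included in the listing. A minimal sketch consistent with the way it is used here (the fitness values sorted in descending order first, the indices of their original positions second) could look as follows; the actual helper in the source repository may differ.

import numpy as np

def descendent_sort(values):
    # Hypothetical reconstruction: sort in descending order and also return
    # the original positions of the sorted elements.
    index = np.argsort(np.asarray(values))[::-1]
    return np.asarray(values)[index], index
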
Example #2
    def run(self, maxsteps):

        start_time = time.time()  # start time
        nparams = self.policy.nparams  # number of parameters
        popsize = self.batchSize  # popsize
        ceval = 0  # current evaluation
        cgen = 0  # current generation
        rg = np.random.RandomState(
            self.seed)  # create a random generator and initialize the seed
        pop = rg.randn(popsize, nparams)  # population
        fitness = zeros(popsize)  # fitness
        fitness_beh = zeros((popsize, 3))
        self.stat = np.arange(
            0, dtype=np.float64
        )  # initialize vector containing performance across generations

        assert (popsize % 2) == 0, "the population size must be even"

        # initialize the population
        for i in range(popsize):
            pop[i] = self.policy.get_trainable_flat()

        print(
            "SSS: seed %d maxmsteps %d popSize %d noiseStdDev %lf crossoverrate %lf nparams %d"
            % (self.seed, maxsteps / 1000000, popsize, self.noiseStdDev,
               self.crossoverrate, nparams))

        # main loop
        elapsed = 0
        while (ceval < maxsteps):

            cgen += 1

            # If normalize=1 we update the normalization vectors
            if (self.policy.normalize == 1):
                self.policy.nn.updateNormalizationVectors()

            self.env.seed(
                self.policy.get_seed +
                cgen)  # set the environment seed, it changes every generation
            self.policy.nn.seed(
                self.policy.get_seed +
                cgen)  # set the policy seed, it changes every generation

            # Evaluate the population
            for i in range(popsize):
                self.policy.set_trainable_flat(pop[i])  # set policy parameters
                eval_rews, eval_length, rews1, rews2 = self.policy.rollout(
                    self.policy.ntrials,
                    timestep_limit=1000)  # evaluate the individual
                fitness[i] = eval_rews  # store fitness
                fitness_beh[i] = np.array([i, rews1, rews2])
                ceval += eval_length  # Update the number of evaluations
                self.updateBest(
                    fitness[i], pop[i]
                )  # Update data if the current offspring is better than current best

            fitness, index = descendent_sort(
                fitness
            )  # create an index with the ID of the individuals sorted for fitness
            bfit = fitness[index[0]]
            self.updateBest(
                bfit, pop[index[0]]
            )  # eventually update the genotype/fitness of the best individual so far

            # PARETO-FRONT
            pareto_front_idx = []
            front_len = []
            halfpopsize = int(popsize / 2)
            #dominated = fitness_beh.copy()
            count = 0
            #while len(dominated) > 0:
            #current_level = []
            current_idx = []
            for i in range(len(fitness_beh)):
                res = ~(fitness_beh[i] > fitness_beh)
                res = np.delete(res, i, axis=0)
                if not (np.any(np.all(res, axis=1))):
                    #current_level.append(dominated[i])
                    pareto_front_idx.append(int(fitness_beh[i, 0]))
                    current_idx.append(i)
                    count += 1
                    # if len(pareto_front_idx) == halfpopsize:
                    #    break
            print("Number of genotypes in the pareto-front: %.2f" % (count))
            #pareto_front_idx.append(current_level)
            #front_len.append(len(current_idx))
            dominated = np.array(np.delete(fitness_beh[:, 0],
                                           current_idx,
                                           axis=0),
                                 dtype=np.int64)

            childrensize = popsize - count
            parent = np.random.choice(pareto_front_idx,
                                      size=childrensize,
                                      replace=True)
            cross_prob = np.random.uniform(low=0.0,
                                           high=1.0,
                                           size=childrensize)

            for i in range(childrensize):
                # crossover between the first parent and a second parent randomly selected from the Pareto front
                if cross_prob[i] < self.crossoverrate:
                    parent_1 = pop[parent[i]]
                    idx_p2 = np.random.choice(pareto_front_idx,
                                              size=2,
                                              replace=False)
                    if idx_p2[0] != parent[i]:
                        parent_2 = pop[idx_p2[0]]
                    else:
                        parent_2 = pop[idx_p2[1]]
                    cutting_points = np.random.choice(np.arange(0, nparams, 1),
                                                      size=2,
                                                      replace=False)
                    min_point = cutting_points.min()
                    max_point = cutting_points.max()

                    # Sections A and C from the first parent combined with section B from the second parent
                    if np.random.uniform(low=0.0, high=1.0) < 0.5:
                        pop[dominated[i], :min_point] = parent_1[:min_point]
                        pop[dominated[i], min_point:max_point] = parent_2[
                            min_point:max_point]
                        pop[dominated[i], max_point:] = parent_1[max_point:]
                    # Sections A and C from the second parent combined with section B from the first parent
                    else:
                        pop[dominated[i], :min_point] = parent_2[:min_point]
                        pop[dominated[i], min_point:max_point] = parent_1[
                            min_point:max_point]
                        pop[dominated[i], max_point:] = parent_2[max_point:]

                    pop[dominated[i]] += (rg.randn(nparams) * self.noiseStdDev)

                else:
                    pop[dominated[i]] = pop[parent[i]] + (
                        rg.randn(1, nparams) * self.noiseStdDev)

            # Postevaluate the best individual
            self.env.seed(
                self.policy.get_seed + 100000
            )  # set the environmental seed, always the same for the same seed
            self.policy.nn.seed(
                self.policy.get_seed + 100000
            )  # set the policy seed, always the same for the same seed
            self.policy.set_trainable_flat(
                pop[index[0]])  # set the parameters of the policy
            eval_rews, eval_length, _, _ = self.policy.rollout(
                self.policy.ntrials, timestep_limit=1000, post_eval=True)
            bgfit = eval_rews
            ceval += eval_length
            self.updateBestg(
                bgfit, pop[index[0]]
            )  # eventually update the genotype/fitness of the best post-evaluated individual

            # display info
            print(
                'Seed %d (%.1f%%) gen %d msteps %d bestfit %.2f bestgfit %.2f cbestfit %.2f cbestgfit %.2f avgfit %.2f weightsize %.2f'
                %
                (self.seed, ceval / float(maxsteps) * 100, cgen,
                 ceval / 1000000, self.bestfit, self.bestgfit, bfit, bgfit,
                 np.average(fitness), np.average(np.absolute(pop[index[0]]))))

            # store data throughout generations
            self.stat = np.append(self.stat, [
                ceval, self.bestfit, self.bestgfit, bfit, bgfit,
                np.average(fitness)
            ])

            # save data
            if ((time.time() - self.last_save_time) >
                (self.policy.saveeach * 60)):
                self.save(ceval, cgen, maxsteps, bfit, bgfit,
                          np.average(fitness),
                          np.average(np.absolute(pop[index[0]])))
                self.last_save_time = time.time()

        self.save(ceval, cgen, maxsteps, bfit, bgfit, np.average(fitness),
                  np.average(np.absolute(pop[index[0]])))
        end_time = time.time()
        print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
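
The Pareto-front test above keeps an individual only if no other row of fitness_beh is at least as good on every component. A simplified, self-contained sketch of that non-dominated filter (my own illustration on a plain two-objective array, without the genotype-id column carried in column 0 above) is:

import numpy as np

def pareto_front(objs):
    # objs: (popsize, nobjectives) array, higher values are better
    front = []
    for i in range(len(objs)):
        others = np.delete(objs, i, axis=0)
        # i is dominated if some other individual is >= on every objective
        if not np.any(np.all(others >= objs[i], axis=1)):
            front.append(i)
    return front

print(pareto_front(np.array([[3.0, 1.0], [2.0, 2.0], [1.0, 1.0]])))  # [0, 1]
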
Example #3
    def run(self, maxsteps):

        start_time = time.time()

        # initialize the solution center
        self.center = self.policy.get_trainable_flat()

        # Extract the number of parameters
        nparams = self.policy.nparams
        # setting parameters
        if self.batchSize == 0:
            # Use default value: 4 + floor(3 * log(N)), where N is the number of parameters
            self.batchSize = int(
                4 +
                floor(3 * log(nparams)))  # population size, offspring number
            if "Tf" in type(self.policy).__name__:
                # Update the number of rollout calls in policy (the initial value has been set based on configuration file)
                self.policy.updaten(self.batchSize)

        mu = int(floor(self.batchSize /
                       2))  # number of parents/points for recombination
        weights = log(mu + 1) - log(array(range(1, mu + 1)))  # use array
        weights /= sum(weights)  # normalize recombination weights array
        muEff = sum(weights)**2 / sum(power(
            weights, 2))  # variance-effective size of mu
        cumCov = 4 / float(
            nparams + 4)  # time constant for cumulation for covariance matrix
        cumStep = (muEff + 2) / (nparams + muEff + 3
                                 )  # t-const for cumulation for Size control
        muCov = muEff  # size of mu used for calculating learning rate covLearningRate
        covLearningRate = ((1 / muCov) * 2 / (nparams + 1.4)**2 +
                           (1 - 1 / muCov) *  # learning rate for
                           ((2 * muEff - 1) / ((nparams + 2)**2 + 2 * muEff))
                           )  # covariance matrix
        dampings = 1 + 2 * max(0,
                               sqrt((muEff - 1) / (nparams + 1)) - 1) + cumStep
        # damping for stepSize usually close to 1 former damp == dampings/cumStep
        # Initialize dynamic (internal) strategy parameters and constants
        covPath = zeros(nparams)
        stepPath = zeros(nparams)  # evolution paths for C and stepSize
        B = eye(nparams, nparams)  # B defines the coordinate system
        D = eye(nparams, nparams)  # diagonal matrix D defines the scaling
        C = dot(dot(B, D), dot(B, D).T)  # covariance matrix
        chiN = nparams**0.5 * (1 - 1. / (4. * nparams) + 1 /
                               (21. * nparams**2))
        # expectation of ||N(0,I)|| == norm(randn(nparams, 1))
        self.stepsize = 0.5

        ceval = 0  # current evaluation
        cgen = 0  # current generation

        # RandomState for perturbing the performed actions (used only for samples, not for centroid)
        np.random.seed(self.seed)

        print(
            "CMA-ES: seed %d maxmsteps %d batchSize %d stepsize %.2f sameEnvCond %d nparams %d"
            % (self.seed, maxsteps / 1000000, self.batchSize, self.stepsize,
               self.sameenvcond, nparams))

        # Set evolution mode
        self.policy.runEvo()
        """
        updateCovMatRate = 1
        if not self.updateCovEveryGen:
            updateCovMatRate = 0.1 / covLearningRate / nparams
            decPart = math.modf(updateCovMatRate)[0]
            if decPart >= 0.5:
                updateCovMatRate = ceil(updateCovMatRate)
            else:
                updateCovMatRate = floor(updateCovMatRate)
        updateCovMatRate = int(updateCovMatRate)
        """

        # main loop
        elapsed = 0
        while ceval < maxsteps:
            cgen += 1

            # Draw samples from a Gaussian distribution with mean 0.0 and standard deviation 1.0
            samples = np.random.randn(nparams, self.batchSize)
            # Generate offspring
            offspring = tile(
                self.center.reshape(nparams, 1),
                (1, self.batchSize)) + self.stepsize * dot(dot(B, D), samples)
            # Evaluate offspring
            fitness = zeros(self.batchSize)
            # If normalize=1 we update the normalization vectors
            if self.policy.normalize == 1:
                self.policy.nn.updateNormalizationVectors()
            # Reset environmental seed every generation
            self.policy.setSeed(self.policy.get_seed + cgen)
            # Set generalization flag to False
            self.policy.doGeneralization(False)
            # Evaluate offspring
            for k in range(self.batchSize):
                # Set policy parameters (corresponding to the current offspring)
                self.policy.set_trainable_flat(offspring[:, k])
                # Samples of the same generation experience the same environmental conditions
                if self.sameenvcond == 1:
                    self.policy.setSeed(self.policy.get_seed + cgen)
                # Evaluate the offspring
                eval_rews, eval_length = self.policy.rollout(
                    timestep_limit=1000)
                # Get the fitness
                fitness[k] = eval_rews
                # Update the number of evaluations
                ceval += eval_length
                # Update data if the current offspring is better than current best
                self.updateBest(fitness[k], offspring[:, k])

            # Sort by fitness and compute weighted mean into center
            fitness, index = descendent_sort(fitness)
            # Re-organize samples according to indices
            samples = samples[:, index]
            # Do the same for offspring
            offspring = offspring[:, index]
            # Select best <mu> samples and offspring for computing new center and cumulation paths
            samsel = samples[:, range(mu)]
            offsel = offspring[:, range(mu)]
            offmut = offsel - tile(self.center.reshape(nparams, 1), (1, mu))

            samplemean = dot(samsel, weights)
            self.center = dot(offsel, weights)

            # Cumulation: Update evolution paths
            stepPath = (1 - cumStep) * stepPath \
                     + sqrt(cumStep * (2 - cumStep) * muEff) * dot(B, samplemean)   # Eq. (4)
            hsig = norm(stepPath) / sqrt(1 - (1 - cumStep) ** (2 * ceval / float(self.batchSize))) / chiN \
                     < 1.4 + 2. / (nparams + 1)
            covPath = (1 - cumCov) * covPath + hsig * \
                     sqrt(cumCov * (2 - cumCov) * muEff) * dot(dot(B, D), samplemean) # Eq. (2)

            # Adapt covariance matrix C
            C = ((1 - covLearningRate) * C  # regard old matrix   % Eq. (3)
                 + covLearningRate * (1 / muCov) * (
                     outer(covPath, covPath)  # plus rank one update
                     + (1 - hsig) * cumCov * (2 - cumCov) * C) +
                 covLearningRate * (1 - 1 / muCov)  # plus rank mu update
                 * dot(dot(offmut, diag(weights)), offmut.T))

            # Adapt step size
            self.stepsize *= exp(
                (cumStep / dampings) * (norm(stepPath) / chiN - 1))  # Eq. (5)

            # Update B and D from C
            # This is O(n^3). When strategy internal CPU-time is critical, the
            # next three lines should be executed only every (alpha/covLearningRate/N)-th
            # iteration, where alpha is e.g. between 0.1 and 10
            C = (C + C.T) / 2  # enforce symmetry
            Ev, B = eig(C)  # eigen decomposition, B==normalized eigenvectors
            Ev = real(Ev)  # enforce real value
            D = diag(
                sqrt(Ev)
            )  #diag(ravel(sqrt(Ev))) # D contains standard deviations now
            B = real(B)

            centroidfit = -999999999.0
            if self.evalCenter != 0:
                # Evaluate the centroid
                self.policy.set_trainable_flat(self.center)
                if self.sameenvcond == 1:
                    self.policy.setSeed(self.policy.get_seed + cgen)
                eval_rews, eval_length = self.policy.rollout(
                    timestep_limit=1000)
                centroidfit = eval_rews
                ceval += eval_length
                # Update data if the centroid is better than current best
                self.updateBest(centroidfit, self.center)

            # Now perform generalization
            if self.policy.generalize:
                candidate = None
                if centroidfit > fitness[0]:
                    # Centroid undergoes generalization test
                    candidate = np.copy(self.center)
                else:
                    # Best sample undergoes generalization test
                    bestsamid = index[0]
                    candidate = np.copy(offspring[:, bestsamid])
                # Set the seed
                self.policy.set_trainable_flat(
                    candidate)  # Parameters must be updated by the algorithm!!
                self.policy.setSeed(self.policy.get_seed + 1000000)
                self.policy.doGeneralization(True)
                eval_rews, eval_length = self.policy.rollout(
                    timestep_limit=1000)
                gfit = eval_rews
                ceval += eval_length
                # Update data if the candidate is better than current best generalizing individual
                self.updateBestg(gfit, candidate)

            # Compute a size measure of the covariance matrix (sum of absolute entries divided by nparams)
            covSize = 0.0
            for g in range(nparams):
                for gg in range(nparams):
                    covSize += abs(C[g, gg])
            covSize /= nparams
            if self.stepsize >= 10.0 or covSize >= 100.0 or (
                    self.stepsize >= 5.0 and covSize >= 20.0):
                # Reset variables when either stepsize or covariance matrix diverges
                print("Reset CMAES: stepsize %.2f covsize %.2f" %
                      (self.stepsize, covSize))
                covPath = zeros(nparams)
                stepPath = zeros(nparams)
                B = eye(nparams, nparams)
                D = eye(nparams, nparams)
                C = dot(dot(B, D), dot(B, D).T)
                self.stepsize = 0.5

            # Compute the elapsed time since the start of the run
            elapsed = (time.time() - start_time)

            # Update information
            self.updateInfo(cgen, ceval, fitness, self.center, centroidfit,
                            fitness[0], elapsed, maxsteps)

        # save data
        self.save(cgen, ceval, centroidfit, self.center, fitness[0],
                  (time.time() - start_time))

        # print simulation time
        end_time = time.time()
        print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
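
As a worked example of the default hyperparameters computed at the top of run() (illustrative values only), for nparams = 100 the formulas above give:

from math import floor, log
import numpy as np

nparams = 100
batchSize = int(4 + floor(3 * log(nparams)))            # 4 + floor(13.8) = 17
mu = int(floor(batchSize / 2))                          # 8 parents
weights = np.log(mu + 1) - np.log(np.arange(1, mu + 1))
weights /= np.sum(weights)                              # normalized recombination weights
muEff = np.sum(weights) ** 2 / np.sum(weights ** 2)     # variance-effective mu, about 5.1
print(batchSize, mu, round(muEff, 2))                   # 17 8 5.1
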
Example #4
    def run(self):

        self.loadhyperparameters()  # initialize hyperparameters
        start_time = time.time()  # start time
        nparams = self.policy.nparams  # number of parameters
        ceval = 0  # current evaluation
        cgen = 0  # current generation
        rg = np.random.RandomState(
            self.seed)  # create a random generator and initialize the seed
        pop = rg.randn(self.popsize, nparams)  # population
        fitness = zeros(self.popsize)  # fitness
        self.stat = np.arange(
            0, dtype=np.float64
        )  # initialize vector containing performance across generations

        assert (self.popsize % 2) == 0, "the population size must be even"

        # initialize the population
        for i in range(self.popsize):
            pop[i] = self.policy.get_trainable_flat()

        print(
            "SSS: seed %d maxmsteps %d popSize %d noiseStdDev %lf nparams %d" %
            (self.seed, self.maxsteps / 1000000, self.popsize, self.mutation,
             nparams))

        # main loop
        elapsed = 0
        while (ceval < self.maxsteps):

            cgen += 1

            # If normalize=1 we update the normalization vectors
            if (self.policy.normalize == 1):
                self.policy.nn.updateNormalizationVectors()

            self.env.seed(
                self.policy.get_seed +
                cgen)  # set the environment seed, it changes every generation
            self.policy.nn.seed(
                self.policy.get_seed +
                cgen)  # set the policy seed, it changes every generation

            # Evaluate the population
            for i in range(self.popsize):
                self.policy.set_trainable_flat(pop[i])  # set policy parameters
                eval_rews, eval_length = self.policy.rollout(
                    self.policy.ntrials)  # evaluate the individual
                fitness[i] = eval_rews  # store fitness
                ceval += eval_length  # Update the number of evaluations
                self.updateBest(
                    fitness[i], pop[i]
                )  # Update data if the current offspring is better than current best

            fitness, index = descendent_sort(
                fitness
            )  # create an index with the ID of the individuals sorted for fitness
            bfit = fitness[index[0]]
            self.updateBest(
                bfit, pop[index[0]]
            )  # eventually update the genotype/fitness of the best individual so far

            # Postevaluate the best individual
            self.env.seed(
                self.policy.get_seed + 100000
            )  # set the environmental seed, always the same for the same seed
            self.policy.nn.seed(
                self.policy.get_seed + 100000
            )  # set the policy seed, always the same for the same seed
            self.policy.set_trainable_flat(
                pop[index[0]])  # set the parameters of the policy
            eval_rews, eval_length = self.policy.rollout(self.policy.ntrials)
            bgfit = eval_rews
            ceval += eval_length
            self.updateBestg(
                bgfit, pop[index[0]]
            )  # eventually update the genotype/fitness of the best post-evaluated individual

            # replace the worst half of the population with a mutated copy of the first half of the population
            halfpopsize = int(self.popsize / 2)
            for i in range(halfpopsize):
                pop[index[i + halfpopsize]] = pop[index[i]] + (
                    rg.randn(1, nparams) * self.mutation)

            # display info
            print(
                'Seed %d (%.1f%%) gen %d msteps %d bestfit %.2f bestgfit %.2f cbestfit %.2f cbestgfit %.2f avgfit %.2f weightsize %.2f'
                %
                (self.seed, ceval / float(self.maxsteps) * 100, cgen,
                 ceval / 1000000, self.bestfit, self.bestgfit, bfit, bgfit,
                 np.average(fitness), np.average(np.absolute(pop[index[0]]))))

            # store data throughout generations
            self.stat = np.append(self.stat, [
                ceval, self.bestfit, self.bestgfit, bfit, bgfit,
                np.average(fitness)
            ])

            # save data
            if ((time.time() - self.last_save_time) > (self.saveeach * 60)):
                self.save(ceval, cgen, bfit, bgfit, np.average(fitness),
                          np.average(np.absolute(pop[index[0]])))
                self.last_save_time = time.time()

        self.save(ceval, cgen, bfit, bgfit, np.average(fitness),
                  np.average(np.absolute(pop[index[0]])))
        end_time = time.time()
        print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
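
A usage note on the statistics vector (my own sketch, not part of the algorithm): self.stat grows by six values per generation, so after a run it can be reshaped into one row per generation for inspection or plotting.

import numpy as np

stat = np.arange(0, dtype=np.float64)                      # as initialized in run()
stat = np.append(stat, [1000, 1.0, 0.9, 1.0, 0.9, 0.5])    # generation 1 (dummy values)
stat = np.append(stat, [2000, 1.2, 1.0, 1.2, 1.0, 0.6])    # generation 2 (dummy values)
table = stat.reshape(-1, 6)   # columns: ceval, bestfit, bestgfit, bfit, bgfit, avgfit
print(table.shape)            # (2, 6)
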
Example #5
    def run(self, maxsteps):

        start_time = time.time()

        # initialize the solution center (here the centroid is used to generate
        # random individuals)
        self.center = self.policy.get_trainable_flat()

        # Extract the number of parameters
        nparams = self.policy.nparams
        # setting parameters

        ceval = 0  # current evaluation
        cgen = 0  # current generation

        # RandomState for perturbing the performed actions (used only for samples, not for centroid)
        np.random.seed(self.seed)

        print(
            "SSS: seed %d maxmsteps %d batchSize %d sameEnvCond %d nparams %d"
            % (self.seed, maxsteps / 1000000, self.batchSize, self.sameenvcond,
               nparams))

        # Set evolution mode
        self.policy.runEvo()

        # Population
        self.pop = tile(self.center.reshape(1, nparams), (self.batchSize, 1))
        # Apply random variations to solution center
        for i in range(self.batchSize):
            for j in range(nparams):
                self.pop[i, j] += np.random.random() * 0.2 - 0.1

        # Allocate offspring
        offspring = np.zeros((self.batchSize, nparams))

        # The centroid is not evaluated by this algorithm; use a placeholder fitness
        centroidfit = -999999999.0

        # main loop
        elapsed = 0
        while ceval < maxsteps:
            cgen += 1

            fitness = zeros(self.batchSize * 2)
            # If normalize=1 we update the normalization vectors
            if self.policy.normalize == 1:
                self.policy.nn.updateNormalizationVectors()
            # Reset environmental seed every generation
            self.policy.setSeed(self.policy.get_seed + cgen)
            # Set generalization flag to False
            self.policy.doGeneralization(False)
            # Evaluate parents and offspring
            for k in range(self.batchSize):
                # Set policy parameters (corresponding to the current offspring)
                self.policy.set_trainable_flat(self.pop[k])
                # Samples of the same generation experience the same environmental conditions
                if self.sameenvcond == 1:
                    self.policy.setSeed(self.policy.get_seed + cgen)
                # Evaluate the parents
                eval_rews, eval_length = self.policy.rollout(
                    timestep_limit=1000)
                # Get the fitness
                fitness[k] = eval_rews
                # Update the number of evaluations
                ceval += eval_length
                # Update data if the current parent is better than current best
                self.updateBest(fitness[k], self.pop[k])
                # Generate the offspring
                for j in range(nparams):
                    offspring[k, j] = self.pop[k, j]
                    if np.random.uniform(low=0.0, high=1.0) < 0.03:
                        # Extract a random number to perform either weight
                        # replacement or weight perturbation
                        if np.random.uniform(low=0.0, high=1.0) < 0.5:
                            # Weight replacement
                            offspring[k, j] = np.random.random() * (
                                self.policy.wrange * 2.0) - self.policy.wrange
                        else:
                            # Weight perturbation
                            offspring[k, j] += np.random.random() * 0.2 - 0.1
                self.policy.set_trainable_flat(offspring[k])
                # Samples of the same generation experience the same environmental conditions
                if self.sameenvcond == 1:
                    self.policy.setSeed(self.policy.get_seed + cgen)
                # Evaluate the offspring
                eval_rews, eval_length = self.policy.rollout(
                    timestep_limit=1000)
                # Get the fitness
                fitness[self.batchSize + k] = eval_rews
                # Update the number of evaluations
                ceval += eval_length
                # Update data if the current offspring is better than current best
                self.updateBest(fitness[self.batchSize + k], offspring[k])

            # Selection
            parentfit = np.copy(fitness[0:self.batchSize])
            # Add noise to parent fitness
            """
            for i in range(self.batchSize):
                noise = np.random.random() * self.noiseStdDev * 2.0 - self.noiseStdDev
                parentfit[i] += (noise * parentfit[i])
            """
            offspringfit = np.copy(fitness[self.batchSize:(self.batchSize *
                                                           2)])
            # Add noise to offspring fitness
            """
            for i in range(self.batchSize):
                noise = np.random.random() * self.noiseStdDev * 2.0 - self.noiseStdDev
                offspringfit[i] += (noise * offspringfit[i])
            """
            # Sort parent and offspring based on fitness (descending mode)
            parentfit, parentidx = descendent_sort(parentfit)
            offspringfit, offspringidx = descendent_sort(offspringfit)
            # Population index
            k = 0
            # Parent index
            p = 0
            # Offspring index
            o = 0
            while k < self.batchSize:
                if parentfit[p] > offspringfit[o]:
                    p += 1
                else:
                    # Offspring replaces worst parent
                    wp = parentidx[self.batchSize - 1 - o]
                    bo = offspringidx[o]
                    self.pop[wp] = np.copy(offspring[bo])
                    fitness[wp] = fitness[self.batchSize + bo]
                    o += 1
                k += 1

            # Get the best individual (of the current generation)
            bfit = np.copy(fitness[0:self.batchSize])
            bfit, bidx = descendent_sort(bfit)
            bidx = bidx[0]

            # Now perform generalization
            if self.policy.generalize:
                candidate = np.copy(self.pop[bidx])
                # Set the seed
                self.policy.set_trainable_flat(
                    candidate)  # Parameters must be updated by the algorithm!!
                self.policy.setSeed(self.policy.get_seed + 1000000)
                self.policy.doGeneralization(True)
                eval_rews, eval_length = self.policy.rollout(
                    timestep_limit=1000)
                gfit = eval_rews
                ceval += eval_length
                # Update data if the candidate is better than current best generalizing individual
                self.updateBestg(gfit, candidate)

            # Compute the elapsed time since the start of the run
            elapsed = (time.time() - start_time)

            # Update information
            self.updateInfo(cgen, ceval, fitness[0:self.batchSize],
                            self.center, centroidfit, fitness[bidx], elapsed,
                            maxsteps)

        # save data
        self.save(cgen, ceval, centroidfit, self.center, fitness[bidx],
                  (time.time() - start_time))

        # print simulation time
        end_time = time.time()
        print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
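
The mutation operator embedded in the evaluation loop above can be read in isolation as follows (an illustrative sketch, not the repository's code): each gene mutates with probability 0.03, and a mutation either replaces the weight with a value in [-wrange, +wrange] or perturbs it by a uniform offset in [-0.1, +0.1].

import numpy as np

def mutate(genotype, wrange, rng=np.random):
    # Illustrative sketch of the per-gene mutation used in Example #5
    child = np.copy(genotype)
    for j in range(len(child)):
        if rng.uniform(0.0, 1.0) < 0.03:
            if rng.uniform(0.0, 1.0) < 0.5:
                child[j] = rng.uniform(-wrange, wrange)   # weight replacement
            else:
                child[j] += rng.uniform(-0.1, 0.1)        # weight perturbation
    return child
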