def __init__(self,
                 task,
                 policy,
                 population_size=20,
                 sigma=0.5,
                 num_workers=1):
        """
        Initialize the CMA-ES algorithm.

        Args:
            task (RLTask, Env): RL task/env to run
            policy (Policy): specify the policy (model) to optimize
            population_size (int): size of the population
            sigma (float): initial standard deviation for CMA-ES
            num_workers (int): number of workers/jobs to run in parallel
        """
        # create explorer
        # create evaluator
        # create updater
        # super(CMAES, self).__init__(self, explorer, evaluator, updater, num_workers=1)

        if isinstance(task, Env):
            task = RLTask(task, policy)
        self.task = task
        self.policy = policy
        self.population_size = population_size
        self.sigma = sigma
        self.num_workers = num_workers
        self.es = None

        self.best_reward = -np.infty
        self.best_parameters = None
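The `es` attribute is presumably populated later with an optimizer from the third-party `cma` package. As orientation, here is a minimal, hedged ask/tell loop sketch; `evaluate_params` is a hypothetical placeholder for loading the parameters into the policy and running the task, not part of PyRoboLearn.

import numpy as np
import cma  # third-party CMA-ES package


def evaluate_params(params):
    """Hypothetical placeholder: load `params` into the policy, run the task, return a cost."""
    return float(np.sum((params - 0.5) ** 2))  # dummy objective for illustration


x0 = np.zeros(10)                                        # initial flat parameter vector
es = cma.CMAEvolutionStrategy(x0, 0.5, {'popsize': 20})  # sigma=0.5, population_size=20
while not es.stop():
    solutions = es.ask()                                 # sample a population of candidate parameters
    costs = [evaluate_params(s) for s in solutions]      # CMA-ES minimizes, so use -reward in practice
    es.tell(solutions, costs)                            # update mean, step-size and covariance
best_parameters = es.result.xbest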
Example #2
    def __init__(self, task=None, policy=None, domain=(-3., 3.), num_workers=1):
        """
        Initialize the Bayesian Optimization algorithm.

        Args:
            task (RLTask, Env): RL task/env to run
            policy (Policy): specify the policy (model) to optimize
            domain (tuple of float): lower and upper bounds of the domain over which the policy parameters are
                searched
            num_workers (int): number of workers/jobs to run in parallel
        """
        if isinstance(task, Env):
            task = RLTask(task, policy)
        self.task = task
        self.policy = policy
        self.num_workers = num_workers

        self.best_reward = -np.infty
        self.best_parameters = None

        self.num_steps = 1000
        self.num_rollouts = 1
        self.verbose = False
        self.episode = 0
        self.render = False

        self.domain = domain

        self.rewards = []
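The `domain` tuple gives the bounds over which the policy parameters are searched. The following is a generic Bayesian-optimization loop sketch, not PyRoboLearn's implementation: it fits a scikit-learn Gaussian process to (parameter, reward) pairs and picks the next candidate with an upper-confidence-bound acquisition; `evaluate` is a hypothetical placeholder for a rollout.

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor


def evaluate(theta):
    """Hypothetical placeholder: run a rollout with parameter `theta` and return its reward."""
    return -(theta - 1.0) ** 2  # dummy 1-D objective for illustration


domain = (-3., 3.)
X = list(np.random.uniform(domain[0], domain[1], size=3))  # a few random initial evaluations
y = [evaluate(x) for x in X]

gp = GaussianProcessRegressor()
candidates = np.linspace(domain[0], domain[1], 200).reshape(-1, 1)

for _ in range(20):
    gp.fit(np.reshape(X, (-1, 1)), y)
    mean, std = gp.predict(candidates, return_std=True)
    ucb = mean + 2.0 * std                      # upper-confidence-bound acquisition
    x_next = float(candidates[np.argmax(ucb)])  # most promising parameter to try next
    X.append(x_next)
    y.append(evaluate(x_next))

best_parameters = X[int(np.argmax(y))]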
Example #3
    def __init__(self, task, policy, population_size=20, elite_fraction=0.2, num_workers=1):
        """
        Initialize the CEM algorithm.

        Args:
            task (RLTask, Env): RL task/env to run
            policy (Policy): specify the policy (model) to optimize
            population_size (int): size of the population
            elite_fraction (float): fraction of elites to use to compute the new mean and covariance matrix of the
                multivariate normal distribution
            num_workers (int): number of workers/jobs to run in parallel
        """
        # create explorer
        # create evaluator
        # create updater
        # super(CEM, self).__init__(self, explorer, evaluator, updater, num_workers=1)

        if isinstance(task, Env):
            task = RLTask(task, policy)
        self.task = task
        self.policy = policy
        self.population_size = population_size
        self.elite_fraction = elite_fraction
        self.num_workers = num_workers

        self.best_reward = -np.infty
        self.best_parameters = None
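For reference, a compact sketch of one cross-entropy-method iteration as described above: sample a population from a multivariate Gaussian, keep the elite fraction, and refit the mean and covariance. The `evaluate` function is a hypothetical placeholder for a rollout; this is a didactic sketch, not PyRoboLearn's updater.

import numpy as np


def evaluate(params):
    """Hypothetical placeholder: load `params` into the policy and run a rollout."""
    return -float(np.sum(params ** 2))  # dummy objective for illustration


population_size, elite_fraction = 20, 0.2
num_elites = max(2, int(elite_fraction * population_size))

mean = np.zeros(5)   # mean of the multivariate normal search distribution
cov = np.eye(5)      # covariance of the search distribution

for _ in range(50):
    population = np.random.multivariate_normal(mean, cov, size=population_size)
    rewards = np.array([evaluate(p) for p in population])
    elites = population[np.argsort(rewards)[-num_elites:]]  # best-performing individuals
    mean = elites.mean(axis=0)                              # refit the distribution on the elites
    cov = np.cov(elites, rowvar=False) + 1e-6 * np.eye(5)   # small jitter keeps it positive definite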
Example #4
    def __init__(self, task, policy, num_variations=None, std_dev=0.01, difference_type='central', learning_rate=0.001,
                 normalize_grad=False, num_workers=1):
        # hyperparameters
        """
        Initialize the FD algorithm.

        Args:
            task (RLTask, Env): RL task/env to run
            policy (Policy): specify the policy (model) to optimize
            num_variations (None, int): number of times we vary the parameters by a small random increment.
                If None, it is set to twice the number of parameters; according to [1], this yields very accurate
                gradient estimates.
            std_dev (float): the small increments are generated from a Normal distribution centered at 0 with
                this standard deviation.
            difference_type (str): type of difference estimator to use: 'forward' or 'central'.
                The forward-difference estimator computes the gradient using
                :math:`J(\theta + \Delta\theta) - J(\theta)`, while the central-difference estimator computes the
                gradient using :math:`J(\theta + \Delta\theta) - J(\theta - \Delta\theta)`
            learning_rate (float): learning rate (=coefficient) for the gradient ascent step
            normalize_grad (bool): specify if we should normalize the gradients
            num_workers (int): number of workers/jobs to run in parallel
        """
        # create explorer
        # create evaluator
        # create updater
        # super(FD, self).__init__(self, explorer, evaluator, updater, num_workers=1)

        if isinstance(task, Env):
            task = RLTask(task, policy)
        self.task = task
        self.policy = policy
        self.num_workers = num_workers

        # set the number of variations (small increments to vary the parameters)
        # From [1]: "Empirically it can be observed that taking the number of variations as twice the number
        # of parameters yields very accurate gradient estimates"
        if num_variations is None:
            num_variations = 2 * self.policy.num_parameters
        self.num_variations = num_variations

        # set standard deviation
        self.stddev = np.abs(std_dev)

        # set difference type
        if difference_type != 'forward' and difference_type != 'central':
            raise ValueError("Expecting the 'difference_type' argument to be 'forward' or 'central'. Instead got "
                             "'{}'".format(difference_type))
        self.difference_type = difference_type

        # set other parameters
        self.lr = learning_rate
        self.normalize_grad = bool(normalize_grad)

        # remember best parameters
        self.best_reward = -np.infty
        self.best_parameters = None
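The two estimators named in the docstring can be written down directly. The sketch below draws `num_variations` Gaussian increments, forms forward and central differences of a placeholder objective `J`, and recovers the gradient by least squares before one gradient-ascent step; it illustrates the technique and is not PyRoboLearn's code.

import numpy as np


def J(theta):
    """Hypothetical placeholder: expected return of the policy with flat parameters `theta`."""
    return -float(np.sum((theta - 1.0) ** 2))  # dummy objective for illustration


theta = np.zeros(4)
num_variations = 2 * theta.size   # twice the number of parameters, as suggested by [1]
std_dev = 0.01
learning_rate = 0.001

delta = np.random.normal(0., std_dev, size=(num_variations, theta.size))  # small random increments

# forward differences: J(theta + delta_i) - J(theta)
dJ_forward = np.array([J(theta + d) - J(theta) for d in delta])
# central differences: J(theta + delta_i) - J(theta - delta_i)
dJ_central = np.array([J(theta + d) - J(theta - d) for d in delta])

# least-squares gradient estimates (note the factor 2 for the central version)
grad_forward = np.linalg.lstsq(delta, dJ_forward, rcond=None)[0]
grad_central = np.linalg.lstsq(2. * delta, dJ_central, rcond=None)[0]

theta = theta + learning_rate * grad_central   # one gradient-ascent step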
Example #5
    def task(self, task):
        """Set the RL task."""
        if isinstance(task, (tuple, list)):
            env, policy = None, None
            for t in task:
                if isinstance(t, Env):
                    env = t
                if isinstance(t, Policy):  # TODO if multiple policies
                    policy = t
            if env is None or policy is None:
                raise ValueError("Expecting the task to be an instance of `RLTask` or a list/tuple of an environment "
                                 "and policy.")
            task = RLTask(env, policy)
        if not isinstance(task, RLTask):
            raise TypeError("Expecting the task to be an instance of `RLTask`, instead got: {}".format(type(task)))
        self._task = task
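In other words, this setter accepts either a ready-made `RLTask` or an (environment, policy) pair. The self-contained sketch below reproduces the same normalization pattern with minimal stand-in classes (not pyrobolearn's actual `Env`/`Policy`/`RLTask`), just to show the two accepted forms.

class Env: ...          # stand-in for pyrobolearn.envs.Env
class Policy: ...       # stand-in for pyrobolearn.policies.Policy


class RLTask:           # stand-in for pyrobolearn.tasks.RLTask
    def __init__(self, env, policy):
        self.env, self.policy = env, policy


def normalize_task(task):
    """Accept an RLTask or an (env, policy) pair and always return an RLTask."""
    if isinstance(task, (tuple, list)):
        env = next((t for t in task if isinstance(t, Env)), None)
        policy = next((t for t in task if isinstance(t, Policy)), None)
        if env is None or policy is None:
            raise ValueError("Expecting an RLTask or an (env, policy) pair.")
        task = RLTask(env, policy)
    if not isinstance(task, RLTask):
        raise TypeError("Expecting an RLTask, got: {}".format(type(task)))
    return task


task = normalize_task((Env(), Policy()))  # a pair is wrapped into an RLTask
task = normalize_task(task)               # an existing RLTask passes through unchanged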
Example #6
    def __init__(self,
                 task,
                 policy,
                 std_params=1.,
                 num_best_rollouts=10,
                 num_workers=1):
        """
        Initialize the PoWER algorithm.

        Args:
            task (RLTask, Env): RL task/env to run.
            policy (Policy): specify the policy (model) to optimize.
            std_params (float): standard deviation of the exploration noise added to the parameters.
            num_best_rollouts (int): number of best rollouts to keep in memory for the update.
            num_workers (int): number of workers/jobs to run in parallel.
        """
        # create explorer
        # create evaluator
        # create updater
        # super(PoWER, self).__init__(task, exploration_strategy, memory, hyperparameters)

        # set task
        if isinstance(task, Env):
            task = RLTask(task, policy)
        if not isinstance(task, RLTask):
            raise TypeError("Expecting task to be an instance of RLTask.")
        self.task = task

        # set policy
        self.policy = policy
        if not self.policy.is_parametric():
            raise ValueError("The policy should be parametric")
        if not self.policy.is_linear():
            raise ValueError(
                "The policy should be linear with respect to the parameters")

        # set standard deviation of the parameters
        self.std_params = std_params

        # set num best rollouts for memory
        self.num_best_rollouts = num_best_rollouts

        # set the number of workers
        self.num_workers = num_workers

        # remember best parameters
        self.best_reward = -np.infty
        self.best_parameters = None
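As a rough orientation, here is a heavily simplified, episode-based PoWER-style update for a linear policy: perturb the parameters with Gaussian noise of scale `std_params`, keep the best rollouts, and move the parameters by a return-weighted average of the perturbations. The weighting below is a didactic simplification (the actual PoWER algorithm uses importance weights derived from state-action values), and `rollout_return` is a hypothetical placeholder.

import numpy as np


def rollout_return(theta):
    """Hypothetical placeholder: run the linear policy with parameters `theta`, return the episode return."""
    return -float(np.sum((theta - 1.0) ** 2))  # dummy objective for illustration


theta = np.zeros(6)                 # current policy parameters
std_params = 1.0                    # exploration noise scale
num_rollouts, num_best_rollouts = 30, 10

for _ in range(100):
    eps = np.random.normal(0., std_params, size=(num_rollouts, theta.size))   # parameter exploration noise
    returns = np.array([rollout_return(theta + e) for e in eps])
    best = np.argsort(returns)[-num_best_rollouts:]            # keep the best rollouts in memory
    weights = returns[best] - returns[best].min() + 1e-10      # simple non-negative weights
    theta = theta + weights.dot(eps[best]) / weights.sum()     # return-weighted parameter update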
Example #7
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Example on how to use the 'Cartpole' OpenAI Gym environments in PyRoboLearn using a random policy
"""

from pyrobolearn.envs import gym
from pyrobolearn.policies import RandomPolicy
from pyrobolearn.tasks import RLTask

# create env, state, and action from gym
env = gym.make('CartPole-v1')
state, action = env.state, env.action
print("State and action space: {} and {}".format(state.space, action.space))

# create policy
policy = RandomPolicy(state, action)

# create task and run it
task = RLTask(env, policy)
task.run(num_steps=1000, dt=0.02, use_terminating_condition=False, render=True)
Example #8
    def __init__(self, task, policy, population_size=20, species_elitism=2, elitism=2, min_species_size=2,
                 survival_threshold=0.2, max_stagnation=15, compatibility_threshold=3, num_workers=1):
        r"""
        Initialize the NEAT algorithm.

        Args:
            task (RLTask, Env): RL task/env to run
            policy (Policy): specify the policy (model) to optimize
            population_size (int): size of the population
            species_elitism (int): number of species that are protected from stagnation (they always survive)
            elitism (int): number of most-fit individuals in each species that are preserved as-is from one
                generation to the next
            min_species_size (int): minimum number of genomes per species after reproduction
            survival_threshold (float): fraction of each species allowed to reproduce at each generation
            max_stagnation (int): species that have not improved for more than this number of generations are
                considered stagnant and removed (subject to `species_elitism`)
            compatibility_threshold (float): genomic distance threshold below which two individuals are considered
                to belong to the same species
            num_workers (int): number of workers/jobs to run in parallel
        """
        # create explorer
        # create evaluator
        # create updater
        # super(NEAT, self).__init__(self, explorer, evaluator, updater, num_workers=1)

        # set task
        if isinstance(task, Env):
            task = RLTask(task, policy)
        self.task = task

        # set policy
        if policy is None:
            policy = self.task.policies[0]  # TODO: currently assume only 1 policy
        if not isinstance(policy, NEATPolicy):
            raise TypeError("Expecting the policy to be an instance of 'NEATPolicy'.")
        self.policy = policy

        # set config file
        # more info about genome's config file: https://neat-python.readthedocs.io/en/latest/config_file.html
        # more info about activation fct: https://neat-python.readthedocs.io/en/latest/activation.html
        config_dict = {'[NEAT]': {'fitness_criterion': 'max',
                                  'fitness_threshold': 100,
                                  'no_fitness_termination': True,
                                  'pop_size': population_size,
                                  'reset_on_extinction': True},
                       '[DefaultSpeciesSet]': {'compatibility_threshold': compatibility_threshold},
                       '[DefaultStagnation]': {'species_fitness_func': 'max',
                                               'max_stagnation': max_stagnation,
                                               'species_elitism': species_elitism},
                       '[DefaultReproduction]': {'elitism': elitism,
                                                 'survival_threshold': survival_threshold,
                                                 'min_species_size': min_species_size}}

        # update config file of policy
        self.policy.update_config(config_dict)

        # get population
        self.population = self.policy.population

        # create useful variables
        self.num_steps = 1000
        self.num_rollouts = 1
        self.verbose = False
        self.episode = 0
        self.avg_rewards, self.max_rewards = [], []

        self.best_reward = -np.infty
        self.best_parameters = None
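The `config_dict` above mirrors the INI-style sections of a neat-python configuration file. The snippet below shows one illustrative way such a dictionary could be serialized to the config text neat-python expects (truncated to two sections for brevity); it is not necessarily what `NEATPolicy.update_config` does internally.

# Truncated config dictionary (two sections) in the same format as above.
config_dict = {'[NEAT]': {'fitness_criterion': 'max', 'pop_size': 20, 'reset_on_extinction': True},
               '[DefaultReproduction]': {'elitism': 2, 'survival_threshold': 0.2, 'min_species_size': 2}}

lines = []
for section, options in config_dict.items():
    lines.append(section)                                                    # e.g. "[NEAT]"
    lines.extend('{} = {}'.format(key, value) for key, value in options.items())
    lines.append('')                                                         # blank line between sections
config_text = '\n'.join(lines)
print(config_text)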