Example #1
def rollout(env, policy, path_length, render=False, speedup=None):
    Da = flat_dim(env.action_space)
    Do = flat_dim(env.observation_space)

    observation = env.reset()
    policy.reset()

    observations = np.zeros((path_length + 1, Do))
    actions = np.zeros((path_length, Da))
    terminals = np.zeros((path_length, ))
    rewards = np.zeros((path_length, ))
    agent_infos = []
    env_infos = []

    t = 0
    for t in range(path_length):

        action, agent_info = policy.get_action(observation)
        next_obs, reward, terminal, env_info = env.step(action)

        agent_infos.append(agent_info)
        env_infos.append(env_info)

        actions[t] = action
        terminals[t] = terminal
        rewards[t] = reward
        observations[t] = observation

        observation = next_obs

        if render:
            env.render()
            time_step = 0.05
            # Guard against the default speedup=None, which would otherwise
            # raise a TypeError here.
            time.sleep(time_step / (speedup or 1))

        if terminal:
            break

    observations[t + 1] = observation

    path = {
        'observations': observations[:t + 1],
        'actions': actions[:t + 1],
        'rewards': rewards[:t + 1],
        'terminals': terminals[:t + 1],
        'next_observations': observations[1:t + 2],
        'agent_infos': agent_infos,
        'env_infos': env_infos
    }

    return path
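A minimal usage sketch for the rollout helper above. It assumes an old Gym-style environment whose step() returns a 4-tuple and a policy object exposing reset() and get_action(); the RandomPolicy class below is a hypothetical stand-in, not part of the source.

import gym


class RandomPolicy:
    """Hypothetical stand-in with the interface rollout() expects."""

    def __init__(self, action_space):
        self.action_space = action_space

    def reset(self):
        pass

    def get_action(self, observation):
        # Return an action and an (empty) agent_info dict.
        return self.action_space.sample(), {}


env = gym.make('Pendulum-v0')
path = rollout(env, RandomPolicy(env.action_space), path_length=100)
print(path['observations'].shape, path['rewards'].shape)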
Example #2
    def __init__(self, env_spec, q_functions):
        Serializable.quick_init(self, locals())

        self.q_functions = q_functions

        self._Da = flat_dim(env_spec.action_space)
        self._Do = flat_dim(env_spec.observation_space)

        self._observations_ph = tf.placeholder(
            tf.float32, shape=[None, self._Do], name='observations')
        self._actions_ph = tf.placeholder(
            tf.float32, shape=[None, self._Da], name='actions')

        self._output = self.output_for(
            self._observations_ph, self._actions_ph, reuse=True)
Example #3
    def __init__(
        self,
        env,
        scale_reward=1.,
        normalize_obs=False,
        normalize_reward=False,
        flatten_obs=True,
        obs_alpha=0.001,
        reward_alpha=0.001,
    ):
        Serializable.quick_init(self, locals())
        super(NormalizedEnv, self).__init__(env)
        self._scale_reward = scale_reward
        self._normalize_obs = normalize_obs
        self._normalize_reward = normalize_reward
        self._flatten_obs = flatten_obs

        self._obs_alpha = obs_alpha
        flat_obs_dim = flat_dim(env.observation_space)
        self._obs_mean = np.zeros(flat_obs_dim)
        self._obs_var = np.ones(flat_obs_dim)

        self._reward_alpha = reward_alpha
        self._reward_mean = 0.
        self._reward_var = 1.
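The _obs_alpha, _obs_mean, and _obs_var attributes above imply an exponential-moving-average estimate of the observation statistics. The methods below are a minimal sketch of how such an update could look; the method names are assumptions, not necessarily the exact garage implementation.

    def _update_obs_estimate(self, flat_obs):
        # Exponential moving average of the observation mean and variance.
        alpha = self._obs_alpha
        self._obs_mean = (1 - alpha) * self._obs_mean + alpha * flat_obs
        self._obs_var = (1 - alpha) * self._obs_var + alpha * np.square(
            flat_obs - self._obs_mean)

    def _apply_normalize_obs(self, flat_obs):
        self._update_obs_estimate(flat_obs)
        return (flat_obs - self._obs_mean) / (np.sqrt(self._obs_var) + 1e-8)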
Example #4
File: terrain.py Project: gntoni/garage
def clear_patch(hfield, box):
    '''Clear a patch shaped like box, assuming the robot is placed in the
    center of hfield.

    @param box: garage.spaces.Box-like
    '''
    if flat_dim(box) > 2:
        raise ValueError("Please provide a 2-dim box")

    # clear patch
    h_center = int(0.5 * hfield.shape[0])
    w_center = int(0.5 * hfield.shape[1])
    fromrow, torow = w_center + int(box.low[0] / STEP), w_center + int(
        box.high[0] / STEP)
    fromcol, tocol = h_center + int(box.low[1] / STEP), h_center + int(
        box.high[1] / STEP)
    hfield[fromrow:torow, fromcol:tocol] = 0.0

    # convolve to smooth the edges somewhat, in case hills were cut off
    K = np.ones((10, 10)) / 100.0
    s = convolve2d(hfield[fromrow - 9:torow + 9, fromcol - 9:tocol + 9],
                   K,
                   mode='same',
                   boundary='symm')
    hfield[fromrow - 9:torow + 9, fromcol - 9:tocol + 9] = s

    return hfield
Example #5
    def __init__(self,
                 env_spec,
                 hidden_sizes=(64, 64),
                 name="ContinuousMLPPolicy",
                 hidden_nonlinearity=tf.nn.relu,
                 output_nonlinearity=tf.nn.tanh,
                 input_include_goal=False,
                 bn=False):
        """
        Initialize class with multiple attributes.

        Args:
            env_spec():
            hidden_sizes(list or tuple, optional):
                A list of numbers of hidden units for all hidden layers.
            name(str, optional):
                A str contains the name of the policy.
            hidden_nonlinearity(optional):
                An activation shared by all fc layers.
            output_nonlinearity(optional):
                An activation used by the output layer.
            bn(bool, optional):
                A bool to indicate whether normalize the layer or not.
        """
        assert isinstance(env_spec.action_space, Box)

        Serializable.quick_init(self, locals())
        super(ContinuousMLPPolicy, self).__init__(env_spec)

        self.name = name
        self._env_spec = env_spec
        if input_include_goal:
            obs_dim = flat_dim(
                env_spec.observation_space.spaces["observation"])
            goal_dim = flat_dim(
                env_spec.observation_space.spaces["desired_goal"])
            self._obs_dim = obs_dim + goal_dim
        else:
            self._obs_dim = env_spec.observation_space.flat_dim
        self._action_dim = env_spec.action_space.flat_dim
        self._action_bound = env_spec.action_space.high
        self._hidden_sizes = hidden_sizes
        self._hidden_nonlinearity = hidden_nonlinearity
        self._output_nonlinearity = output_nonlinearity
        self._batch_norm = bn
        self._policy_network_name = "policy_network"
Example #6
    def __init__(self,
                 env_spec,
                 hidden_layer_sizes=(100, 100),
                 name='q_function'):
        Serializable.quick_init(self, locals())

        self._Da = flat_dim(env_spec.action_space)
        self._Do = flat_dim(env_spec.observation_space)

        self._observations_ph = tf.placeholder(
            tf.float32, shape=[None, self._Do], name='observations')
        self._actions_ph = tf.placeholder(
            tf.float32, shape=[None, self._Da], name='actions')

        super(NNQFunction, self).__init__(
            inputs=(self._observations_ph, self._actions_ph),
            name=name,
            hidden_layer_sizes=hidden_layer_sizes)
Example #7
File: noisy_env.py Project: gntoni/garage
    def __init__(
        self,
        env,
        obs_noise=1e-1,
    ):
        Serializable.quick_init(self, locals())
        super(NoisyObservationEnv, self).__init__(env)
        self.obs_noise = obs_noise
        self._action_flat_dim = flat_dim(self.action_space)
Example #8
    def __init__(self,
                 env_spec,
                 name="ContinuousMLPQFunction",
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.relu,
                 action_merge_layer=-2,
                 output_nonlinearity=None,
                 input_include_goal=False,
                 bn=False):
        """
        Initialize class with multiple attributes.

        Args:
            env_spec():
            name(str, optional): A str contains the name of the policy.
            hidden_sizes(list or tuple, optional):
                A list of numbers of hidden units for all hidden layers.
            hidden_nonlinearity(optional):
                An activation shared by all fc layers.
            action_merge_layer(int, optional):
                An index to indicate when to merge action layer.
            output_nonlinearity(optional):
                An activation used by the output layer.
            bn(bool, optional):
                A bool to indicate whether normalize the layer or not.
        """
        Serializable.quick_init(self, locals())

        self.name = name
        self._env_spec = env_spec
        if input_include_goal:
            obs_dim = flat_dim(
                env_spec.observation_space.spaces["observation"])
            goal_dim = flat_dim(
                env_spec.observation_space.spaces["desired_goal"])
            self._obs_dim = obs_dim + goal_dim
        else:
            self._obs_dim = env_spec.observation_space.flat_dim
        self._action_dim = env_spec.action_space.flat_dim
        self._hidden_sizes = hidden_sizes
        self._hidden_nonlinearity = hidden_nonlinearity
        self._action_merge_layer = action_merge_layer
        self._output_nonlinearity = output_nonlinearity
        self._batch_norm = bn
Example #9
File: noisy_env.py Project: gntoni/garage
    def __init__(
        self,
        env,
        action_delay=3,
    ):
        assert action_delay > 0, "Should not use this env transformer"
        Serializable.quick_init(self, locals())
        super(DelayedActionEnv, self).__init__(env)
        self.action_delay = action_delay
        self._action_flat_dim = flat_dim(self.action_space)
        self._queued_actions = None
Example #10
    def __init__(
        self,
        env,
        obs_noise=1e-1,
    ):
        super().__init__(env)

        self.obs_noise = obs_noise
        self._action_flat_dim = flat_dim(self.action_space)

        # Always call Serializable constructor last
        Serializable.quick_init(self, locals())
Example #11
    def __init__(self,
                 env_spec,
                 hidden_layer_sizes,
                 squash=True,
                 name='policy'):
        Serializable.quick_init(self, locals())

        self._action_dim = flat_dim(env_spec.action_space)
        self._observation_dim = flat_dim(env_spec.observation_space)
        self._layer_sizes = list(hidden_layer_sizes) + [self._action_dim]
        self._squash = squash
        self._name = name

        self._observation_ph = tf.placeholder(
            tf.float32,
            shape=[None, self._observation_dim],
            name='observation')

        self._actions = self.actions_for(self._observation_ph)

        super(StochasticNNPolicy, self).__init__(
            env_spec, self._observation_ph, self._actions, self._name)
Example #12
    def __init__(self, env_spec, max_replay_buffer_size):
        super(SimpleReplayBuffer, self).__init__()
        Serializable.quick_init(self, locals())

        max_replay_buffer_size = int(max_replay_buffer_size)

        self._env_spec = env_spec
        self._observation_dim = flat_dim(env_spec.observation_space)
        self._action_dim = flat_dim(env_spec.action_space)
        self._max_buffer_size = max_replay_buffer_size
        self._observations = np.zeros(
            (max_replay_buffer_size, self._observation_dim))
        # It's a bit memory inefficient to save the observations twice,
        # but it makes the code *much* easier since you no longer have to
        # worry about termination conditions.
        self._next_obs = np.zeros(
            (max_replay_buffer_size, self._observation_dim))
        self._actions = np.zeros((max_replay_buffer_size, self._action_dim))
        self._rewards = np.zeros(max_replay_buffer_size)
        # self._terminals[i] = a terminal was received at time i
        self._terminals = np.zeros(max_replay_buffer_size, dtype='uint8')
        self._top = 0
        self._size = 0
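Given the _top, _size, and _max_buffer_size fields above, insertion naturally works as a ring buffer. Below is a minimal sketch of what an add_sample method could look like; the method name and signature are assumptions, not taken from the source.

    def add_sample(self, observation, action, reward, terminal,
                   next_observation):
        # Overwrite the oldest entry once the buffer is full (ring buffer).
        self._observations[self._top] = observation
        self._actions[self._top] = action
        self._rewards[self._top] = reward
        self._terminals[self._top] = terminal
        self._next_obs[self._top] = next_observation

        self._top = (self._top + 1) % self._max_buffer_size
        if self._size < self._max_buffer_size:
            self._size += 1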
Example #13
    def __init__(self, env_spec, max_replay_buffer_size):
        super(SimpleReplayBuffer, self).__init__()
        Serializable.quick_init(self, locals())

        max_replay_buffer_size = int(max_replay_buffer_size)

        self._env_spec = env_spec
        self._observation_dim = flat_dim(env_spec.observation_space)
        self._action_dim = flat_dim(env_spec.action_space)
        self._max_buffer_size = max_replay_buffer_size
        self._observations = np.zeros((max_replay_buffer_size,
                                       self._observation_dim))
        # It's a bit memory inefficient to save the observations twice,
        # but it makes the code *much* easier since you no longer have to
        # worry about termination conditions.
        self._next_obs = np.zeros((max_replay_buffer_size,
                                   self._observation_dim))
        self._actions = np.zeros((max_replay_buffer_size, self._action_dim))
        self._rewards = np.zeros(max_replay_buffer_size)
        # self._terminals[i] = a terminal was received at time i
        self._terminals = np.zeros(max_replay_buffer_size, dtype='uint8')
        self._top = 0
        self._size = 0
Example #14
    def __init__(self,
                 env_spec,
                 hidden_layer_sizes,
                 squash=True,
                 name='policy'):
        Serializable.quick_init(self, locals())

        self._action_dim = flat_dim(env_spec.action_space)
        self._observation_dim = flat_dim(env_spec.observation_space)
        self._layer_sizes = list(hidden_layer_sizes) + [self._action_dim]
        self._squash = squash
        self._name = name

        self._observation_ph = tf.placeholder(
            tf.float32,
            shape=[None, self._observation_dim],
            name='observation')

        self._actions = self.actions_for(self._observation_ph)

        super(StochasticNNPolicy,
              self).__init__(env_spec, self._observation_ph, self._actions,
                             self._name)
Example #15
def test_unflatten():
    env = normalize(gym.make('Blackjack-v0'),
                    normalize_reward=True,
                    normalize_obs=True,
                    flatten_obs=False)
    for i in range(10):
        env.reset()
        for e in range(100):
            action = env.action_space.sample()
            next_obs, reward, done, info = env.step(action)
            flat_obs = flatten(env.observation_space, next_obs)
            assert flat_obs.shape == (flat_dim(env.observation_space), )
            if done:
                break
    env.close()
Example #16
def test_flatten():
    env = normalize(gym.make('Pendulum-v0'),
                    normalize_reward=True,
                    normalize_obs=True,
                    flatten_obs=True)
    for i in range(10):
        env.reset()
        for e in range(100):
            env.render()
            action = env.action_space.sample()
            next_obs, reward, done, info = env.step(action)
            assert next_obs.shape == (flat_dim(env.observation_space), )
            if done:
                break
    env.close()
Example #17
    def _set_sensor_mask(self, env, sensor_idx):
        obsdim = flat_dim(env.observation_space)
        if len(sensor_idx) > obsdim:
            raise ValueError(
                ("Length of sensor mask ({0}) cannot be greater "
                 "than observation dim ({1})").format(len(sensor_idx), obsdim))
        if len(sensor_idx) == obsdim and not np.any(np.array(sensor_idx) > 1):
            sensor_mask = np.array(sensor_idx, dtype=np.bool)
        elif np.any(np.unique(sensor_idx, return_counts=True)[1] > 1):
            raise ValueError(("Double entries or boolean mask "
                              "with dim ({0}) < observation dim ({1})").format(
                                  len(sensor_idx), obsdim))
        else:
            sensor_mask = np.zeros((obsdim, ), dtype=np.bool)
            sensor_mask[sensor_idx] = 1
        self._sensor_mask = sensor_mask
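The boolean mask built above is presumably applied to flattened observations elsewhere in the wrapper. A minimal sketch of such a filter, with a hypothetical method name:

    def _filter_observation(self, observation):
        # Keep only the sensors selected by the boolean mask.
        flat_obs = np.asarray(observation).flatten()
        return flat_obs[self._sensor_mask]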
Example #18
    def __init__(
        self,
        base_kwargs,
        env,
        pool,
        qf,
        policy,
        plotter=None,
        policy_lr=1E-3,
        qf_lr=1E-3,
        value_n_particles=16,
        td_target_update_interval=1,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=16,
        kernel_update_ratio=0.5,
        discount=0.99,
        reward_scale=1,
        use_saved_qf=False,
        use_saved_policy=False,
        save_full_state=False,
        train_qf=True,
        train_policy=True,
    ):
        """
        Args:
            base_kwargs (dict): Dictionary of base arguments that are directly
                passed to the base `RLAlgorithm` constructor.
            env (`rllab.Env`): rllab environment object.
            pool (`PoolBase`): Replay buffer to add gathered samples to.
            qf (`NNQFunction`): Q-function approximator.
            policy: (`rllab.NNPolicy`): A policy function approximator.
            plotter (`QFPolicyPlotter`): Plotter instance to be used for
                visualizing Q-function during training.
            qf_lr (`float`): Learning rate used for the Q-function approximator.
            value_n_particles (`int`): The number of action samples used for
                estimating the value of next state.
            td_target_update_interval (`int`): How often the target network is
                updated to match the current Q-function.
            kernel_fn (function object): A function object that represents
                a kernel function.
            kernel_n_particles (`int`): Total number of particles per state
                used in SVGD updates.
            kernel_update_ratio ('float'): The ratio of SVGD particles used for
                the computation of the inner/outer empirical expectation.
            discount ('float'): Discount factor.
            reward_scale ('float'): A factor that scales the raw rewards.
                Useful for adjusting the temperature of the optimal Boltzmann
                distribution.
            use_saved_qf ('boolean'): If true, use the initial parameters provided
                in the Q-function instead of reinitializing.
            use_saved_policy ('boolean'): If true, use the initial parameters provided
                in the policy instead of reinitializing.
            save_full_state ('boolean'): If true, saves the full algorithm
                state, including the replay buffer.
        """
        super(SQL, self).__init__(**base_kwargs)

        self.env = env
        self.pool = pool
        self.qf = qf
        self.policy = policy
        self.plotter = plotter

        self._qf_lr = qf_lr
        self._policy_lr = policy_lr
        self._discount = discount
        self._reward_scale = reward_scale

        self._value_n_particles = value_n_particles
        self._qf_target_update_interval = td_target_update_interval

        self._kernel_fn = kernel_fn
        self._kernel_n_particles = kernel_n_particles
        self._kernel_update_ratio = kernel_update_ratio

        self._save_full_state = save_full_state
        self._train_qf = train_qf
        self._train_policy = train_policy

        self._observation_dim = flat_dim(self.env.observation_space)
        self._action_dim = flat_dim(self.env.action_space)

        self._create_placeholders()

        self._training_ops = []
        self._target_ops = []

        self._create_td_update()
        self._create_svgd_update()
        self._create_target_ops()

        if use_saved_qf:
            saved_qf_params = qf.get_param_values()
        if use_saved_policy:
            saved_policy_params = policy.get_param_values()

        self._sess = tf_utils.get_default_session()
        self._sess.run(tf.global_variables_initializer())

        if use_saved_qf:
            self.qf.set_param_values(saved_qf_params)
        if use_saved_policy:
            self.policy.set_param_values(saved_policy_params)
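For reference, the _create_placeholders() call above can be sketched from the dimensions computed in the constructor. This is an assumption about its shape, not necessarily the exact softqlearning implementation:

    def _create_placeholders(self):
        # Placeholders for batches sampled from the replay buffer.
        self._observations_ph = tf.placeholder(
            tf.float32, shape=[None, self._observation_dim],
            name='observations')
        self._next_observations_ph = tf.placeholder(
            tf.float32, shape=[None, self._observation_dim],
            name='next_observations')
        self._actions_ph = tf.placeholder(
            tf.float32, shape=[None, self._action_dim], name='actions')
        self._rewards_ph = tf.placeholder(
            tf.float32, shape=[None], name='rewards')
        self._terminals_ph = tf.placeholder(
            tf.float32, shape=[None], name='terminals')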
Example #19
    def _build_net(self, reuse=None, custom_getter=None, trainable=None):
        """
        Set up q network based on class attributes. This function uses layers
        defined in rllab.tf.

        Args:
            reuse: A bool indicates whether reuse variables in the same scope.
            custom_getter: A customized getter object used to get variables.
            trainable: A bool indicates whether variables are trainable.
        """
        with tf.variable_scope(
                self.name, reuse=reuse, custom_getter=custom_getter):
            l_obs = L.InputLayer(
                shape=(None, flat_dim(self._env_spec.observation_space)),
                name="obs")
            l_action = L.InputLayer(
                shape=(None, flat_dim(self._env_spec.action_space)),
                name="actions")

            n_layers = len(self._hidden_sizes) + 1

            if n_layers > 1:
                action_merge_layer = \
                    (self._action_merge_layer % n_layers + n_layers) % n_layers
            else:
                action_merge_layer = 1

            l_hidden = l_obs

            for idx, size in enumerate(self._hidden_sizes):
                if self._batch_norm:
                    l_hidden = batch_norm(l_hidden)

                if idx == action_merge_layer:
                    l_hidden = L.ConcatLayer([l_hidden, l_action])

                l_hidden = L.DenseLayer(
                    l_hidden,
                    num_units=size,
                    nonlinearity=self._hidden_nonlinearity,
                    trainable=trainable,
                    name="hidden_%d" % (idx + 1))

            if action_merge_layer == n_layers:
                l_hidden = L.ConcatLayer([l_hidden, l_action])

            l_output = L.DenseLayer(
                l_hidden,
                num_units=1,
                nonlinearity=self._output_nonlinearity,
                trainable=trainable,
                name="output")

            output_var = L.get_output(l_output)

        self._f_qval = tensor_utils.compile_function(
            [l_obs.input_var, l_action.input_var], output_var)
        self._output_layer = l_output
        self._obs_layer = l_obs
        self._action_layer = l_action

        LayersPowered.__init__(self, [l_output])
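The compiled function self._f_qval above maps observation and action batches to Q-values; a hypothetical convenience wrapper on the same class could look like:

    def get_qval(self, observations, actions):
        # Evaluate the compiled Q-network on a batch of (s, a) pairs.
        return self._f_qval(observations, actions)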
Example #20
File: ddpg.py Project: gntoni/garage
    def __init__(self,
                 env,
                 actor,
                 critic,
                 n_epochs=500,
                 n_epoch_cycles=20,
                 n_rollout_steps=100,
                 n_train_steps=50,
                 reward_scale=1.,
                 batch_size=64,
                 target_update_tau=0.01,
                 discount=0.99,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 actor_weight_decay=0,
                 critic_weight_decay=0,
                 replay_buffer_size=int(1e6),
                 min_buffer_size=10000,
                 exploration_strategy=None,
                 plot=False,
                 pause_for_plot=False,
                 actor_optimizer=None,
                 critic_optimizer=None,
                 name=None):
        """
        Construct class.

        Args:
            env(): Environment.
            actor(garage.tf.policies.ContinuousMLPPolicy): Policy network.
            critic(garage.tf.q_functions.ContinuousMLPQFunction):
         Q Value network.
            n_epochs(int, optional): Number of epochs.
            n_epoch_cycles(int, optional): Number of epoch cycles.
            n_rollout_steps(int, optional): Number of rollout steps.
            n_train_steps(int, optional): Number of train steps.
            reward_scale(float): The scaling factor applied to the rewards when
         training.
            batch_size(int): Number of samples for each minibatch.
            target_update_tau(float): Interpolation parameter for doing the
         soft target update.
            discount(float): Discount factor for the cumulative return.
            actor_lr(float): Learning rate for training policy network.
            critic_lr(float): Learning rate for training q value network.
            actor_weight_decay(float): L2 weight decay factor for parameters of
         the policy network.
            critic_weight_decay(float): L2 weight decay factor for parameters
         of the q value network.
            replay_buffer_size(int): Size of the replay buffer.
            min_buffer_size(int): Minimum size of the replay buffer to start
         training.
            exploration_strategy(): Exploration strategy.
            plot(bool): Whether to visualize the policy performance after each
         eval_interval.
            pause_for_plot(bool): Whether to pause before continuing when
         plotting.
            actor_optimizer(): Optimizer for training policy network.
            critic_optimizer(): Optimizer for training q function network.
        """
        self.env = env

        self.observation_dim = flat_dim(env.observation_space)
        self.action_dim = flat_dim(env.action_space)
        _, self.action_bound = bounds(env.action_space)

        self.actor = actor
        self.critic = critic
        self.n_epochs = n_epochs
        self.n_epoch_cycles = n_epoch_cycles
        self.n_rollout_steps = n_rollout_steps
        self.n_train_steps = n_train_steps
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.tau = target_update_tau
        self.discount = discount
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.actor_weight_decay = actor_weight_decay
        self.critic_weight_decay = critic_weight_decay
        self.replay_buffer_size = replay_buffer_size
        self.min_buffer_size = min_buffer_size
        self.es = exploration_strategy
        self.plot = plot
        self.pause_for_plot = pause_for_plot
        self.actor_optimizer = actor_optimizer
        self.critic_optimizer = critic_optimizer
        self.name = name
        self._initialize()