Example 1
def make(env_name, frame_stack, action_repeat, seed):
    domain, task = split_env_name(env_name)

    if domain == 'manip':
        env = manipulation.load(f'{task}_vision', seed=seed)
    else:
        env = suite.load(domain,
                         task,
                         task_kwargs={'random': seed},
                         visualize_reward=False)

    # apply action repeat and scaling
    env = ActionRepeatWrapper(env, action_repeat)
    env = action_scale.Wrapper(env, minimum=-1.0, maximum=+1.0)
    # flatten features
    env = FlattenObservationWrapper(env)

    if domain != 'manip':
        # per dreamer: https://github.com/danijar/dreamer/blob/02f0210f5991c7710826ca7881f19c64a012290c/wrappers.py#L26
        camera_id = 2 if domain == 'quadruped' else 0
        render_kwargs = {'height': 84, 'width': 84, 'camera_id': camera_id}
        env = pixels.Wrapper(env,
                             pixels_only=False,
                             render_kwargs=render_kwargs)

    env = FrameStackWrapper(env, frame_stack)

    action_spec = env.action_spec()
    assert np.all(action_spec.minimum >= -1.0)
    assert np.all(action_spec.maximum <= +1.0)

    return env
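A minimal usage sketch for the factory above; it is not part of the original snippet. It assumes split_env_name follows the 'domain-task' naming convention used elsewhere on this page and that the project-specific wrappers (ActionRepeatWrapper, FlattenObservationWrapper, FrameStackWrapper) are importable.

env = make('walker-walk', frame_stack=3, action_repeat=2, seed=0)
time_step = env.reset()
# With pixels_only=False, both the flattened state features and the 84x84
# rendered frames are exposed; the exact observation keys depend on the
# custom wrappers above.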
Example 2
    def __init__(self, args: argparse.Namespace):
        assert args.env_batch_size > 0
        assert args.environment_name in CONTROL_SUITE_ENVS
        assert args.max_episode_length > 0

        from dm_control import suite
        from dm_control.suite.wrappers import pixels

        domain, task = args.environment_name.split('-')

        self._envs = [
            suite.load(domain_name=domain,
                       task_name=task,
                       task_kwargs={'time_limit': np.inf})
            for _ in range(args.env_batch_size)
        ]

        self._envs = [
            pixels.Wrapper(env, render_kwargs={'camera_id': 0})
            for env in self._envs
        ]

        # Time step counter
        self._t = 0
        # Set time step limit
        self._max_t = args.max_episode_length
        # Check whether images or states should be observed
        self._state_obs = args.state_observations

        # Get bit depth for preprocessing the observations
        self._bit_depth = args.bit_depth
        # Get the size of observations
        self._observation_size = ((32, 32) if args.downscale_observations
                                  else (64, 64))
Example 3
    def test_single_array_observation(self, pixels_only):
        pixel_key = 'depth'

        env = FakeArrayObservationEnvironment()
        observation_spec = env.observation_spec()
        self.assertIsInstance(observation_spec, specs.ArraySpec)

        wrapped = pixels.Wrapper(env,
                                 observation_key=pixel_key,
                                 pixels_only=pixels_only)
        wrapped_observation_spec = wrapped.observation_spec()
        self.assertIsInstance(wrapped_observation_spec,
                              collections.OrderedDict)

        if pixels_only:
            self.assertEqual(1, len(wrapped_observation_spec))
            self.assertEqual([pixel_key],
                             list(wrapped_observation_spec.keys()))
        else:
            self.assertEqual(2, len(wrapped_observation_spec))
            self.assertEqual([pixels.STATE_KEY, pixel_key],
                             list(wrapped_observation_spec.keys()))

        time_step = wrapped.reset()

        depth_observation = time_step.observation[pixel_key]
        wrapped_observation_spec[pixel_key].validate(depth_observation)

        self.assertEqual(depth_observation.shape, (4, 5, 3))
        self.assertEqual(depth_observation.dtype, np.uint8)
Example 4
def make_dm_control(env_name, env_config):
    from dm_control import suite
    from dm_control.suite.wrappers import pixels
    from .dm_wrapper import DMControlAdapter, DMControlDummyWrapper
    pixel_input = env_config.pixel_input
    domain_name, task_name = env_name.split('-')
    env = suite.load(domain_name=domain_name, task_name=task_name)
    if pixel_input:
        if os.getenv('DISABLE_MUJOCO_RENDERING'):
            # Rendering was requested on a pod that cannot support it.
            # This happens in GPU-based learners where we only create the
            # environment to inspect its dimensions, so we fall back to a
            # dummy environment instead.
            # TODO: add a dummy wrapper that only contains the correct specs
            env = DMControlDummyWrapper(env) #...
        else:
            env = pixels.Wrapper(env, render_kwargs={'height': 84, 'width': 84, 'camera_id': 0})
    # TODO: what to do with reward visualization
    # Reward visualization should only be done in the eval agent
    # env = suite.load(domain_name=domain_name, task_name=task_name, visualize_reward=record_video)
    env = DMControlAdapter(env, pixel_input)
    env = FilterWrapper(env, env_config)
    env = ObservationConcatenationWrapper(env)
    if pixel_input:
        env = TransposeWrapper(env)
        env = GrayscaleWrapper(env)
        if env_config.frame_stacks > 1:
            env = FrameStackWrapper(env, env_config)
    env_config.action_spec = env.action_spec()
    env_config.obs_spec = env.observation_spec()
    return env, env_config
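A hedged sketch of how a headless learner might use the DISABLE_MUJOCO_RENDERING escape hatch above. Only the presence of the variable is checked, so any non-empty value works; env_config stands in for whatever config object the project passes around.

import os
os.environ['DISABLE_MUJOCO_RENDERING'] = '1'  # any non-empty value; os.getenv() only checks presence
env, env_config = make_dm_control('cheetah-run', env_config)  # env_config: project config object (placeholder)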
Example 5
def main():
    parser = argparse.ArgumentParser(description='Test learned model')
    parser.add_argument('dir',
                        type=str,
                        help='log directory to load learned model')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--domain-name', type=str, default='cheetah')
    parser.add_argument('--task-name', type=str, default='run')
    parser.add_argument('-R', '--action-repeat', type=int, default=2)
    parser.add_argument('--episodes', type=int, default=1)
    args = parser.parse_args()

    # define environment and apply wrapper
    env = suite.load(args.domain_name, args.task_name)
    env = pixels.Wrapper(env,
                         render_kwargs={
                             'height': 64,
                             'width': 64,
                             'camera_id': 0
                         })
    env = GymWrapper(env)
    env = RepeatAction(env, skip=args.action_repeat)

    # define models
    with open(os.path.join(args.dir, 'args.json'), 'r') as f:
        train_args = json.load(f)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder = Encoder().to(device)
    rssm = RecurrentStateSpaceModel(train_args['state_dim'],
                                    env.action_space.shape[0],
                                    train_args['rnn_hidden_dim']).to(device)
    action_model = ActionModel(train_args['state_dim'],
                               train_args['rnn_hidden_dim'],
                               env.action_space.shape[0]).to(device)

    # load learned parameters
    encoder.load_state_dict(torch.load(os.path.join(args.dir, 'encoder.pth')))
    rssm.load_state_dict(torch.load(os.path.join(args.dir, 'rssm.pth')))
    action_model.load_state_dict(
        torch.load(os.path.join(args.dir, 'action_model.pth')))

    # define agent
    policy = Agent(encoder, rssm, action_model)

    # test learned model in the environment
    for episode in range(args.episodes):
        policy.reset()
        obs = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = policy(obs)
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            if args.render:
                env.render(height=256, width=256, camera_id=0)

        print('Total test reward at episode [%4d/%4d] is %f' %
              (episode + 1, args.episodes, total_reward))
Example 6
 def __init__(self, env_name, seed, max_episode_length, bit_depth):
     domain, task = env_name.split('-')
     self._env = suite.load(domain_name=domain,
                            task_name=task,
                            task_kwargs={'random': seed})
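     # pixels.Wrapper is used with its defaults here (pixels_only=True,
     # observation_key='pixels'), so the state observations are replaced by
     # rendered frames under the 'pixels' key.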
     self._env = pixels.Wrapper(self._env)
     self.max_episode_length = max_episode_length
     self.action_repeat = CONTROL_SUITE_ACTION_REPEATS[domain]
     self.bit_depth = bit_depth
Example 7
 def test_envs_same(self):
   # Test that the camera augmentations with magnitude 0 give the same results
   # as when no camera augmentations are used.
   render_kwargs = {'width': 84, 'height': 84, 'camera_id': 0}
   domain_and_task = [('cartpole', 'swingup'),
                      ('reacher', 'easy'),
                      ('finger', 'spin'),
                      ('cheetah', 'run'),
                      ('ball_in_cup', 'catch'),
                      ('walker', 'walk')]
   for (domain, task) in domain_and_task:
     seed = 42
     envs = [('baseline',
              pixels.Wrapper(
                  dm_control_suite.load(
                      domain, task, task_kwargs={'random': seed}),
                  render_kwargs=render_kwargs)),
             ('no-wrapper',
              pixels.Wrapper(
                  dm_control_suite.load(
                      domain, task, task_kwargs={'random': seed}),
                  render_kwargs=render_kwargs)),
             ('w/-camera_kwargs',
              pixels.Wrapper(
                  distraction_wrap(
                      dm_control_suite.load(
                          domain, task, task_kwargs={'random': seed}), domain),
                  render_kwargs=render_kwargs))]
     frames = []
     for _, env in envs:
       random_state = np.random.RandomState(42)
       action_spec = env.action_spec()
       time_step = env.reset()
       frames.append([])
       while not time_step.last() and len(frames[-1]) < 20:
         action = random_state.uniform(
             action_spec.minimum, action_spec.maximum, size=action_spec.shape)
         time_step = env.step(action)
         frame = time_step.observation['pixels'][:, :, 0:3]
         frames[-1].append(frame)
     frames_np = np.array(frames)
     for i in range(1, len(envs)):
       difference = np.mean(abs(frames_np[0] - frames_np[i]))
       self.assertEqual(difference, 0.)
Example 8
 def __init__(self, env):
     self._env = pixels.Wrapper(env, pixels_only=True)
     action_spec = self._env.action_spec()
     time_step = self._env.reset()
     observation_dm = time_step.observation["pixels"]
     screen_height_dm = observation_dm.shape[0]
     screen_width_dm = observation_dm.shape[1]
     screen_depth_dm = observation_dm.shape[2]
     self.observation_space = Box(low=0,
                                  high=255,
                                  shape=(screen_height_dm, screen_width_dm,
                                         screen_depth_dm),
                                  dtype=np.uint8)
     self.action_space = Box(action_spec.minimum,
                             action_spec.maximum,
                             dtype=np.float32)
     self.random_action = np.random.uniform(action_spec.minimum,
                                            action_spec.maximum,
                                            size=action_spec.shape)
Example 9
def load_pixels(
    domain_name: Text,
    task_name: Text,
    observation_key: Text = 'pixels',
    pixels_only: bool = True,
    task_kwargs=None,
    environment_kwargs=None,
    visualize_reward: bool = False,
    render_kwargs=None,
    env_wrappers: Sequence[types.PyEnvWrapper] = ()
) -> py_environment.PyEnvironment:
    """Returns an environment from a domain name, task name and optional settings.

  Args:
    domain_name: A string containing the name of a domain.
    task_name: A string containing the name of a task.
    observation_key: Optional custom string specifying the pixel observation's
      key in the `OrderedDict` of observations. Defaults to 'pixels'.
    pixels_only: If True (default), the original set of 'state' observations
      returned by the wrapped environment will be discarded, and the
      `OrderedDict` of observations will only contain pixels. If False, the
      `OrderedDict` will contain the original observations as well as the pixel
      observations.
    task_kwargs: Optional `dict` of keyword arguments for the task.
    environment_kwargs: Optional `dict` specifying keyword arguments for the
      environment.
    visualize_reward: Optional `bool`. If `True`, object colours in rendered
      frames are set to indicate the reward at each step. Default `False`.
    render_kwargs: Optional `dict` of keyword arguments for rendering.
    env_wrappers: Iterable with references to wrapper classes to use on the
      wrapped environment.

  Returns:
    The requested environment.

  Raises:
    ImportError: if dm_control module was not available.
  """
    dm_env = _load_env(domain_name,
                       task_name,
                       task_kwargs=task_kwargs,
                       environment_kwargs=environment_kwargs,
                       visualize_reward=visualize_reward)

    dm_env = pixels.Wrapper(dm_env,
                            pixels_only=pixels_only,
                            render_kwargs=render_kwargs,
                            observation_key=observation_key)
    env = dm_control_wrapper.DmControlWrapper(dm_env, render_kwargs)

    for wrapper in env_wrappers:
        env = wrapper(env)

    return env
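A hedged usage sketch for load_pixels above; the call site and render settings (84x84, camera 0) are assumptions, and the exact observation layout follows whatever the TF-Agents DmControlWrapper exposes.

env = load_pixels('cartpole', 'swingup',
                  pixels_only=True,
                  render_kwargs={'height': 84, 'width': 84, 'camera_id': 0})
time_step = env.reset()  # observations are keyed by observation_key ('pixels' by default)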
Example 10
 def __init__(self, env, symbolic, seed, max_episode_length, action_repeat, bit_depth):
   from dm_control import suite
   from dm_control.suite.wrappers import pixels
   domain, task = env.split('-')
   self.symbolic = symbolic
   self._env = suite.load(domain_name=domain, task_name=task, task_kwargs={'random': seed})
   if not symbolic:
     self._env = pixels.Wrapper(self._env)
   self.max_episode_length = max_episode_length
   self.action_repeat = action_repeat
   if action_repeat != CONTROL_SUITE_ACTION_REPEATS[domain]:
     print('Using action repeat %d; recommended action repeat for domain is %d' % (action_repeat, CONTROL_SUITE_ACTION_REPEATS[domain]))
   self.bit_depth = bit_depth
Example 11
 def __init__(self,
              environment: control.Environment,
              *,
              height: int = 84,
              width: int = 84,
              camera_id: int = 0):
     render_kwargs = {
         'height': height,
         'width': width,
         'camera_id': camera_id
     }
     pixel_environment = pixels.Wrapper(environment,
                                        pixels_only=True,
                                        render_kwargs=render_kwargs)
     super().__init__(pixel_environment)
Example 12
 def test_dynamic(self):
   camera_kwargs = get_camera_params(
       domain_name='cartpole', scale=0.1, dynamic=True)
   env = cartpole.swingup()
   env = camera.DistractingCameraEnv(env, camera_id=0, **camera_kwargs)
   env = pixels.Wrapper(env, render_kwargs={'camera_id': 0})
   action_spec = env.action_spec()
   time_step = env.reset()
   frames = []
   while not time_step.last() and len(frames) < 10:
     action = np.random.uniform(
         action_spec.minimum, action_spec.maximum, size=action_spec.shape)
     time_step = env.step(action)
     frames.append(time_step.observation['pixels'])
   self.assertEqual(frames[0].shape, (240, 320, 3))
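The (240, 320, 3) assertion above relies on dm_control's default render resolution when only camera_id is passed. A small sketch of requesting a different size instead (the 84x84 choice is arbitrary):

env_small = pixels.Wrapper(cartpole.swingup(),
                           render_kwargs={'height': 84, 'width': 84, 'camera_id': 0})
assert env_small.reset().observation['pixels'].shape == (84, 84, 3)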
Example 13
 def __init__(self,
              env,
              symbolic,
              seed,
              max_episode_length,
              action_repeat,
              bit_depth,
              action_noise_scale=None,
              render_size=64,
              use_rgbgr=False):
     from dm_control import suite
     from dm_control.suite.wrappers import pixels
     from dm_control.suite.wrappers import action_noise
     domain, task = env.split('-')
     self.symbolic = symbolic
     self.render_size = render_size if render_size else 64
     self.use_rgbgr = use_rgbgr
     if self.use_rgbgr:
         self.obs_tuple = (4, render_size, render_size)
     else:
         self.obs_tuple = (3, render_size, render_size)
     self.action_noise_scale = action_noise_scale
     self._env = suite.load(domain_name=domain,
                            task_name=task,
                            task_kwargs={'random': seed})
     if not symbolic:
         self._env = pixels.Wrapper(self._env)
     if self.action_noise_scale is not None:
         self._env = action_noise.Wrapper(self._env,
                                          scale=self.action_noise_scale)
     self.max_episode_length = max_episode_length
     if action_repeat < 0:
         try:
             action_repeat = CONTROL_SUITE_ACTION_REPEATS[domain]
         except KeyError:
             action_repeat = 2
     self.action_repeat = action_repeat
     try:
         if action_repeat != CONTROL_SUITE_ACTION_REPEATS[domain]:
             print(
                 'Using action repeat %d; recommended action repeat for domain is %d'
                 % (action_repeat, CONTROL_SUITE_ACTION_REPEATS[domain]))
     except KeyError:
         pass
     self.bit_depth = bit_depth
Example 14
def load(
        domain_name,
        task_name,
        task_kwargs=None,
        environment_kwargs=None,
        env_load_fn=suite.load,  # use custom_suite.load for customized env
        action_repeat_wrapper=wrappers.ActionRepeat,
        action_repeat=1,
        frame_stack=4,
        episode_length=1000,
        actions_in_obs=True,
        rewards_in_obs=False,
        pixels_obs=True,
        # Render params
        grayscale=False,
        visualize_reward=False,
        render_kwargs=None):
    """Returns an environment from a domain name, task name."""
    env = env_load_fn(domain_name,
                      task_name,
                      task_kwargs=task_kwargs,
                      environment_kwargs=environment_kwargs,
                      visualize_reward=visualize_reward)
    if pixels_obs:
        env = pixel_wrapper.Wrapper(env,
                                    pixels_only=False,
                                    render_kwargs=render_kwargs)

    env = dm_control_wrapper.DmControlWrapper(env, render_kwargs)

    if pixels_obs and grayscale:
        env = GrayscaleWrapper(env)
    if action_repeat > 1:
        env = action_repeat_wrapper(env, action_repeat)
    if pixels_obs:
        env = FrameStack(env, frame_stack, actions_in_obs, rewards_in_obs)
    else:
        env = FlattenState(env)

    # Adjust episode length based on action_repeat
    max_episode_steps = (episode_length + action_repeat - 1) // action_repeat

    # Apply a time limit wrapper last so episode termination and reset() are handled properly
    env = wrappers.TimeLimit(env, max_episode_steps)
    return env
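A hedged example call for the loader above; the argument values are illustrative and the render size is an assumption.

env = load('cheetah', 'run',
           action_repeat=4,
           frame_stack=3,
           render_kwargs={'height': 84, 'width': 84, 'camera_id': 0})
# With episode_length=1000 and action_repeat=4, the TimeLimit wrapper allows
# (1000 + 4 - 1) // 4 = 250 environment steps per episode.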
Example 15
    def test_dict_observation(self, pixels_only):
        pixel_key = 'rgb'

        env = cartpole.swingup()

        # Make sure we are testing the right environment for the test.
        observation_spec = env.observation_spec()
        self.assertIsInstance(observation_spec, collections.OrderedDict)

        width = 320
        height = 240

        # The wrapper should only add one observation.
        wrapped = pixels.Wrapper(env,
                                 observation_key=pixel_key,
                                 pixels_only=pixels_only,
                                 render_kwargs={
                                     'width': width,
                                     'height': height
                                 })

        wrapped_observation_spec = wrapped.observation_spec()
        self.assertIsInstance(wrapped_observation_spec,
                              collections.OrderedDict)

        if pixels_only:
            self.assertEqual(1, len(wrapped_observation_spec))
            self.assertEqual([pixel_key],
                             list(wrapped_observation_spec.keys()))
        else:
            self.assertEqual(
                len(observation_spec) + 1, len(wrapped_observation_spec))
            expected_keys = list(observation_spec.keys()) + [pixel_key]
            self.assertEqual(expected_keys,
                             list(wrapped_observation_spec.keys()))

        # Check that the added spec item is consistent with the added observation.
        time_step = wrapped.reset()
        rgb_observation = time_step.observation[pixel_key]
        wrapped_observation_spec[pixel_key].validate(rgb_observation)

        self.assertEqual(rgb_observation.shape, (height, width, 3))
        self.assertEqual(rgb_observation.dtype, np.uint8)
Example 16
    def __init__(self, env_id, seed, max_episode_length=1000):
        super(ControlSuiteEnv, self).__init__()
        domain, task = env_id.split("-")
        from dm_control import suite
        from dm_control.suite.wrappers import pixels
        self._env = suite.load(domain_name=domain, task_name=task, task_kwargs={"random": seed})
        self._env = pixels.Wrapper(self._env)
        self._env.action_space = self.action_size
        self._env.observation_space = self.observation_size
        self._env.reward_range = (-float("inf"), float("inf"))
        self._env.metadata = self.metadata
        self._env.spec = None
        self._env = UnSuite(self._env)

        self.action_repeat = CONTROL_SUITE_ACTION_REPEATS.get(domain, 1)
        self.max_episode_length = max_episode_length * self.action_repeat
        if self.action_repeat != CONTROL_SUITE_ACTION_REPEATS.get(domain, self.action_repeat):
            print("Using action repeat %d; recommended action repeat for domain is %d" % (
                self.action_repeat, CONTROL_SUITE_ACTION_REPEATS[domain]))
        self.t = 0
Example 17
    def __init__(
        self,
        domain,
        task,
        frame_skip=1,
        normalize=False,
        pixel_wrapper_kwargs=None,
        task_kwargs={},
        environment_kwargs={},
        max_path_length=1200,
    ):
        save__init__args(locals(), underscore=True)

        env = suite.load(domain_name=domain,
                         task_name=task,
                         task_kwargs=task_kwargs,
                         environment_kwargs=environment_kwargs)
        if normalize:
            np.testing.assert_equal(env.action_spec().minimum, -1)
            np.testing.assert_equal(env.action_spec().maximum, 1)
        if pixel_wrapper_kwargs is not None:
            env = pixels.Wrapper(env, **pixel_wrapper_kwargs)
        self._env = env

        self._observation_keys = tuple(env.observation_spec().keys())
        observation_space = convert_dm_control_to_rlpyt_space(
            env.observation_spec())
        self._observation_space = observation_space

        action_space = convert_dm_control_to_rlpyt_space(env.action_spec())
        if len(action_space.shape) > 1:
            raise NotImplementedError(
                "Shape of the action space ({}) is not flat, make sure to"
                " check the implemenation.".format(action_space))
        self._action_space = action_space

        self._step_count = 0
Example 18
import torch
import numpy as np

# import os
# os.environ["MUJOCO_GL"] = 'osmesa'

from dm_control import suite
from dm_control.suite.wrappers import pixels

import utils

env = suite.load(domain_name="humanoid", task_name="stand")
# import ipdb; ipdb.set_trace()
env = pixels.Wrapper(env)
spec = env.action_spec()
time_step = env.reset()
total_reward = 0.0
frames = [time_step.observation['pixels']]
for t in range(1000):
    print(t)
    action = np.random.uniform(spec.minimum, spec.maximum, spec.shape)
    time_step = env.step(action)
    frames.append(time_step.observation['pixels'].copy())
    total_reward += time_step.reward

print("Total number of frames: {}".format(len(frames)))

# utils.save_gif('humanoid.mp4',
#                [torch.tensor(frame.copy()).float()/255 for frame in frames],
#                color_last=True)
Example 19
    def __init__(self,
                 domain,
                 task,
                 *args,
                 env=None,
                 normalize=True,
                 observation_keys=(),
                 goal_keys=(),
                 unwrap_time_limit=True,
                 pixel_wrapper_kwargs=None,
                 **kwargs):
        assert not args, (
            "Gym environments don't support args. Use kwargs instead.")

        self.normalize = normalize
        self.unwrap_time_limit = unwrap_time_limit

        super(DmControlAdapter, self).__init__(
            domain, task, *args, goal_keys=goal_keys, **kwargs)

        if env is None:
            assert (domain is not None and task is not None), (domain, task)
            env = suite.load(
                domain_name=domain,
                task_name=task,
                task_kwargs=kwargs
                # TODO(hartikainen): Figure out how to pass kwargs to this guy.
                # Need to split into `task_kwargs`, `environment_kwargs`, and
                # `visualize_reward` bool. Check the suite.load(.) in:
                # https://github.com/deepmind/dm_control/blob/master/dm_control/suite/__init__.py
            )
            self._env_kwargs = kwargs
        else:
            assert not kwargs
            assert domain is None and task is None, (domain, task)

        # Ensure action space is already normalized.
        if normalize:
            np.testing.assert_equal(env.action_spec().minimum, -1)
            np.testing.assert_equal(env.action_spec().maximum, 1)

        if pixel_wrapper_kwargs is not None:
            env = pixels.Wrapper(env, **pixel_wrapper_kwargs)

        self._env = env

        assert isinstance(env.observation_spec(), OrderedDict)
        self.observation_keys = (
            observation_keys or tuple(env.observation_spec().keys()))

        observation_space = convert_dm_control_to_gym_space(
            env.observation_spec())

        self._observation_space = type(observation_space)([
            (name, copy.deepcopy(space))
            for name, space in observation_space.spaces.items()
            if name in self.observation_keys
        ])

        action_space = convert_dm_control_to_gym_space(self._env.action_spec())

        if len(action_space.shape) > 1:
            raise NotImplementedError(
                "Shape of the action space ({}) is not flat, make sure to"
                " check the implemenation.".format(action_space))

        self._action_space = action_space
Example 20
def main():
    parser = argparse.ArgumentParser(description='Dreamer for DM control')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--log-dir', type=str, default='log')
    parser.add_argument('--test-interval', type=int, default=10)
    parser.add_argument('--domain-name', type=str, default='cheetah')
    parser.add_argument('--task-name', type=str, default='run')
    parser.add_argument('-R', '--action-repeat', type=int, default=2)
    parser.add_argument('--state-dim', type=int, default=30)
    parser.add_argument('--rnn-hidden-dim', type=int, default=200)
    parser.add_argument('--buffer-capacity', type=int, default=1000000)
    parser.add_argument('--all-episodes', type=int, default=1000)
    parser.add_argument('-S', '--seed-episodes', type=int, default=5)
    parser.add_argument('-C', '--collect-interval', type=int, default=100)
    parser.add_argument('-B', '--batch-size', type=int, default=50)
    parser.add_argument('-L', '--chunk-length', type=int, default=50)
    parser.add_argument('-H', '--imagination-horizon', type=int, default=15)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--lambda_', type=float, default=0.95)
    parser.add_argument('--model_lr', type=float, default=6e-4)
    parser.add_argument('--value_lr', type=float, default=8e-5)
    parser.add_argument('--action_lr', type=float, default=8e-5)
    parser.add_argument('--eps', type=float, default=1e-4)
    parser.add_argument('--clip-grad-norm', type=int, default=100)
    parser.add_argument('--free-nats', type=int, default=3)
    parser.add_argument('--action-noise-var', type=float, default=0.3)
    args = parser.parse_args()

    # Prepare logging
    log_dir = os.path.join(args.log_dir, args.domain_name + '_' + args.task_name)
    log_dir = os.path.join(log_dir, datetime.now().strftime('%Y%m%d_%H%M'))
    os.makedirs(log_dir)
    with open(os.path.join(log_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))
    writer = SummaryWriter(log_dir=log_dir)

    # set seed (NOTE: some randomness still remains, e.g. cuDNN nondeterminism)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # define env and apply wrappers
    env = suite.load(args.domain_name, args.task_name, task_kwargs={'random': args.seed})
    env = pixels.Wrapper(env, render_kwargs={'height': 64,
                                             'width': 64,
                                             'camera_id': 0})
    env = GymWrapper(env)
    env = RepeatAction(env, skip=args.action_repeat)

    # define replay buffer
    replay_buffer = ReplayBuffer(capacity=args.buffer_capacity,
                                 observation_shape=env.observation_space.shape,
                                 action_dim=env.action_space.shape[0])

    # define models and optimizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder = Encoder().to(device)
    rssm = RecurrentStateSpaceModel(args.state_dim,
                                    env.action_space.shape[0],
                                    args.rnn_hidden_dim).to(device)
    obs_model = ObservationModel(args.state_dim, args.rnn_hidden_dim).to(device)
    reward_model = RewardModel(args.state_dim, args.rnn_hidden_dim).to(device)
    model_params = (list(encoder.parameters()) +
                    list(rssm.parameters()) +
                    list(obs_model.parameters()) +
                    list(reward_model.parameters()))
    model_optimizer = Adam(model_params, lr=args.model_lr, eps=args.eps)

    # define value model and action model and optimizer
    value_model = ValueModel(args.state_dim, args.rnn_hidden_dim).to(device)
    action_model = ActionModel(args.state_dim, args.rnn_hidden_dim,
                               env.action_space.shape[0]).to(device)
    value_optimizer = Adam(value_model.parameters(), lr=args.value_lr, eps=args.eps)
    action_optimizer = Adam(action_model.parameters(), lr=args.action_lr, eps=args.eps)

    # collect seed episodes with random action
    for episode in range(args.seed_episodes):
        obs = env.reset()
        done = False
        while not done:
            action = env.action_space.sample()
            next_obs, reward, done, _ = env.step(action)
            replay_buffer.push(obs, action, reward, done)
            obs = next_obs

    # main training loop
    for episode in range(args.seed_episodes, args.all_episodes):
        # -----------------------------
        #      collect experiences
        # -----------------------------
        start = time.time()
        policy = Agent(encoder, rssm, action_model)

        obs = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = policy(obs)
            action += np.random.normal(0, np.sqrt(args.action_noise_var),
                                       env.action_space.shape[0])
            next_obs, reward, done, _ = env.step(action)
            replay_buffer.push(obs, action, reward, done)
            obs = next_obs
            total_reward += reward

        writer.add_scalar('total reward at train', total_reward, episode)
        print('episode [%4d/%4d] is collected. Total reward is %f' %
              (episode+1, args.all_episodes, total_reward))
        print('elapsed time for interaction: %.2fs' % (time.time() - start))

        # update parameters of model, value model, action model
        start = time.time()
        for update_step in range(args.collect_interval):
            # ---------------------------------------------------------------
            #      update model (encoder, rssm, obs_model, reward_model)
            # ---------------------------------------------------------------
            observations, actions, rewards, _ = \
                replay_buffer.sample(args.batch_size, args.chunk_length)

            # preprocess observations and transpose tensor for RNN training
            observations = preprocess_obs(observations)
            observations = torch.as_tensor(observations, device=device)
            observations = observations.transpose(3, 4).transpose(2, 3)
            observations = observations.transpose(0, 1)
            actions = torch.as_tensor(actions, device=device).transpose(0, 1)
            rewards = torch.as_tensor(rewards, device=device).transpose(0, 1)

            # embed observations with CNN
            embedded_observations = encoder(
                observations.reshape(-1, 3, 64, 64)).view(args.chunk_length, args.batch_size, -1)

            # prepare tensors to hold the state sequence and RNN hidden-state sequence
            states = torch.zeros(
                args.chunk_length, args.batch_size, args.state_dim, device=device)
            rnn_hiddens = torch.zeros(
                args.chunk_length, args.batch_size, args.rnn_hidden_dim, device=device)

            # initialize state and rnn hidden state with 0 vector
            state = torch.zeros(args.batch_size, args.state_dim, device=device)
            rnn_hidden = torch.zeros(args.batch_size, args.rnn_hidden_dim, device=device)

            # compute state and rnn hidden sequences and kl loss
            kl_loss = 0
            for l in range(args.chunk_length-1):
                next_state_prior, next_state_posterior, rnn_hidden = \
                    rssm(state, actions[l], rnn_hidden, embedded_observations[l+1])
                state = next_state_posterior.rsample()
                states[l+1] = state
                rnn_hiddens[l+1] = rnn_hidden
                kl = kl_divergence(next_state_prior, next_state_posterior).sum(dim=1)
                kl_loss += kl.clamp(min=args.free_nats).mean()
            kl_loss /= (args.chunk_length - 1)

            # states[0] and rnn_hiddens[0] are always 0 and have no information
            states = states[1:]
            rnn_hiddens = rnn_hiddens[1:]

            # compute reconstructed observations and predicted rewards
            flatten_states = states.view(-1, args.state_dim)
            flatten_rnn_hiddens = rnn_hiddens.view(-1, args.rnn_hidden_dim)
            recon_observations = obs_model(flatten_states, flatten_rnn_hiddens).view(
                args.chunk_length-1, args.batch_size, 3, 64, 64)
            predicted_rewards = reward_model(flatten_states, flatten_rnn_hiddens).view(
                args.chunk_length-1, args.batch_size, 1)

            # compute loss for observation and reward
            obs_loss = 0.5 * mse_loss(
                recon_observations, observations[1:], reduction='none').mean([0, 1]).sum()
            reward_loss = 0.5 * mse_loss(predicted_rewards, rewards[:-1])

            # add all losses and update model parameters with gradient descent
            model_loss = kl_loss + obs_loss + reward_loss
            model_optimizer.zero_grad()
            model_loss.backward()
            clip_grad_norm_(model_params, args.clip_grad_norm)
            model_optimizer.step()

            # ----------------------------------------------
            #      update value_model and action_model
            # ----------------------------------------------
            # detach gradient because Dreamer doesn't update model with actor-critic loss
            flatten_states = flatten_states.detach()
            flatten_rnn_hiddens = flatten_rnn_hiddens.detach()

            # prepare tensors to hold the imagined trajectory's states and rnn_hiddens
            imaginated_states = torch.zeros(args.imagination_horizon + 1,
                                            *flatten_states.shape,
                                            device=flatten_states.device)
            imaginated_rnn_hiddens = torch.zeros(args.imagination_horizon + 1,
                                                 *flatten_rnn_hiddens.shape,
                                                 device=flatten_rnn_hiddens.device)
            imaginated_states[0] = flatten_states
            imaginated_rnn_hiddens[0] = flatten_rnn_hiddens

            # compute the imagined trajectory using actions from action_model
            for h in range(1, args.imagination_horizon + 1):
                actions = action_model(flatten_states, flatten_rnn_hiddens)
                flatten_states_prior, flatten_rnn_hiddens = rssm.prior(flatten_states,
                                                                       actions,
                                                                       flatten_rnn_hiddens)
                flatten_states = flatten_states_prior.rsample()
                imaginated_states[h] = flatten_states
                imaginated_rnn_hiddens[h] = flatten_rnn_hiddens

            # compute rewards and values for the imagined states and rnn_hiddens
            flatten_imaginated_states = imaginated_states.view(-1, args.state_dim)
            flatten_imaginated_rnn_hiddens = imaginated_rnn_hiddens.view(-1, args.rnn_hidden_dim)
            imaginated_rewards = \
                reward_model(flatten_imaginated_states,
                             flatten_imaginated_rnn_hiddens).view(args.imagination_horizon + 1, -1)
            imaginated_values = \
                value_model(flatten_imaginated_states,
                            flatten_imaginated_rnn_hiddens).view(args.imagination_horizon + 1, -1)
            # compute lambda target
            lambda_target_values = lambda_target(imaginated_rewards, imaginated_values,
                                                 args.gamma, args.lambda_)
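            # lambda_target is this project's helper for Dreamer's lambda-return:
            # roughly V_lambda = (1 - lambda) * sum_n lambda^(n-1) * V^(n), an
            # exponentially weighted mixture of n-step returns, plus a
            # lambda^(H-1)-weighted bootstrap from value_model at the horizon.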

            # update value model
            value_loss = 0.5 * mse_loss(imaginated_values, lambda_target_values.detach())
            value_optimizer.zero_grad()
            value_loss.backward(retain_graph=True)
            clip_grad_norm_(value_model.parameters(), args.clip_grad_norm)
            value_optimizer.step()

            # update action model (multiply -1 for gradient ascent)
            action_loss = -1 * (lambda_target_values.mean())
            action_optimizer.zero_grad()
            action_loss.backward()
            clip_grad_norm_(action_model.parameters(), args.clip_grad_norm)
            action_optimizer.step()

            # print losses and add to tensorboard
            print('update_step: %3d model loss: %.5f, kl_loss: %.5f, '
                  'obs_loss: %.5f, reward_loss: %.5f, '
                  'value_loss: %.5f action_loss: %.5f'
                  % (update_step + 1, model_loss.item(), kl_loss.item(),
                     obs_loss.item(), reward_loss.item(),
                     value_loss.item(), action_loss.item()))
            total_update_step = episode * args.collect_interval + update_step
            writer.add_scalar('model loss', model_loss.item(), total_update_step)
            writer.add_scalar('kl loss', kl_loss.item(), total_update_step)
            writer.add_scalar('obs loss', obs_loss.item(), total_update_step)
            writer.add_scalar('reward loss', reward_loss.item(), total_update_step)
            writer.add_scalar('value loss', value_loss.item(), total_update_step)
            writer.add_scalar('action loss', action_loss.item(), total_update_step)

        print('elapsed time for update: %.2fs' % (time.time() - start))

        # ----------------------------------------------
        #      evaluation without exploration noise
        # ----------------------------------------------
        if (episode + 1) % args.test_interval == 0:
            policy = Agent(encoder, rssm, action_model)
            start = time.time()
            obs = env.reset()
            done = False
            total_reward = 0
            while not done:
                action = policy(obs, training=False)
                obs, reward, done, _ = env.step(action)
                total_reward += reward

            writer.add_scalar('total reward at test', total_reward, episode)
            print('Total test reward at episode [%4d/%4d] is %f' %
                  (episode+1, args.all_episodes, total_reward))
            print('elapsed time for test: %.2fs' % (time.time() - start))

    # save learned model parameters
    torch.save(encoder.state_dict(), os.path.join(log_dir, 'encoder.pth'))
    torch.save(rssm.state_dict(), os.path.join(log_dir, 'rssm.pth'))
    torch.save(obs_model.state_dict(), os.path.join(log_dir, 'obs_model.pth'))
    torch.save(reward_model.state_dict(), os.path.join(log_dir, 'reward_model.pth'))
    torch.save(value_model.state_dict(), os.path.join(log_dir, 'value_model.pth'))
    torch.save(action_model.state_dict(), os.path.join(log_dir, 'action_model.pth'))
    writer.close()
Example 21
def main():
    parser = argparse.ArgumentParser(description='PlaNet for DM control')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--log-dir', type=str, default='log')
    parser.add_argument('--test-interval', type=int, default=10)
    parser.add_argument('--domain-name', type=str, default='cheetah')
    parser.add_argument('--task-name', type=str, default='run')
    parser.add_argument('-R', '--action-repeat', type=int, default=4)
    parser.add_argument('--state-dim', type=int, default=30)
    parser.add_argument('--rnn-hidden-dim', type=int, default=200)
    parser.add_argument('--buffer-capacity', type=int, default=1000000)
    parser.add_argument('--all-episodes', type=int, default=1000)
    parser.add_argument('-S', '--seed-episodes', type=int, default=5)
    parser.add_argument('-C', '--collect-interval', type=int, default=100)
    parser.add_argument('-B', '--batch-size', type=int, default=50)
    parser.add_argument('-L', '--chunk-length', type=int, default=50)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--eps', type=float, default=1e-4)
    parser.add_argument('--clip-grad-norm', type=int, default=1000)
    parser.add_argument('--free-nats', type=int, default=3)
    parser.add_argument('-H', '--horizon', type=int, default=12)
    parser.add_argument('-I', '--N-iterations', type=int, default=10)
    parser.add_argument('-J', '--N-candidates', type=int, default=1000)
    parser.add_argument('-K', '--N-top-candidates', type=int, default=100)
    parser.add_argument('--action-noise-var', type=float, default=0.3)
    args = parser.parse_args()

    # Prepare logging
    log_dir = os.path.join(args.log_dir,
                           args.domain_name + '_' + args.task_name)
    log_dir = os.path.join(log_dir, datetime.now().strftime('%Y%m%d_%H%M'))
    os.makedirs(log_dir)
    with open(os.path.join(log_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))
    writer = SummaryWriter(log_dir=log_dir)

    # set seed (NOTE: some randomness still remains, e.g. cuDNN nondeterminism)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # define env and apply wrappers
    env = suite.load(args.domain_name,
                     args.task_name,
                     task_kwargs={'random': args.seed})
    env = pixels.Wrapper(env,
                         render_kwargs={
                             'height': 64,
                             'width': 64,
                             'camera_id': 0
                         })
    env = GymWrapper(env)
    env = RepeatAction(env, skip=args.action_repeat)

    # define replay buffer
    replay_buffer = ReplayBuffer(capacity=args.buffer_capacity,
                                 observation_shape=env.observation_space.shape,
                                 action_dim=env.action_space.shape[0])

    # define models and optimizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder = Encoder().to(device)
    rssm = RecurrentStateSpaceModel(args.state_dim, env.action_space.shape[0],
                                    args.rnn_hidden_dim).to(device)
    obs_model = ObservationModel(args.state_dim,
                                 args.rnn_hidden_dim).to(device)
    reward_model = RewardModel(args.state_dim, args.rnn_hidden_dim).to(device)
    all_params = (list(encoder.parameters()) + list(rssm.parameters()) +
                  list(obs_model.parameters()) +
                  list(reward_model.parameters()))
    optimizer = Adam(all_params, lr=args.lr, eps=args.eps)

    # collect initial experience with random action
    for episode in range(args.seed_episodes):
        obs = env.reset()
        done = False
        while not done:
            action = env.action_space.sample()
            next_obs, reward, done, _ = env.step(action)
            replay_buffer.push(obs, action, reward, done)
            obs = next_obs

    # main training loop
    for episode in range(args.seed_episodes, args.all_episodes):
        # collect experiences
        start = time.time()
        cem_agent = CEMAgent(encoder, rssm, reward_model, args.horizon,
                             args.N_iterations, args.N_candidates,
                             args.N_top_candidates)

        obs = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = cem_agent(obs)
            action += np.random.normal(0, np.sqrt(args.action_noise_var),
                                       env.action_space.shape[0])
            next_obs, reward, done, _ = env.step(action)
            replay_buffer.push(obs, action, reward, done)
            obs = next_obs
            total_reward += reward

        writer.add_scalar('total reward at train', total_reward, episode)
        print('episode [%4d/%4d] is collected. Total reward is %f' %
              (episode + 1, args.all_episodes, total_reward))
        print('elapsed time for interaction: %.2fs' % (time.time() - start))

        # update model parameters
        start = time.time()
        for update_step in range(args.collect_interval):
            observations, actions, rewards, _ = \
                replay_buffer.sample(args.batch_size, args.chunk_length)

            # preprocess observations and transpose tensor for RNN training
            observations = preprocess_obs(observations)
            observations = torch.as_tensor(observations, device=device)
            observations = observations.transpose(3, 4).transpose(2, 3)
            observations = observations.transpose(0, 1)
            actions = torch.as_tensor(actions, device=device).transpose(0, 1)
            rewards = torch.as_tensor(rewards, device=device).transpose(0, 1)

            # embed observations with CNN
            embedded_observations = encoder(observations.reshape(
                -1, 3, 64, 64)).view(args.chunk_length, args.batch_size, -1)

            # prepare tensors to hold the state sequence and RNN hidden-state sequence
            states = torch.zeros(args.chunk_length,
                                 args.batch_size,
                                 args.state_dim,
                                 device=device)
            rnn_hiddens = torch.zeros(args.chunk_length,
                                      args.batch_size,
                                      args.rnn_hidden_dim,
                                      device=device)

            # initialize state and rnn hidden state with 0 vector
            state = torch.zeros(args.batch_size, args.state_dim, device=device)
            rnn_hidden = torch.zeros(args.batch_size,
                                     args.rnn_hidden_dim,
                                     device=device)

            # compute state and rnn hidden sequences and kl loss
            kl_loss = 0
            for l in range(args.chunk_length - 1):
                next_state_prior, next_state_posterior, rnn_hidden = \
                    rssm(state, actions[l], rnn_hidden, embedded_observations[l+1])
                state = next_state_posterior.rsample()
                states[l + 1] = state
                rnn_hiddens[l + 1] = rnn_hidden
                kl = kl_divergence(next_state_prior,
                                   next_state_posterior).sum(dim=1)
                kl_loss += kl.clamp(min=args.free_nats).mean()
            kl_loss /= (args.chunk_length - 1)

            # compute reconstructed observations and predicted rewards
            flatten_states = states.view(-1, args.state_dim)
            flatten_rnn_hiddens = rnn_hiddens.view(-1, args.rnn_hidden_dim)
            recon_observations = obs_model(flatten_states,
                                           flatten_rnn_hiddens).view(
                                               args.chunk_length,
                                               args.batch_size, 3, 64, 64)
            predicted_rewards = reward_model(flatten_states,
                                             flatten_rnn_hiddens).view(
                                                 args.chunk_length,
                                                 args.batch_size, 1)

            # compute loss for observation and reward
            obs_loss = 0.5 * mse_loss(recon_observations[1:],
                                      observations[1:],
                                      reduction='none').mean([0, 1]).sum()
            reward_loss = 0.5 * mse_loss(predicted_rewards[1:], rewards[:-1])

            # add all losses and update model parameters with gradient descent
            loss = kl_loss + obs_loss + reward_loss
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm_(all_params, args.clip_grad_norm)
            optimizer.step()

            # print losses and add to tensorboard
            print(
                'update_step: %3d loss: %.5f, kl_loss: %.5f, obs_loss: %.5f, reward_loss: % .5f'
                % (update_step + 1, loss.item(), kl_loss.item(),
                   obs_loss.item(), reward_loss.item()))
            total_update_step = episode * args.collect_interval + update_step
            writer.add_scalar('overall loss', loss.item(), total_update_step)
            writer.add_scalar('kl loss', kl_loss.item(), total_update_step)
            writer.add_scalar('obs loss', obs_loss.item(), total_update_step)
            writer.add_scalar('reward loss', reward_loss.item(),
                              total_update_step)

        print('elapsed time for update: %.2fs' % (time.time() - start))

        # test to get score without exploration noise
        if (episode + 1) % args.test_interval == 0:
            start = time.time()
            cem_agent = CEMAgent(encoder, rssm, reward_model, args.horizon,
                                 args.N_iterations, args.N_candidates,
                                 args.N_top_candidates)
            obs = env.reset()
            done = False
            total_reward = 0
            while not done:
                action = cem_agent(obs)
                obs, reward, done, _ = env.step(action)
                total_reward += reward

            writer.add_scalar('total reward at test', total_reward, episode)
            print('Total test reward at episode [%4d/%4d] is %f' %
                  (episode + 1, args.all_episodes, total_reward))
            print('elapsed time for test: %.2fs' % (time.time() - start))

    # save learned model parameters
    torch.save(encoder.state_dict(), os.path.join(log_dir, 'encoder.pth'))
    torch.save(rssm.state_dict(), os.path.join(log_dir, 'rssm.pth'))
    torch.save(obs_model.state_dict(), os.path.join(log_dir, 'obs_model.pth'))
    torch.save(reward_model.state_dict(),
               os.path.join(log_dir, 'reward_model.pth'))
    writer.close()
Example 22
from dm_control import suite
from dm_control.suite.wrappers import pixels
import numpy as np

DOMAIN_NAME = "acrobot"
TASK_NAME = "swingup"

# Load one task:
env = suite.load(domain_name=DOMAIN_NAME, task_name=TASK_NAME)

# Wrap the environment to obtain the pixels
env = pixels.Wrapper(env, pixels_only=False)

# Step through an episode and print out reward, discount and observation.
action_spec = env.action_spec()
time_step = env.reset()

while not time_step.last():
    action = np.random.uniform(action_spec.minimum,
                               action_spec.maximum,
                               size=action_spec.shape)
    time_step = env.step(action)
    observation_dm = time_step.observation["pixels"]
    print(observation_dm)
Example 23
def load(domain_name,
         task_name,
         difficulty=None,
         dynamic=False,
         background_dataset_path=None,
         background_dataset_videos="train",
         background_kwargs=None,
         camera_kwargs=None,
         color_kwargs=None,
         task_kwargs=None,
         environment_kwargs=None,
         visualize_reward=False,
         render_kwargs=None,
         pixels_only=True,
         pixels_observation_key="pixels",
         env_state_wrappers=None):
    """Returns an environment from a domain name, task name and optional settings.
  ```python
  env = suite.load('cartpole', 'balance')
  ```
  Adding a difficulty will configure distractions matching the reference paper
  for easy, medium, hard.
  Users can also toggle dynamic properties for distractions.
  Args:
    domain_name: A string containing the name of a domain.
    task_name: A string containing the name of a task.
    difficulty: Difficulty for the suite. One of 'easy', 'medium', 'hard'.
    dynamic: Boolean controlling whether distractions are dynamic or static.
    background_dataset_path: String to the davis directory that contains the
      video directories.
    background_dataset_videos: String ('train'/'val') or list of strings of the
      DAVIS videos to be used for backgrounds.
    background_kwargs: Dict, overwrites settings for background distractions.
    camera_kwargs: Dict, overwrites settings for camera distractions.
    color_kwargs: Dict, overwrites settings for color distractions.
    task_kwargs: Dict, dm control task kwargs.
    environment_kwargs: Optional `dict` specifying keyword arguments for the
      environment.
    visualize_reward: Optional `bool`. If `True`, object colours in rendered
      frames are set to indicate the reward at each step. Default `False`.
    render_kwargs: Dict, render kwargs for pixel wrapper.
    pixels_only: Boolean controlling the exclusion of states in the observation.
    pixels_observation_key: Key in the observation used for the rendered image.
    env_state_wrappers: Env state wrappers to be called before the PixelWrapper.
  Returns:
    The requested environment.
  """
    if not is_available():
        raise ImportError("dm_control module is not available. Make sure you "
                          "follow the installation instructions from the "
                          "dm_control package.")

    if difficulty not in [None, "easy", "medium", "hard"]:
        raise ValueError(
            "Difficulty should be one of: 'easy', 'medium', 'hard'.")

    render_kwargs = render_kwargs or {}
    if "camera_id" not in render_kwargs:
        render_kwargs["camera_id"] = 2 if domain_name == "quadruped" else 0

    env = suite.load(domain_name,
                     task_name,
                     task_kwargs=task_kwargs,
                     environment_kwargs=environment_kwargs,
                     visualize_reward=visualize_reward)

    # Apply background distractions.
    if difficulty or background_kwargs:
        background_dataset_path = (background_dataset_path
                                   or suite_utils.DEFAULT_BACKGROUND_PATH)
        final_background_kwargs = dict()
        if difficulty:
            # Get kwargs for the given difficulty.
            num_videos = suite_utils.DIFFICULTY_NUM_VIDEOS[difficulty]
            final_background_kwargs.update(
                suite_utils.get_background_kwargs(domain_name, num_videos,
                                                  dynamic,
                                                  background_dataset_path,
                                                  background_dataset_videos))
        else:
            # Set the dataset path and the videos.
            final_background_kwargs.update(
                dict(dataset_path=background_dataset_path,
                     dataset_videos=background_dataset_videos))
        if background_kwargs:
            # Overwrite kwargs with those passed here.
            final_background_kwargs.update(background_kwargs)
        env = background.DistractingBackgroundEnv(env,
                                                  **final_background_kwargs)

    # Apply camera distractions.
    if difficulty or camera_kwargs:
        final_camera_kwargs = dict(camera_id=render_kwargs["camera_id"])
        if difficulty:
            # Get kwargs for the given difficulty.
            scale = suite_utils.DIFFICULTY_SCALE[difficulty]
            final_camera_kwargs.update(
                suite_utils.get_camera_kwargs(domain_name, scale, dynamic))
        if camera_kwargs:
            # Overwrite kwargs with those passed here.
            final_camera_kwargs.update(camera_kwargs)
        env = camera.DistractingCameraEnv(env, **final_camera_kwargs)

    # Apply color distractions.
    if difficulty or color_kwargs:
        final_color_kwargs = dict()
        if difficulty:
            # Get kwargs for the given difficulty.
            scale = suite_utils.DIFFICULTY_SCALE[difficulty]
            final_color_kwargs.update(
                suite_utils.get_color_kwargs(scale, dynamic))
        if color_kwargs:
            # Overwrite kwargs with those passed here.
            final_color_kwargs.update(color_kwargs)
        env = color.DistractingColorEnv(env, **final_color_kwargs)

    if env_state_wrappers is not None:
        for wrapper in env_state_wrappers:
            env = wrapper(env)

    # Apply the pixel wrapper after the distraction wrappers. This ensures that
    # the changes from the distraction wrappers are applied to the MuJoCo
    # environment before rendering occurs.
    env = pixels.Wrapper(env,
                         pixels_only=pixels_only,
                         render_kwargs=render_kwargs,
                         observation_key=pixels_observation_key)

    return env
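A minimal usage sketch of the loader above. The import path `distracting_control.suite` and the DAVIS dataset location are assumptions made for illustration; they are not part of the example itself.

# Hedged sketch: the module path and dataset path below are placeholders/assumptions.
from distracting_control import suite as distracting_suite

env = distracting_suite.load(
    domain_name='cheetah',
    task_name='run',
    difficulty='easy',
    dynamic=True,
    background_dataset_path='/path/to/DAVIS/JPEGImages/480p',
    render_kwargs={'height': 84, 'width': 84})

time_step = env.reset()
frame = time_step.observation['pixels']  # rendered frame with distractions applied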
Example No. 24
def main():
    parser = argparse.ArgumentParser(
        description='Open-loop video prediction with learned model')
    parser.add_argument('dir',
                        type=str,
                        help='log directory to load learned model')
    parser.add_argument('--length',
                        type=int,
                        default=50,
                        help='the length of video prediction')
    parser.add_argument('--domain-name', type=str, default='cheetah')
    parser.add_argument('--task-name', type=str, default='run')
    parser.add_argument('-R', '--action-repeat', type=int, default=2)
    args = parser.parse_args()

    # define environment and apply wrapper
    env = suite.load(args.domain_name, args.task_name)
    env = pixels.Wrapper(env,
                         render_kwargs={
                             'height': 64,
                             'width': 64,
                             'camera_id': 0
                         })
    env = GymWrapper(env)
    env = RepeatAction(env, skip=args.action_repeat)

    # define models
    with open(os.path.join(args.dir, 'args.json'), 'r') as f:
        train_args = json.load(f)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder = Encoder().to(device)
    rssm = RecurrentStateSpaceModel(train_args['state_dim'],
                                    env.action_space.shape[0],
                                    train_args['rnn_hidden_dim']).to(device)
    obs_model = ObservationModel(train_args['state_dim'],
                                 train_args['rnn_hidden_dim']).to(device)
    action_model = ActionModel(train_args['state_dim'],
                               train_args['rnn_hidden_dim'],
                               env.action_space.shape[0]).to(device)

    # load learned parameters
    encoder.load_state_dict(torch.load(os.path.join(args.dir, 'encoder.pth')))
    rssm.load_state_dict(torch.load(os.path.join(args.dir, 'rssm.pth')))
    obs_model.load_state_dict(
        torch.load(os.path.join(args.dir, 'obs_model.pth')))
    action_model.load_state_dict(
        torch.load(os.path.join(args.dir, 'action_model.pth')))

    # define agent
    policy = Agent(encoder, rssm, action_model)

    # open-loop video prediction
    # select starting point of open-loop prediction randomly
    starting_point = torch.randint(1000 // args.action_repeat - args.length,
                                   (1, )).item()
    # interact with the environment up to the starting point to build up context in policy.rnn_hidden
    obs = env.reset()
    for _ in range(starting_point):
        action = policy(obs)
        obs, _, _, _ = env.step(action)

    # preprocess the observation and embed it with the encoder
    preprocessed_obs = preprocess_obs(obs)
    preprocessed_obs = torch.as_tensor(preprocessed_obs, device=device)
    preprocessed_obs = preprocessed_obs.transpose(1,
                                                  2).transpose(0,
                                                               1).unsqueeze(0)
    with torch.no_grad():
        embedded_obs = encoder(preprocessed_obs)

    # compute state using embedded observation
    # NOTE: after this, the state is updated using only the prior,
    #       i.e. the model no longer sees observations (open-loop)
    rnn_hidden = policy.rnn_hidden
    state = rssm.posterior(rnn_hidden, embedded_obs).sample()
    frame = np.zeros((64, 128, 3))
    frames = []
    for _ in range(args.length):
        # actions are selected the same way as at training time (closed-loop)
        action = policy(obs)
        obs, _, _, _ = env.step(action)

        # update state and reconstruct observation with same action
        action = torch.as_tensor(action, device=device).unsqueeze(0)
        with torch.no_grad():
            state_prior, rnn_hidden = rssm.prior(state, action, rnn_hidden)
            state = state_prior.sample()
            predicted_obs = obs_model(state, rnn_hidden)

        # place the ground-truth frame and the predicted frame side by side
        frame[:, :64, :] = preprocess_obs(obs)
        frame[:, 64:, :] = predicted_obs.squeeze().transpose(0, 1).transpose(
            1, 2).cpu().numpy()
        frames.append((frame + 0.5).clip(0.0, 1.0))

    save_video_as_gif(frames)
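The helper `preprocess_obs` used above is not shown in this example. Given the `frame + 0.5` before clipping, it presumably rescales pixels to roughly [-0.5, 0.5], as in PlaNet-style preprocessing. A hedged sketch of such a helper, assuming a bit-depth reduction that the real implementation may or may not perform:

import numpy as np


def preprocess_obs(obs, bit_depth=5):
    """Hypothetical sketch: reduce bit depth and rescale uint8 pixels to ~[-0.5, 0.5]."""
    obs = obs.astype(np.float32)
    # quantize to 2**bit_depth levels, then shift to be roughly zero-centered
    obs = np.floor(obs / 2 ** (8 - bit_depth)) / 2 ** bit_depth - 0.5
    return obs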
Example No. 25
    def __init__(
            self,
            level: LevelSelection,
            frame_skip: int,
            visualization_parameters: VisualizationParameters,
            target_success_rate: float = 1.0,
            seed: Union[None, int] = None,
            human_control: bool = False,
            observation_type: ObservationType = ObservationType.Measurements,
            custom_reward_threshold: Union[int, float] = None,
            **kwargs):
        """
        :param level: (str)
            A string representing the control suite level to run. This can also be a LevelSelection object.
            For example, cartpole:swingup.

        :param frame_skip: (int)
            The number of frames to skip between any two actions given by the agent. The action will be repeated
            for all the skipped frames.

        :param visualization_parameters: (VisualizationParameters)
            The parameters used for visualizing the environment, such as the render flag, storing videos etc.

        :param target_success_rate: (float)
            Stop the experiment if the given target success rate is achieved.

        :param seed: (int)
            A seed to use for the random number generator when running the environment.

        :param human_control: (bool)
            A flag that allows controlling the environment using the keyboard keys.

        :param observation_type: (ObservationType)
            An enum defining which observation to use. The current options are:
            * Measurements only - a vector of joint torques and similar measurements
            * Image only - an image of the environment as seen by a camera attached to the simulator
            * Measurements & Image - both type of observations will be returned in the state using the keys
            'measurements' and 'pixels' respectively.

        :param custom_reward_threshold: (float)
            Allows defining a custom reward threshold that will be used to decide when the agent has succeeded
            in the environment.

        """
        super().__init__(level, seed, frame_skip, human_control,
                         custom_reward_threshold, visualization_parameters,
                         target_success_rate)

        self.observation_type = observation_type

        # load and initialize environment
        domain_name, task_name = self.env_id.split(":")
        self.env = suite.load(domain_name=domain_name,
                              task_name=task_name,
                              task_kwargs={'random': seed})

        if observation_type != ObservationType.Measurements:
            self.env = pixels.Wrapper(
                self.env,
                pixels_only=observation_type == ObservationType.Image)

        # seed
        if self.seed is not None:
            np.random.seed(self.seed)
            random.seed(self.seed)

        self.state_space = StateSpace({})

        # image observations
        if observation_type != ObservationType.Measurements:
            self.state_space['pixels'] = ImageObservationSpace(
                shape=self.env.observation_spec()['pixels'].shape, high=255)

        # measurements observations
        if observation_type != ObservationType.Image:
            measurements_space_size = 0
            measurements_names = []
            observation_spec = self.env.observation_spec()
            for observation_space_name, observation_space in observation_spec.items():
                if len(observation_space.shape) == 0:
                    measurements_space_size += 1
                    measurements_names.append(observation_space_name)
                elif len(observation_space.shape) == 1:
                    measurements_space_size += observation_space.shape[0]
                    measurements_names.extend([
                        "{}_{}".format(observation_space_name, i)
                        for i in range(observation_space.shape[0])
                    ])
            self.state_space['measurements'] = VectorObservationSpace(
                shape=measurements_space_size,
                measurements_names=measurements_names)

        # actions
        self.action_space = BoxActionSpace(
            shape=self.env.action_spec().shape[0],
            low=self.env.action_spec().minimum,
            high=self.env.action_spec().maximum)

        # initialize the state by getting a new state from the environment
        self.reset_internal_state(True)

        # render
        if self.is_rendered:
            image = self.get_rendered_image()
            scale = 1
            if self.human_control:
                scale = 2
            if not self.native_rendering:
                self.renderer.create_screen(image.shape[1] * scale,
                                            image.shape[0] * scale)

        self.target_success_rate = target_success_rate
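The enclosing class is not shown in this snippet. Assuming it is Coach's ControlSuiteEnvironment (the import paths below are likewise assumptions), constructing it might look roughly like this:

# Hedged sketch -- class name and import paths are assumptions, not taken from the snippet.
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.environments.control_suite_environment import ControlSuiteEnvironment, ObservationType

env = ControlSuiteEnvironment(
    level='cartpole:swingup',                       # domain:task, as described in the docstring
    frame_skip=4,
    visualization_parameters=VisualizationParameters(),
    seed=0,
    observation_type=ObservationType.Measurements)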
Example No. 26
    def __init__(self,
                 domain_name,
                 task_name,
                 horizon=None,
                 gamma=0.99,
                 task_kwargs=None,
                 dt=.01,
                 width_screen=480,
                 height_screen=480,
                 camera_id=0,
                 use_pixels=False,
                 pixels_width=64,
                 pixels_height=64):
        """
        Constructor.

        Args:
             domain_name (str): name of the environment;
             task_name (str): name of the task of the environment;
             horizon (int): the horizon;
             gamma (float): the discount factor;
             task_kwargs (dict, None): parameters of the task;
             dt (float, .01): duration of a control step;
             width_screen (int, 480): width of the screen;
             height_screen (int, 480): height of the screen;
             camera_id (int, 0): id of the camera used to render the environment;
             use_pixels (bool, False): if True, pixel observations are used
                rather than the state vector;
             pixels_width (int, 64): width of the pixel observation;
             pixels_height (int, 64): height of the pixel observation;

        """
        # MDP creation
        self.env = suite.load(domain_name, task_name, task_kwargs=task_kwargs)
        if use_pixels:
            self.env = pixels.Wrapper(self.env,
                                      render_kwargs={
                                          'width': pixels_width,
                                          'height': pixels_height
                                      })

        # get the default horizon
        if horizon is None:
            horizon = self.env._step_limit

        # Hack to ignore dm_control time limit.
        self.env._step_limit = np.inf

        if use_pixels:
            self._convert_observation_space = self._convert_observation_space_pixels
            self._convert_observation = self._convert_observation_pixels
        else:
            self._convert_observation_space = self._convert_observation_space_vector
            self._convert_observation = self._convert_observation_vector

        # MDP properties
        action_space = self._convert_action_space(self.env.action_spec())
        observation_space = self._convert_observation_space(
            self.env.observation_spec())
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        self._viewer = ImageViewer((width_screen, height_screen), dt)
        self._camera_id = camera_id

        super().__init__(mdp_info)

        self._state = None
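Again, the enclosing class is not shown. Assuming this is MushroomRL's DMControl wrapper (the import path is an assumption), typical construction might look like:

# Hedged sketch -- the class name and import path are assumptions.
from mushroom_rl.environments.dm_control_env import DMControl

# low-dimensional state observations
mdp = DMControl('walker', 'walk', horizon=1000, gamma=0.99)

# pixel observations instead of the state vector
mdp_pixels = DMControl('walker', 'walk', horizon=1000, gamma=0.99,
                       use_pixels=True, pixels_width=64, pixels_height=64)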
Example No. 27
    def __init__(
            self,
            level: LevelSelection,
            frame_skip: int,
            visualization_parameters: VisualizationParameters,
            seed: Union[None, int] = None,
            human_control: bool = False,
            observation_type: ObservationType = ObservationType.Measurements,
            custom_reward_threshold: Union[int, float] = None,
            **kwargs):
        super().__init__(level, seed, frame_skip, human_control,
                         custom_reward_threshold, visualization_parameters)

        self.observation_type = observation_type

        # load and initialize environment
        domain_name, task_name = self.env_id.split(":")
        self.env = suite.load(domain_name=domain_name,
                              task_name=task_name,
                              task_kwargs={'random': seed})

        if observation_type != ObservationType.Measurements:
            self.env = pixels.Wrapper(
                self.env,
                pixels_only=observation_type == ObservationType.Image)

        # seed
        if self.seed is not None:
            np.random.seed(self.seed)
            random.seed(self.seed)

        self.state_space = StateSpace({})

        # image observations
        if observation_type != ObservationType.Measurements:
            self.state_space['pixels'] = ImageObservationSpace(
                shape=self.env.observation_spec()['pixels'].shape, high=255)

        # measurements observations
        if observation_type != ObservationType.Image:
            measurements_space_size = 0
            measurements_names = []
            observation_spec = self.env.observation_spec()
            for observation_space_name, observation_space in observation_spec.items():
                if len(observation_space.shape) == 0:
                    measurements_space_size += 1
                    measurements_names.append(observation_space_name)
                elif len(observation_space.shape) == 1:
                    measurements_space_size += observation_space.shape[0]
                    measurements_names.extend([
                        "{}_{}".format(observation_space_name, i)
                        for i in range(observation_space.shape[0])
                    ])
            self.state_space['measurements'] = VectorObservationSpace(
                shape=measurements_space_size,
                measurements_names=measurements_names)

        # actions
        self.action_space = BoxActionSpace(
            shape=self.env.action_spec().shape[0],
            low=self.env.action_spec().minimum,
            high=self.env.action_spec().maximum)

        # initialize the state by getting a new state from the environment
        self.reset_internal_state(True)

        # render
        if self.is_rendered:
            image = self.get_rendered_image()
            scale = 1
            if self.human_control:
                scale = 2
            if not self.native_rendering:
                self.renderer.create_screen(image.shape[1] * scale,
                                            image.shape[0] * scale)