def _restore_checkpoint(self):
    """Restore the algorithm and trainer progress from the checkpoint
    under ``<train_dir>/algorithm``."""
    checkpointer = Checkpointer(
        ckpt_dir=os.path.join(self._train_dir, 'algorithm'),
        algorithm=self._algorithm,
        trainer_progress=self._trainer_progress)
    super()._restore_checkpoint(checkpointer)
def _restore_checkpoint(self):
    """Restore the algorithm, its metrics, and trainer progress from the
    checkpoint under ``<train_dir>/algorithm``."""
    checkpointer = Checkpointer(
        ckpt_dir=os.path.join(self._train_dir, 'algorithm'),
        algorithm=self._algorithm,
        metrics=nn.ModuleList(self._algorithm.get_metrics()),
        trainer_progress=self._trainer_progress)
    super()._restore_checkpoint(checkpointer)
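# A minimal usage sketch of the Checkpointer API relied on above and in the
# test below: the constructor takes a checkpoint directory plus named objects,
# save(step) snapshots their state, and load() restores them in place and
# returns the saved step. The module name `net` is a hypothetical stand-in;
# only the Checkpointer calls mirror the surrounding code.
import tempfile
import torch.nn as nn
from alf.utils.checkpoint_utils import Checkpointer

with tempfile.TemporaryDirectory() as ckpt_dir:
    net = nn.Linear(4, 2)
    ckpt = Checkpointer(ckpt_dir, net=net)
    ckpt.save(100)  # snapshot `net`, tagged with global step 100
    restored = nn.Linear(4, 2)
    # A fresh Checkpointer pointed at the same directory restores in place.
    assert Checkpointer(ckpt_dir, net=restored).load() == 100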
def test_data_buffer(self):
    dim = 20
    capacity = 256
    data_spec = (TensorSpec(shape=()), TensorSpec(shape=(dim // 3 - 1, )),
                 TensorSpec(shape=(dim - dim // 3, )))

    data_buffer = DataBuffer(data_spec=data_spec, capacity=capacity)

    def _get_batch(batch_size):
        x = torch.randn(batch_size, dim, requires_grad=True)
        x = (x[:, 0], x[:, 1:dim // 3], x[..., dim // 3:])
        return x

    data_buffer.add_batch(_get_batch(100))
    self.assertEqual(int(data_buffer.current_size), 100)

    batch = _get_batch(1000)
    # test that the created batch has gradients
    self.assertTrue(batch[0].requires_grad)
    data_buffer.add_batch(batch)
    ret = data_buffer.get_batch(2)
    # test that DataBuffer detaches gradients of inputs
    self.assertFalse(ret[0].requires_grad)
    self.assertEqual(int(data_buffer.current_size), capacity)
    ret = data_buffer.get_batch_by_indices(torch.arange(capacity))
    self.assertEqual(ret[0], batch[0][-capacity:])
    self.assertEqual(ret[1], batch[1][-capacity:])
    self.assertEqual(ret[2], batch[2][-capacity:])

    batch = _get_batch(100)
    data_buffer.add_batch(batch)
    ret = data_buffer.get_batch_by_indices(
        torch.arange(data_buffer.current_size - 100,
                     data_buffer.current_size))
    self.assertEqual(ret[0], batch[0])
    self.assertEqual(ret[1], batch[1])
    self.assertEqual(ret[2], batch[2][-capacity:])

    # Test checkpoint working
    with tempfile.TemporaryDirectory() as checkpoint_directory:
        checkpoint = Checkpointer(
            checkpoint_directory, data_buffer=data_buffer)
        checkpoint.save(10)
        data_buffer = DataBuffer(data_spec=data_spec, capacity=capacity)
        checkpoint = Checkpointer(
            checkpoint_directory, data_buffer=data_buffer)
        global_step = checkpoint.load()
        self.assertEqual(global_step, 10)

    ret = data_buffer.get_batch_by_indices(
        torch.arange(data_buffer.current_size - 100,
                     data_buffer.current_size))
    self.assertEqual(ret[0], batch[0])
    self.assertEqual(ret[1], batch[1])
    self.assertEqual(ret[2], batch[2][-capacity:])

    data_buffer.clear()
    self.assertEqual(int(data_buffer.current_size), 0)
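# A small standalone sketch of the ring-buffer behavior the test above
# exercises: once more rows are added than `capacity`, only the newest
# `capacity` rows are kept, and stored tensors are detached from autograd.
# The single-spec layout and sizes here are hypothetical; the DataBuffer and
# TensorSpec APIs are the ones used in the test.
import torch
from alf.tensor_specs import TensorSpec
from alf.utils.data_buffer import DataBuffer

buf = DataBuffer(data_spec=TensorSpec(shape=(3, )), capacity=4)
buf.add_batch(torch.arange(18., requires_grad=True).reshape(6, 3))
assert int(buf.current_size) == 4  # oldest 2 of the 6 rows were evicted
rows = buf.get_batch_by_indices(torch.arange(4))
assert not rows.requires_grad  # gradients are detached on add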
def play(root_dir, env, algorithm, checkpoint_step="latest", epsilon_greedy=0., num_episodes=10, max_episode_length=0, sleep_time_per_step=0.01, record_file=None, future_steps=0, append_blank_frames=0, render=True, render_prediction=False, ignored_parameter_prefixes=[]): """Play using the latest checkpoint under `train_dir`. The following example record the play of a trained model to a mp4 video: .. code-block:: bash python -m alf.bin.play \ --root_dir=~/tmp/bullet_humanoid/ppo2/ppo2-11 \ --num_episodes=1 \ --record_file=ppo_bullet_humanoid.mp4 Args: root_dir (str): same as the root_dir used for `train()` env (AlfEnvironment): the environment algorithm (RLAlgorithm): the training algorithm checkpoint_step (int|str): the number of training steps which is used to specify the checkpoint to be loaded. If checkpoint_step is 'latest', the most recent checkpoint named 'latest' will be loaded. epsilon_greedy (float): a floating value in [0,1], representing the chance of action sampling instead of taking argmax. This can help prevent a dead loop in some deterministic environment like Breakout. num_episodes (int): number of episodes to play max_episode_length (int): if >0, each episode is limited to so many steps. sleep_time_per_step (float): sleep so many seconds for each step record_file (str): if provided, video will be recorded to a file instead of shown on the screen. future_steps (int): whether to encode some information from future steps into the current frame. If future_steps is larger than zero, then the related information (e.g. observation, reward, action etc.) will be cached and the encoding of them to video frames is deferred to the time when ``future_steps`` of future frames are available. This defer mode is potentially useful to display for each frame some information that expands beyond a single time step to the future. Currently this mode only support offline rendering, i.e. rendering and saving the video to ``record_file``. If a non-positive value is provided, it is treated as not using the defer mode and the plots for displaying future information will not be displayed. append_blank_frames (int): If >0, wil append such number of blank frames at the end of the episode in the rendered video file. A negative value has the same effects as 0 and no blank frames will be appended. This option has no effects when displaying the frames on the screen instead of recording to a file. render (bool): If False, then this function only evaluates the trained model without calling rendering functions. This value will be ignored if a ``record_file`` argument is provided. render_prediction (bool): If True, when using ``VideoRecorder`` to render a video, extra prediction info (returned by ``predict_step()``) will also be rendered by the side of video frames. ignored_parameter_prefixes (list[str]): ignore the parameters whose name has one of these prefixes in the checkpoint. 
""" root_dir = os.path.expanduser(root_dir) train_dir = os.path.join(root_dir, 'train') ckpt_dir = os.path.join(train_dir, 'algorithm') checkpointer = Checkpointer(ckpt_dir=ckpt_dir, algorithm=algorithm) checkpointer.load(checkpoint_step, ignored_parameter_prefixes=ignored_parameter_prefixes, including_optimizer=False, including_replay_buffer=False) recorder = None if record_file is not None: recorder = VideoRecorder(env, future_steps=future_steps, append_blank_frames=append_blank_frames, render_prediction=render_prediction, path=record_file) elif render: # pybullet_envs need to render() before reset() to enable mode='human' env.render(mode='human') env.reset() time_step = common.get_initial_time_step(env) algorithm.eval() policy_state = algorithm.get_initial_predict_state(env.batch_size) trans_state = algorithm.get_initial_transform_state(env.batch_size) episode_reward = 0. episode_length = 0 episodes = 0 metrics = [ alf.metrics.AverageReturnMetric(buffer_size=num_episodes, reward_shape=env.reward_spec().shape), alf.metrics.AverageEpisodeLengthMetric(buffer_size=num_episodes), ] while episodes < num_episodes: time_step, policy_step, trans_state = _step( algorithm=algorithm, env=env, time_step=time_step, policy_state=policy_state, trans_state=trans_state, epsilon_greedy=epsilon_greedy, metrics=metrics) policy_state = policy_step.state episode_length += 1 is_last_step = time_step.is_last() or (episode_length >= max_episode_length > 0) if recorder: recorder.capture_frame(time_step, policy_step, is_last_step) elif render: env.render(mode='human') time.sleep(sleep_time_per_step) time_step_reward = time_step.reward.view(-1).float().cpu().numpy() episode_reward += time_step_reward if is_last_step: logging.info("episode_length=%s episode_reward=%s" % (episode_length, episode_reward)) episode_reward = 0. episode_length = 0. episodes += 1 # observe the last step for m in metrics: m(time_step.cpu()) time_step = env.reset() for m in metrics: logging.info( "%s: %s", m.name, map_structure( lambda x: x.cpu().numpy().item() if x.ndim == 0 else x.cpu().numpy(), m.result())) if recorder: recorder.close() env.reset()
def play(root_dir, env, algorithm, checkpoint_step="latest", epsilon_greedy=0.1, num_episodes=10, max_episode_length=0, sleep_time_per_step=0.01, record_file=None, ignored_parameter_prefixes=[]): """Play using the latest checkpoint under `train_dir`. The following example record the play of a trained model to a mp4 video: .. code-block:: bash python -m alf.bin.play \ --root_dir=~/tmp/bullet_humanoid/ppo2/ppo2-11 \ --num_episodes=1 \ --record_file=ppo_bullet_humanoid.mp4 Args: root_dir (str): same as the root_dir used for `train()` env (AlfEnvironment): the environment algorithm (RLAlgorithm): the training algorithm checkpoint_step (int|str): the number of training steps which is used to specify the checkpoint to be loaded. If checkpoint_step is 'latest', the most recent checkpoint named 'latest' will be loaded. epsilon_greedy (float): a floating value in [0,1], representing the chance of action sampling instead of taking argmax. This can help prevent a dead loop in some deterministic environment like Breakout. num_episodes (int): number of episodes to play max_episode_length (int): if >0, each episode is limited to so many steps. sleep_time_per_step (float): sleep so many seconds for each step record_file (str): if provided, video will be recorded to a file instead of shown on the screen. ignored_parameter_prefixes (list[str]): ignore the parameters whose name has one of these prefixes in the checkpoint. """ root_dir = os.path.expanduser(root_dir) train_dir = os.path.join(root_dir, 'train') ckpt_dir = os.path.join(train_dir, 'algorithm') checkpointer = Checkpointer(ckpt_dir=ckpt_dir, algorithm=algorithm) checkpointer.load( checkpoint_step, ignored_parameter_prefixes=ignored_parameter_prefixes, including_optimizer=False, including_replay_buffer=False) recorder = None if record_file is not None: recorder = VideoRecorder(env, path=record_file) else: # pybullet_envs need to render() before reset() to enable mode='human' env.render(mode='human') env.reset() time_step = common.get_initial_time_step(env) algorithm.eval() policy_state = algorithm.get_initial_predict_state(env.batch_size) trans_state = algorithm.get_initial_transform_state(env.batch_size) episode_reward = 0. episode_length = 0 episodes = 0 metrics = [ alf.metrics.AverageReturnMetric( buffer_size=num_episodes, reward_shape=env.reward_spec().shape), alf.metrics.AverageEpisodeLengthMetric(buffer_size=num_episodes), ] while episodes < num_episodes: time_step, policy_state, trans_state, info = _step( algorithm=algorithm, env=env, time_step=time_step, policy_state=policy_state, trans_state=trans_state, epsilon_greedy=epsilon_greedy, metrics=metrics) episode_length += 1 if recorder: recorder.capture_frame(info) else: env.render(mode='human') time.sleep(sleep_time_per_step) time_step_reward = time_step.reward.view(-1).float().cpu().numpy() episode_reward += time_step_reward if time_step.is_last() or episode_length >= max_episode_length > 0: logging.info("episode_length=%s episode_reward=%s" % (episode_length, episode_reward)) episode_reward = 0. episode_length = 0. episodes += 1 # observe the last step for m in metrics: m(time_step.cpu()) time_step = env.reset() for m in metrics: logging.info("%s: %f", m.name, m.result()) if recorder: recorder.close() env.reset()
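# Both play() variants above delegate one environment interaction to a
# _step() helper that is not shown in this section (note they even expect
# different return signatures). The sketch below is a hypothetical
# reconstruction matching the first variant's call site, which expects
# (next_time_step, policy_step, trans_state); transform_timestep() and
# predict_step() are assumed method names, and how epsilon_greedy is plumbed
# into action selection is omitted.
def _step(algorithm, env, time_step, policy_state, trans_state,
          epsilon_greedy, metrics):
    # Apply the algorithm's time step transformations (assumed API),
    # e.g. observation normalization, threading the transform state through.
    transformed, trans_state = algorithm.transform_timestep(
        time_step, trans_state)
    # Compute the action for the current step (assumed API); with
    # epsilon_greedy > 0 the algorithm would sample instead of taking argmax.
    policy_step = algorithm.predict_step(transformed, policy_state)
    # Feed the current step to the metrics before advancing the environment.
    for m in metrics:
        m(time_step.cpu())
    next_time_step = env.step(policy_step.output)
    return next_time_step, policy_step, trans_state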