def _update(self):
    # Compute the lambda-returns.
    batch = self.replay.get_full('observations', 'next_observations')
    values, next_values = self._evaluate(**batch)
    values, next_values = values.numpy(), next_values.numpy()
    self.replay.compute_returns(values, next_values)

    # Update the actor once.
    keys = 'observations', 'actions', 'advantages', 'log_probs'
    batch = self.replay.get_full(*keys)
    batch = {k: torch.as_tensor(v) for k, v in batch.items()}
    infos = self.actor_updater(**batch)
    for k, v in infos.items():
        logger.store('actor/' + k, v.numpy())

    # Update the critic multiple times.
    for batch in self.replay.get('observations', 'returns'):
        batch = {k: torch.as_tensor(v) for k, v in batch.items()}
        infos = self.critic_updater(**batch)
        for k, v in infos.items():
            logger.store('critic/' + k, v.numpy())

    # Update the normalizers.
    if self.model.observation_normalizer:
        self.model.observation_normalizer.update()
    if self.model.return_normalizer:
        self.model.return_normalizer.update()

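# A minimal sketch of the lambda-return computation that
# `replay.compute_returns` is assumed to perform above, over flat
# (num_steps,) arrays; the `rewards`, `discounts` and `lambda_` names
# are illustrative, not the library's actual API.
import numpy as np

def lambda_returns(values, next_values, rewards, discounts, lambda_):
    # Backward recursion over the stored transitions:
    # R_t = r_t + d_t * ((1 - lambda) * V(s_{t+1}) + lambda * R_{t+1}).
    returns = np.zeros_like(values)
    last_return = next_values[-1]
    for t in reversed(range(len(values))):
        last_return = rewards[t] + discounts[t] * (
            (1 - lambda_) * next_values[t] + lambda_ * last_return)
        returns[t] = last_return
    # The advantages used by the actor update are then returns - values.
    return returns
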
def _update(self, steps):
    keys = ('observations', 'actions', 'next_observations', 'rewards',
            'discounts')

    # Update both the actor and the critic multiple times.
    for batch in self.replay.get(*keys, steps=steps):
        infos = self._update_actor_critic(**batch)
        for key in infos:
            for k, v in infos[key].items():
                logger.store(key + '/' + k, v.numpy())

    # Update the normalizers.
    if self.model.observation_normalizer:
        self.model.observation_normalizer.update()
    if self.model.return_normalizer:
        self.model.return_normalizer.update()

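# A hypothetical sketch of the `_update_actor_critic` helper called above,
# assuming a DDPG-style agent: the critic is updated from the sampled
# transitions, the actor from the observations alone, and the target
# networks are moved towards the online ones. The nested dict mirrors the
# `key + '/' + k` logging scheme; the body is an assumption, not the
# library's actual implementation.
def _update_actor_critic(self, observations, actions, next_observations,
                         rewards, discounts):
    critic_infos = self.critic_updater(
        observations, actions, next_observations, rewards, discounts)
    actor_infos = self.actor_updater(observations)
    self.model.update_targets()  # assumed Polyak-averaged target update
    return dict(critic=critic_infos, actor=actor_infos)
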
def _update(self):
    keys = ('observations', 'actions', 'next_observations', 'rewards',
            'discounts')

    # Delay the actor updates: train the critic on every batch and the
    # actor only on every `delay_steps`-th batch.
    for i, batch in enumerate(self.replay.get(*keys)):
        if (i + 1) % self.delay_steps == 0:
            infos = self._update_actor_critic(**batch)
        else:
            infos = dict(critic=self.critic_updater(**batch))
        for key in infos:
            for k, v in infos[key].items():
                logger.store(key + '/' + k, v.numpy())

    # Update the normalizers.
    if self.model.observation_normalizer:
        self.model.observation_normalizer.update()
    if self.model.return_normalizer:
        self.model.return_normalizer.update()

def _test(self):
    '''Tests the agent on the test environment.'''

    # Start the environment.
    if not hasattr(self, 'test_observations'):
        self.test_observations = self.test_environment.start()
        assert len(self.test_observations) == 1

    # Test loop.
    for _ in range(self.test_episodes):
        score, length = 0, 0

        while True:
            # Select an action.
            actions = self.agent.test_step(
                self.test_observations, self.steps)
            assert not np.isnan(actions.sum())
            logger.store('test/action', actions, stats=True)

            # Take a step in the environment.
            self.test_observations, infos = self.test_environment.step(
                actions)
            self.agent.test_update(**infos, steps=self.steps)

            score += infos['rewards'][0]
            length += 1

            if infos['resets'][0]:
                break

        # Log the data.
        logger.store('test/episode_score', score, stats=True)
        logger.store('test/episode_length', length, stats=True)

def _update(self):
    # Compute the lambda-returns.
    batch = self.replay.get_full('observations', 'next_observations')
    values, next_values = self._evaluate(**batch)
    values, next_values = values.numpy(), next_values.numpy()
    self.replay.compute_returns(values, next_values)

    # Update the actor once, replaying the old distribution parameters.
    actor_keys = ('observations', 'actions', 'log_probs', 'locs',
                  'scales', 'advantages')
    actor_batch = self.replay.get_full(*actor_keys)
    actor_infos = self.actor_updater(**actor_batch)
    for k, v in actor_infos.items():
        logger.store('actor/' + k, v.numpy())

    # Update the critic multiple times.
    critic_keys = 'observations', 'returns'
    critic_iterations = 0
    for critic_batch in self.replay.get(*critic_keys):
        critic_infos = self.critic_updater(**critic_batch)
        critic_iterations += 1
        for k, v in critic_infos.items():
            logger.store('critic/' + k, v.numpy())
    logger.store('critic/iterations', critic_iterations)

    # Update the normalizers.
    if self.model.observation_normalizer:
        self.model.observation_normalizer.update()
    if self.model.return_normalizer:
        self.model.return_normalizer.update()

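# A minimal sketch of why `locs` and `scales` are replayed above: a
# trust-region (TRPO-style) actor update constrains the KL divergence
# between the old policy, whose diagonal-Gaussian parameters were stored
# at collection time, and the updated one. The function below is an
# illustration of that constraint term, not the actual updater.
import torch

def diagonal_gaussian_kl(locs, scales, new_locs, new_scales):
    # KL(old || new) summed over the action dimensions.
    return torch.sum(
        torch.log(new_scales / scales)
        + (scales ** 2 + (locs - new_locs) ** 2) / (2 * new_scales ** 2)
        - 0.5, dim=-1)
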
def _update(self):
    # Compute the lambda-returns.
    batch = self.replay.get_full('observations', 'next_observations')
    values, next_values = self._evaluate(**batch)
    values, next_values = values.numpy(), next_values.numpy()
    self.replay.compute_returns(values, next_values)

    train_actor = True
    actor_iterations = 0
    critic_iterations = 0
    keys = 'observations', 'actions', 'advantages', 'log_probs', 'returns'

    # Update both the actor and the critic multiple times.
    for batch in self.replay.get(*keys):
        if train_actor:
            batch = {k: torch.as_tensor(v) for k, v in batch.items()}
            infos = self._update_actor_critic(**batch)
            actor_iterations += 1
        else:
            batch = {k: torch.as_tensor(batch[k])
                     for k in ('observations', 'returns')}
            infos = dict(critic=self.critic_updater(**batch))
            critic_iterations += 1

        # Stop training the actor early if the updater requests it.
        if train_actor:
            train_actor = not infos['actor']['stop'].numpy()

        for key in infos:
            for k, v in infos[key].items():
                logger.store(key + '/' + k, v.numpy())

    logger.store('actor/iterations', actor_iterations)
    logger.store('critic/iterations', critic_iterations)

    # Update the normalizers.
    if self.model.observation_normalizer:
        self.model.observation_normalizer.update()
    if self.model.return_normalizer:
        self.model.return_normalizer.update()

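# A hypothetical sketch of where the actor's `stop` flag above could come
# from, assuming a PPO-style updater that halts actor training once the
# approximate KL divergence between the new and old policies exceeds a
# threshold; the names and the 0.015 default are illustrative.
def should_stop(old_log_probs, new_log_probs, max_kl=0.015):
    # Approximate KL(old || new), estimated from the sampled actions.
    approximate_kl = (old_log_probs - new_log_probs).mean()
    return approximate_kl > max_kl
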
def run(self):
    '''Runs the main training loop.'''

    start_time = last_epoch_time = time.time()

    # Start the environments.
    observations = self.environment.start()
    num_workers = len(observations)
    scores = np.zeros(num_workers)
    lengths = np.zeros(num_workers, int)
    self.steps, epoch_steps, epochs, episodes = 0, 0, 0, 0
    steps_since_save = 0

    while True:
        # Select actions.
        actions = self.agent.step(observations, self.steps)
        assert not np.isnan(actions.sum())
        logger.store('train/action', actions, stats=True)

        # Take a step in the environments.
        observations, infos = self.environment.step(actions)
        self.agent.update(**infos, steps=self.steps)

        scores += infos['rewards']
        lengths += 1
        self.steps += num_workers
        epoch_steps += num_workers
        steps_since_save += num_workers

        # Show the progress bar.
        if self.show_progress:
            logger.show_progress(
                self.steps, self.epoch_steps, self.max_steps)

        # Check the finished episodes.
        for i in range(num_workers):
            if infos['resets'][i]:
                logger.store('train/episode_score', scores[i], stats=True)
                logger.store('train/episode_length', lengths[i], stats=True)
                scores[i] = 0
                lengths[i] = 0
                episodes += 1

        # End of the epoch.
        if epoch_steps >= self.epoch_steps:
            # Evaluate the agent on the test environment.
            if self.test_environment:
                self._test()

            # Log the data.
            epochs += 1
            current_time = time.time()
            epoch_time = current_time - last_epoch_time
            sps = epoch_steps / epoch_time
            logger.store('train/episodes', episodes)
            logger.store('train/epochs', epochs)
            logger.store('train/seconds', current_time - start_time)
            logger.store('train/epoch_seconds', epoch_time)
            logger.store('train/epoch_steps', epoch_steps)
            logger.store('train/steps', self.steps)
            logger.store('train/worker_steps', self.steps // num_workers)
            logger.store('train/steps_per_second', sps)
            logger.dump()
            last_epoch_time = time.time()
            epoch_steps = 0

        # End of training.
        stop_training = self.steps >= self.max_steps

        # Save a checkpoint.
        if stop_training or steps_since_save >= self.save_steps:
            path = os.path.join(logger.get_path(), 'checkpoints')
            if os.path.isdir(path) and self.replace_checkpoint:
                for file in os.listdir(path):
                    if file.startswith('step_'):
                        os.remove(os.path.join(path, file))
            checkpoint_name = f'step_{self.steps}'
            save_path = os.path.join(path, checkpoint_name)
            self.agent.save(save_path)
            steps_since_save = self.steps % self.save_steps

        if stop_training:
            break

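# A hypothetical helper for locating the newest checkpoint written by the
# loop above; the `step_` prefix matches the saving code, but the function
# itself is illustrative and depends on how `agent.save` names its files.
import os

def latest_checkpoint(path):
    steps = [int(file[len('step_'):].split('.')[0])
             for file in os.listdir(path) if file.startswith('step_')]
    if not steps:
        return None
    return os.path.join(path, 'step_' + str(max(steps)))
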
def run(self):
    '''Runs the main training loop.'''

    start_time = last_epoch_time = time.time()

    # Start the environments.
    observations = self.environment.start()
    num_workers = len(observations)
    scores = np.zeros(num_workers)
    lengths = np.zeros(num_workers, int)
    steps, epoch_steps, epochs, episodes = 0, 0, 0, 0

    while steps < self.max_steps:
        # Select actions.
        actions = self.agent.step(observations)
        assert not np.isnan(actions.sum())
        logger.store('train/action', actions, stats=True)

        # Take a step in the environments.
        observations, infos = self.environment.step(actions)
        self.agent.update(**infos)

        scores += infos['rewards']
        lengths += 1
        steps += num_workers
        epoch_steps += num_workers

        # Show the progress bar.
        if self.show_progress:
            logger.show_progress(steps, self.epoch_steps, self.max_steps)

        # Check the finished episodes.
        for i in range(num_workers):
            if infos['resets'][i]:
                logger.store('train/episode_score', scores[i], stats=True)
                logger.store('train/episode_length', lengths[i], stats=True)
                scores[i] = 0
                lengths[i] = 0
                episodes += 1

        # End of the epoch.
        if epoch_steps >= self.epoch_steps:
            # Evaluate the agent on the test environment.
            if self.test_environment:
                self._test()

            # Log the data.
            epochs += 1
            current_time = time.time()
            epoch_time = current_time - last_epoch_time
            sps = epoch_steps / epoch_time
            logger.store('train/episodes', episodes)
            logger.store('train/epochs', epochs)
            logger.store('train/seconds', current_time - start_time)
            logger.store('train/epoch_seconds', epoch_time)
            logger.store('train/epoch_steps', epoch_steps)
            logger.store('train/steps', steps)
            logger.store('train/steps_per_second', sps)
            logger.dump()
            last_epoch_time = time.time()
            epoch_steps = 0

        # Save a checkpoint.
        if self.save_steps and steps % self.save_steps == 0:
            save_name = 'checkpoints/step_{}'.format(steps)
            save_path = os.path.join(logger.get_path(), save_name)
            self.agent.save(save_path)