Example #1
    def reset(self):
        logger.record("debug/body_x", self.robot.body_xyz[0])
        self.robot.step_num = 0
        obs = super().reset()
        obs = np.concatenate([obs, [self.robot.body_real_xyz[0]]])

        return obs
Example #2
    def train(self, gradient_steps: int, batch_size: int = 100) -> None:
        # Update learning rate according to schedule
        self._update_learning_rate(self.policy.optimizer)

        losses = []
        for gradient_step in range(gradient_steps):
            # Sample replay buffer
            replay_data = self.replay_buffer.sample(
                batch_size, env=self._vec_normalize_env)

            with th.no_grad():
                # Compute the target Q values
                target_q = self.q_net_target(replay_data.next_observations)
                # Follow greedy policy: use the one with the highest value
                target_q, _ = target_q.max(dim=1)
                # Avoid potential broadcast issue
                target_q = target_q.reshape(-1, 1)
                if self.loss_type == self.DQN_CLIPPED_LOSS_CONSTANT:
                    target_q_a_max = target_q.clone()
                # 1-step TD target
                target_q = replay_data.rewards + (
                    1 - replay_data.dones) * self.gamma * target_q

            # Get current Q estimates
            current_q = self.q_net(replay_data.observations)

            # Retrieve the q-values for the actions from the replay buffer
            current_q = th.gather(current_q,
                                  dim=1,
                                  index=replay_data.actions.long())

            if self.loss_type == self.DQN_CLIPPED_LOSS_CONSTANT:
                loss = dqn_clipped_loss(current_q, target_q, target_q_a_max,
                                        self.gamma)
            elif self.loss_type == self.DQN_REG_LOSS_CONSTANT:
                loss = dqn_reg_loss(current_q, target_q,
                                    self.dqn_reg_loss_weight)
            else:
                # standard DQN
                # Compute Huber loss (less sensitive to outliers)
                loss = F.smooth_l1_loss(current_q, target_q)

            losses.append(loss.item())

            # Optimize the policy
            self.policy.optimizer.zero_grad()
            loss.backward()
            # Clip gradient norm
            th.nn.utils.clip_grad_norm_(self.policy.parameters(),
                                        self.max_grad_norm)
            self.policy.optimizer.step()

        # Increase update counter
        self._n_updates += gradient_steps

        logger.record("train/n_updates",
                      self._n_updates,
                      exclude="tensorboard")
        logger.record("train/loss", np.mean(losses))
Example #3
    def train(self, gradient_steps: int, batch_size: int = 100) -> None:
        # Update learning rate according to schedule
        self._update_learning_rate(self.policy.optimizer)

        losses = []
        for gradient_step in range(gradient_steps):
            # Sample replay buffer
            replay_data = self.replay_buffer.sample(
                batch_size, env=self._vec_normalize_env)

            with th.no_grad():
                # Compute the quantiles of next observation
                next_quantiles = self.quantile_net_target(
                    replay_data.next_observations)
                # Compute the greedy actions which maximize the next Q values
                next_greedy_actions = next_quantiles.mean(
                    dim=1, keepdim=True).argmax(dim=2, keepdim=True)
                # Make "n_quantiles" copies of actions, and reshape to (batch_size, n_quantiles, 1)
                next_greedy_actions = next_greedy_actions.expand(
                    batch_size, self.n_quantiles, 1)
                # Follow greedy policy: use the one with the highest Q values
                next_quantiles = next_quantiles.gather(
                    dim=2, index=next_greedy_actions).squeeze(dim=2)
                # 1-step TD target
                target_quantiles = replay_data.rewards + (
                    1 - replay_data.dones) * self.gamma * next_quantiles

            # Get current quantile estimates
            current_quantiles = self.quantile_net(replay_data.observations)

            # Make "n_quantiles" copies of actions, and reshape to (batch_size, n_quantiles, 1).
            actions = replay_data.actions[..., None].long().expand(
                batch_size, self.n_quantiles, 1)
            # Retrieve the quantiles for the actions from the replay buffer
            current_quantiles = th.gather(current_quantiles,
                                          dim=2,
                                          index=actions).squeeze(dim=2)

            # Compute Quantile Huber loss, summing over a quantile dimension as in the paper.
            loss = quantile_huber_loss(current_quantiles,
                                       target_quantiles,
                                       sum_over_quantiles=True)
            losses.append(loss.item())

            # Optimize the policy
            self.policy.optimizer.zero_grad()
            loss.backward()
            # Clip gradient norm
            if self.max_grad_norm is not None:
                th.nn.utils.clip_grad_norm_(self.policy.parameters(),
                                            self.max_grad_norm)
            self.policy.optimizer.step()

        # Increase update counter
        self._n_updates += gradient_steps

        logger.record("train/n_updates",
                      self._n_updates,
                      exclude="tensorboard")
        logger.record("train/loss", np.mean(losses))
Example #4
    def train(self,
              gradient_steps: int,
              batch_size: int = 100,
              entropy_tau=0.03) -> None:
        # Update learning rate according to schedule
        self._update_learning_rate(self.policy.optimizer)

        losses = []
        for _ in range(gradient_steps):
            # Sample replay buffer
            replay_data = self.replay_buffer.sample(
                batch_size, env=self._vec_normalize_env)

            with torch.no_grad():
                # Compute the next Q-values using the target network
                next_q_values = self.q_net_target(
                    replay_data.next_observations)

                # calculate entropy term with logsum
                logsum = torch.logsumexp(
                    (next_q_values - next_q_values.max(1)[0].unsqueeze(-1)) /
                    entropy_tau, 1).unsqueeze(-1)
                tau_log_pi_next = next_q_values - next_q_values.max(
                    1)[0].unsqueeze(-1) - entropy_tau * logsum

                pi_target = F.softmax(next_q_values / entropy_tau, dim=1)

                regularized_next_q_values = (
                    pi_target * (next_q_values - tau_log_pi_next) *
                    (1 - replay_data.dones)).sum(1)
                target_q_values = replay_data.rewards + (
                    self.gamma * regularized_next_q_values).unsqueeze(-1)

            # Get current Q-values estimates
            current_q_values = self.q_net(replay_data.observations)

            # Retrieve the q-values for the actions from the replay buffer
            current_q_values = torch.gather(current_q_values,
                                            dim=1,
                                            index=replay_data.actions.long())

            # Compute Huber loss (less sensitive to outliers)
            loss = F.smooth_l1_loss(current_q_values, target_q_values)
            losses.append(loss.item())

            # Optimize the policy
            self.policy.optimizer.zero_grad()
            loss.backward()
            # Clip gradient norm
            torch.nn.utils.clip_grad_norm_(self.policy.parameters(),
                                           self.max_grad_norm)
            self.policy.optimizer.step()

        # Increase update counter
        self._n_updates += gradient_steps

        logger.record("train/n_updates",
                      self._n_updates,
                      exclude="tensorboard")
        logger.record("train/loss", np.mean(losses))
Example #5
    def step_wait(self):
        if self.needs_reset:
            raise RuntimeError(
                'Tried to step vectorized environment that needs reset!')

        obss, rews, dones, infos = self.venv.step_wait()

        self.curr_ep_rewards += rews
        self.curr_ep_lengths += 1

        new_infos = list(infos[:])
        for key in self.curr_ep_data:
            self.curr_ep_data[key] += [info[key] for info in infos]

        for i in range(len(dones)):
            if dones[i]:
                info = infos[i].copy()
                ep_rew = self.curr_ep_rewards[i]
                ep_len = self.curr_ep_lengths[i]
                ep_time = round(time.time() - self.t_start, 6)
                ep_info = {'r': ep_rew, 'l': ep_len, 't': ep_time}
                for key in self.curr_ep_data:
                    # Change in behavior: grab only the values in episode that would be overwritten
                    ep_info[key] = self.curr_ep_data[key][i]
                    self.curr_ep_data[key][i] = 0
                self.episode_rewards.append(ep_rew)
                self.episode_lengths.append(ep_len)
                self.episode_times.append(ep_time)
                self.curr_ep_rewards[i] = 0
                self.curr_ep_lengths[i] = 0
                if self.logger:
                    for key in self.curr_rollout_data:
                        self.curr_rollout_data[key].append(ep_info[key])
                info['episode'] = ep_info
                new_infos[i] = info
        self.total_steps += self.num_envs
        self.step_idx_in_rollout += 1

        if self.step_idx_in_rollout == self.rollout_size:
            if self.logger:
                # Correct the value for time (a bit ugly, I know)
                if 't' in self.curr_rollout_data:
                    self.curr_rollout_data['t'] = [time.time() - self.t_start]
                # Store the average values per rollout
                self.logger.writerow({
                    k: safe_mean(self.curr_rollout_data[k])
                    for k in self.curr_rollout_data
                })
                self.file_handler.flush()
                for key in self.info_keywords:
                    logger.record(key, safe_mean(self.curr_rollout_data[key]))
                for key in self.curr_rollout_data:
                    self.curr_rollout_data[key] = []
                self.step_idx_in_rollout = 0

        return obss, rews, dones, new_infos
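`step_wait` above averages the per-rollout statistics with a `safe_mean` helper that is not shown; a minimal sketch, assuming its only job is to tolerate an empty list at the start of training:

import numpy as np

def safe_mean(arr):
    # Mean that returns NaN instead of warning/crashing on an empty sequence,
    # so averaging an empty rollout window in the wrapper above stays silent.
    return np.nan if len(arr) == 0 else float(np.mean(arr))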
Example #6
 def step(self, a):
     if self.robot.step_num % 1000 == 0:
         for n in range(len(a)):
             logger.record(f"debug/motor_{n}", a[n])
     self.robot.step_num += 1
     obs, r, done, info = self.super_step(a)
     if self.robot.step_num > self.max_episode_steps:
         done = True
     return obs, r, done, info
Example #7
    def train(self) -> None:
        """
        Update policy using the currently gathered
        rollout buffer (one gradient step over whole data).
        """
        # This will only loop once (get all data in one go)
        for rollout_data in self.rollout_buffer.get(batch_size=None):

            actions = rollout_data.actions
            if isinstance(self.env.action_space, spaces.Discrete):
                # Convert discrete action from float to long
                actions = actions.long().flatten()

            # TODO: avoid second computation of everything because of the gradient
            values, log_prob, entropy = self.policy.evaluate_actions(
                rollout_data.observations, actions)
            values = values.flatten()

            # Advantages straight from the rollout buffer (no normalization, as in the original implementation)
            advantages = rollout_data.advantages

            # Policy gradient loss
            policy_loss = -(advantages * log_prob).mean()

            # Value loss using the TD(gae_lambda) target
            value_loss = F.mse_loss(rollout_data.returns, values)

            # Entropy loss favors exploration
            if entropy is None:
                # Approximate entropy when no analytical form
                entropy_loss = -th.mean(-log_prob)
            else:
                entropy_loss = -th.mean(entropy)

            loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss

            # Optimization step
            self.policy.optimizer.zero_grad()
            loss.backward()

            # Clip grad norm
            th.nn.utils.clip_grad_norm_(self.policy.parameters(),
                                        self.max_grad_norm)
            self.policy.optimizer.step()

        explained_var = explained_variance(
            self.rollout_buffer.returns.flatten(),
            self.rollout_buffer.values.flatten())

        logger.record("train/explained_variance", explained_var)
        logger.record("train/entropy_loss", entropy_loss.item())
        logger.record("train/policy_loss", policy_loss.item())
        logger.record("train/value_loss", value_loss.item())
        if hasattr(self.policy, "log_std"):
            logger.record("train/std",
                          th.exp(self.policy.log_std).mean().item())
Example #8
    def train(self, gradient_steps: int, batch_size: int = 100) -> None:

        # Update learning rate according to lr schedule
        self._update_learning_rate([self.actor.optimizer, self.critic.optimizer])

        actor_losses, critic_losses = [], []

        for gradient_step in range(gradient_steps):

            self._n_updates += 1
            # Sample replay buffer
            replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)

            with th.no_grad():
                # Select action according to policy and add clipped noise
                noise = replay_data.actions.clone().data.normal_(0, self.target_policy_noise)
                noise = noise.clamp(-self.target_noise_clip, self.target_noise_clip)
                next_actions = (self.actor_target(replay_data.next_observations) + noise).clamp(-1, 1)

                # Compute the next Q-values: min over all critics targets
                next_q_values = th.cat(self.critic_target(replay_data.next_observations, next_actions), dim=1)
                next_q_values, _ = th.min(next_q_values, dim=1, keepdim=True)
                target_q_values = replay_data.rewards + (1 - replay_data.dones) * self.gamma * next_q_values

            # Get current Q-values estimates for each critic network
            current_q_values = self.critic(replay_data.observations, replay_data.actions)

            # Compute critic loss
            critic_loss = sum([F.mse_loss(current_q, target_q_values) for current_q in current_q_values])
            critic_losses.append(critic_loss.item())

            # Optimize the critics
            self.critic.optimizer.zero_grad()
            critic_loss.backward()
            self.critic.optimizer.step()

            # Delayed policy updates
            if self._n_updates % self.policy_delay == 0:
                # Compute actor loss
                actor_loss = -self.critic.q1_forward(replay_data.observations, self.actor(replay_data.observations)).mean()
                actor_losses.append(actor_loss.item())

                # Optimize the actor
                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                self.actor.optimizer.step()

                polyak_update(self.critic.parameters(), self.critic_target.parameters(), self.value_tau)
                polyak_update(self.actor.parameters(), self.actor_target.parameters(), self.tau)

        logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
        if len(actor_losses) > 0:
            logger.record("train/actor_loss", np.mean(actor_losses))
        logger.record("train/critic_loss", np.mean(critic_losses))
        logger.record("train/critic_value", th.mean(th.stack(current_q_values)).item())
        logger.record("train/target_critic_value", th.mean(target_q_values).item())
Example #9
    def train(self, gradient_steps: int, batch_size: int = 100) -> None:
        # Update learning rate according to schedule
        self._update_learning_rate(self.policy.optimizer)

        losses = []
        for gradient_step in range(gradient_steps):
            # Sample replay buffer
            replay_data = self.replay_buffer.sample(
                batch_size, env=self._vec_normalize_env)

            with th.no_grad():
                # Compute the next Q-values using the target network
                next_q_values = self.q_net_target(
                    replay_data.next_observations)
                current_q_values = self.q_net_target(replay_data.observations)
                # Follow greedy policy: use the one with the highest value
                next_q_values, _ = (next_q_values -
                                    current_q_values).max(dim=1)
                # Avoid potential broadcast issue
                next_q_values = next_q_values.reshape(-1, 1)
                # 1-step TD target
                target_q_values = replay_data.rewards + (
                    1 - replay_data.dones) * self.gamma * next_q_values

            # Get current Q-values estimates
            current_q_values = self.q_net(replay_data.observations)
            next_q_values = self.q_net(replay_data.next_observations)

            # Retrieve the q-values for the actions from the replay buffer
            current_q_values = th.gather(current_q_values,
                                         dim=1,
                                         index=replay_data.actions.long())
            next_q_values = th.gather(next_q_values,
                                      dim=1,
                                      index=replay_data.next_actions.long())

            # Compute Huber loss (less sensitive to outliers)
            loss = F.smooth_l1_loss(next_q_values - current_q_values,
                                    target_q_values)
            losses.append(loss.item())

            # Optimize the policy
            self.policy.optimizer.zero_grad()
            loss.backward()
            # Clip gradient norm
            th.nn.utils.clip_grad_norm_(self.policy.parameters(),
                                        self.max_grad_norm)
            self.policy.optimizer.step()

        # Increase update counter
        self._n_updates += gradient_steps

        logger.record("train/n_updates",
                      self._n_updates,
                      exclude="tensorboard")
        logger.record("train/loss", np.mean(losses))
Example #10
 def _update_learning_rate(
     self,
     optimizers: Union[List[th.optim.Optimizer], th.optim.Optimizer],
 ) -> None:
     super(PPG, self)._update_learning_rate(optimizers)
     logger.record("train/aux_learning_rate",
                   self.aux_lr_schedule(self._current_progress_remaining))
     update_learning_rate(
         self.policy.aux_optimizer,
         self.aux_lr_schedule(self._current_progress_remaining))
Example #11
    def _on_step(self) -> None:
        """
        Update the exploration rate and target network if needed.
        This method is called in ``collect_rollouts()`` after each step in the environment.
        """
        if self.num_timesteps % self.target_update_interval == 0:
            polyak_update(self.quantile_net.parameters(), self.quantile_net_target.parameters(), self.tau)

        self.exploration_rate = self.exploration_schedule(self._current_progress_remaining)
        logger.record("rollout/exploration rate", self.exploration_rate)
Example #12
 def _dump_logs(self) -> None:
     """
     NOTE: Overrides the off-policy algorithm's method.
     Write the log. This is where we write additional values to TensorBoard.
     """
     fps = int(self.num_timesteps / (time.time() - self.start_time))
     if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
         logger.record("rollout/ep_rew", self.ep_info_buffer[-1]["r"])
     logger.record("time/fps", fps)
     # Pass the number of timesteps for tensorboard
     logger.dump(step=self.num_timesteps)
Example #13
    def train(self, gradient_steps: int, batch_size: int = 100) -> None:
        # Update learning rate according to schedule
        self._update_learning_rate(self.policy.optimizer)

        for gradient_step in range(gradient_steps):
            # Sample replay buffer
            replay_data = self.replay_buffer.sample(
                batch_size, env=self._vec_normalize_env)

            with th.no_grad():
                # Compute the target Q values
                target_q = self.q_net_target(replay_data.next_observations)
                # print("replay_data.next_observations",replay_data.next_observations)

                # Follow greedy policy: use the one with the highest value
                target_q, _ = target_q.max(dim=1)
                # Avoid potential broadcast issue
                target_q = target_q.reshape(-1, 1)
                # 1-step TD target
                target_q = replay_data.rewards + (
                    1 - replay_data.dones) * self.gamma * target_q - self.rho

            # TODO: from the paper, page 12:
            # X = (1 - γ) * X_γ^π(s_t, a_t) + γ * [r_t + γ_1 * max_a X_γ^π(s_{t+1}, a) - ρ^π]
            # We completely ignore the (1 - γ) * X_γ^π(s_t, a_t) term and the γ in front of the square bracket
            # --> we only keep what is inside the square bracket; the rest is for exponential smoothing,
            # but we do not need exponential smoothing since we use a neural network.

            # Get current Q estimates
            current_q = self.q_net(replay_data.observations)
            # Retrieve the q-values for the actions from the replay buffer
            current_q = th.gather(current_q,
                                  dim=1,
                                  index=replay_data.actions.long())
            # print("current q values from replay buffer : \n",current_q)
            # Compute Huber loss (less sensitive to outliers)
            loss = F.smooth_l1_loss(current_q, target_q)

            # Optimize the policy
            self.policy.optimizer.zero_grad()
            loss.backward()
            # Clip gradient norm
            th.nn.utils.clip_grad_norm_(self.policy.parameters(),
                                        self.max_grad_norm)
            self.policy.optimizer.step()

        # Increase update counter
        self._n_updates += gradient_steps

        logger.record("train/n_updates",
                      self._n_updates,
                      exclude='tensorboard')
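In code, the translated comment above boils down to the 1-step target actually built at the end of the `no_grad` block; written out as an equation (a hedged reading, with ρ the average-reward offset `self.rho`):

$$ y = r_t + (1 - d_t)\,\gamma \max_{a} Q_{\bar\theta}(s_{t+1}, a) - \rho $$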
Example #14
 def step(self, a):
     if self.robot.step_num % 1000 == 0:
         for n in range(len(a)):
             logger.record(f"debug/motor_{n}", a[n])
     self.robot.step_num += 1
     obs, r, done, info = self.super_step(a)
     if self.robot.step_num > self.max_episode_steps:
         done = True
     if done and self.is_eval:
         logger.record("eval/body_x", self.robot.body_xyz[0])
     if self.isRender:
         self.camera.move_and_look_at(0, 0, 0, self.robot.body_xyz[0], self.robot.body_xyz[1], 1)
     return obs, r, done, info
Example #15
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 4,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "run",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OffPolicyAlgorithm":

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name
        )

        callback.on_training_start(locals(), globals())
        last_tested = 0

        while self.num_timesteps < total_timesteps:
            rollout = self.collect_rollouts(
                self.env,
                train_freq=self.train_freq,
                action_noise=self.action_noise,
                callback=callback,
                learning_starts=self.learning_starts,
                replay_buffer=self.replay_buffer,
                log_interval=log_interval,
            )
            for e in self.env.envs:
                e.env.train_return = rollout.episode_reward

            if rollout.continue_training is False:
                break
            if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
                # If no `gradient_steps` is specified,
                # do as many gradients steps as steps performed during the rollout
                last_tested += 1
                gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
                self.train(batch_size=self.batch_size, gradient_steps=gradient_steps)
                if last_tested > 5:
                    last_tested = 0
                    test_return = self.test(num_episodes=3)
                    logger.record("rollout/test_rew_mean", test_return)

        callback.on_training_end()

        return self
Example #16
    def _update_learning_rate(self, optimizers: Union[List[th.optim.Optimizer], th.optim.Optimizer]) -> None:
        """
        Update the optimizers learning rate using the current learning rate schedule
        and the current progress remaining (from 1 to 0).

        :param optimizers: (Union[List[th.optim.Optimizer], th.optim.Optimizer])
            An optimizer or a list of optimizers.
        """
        # Log the current learning rate
        logger.record("train/learning_rate", self.lr_schedule(self._current_progress_remaining))

        if not isinstance(optimizers, list):
            optimizers = [optimizers]
        for optimizer in optimizers:
            update_learning_rate(optimizer, self.lr_schedule(self._current_progress_remaining))
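`_update_learning_rate` combines a schedule called with the remaining training progress (which decays from 1 to 0) and an `update_learning_rate` helper; a hedged sketch of both, where the linear schedule and the helper body are plausible assumptions rather than the library's exact code:

import torch as th
from typing import Callable

def linear_schedule(initial_value: float) -> Callable[[float], float]:
    # progress_remaining goes from 1 (start of training) to 0 (end of training)
    def schedule(progress_remaining: float) -> float:
        return progress_remaining * initial_value
    return schedule

def update_learning_rate(optimizer: th.optim.Optimizer, learning_rate: float) -> None:
    # Write the new learning rate into every parameter group of the optimizer
    for param_group in optimizer.param_groups:
        param_group["lr"] = learning_rate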
Example #17
    def train(self) -> None:
        """
        Update policy using the currently gathered
        rollout buffer (one gradient step over whole data).
        """
        # Update optimizer learning rate
        self._update_learning_rate(self.policy.optimizer)

        # This will only loop once (get all data in one go)
        for rollout_data in self.rollout_buffer.get(batch_size=None):

            actions = rollout_data.actions
            if isinstance(self.action_space, gym.spaces.Discrete):
                # Convert discrete action from float to long
                actions = actions.long().flatten()

            # TODO: avoid second computation of everything because of the gradient
            values, log_prob, entropy = self.policy.evaluate_actions(
                rollout_data.observations, actions)
            values = values.flatten()

            # Normalize advantage (not present in the original implementation)
            advantages = rollout_data.advantages
            if self.normalize_advantage:
                advantages = (advantages -
                              advantages.mean()) / (advantages.std() + 1e-8)

            # Policy gradient loss
            policy_loss = -(advantages * log_prob).mean()

            # Value loss using the TD(gae_lambda) target
            value_loss = F.mse_loss(rollout_data.returns, values)

            # Entropy loss favors exploration
            if entropy is None:
                # Approximate entropy when no analytical form
                entropy_loss = -torch.mean(-log_prob)
            else:
                entropy_loss = -torch.mean(entropy)

            loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss

            # Optimization step
            self.policy.optimizer.zero_grad()
            loss.backward()

            # Clip grad norm
            torch.nn.utils.clip_grad_norm_(self.policy.parameters(),
                                           self.max_grad_norm)
            self.policy.optimizer.step()

        self._n_updates += 1
        logger.record("train/n_updates",
                      self._n_updates,
                      exclude="tensorboard")
        logger.record("train/entropy_loss", entropy_loss.item())
        logger.record("train/policy_loss", policy_loss.item())
        logger.record("train/value_loss", value_loss.item())
Example #18
    def train(
        self,
        n_epochs: int = 100,
        *,
        on_epoch_end: Callable[[dict], None] = None,
        log_interval: int = 100,
    ):
        """Train with supervised learning for some number of epochs.

        Here an 'epoch' is just a complete pass through the expert transition
        dataset.

        Args:
          n_epochs: number of complete passes made through dataset.
          on_epoch_end: optional callback to run at
            the end of each epoch. Will receive all locals from this function as
            dictionary argument (!!).
          log_interval: log stats after every log_interval batches
        """
        assert self.batch_size >= 1
        samples_so_far = 0
        batch_num = 0
        for epoch_num in trange(n_epochs, desc="BC epoch"):
            while samples_so_far < (epoch_num + 1) * self.expert_dataset.size():
                batch_num += 1
                trans = self.expert_dataset.sample(self.batch_size)
                assert len(trans) == self.batch_size
                samples_so_far += self.batch_size

                obs_tensor = th.as_tensor(trans.obs).to(self.policy.device)
                acts_tensor = th.as_tensor(trans.acts).to(self.policy.device)
                loss, stats_dict = self._calculate_loss(obs_tensor, acts_tensor)

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                stats_dict["epoch_num"] = epoch_num
                stats_dict["n_updates"] = batch_num
                stats_dict["batch_size"] = len(trans)

                if batch_num % log_interval == 0:
                    for k, v in stats_dict.items():
                        logger.record(k, v)
                    logger.dump(batch_num)

            if on_epoch_end is not None:
                on_epoch_end(locals())
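Since `on_epoch_end` receives `locals()` of `train`, a callback can read any of the loop variables by name; a small hypothetical usage example, where `bc_trainer` is an assumed instance of the class above:

def print_progress(train_locals: dict) -> None:
    # epoch_num, batch_num and samples_so_far are locals of train() above
    print(f"epoch {train_locals['epoch_num']}: "
          f"{train_locals['batch_num']} batches, "
          f"{train_locals['samples_so_far']} samples seen")

# Hypothetical call:
# bc_trainer.train(n_epochs=10, on_epoch_end=print_progress, log_interval=100)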
Example #19
    def train(self, gradient_steps: int, batch_size: int = 64) -> None:
        statistics = []
        for gradient_step in range(gradient_steps):
            replay_data = self.replay_buffer.sample(
                batch_size, env=self._vec_normalize_env)
            stats = self.train_batch(replay_data)
            statistics.append(stats)
            self._n_updates += 1
        actor_losses, critic_losses, ent_coef_losses, ent_coefs = tuple(
            zip(*statistics))

        logger.record("train/n_updates",
                      self._n_updates,
                      exclude='tensorboard')
        logger.record("train/ent_coef", np.mean(ent_coefs))
        logger.record("train/actor_loss", np.mean(actor_losses))
        logger.record("train/critic_loss", np.mean(critic_losses))
        logger.record("train/ent_coef_loss", np.mean(ent_coef_losses))
Example #20
    def train(
        self,
        *,
        n_epochs: Optional[int] = None,
        n_batches: Optional[int] = None,
        on_epoch_end: Optional[Callable[[], None]] = None,
        log_interval: int = 100,
    ):
        """Train with supervised learning for some number of epochs.

        Here an 'epoch' is just a complete pass through the expert data loader,
        as set by `self.set_expert_data_loader()`.

        Args:
            n_epochs: Number of complete passes made through expert data before ending
                training. Provide exactly one of `n_epochs` and `n_batches`.
            n_batches: Number of batches loaded from dataset before ending training.
                Provide exactly one of `n_epochs` and `n_batches`.
            on_epoch_end: Optional callback with no parameters to run at the end of each
                epoch.
            log_interval: Log stats after every log_interval batches.
        """
        it = EpochOrBatchIteratorWithProgress(
            self.expert_data_loader,
            n_epochs=n_epochs,
            n_batches=n_batches,
            on_epoch_end=on_epoch_end,
        )

        batch_num = 0
        for batch, stats_dict_it in it:
            loss, stats_dict_loss = self._calculate_loss(
                batch["obs"], batch["acts"])

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            if batch_num % log_interval == 0:
                for stats in [stats_dict_it, stats_dict_loss]:
                    for k, v in stats.items():
                        logger.record(k, v)
                logger.dump(batch_num)
            batch_num += 1
Example #21
 def _log_curriculum(self):
     logger.record("curriculum/current_stage",
                   self._stage,
                   exclude="tensorboard")
     logger.record("curriculum/current_stage_id",
                   self._stage.value,
                   exclude="stdout")
     logger.record("curriculum/current_success_rate", self._success_rate)
     if self._restart_every_n_steps > 0:
         logger.record("curriculum/steps_until_reset",
                       self._reset_step_counter)
Example #22
def test_no_accum(tmpdir):
    logger.configure(tmpdir, ["csv"])
    sb_logger.record("A", 1)
    sb_logger.record("B", 1)
    sb_logger.dump()
    sb_logger.record("A", 2)
    sb_logger.dump()
    sb_logger.record("B", 3)
    sb_logger.dump()
    expect = {"A": [1, 2, ""], "B": [1, "", 3]}
    _compare_csv_lines(osp.join(tmpdir, "progress.csv"), expect)
Example #23
    def pretrain_rl(self, gradient_steps: int, batch_size: int = 64) -> None:
        statistics = []
        with trange(gradient_steps) as t:
            for gradient_step in t:
                replay_data = self.replay_buffer.sample(
                    batch_size, env=self._vec_normalize_env)
                stats = self.train_batch(replay_data)
                statistics.append(stats)
                self._n_updates += 1
                t.set_postfix(qf_loss=stats[1], policy_loss=stats[0])
        actor_losses, critic_losses, ent_coef_losses, ent_coefs = tuple(
            zip(*statistics))

        logger.record("pretrain/n_updates",
                      self._n_updates,
                      exclude='tensorboard')
        logger.record("pretrain/ent_coef", np.mean(ent_coefs))
        logger.record("pretrain/actor_loss", np.mean(actor_losses))
        logger.record("pretrain/critic_loss", np.mean(critic_losses))
        logger.record("pretrain/ent_coef_loss", np.mean(ent_coef_losses))
Example #24
    def train(self):

        total_losses, policy_losses, value_losses, entropy_losses = [], [], [], []

        for epoch in range(self.n_epochs):
            for batch in self.rollout.get(self.batch_size):
                actions = batch.actions.long().flatten()
                old_log_probs = batch.old_log_probs.to(self.device)
                advantages = batch.advantages.to(self.device)
                returns = batch.returns.to(self.device)

                state_values, action_log_probs, entropy = self.policy.evaluate(
                    batch.observations, actions)
                state_values = state_values.squeeze()

                advantages = (advantages -
                              advantages.mean()) / (advantages.std() + 1e-8)

                ratio = torch.exp(action_log_probs - old_log_probs)

                policy_loss_1 = advantages * ratio
                policy_loss_2 = advantages * torch.clamp(
                    ratio, 1 - self.clip_range, 1 + self.clip_range)
                policy_loss = -torch.min(policy_loss_1, policy_loss_2).mean()

                value_loss = F.mse_loss(returns, state_values)

                if entropy is None:
                    entropy_loss = -action_log_probs.mean()
                else:
                    entropy_loss = -torch.mean(entropy)

                loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss

                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.policy.parameters(),
                                               self.max_grad_norm)
                self.optimizer.step()

                total_losses.append(loss.item())
                policy_losses.append(policy_loss.item())
                value_losses.append(value_loss.item())
                entropy_losses.append(entropy_loss.item())

        logger.record("train/entropy_loss", np.mean(entropy_losses))
        logger.record("train/policy_gradient_loss", np.mean(policy_losses))
        logger.record("train/value_loss", np.mean(value_losses))
        logger.record("train/total_loss", np.mean(total_losses))

        self._n_updates += self.n_epochs
Example #25
    def learn(self,
              total_timesteps,
              n_steps,
              n_iter,
              batch_size,
              save_path,
              tb_log_path=None):
        configure_logger(verbose=self.verbose,
                         tensorboard_log=tb_log_path,
                         tb_log_name="HAC",
                         reset_num_timesteps=True)

        step_count = 0
        i_episode = 1
        while step_count <= total_timesteps:
            self.reward = 0
            self.timestep = 0

            state = self.env.reset()
            # collecting experience in environment
            last_state, done, _step_count = self.run_HAC(self.env,
                                                         self.k_level - 1,
                                                         state,
                                                         self.goal_state,
                                                         is_subgoal_test=False)
            step_count += _step_count

            # updating with collected data
            if step_count > n_steps * i_episode:
                vio_num = get_violation_count(self.env)
                if vio_num is not None:
                    logger.record("rollout/violation", vio_num)
                logger.record(f"rollout/ep_rew_mean", self.reward)

                self.update(n_iter, batch_size)
                i_episode += 1

                logger.dump(step_count)

        self.save(save_path)
        return self
Example #26
    def return_terminal(self, reason="Last Date", reward=0):

        state = self.state_memory[-1]
        self.log_step(reason=reason, terminal_reward=reward)
        # Add outputs to logger interface
        reward_pct = self.account_information["total_assets"][
            -1] / self.initial_amount
        logger.record("environment/total_reward_pct", (reward_pct - 1) * 100)
        logger.record(
            "environment/daily_trades",
            self.sum_trades / (self.current_step) / len(self.assets),
        )
        logger.record("environment/completed_steps", self.current_step)
        logger.record("environment/sum_rewards",
                      np.sum(self.account_information["reward"]))
        logger.record(
            "environment/cash_proportion",
            self.account_information["cash"][-1] /
            self.account_information["total_assets"][-1],
        )
        return state, reward, True, {}
Example #27
    def pretrain_bc(self, gradient_steps: int, batch_size: int = 64):
        statistics = []
        with trange(gradient_steps) as t:
            for gradient_step in t:
                replay_data = self.bc_buffer.sample(
                    batch_size, env=self._vec_normalize_env)
                dist = self.actor(replay_data.observations)
                actions_pi, log_prob = dist.log_prob_and_rsample()
                actor_loss = -log_prob.mean()
                actor_mse_loss = F.mse_loss(actions_pi.detach(),
                                            replay_data.actions)

                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                self.actor.optimizer.step()

                statistics.append((actor_loss.item(), actor_mse_loss.item()))
                t.set_postfix(mse_loss=actor_mse_loss.item(),
                              policy_loss=actor_loss.item())
        actor_losses, mse_losses = tuple(zip(*statistics))

        logger.record("pretrain/n_updates",
                      self._n_updates,
                      exclude='tensorboard')
        logger.record("pretrain/actor_loss", np.mean(actor_losses))
        logger.record("pretrain/actor_mse_loss", np.mean(mse_losses))
Example #28
 def print_statistics(self):
     avg_fwd_time = np.average(self.fwd_times)
     avg_bwd_time = np.average(self.bwd_times)
     fwd_to_bwd_ratio = avg_fwd_time / avg_bwd_time
     print("average rollout collection time: %f" % avg_fwd_time)
     print("average update time: %f" % avg_bwd_time)
     print("ratio forward to backward: %f" % fwd_to_bwd_ratio)
     logger.record("rollout collection time average", avg_fwd_time)
     logger.record("learning time average", avg_bwd_time)
     logger.record("ratio collection to learning", fwd_to_bwd_ratio)
Example #29
    def train(self, gradient_steps: int, batch_size: int = 64) -> None:
        # Update optimizers learning rate
        optimizers = [self.actor.optimizer, self.critic.optimizer]
        if self.ent_coef_optimizer is not None:
            optimizers += [self.ent_coef_optimizer]

        # Update learning rate according to lr schedule
        self._update_learning_rate(optimizers)

        ent_coef_losses, ent_coefs = [], []
        actor_losses, critic_losses = [], []

        for gradient_step in range(gradient_steps):
            # Sample replay buffer
            replay_data = self.replay_buffer.sample(
                batch_size, env=self._vec_normalize_env)

            # We need to sample because `log_std` may have changed between two gradient steps
            if self.use_sde:
                self.actor.reset_noise()

            # Action by the current actor for the sampled state
            actions_pi, log_prob = self.actor.action_log_prob(
                replay_data.observations)
            # unique, counts = np.unique(actions_pi.cpu(), return_counts=True)
            # print(dict(zip(unique, counts)))
            log_prob = log_prob.reshape(-1, 1)

            ent_coef_loss = None
            if self.ent_coef_optimizer is not None:
                # Important: detach the variable from the graph
                # so we don't change it with other losses
                # see https://github.com/rail-berkeley/softlearning/issues/60
                ent_coef = th.exp(self.log_ent_coef.detach())
                ent_coef_loss = -(
                    self.log_ent_coef *
                    (log_prob + self.target_entropy).detach()).mean()
                ent_coef_losses.append(ent_coef_loss.item())
            else:
                ent_coef = self.ent_coef_tensor

            ent_coefs.append(ent_coef.item())

            # Optimize entropy coefficient, also called
            # entropy temperature or alpha in the paper
            if ent_coef_loss is not None:
                self.ent_coef_optimizer.zero_grad()
                ent_coef_loss.backward()
                self.ent_coef_optimizer.step()

            with th.no_grad():
                # Select action according to policy
                next_actions, next_log_prob = self.actor.action_log_prob(
                    replay_data.next_observations)
                # Compute the target Q value
                target_q1, target_q2 = self.critic_target(
                    replay_data.next_observations, next_actions)
                target_q = th.min(target_q1, target_q2)
                target_q = replay_data.rewards + (
                    1 - replay_data.dones) * self.gamma * target_q
                # td error + entropy term
                q_backup = target_q - ent_coef * next_log_prob.reshape(-1, 1)

            # Get current Q estimates
            # using action from the replay buffer
            current_q1, current_q2 = self.critic(replay_data.observations,
                                                 replay_data.actions)

            # Compute critic loss
            critic_loss = 0.5 * (F.mse_loss(current_q1, q_backup) +
                                 F.mse_loss(current_q2, q_backup))
            critic_losses.append(critic_loss.item())

            # Optimize the critic
            self.critic.optimizer.zero_grad()
            critic_loss.backward()
            self.critic.optimizer.step()

            # Compute actor loss
            # Alternative: actor_loss = th.mean(log_prob - qf1_pi)
            qf1_pi, qf2_pi = self.critic.forward(replay_data.observations,
                                                 actions_pi)
            min_qf_pi = th.min(qf1_pi, qf2_pi)
            actor_loss = (ent_coef * log_prob - min_qf_pi).mean()
            actor_losses.append(actor_loss.item())

            # Optimize the actor
            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

            # Update target networks
            if gradient_step % self.target_update_interval == 0:
                for param, target_param in zip(
                        self.critic.parameters(),
                        self.critic_target.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1 - self.tau) * target_param.data)

        self._n_updates += gradient_steps

        logger.record("train/n_updates",
                      self._n_updates,
                      exclude='tensorboard')
        logger.record("train/ent_coef", np.mean(ent_coefs))
        logger.record("train/actor_loss", np.mean(actor_losses))
        logger.record("train/critic_loss", np.mean(critic_losses))
        if len(ent_coef_losses) > 0:
            logger.record("train/ent_coef_loss", np.mean(ent_coef_losses))
Example #30
    def train_orig(self) -> None:
        """
        Update policy using the currently gathered rollout buffer.
        """
        # Update optimizer learning rate
        self._update_learning_rate(self.policy.optimizer)
        # Compute current clip range
        clip_range = self.clip_range(self._current_progress_remaining)
        # Optional: clip range for the value function
        if self.clip_range_vf is not None:
            clip_range_vf = self.clip_range_vf(
                self._current_progress_remaining)

        entropy_losses, all_kl_divs = [], []
        pg_losses, value_losses = [], []
        clip_fractions = []

        # train for n_epochs epochs
        for epoch in range(self.n_epochs):
            approx_kl_divs = []
            # Do a complete pass on the rollout buffer
            for rollout_data in self.rollout_buffer.get(self.batch_size):
                actions = rollout_data.actions
                if isinstance(self.action_space, spaces.Discrete):
                    # Convert discrete action from float to long
                    actions = rollout_data.actions.long().flatten()

                # Re-sample the noise matrix because the log_std has changed
                # TODO: investigate why there is no issue with the gradient
                # if that line is commented (as in SAC)
                if self.use_sde:
                    self.policy.reset_noise(self.batch_size)

                values, log_prob, entropy = self.policy.evaluate_actions(
                    rollout_data.observations, actions)
                values = values.flatten()
                # Normalize advantage
                advantages = rollout_data.advantages
                advantages = (advantages -
                              advantages.mean()) / (advantages.std() + 1e-8)

                # ratio between old and new policy, should be one at the first iteration
                ratio = th.exp(log_prob - rollout_data.old_log_prob)

                # clipped surrogate loss
                policy_loss_1 = advantages * ratio
                policy_loss_2 = advantages * th.clamp(ratio, 1 - clip_range,
                                                      1 + clip_range)
                policy_loss = -th.min(policy_loss_1, policy_loss_2).mean()

                # Logging
                pg_losses.append(policy_loss.item())
                clip_fraction = th.mean(
                    (th.abs(ratio - 1) > clip_range).float()).item()
                clip_fractions.append(clip_fraction)

                if self.clip_range_vf is None:
                    # No clipping
                    values_pred = values
                else:
                    # Clip the difference between old and new value
                    # NOTE: this depends on the reward scaling
                    values_pred = rollout_data.old_values + th.clamp(
                        values - rollout_data.old_values, -clip_range_vf,
                        clip_range_vf)
                # Value loss using the TD(gae_lambda) target
                value_loss = F.mse_loss(rollout_data.returns, values_pred)
                value_losses.append(value_loss.item())

                # Entropy loss favors exploration
                if entropy is None:
                    # Approximate entropy when no analytical form
                    entropy_loss = -th.mean(-log_prob)
                else:
                    entropy_loss = -th.mean(entropy)

                entropy_losses.append(entropy_loss.item())

                loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss

                # Optimization step
                self.policy.optimizer.zero_grad()
                loss.backward()
                # Clip grad norm
                th.nn.utils.clip_grad_norm_(self.policy.parameters(),
                                            self.max_grad_norm)
                self.policy.optimizer.step()
                approx_kl_divs.append(
                    th.mean(rollout_data.old_log_prob -
                            log_prob).detach().cpu().numpy())

            all_kl_divs.append(np.mean(approx_kl_divs))

            if self.target_kl is not None and np.mean(
                    approx_kl_divs) > 1.5 * self.target_kl:
                print(
                    f"Early stopping at step {epoch} due to reaching max kl: {np.mean(approx_kl_divs):.2f}"
                )
                break

        self._n_updates += self.n_epochs
        explained_var = explained_variance(
            self.rollout_buffer.values.flatten(),
            self.rollout_buffer.returns.flatten())

        # Logs
        logger.record("train/entropy_loss", np.mean(entropy_losses))
        logger.record("train/policy_gradient_loss", np.mean(pg_losses))
        logger.record("train/value_loss", np.mean(value_losses))
        logger.record("train/approx_kl", np.mean(approx_kl_divs))
        logger.record("train/clip_fraction", np.mean(clip_fractions))
        logger.record("train/loss", loss.item())
        logger.record("train/explained_variance", explained_var)
        if hasattr(self.policy, "log_std"):
            logger.record("train/std",
                          th.exp(self.policy.log_std).mean().item())

        logger.record("train/n_updates",
                      self._n_updates,
                      exclude="tensorboard")
        logger.record("train/clip_range", clip_range)
        if self.clip_range_vf is not None:
            logger.record("train/clip_range_vf", clip_range_vf)