Example No. 1
    def _update_global_policy(self):
        """
        Computes(updates) a new global policy.
        :return:
        """
        dU, dO, T = self.dU, self.dO, self.T
        # Compute the target mean, precision (inverse covariance), and weight
        # for each sample, then concatenate them across conditions.
        obs_data, tgt_mu = ptu.zeros((0, T, dO)), ptu.zeros((0, T, dU))
        tgt_prc, tgt_wt = ptu.zeros((0, T, dU, dU)), ptu.zeros((0, T))
        for m in range(self.M):
            samples = self.cur[m].sample_list
            X = samples['observations']
            N = len(samples)
            traj = self.new_traj_distr[m]
            pol_info = self.cur[m].pol_info
            mu = ptu.zeros((N, T, dU))
            prc = ptu.zeros((N, T, dU, dU))
            wt = ptu.zeros((N, T))
            obs = ptu.FloatTensor(samples['observations'])
            # Get time-indexed actions.
            for t in range(T):
                # Compute actions along this trajectory.
                prc[:, t, :, :] = ptu.FloatTensor(
                    np.tile(traj.inv_pol_covar[t, :, :], [N, 1, 1]))
                for i in range(N):
                    mu[i, t, :] = ptu.FloatTensor(
                        traj.K[t, :, :].dot(X[i, t, :]) + traj.k[t, :])
                wt[:, t] = pol_info.pol_wt[t]

            tgt_mu = torch.cat((tgt_mu, mu))
            tgt_prc = torch.cat((tgt_prc, prc))
            tgt_wt = torch.cat((tgt_wt, wt))
            obs_data = torch.cat((obs_data, obs))

        self.global_policy_optimization(obs_data, tgt_mu, tgt_prc, tgt_wt)
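The nested loops above simply evaluate the local time-varying linear-Gaussian controller, u_t = K_t x_t + k_t, for every sample and tile its action precision; these become the supervised targets passed to global_policy_optimization. A minimal NumPy sketch of that per-time-step target construction (dimensions and variable names here are illustrative, not taken from the repository):

import numpy as np

# Hypothetical toy dimensions: state dim, action dim, number of samples.
dX, dU, N = 4, 2, 3
K_t = np.random.randn(dU, dX)           # feedback gains, i.e. traj.K[t]
k_t = np.random.randn(dU)               # feedforward term, i.e. traj.k[t]
inv_pol_covar_t = np.eye(dU)            # action precision, i.e. traj.inv_pol_covar[t]
X_t = np.random.randn(N, dX)            # states of the N samples at time t

# Target mean actions: mu_i = K_t x_i + k_t, one row per sample.
mu_t = X_t.dot(K_t.T) + k_t             # shape (N, dU)

# Target precision: the same matrix tiled over the N samples, matching
# np.tile(traj.inv_pol_covar[t], [N, 1, 1]) in the method above.
prc_t = np.tile(inv_pol_covar_t, (N, 1, 1))   # shape (N, dU, dU)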
Example No. 2
    def __init__(
            self,
            env,
            policy,
            explo_policy,
            u_qf,
            replay_buffer,
            batch_size=1024,
            normalize_obs=False,
            eval_env=None,
            i_qf=None,
            action_prior='uniform',
            policy_lr=3e-4,
            qf_lr=1e-4,
            i_policy_pre_activation_weight=0.,
            i_policy_mixing_coeff_weight=1e-3,
            u_policy_pre_activation_weight=None,
            policy_weight_decay=0.,
            qf_weight_decay=0.,
            optimizer='adam',
            # optimizer='rmsprop',
            # optimizer='sgd',
            optimizer_kwargs=None,
            i_soft_target_tau=1e-2,
            u_soft_target_tau=1e-2,
            i_target_update_interval=1,
            u_target_update_interval=1,
            reward_scale=1.,
            u_reward_scales=None,
            min_q_value=-np.inf,
            max_q_value=np.inf,
            residual_gradient_weight=0,
            eval_with_target_policy=False,
            save_replay_buffer=False,
            log_tensorboard=False,
            **kwargs):

        # ###### #
        # Models #
        # ###### #

        # Deterministic Policies
        self._policy = policy
        self._target_policy = policy.copy()

        # Exploration Policy
        self._exploration_policy = explo_policy

        # Evaluation Policy
        if eval_with_target_policy:
            eval_policy = self._target_policy
        else:
            eval_policy = self._policy

        # Observation Normalizer
        if normalize_obs:
            self._obs_normalizer = RunningNormalizer(shape=env.obs_dim)
        else:
            self._obs_normalizer = None

        RLAlgorithm.__init__(self,
                             explo_env=env,
                             explo_policy=self._exploration_policy,
                             eval_env=eval_env,
                             eval_policy=eval_policy,
                             obs_normalizer=self._obs_normalizer,
                             **kwargs)

        # Number of Unintentional Tasks (Composable Tasks)
        self._n_unintentional = self._policy.n_heads

        # Evaluation samplers (one per unintentional task)
        self.eval_u_samplers = [
            InPlacePathSampler(
                env=env,
                policy=WeightedMultiPolicySelector(eval_policy, idx),
                total_samples=self.num_steps_per_eval,
                max_path_length=self.max_path_length,
                deterministic=None,
            ) for idx in range(self._n_unintentional)
        ]

        # Important algorithm hyperparameters
        self._action_prior = action_prior

        # Intentional (Main Task) Q-function
        self._i_qf = i_qf
        self._i_target_qf = i_qf.copy()

        # Unintentional (Composable Tasks) Q-functions
        self._u_qf = u_qf
        self._u_target_qf = u_qf.copy()

        self._min_q_value = min_q_value
        self._max_q_value = max_q_value
        self._residual_gradient_weight = residual_gradient_weight

        # Soft-update rates for the target Q-functions
        self._i_soft_target_tau = i_soft_target_tau
        self._u_soft_target_tau = u_soft_target_tau
        self._i_target_update_interval = i_target_update_interval
        self._u_target_update_interval = u_target_update_interval

        # Reward Scales
        self.reward_scale = reward_scale
        if u_reward_scales is None:
            # Default every composable task to the intentional reward scale
            # (reward_scale is an explicit argument, so it is not in **kwargs).
            u_reward_scales = [
                reward_scale for _ in range(self._n_unintentional)
            ]
        self._u_reward_scales = ptu.FloatTensor(u_reward_scales)

        # Replay Buffer
        self.replay_buffer = replay_buffer
        self.batch_size = batch_size
        self.save_replay_buffer = save_replay_buffer

        # ########## #
        # Optimizers #
        # ########## #
        if optimizer.lower() == 'adam':
            optimizer_class = optim.Adam
            if optimizer_kwargs is None:
                optimizer_kwargs = dict(amsgrad=True)
        elif optimizer.lower() == 'rmsprop':
            optimizer_class = optim.RMSprop
            if optimizer_kwargs is None:
                optimizer_kwargs = dict()
        else:
            raise ValueError('Unsupported optimizer: %s' % optimizer)
        self._qf_lr = qf_lr
        self._policy_lr = policy_lr

        # Q-function optimization criteria
        self._u_qf_criterion = nn.MSELoss()
        self._i_qf_criterion = nn.MSELoss()

        # Q-function optimizers
        self._u_qf_optimizer = optimizer_class(self._u_qf.parameters(),
                                               lr=qf_lr,
                                               weight_decay=qf_weight_decay,
                                               **optimizer_kwargs)
        self._i_qf_optimizer = optimizer_class(self._i_qf.parameters(),
                                               lr=qf_lr,
                                               weight_decay=qf_weight_decay,
                                               **optimizer_kwargs)

        # Policy optimizer
        self._policy_optimizer = optimizer_class(
            self._policy.parameters(),
            lr=policy_lr,
            weight_decay=policy_weight_decay,
            **optimizer_kwargs)

        # Policy regularization coefficients (weights)
        self._i_pol_pre_activ_weight = i_policy_pre_activation_weight
        self._i_pol_mixing_coeff_weight = i_policy_mixing_coeff_weight

        if u_policy_pre_activation_weight is None:
            u_policy_pre_activation_weight = [
                i_policy_pre_activation_weight
                for _ in range(self._n_unintentional)
            ]
        self._u_policy_pre_activ_weight = \
            ptu.FloatTensor(u_policy_pre_activation_weight)

        # Useful Variables for logging
        self.log_data = dict()
        self.log_data['Raw Pol Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Pol Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Qf Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Rewards'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Policy Action'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
            self.explo_env.action_dim,
        ))
        self.log_data['Mixing Weights'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional,
            self.explo_env.action_dim,
        ))

        # Tensorboard-like Logging
        self._log_tensorboard = log_tensorboard
        if log_tensorboard:
            self._summary_writer = \
                tensorboardX.SummaryWriter(log_dir=logger.get_snapshot_dir())
        else:
            self._summary_writer = None
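The constructor only stores the soft-target rates (_i_soft_target_tau, _u_soft_target_tau) and the corresponding update intervals; the target update itself is not shown here. For reference, such rates are typically consumed by a Polyak-style soft update along the lines of the sketch below (the function name and usage are assumptions for illustration, not the repository's API):

import torch
import torch.nn as nn

def soft_update(source_net, target_net, tau):
    """Polyak averaging: target <- (1 - tau) * target + tau * source."""
    with torch.no_grad():
        for src_p, tgt_p in zip(source_net.parameters(), target_net.parameters()):
            tgt_p.data.mul_(1.0 - tau).add_(tau * src_p.data)

# Toy usage with hypothetical networks; in this class it would look like
# soft_update(self._i_qf, self._i_target_qf, self._i_soft_target_tau),
# executed every _i_target_update_interval training steps.
qf, target_qf = nn.Linear(4, 1), nn.Linear(4, 1)
soft_update(qf, target_qf, tau=1e-2)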
Example No. 3
    def _update_softq_fcn(self, batch, unint_idx=None):
        """
        Q-fcn update
        Args:
            batch:

        Returns:

        """

        obs = batch['observations']
        actions = batch['actions']
        next_obs = batch['next_observations']
        if unint_idx is None:
            rewards = batch['rewards']
        else:
            rewards = batch['reward_vectors'][:, unint_idx].unsqueeze(-1) \
                      * self.reward_scale
        terminals = batch['terminals']
        n_batch = obs.shape[0]

        if unint_idx is None:
            target_q_fcn = self._i_target_qf
            q_fcn = self._i_qf
            q_fcn_optimizer = self._i_qf_optimizer
        else:
            target_q_fcn = self._u_target_qfs[unint_idx]
            q_fcn = self._u_qfs[unint_idx]
            q_fcn_optimizer = self._u_qf_optimizers[unint_idx]

        # The soft value of the next state is approximated with uniformly
        # sampled action particles.
        uniform_dist = torch.distributions.Uniform(ptu.FloatTensor([-1.0]),
                                                   ptu.FloatTensor([1.0]))
        target_actions = uniform_dist.sample(
            (self._value_n_particles, self._action_dim)).squeeze()
        q_value_targets = \
            target_q_fcn(
                next_obs.unsqueeze(1).expand(n_batch,
                                             self._value_n_particles,
                                             self._obs_dim),
                target_actions.unsqueeze(0).expand(n_batch,
                                                   self._value_n_particles,
                                                   self._action_dim)
            ).squeeze()
        assert_shape(q_value_targets, [n_batch, self._value_n_particles])

        q_values = q_fcn(obs, actions).squeeze()
        assert_shape(q_values, [n_batch])

        # Equation 10: empirical estimate of V_soft (log-sum-exp over the
        # Q-value target particles)
        next_value = log_sum_exp(q_value_targets.squeeze(), dim=1)
        assert_shape(next_value, [n_batch])

        # The importance weights of the uniform proposal only add a constant
        # to the value.
        next_value -= torch.log(ptu.FloatTensor([self._value_n_particles]))
        next_value += self._action_dim * np.log(2)

        # \hat Q in Equation 11 (reward scaling is already applied upstream)
        ys = (
            rewards.squeeze() +
            (1 - terminals.squeeze()) * self.discount * next_value
        ).detach()  # Detach so no gradient flows back through the target
        assert_shape(ys, [n_batch])

        # Equation 11: Soft-Bellman error
        bellman_residual = 0.5 * torch.mean((ys - q_values)**2)

        # Gradient step on the Q-function parameters
        q_fcn_optimizer.zero_grad()  # Zero the Q-function gradients
        bellman_residual.backward()  # Backpropagate the soft-Bellman residual
        q_fcn_optimizer.step()  # Update the Q-function parameters

        return bellman_residual
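The two constants added to next_value are the importance-sampling correction for the uniform proposal: with K particles drawn from Uniform([-1, 1]^d), whose density is 2^(-d), the estimator is V_soft(s') ≈ log((1/K) * sum_i exp(Q(s', a_i)) / 2^(-d)) = logsumexp_i Q(s', a_i) - log K + d * log 2. A small self-contained check of that arithmetic with toy tensors (torch.logsumexp stands in for the repository's log_sum_exp helper):

import numpy as np
import torch

n_batch, n_particles, action_dim = 5, 16, 2
q_value_targets = torch.randn(n_batch, n_particles)  # Q(s', a_i) for uniform a_i

# V_soft(s') ~= logsumexp_i Q(s', a_i) - log K + d * log 2
next_value = torch.logsumexp(q_value_targets, dim=1)
next_value -= torch.log(torch.tensor(float(n_particles)))  # average over the K particles
next_value += action_dim * np.log(2)                       # 1 / p(a) for Uniform([-1, 1]^d)
print(next_value.shape)  # torch.Size([5])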
Example No. 4
    def _update_q_fcn(self, batch, demon):
        """
        Q-fcn update
        Args:
            batch:

        Returns:

        """
        obs = batch['observations']
        actions = batch['actions']
        next_obs = batch['next_observations']
        # The reward for this demon is the corresponding entry of the reward vector
        rewards = batch['reward_vectors'][:, demon].unsqueeze(-1)
        terminals = batch['terminals']
        n_batch = obs.shape[0]

        # The value of the next state is approximated with uniform samples.
        uniform_dist = torch.distributions.Uniform(ptu.FloatTensor([-1.0]),
                                                   ptu.FloatTensor([1.0]))
        target_actions = uniform_dist.sample(
            (self._value_n_particles, self._action_dim)).squeeze()

        q_value_targets = \
            self.target_qfs[demon](
                next_obs.unsqueeze(1).expand(n_batch,
                                             self._value_n_particles,
                                             self._obs_dim),
                target_actions.unsqueeze(0).expand(n_batch,
                                                   self._value_n_particles,
                                                   self._action_dim)
            ).squeeze()
        assert_shape(q_value_targets, [n_batch, self._value_n_particles])

        q_values = self.qfs[demon](obs, actions).squeeze()
        assert_shape(q_values, [n_batch])

        # Equation 10: empirical estimate of V_soft (log-sum-exp over the
        # Q-value target particles)
        next_value = log_sum_exp(q_value_targets.squeeze(), dim=1)
        assert_shape(next_value, [n_batch])

        # The importance weights of the uniform proposal only add a constant
        # to the value.
        next_value -= torch.log(ptu.FloatTensor([self._value_n_particles]))
        next_value += self._action_dim * np.log(2)

        # \hat Q in Equation 11
        ys = (
            self.reward_scale * rewards.squeeze() +                  # Current (scaled) reward
            (1 - terminals.squeeze()) * self.discount * next_value   # Future soft value
        ).detach()  # Detach so no gradient flows back through the target
        assert_shape(ys, [n_batch])

        # Equation 11: soft-Bellman error
        bellman_residual = 0.5 * torch.mean((ys - q_values)**2)

        # Gradient step on this demon's Q-function parameters
        self.qf_optimizers[demon].zero_grad()  # Zero the Q-function gradients
        bellman_residual.backward()  # Backpropagate the soft-Bellman residual
        self.qf_optimizers[demon].step()  # Update the Q-function parameters

        return bellman_residual
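For completeness, the regression target and loss at the end of both updates reduce to a standard soft-Bellman step: y = r_scaled + (1 - done) * gamma * V_soft(s'), detached from the graph, and L = 0.5 * E[(y - Q(s, a))^2]. A minimal standalone sketch with toy tensors (all values hypothetical):

import torch

n_batch = 5
rewards = torch.randn(n_batch)
terminals = torch.zeros(n_batch)
next_value = torch.randn(n_batch)  # V_soft(s') from the log-sum-exp estimate
q_values = torch.randn(n_batch, requires_grad=True)
discount, reward_scale = 0.99, 1.0

# \hat Q (Equation 11): detached so no gradient flows through the target.
ys = (reward_scale * rewards + (1 - terminals) * discount * next_value).detach()

# Equation 11: soft-Bellman error; only q_values receives a gradient.
bellman_residual = 0.5 * torch.mean((ys - q_values) ** 2)
bellman_residual.backward()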