def _create_full_tensors(start_states, max_path_length, obs_dim, action_dim):
    num_rollouts = start_states.shape[0]
    observations = ptu.zeros((num_rollouts, max_path_length + 1, obs_dim))
    observations[:, 0] = ptu.from_numpy(start_states)
    actions = ptu.zeros((num_rollouts, max_path_length, action_dim))
    rewards = ptu.zeros((num_rollouts, max_path_length, 1))
    terminals = ptu.zeros((num_rollouts, max_path_length, 1))
    return observations, actions, rewards, terminals
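A minimal usage sketch (assuming `ptu` is rlkit's `rlkit.torch.pytorch_util` module and the start states come from a NumPy array; the shapes below are illustrative):

import numpy as np

# Hypothetical: pre-allocate buffers for 16 rollouts of length 50 in an
# environment with 10-dim observations and 4-dim actions.
start_states = np.zeros((16, 10), dtype=np.float32)
obs, acts, rews, terms = _create_full_tensors(
    start_states, max_path_length=50, obs_dim=10, action_dim=4)
# Observations get one extra time step so obs[:, t + 1] can store the
# successor state reached after acts[:, t].
assert obs.shape == (16, 51, 10) and acts.shape == (16, 50, 4)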
Example #2
    def get_plan_values_batch(self, obs, plans):
        """
        Get corresponding values of the plans (higher corresponds to better plans). Classes
        that don't want to plan over actions or use trajectory sampling can reimplement
        convert_plans_to_actions (& convert_plan_to_action) and/or predict_transition.
        plans is input as as torch (horizon_length, num_particles (total), plan_dim).
        We maintain trajectory infos as torch (n_part, info_dim (ex. obs_dim)).
        """

        if self.use_gt_model:
            return self.get_plan_values_batch_gt(obs, plans)

        # n_part is the *total* number of particles, NOT num_particles
        n_part = plans.shape[1]

        discount = 1
        returns, dones, infos = ptu.zeros(n_part), ptu.zeros(n_part), dict()

        # The effective planning horizon is self.horizon * self.repeat_length
        for t in range(self.horizon):
            for k in range(self.repeat_length):
                cur_actions = self.convert_plans_to_actions(obs, plans[t])
                obs, cur_rewards, cur_dones = self.predict_transition(
                    obs, cur_actions, infos)
                returns += discount * (1 - dones) * cur_rewards
                discount *= self.discount
                if self.predict_terminal:
                    dones = torch.max(dones, cur_dones.float())

        self.diagnostics.update(
            create_stats_ordered_dict(
                'MPC Termination',
                ptu.get_numpy(dones),
            ))

        if self.value_func is not None:
            terminal_values = self.value_func(
                obs, **self.value_func_kwargs).view(-1)
            returns += discount * (1 - dones) * terminal_values

            self.diagnostics.update(
                create_stats_ordered_dict(
                    'MPC Terminal Values',
                    ptu.get_numpy(terminal_values),
                ))

        return returns
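A hypothetical call pattern for this method, showing the tensor layout it expects (`planner`, `current_obs`, and the sizes below are illustrative assumptions, not part of the example above):

# 400 candidate plans over a horizon of 5, each plan step 6-dimensional,
# evaluated from 400 copies of the current observation.
plans = ptu.randn((5, 400, 6))                       # (horizon_length, n_part, plan_dim)
obs = current_obs.unsqueeze(0).repeat(400, 1)        # (n_part, obs_dim)
values = planner.get_plan_values_batch(obs, plans)   # (n_part,)
best_plan = plans[:, values.argmax()]                # (horizon_length, plan_dim)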
Example #3
    def __init__(
            self,
            hidden_sizes,
            output_size,
            input_size,
            init_w=3e-3,
            hidden_activation=F.relu,
            output_activation=identity,
            hidden_init=ptu.fanin_init,
            w_scale=1,
            b_init_value=0.1,
            layer_norm=False,
            batch_norm=False,
            final_init_scale=None,
    ):
        super().__init__()

        self.input_size = input_size
        self.output_size = output_size
        self.hidden_activation = hidden_activation
        self.output_activation = output_activation
        self.layer_norm = layer_norm
        self.batch_norm = batch_norm

        self.fcs = []
        self.layer_norms = []
        self.batch_norms = []

        # data normalization
        self.input_mu = nn.Parameter(
            ptu.zeros(input_size), requires_grad=False).float()
        self.input_std = nn.Parameter(
            ptu.ones(input_size), requires_grad=False).float()

        in_size = input_size
        for i, next_size in enumerate(hidden_sizes):
            fc = nn.Linear(in_size, next_size)
            hidden_init(fc.weight, w_scale)
            fc.bias.data.fill_(b_init_value)
            self.__setattr__("fc{}".format(i), fc)
            self.fcs.append(fc)

            if self.layer_norm:
                ln = LayerNorm(next_size)
                self.__setattr__("layer_norm{}".format(i), ln)
                self.layer_norms.append(ln)

            if self.batch_norm:
                bn = nn.BatchNorm1d(next_size)
                self.__setattr__('batch_norm%d' % i, bn)
                self.batch_norms.append(bn)

            in_size = next_size

        self.last_fc = nn.Linear(in_size, output_size)
        if final_init_scale is None:
            self.last_fc.weight.data.uniform_(-init_w, init_w)
            self.last_fc.bias.data.uniform_(-init_w, init_w)
        else:
            ptu.orthogonal_init(self.last_fc.weight, final_init_scale)
            self.last_fc.bias.data.fill_(0)
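A hypothetical instantiation of this constructor (the class name `Mlp` is an assumption about the enclosing class, in the style of rlkit):

# A 2-layer MLP mapping 17-dim inputs to 6-dim outputs, with layer norm
# on the hidden layers and the default fan-in initialization.
q_net = Mlp(
    hidden_sizes=[256, 256],
    output_size=6,
    input_size=17,
    layer_norm=True,
)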
Example #4
    def get_plan_values(self, obs, plans):
        n_part, batch_size = plans.shape[1], 32768

        returns = ptu.zeros(n_part)
        bi, ei = 0, batch_size
        while bi < n_part:
            returns[bi:ei] = self.get_plan_values_batch(
                obs[bi:ei], plans[:, bi:ei])
            bi, ei = bi + batch_size, ei + batch_size

        return returns
Example #5
    def get_plan_values_batch_gt(self, obs, plans):
        returns = ptu.zeros(plans.shape[1])
        obs, plans = ptu.get_numpy(obs), ptu.get_numpy(plans)
        final_obs = np.copy(obs)
        for i in range(plans.shape[1]):
            returns[i], final_obs[i] = self._get_true_env_value(
                obs[i], plans[:, i])
        if self.value_func is not None:
            terminal_values = self.value_func(
                ptu.from_numpy(final_obs), **self.value_func_kwargs)
            discount = self.discount ** (self.horizon * self.repeat_length)
            returns += discount * terminal_values
        return returns
Example #6
    def __init__(
            self,
            ensemble_size,
            hidden_sizes,
            input_size,
            output_size,
            init_w=3e-3,
            hidden_activation=F.relu,
            output_activation=identity,
            b_init_value=0.0,
            layer_norm=False,
            layer_norm_kwargs=None,
            spectral_norm=False,
    ):
        super().__init__()

        self.ensemble_size = ensemble_size
        self.input_size = input_size
        self.output_size = output_size
        self.elites = [i for i in range(self.ensemble_size)]

        self.hidden_activation = hidden_activation
        self.output_activation = output_activation

        # data normalization
        self.input_mu = nn.Parameter(
            ptu.zeros(input_size), requires_grad=False).float()
        self.input_std = nn.Parameter(
            ptu.ones(input_size), requires_grad=False).float()

        self.fcs = []

        in_size = input_size
        for i, next_size in enumerate(hidden_sizes):
            fc = ParallelizedLayer(
                ensemble_size, in_size, next_size,
                w_std_value=1/(2*np.sqrt(in_size)),
                b_init_value=b_init_value,
            )
            if spectral_norm:
                fc = nn.utils.spectral_norm(fc, name='W')
            self.__setattr__('fc%d' % i, fc)
            self.fcs.append(fc)
            in_size = next_size

        self.last_fc = ParallelizedLayer(
            ensemble_size, in_size, output_size,
            w_std_value=1/(2*np.sqrt(in_size)),
            b_init_value=b_init_value,
        )
Example #7
    def __init__(
        self,
        ensemble_size,
        input_dim,
        output_dim,
        w_std_value=1.0,
        b_init_value=0.0
    ):
        super().__init__()

        # approximation to truncated normal of 2 stds
        w_init = ptu.randn((ensemble_size, input_dim, output_dim))
        w_init = torch.fmod(w_init, 2) * w_std_value
        self.W = nn.Parameter(w_init, requires_grad=True)

        # constant initialization
        b_init = ptu.zeros((ensemble_size, 1, output_dim)).float()
        b_init += b_init_value
        self.b = nn.Parameter(b_init, requires_grad=True)
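The matching forward pass for this layer is a batched matrix multiply over the ensemble dimension; the sketch below is an assumption about how such a layer is typically used, not code from the example above:

    def forward(self, x):
        # x: (ensemble_size, batch_size, input_dim)
        # W: (ensemble_size, input_dim, output_dim), b: (ensemble_size, 1, output_dim)
        # torch.bmm performs one matmul per ensemble member; the bias broadcasts
        # over the batch dimension.
        return torch.bmm(x, self.W) + self.b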
Example #8
    def rsample_with_pretanh(self):
        z = (self.normal_mean + self.normal_std * MultivariateDiagonalNormal(
            ptu.zeros(self.normal_mean.size()),
            ptu.ones(self.normal_std.size())).sample())
        return torch.tanh(z), z
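This is the reparameterization trick written out by hand: the noise comes from a fixed standard normal, so gradients flow through `normal_mean` and `normal_std` even though `.sample()` is used. An equivalent sketch using plain `torch.distributions` (an illustration only; the method name is hypothetical):

    def rsample_with_pretanh_alt(self):
        # rsample() applies mean + std * eps internally, matching the manual
        # reparameterization above.
        z = torch.distributions.Normal(self.normal_mean, self.normal_std).rsample()
        return torch.tanh(z), z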
Example #9
    def forward(self, input):
        preds = ptu.zeros((len(self.models), *input.shape[:-1], self.output_size))
        for i, model in enumerate(self.models):
            preds[i] = model(input)
        return preds
Example #10
    def train_from_torch(self, batch):
        obs = batch['observations']
        next_obs = batch['next_observations']
        actions = batch['actions']
        rewards = batch['rewards']
        terminals = batch.get('terminals', ptu.zeros(rewards.shape[0], 1))

        """
        Policy and Alpha Loss
        """
        _, policy_mean, policy_logstd, *_ = self.policy(obs)
        dist = TanhNormal(policy_mean, policy_logstd.exp())
        new_obs_actions, log_pi = dist.rsample_and_logprob()
        log_pi = log_pi.sum(dim=-1, keepdim=True)
        if self.use_automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
            alpha = self.log_alpha.exp()
        else:
            alpha_loss = 0
            alpha = 1

        q_new_actions = torch.min(
            self.qf1(obs, new_obs_actions),
            self.qf2(obs, new_obs_actions),
        )
        policy_loss = (alpha * log_pi - q_new_actions).mean()

        """
        QF Loss
        """
        q1_pred = self.qf1(obs, actions)
        q2_pred = self.qf2(obs, actions)
        _, next_policy_mean, next_policy_logstd, *_ = self.policy(next_obs)
        next_dist = TanhNormal(next_policy_mean, next_policy_logstd.exp())
        new_next_actions, new_log_pi = next_dist.rsample_and_logprob()
        new_log_pi = new_log_pi.sum(dim=-1, keepdim=True)
        target_q_values = torch.min(
            self.target_qf1(next_obs, new_next_actions),
            self.target_qf2(next_obs, new_next_actions),
        ) - alpha * new_log_pi

        future_values = (1. - terminals) * self.discount * target_q_values
        q_target = self.reward_scale * rewards + future_values
        qf1_loss = self.qf_criterion(q1_pred, q_target.detach())
        qf2_loss = self.qf_criterion(q2_pred, q_target.detach())

        if self.use_automatic_entropy_tuning:
            self.alpha_optimizer.zero_grad()
            alpha_loss.backward()
            self.alpha_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        self.qf1_optimizer.zero_grad()
        qf1_loss.backward()
        self.qf1_optimizer.step()

        self.qf2_optimizer.zero_grad()
        qf2_loss.backward()
        self.qf2_optimizer.step()

        self._n_train_steps_total += 1

        self.try_update_target_networks()

        """
        Save some statistics for eval
        """
        if self._need_to_update_eval_statistics:
            self._need_to_update_eval_statistics = False

            policy_loss = (log_pi - q_new_actions).mean()
            policy_avg_std = torch.exp(policy_logstd).mean()

            self.eval_statistics['QF1 Loss'] = np.mean(ptu.get_numpy(qf1_loss))
            self.eval_statistics['QF2 Loss'] = np.mean(ptu.get_numpy(qf2_loss))
            self.eval_statistics['Policy Loss'] = np.mean(ptu.get_numpy(
                policy_loss
            ))
            self.eval_statistics.update(create_stats_ordered_dict(
                'Q1 Predictions',
                ptu.get_numpy(q1_pred),
            ))
            self.eval_statistics.update(create_stats_ordered_dict(
                'Q2 Predictions',
                ptu.get_numpy(q2_pred),
            ))
            self.eval_statistics.update(create_stats_ordered_dict(
                'Q Targets',
                ptu.get_numpy(q_target),
            ))
            self.eval_statistics.update(create_stats_ordered_dict(
                'Log Pis',
                ptu.get_numpy(log_pi),
            ))
            self.eval_statistics.update(create_stats_ordered_dict(
                'Policy mu',
                ptu.get_numpy(policy_mean),
            ))
            self.eval_statistics.update(create_stats_ordered_dict(
                'Policy log std',
                ptu.get_numpy(policy_logstd),
            ))
            self.eval_statistics['Policy std'] = np.mean(ptu.get_numpy(policy_avg_std))

            if self.use_automatic_entropy_tuning:
                self.eval_statistics['Alpha'] = alpha.item()
                self.eval_statistics['Alpha Loss'] = alpha_loss.item()

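The target-network maintenance called above (`try_update_target_networks`) is not shown in this example. A minimal sketch, assuming the usual rlkit-style Polyak update every `target_update_period` steps:

    def try_update_target_networks(self):
        # Assumed behavior: soft-update the online Q-networks into the targets.
        if self._n_train_steps_total % self.target_update_period == 0:
            ptu.soft_update_from_to(self.qf1, self.target_qf1, self.soft_target_tau)
            ptu.soft_update_from_to(self.qf2, self.target_qf2, self.soft_target_tau)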
Example #11
    def __init__(
            self,
            env,                                # Associated environment for learning
            policy,                             # Associated policy (should be TanhGaussian)
            qf1,                                # Q function #1
            qf2,                                # Q function #2
            target_qf1,                         # Slow updater to Q function #1
            target_qf2,                         # Slow updater to Q function #2

            discount=0.99,                      # Discount factor
            reward_scale=1.0,                   # Scaling of rewards to modulate entropy bonus
            use_automatic_entropy_tuning=True,  # Whether to use the entropy-constrained variant
            target_entropy=None,                # Target entropy for entropy-constraint variant

            policy_lr=3e-4,                     # Learning rate of policy and entropy weight
            qf_lr=3e-4,                         # Learning rate of Q functions
            optimizer_class=optim.Adam,         # Class of optimizer for all networks

            soft_target_tau=5e-3,               # Rate of update of target networks
            target_update_period=1,             # How often to update target networks
    ):
        super().__init__()

        self.env = env
        self.policy = policy
        self.qf1 = qf1
        self.qf2 = qf2
        self.target_qf1 = target_qf1
        self.target_qf2 = target_qf2

        self.discount = discount
        self.reward_scale = reward_scale
        self.soft_target_tau = soft_target_tau
        self.target_update_period = target_update_period

        self.use_automatic_entropy_tuning = use_automatic_entropy_tuning
        if self.use_automatic_entropy_tuning:
            if target_entropy is not None:
                self.target_entropy = target_entropy
            else:
                # Heuristic value: dimension of action space
                self.target_entropy = -np.prod(self.env.action_space.shape).item()
            self.log_alpha = ptu.zeros(1, requires_grad=True)
            self.alpha_optimizer = optimizer_class(
                [self.log_alpha],
                lr=policy_lr,
            )

        self.qf_criterion = nn.MSELoss()
        self.vf_criterion = nn.MSELoss()

        self.policy_optimizer = optimizer_class(
            self.policy.parameters(),
            lr=policy_lr,
        )
        self.qf1_optimizer = optimizer_class(
            self.qf1.parameters(),
            lr=qf_lr,
        )
        self.qf2_optimizer = optimizer_class(
            self.qf2.parameters(),
            lr=qf_lr,
        )

        self.eval_statistics = OrderedDict()
        self._n_train_steps_total = 0
        self._need_to_update_eval_statistics = True