def _create_full_tensors(start_states, max_path_length, obs_dim, action_dim):
    num_rollouts = start_states.shape[0]
    observations = ptu.zeros((num_rollouts, max_path_length + 1, obs_dim))
    observations[:, 0] = ptu.from_numpy(start_states)
    actions = ptu.zeros((num_rollouts, max_path_length, action_dim))
    rewards = ptu.zeros((num_rollouts, max_path_length, 1))
    terminals = ptu.zeros((num_rollouts, max_path_length, 1))
    return observations, actions, rewards, terminals
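# Usage sketch (the environment sizes here are hypothetical, not from the
# source). Note that `observations` is allocated with one extra timestep so
# obs[:, t + 1] can store the successor of the transition taken at step t.
start_states = np.zeros((8, 17), dtype=np.float32)  # 8 rollouts, 17-dim obs
obs, acts, rews, terms = _create_full_tensors(
    start_states, max_path_length=100, obs_dim=17, action_dim=6)
assert obs.shape == (8, 101, 17)   # max_path_length + 1 observations
assert acts.shape == (8, 100, 6)   # one action per step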
def get_plan_values_batch(self, obs, plans):
    """
    Get corresponding values of the plans (higher corresponds to better
    plans). Classes that don't want to plan over actions or use trajectory
    sampling can reimplement convert_plans_to_actions
    (& convert_plan_to_action) and/or predict_transition.

    plans is input as torch (horizon_length, num_particles (total),
    plan_dim). We maintain trajectory infos as torch
    (n_part, info_dim (ex. obs_dim)).
    """
    if self.use_gt_model:
        return self.get_plan_values_batch_gt(obs, plans)

    n_part = plans.shape[1]  # *total* number of particles, NOT num_particles
    discount = 1
    returns, dones, infos = ptu.zeros(n_part), ptu.zeros(n_part), dict()

    # The effective planning horizon is self.horizon * self.repeat_length
    for t in range(self.horizon):
        for k in range(self.repeat_length):
            cur_actions = self.convert_plans_to_actions(obs, plans[t])
            obs, cur_rewards, cur_dones = self.predict_transition(
                obs, cur_actions, infos)
            returns += discount * (1 - dones) * cur_rewards
            discount *= self.discount
            if self.predict_terminal:
                dones = torch.max(dones, cur_dones.float())

    self.diagnostics.update(create_stats_ordered_dict(
        'MPC Termination',
        ptu.get_numpy(dones),
    ))

    if self.value_func is not None:
        terminal_values = self.value_func(
            obs, **self.value_func_kwargs).view(-1)
        returns += discount * (1 - dones) * terminal_values
        self.diagnostics.update(create_stats_ordered_dict(
            'MPC Terminal Values',
            ptu.get_numpy(terminal_values),
        ))

    return returns
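# Shape-level usage sketch. The dimensions and the `planner` / `cur_obs`
# names are hypothetical; only the (horizon, n_part, plan_dim) layout comes
# from the docstring above.
horizon, n_part, plan_dim = 5, 400, 6
plans = ptu.randn((horizon, n_part, plan_dim))
obs = ptu.from_numpy(cur_obs).repeat(n_part, 1)   # one obs copy per particle
values = planner.get_plan_values_batch(obs, plans)  # shape: (n_part,)
best_plan = plans[:, values.argmax()]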
def __init__(
        self,
        hidden_sizes,
        output_size,
        input_size,
        init_w=3e-3,
        hidden_activation=F.relu,
        output_activation=identity,
        hidden_init=ptu.fanin_init,
        w_scale=1,
        b_init_value=0.1,
        layer_norm=False,
        batch_norm=False,
        final_init_scale=None,
):
    super().__init__()

    self.input_size = input_size
    self.output_size = output_size
    self.hidden_activation = hidden_activation
    self.output_activation = output_activation
    self.layer_norm = layer_norm
    self.batch_norm = batch_norm

    self.fcs = []
    self.layer_norms = []
    self.batch_norms = []

    # data normalization
    self.input_mu = nn.Parameter(
        ptu.zeros(input_size), requires_grad=False).float()
    self.input_std = nn.Parameter(
        ptu.ones(input_size), requires_grad=False).float()

    in_size = input_size
    for i, next_size in enumerate(hidden_sizes):
        fc = nn.Linear(in_size, next_size)
        hidden_init(fc.weight, w_scale)
        fc.bias.data.fill_(b_init_value)
        self.__setattr__("fc{}".format(i), fc)
        self.fcs.append(fc)

        if self.layer_norm:
            ln = LayerNorm(next_size)
            self.__setattr__("layer_norm{}".format(i), ln)
            self.layer_norms.append(ln)

        if self.batch_norm:
            bn = nn.BatchNorm1d(next_size)
            self.__setattr__('batch_norm%d' % i, bn)
            self.batch_norms.append(bn)

        in_size = next_size

    self.last_fc = nn.Linear(in_size, output_size)
    if final_init_scale is None:
        self.last_fc.weight.data.uniform_(-init_w, init_w)
        self.last_fc.bias.data.uniform_(-init_w, init_w)
    else:
        ptu.orthogonal_init(self.last_fc.weight, final_init_scale)
        self.last_fc.bias.data.fill_(0)
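# The forward pass is not shown here. A sketch consistent with the
# attributes set up above (an assumption, not necessarily the repo's exact
# implementation; the 1e-6 guard against zero std and the
# normalization-before-activation ordering are both assumed):
def forward(self, input):
    h = (input - self.input_mu) / (self.input_std + 1e-6)  # normalize inputs
    for i, fc in enumerate(self.fcs):
        h = fc(h)
        if self.layer_norm:
            h = self.layer_norms[i](h)
        if self.batch_norm:
            h = self.batch_norms[i](h)
        h = self.hidden_activation(h)
    return self.output_activation(self.last_fc(h))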
def get_plan_values(self, obs, plans):
    # Evaluate the particles in fixed-size chunks to bound memory usage.
    n_part, batch_size = plans.shape[1], 32768
    returns = ptu.zeros(n_part)
    bi, ei = 0, batch_size
    while bi < n_part:
        returns[bi:ei] = self.get_plan_values_batch(
            obs[bi:ei], plans[:, bi:ei])
        bi, ei = bi + batch_size, ei + batch_size
    return returns
def get_plan_values_batch_gt(self, obs, plans):
    returns = ptu.zeros(plans.shape[1])
    obs, plans = ptu.get_numpy(obs), ptu.get_numpy(plans)
    final_obs = np.copy(obs)
    # Roll each particle through the ground-truth environment.
    for i in range(plans.shape[1]):
        returns[i], final_obs[i] = self._get_true_env_value(
            obs[i], plans[:, i])
    if self.value_func is not None:
        # Bootstrap with the value function at the end of the rollout.
        returns += (self.discount ** (self.horizon * self.repeat_length)) * (
            self.value_func(ptu.from_numpy(final_obs),
                            **self.value_func_kwargs))
    return returns
def __init__(
        self,
        ensemble_size,
        hidden_sizes,
        input_size,
        output_size,
        init_w=3e-3,
        hidden_activation=F.relu,
        output_activation=identity,
        b_init_value=0.0,
        layer_norm=False,
        layer_norm_kwargs=None,
        spectral_norm=False,
):
    super().__init__()

    self.ensemble_size = ensemble_size
    self.input_size = input_size
    self.output_size = output_size
    self.elites = [i for i in range(self.ensemble_size)]

    self.hidden_activation = hidden_activation
    self.output_activation = output_activation

    # data normalization
    self.input_mu = nn.Parameter(
        ptu.zeros(input_size), requires_grad=False).float()
    self.input_std = nn.Parameter(
        ptu.ones(input_size), requires_grad=False).float()

    self.fcs = []

    in_size = input_size
    for i, next_size in enumerate(hidden_sizes):
        fc = ParallelizedLayer(
            ensemble_size, in_size, next_size,
            w_std_value=1 / (2 * np.sqrt(in_size)),
            b_init_value=b_init_value,
        )
        if spectral_norm:
            fc = nn.utils.spectral_norm(fc, name='W')
        self.__setattr__('fc%d' % i, fc)
        self.fcs.append(fc)
        in_size = next_size

    self.last_fc = ParallelizedLayer(
        ensemble_size, in_size, output_size,
        w_std_value=1 / (2 * np.sqrt(in_size)),
        b_init_value=b_init_value,
    )
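# Forward pass sketch for the ensemble (an assumption consistent with the
# ParallelizedLayer shapes below, not necessarily the repo's exact code):
def forward(self, input):
    h = (input - self.input_mu) / (self.input_std + 1e-6)  # normalize inputs
    if h.dim() == 2:
        # Broadcast a (batch, dim) input so every ensemble member sees the
        # same data: (ensemble_size, batch, dim).
        h = h.unsqueeze(0).repeat(self.ensemble_size, 1, 1)
    for fc in self.fcs:
        h = self.hidden_activation(fc(h))
    return self.output_activation(self.last_fc(h))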
def __init__(
        self,
        ensemble_size,
        input_dim,
        output_dim,
        w_std_value=1.0,
        b_init_value=0.0,
):
    super().__init__()

    # approximation to truncated normal of 2 stds
    w_init = ptu.randn((ensemble_size, input_dim, output_dim))
    w_init = torch.fmod(w_init, 2) * w_std_value
    self.W = nn.Parameter(w_init, requires_grad=True)

    # constant initialization
    b_init = ptu.zeros((ensemble_size, 1, output_dim)).float()
    b_init += b_init_value
    self.b = nn.Parameter(b_init, requires_grad=True)
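# The matching forward pass is a one-line batched affine map (a sketch;
# the actual method is not shown above):
def forward(self, x):
    # x: (ensemble_size, batch, input_dim). The 3-d matmul applies each
    # member's weights to its own slice; the (ensemble, 1, output_dim)
    # bias broadcasts over the batch dimension.
    return x @ self.W + self.b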
def rsample_with_pretanh(self):
    # Reparameterized sample: draw eps ~ N(0, I), then shift and scale it
    # so gradients flow through normal_mean and normal_std.
    z = (
        self.normal_mean + self.normal_std *
        MultivariateDiagonalNormal(
            ptu.zeros(self.normal_mean.size()),
            ptu.ones(self.normal_std.size())
        ).sample()
    )
    return torch.tanh(z), z
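# For reference, squashing z through tanh changes the density. A standalone
# sketch of the standard change-of-variables correction (not this class's
# actual log-prob method):
def tanh_log_prob(normal, pre_tanh, eps=1e-6):
    # normal: torch.distributions.Normal over the pre-tanh value z.
    # For a = tanh(z): log p(a) = log p(z) - sum_i log(1 - tanh(z_i)^2);
    # eps guards against log(0) when tanh saturates.
    correction = torch.log(1 - torch.tanh(pre_tanh) ** 2 + eps)
    return (normal.log_prob(pre_tanh) - correction).sum(dim=-1)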
def forward(self, input):
    # Stack per-model predictions along a leading ensemble dimension.
    preds = ptu.zeros(
        (len(self.models), *input.shape[:-1], self.output_size))
    for i in range(len(self.models)):
        preds[i] = self.models[i].forward(input)
    return preds
def train_from_torch(self, batch):
    obs = batch['observations']
    next_obs = batch['next_observations']
    actions = batch['actions']
    rewards = batch['rewards']
    terminals = batch.get('terminals', ptu.zeros(rewards.shape[0], 1))

    """
    Policy and Alpha Loss
    """
    _, policy_mean, policy_logstd, *_ = self.policy(obs)
    dist = TanhNormal(policy_mean, policy_logstd.exp())
    new_obs_actions, log_pi = dist.rsample_and_logprob()
    log_pi = log_pi.sum(dim=-1, keepdim=True)

    if self.use_automatic_entropy_tuning:
        alpha_loss = -(self.log_alpha *
                       (log_pi + self.target_entropy).detach()).mean()
        alpha = self.log_alpha.exp()
    else:
        alpha_loss = 0
        alpha = 1

    q_new_actions = torch.min(
        self.qf1(obs, new_obs_actions),
        self.qf2(obs, new_obs_actions),
    )
    policy_loss = (alpha * log_pi - q_new_actions).mean()

    """
    QF Loss
    """
    q1_pred = self.qf1(obs, actions)
    q2_pred = self.qf2(obs, actions)

    _, next_policy_mean, next_policy_logstd, *_ = self.policy(next_obs)
    next_dist = TanhNormal(next_policy_mean, next_policy_logstd.exp())
    new_next_actions, new_log_pi = next_dist.rsample_and_logprob()
    new_log_pi = new_log_pi.sum(dim=-1, keepdim=True)

    target_q_values = torch.min(
        self.target_qf1(next_obs, new_next_actions),
        self.target_qf2(next_obs, new_next_actions),
    ) - alpha * new_log_pi

    future_values = (1. - terminals) * self.discount * target_q_values
    q_target = self.reward_scale * rewards + future_values
    qf1_loss = self.qf_criterion(q1_pred, q_target.detach())
    qf2_loss = self.qf_criterion(q2_pred, q_target.detach())

    if self.use_automatic_entropy_tuning:
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()

    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    self.policy_optimizer.step()

    self.qf1_optimizer.zero_grad()
    qf1_loss.backward()
    self.qf1_optimizer.step()

    self.qf2_optimizer.zero_grad()
    qf2_loss.backward()
    self.qf2_optimizer.step()

    self._n_train_steps_total += 1
    self.try_update_target_networks()

    """
    Save some statistics for eval
    """
    if self._need_to_update_eval_statistics:
        self._need_to_update_eval_statistics = False
        # Recompute the logged policy loss without the entropy weight alpha.
        policy_loss = (log_pi - q_new_actions).mean()
        policy_avg_std = torch.exp(policy_logstd).mean()
        self.eval_statistics['QF1 Loss'] = np.mean(ptu.get_numpy(qf1_loss))
        self.eval_statistics['QF2 Loss'] = np.mean(ptu.get_numpy(qf2_loss))
        self.eval_statistics['Policy Loss'] = np.mean(ptu.get_numpy(
            policy_loss
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Q1 Predictions',
            ptu.get_numpy(q1_pred),
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Q2 Predictions',
            ptu.get_numpy(q2_pred),
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Q Targets',
            ptu.get_numpy(q_target),
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Log Pis',
            ptu.get_numpy(log_pi),
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Policy mu',
            ptu.get_numpy(policy_mean),
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Policy log std',
            ptu.get_numpy(policy_logstd),
        ))
        self.eval_statistics['Policy std'] = np.mean(
            ptu.get_numpy(policy_avg_std))
        if self.use_automatic_entropy_tuning:
            self.eval_statistics['Alpha'] = alpha.item()
            self.eval_statistics['Alpha Loss'] = alpha_loss.item()
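# try_update_target_networks is called above but not shown. A sketch
# consistent with the soft_target_tau / target_update_period fields,
# assuming an rlkit-style ptu.soft_update_from_to(source, target, tau)
# helper:
def try_update_target_networks(self):
    # Polyak-average the online Q networks into the target networks
    # every target_update_period training steps.
    if self._n_train_steps_total % self.target_update_period == 0:
        ptu.soft_update_from_to(
            self.qf1, self.target_qf1, self.soft_target_tau)
        ptu.soft_update_from_to(
            self.qf2, self.target_qf2, self.soft_target_tau)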
def __init__(
        self,
        env,                    # Associated environment for learning
        policy,                 # Associated policy (should be TanhGaussian)
        qf1,                    # Q function #1
        qf2,                    # Q function #2
        target_qf1,             # Slow updater to Q function #1
        target_qf2,             # Slow updater to Q function #2
        discount=0.99,          # Discount factor
        reward_scale=1.0,       # Scaling of rewards to modulate entropy bonus
        use_automatic_entropy_tuning=True,  # Whether to use the entropy-constrained variant
        target_entropy=None,    # Target entropy for entropy-constrained variant
        policy_lr=3e-4,         # Learning rate of policy and entropy weight
        qf_lr=3e-4,             # Learning rate of Q functions
        optimizer_class=optim.Adam,  # Class of optimizer for all networks
        soft_target_tau=5e-3,   # Rate of update of target networks
        target_update_period=1,  # How often to update target networks
):
    super().__init__()

    self.env = env
    self.policy = policy
    self.qf1 = qf1
    self.qf2 = qf2
    self.target_qf1 = target_qf1
    self.target_qf2 = target_qf2

    self.discount = discount
    self.reward_scale = reward_scale
    self.soft_target_tau = soft_target_tau
    self.target_update_period = target_update_period

    self.use_automatic_entropy_tuning = use_automatic_entropy_tuning
    if self.use_automatic_entropy_tuning:
        if target_entropy is not None:
            self.target_entropy = target_entropy
        else:
            # Heuristic value: negative of the action-space dimension
            self.target_entropy = -np.prod(
                self.env.action_space.shape).item()
        self.log_alpha = ptu.zeros(1, requires_grad=True)
        self.alpha_optimizer = optimizer_class(
            [self.log_alpha],
            lr=policy_lr,
        )

    self.qf_criterion = nn.MSELoss()
    self.vf_criterion = nn.MSELoss()

    self.policy_optimizer = optimizer_class(
        self.policy.parameters(),
        lr=policy_lr,
    )
    self.qf1_optimizer = optimizer_class(
        self.qf1.parameters(),
        lr=qf_lr,
    )
    self.qf2_optimizer = optimizer_class(
        self.qf2.parameters(),
        lr=qf_lr,
    )

    self.eval_statistics = OrderedDict()
    self._n_train_steps_total = 0
    self._need_to_update_eval_statistics = True
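# Hypothetical construction. The trainer class name `SACTrainer`, the
# network sizes, and the assumption that the Q networks take the
# concatenated (obs, action) vector as input are all illustrative, not
# from the source:
obs_dim = env.observation_space.low.size
action_dim = env.action_space.low.size
qf1, qf2, target_qf1, target_qf2 = [
    Mlp(hidden_sizes=[256, 256], output_size=1,
        input_size=obs_dim + action_dim)
    for _ in range(4)
]
trainer = SACTrainer(
    env=env,
    policy=policy,  # a TanhGaussian policy, constructed elsewhere
    qf1=qf1, qf2=qf2,
    target_qf1=target_qf1, target_qf2=target_qf2,
)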