def get_traj_info(self): """ Run the policy for a full trajectory (for logging purposes). """ env_state = self.env.sim.get_state() if self.mujoco else None infos = traj.eval_traj( copy.deepcopy(self.env), env_state, self.prev_obs, mujoco=self.mujoco, perturb=self.perturb, H=self.H, gamma=self.gamma, act_mode='deter', pt=(self.TD3, 0), tvel=self.tvel ) return infos
def get_traj_info(self): """ Run the policy for a full trajectory and return details about the trajectory. """ env_state = self.env.sim.get_state() if self.mujoco else None infos = traj.eval_traj(copy.deepcopy(self.env), env_state, self.prev_obs, mujoco=self.mujoco, perturb=self.perturb, H=self.H, gamma=self.gamma, act_mode='deter', pt=(self.pol, 0), terminal=self.val_ens, tvel=self.tvel) return infos
def test_policy(self): """ Run the TD3 action selection mechanism. """ env = copy.deepcopy(self.env) obs = env.reset() if self.tvel is not None: env.set_target_vel(self.tvel) obs = env._get_obs() env_state = env.sim.get_state() if self.mujoco else None infos = traj.eval_traj( env, env_state, obs, mujoco=self.mujoco, perturb=self.perturb, H=self.eval_len, gamma=1, act_mode='deter', pt=(self.TD3, 0), tvel=self.tvel ) self.hist['pol_test'][self.time] = infos[3]
def get_action(self, terminal=None, prior=None):
    """
    Get the action as the first action of an iteratively improved
    planned trajectory. Optionally, use a terminal value function to
    help evaluate trajectories, or start with a specific prior
    trajectory.
    """
    self.extend_plan(self.H)

    # Log initial trajectory
    env_state = self.env.sim.get_state() if self.mujoco else None
    terminal = terminal if self.use_terminal else None
    init_states, _, init_rews, cum_rew, emp_rew = \
        traj.eval_traj(
            copy.deepcopy(self.env), env_state, self.prev_obs,
            mujoco=self.mujoco, perturb=self.perturb,
            H=self.H, gamma=self.gamma, act_mode='fixed',
            pt=(np.array(self.planned_actions), 0),
            terminal=terminal, tvel=self.tvel
        )
    self.hist['init_plan'][self.time] = [cum_rew, emp_rew]
    self.init_cache = (init_states, init_rews, emp_rew)

    # Use the prior as the starting trajectory, if better
    if prior is not None:
        pol_val = self.hist['pols'][self.time][0]
        ratio = (cum_rew - pol_val) / abs(pol_val)
        if pol_val > cum_rew:
            self.hist['prior_des'].append('pol')
            self.planned_actions = []
            for i in range(prior.shape[0]):
                self.planned_actions.append(prior[i])
            # Store for calculations later
            init_states = self.pol_cache[0]
            init_rews = self.pol_cache[1]
        else:
            self.hist['prior_des'].append('plan')
    else:
        ratio = 1

    # Determine if we should use a lower planning horizon
    H = self.H
    if self.has_aop:
        # First, check std threshold
        std = self.hist['vals'][self.time][2]
        force_full_H = std > self.params['aop']['std_thres']

        # Pick Bellman error based on future values
        rews = init_rews
        states_t = torch.tensor(init_states, dtype=self.dtype)
        states_t = states_t.to(device=self.device)
        vals = self.val_ens.forward(states_t)
        vals = torch.squeeze(vals, dim=-1)
        vals = vals.detach().cpu().numpy()
        cval = self.hist['vals'][self.time][0]

        # Calculate the Bellman error from each state: the discounted
        # return from step k plus the final-state value, compared to the
        # value estimate at the preceding state (cval for k = 0)
        belerrs, run_rew = np.zeros(rews.shape[0]), 0
        for k in reversed(range(self.H)):
            run_rew = rews[k] + self.gamma * run_rew
            cv = vals[k-1] if k > 0 else cval
            belerrs[k] = abs(run_rew + vals[-1] - cv)

        if force_full_H:
            H = self.H
        else:
            # Use the first H where the Bellman error is too high
            H = 1
            BE_thres = self.params['aop']['bellman_thres']
            for k in reversed(range(self.H)):
                if belerrs[k] > BE_thres:
                    H += k
                    break

        # Get the value of the plan w.r.t. new H
        cum_rew = (self.gamma ** H) * vals[H-1]
        for t in range(H):
            cum_rew += rews[t]

    # Logging for AOP
    self.hist['H_des'][self.time] = H
    if self.has_aop:
        self.hist['plan_belerr'][self.time].append(belerrs)

    # Store best plan found by MPC
    best_plan_val = cum_rew
    best_plan = copy.deepcopy(self.planned_actions)
    old_plan_val = cum_rew

    # Run multiple iterations of trajectory optimization
    for i in range(self.params['mpc']['num_iter']):
        # Continue if the ratio threshold is met
        _continue = True
        if self.has_aop:
            thres = self.params['aop']['init_thres'] if i == 0 else \
                self.params['aop']['ratio_thres']
            rprob = random.random() < self.params['aop']['eps_plan']
            _continue = (ratio > thres) or rprob
        if not _continue:
            break

        num_rollouts = self.params['mpc']['num_rollouts']
        self.planned_actions = copy.deepcopy(best_plan)
        self.extend_plan(H)

        # Run MPC optimization
        self.update_plan(terminal, num_rollouts)
        self.finished_plan_update(i)

        # Update best plan found by MPC
        new_plan_val = self.hist['plan'][self.time][i][2]
        ratio = (new_plan_val - old_plan_val) / abs(old_plan_val)
        if new_plan_val > best_plan_val:
            best_plan_val = new_plan_val
            best_plan = copy.deepcopy(self.planned_actions)
        old_plan_val = new_plan_val

    self.planned_actions = copy.deepcopy(best_plan)

    # Measure final planning information metrics
    env_state = self.env.sim.get_state() if self.mujoco else None
    fin_states, _, fin_rews, fin_cum_rew, fin_emp_rew = \
        traj.eval_traj(
            copy.deepcopy(self.env), env_state, self.prev_obs,
            mujoco=self.mujoco, perturb=self.perturb,
            H=len(self.planned_actions), gamma=self.gamma,
            act_mode='fixed',
            pt=(np.array(self.planned_actions), 0),
            terminal=terminal, tvel=self.tvel
        )

    if len(self.cache) == 0:
        self.cache = ([], [])
    self.cache = (
        self.cache[0], self.cache[1],
        fin_states, fin_rews
    )

    # Increment the trajectory by a timestep
    action = self.advance_plan()
    return action
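# Note (added documentation, not part of the original logic): a compact summary
# of the adaptive-horizon rule implemented in get_action above. With vals the
# value-ensemble estimates along the current plan and cval the current-state
# value estimate, the per-step Bellman error computed in the loop is
#
#     belerrs[k] = | sum_{t=k}^{H-1} gamma^(t-k) * rews[t] + vals[-1] - vals[k-1] |
#
# (vals[k-1] is replaced by cval at k = 0). Unless the value-ensemble std
# exceeds params['aop']['std_thres'] (which forces the full horizon self.H),
# the horizon is shortened to H = 1 + k_max, where k_max is the last step whose
# Bellman error exceeds params['aop']['bellman_thres']; the plan value is then
# re-bootstrapped as cum_rew = sum_{t < H} rews[t] + gamma^H * vals[H-1],
# exactly as in the code above.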
def update_plan(self, terminal=None, num_rollouts=None):
    """
    Update the plan by generating noisy rollouts and then running MPPI.
    """
    H = len(self.planned_actions)
    if num_rollouts is None:
        num_rollouts = self.params['mpc']['num_rollouts']

    # Generate and execute noisy rollouts
    plan = np.array(self.planned_actions)
    filter_coefs = self.params['mpc']['filter_coefs']
    env_state = self.env.sim.get_state() if self.mujoco else None
    paths = traj.generate_trajectories(
        num_rollouts, self.env, env_state, self.prev_obs,
        mujoco=self.mujoco, perturb=self.perturb,
        H=H, gamma=self.gamma, act_mode='fixed',
        pt=(plan, filter_coefs), terminal=None, tvel=self.tvel,
        num_cpu=self.params['mpc']['num_cpu']
    )
    num_rollouts = len(paths)

    # Evaluate rollouts with the terminal value function
    if terminal is not None:
        final_states = np.zeros((num_rollouts, self.N))
        for i in range(num_rollouts):
            final_states[i] = paths[i][0][-1]

        # Calculate terminal values
        fs = torch.tensor(final_states, dtype=self.dtype)
        terminal_vals = terminal.forward(fs).detach().cpu().numpy()

        # Append terminal values to cum_rew
        for i in range(num_rollouts):
            paths[i][3] += terminal_vals[i]

    self.use_paths(paths)

    plan_rews = np.zeros(num_rollouts)
    plan_rews_emp = np.zeros(num_rollouts)
    states = np.zeros((num_rollouts, H, self.N))
    actions = np.zeros((num_rollouts, H, self.M))
    for i in range(num_rollouts):
        plan_rews[i] = paths[i][3]
        plan_rews_emp[i] = paths[i][4]
        states[i] = paths[i][0]
        actions[i] = paths[i][1]

    if self.params['mpc']['temp'] is not None:
        # Use MPPI to combine actions
        R = plan_rews
        advs = (R - np.min(R)) / (np.max(R) - np.min(R) + 1e-6)
        S = np.exp(advs / self.params['mpc']['temp'])
        weighted_seq = S * actions.T
        planned_actions = np.sum(weighted_seq.T, axis=0)
        planned_actions /= np.sum(S) + 1e-6
        # We could also do CEM, etc. here
    else:
        # As temp --> 0, MPPI becomes random shooting
        ind = np.argmax(plan_rews)
        planned_actions = actions[ind]

    # Turn planned_actions back into a list
    self.planned_actions = []
    for i in range(planned_actions.shape[0]):
        self.planned_actions.append(planned_actions[i])

    # Calculate information for logging
    ro_mean = np.mean(plan_rews)
    ro_std = np.std(plan_rews)
    emp_mean = np.mean(plan_rews_emp)
    emp_std = np.std(plan_rews_emp)

    # Calculate information for MPC's generated plan
    env_state = self.env.sim.get_state() if self.mujoco else None
    fin_states, _, fin_rews, fin_rew, emp_rew = \
        traj.eval_traj(
            copy.deepcopy(self.env), env_state, self.prev_obs,
            mujoco=self.mujoco, perturb=self.perturb,
            H=len(self.planned_actions), gamma=self.gamma,
            act_mode='fixed',
            pt=(np.array(self.planned_actions), 0),
            terminal=terminal, tvel=self.tvel
        )

    # Store information in history
    self.hist['plan'][self.time].append(
        [ro_mean, ro_std, fin_rew, emp_mean, emp_std]
    )
    self.cache = (states, plan_rews, fin_states, fin_rews)

    return states, plan_rews
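# Note (added documentation, not part of the original logic): the MPPI branch in
# update_plan combines the noisy rollouts by a softmax over min-max normalized
# returns. With R_i the (terminal-value-augmented) return of rollout i and
# a_i its action sequence,
#
#     advs_i = (R_i - min_j R_j) / (max_j R_j - min_j R_j + 1e-6)
#     w_i    = exp(advs_i / params['mpc']['temp'])
#     plan   = (sum_i w_i * a_i) / (sum_i w_i + 1e-6)
#
# As the temperature shrinks, the weights concentrate on the best rollout;
# the temp-is-None branch handles that limit explicitly by taking the argmax
# rollout (random shooting).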