def do_rollouts(self, num_rollouts=1, rollout_length=None, initial_state=None):
    """Run `num_rollouts` rollouts and aggregate the per-episode data."""
    # Data structures
    mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic = [], [], [], [], []
    mb_reward, mb_qpos, mb_qvel, mb_features = [], [], [], []
    mb_penetration = []
    mb_n_steps = 0
    for _ in range(num_rollouts):
        (ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic, ep_reward, ep_qpos,
         ep_qvel, ep_features, ep_penetration, n_steps) = self.rollout(
             rollout_length, initial_state)
        multi_append([
            mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward,
            mb_qpos, mb_qvel, mb_features, mb_penetration
        ], [
            ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic, ep_reward,
            ep_qpos, ep_qvel, ep_features, ep_penetration
        ])
        mb_n_steps += n_steps
    return [
        mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos,
        mb_qvel, mb_features, mb_n_steps, mb_penetration
    ]
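# NOTE: `multi_append` is a small helper defined elsewhere in this repository
# (it is only called, never defined, in this file). A minimal sketch of the
# assumed behavior is given below for reference; the name
# `_multi_append_sketch` is illustrative and the real implementation may differ.
def _multi_append_sketch(lists, values):
    """Append values[i] to lists[i]; the lists are mutated in place."""
    assert len(lists) == len(values)
    for lst, value in zip(lists, values):
        lst.append(value)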
def evaluate_real_world(self, residual_parameters):
    """Evaluate the controller with the given residual parameters in the real environment."""
    # TODO: Parallelize this function
    # Copy parameters to residual
    self.residual.load_state_dict(residual_parameters)
    self.controller.reconfigure_heuristic(self.get_residual)
    self.controller.reconfigure_dynamics(self.get_dynamics_residual)
    mb_obs, mb_actions, mb_qpos, mb_qvel, mb_returns = [], [], [], [], []
    mb_obs_model_next = []
    for traj in range(self.args.num_real_traj_eval):
        ep_obs, ep_actions, ep_qpos, ep_qvel = [], [], [], []
        ep_obs_model_next = []
        current_return = 0.
        observation = set_sim_state_and_goal(self.env, self.eval_qpos[traj],
                                             self.eval_qvel[traj],
                                             self.eval_goals[traj])
        obs = observation['observation']
        for _ in range(self.env_params['max_timesteps']):
            qpos = observation['sim_state'].qpos.copy()
            qvel = observation['sim_state'].qvel.copy()
            goal = observation['desired_goal'].copy()
            ac, info = self.controller.act(observation)
            observation_new, rew, _, _ = self.env.step(ac)
            if self.args.render:
                self.env.render()
            # Set the internal model to the same state and record its prediction
            _ = set_sim_state_and_goal(self.planning_env, qpos, qvel, goal)
            model_observation_next, _, _, _ = self.planning_env.step(ac)
            obs_model_next = model_observation_next['observation']
            self.n_real_steps += 1
            obs_new = observation_new['observation']
            multi_append(
                [ep_obs, ep_actions, ep_qpos, ep_qvel, ep_obs_model_next],
                [obs.copy(), ac.copy(), qpos.copy(), qvel.copy(),
                 obs_model_next.copy()])
            current_return += -rew
            obs = obs_new.copy()
            observation = observation_new
        ep_obs.append(obs.copy())
        multi_append(
            [mb_obs, mb_actions, mb_qpos, mb_qvel, mb_returns,
             mb_obs_model_next],
            [ep_obs, ep_actions, ep_qpos, ep_qvel, current_return,
             ep_obs_model_next])
    mb_obs, mb_actions, mb_qpos, mb_qvel, mb_obs_model_next = (
        np.array(mb_obs), np.array(mb_actions), np.array(mb_qpos),
        np.array(mb_qvel), np.array(mb_obs_model_next))
    self.dynamics_dataset.store_episode(
        [mb_obs, mb_actions, mb_qpos, mb_qvel, mb_obs_model_next])
    self._update_dynamics_normalizer(
        [mb_obs, mb_actions, mb_qpos, mb_qvel, mb_obs_model_next])
    return np.mean(mb_returns)
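# NOTE: `set_sim_state_and_goal` is imported from the environment utilities and
# is used above to reset an environment to a saved MuJoCo state with a chosen
# goal before stepping it. The sketch below only illustrates the assumed
# behavior; the attribute names (`env.env.sim`, `env.env.goal`) are assumptions
# about a wrapped gym robotics environment, not the repository's actual helper.
def _set_sim_state_and_goal_sketch(env, qpos, qvel, goal):
    """Restore a saved simulator state, set the desired goal, and return the observation."""
    sim_state = env.env.sim.get_state()   # mujoco_py MjSimState
    sim_state.qpos[:] = qpos
    sim_state.qvel[:] = qvel
    env.env.sim.set_state(sim_state)
    env.env.sim.forward()
    env.env.goal = goal.copy()            # hypothetical goal attribute on the unwrapped env
    return env.get_obs()                  # same accessor used in `rollout` below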
def collect_trajectories(self, num_traj):
    '''
    Collect trajectories in the planning environment using the controller
    and the learned residuals.
    '''
    logger.debug("Rolling out")
    mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r = [], [], [], [], [], []
    mb_qpos, mb_qvel, mb_f = [], [], []
    for traj in range(num_traj):
        ep_obs, ep_ag, ep_g, ep_actions, ep_s_h, ep_r = [], [], [], [], [], []
        ep_qpos, ep_qvel, ep_f = [], [], []
        # observation = self.planning_env.reset()
        observation = set_sim_state_and_goal(
            self.planning_env,
            self.eval_qpos[traj],
            self.eval_qvel[traj],
            self.eval_goals[traj],
        )
        obs = observation['observation']
        ag = observation['achieved_goal']
        g = observation['desired_goal']
        s_h = self.controller.heuristic_obs_g(obs, g)
        f = self.planning_env.extract_features(obs, g)
        for _ in range(self.env_params['max_timesteps']):
            qpos = observation['sim_state'].qpos
            qvel = observation['sim_state'].qvel
            ac, info = self.controller.act(observation)
            ac_ind = self.planning_env.discrete_actions[tuple(ac)]
            logger.debug('Heuristic %s', info['start_node_h'])
            logger.debug('Action %s', ac)
            observation_new, rew, _, _ = self.planning_env.step(ac)
            # Apply dynamics residual
            observation_new, rew = self.apply_dynamics_residual(
                observation, ac, observation_new, rew)
            self.n_planning_steps += 1
            obs_new = observation_new['observation']
            ag_new = observation_new['achieved_goal']
            if self.args.render:
                self.planning_env.render()
            multi_append(
                [ep_obs, ep_ag, ep_g, ep_actions, ep_s_h, ep_r, ep_qpos,
                 ep_qvel, ep_f],
                [obs.copy(), ag.copy(), g.copy(), ac_ind, s_h, rew,
                 qpos.copy(), qvel.copy(), f.copy()])
            obs = obs_new.copy()
            ag = ag_new.copy()
            observation = observation_new
            s_h = self.controller.heuristic_obs_g(obs, g)
            f = self.planning_env.extract_features(obs, g)
        multi_append([ep_obs, ep_ag, ep_s_h, ep_f],
                     [obs.copy(), ag.copy(), s_h, f.copy()])
        multi_append(
            [mb_obs, mb_ag, mb_actions, mb_g, mb_s_h, mb_r, mb_qpos, mb_qvel,
             mb_f],
            [ep_obs, ep_ag, ep_actions, ep_g, ep_s_h, ep_r, ep_qpos, ep_qvel,
             ep_f])
    mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel, mb_f = (
        np.array(mb_obs), np.array(mb_ag), np.array(mb_g),
        np.array(mb_actions), np.array(mb_s_h), np.array(mb_r),
        np.array(mb_qpos), np.array(mb_qvel), np.array(mb_f))
    self.dataset.store_episode(
        [mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel,
         mb_f])
    # Update normalizer
    self._update_normalizer(
        [mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel,
         mb_f])
def online_rollout(self, initial_observation):
    """Roll out the controller in the internal (planning) model starting from `initial_observation`."""
    n_steps = 0
    mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic = [], [], [], [], []
    mb_reward, mb_qpos, mb_qvel, mb_features = [], [], [], []
    mb_penetration = []
    # Set initial state
    observation = copy.deepcopy(initial_observation)
    # Data structures
    r_obs, r_ag, r_g, r_actions, r_heuristic = [], [], [], [], []
    r_reward, r_qpos, r_qvel, r_features = [], [], [], []
    r_penetration = []
    # Start
    obs = observation['observation']
    ag = observation['achieved_goal']
    g = observation['desired_goal']
    qpos = observation['sim_state'].qpos
    qvel = observation['sim_state'].qvel
    set_sim_state_and_goal(self.planning_env, qpos.copy(), qvel.copy(),
                           g.copy())
    features = self.env.extract_features(obs, g)
    heuristic = self.controller.heuristic_obs_g(obs, g)
    for _ in range(self.env_params['max_timesteps']):
        ac, _ = self.controller.act(observation)
        ac_ind = self.env.discrete_actions[tuple(ac)]
        next_observation, rew, _, info = self.planning_env.step(ac)
        penetration = info['penetration']
        if self.args.agent == 'rts':
            rew = apply_discrepancy_penalty(observation, ac, rew,
                                            self.controller.discrepancy_fn)
        if self.args.agent in ('mbpo', 'mbpo_knn'):
            next_observation, rew = apply_dynamics_residual(
                self.planning_env, self.get_residual_dynamics, observation,
                info, ac, next_observation)
        n_steps += 1
        # Add to data structures
        multi_append([
            r_obs, r_ag, r_g, r_actions, r_heuristic, r_reward, r_qpos,
            r_qvel, r_features, r_penetration
        ], [
            obs.copy(), ag.copy(), g.copy(), ac_ind, heuristic, rew,
            qpos.copy(), qvel.copy(), features.copy(), penetration
        ])
        # Move to next step
        observation = copy.deepcopy(next_observation)
        obs = observation['observation']
        ag = observation['achieved_goal']
        g = observation['desired_goal']
        qpos = observation['sim_state'].qpos
        qvel = observation['sim_state'].qvel
        features = self.env.extract_features(obs, g)
        heuristic = self.controller.heuristic_obs_g(obs, g)
    multi_append([r_obs, r_ag, r_heuristic, r_features],
                 [obs.copy(), ag.copy(), heuristic, features.copy()])
    multi_append([
        mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos,
        mb_qvel, mb_features, mb_penetration
    ], [
        r_obs, r_ag, r_g, r_actions, r_heuristic, r_reward, r_qpos, r_qvel,
        r_features, r_penetration
    ])
    (mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos,
     mb_qvel, mb_features, mb_penetration) = convert_to_list_of_np_arrays([
         mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos,
         mb_qvel, mb_features, mb_penetration
     ])
    # Store in memory
    self.memory.store_internal_model_rollout([
        mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos,
        mb_qvel, mb_features, mb_penetration
    ])
    # Update normalizer
    self.features_normalizer.update_normalizer([
        mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos,
        mb_qvel, mb_features, mb_penetration
    ], self.sampler)
    return n_steps
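# NOTE: `convert_to_list_of_np_arrays` is another repository helper that is not
# defined in this file. The assumed behavior is an element-wise conversion of
# python lists into numpy arrays, sketched below.
import numpy as np  # mirrors the module-level import this file relies on


def _convert_to_list_of_np_arrays_sketch(lists):
    """Return a new list with each input list converted to a numpy array."""
    return [np.array(lst) for lst in lists]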
def rollout(self, rollout_length=None, initial_state=None):
    """Execute a single rollout in the environment and return the collected episode data."""
    self.env.reset()
    if initial_state:
        # Load initial state if given
        qpos = initial_state['qpos'].copy()
        qvel = initial_state['qvel'].copy()
        goal = initial_state['goal'].copy()
        set_sim_state_and_goal(self.env, qpos, qvel, goal)
    # Data structures
    n_steps = 0
    ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic = [], [], [], [], []
    ep_reward, ep_qpos, ep_qvel, ep_features = [], [], [], []
    ep_penetration = []
    # Start rollout
    observation = self.env.get_obs()
    obs = observation['observation']
    ag = observation['achieved_goal']
    g = observation['desired_goal']
    features = self.env.extract_features(obs, g)
    heuristic = self.controller.heuristic_obs_g(obs, g)
    if rollout_length is None:
        if self.args.offline:
            rollout_length = self.env_params['offline_max_timesteps']
        else:
            rollout_length = self.env_params['max_timesteps']
    for _ in range(rollout_length):
        qpos = observation['sim_state'].qpos
        qvel = observation['sim_state'].qvel
        ac, _ = self.controller.act(observation)
        ac_ind = self.env.discrete_actions[tuple(ac)]
        observation_new, rew, _, info = self.env.step(ac)
        penetration = info['penetration']
        n_steps += 1
        if self.kdtrees_set:
            assert self.args.agent == 'rts'
            rew = apply_discrepancy_penalty(observation, ac, rew,
                                            self.controller.discrepancy_fn)
        elif self.residual_dynamics_set:
            assert self.args.agent in ('mbpo', 'mbpo_knn', 'mbpo_gp')
            # Use the residual-corrected observation for the rest of the step
            observation_new, rew = apply_dynamics_residual(
                self.env, self.get_residual_dynamics, observation, info, ac,
                observation_new)
            observation_new['sim_state'] = copy.deepcopy(
                self.env.env.sim.get_state())
        obs_new = observation_new['observation']
        ag_new = observation_new['achieved_goal']
        multi_append([
            ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic, ep_reward,
            ep_qpos, ep_qvel, ep_features, ep_penetration
        ], [
            obs.copy(), ag.copy(), g.copy(), ac_ind, heuristic, rew,
            qpos.copy(), qvel.copy(), features.copy(), penetration
        ])
        obs = obs_new.copy()
        ag = ag_new.copy()
        observation = copy.deepcopy(observation_new)
        heuristic = self.controller.heuristic_obs_g(obs, g)
        features = self.env.extract_features(obs, g)
    multi_append([ep_obs, ep_ag, ep_heuristic, ep_features],
                 [obs.copy(), ag.copy(), heuristic, features.copy()])
    return (ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic, ep_reward, ep_qpos,
            ep_qvel, ep_features, ep_penetration, n_steps)
def collect_internal_model_trajectories(self, num_rollouts, rollout_length,
                                        initial_observations=None):
    """Roll out the controller using the learned dynamics model and store the data."""
    n_steps = 0
    mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic = [], [], [], [], []
    mb_reward, mb_features = [], []
    # Start rollouts
    for n in range(num_rollouts):
        # Set initial state
        if initial_observations is not None:
            observation = copy.deepcopy(initial_observations[n])
        else:
            observation = self.env.get_obs()
        # Data structures
        r_obs, r_ag, r_g, r_actions, r_heuristic = [], [], [], [], []
        r_reward, r_features = [], []
        # Start
        obs = observation['observation']
        ag = observation['achieved_goal']
        g = observation['desired_goal']
        features = self.env.extract_features(obs, g)
        heuristic = self.controller.heuristic_obs_g(obs, g)
        for _ in range(rollout_length):
            ac, _ = self.controller.act(observation)
            ac_ind = self.env.discrete_actions[tuple(ac)]
            # Get the next observation and reward using the learned model
            observation_new = get_next_observation(
                observation, ac, self.preproc_dynamics_inputs,
                self.learned_model_dynamics)
            rew = self.env.compute_reward(observation_new['achieved_goal'],
                                          observation_new['desired_goal'], {})
            n_steps += 1
            # Add to data structures
            multi_append([
                r_obs, r_ag, r_g, r_actions, r_heuristic, r_reward, r_features
            ], [
                obs.copy(), ag.copy(), g.copy(), ac_ind, heuristic, rew,
                features.copy()
            ])
            # Move to next step
            observation = copy.deepcopy(observation_new)
            obs = observation['observation']
            ag = observation['achieved_goal']
            g = observation['desired_goal']
            features = self.env.extract_features(obs, g)
            heuristic = self.controller.heuristic_obs_g(obs, g)
        multi_append([r_obs, r_ag, r_heuristic, r_features],
                     [obs.copy(), ag.copy(), heuristic, features.copy()])
        multi_append([
            mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward,
            mb_features
        ], [
            r_obs, r_ag, r_g, r_actions, r_heuristic, r_reward, r_features
        ])
    (mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward,
     mb_features) = convert_to_list_of_np_arrays([
         mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_features
     ])
    # Store in memory
    self.memory.store_internal_model_rollout([
        mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_features
    ], sim=False)
    # Update normalizer
    self._update_normalizer([
        mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_features
    ])
    return n_steps