def get_qvalue_obs_ac(self, obs, g, qpos, qvel, ac):
    """Estimate the one-step Q-value of taking `ac` from the given state.

    The internal model is first restored to (`qpos`, `qvel`) with goal `g`,
    then stepped with `ac`. The returned value is the step cost (negated
    reward) plus the heuristic value of the resulting state.
    """
    # Restore the model to the queried simulator state before stepping.
    set_sim_state_and_goal(self.model, qpos.copy(), qvel.copy(), g.copy())
    successor, reward, _, _ = self.model.step(np.array(ac))
    # Q(s, a) = c(s, a) + h(s'), where reward is the negative step cost.
    h_next = self.heuristic(Node(successor), augment=False)
    return (-reward) + h_next
def get_qvalue_obs_ac(self, obs, g, qpos, qvel, ac):
    # One-step Q-value: restore model state, step with `ac`, and combine
    # the step cost (negated reward) with the successor's heuristic value.
    set_sim_state_and_goal(self.model, qpos.copy(), qvel.copy(), g.copy())
    next_observation, rew, _, _ = self.model.step(np.array(ac))
    next_node = Node(next_observation)
    next_heuristic = self.heuristic(next_node, augment=False)
    return (-rew) + next_heuristic


def evaluate_real_world(self, residual_parameters):
    """Evaluate the given residual parameters on real-world rollouts.

    Loads `residual_parameters` into the residual network, reconfigures
    the controller to use the learned heuristic and dynamics residuals,
    then rolls out `self.args.num_real_traj_eval` trajectories in the real
    environment from pre-sampled evaluation states/goals. Each transition
    is also replayed in the planning model so that the (real next obs,
    model next obs) pairs can train the dynamics residual.

    Returns the mean of the per-trajectory returns (sum of step costs).
    """
    # TODO: Parallelize this function
    # Copy parameters to residual
    self.residual.load_state_dict(residual_parameters)
    self.controller.reconfigure_heuristic(self.get_residual)
    self.controller.reconfigure_dynamics(self.get_dynamics_residual)
    # Minibatch accumulators across all evaluation trajectories.
    mb_obs, mb_actions, mb_qpos, mb_qvel, mb_returns = [], [], [], [], []
    mb_obs_model_next = []
    for traj in range(self.args.num_real_traj_eval):
        # Per-episode accumulators.
        ep_obs, ep_actions, ep_qpos, ep_qvel = [], [], [], []
        ep_obs_model_next = []
        current_return = 0.
        # Start every evaluation trajectory from a fixed, pre-sampled state.
        observation = set_sim_state_and_goal(self.env,
                                             self.eval_qpos[traj],
                                             self.eval_qvel[traj],
                                             self.eval_goals[traj])
        obs = observation['observation']
        for _ in range(self.env_params['max_timesteps']):
            # Snapshot the pre-step state so the model can replay this step.
            qpos = observation['sim_state'].qpos.copy()
            qvel = observation['sim_state'].qvel.copy()
            goal = observation['desired_goal'].copy()
            ac, info = self.controller.act(observation)
            # Real-world step.
            observation_new, rew, _, _ = self.env.step(ac)
            if self.args.render:
                self.env.render()
            # Set model to the same state and take the same action,
            # giving the model's prediction of the next observation.
            _ = set_sim_state_and_goal(
                self.planning_env,
                qpos,
                qvel,
                goal,
            )
            model_observation_next, _, _, _ = self.planning_env.step(ac)
            obs_model_next = model_observation_next['observation']
            self.n_real_steps += 1
            obs_new = observation_new['observation']
            multi_append([ep_obs, ep_actions, ep_qpos, ep_qvel,
                          ep_obs_model_next],
                         [obs.copy(), ac.copy(), qpos.copy(), qvel.copy(),
                          obs_model_next.copy()])
            # Rewards are negative costs; accumulate the positive cost.
            current_return += -rew
            obs = obs_new.copy()
            observation = observation_new
        # Append the terminal observation so ep_obs has T+1 entries.
        ep_obs.append(obs.copy())
        multi_append([mb_obs, mb_actions, mb_qpos, mb_qvel, mb_returns,
                      mb_obs_model_next],
                     [ep_obs, ep_actions, ep_qpos, ep_qvel, current_return,
                      ep_obs_model_next])
    mb_obs, mb_actions, mb_qpos, mb_qvel, mb_obs_model_next = np.array(mb_obs), np.array(
        mb_actions), np.array(mb_qpos), np.array(mb_qvel), np.array(mb_obs_model_next)
    # Store (real, model) transition pairs for dynamics-residual training.
    self.dynamics_dataset.store_episode(
        [mb_obs, mb_actions, mb_qpos, mb_qvel, mb_obs_model_next])
    self._update_dynamics_normalizer(
        [mb_obs, mb_actions, mb_qpos, mb_qvel, mb_obs_model_next])
    return np.mean(mb_returns)
def get_qvalue(self, observation, ac):
    """One-step Q-value of action `ac` taken from the state in `observation`."""
    sim_state = observation['sim_state']
    # Sync the internal model with the queried simulator state and goal.
    set_sim_state_and_goal(self.model,
                           sim_state.qpos.copy(),
                           sim_state.qvel.copy(),
                           observation['desired_goal'].copy())
    # Step the model, then evaluate the heuristic at the successor state.
    successor, reward, _, _ = self.model.step(np.array(ac))
    h_next = self.heuristic(Node(successor), augment=False)
    # Q(s, a) = c(s, a) + h(s'); reward is the negative step cost.
    return (-reward) + h_next
def collect_trajectories(self, num_traj):
    '''
    This function collects trajectories based on the controller and
    learned residuals.

    Rolls out `num_traj` episodes in the planning environment, starting
    each from the pre-sampled evaluation state/goal of the same index,
    applies the learned dynamics residual to every model step, and stores
    the resulting episodes in `self.dataset` before updating the
    normalizer.
    '''
    logger.debug("Rolling out")
    # Minibatch accumulators across all trajectories.
    mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel, mb_f = [
    ], [], [], [], [], [], [], [], []
    for traj in range(num_traj):
        # Per-episode accumulators.
        ep_obs, ep_ag, ep_g, ep_actions, ep_s_h, ep_r, ep_qpos, ep_qvel, ep_f = [
        ], [], [], [], [], [], [], [], []
        # observation = self.planning_env.reset()
        # Start from a fixed, pre-sampled evaluation state instead of reset().
        observation = set_sim_state_and_goal(
            self.planning_env,
            self.eval_qpos[traj],
            self.eval_qvel[traj],
            self.eval_goals[traj],
        )
        obs = observation['observation']
        ag = observation['achieved_goal']
        g = observation['desired_goal']
        s_h = self.controller.heuristic_obs_g(obs, g)
        f = self.planning_env.extract_features(obs, g)
        for _ in range(self.env_params['max_timesteps']):
            # Pre-step simulator state, recorded alongside the transition.
            qpos = observation['sim_state'].qpos
            qvel = observation['sim_state'].qvel
            ac, info = self.controller.act(observation)
            ac_ind = self.planning_env.discrete_actions[tuple(ac)]
            logger.debug('Heuristic', info['start_node_h'])
            logger.debug('Action', ac)
            observation_new, rew, _, _ = self.planning_env.step(ac)
            # Apply dynamics residual to correct the model's prediction.
            observation_new, rew = self.apply_dynamics_residual(
                observation, ac, observation_new, rew)
            self.n_planning_steps += 1
            obs_new = observation_new['observation']
            ag_new = observation_new['achieved_goal']
            if self.args.render:
                self.planning_env.render()
            multi_append([ep_obs, ep_ag, ep_g, ep_actions, ep_s_h, ep_r,
                          ep_qpos, ep_qvel, ep_f],
                         [obs.copy(), ag.copy(), g.copy(), ac_ind, s_h, rew,
                          qpos.copy(), qvel.copy(), f.copy()])
            obs = obs_new.copy()
            ag = ag_new.copy()
            observation = observation_new
            s_h = self.controller.heuristic_obs_g(obs, g)
            f = self.planning_env.extract_features(obs, g)
        # Append the terminal entries so these lists have T+1 elements.
        multi_append([ep_obs, ep_ag, ep_s_h, ep_f],
                     [obs.copy(), ag.copy(), s_h, f.copy()])
        multi_append([mb_obs, mb_ag, mb_actions, mb_g, mb_s_h, mb_r, mb_qpos,
                      mb_qvel, mb_f],
                     [ep_obs, ep_ag, ep_actions, ep_g, ep_s_h, ep_r, ep_qpos,
                      ep_qvel, ep_f])
    mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel, mb_f = np.array(mb_obs), np.array(mb_ag), np.array(
        mb_g), np.array(mb_actions), np.array(mb_s_h), np.array(mb_r), np.array(mb_qpos), np.array(mb_qvel), np.array(mb_f)
    self.dataset.store_episode(
        [mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel, mb_f])
    # Update normalizer
    self._update_normalizer(
        [mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel, mb_f])
def get_successors(self, node, action):
    """Expand `node` with `action` in the model.

    Returns a `(successor_node, cost)` pair, where cost is the negated
    model reward, optionally adjusted by a discrepancy penalty and/or a
    learned dynamics residual when those are configured.
    """
    state = node.obs
    # Put the model in the node's exact simulator state before stepping.
    set_sim_state_and_goal(self.model,
                           state['sim_state'].qpos.copy(),
                           state['sim_state'].qvel.copy(),
                           state['desired_goal'].copy())
    successor_obs, reward, _, step_info = self.model.step(np.array(action))
    # Penalize transitions flagged as discrepant with the real world.
    if self.discrepancy_fn is not None:
        reward = apply_discrepancy_penalty(state, action, reward,
                                           self.discrepancy_fn)
    # Correct the model's prediction with the learned dynamics residual.
    if self.residual_dynamics_fn is not None:
        successor_obs, reward = apply_dynamics_residual(
            self.model, self.residual_dynamics_fn, state, step_info, action,
            successor_obs)
    # Snapshot the post-step simulator state so the successor can later
    # be restored exactly.
    successor_obs['sim_state'] = copy.deepcopy(self.model.env.sim.get_state())
    return Node(successor_obs), -reward
def learn_online_in_real_world(self, max_timesteps=None):
    """Run the online real-world learning loop until the goal is reached.

    Acts in the real environment with the controller, mirrors every step
    in the planning model to collect (real, model) transition pairs, and
    updates either the discrepancy model (agent 'rts') or the residual
    dynamics (agents 'mbpo'/'mbpo_knn'/'mbpo_gp') plus the state-value
    residual after each step.

    Args:
        max_timesteps: optional cap on real-world steps; loop also ends
            when the goal is reached.

    Returns:
        Total number of real-world steps taken.
    """
    # If any pre-existing model is given, load it
    if self.args.load_dir:
        self.load()
    # Reset the environment
    observation = self.env.reset()
    # Configure heuristic for controller
    self.controller.reconfigure_heuristic(
        lambda obs: get_state_value_residual(obs, self.preproc_inputs,
                                             self.state_value_residual))
    # Configure dynamics for controller (RTS uses a discrepancy model)
    if self.args.agent == 'rts':
        self.controller.reconfigure_discrepancy(
            lambda obs, ac: get_discrepancy_neighbors(
                obs, ac, self.construct_4d_point, self.kdtrees,
                self.args.neighbor_radius))
    # Configure dynamics for controller (MBPO variants use a residual)
    if self.args.agent == 'mbpo' or self.args.agent == 'mbpo_knn' or self.args.agent == 'mbpo_gp':
        self.controller.reconfigure_residual_dynamics(
            self.get_residual_dynamics)
    # Count of total number of steps
    total_n_steps = 0
    while True:
        obs = observation['observation']
        g = observation['desired_goal']
        # Pre-step simulator state, used below to replay the step in the model.
        qpos = observation['sim_state'].qpos
        qvel = observation['sim_state'].qvel
        # Get action from the controller
        ac, info = self.controller.act(observation)
        # Sanity checks: each agent type must use exactly one correction model.
        if self.args.agent == 'rts':
            assert self.controller.residual_dynamics_fn is None
        if self.args.agent == 'mbpo' or self.args.agent == 'mbpo_knn' or self.args.agent == 'mbpo_gp':
            assert self.controller.discrepancy_fn is None
        # Get discrete action index
        ac_ind = self.env.discrete_actions[tuple(ac)]
        # Get the next observation
        next_observation, rew, _, _ = self.env.step(ac)
        if self.args.render:
            self.env.render()
        total_n_steps += 1
        # Check if we reached the goal
        if self.env.env._is_success(next_observation['achieved_goal'], g):
            print('REACHED GOAL!')
            break
        # Get the next obs
        obs_next = next_observation['observation']
        # Get the sim next obs: replay the same action from the same state
        # in the planning model to obtain the model's prediction.
        set_sim_state_and_goal(self.planning_env, qpos.copy(), qvel.copy(),
                               g.copy())
        next_observation_sim, _, _, _ = self.planning_env.step(ac)
        obs_sim_next = next_observation_sim['observation']
        # Store transition: (obs, goal, action index, sim state, real next
        # obs, model-predicted next obs)
        transition = [
            obs.copy(),
            g.copy(), ac_ind,
            qpos.copy(),
            qvel.copy(),
            obs_next.copy(),
            obs_sim_next.copy()
        ]
        dynamics_losses = []
        # RTS: only store/refit when the transition shows a real discrepancy.
        if self.args.agent == 'rts' and self._check_dynamics_transition(
                transition):
            self.memory.store_real_world_transition(transition)
            # # Fit model
            self._update_discrepancy_model()
        # MBPO: always store and update the residual dynamics.
        elif self.args.agent == 'mbpo' or self.args.agent == 'mbpo_knn' or self.args.agent == 'mbpo_gp':
            self.memory.store_real_world_transition(transition)
            # Update the dynamics
            if self.args.agent == 'mbpo':
                for _ in range(self.args.n_online_planning_updates):
                    # Update dynamics
                    loss = self._update_batch_residual_dynamics()
                    dynamics_losses.append(loss.item())
            else:
                loss = self._update_residual_dynamics()
                dynamics_losses.append(loss)
        # # Plan in the model
        value_loss = self.plan_online_in_model(
            n_planning_updates=self.args.n_online_planning_updates,
            initial_observation=copy.deepcopy(observation))
        # Log
        logger.record_tabular('n_steps', total_n_steps)
        if self.args.agent == 'mbpo' or self.args.agent == 'mbpo_knn' or self.args.agent == 'mbpo_gp':
            logger.record_tabular('dynamics loss', np.mean(dynamics_losses))
        logger.record_tabular('residual_loss', value_loss)
        # logger.dump_tabular()
        # Move to next iteration
        observation = copy.deepcopy(next_observation)
        if max_timesteps and total_n_steps >= max_timesteps:
            break
    return total_n_steps
def online_rollout(self, initial_observation):
    """Roll out one episode in the planning model from `initial_observation`.

    Uses the controller's policy, applies the agent-specific correction
    (discrepancy penalty for 'rts', dynamics residual for 'mbpo'/'mbpo_knn')
    to each model step, stores the episode in memory, and updates the
    feature normalizer.

    Returns the number of model steps taken.
    """
    n_steps = 0
    # Minibatch accumulators (single-episode, but kept in minibatch form
    # for the storage/normalizer APIs).
    mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic = [], [], [], [], []
    mb_reward, mb_qpos, mb_qvel, mb_features = [], [], [], []
    mb_penetration = []
    # Set initial state
    observation = copy.deepcopy(initial_observation)
    # Data structures (per-rollout accumulators)
    r_obs, r_ag, r_g, r_actions, r_heuristic = [], [], [], [], []
    r_reward, r_qpos, r_qvel, r_features = [], [], [], []
    r_penetration = []
    # Start
    obs = observation['observation']
    ag = observation['achieved_goal']
    g = observation['desired_goal']
    qpos = observation['sim_state'].qpos
    qvel = observation['sim_state'].qvel
    # Sync the planning model to the rollout's starting state.
    set_sim_state_and_goal(self.planning_env, qpos.copy(), qvel.copy(),
                           g.copy())
    features = self.env.extract_features(obs, g)
    heuristic = self.controller.heuristic_obs_g(obs, g)
    for _ in range(self.env_params['max_timesteps']):
        ac, _ = self.controller.act(observation)
        ac_ind = self.env.discrete_actions[tuple(ac)]
        next_observation, rew, _, info = self.planning_env.step(ac)
        penetration = info['penetration']
        # Agent-specific model correction.
        if self.args.agent == 'rts':
            rew = apply_discrepancy_penalty(observation, ac, rew,
                                            self.controller.discrepancy_fn)
        if self.args.agent == 'mbpo' or self.args.agent == 'mbpo_knn':
            next_observation, rew = apply_dynamics_residual(
                self.planning_env, self.get_residual_dynamics, observation,
                info, ac, next_observation)
        n_steps += 1
        # Add to data structures
        multi_append([
            r_obs, r_ag, r_g, r_actions, r_heuristic, r_reward, r_qpos,
            r_qvel, r_features, r_penetration
        ], [
            obs.copy(),
            ag.copy(),
            g.copy(), ac_ind, heuristic, rew,
            qpos.copy(),
            qvel.copy(),
            features.copy(), penetration
        ])
        # Move to next step
        observation = copy.deepcopy(next_observation)
        obs = observation['observation']
        ag = observation['achieved_goal']
        g = observation['desired_goal']
        qpos = observation['sim_state'].qpos
        qvel = observation['sim_state'].qvel
        features = self.env.extract_features(obs, g)
        heuristic = self.controller.heuristic_obs_g(obs, g)
    # Append terminal entries so these lists have T+1 elements.
    multi_append(
        [r_obs, r_ag, r_heuristic, r_features],
        [obs.copy(), ag.copy(), heuristic, features.copy()])
    multi_append([
        mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos,
        mb_qvel, mb_features, mb_penetration
    ], [
        r_obs, r_ag, r_g, r_actions, r_heuristic, r_reward, r_qpos, r_qvel,
        r_features, r_penetration
    ])
    mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos, mb_qvel, mb_features, mb_penetration = convert_to_list_of_np_arrays(
        [
            mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward,
            mb_qpos, mb_qvel, mb_features, mb_penetration
        ])
    # Store in memory
    self.memory.store_internal_model_rollout([
        mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos,
        mb_qvel, mb_features, mb_penetration
    ])
    # Update normalizer
    self.features_normalizer.update_normalizer([
        mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos,
        mb_qvel, mb_features, mb_penetration
    ], self.sampler)
    return n_steps
def rollout(self, rollout_length=None, initial_state=None):
    """Roll out one episode in `self.env` with the controller's policy.

    Args:
        rollout_length: number of steps; defaults to the configured
            offline/online max timesteps.
        initial_state: optional dict with 'qpos', 'qvel' and 'goal' keys
            used to set the simulator state after reset.

    Returns:
        Tuple (ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic, ep_reward,
        ep_qpos, ep_qvel, ep_features, ep_penetration, n_steps); the
        obs/ag/heuristic/features lists include the terminal entry.
    """
    self.env.reset()
    if initial_state:
        # Load initial state if given
        qpos = initial_state['qpos'].copy()
        qvel = initial_state['qvel'].copy()
        goal = initial_state['goal'].copy()
        set_sim_state_and_goal(self.env, qpos, qvel, goal)
    # Data structures
    n_steps = 0
    ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic = [], [], [], [], []
    ep_reward, ep_qpos, ep_qvel, ep_features = [], [], [], []
    ep_penetration = []
    # Start rollout
    observation = self.env.get_obs()
    obs = observation['observation']
    ag = observation['achieved_goal']
    g = observation['desired_goal']
    features = self.env.extract_features(obs, g)
    heuristic = self.controller.heuristic_obs_g(obs, g)
    if rollout_length is None:
        if self.args.offline:
            rollout_length = self.env_params['offline_max_timesteps']
        else:
            rollout_length = self.env_params['max_timesteps']
    for _ in range(rollout_length):
        # Pre-step simulator state, recorded alongside the transition.
        qpos = observation['sim_state'].qpos
        qvel = observation['sim_state'].qvel
        ac, _ = self.controller.act(observation)
        ac_ind = self.env.discrete_actions[tuple(ac)]
        observation_new, rew, _, info = self.env.step(ac)
        penetration = info['penetration']
        n_steps += 1
        if self.kdtrees_set:
            assert self.args.agent == 'rts'
            rew = apply_discrepancy_penalty(observation, ac, rew,
                                            self.controller.discrepancy_fn)
        elif self.residual_dynamics_set:
            assert self.args.agent == 'mbpo' or self.args.agent == 'mbpo_knn' or self.args.agent == 'mbpo_gp'
            # BUG FIX: the residual-corrected observation used to be bound
            # to a separate `next_observation` name and then ignored — the
            # loop kept using the uncorrected `observation_new`. Assign the
            # corrected observation back so it is actually consumed, as the
            # sibling rollout functions do.
            observation_new, rew = apply_dynamics_residual(
                self.env, self.get_residual_dynamics, observation, info, ac,
                observation_new)
            # Refresh the sim-state snapshot on the corrected observation.
            observation_new['sim_state'] = copy.deepcopy(
                self.env.env.sim.get_state())
        obs_new = observation_new['observation']
        ag_new = observation_new['achieved_goal']
        multi_append([
            ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic, ep_reward,
            ep_qpos, ep_qvel, ep_features, ep_penetration
        ], [
            obs.copy(),
            ag.copy(),
            g.copy(), ac_ind, heuristic, rew,
            qpos.copy(),
            qvel.copy(),
            features.copy(), penetration
        ])
        obs = obs_new.copy()
        ag = ag_new.copy()
        observation = copy.deepcopy(observation_new)
        heuristic = self.controller.heuristic_obs_g(obs, g)
        features = self.env.extract_features(obs, g)
    # Append terminal entries so these lists have T+1 elements.
    multi_append(
        [ep_obs, ep_ag, ep_heuristic, ep_features],
        [obs.copy(), ag.copy(), heuristic, features.copy()])
    return ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic, ep_reward, ep_qpos, ep_qvel, ep_features, ep_penetration, n_steps
def learn_online_in_real_world(self, max_timesteps=None):
    """Online real-world learning loop for the residual-dynamics agent.

    Acts in the real environment with the controller, replays each step
    in the planning model to collect (real next obs, model next obs)
    pairs, then updates the residual dynamics and the state-value
    residual after every real step.

    Args:
        max_timesteps: optional cap on real-world steps; the loop also
            ends when the goal is reached.

    Returns:
        Total number of real-world steps taken.
    """
    # If any pre-existing model is given, load it
    if self.args.load_dir:
        self.load()
    # Reset the environment
    observation = self.env.reset()
    # Configure heuristic for controller
    self.controller.reconfigure_heuristic(
        lambda obs: get_state_value_residual(obs, self.preproc_inputs,
                                             self.state_value_residual))
    # Configure dynamics for controller
    self.controller.reconfigure_residual_dynamics(
        self.get_residual_dynamics)
    # Count total number of steps
    total_n_steps = 0
    while True:
        obs = observation['observation']
        g = observation['desired_goal']
        # Pre-step simulator state, used below to replay the step in the model.
        qpos = observation['sim_state'].qpos
        qvel = observation['sim_state'].qvel
        # Get action from controller
        ac, info = self.controller.act(observation)
        # Get discrete action index
        ac_ind = self.env.discrete_actions[tuple(ac)]
        # Get next observation
        next_observation, rew, _, _ = self.env.step(ac)
        # Increment counter
        total_n_steps += 1
        # Stop as soon as the goal is reached.
        if self.env.env._is_success(next_observation['achieved_goal'], g):
            print('REACHED GOAL!')
            break
        if self.args.render:
            self.env.render()
        # Get next obs
        obs_next = next_observation['observation']
        # Get sim next obs: replay the same action from the same state in
        # the planning model to obtain the model's prediction.
        set_sim_state_and_goal(self.planning_env, qpos.copy(), qvel.copy(),
                               g.copy())
        next_observation_sim, _, _, _ = self.planning_env.step(ac)
        obs_sim_next = next_observation_sim['observation']
        # Store transition in real world memory: (obs, goal, action index,
        # sim state, real next obs, model-predicted next obs)
        transition = [
            obs.copy(),
            g.copy(), ac_ind,
            qpos.copy(),
            qvel.copy(),
            obs_next.copy(),
            obs_sim_next.copy()
        ]
        self.memory.store_real_world_transition(transition)
        # Update the dynamics
        dynamics_losses = []
        for _ in range(self.args.n_online_planning_updates):
            # Update dynamics
            loss = self._update_residual_dynamics()
            dynamics_losses.append(loss.item())
        # Update state value residual
        value_loss = self.plan_online_in_model(
            self.args.n_online_planning_updates,
            initial_observation=copy.deepcopy(observation))
        # Log
        logger.record_tabular('n_steps', total_n_steps)
        logger.record_tabular('dynamics_loss', np.mean(dynamics_losses))
        logger.record_tabular('residual_loss', value_loss)
        logger.dump_tabular()
        # Move to next iteration
        observation = copy.deepcopy(next_observation)
        if max_timesteps and total_n_steps >= max_timesteps:
            break
    return total_n_steps