Example #1
    def do_rollouts(self,
                    num_rollouts=1,
                    rollout_length=None,
                    initial_state=None):
        # Data structures
        mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic = [], [], [], [], []
        mb_reward, mb_qpos, mb_qvel, mb_features = [], [], [], []
        mb_penetration = []
        mb_n_steps = 0
        for _ in range(num_rollouts):
            ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic, ep_reward, ep_qpos, ep_qvel, ep_features, ep_penetration, n_steps = self.rollout(
                rollout_length, initial_state)
            multi_append([
                mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward,
                mb_qpos, mb_qvel, mb_features, mb_penetration
            ], [
                ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic, ep_reward,
                ep_qpos, ep_qvel, ep_features, ep_penetration
            ])
            mb_n_steps += n_steps

        return [
            mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos,
            mb_qvel, mb_features, mb_n_steps, mb_penetration
        ]
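
The helper multi_append used throughout these examples is not shown. Its behavior can be inferred from the call sites: append each element of the second list to the corresponding buffer in the first. A minimal sketch under that assumption:

def multi_append(lists, items):
    """Append each item to the corresponding list, element-wise."""
    assert len(lists) == len(items), 'one item per buffer'
    for buf, item in zip(lists, items):
        buf.append(item)

This keeps the per-step bookkeeping of the many parallel episode buffers down to a single call per step.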
Example #2
    def evaluate_real_world(self, residual_parameters):
        # TODO: Parallelize this function
        # Copy parameters to residual
        self.residual.load_state_dict(residual_parameters)
        self.controller.reconfigure_heuristic(self.get_residual)
        self.controller.reconfigure_dynamics(self.get_dynamics_residual)
        mb_obs, mb_actions, mb_qpos, mb_qvel, mb_returns = [], [], [], [], []
        mb_obs_model_next = []
        for traj in range(self.args.num_real_traj_eval):
            ep_obs, ep_actions, ep_qpos, ep_qvel = [], [], [], []
            ep_obs_model_next = []
            current_return = 0.
            observation = set_sim_state_and_goal(self.env,
                                                 self.eval_qpos[traj],
                                                 self.eval_qvel[traj],
                                                 self.eval_goals[traj])
            obs = observation['observation']
            for _ in range(self.env_params['max_timesteps']):
                qpos = observation['sim_state'].qpos.copy()
                qvel = observation['sim_state'].qvel.copy()
                goal = observation['desired_goal'].copy()
                ac, info = self.controller.act(observation)
                observation_new, rew, _, _ = self.env.step(ac)
                if self.args.render:
                    self.env.render()
                # Set model to the same state
                _ = set_sim_state_and_goal(
                    self.planning_env,
                    qpos,
                    qvel,
                    goal,
                )
                model_observation_next, _, _, _ = self.planning_env.step(ac)
                obs_model_next = model_observation_next['observation']
                self.n_real_steps += 1
                obs_new = observation_new['observation']
                multi_append([ep_obs, ep_actions, ep_qpos, ep_qvel, ep_obs_model_next], [
                             obs.copy(), ac.copy(), qpos.copy(), qvel.copy(), obs_model_next.copy()])
                current_return += -rew
                obs = obs_new.copy()
                observation = observation_new

            ep_obs.append(obs.copy())
            multi_append([mb_obs, mb_actions, mb_qpos, mb_qvel, mb_returns, mb_obs_model_next], [
                         ep_obs, ep_actions, ep_qpos, ep_qvel, current_return, ep_obs_model_next])

        mb_obs, mb_actions, mb_qpos, mb_qvel, mb_obs_model_next = (
            np.array(mb_obs), np.array(mb_actions), np.array(mb_qpos),
            np.array(mb_qvel), np.array(mb_obs_model_next))
        self.dynamics_dataset.store_episode(
            [mb_obs, mb_actions, mb_qpos, mb_qvel, mb_obs_model_next])
        self._update_dynamics_normalizer(
            [mb_obs, mb_actions, mb_qpos, mb_qvel, mb_obs_model_next])
        return np.mean(mb_returns)
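
set_sim_state_and_goal is also used but not defined here. Judging by the call sites, it resets a MuJoCo-backed goal environment to a given (qpos, qvel) pair, overwrites the desired goal, and returns the resulting observation dict. A rough sketch, assuming the environment exposes a mujoco_py MjSim as env.sim (the examples sometimes reach it via env.env.sim), stores its goal in env.goal, and implements get_obs() as in Examples #5 and #6; these interfaces are assumptions, not confirmed by the source:

from mujoco_py import MjSimState


def set_sim_state_and_goal(env, qpos, qvel, goal):
    # Rebuild the simulator state with the requested positions/velocities
    old_state = env.sim.get_state()
    new_state = MjSimState(old_state.time, qpos, qvel, old_state.act,
                           old_state.udd_state)
    env.sim.set_state(new_state)
    env.sim.forward()
    # Overwrite the desired goal and return the refreshed observation dict
    env.goal = goal.copy()
    return env.get_obs()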
Example #3
    def collect_trajectories(self, num_traj):
        '''
        Collect trajectories using the controller and the learned residuals
        '''
        logger.debug("Rolling out")
        mb_obs, mb_ag, mb_g, mb_actions, mb_s_h = [], [], [], [], []
        mb_r, mb_qpos, mb_qvel, mb_f = [], [], [], []
        for traj in range(num_traj):
            ep_obs, ep_ag, ep_g, ep_actions, ep_s_h = [], [], [], [], []
            ep_r, ep_qpos, ep_qvel, ep_f = [], [], [], []
            # observation = self.planning_env.reset()
            observation = set_sim_state_and_goal(
                self.planning_env,
                self.eval_qpos[traj],
                self.eval_qvel[traj],
                self.eval_goals[traj],
            )
            obs = observation['observation']
            ag = observation['achieved_goal']
            g = observation['desired_goal']
            s_h = self.controller.heuristic_obs_g(obs, g)
            f = self.planning_env.extract_features(obs, g)
            for _ in range(self.env_params['max_timesteps']):
                qpos = observation['sim_state'].qpos
                qvel = observation['sim_state'].qvel
                ac, info = self.controller.act(observation)
                ac_ind = self.planning_env.discrete_actions[tuple(ac)]
                logger.debug('Heuristic: %s', info['start_node_h'])
                logger.debug('Action: %s', ac)
                observation_new, rew, _, _ = self.planning_env.step(ac)
                # Apply dynamics residual
                observation_new, rew = self.apply_dynamics_residual(
                    observation, ac, observation_new, rew)
                self.n_planning_steps += 1
                obs_new = observation_new['observation']
                ag_new = observation_new['achieved_goal']
                if self.args.render:
                    self.planning_env.render()
                multi_append(
                    [ep_obs, ep_ag, ep_g, ep_actions, ep_s_h, ep_r, ep_qpos,
                     ep_qvel, ep_f],
                    [obs.copy(), ag.copy(), g.copy(), ac_ind, s_h, rew,
                     qpos.copy(), qvel.copy(), f.copy()])
                obs = obs_new.copy()
                ag = ag_new.copy()
                observation = observation_new
                s_h = self.controller.heuristic_obs_g(obs, g)
                f = self.planning_env.extract_features(obs, g)
            multi_append([ep_obs, ep_ag, ep_s_h, ep_f],
                         [obs.copy(), ag.copy(), s_h, f.copy()])
            multi_append(
                [mb_obs, mb_ag, mb_actions, mb_g, mb_s_h, mb_r, mb_qpos,
                 mb_qvel, mb_f],
                [ep_obs, ep_ag, ep_actions, ep_g, ep_s_h, ep_r, ep_qpos,
                 ep_qvel, ep_f])
        mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel, mb_f = (
            np.array(mb_obs), np.array(mb_ag), np.array(mb_g),
            np.array(mb_actions), np.array(mb_s_h), np.array(mb_r),
            np.array(mb_qpos), np.array(mb_qvel), np.array(mb_f))
        self.dataset.store_episode(
            [mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel,
             mb_f])
        # Update normalizer
        self._update_normalizer(
            [mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel,
             mb_f])
Example #4
    def online_rollout(self, initial_observation):
        n_steps = 0
        mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic = [], [], [], [], []
        mb_reward, mb_qpos, mb_qvel, mb_features = [], [], [], []
        mb_penetration = []
        # Set initial state
        observation = copy.deepcopy(initial_observation)
        # Data structures
        r_obs, r_ag, r_g, r_actions, r_heuristic = [], [], [], [], []
        r_reward, r_qpos, r_qvel, r_features = [], [], [], []
        r_penetration = []
        # Start
        obs = observation['observation']
        ag = observation['achieved_goal']
        g = observation['desired_goal']
        qpos = observation['sim_state'].qpos
        qvel = observation['sim_state'].qvel
        set_sim_state_and_goal(self.planning_env, qpos.copy(), qvel.copy(),
                               g.copy())
        features = self.env.extract_features(obs, g)
        heuristic = self.controller.heuristic_obs_g(obs, g)
        for _ in range(self.env_params['max_timesteps']):
            ac, _ = self.controller.act(observation)
            ac_ind = self.env.discrete_actions[tuple(ac)]
            next_observation, rew, _, info = self.planning_env.step(ac)
            penetration = info['penetration']
            if self.args.agent == 'rts':
                rew = apply_discrepancy_penalty(observation, ac, rew,
                                                self.controller.discrepancy_fn)
            if self.args.agent in ('mbpo', 'mbpo_knn'):
                next_observation, rew = apply_dynamics_residual(
                    self.planning_env, self.get_residual_dynamics, observation,
                    info, ac, next_observation)
            n_steps += 1
            # Add to data structures
            multi_append([
                r_obs, r_ag, r_g, r_actions, r_heuristic, r_reward, r_qpos,
                r_qvel, r_features, r_penetration
            ], [
                obs.copy(),
                ag.copy(),
                g.copy(), ac_ind, heuristic, rew,
                qpos.copy(),
                qvel.copy(),
                features.copy(), penetration
            ])
            # Move to next step
            observation = copy.deepcopy(next_observation)
            obs = observation['observation']
            ag = observation['achieved_goal']
            g = observation['desired_goal']
            qpos = observation['sim_state'].qpos
            qvel = observation['sim_state'].qvel
            features = self.env.extract_features(obs, g)
            heuristic = self.controller.heuristic_obs_g(obs, g)
        multi_append(
            [r_obs, r_ag, r_heuristic, r_features],
            [obs.copy(), ag.copy(), heuristic,
             features.copy()])
        multi_append([
            mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos,
            mb_qvel, mb_features, mb_penetration
        ], [
            r_obs, r_ag, r_g, r_actions, r_heuristic, r_reward, r_qpos, r_qvel,
            r_features, r_penetration
        ])

        mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos, mb_qvel, mb_features, mb_penetration = convert_to_list_of_np_arrays(
            [
                mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward,
                mb_qpos, mb_qvel, mb_features, mb_penetration
            ])

        # Store in memory
        self.memory.store_internal_model_rollout([
            mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos,
            mb_qvel, mb_features, mb_penetration
        ])
        # Update normalizer
        self.features_normalizer.update_normalizer([
            mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos,
            mb_qvel, mb_features, mb_penetration
        ], self.sampler)

        return n_steps
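
convert_to_list_of_np_arrays, used here and in Example #6, can be inferred from the unpacking at its call site: each buffer is turned into a NumPy array, order preserved. A minimal sketch under that assumption:

import numpy as np


def convert_to_list_of_np_arrays(lists):
    """Convert each buffer into a NumPy array, preserving order."""
    return [np.array(buf) for buf in lists]

The caller then unpacks the returned list back into the individual mini-batch arrays.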
Example #5
    def rollout(self, rollout_length=None, initial_state=None):
        self.env.reset()
        if initial_state is not None:
            # Load initial state if given
            qpos = initial_state['qpos'].copy()
            qvel = initial_state['qvel'].copy()
            goal = initial_state['goal'].copy()
            set_sim_state_and_goal(self.env, qpos, qvel, goal)

        # Data structures
        n_steps = 0
        ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic = [], [], [], [], []
        ep_reward, ep_qpos, ep_qvel, ep_features = [], [], [], []
        ep_penetration = []

        # Start rollout
        observation = self.env.get_obs()
        obs = observation['observation']
        ag = observation['achieved_goal']
        g = observation['desired_goal']
        features = self.env.extract_features(obs, g)
        heuristic = self.controller.heuristic_obs_g(obs, g)
        if rollout_length is None:
            if self.args.offline:
                rollout_length = self.env_params['offline_max_timesteps']
            else:
                rollout_length = self.env_params['max_timesteps']
        for _ in range(rollout_length):
            qpos = observation['sim_state'].qpos
            qvel = observation['sim_state'].qvel
            ac, _ = self.controller.act(observation)
            ac_ind = self.env.discrete_actions[tuple(ac)]
            observation_new, rew, _, info = self.env.step(ac)
            penetration = info['penetration']
            n_steps += 1
            if self.kdtrees_set:
                assert self.args.agent == 'rts'
                rew = apply_discrepancy_penalty(observation, ac, rew,
                                                self.controller.discrepancy_fn)
            elif self.residual_dynamics_set:
                assert self.args.agent in ('mbpo', 'mbpo_knn', 'mbpo_gp')
                # Correct the model prediction with the learned residual and
                # keep using the corrected observation below
                observation_new, rew = apply_dynamics_residual(
                    self.env, self.get_residual_dynamics, observation, info,
                    ac, observation_new)
                observation_new['sim_state'] = copy.deepcopy(
                    self.env.env.sim.get_state())
            obs_new = observation_new['observation']
            ag_new = observation_new['achieved_goal']
            multi_append([
                ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic, ep_reward,
                ep_qpos, ep_qvel, ep_features, ep_penetration
            ], [
                obs.copy(),
                ag.copy(),
                g.copy(), ac_ind, heuristic, rew,
                qpos.copy(),
                qvel.copy(),
                features.copy(), penetration
            ])
            obs = obs_new.copy()
            ag = ag_new.copy()
            observation = copy.deepcopy(observation_new)
            heuristic = self.controller.heuristic_obs_g(obs, g)
            features = self.env.extract_features(obs, g)

        multi_append(
            [ep_obs, ep_ag, ep_heuristic, ep_features],
            [obs.copy(), ag.copy(), heuristic,
             features.copy()])

        return ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic, ep_reward, ep_qpos, ep_qvel, ep_features, ep_penetration, n_steps
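
apply_dynamics_residual (the free function used in Examples #4 and #5) corrects the internal model's one-step prediction with a learned residual and recomputes the reward. The signature matches the call sites above; the body below is only an illustrative sketch, assuming the residual function maps (observation, action) to an additive correction of the next observation vector:

import copy


def apply_dynamics_residual(env, residual_fn, observation, info, ac,
                            observation_new):
    corrected = copy.deepcopy(observation_new)
    # Assumed: residual_fn predicts the error of the model's next observation
    corrected['observation'] = (observation_new['observation'] +
                                residual_fn(observation, ac))
    # In a full implementation the achieved goal would be re-derived from the
    # corrected observation; that mapping is environment-specific and omitted
    # Recompute the reward with the gym GoalEnv API also used in Example #6
    rew = env.compute_reward(corrected['achieved_goal'],
                             corrected['desired_goal'], info)
    return corrected, rew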
Example #6
    def collect_internal_model_trajectories(self,
                                            num_rollouts,
                                            rollout_length,
                                            initial_observations=None):
        n_steps = 0
        mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic = [], [], [], [], []
        mb_reward, mb_features = [], []
        # Start rollouts
        for n in range(num_rollouts):
            # Set initial state
            if initial_observations is not None:
                observation = copy.deepcopy(initial_observations[n])
            else:
                observation = self.env.get_obs()
            # Data structures
            r_obs, r_ag, r_g, r_actions, r_heuristic = [], [], [], [], []
            r_reward, r_features = [], []
            # Start
            obs = observation['observation']
            ag = observation['achieved_goal']
            g = observation['desired_goal']
            features = self.env.extract_features(obs, g)
            heuristic = self.controller.heuristic_obs_g(obs, g)
            for _ in range(rollout_length):
                ac, _ = self.controller.act(observation)
                ac_ind = self.env.discrete_actions[tuple(ac)]
                # Get the next observation and reward using the learned model
                observation_new = get_next_observation(
                    observation, ac, self.preproc_dynamics_inputs,
                    self.learned_model_dynamics)
                rew = self.env.compute_reward(observation_new['achieved_goal'],
                                              observation_new['desired_goal'],
                                              {})
                n_steps += 1
                # Add to data structures
                multi_append([
                    r_obs, r_ag, r_g, r_actions, r_heuristic, r_reward,
                    r_features
                ], [
                    obs.copy(),
                    ag.copy(),
                    g.copy(), ac_ind, heuristic, rew,
                    features.copy()
                ])
                # Move to next step
                observation = copy.deepcopy(observation_new)
                obs = observation['observation']
                ag = observation['achieved_goal']
                g = observation['desired_goal']
                features = self.env.extract_features(obs, g)
                heuristic = self.controller.heuristic_obs_g(obs, g)
            multi_append(
                [r_obs, r_ag, r_heuristic, r_features],
                [obs.copy(), ag.copy(), heuristic,
                 features.copy()])
            multi_append([
                mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward,
                mb_features
            ], [
                r_obs, r_ag, r_g, r_actions, r_heuristic, r_reward, r_features
            ])

        mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_features = convert_to_list_of_np_arrays(
            [
                mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward,
                mb_features
            ])
        # Store in memory
        self.memory.store_internal_model_rollout(
            [
                mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward,
                mb_features
            ],
            sim=False)
        # Update normalizer
        self._update_normalizer([
            mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward,
            mb_features
        ])

        return n_steps
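
get_next_observation queries the learned dynamics model instead of the simulator. A speculative sketch, assuming preproc_dynamics_inputs builds a torch input from (observation, action), the learned model is a torch module predicting the next observation vector, and the achieved goal is a fixed slice of that vector; none of these interfaces are confirmed by the source:

import copy

import torch


def get_next_observation(observation, ac, preproc_fn, learned_model):
    inputs = preproc_fn(observation, ac)  # assumed to return a torch tensor
    with torch.no_grad():
        # Drop a possible batch dimension before converting to NumPy
        next_obs = learned_model(inputs).squeeze(0).cpu().numpy()
    observation_new = copy.deepcopy(observation)
    observation_new['observation'] = next_obs
    # Hypothetical: treat the leading entries of the observation as the
    # achieved goal; the real extraction is environment-specific
    goal_dim = observation['achieved_goal'].shape[0]
    observation_new['achieved_goal'] = next_obs[:goal_dim]
    return observation_new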