Example #1
    def get_qvalue_obs_ac(self, obs, g, qpos, qvel, ac):
        set_sim_state_and_goal(self.model, qpos.copy(), qvel.copy(), g.copy())
        next_observation, rew, _, _ = self.model.step(np.array(ac))
        next_node = Node(next_observation)
        next_heuristic = self.heuristic(next_node, augment=False)

        return (-rew) + next_heuristic
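
The Q-value above is simply the one-step cost (the negated reward) plus the heuristic of the resulting state. As a rough, hypothetical sketch (not part of the original code), such a function could drive greedy action selection over a discrete action set like the `discrete_actions` used in the other examples:

import numpy as np

# Sketch only: assumes `agent` exposes get_qvalue_obs_ac as in Example #1 and
# that `discrete_actions` is a list of action vectors.
def greedy_action(agent, obs, g, qpos, qvel, discrete_actions):
    # Evaluate the one-step cost-to-go of every candidate action and
    # return the cheapest one.
    qvalues = [agent.get_qvalue_obs_ac(obs, g, qpos, qvel, ac)
               for ac in discrete_actions]
    return discrete_actions[int(np.argmin(qvalues))]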
Example #2
    def evaluate_real_world(self, residual_parameters):
        # TODO: Parallelize this function
        # Copy parameters to residual
        self.residual.load_state_dict(residual_parameters)
        self.controller.reconfigure_heuristic(self.get_residual)
        self.controller.reconfigure_dynamics(self.get_dynamics_residual)
        mb_obs, mb_actions, mb_qpos, mb_qvel, mb_returns = [], [], [], [], []
        mb_obs_model_next = []
        for traj in range(self.args.num_real_traj_eval):
            ep_obs, ep_actions, ep_qpos, ep_qvel = [], [], [], []
            ep_obs_model_next = []
            current_return = 0.
            observation = set_sim_state_and_goal(self.env,
                                                 self.eval_qpos[traj],
                                                 self.eval_qvel[traj],
                                                 self.eval_goals[traj])
            obs = observation['observation']
            for _ in range(self.env_params['max_timesteps']):
                qpos = observation['sim_state'].qpos.copy()
                qvel = observation['sim_state'].qvel.copy()
                goal = observation['desired_goal'].copy()
                ac, info = self.controller.act(observation)
                observation_new, rew, _, _ = self.env.step(ac)
                if self.args.render:
                    self.env.render()
                # Set model to the same state
                _ = set_sim_state_and_goal(
                    self.planning_env,
                    qpos,
                    qvel,
                    goal,
                )
                model_observation_next, _, _, _ = self.planning_env.step(ac)
                obs_model_next = model_observation_next['observation']
                self.n_real_steps += 1
                obs_new = observation_new['observation']
                multi_append([ep_obs, ep_actions, ep_qpos, ep_qvel, ep_obs_model_next], [
                             obs.copy(), ac.copy(), qpos.copy(), qvel.copy(), obs_model_next.copy()])
                current_return += -rew
                obs = obs_new.copy()
                observation = observation_new

            ep_obs.append(obs.copy())
            multi_append([mb_obs, mb_actions, mb_qpos, mb_qvel, mb_returns, mb_obs_model_next], [
                         ep_obs, ep_actions, ep_qpos, ep_qvel, current_return, ep_obs_model_next])

        mb_obs, mb_actions, mb_qpos, mb_qvel, mb_obs_model_next = np.array(mb_obs), np.array(
            mb_actions), np.array(mb_qpos), np.array(mb_qvel), np.array(mb_obs_model_next)
        self.dynamics_dataset.store_episode(
            [mb_obs, mb_actions, mb_qpos, mb_qvel, mb_obs_model_next])
        self._update_dynamics_normalizer(
            [mb_obs, mb_actions, mb_qpos, mb_qvel, mb_obs_model_next])
        return np.mean(mb_returns)
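
For each real step, the method above records both the real trajectory and the next observation predicted by the planning model from the same state and action, and stores the pairs in `self.dynamics_dataset`. A minimal sketch of how such pairs could train a residual dynamics model, assuming the residual simply predicts the gap between real and modeled next observations (the actual loss in the repository may differ):

import torch
import torch.nn.functional as F

# Hypothetical training step; `residual_net` and the tensor arguments are assumptions.
def residual_dynamics_loss(residual_net, obs, ac, obs_real_next, obs_model_next):
    # The residual predicts the correction that maps the model's next
    # observation onto the real next observation.
    pred_correction = residual_net(torch.cat([obs, ac], dim=-1))
    target_correction = obs_real_next - obs_model_next
    return F.mse_loss(pred_correction, target_correction)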
Example #3
    def get_qvalue(self, observation, ac):
        # Set the model state
        set_sim_state_and_goal(self.model,
                               observation['sim_state'].qpos.copy(),
                               observation['sim_state'].qvel.copy(),
                               observation['desired_goal'].copy())
        # Get next observation
        next_observation, rew, _, _ = self.model.step(np.array(ac))
        # Get heuristic of the next observation
        next_node = Node(next_observation)
        next_heuristic = self.heuristic(next_node, augment=False)

        return (-rew) + next_heuristic
Example #4
    def collect_trajectories(self, num_traj):
        '''
        This function collects trajectories based on the controller and learned residuals
        '''
        logger.debug("Rolling out")
        mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel, mb_f = \
            [], [], [], [], [], [], [], [], []
        for traj in range(num_traj):
            ep_obs, ep_ag, ep_g, ep_actions, ep_s_h, ep_r, ep_qpos, ep_qvel, ep_f = \
                [], [], [], [], [], [], [], [], []
            # observation = self.planning_env.reset()
            observation = set_sim_state_and_goal(
                self.planning_env,
                self.eval_qpos[traj],
                self.eval_qvel[traj],
                self.eval_goals[traj],
            )
            obs = observation['observation']
            ag = observation['achieved_goal']
            g = observation['desired_goal']
            s_h = self.controller.heuristic_obs_g(obs, g)
            f = self.planning_env.extract_features(obs, g)
            for _ in range(self.env_params['max_timesteps']):
                qpos = observation['sim_state'].qpos
                qvel = observation['sim_state'].qvel
                ac, info = self.controller.act(observation)
                ac_ind = self.planning_env.discrete_actions[tuple(ac)]
                logger.debug('Heuristic', info['start_node_h'])
                logger.debug('Action', ac)
                observation_new, rew, _, _ = self.planning_env.step(ac)
                # Apply dynamics residual
                observation_new, rew = self.apply_dynamics_residual(
                    observation, ac, observation_new, rew)
                self.n_planning_steps += 1
                obs_new = observation_new['observation']
                ag_new = observation_new['achieved_goal']
                if self.args.render:
                    self.planning_env.render()
                multi_append([ep_obs, ep_ag, ep_g, ep_actions, ep_s_h, ep_r, ep_qpos, ep_qvel, ep_f],
                             [obs.copy(), ag.copy(), g.copy(), ac_ind, s_h, rew, qpos.copy(), qvel.copy(), f.copy()])
                obs = obs_new.copy()
                ag = ag_new.copy()
                observation = observation_new
                s_h = self.controller.heuristic_obs_g(obs, g)
                f = self.planning_env.extract_features(obs, g)
            multi_append([ep_obs, ep_ag, ep_s_h, ep_f],
                         [obs.copy(), ag.copy(), s_h, f.copy()])
            multi_append([mb_obs, mb_ag, mb_actions, mb_g, mb_s_h, mb_r, mb_qpos, mb_qvel, mb_f],
                         [ep_obs, ep_ag, ep_actions, ep_g, ep_s_h, ep_r, ep_qpos, ep_qvel, ep_f])
        mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel, mb_f = \
            np.array(mb_obs), np.array(mb_ag), np.array(mb_g), np.array(mb_actions), \
            np.array(mb_s_h), np.array(mb_r), np.array(mb_qpos), np.array(mb_qvel), np.array(mb_f)
        self.dataset.store_episode(
            [mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel, mb_f])
        # Update normalizer
        self._update_normalizer(
            [mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel, mb_f])
Example #5
    def get_successors(self, node, action):
        obs = node.obs
        set_sim_state_and_goal(self.model, obs['sim_state'].qpos.copy(),
                               obs['sim_state'].qvel.copy(),
                               obs['desired_goal'].copy())
        next_obs, rew, _, info = self.model.step(np.array(action))

        if self.discrepancy_fn is not None:
            rew = apply_discrepancy_penalty(obs, action, rew,
                                            self.discrepancy_fn)

        if self.residual_dynamics_fn is not None:
            next_obs, rew = apply_dynamics_residual(self.model,
                                                    self.residual_dynamics_fn,
                                                    obs, info, action,
                                                    next_obs)

        mj_sim_state = copy.deepcopy(self.model.env.sim.get_state())
        next_obs['sim_state'] = mj_sim_state
        next_node = Node(next_obs)

        return next_node, -rew
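
Because `get_successors` returns the successor node together with its transition cost (the negated reward), it slots directly into a best-first search over the model. The fragment below is only an illustrative sketch of one expansion step, assuming a `heuristic` callable with the signature used in the other examples and a discrete set of `actions`:

import heapq
import itertools

_tie_breaker = itertools.count()  # keeps heap entries comparable when priorities tie

# Sketch only: `search` is assumed to expose get_successors and heuristic as above.
def expand_node(search, node, actions, g_cost, open_list):
    for action in actions:
        succ, step_cost = search.get_successors(node, action)
        # Priority = cost-so-far + step cost + heuristic estimate of the successor.
        priority = g_cost + step_cost + search.heuristic(succ, augment=False)
        heapq.heappush(open_list, (priority, next(_tie_breaker), succ))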
Example #6
    def learn_online_in_real_world(self, max_timesteps=None):
        # If any pre-existing model is given, load it
        if self.args.load_dir:
            self.load()

        # Reset the environment
        observation = self.env.reset()
        # Configure heuristic for controller
        self.controller.reconfigure_heuristic(
            lambda obs: get_state_value_residual(
                obs, self.preproc_inputs, self.state_value_residual))
        # Configure dynamics for controller
        if self.args.agent == 'rts':
            self.controller.reconfigure_discrepancy(
                lambda obs, ac: get_discrepancy_neighbors(
                    obs, ac, self.construct_4d_point, self.kdtrees,
                    self.args.neighbor_radius))

        # Configure dynamics for controller
        if self.args.agent in ('mbpo', 'mbpo_knn', 'mbpo_gp'):
            self.controller.reconfigure_residual_dynamics(
                self.get_residual_dynamics)
        # Count of total number of steps
        total_n_steps = 0
        while True:
            obs = observation['observation']
            g = observation['desired_goal']
            qpos = observation['sim_state'].qpos
            qvel = observation['sim_state'].qvel
            # Get action from the controller
            ac, info = self.controller.act(observation)
            if self.args.agent == 'rts':
                assert self.controller.residual_dynamics_fn is None
            if self.args.agent in ('mbpo', 'mbpo_knn', 'mbpo_gp'):
                assert self.controller.discrepancy_fn is None
            # Get discrete action index
            ac_ind = self.env.discrete_actions[tuple(ac)]
            # Get the next observation
            next_observation, rew, _, _ = self.env.step(ac)
            # if np.array_equal(obs, next_observation['observation']):
            #     import ipdb
            #     ipdb.set_trace()
            # print('ACTION', ac)
            # print('VALUE PREDICTED', info['start_node_h'])
            # print('COST', -rew)
            if self.args.render:
                self.env.render()
            total_n_steps += 1
            # Check if we reached the goal
            if self.env.env._is_success(next_observation['achieved_goal'], g):
                print('REACHED GOAL!')
                break
            # Get the next obs
            obs_next = next_observation['observation']
            # Get the sim next obs
            set_sim_state_and_goal(self.planning_env, qpos.copy(), qvel.copy(),
                                   g.copy())
            next_observation_sim, _, _, _ = self.planning_env.step(ac)
            obs_sim_next = next_observation_sim['observation']
            # Store transition
            transition = [
                obs.copy(),
                g.copy(), ac_ind,
                qpos.copy(),
                qvel.copy(),
                obs_next.copy(),
                obs_sim_next.copy()
            ]
            dynamics_losses = []
            # RTS
            if self.args.agent == 'rts' and self._check_dynamics_transition(
                    transition):
                # print('DISCREPANCY IN DYNAMICS')
                self.memory.store_real_world_transition(transition)
                # Fit model
                self._update_discrepancy_model()
            # MBPO
            elif self.args.agent in ('mbpo', 'mbpo_knn', 'mbpo_gp'):
                self.memory.store_real_world_transition(transition)
                # Update the dynamics
                if self.args.agent == 'mbpo':
                    for _ in range(self.args.n_online_planning_updates):
                        # Update dynamics
                        loss = self._update_batch_residual_dynamics()
                        dynamics_losses.append(loss.item())
                else:
                    loss = self._update_residual_dynamics()
                    dynamics_losses.append(loss)
            # Plan in the model
            value_loss = self.plan_online_in_model(
                n_planning_updates=self.args.n_online_planning_updates,
                initial_observation=copy.deepcopy(observation))

            # Log
            logger.record_tabular('n_steps', total_n_steps)
            if self.args.agent in ('mbpo', 'mbpo_knn', 'mbpo_gp'):
                logger.record_tabular('dynamics loss', np.mean(dynamics_losses))
            logger.record_tabular('residual_loss', value_loss)
            # logger.dump_tabular()
            # Move to next iteration
            observation = copy.deepcopy(next_observation)

            if max_timesteps and total_n_steps >= max_timesteps:
                break

        return total_n_steps
Example #7
    def online_rollout(self, initial_observation):
        n_steps = 0
        mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic = [], [], [], [], []
        mb_reward, mb_qpos, mb_qvel, mb_features = [], [], [], []
        mb_penetration = []
        # Set initial state
        observation = copy.deepcopy(initial_observation)
        # Data structures
        r_obs, r_ag, r_g, r_actions, r_heuristic = [], [], [], [], []
        r_reward, r_qpos, r_qvel, r_features = [], [], [], []
        r_penetration = []
        # Start
        obs = observation['observation']
        ag = observation['achieved_goal']
        g = observation['desired_goal']
        qpos = observation['sim_state'].qpos
        qvel = observation['sim_state'].qvel
        set_sim_state_and_goal(self.planning_env, qpos.copy(), qvel.copy(),
                               g.copy())
        features = self.env.extract_features(obs, g)
        heuristic = self.controller.heuristic_obs_g(obs, g)
        for _ in range(self.env_params['max_timesteps']):
            ac, _ = self.controller.act(observation)
            ac_ind = self.env.discrete_actions[tuple(ac)]
            next_observation, rew, _, info = self.planning_env.step(ac)
            penetration = info['penetration']
            if self.args.agent == 'rts':
                rew = apply_discrepancy_penalty(observation, ac, rew,
                                                self.controller.discrepancy_fn)
            if self.args.agent in ('mbpo', 'mbpo_knn'):
                next_observation, rew = apply_dynamics_residual(
                    self.planning_env, self.get_residual_dynamics, observation,
                    info, ac, next_observation)
            n_steps += 1
            # Add to data structures
            multi_append([
                r_obs, r_ag, r_g, r_actions, r_heuristic, r_reward, r_qpos,
                r_qvel, r_features, r_penetration
            ], [
                obs.copy(),
                ag.copy(),
                g.copy(), ac_ind, heuristic, rew,
                qpos.copy(),
                qvel.copy(),
                features.copy(), penetration
            ])
            # Move to next step
            observation = copy.deepcopy(next_observation)
            obs = observation['observation']
            ag = observation['achieved_goal']
            g = observation['desired_goal']
            qpos = observation['sim_state'].qpos
            qvel = observation['sim_state'].qvel
            features = self.env.extract_features(obs, g)
            heuristic = self.controller.heuristic_obs_g(obs, g)
        multi_append(
            [r_obs, r_ag, r_heuristic, r_features],
            [obs.copy(), ag.copy(), heuristic,
             features.copy()])
        multi_append([
            mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos,
            mb_qvel, mb_features, mb_penetration
        ], [
            r_obs, r_ag, r_g, r_actions, r_heuristic, r_reward, r_qpos, r_qvel,
            r_features, r_penetration
        ])

        mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos, mb_qvel, mb_features, mb_penetration = convert_to_list_of_np_arrays(
            [
                mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward,
                mb_qpos, mb_qvel, mb_features, mb_penetration
            ])

        # Store in memory
        self.memory.store_internal_model_rollout([
            mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos,
            mb_qvel, mb_features, mb_penetration
        ])
        # Update normalizer
        self.features_normalizer.update_normalizer([
            mb_obs, mb_ag, mb_g, mb_actions, mb_heuristic, mb_reward, mb_qpos,
            mb_qvel, mb_features, mb_penetration
        ], self.sampler)

        return n_steps
Example #8
    def rollout(self, rollout_length=None, initial_state=None):
        self.env.reset()
        if initial_state:
            # Load initial state if given
            qpos = initial_state['qpos'].copy()
            qvel = initial_state['qvel'].copy()
            goal = initial_state['goal'].copy()
            set_sim_state_and_goal(self.env, qpos, qvel, goal)

        # Data structures
        n_steps = 0
        ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic = [], [], [], [], []
        ep_reward, ep_qpos, ep_qvel, ep_features = [], [], [], []
        ep_penetration = []

        # Start rollout
        observation = self.env.get_obs()
        obs = observation['observation']
        ag = observation['achieved_goal']
        g = observation['desired_goal']
        features = self.env.extract_features(obs, g)
        heuristic = self.controller.heuristic_obs_g(obs, g)
        if rollout_length is None:
            if self.args.offline:
                rollout_length = self.env_params['offline_max_timesteps']
            else:
                rollout_length = self.env_params['max_timesteps']
        for _ in range(rollout_length):
            qpos = observation['sim_state'].qpos
            qvel = observation['sim_state'].qvel
            ac, _ = self.controller.act(observation)
            ac_ind = self.env.discrete_actions[tuple(ac)]
            observation_new, rew, _, info = self.env.step(ac)
            penetration = info['penetration']
            n_steps += 1
            if self.kdtrees_set:
                assert self.args.agent == 'rts'
                rew = apply_discrepancy_penalty(observation, ac, rew,
                                                self.controller.discrepancy_fn)
            elif self.residual_dynamics_set:
                assert self.args.agent in ('mbpo', 'mbpo_knn', 'mbpo_gp')
                # Use the residual-corrected next observation going forward
                observation_new, rew = apply_dynamics_residual(
                    self.env, self.get_residual_dynamics, observation, info,
                    ac, observation_new)
                observation_new['sim_state'] = copy.deepcopy(
                    self.env.env.sim.get_state())
            obs_new = observation_new['observation']
            ag_new = observation_new['achieved_goal']
            multi_append([
                ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic, ep_reward,
                ep_qpos, ep_qvel, ep_features, ep_penetration
            ], [
                obs.copy(),
                ag.copy(),
                g.copy(), ac_ind, heuristic, rew,
                qpos.copy(),
                qvel.copy(),
                features.copy(), penetration
            ])
            obs = obs_new.copy()
            ag = ag_new.copy()
            observation = copy.deepcopy(observation_new)
            heuristic = self.controller.heuristic_obs_g(obs, g)
            features = self.env.extract_features(obs, g)

        multi_append(
            [ep_obs, ep_ag, ep_heuristic, ep_features],
            [obs.copy(), ag.copy(), heuristic,
             features.copy()])

        return ep_obs, ep_ag, ep_g, ep_actions, ep_heuristic, ep_reward, ep_qpos, ep_qvel, ep_features, ep_penetration, n_steps
Example #9
    def learn_online_in_real_world(self, max_timesteps=None):
        # If any pre-existing model is given, load it
        if self.args.load_dir:
            self.load()

        # Reset the environment
        observation = self.env.reset()
        # Configure heuristic for controller
        self.controller.reconfigure_heuristic(
            lambda obs: get_state_value_residual(
                obs, self.preproc_inputs, self.state_value_residual))

        # Configure dynamics for controller
        self.controller.reconfigure_residual_dynamics(
            self.get_residual_dynamics)

        # Count total number of steps
        total_n_steps = 0
        while True:
            obs = observation['observation']
            g = observation['desired_goal']
            qpos = observation['sim_state'].qpos
            qvel = observation['sim_state'].qvel

            # Get action from controller
            ac, info = self.controller.act(observation)
            # Get discrete action index
            ac_ind = self.env.discrete_actions[tuple(ac)]
            # Get next observation
            next_observation, rew, _, _ = self.env.step(ac)
            # Increment counter
            total_n_steps += 1
            if self.env.env._is_success(next_observation['achieved_goal'], g):
                print('REACHED GOAL!')
                break
            if self.args.render:
                self.env.render()
            # Get next obs
            obs_next = next_observation['observation']
            # Get sim next obs
            set_sim_state_and_goal(self.planning_env, qpos.copy(), qvel.copy(),
                                   g.copy())
            next_observation_sim, _, _, _ = self.planning_env.step(ac)
            obs_sim_next = next_observation_sim['observation']
            # Store transition in real world memory
            transition = [
                obs.copy(),
                g.copy(), ac_ind,
                qpos.copy(),
                qvel.copy(),
                obs_next.copy(),
                obs_sim_next.copy()
            ]
            self.memory.store_real_world_transition(transition)

            # Update the dynamics
            dynamics_losses = []
            for _ in range(self.args.n_online_planning_updates):
                # Update dynamics
                loss = self._update_residual_dynamics()
                dynamics_losses.append(loss.item())

            # Update state value residual
            value_loss = self.plan_online_in_model(
                self.args.n_online_planning_updates,
                initial_observation=copy.deepcopy(observation))
            # log
            logger.record_tabular('n_steps', total_n_steps)
            logger.record_tabular('dynamics_loss', np.mean(dynamics_losses))
            logger.record_tabular('residual_loss', value_loss)
            logger.dump_tabular()

            # Move to next iteration
            observation = copy.deepcopy(next_observation)

            if max_timesteps and total_n_steps >= max_timesteps:
                break

        return total_n_steps