Example No. 1
    def get_qvalue(self, obs, ac):
        # Goal states have zero cost-to-go
        if self.env.check_goal(obs):
            return 0
        cell = obs['observation'].copy()
        goal_cell = obs['desired_goal'].copy()
        # Admissible heuristic estimate of the cost-to-go
        value = compute_heuristic(cell, goal_cell, self.args.goal_threshold)
        features = compute_features(cell, goal_cell, self.env.carry_cell,
                                    self.env.obstacle_cell_aa,
                                    self.env.obstacle_cell_bb,
                                    self.args.grid_size,
                                    self.env._grid_to_continuous)
        features_norm = self.feature_normalizer_q.normalize(features)
        ac_idx = self.actions_index[ac]
        # Learned residual on top of the heuristic for this state-action pair
        residual_state_action_value = get_state_action_value_residual(
            features_norm, ac_idx, self.state_action_value_residual)
        return value + residual_state_action_value
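The Q-value is decomposed as an admissible heuristic plus a learned residual for the chosen action. Below is a minimal sketch of what a helper like get_state_action_value_residual could look like, assuming the residual is a torch module that outputs one value per action (as in Example No. 5); the repository's actual helper may differ.

import torch

def get_state_action_value_residual(features_norm, ac_idx, residual_net):
    # Query the residual network without tracking gradients; planning-time
    # value lookups do not need backprop.
    with torch.no_grad():
        features_tensor = torch.from_numpy(features_norm).float()
        qvalue_residuals = residual_net(features_tensor)
    # Select the residual for the requested action index
    return qvalue_residuals[ac_idx].item()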
Example No. 2
    def get_state_value(self, obs, inflated=False):
        if self.env.check_goal(obs):
            return 0
        cell = obs['observation'].copy()
        goal_cell = obs['desired_goal'].copy()
        value = compute_heuristic(cell, goal_cell, self.args.goal_threshold)
        features = compute_features(cell, goal_cell, self.env.carry_cell,
                                    self.env.obstacle_cell_aa,
                                    self.env.obstacle_cell_bb,
                                    self.args.grid_size,
                                    self.env._grid_to_continuous)
        features_norm = self.feature_normalizer.normalize(features)

        # Use the inflated state value residual if requested
        if inflated:
            state_value_residual = self.inflated_state_value_residual
        else:
            state_value_residual = self.state_value_residual

        residual_value = get_state_value_residual(features_norm,
                                                  state_value_residual)
        return value + residual_value
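Example No. 3 shows how this estimate is plugged into the planner: a closure over get_state_value is registered as the controller's heuristic, so every search node is scored with heuristic plus learned residual. A condensed sketch of that wiring, assuming agent and controller are the objects built in Example No. 3:

# Sketch only: `agent` and `controller` are assumed instances from Example No. 3.
def heuristic(obs):
    return agent.get_state_value(obs, inflated=False)

controller.reconfigure_heuristic(heuristic)
controller.reconfigure_discrepancy(agent.get_discrepancy)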
Example No. 3
    def __init__(self, args):
        self.args = args
        self.env = pr2_7d_xyzrpy_env(args,
                                     mass=0.01,
                                     use_gui=False,
                                     no_dynamics=True)

        self.start_cell = self.env.start_cell
        self.goal_cells = self.env.goal_cells
        self.num_features = compute_features(
            self.start_cell, self.goal_cells[0], self.env.carry_cell,
            self.env.obstacle_cell_aa, self.env.obstacle_cell_bb,
            self.args.grid_size, self.env._grid_to_continuous).shape[0]
        self.representation_size = compute_representation(
            self.start_cell, self.args.grid_size,
            self.env._grid_to_continuous).shape[0]

        if self.args.agent == 'cmax':
            self.controller = pr2_7d_controller(
                self.env, num_expansions=self.args.num_expansions)
        elif self.args.agent == 'cmaxpp':
            self.controller = pr2_7d_q_controller(
                self.env, num_expansions=self.args.num_expansions)
        elif self.args.agent == 'adaptive_cmaxpp':
            self.controller = pr2_7d_q_controller(
                self.env, num_expansions=self.args.num_expansions)
            self.controller_inflated = pr2_7d_controller(
                self.env, num_expansions=self.args.num_expansions)
        elif self.args.agent in ['model', 'knn']:
            self.controller = pr2_7d_model_controller(
                self.env, num_expansions=self.args.num_expansions)
        elif self.args.agent == 'qlearning':
            self.controller = pr2_7d_qlearning_controller(self.env)

        self.actions = self.controller.actions
        self.actions_index = {
            ac: ac_idx for ac_idx, ac in enumerate(self.actions)
        }

        self.state_value_residual = StateValueResidual(
            in_dim=self.num_features, out_dim=1)
        self.state_action_value_residual = StateActionValueResidual(
            in_dim=self.num_features, out_dim=len(self.actions))
        self.inflated_state_value_residual = StateValueResidual(
            in_dim=self.num_features, out_dim=1)

        if self.args.agent == 'model':
            # Global function approximator for dynamics residual
            self.dynamics_residual = DynamicsResidual(
                in_dim=self.representation_size,
                num_actions=len(self.actions),
                out_dim=7)

        if self.args.agent == 'knn':
            # Local function approximators for dynamics residual
            self.knn_dynamics_residuals = [
                KNNDynamicsResidual(in_dim=7,
                                    radius=self.args.knn_radius,
                                    out_dim=7)
                for _ in range(len(self.actions))
            ]

        self.kdtrees = {ac: None for ac in self.actions}
        self.delta = self.args.delta

        self.feature_normalizer = FeatureNormalizer(self.num_features)
        self.feature_normalizer_q = FeatureNormalizer(self.num_features)
        self.representation_normalizer_dyn = FeatureNormalizer(
            self.representation_size)

        # Configure heuristic and discrepancy for controller
        def get_state_value(obs):
            return self.get_state_value(obs, inflated=False)

        self.controller.reconfigure_heuristic(get_state_value)
        self.controller.reconfigure_discrepancy(self.get_discrepancy)
        if self.args.agent in ['cmaxpp', 'adaptive_cmaxpp']:
            self.controller.reconfigure_qvalue_fn(self.get_qvalue)

        if self.args.agent == 'model':
            self.controller.reconfigure_residual_dynamics(
                self.get_dynamics_residual)
        if self.args.agent == 'knn':
            self.controller.reconfigure_residual_dynamics(
                self.get_knn_dynamics_residual)

        # Configure heuristic and discrepancy for controller_inflated
        if self.args.agent == 'adaptive_cmaxpp':

            def get_state_value_inflated(obs):
                return self.get_state_value(obs, inflated=True)

            self.controller_inflated.reconfigure_heuristic(
                get_state_value_inflated)
            self.controller_inflated.reconfigure_discrepancy(
                self.get_discrepancy)
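The constructor only reads a handful of fields from args. A hypothetical way to build it is sketched below; the attribute names are taken from the code above, while the values and the agent class name are placeholders rather than the repository's actual configuration.

from argparse import Namespace

args = Namespace(agent='cmaxpp',       # cmax | cmaxpp | adaptive_cmaxpp | model | knn | qlearning
                 num_expansions=5,     # lookahead budget for the controller
                 grid_size=10,         # discretization used by compute_features
                 goal_threshold=1,     # tolerance used by compute_heuristic
                 delta=1.0,            # discrepancy radius
                 knn_radius=0.5,       # only read by the 'knn' agent
                 n_workers=4)          # ray workers used by the update methods
# agent = PR27DAgent(args)             # hypothetical name for the class above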
Example No. 4
    def update_state_action_value_residual_workers(self):
        if len(self.transition_buffer) == 0:
            # No incorrect transitions yet
            return

        # Sample a batch of transitions
        transitions = self._sample_transition_batch()
        # Get all the next observations as we need to query the controller
        # for their best estimate of cost-to-go
        observations_next = [
            transition['obs_next'] for transition in transitions
        ]
        batch_size = len(observations_next)

        # Split jobs among workers
        num_workers = self.args.n_workers
        if batch_size < num_workers:
            num_workers = batch_size
        num_per_worker = batch_size // num_workers
        # Put state value residual in object store
        state_value_residual_state_dict_id = ray.put(
            self.state_value_target_residual.state_dict())
        # Put kdtrees in object store
        kdtrees_serialized_id = ray.put(pickle.dumps(self.kdtrees))
        # Put feature normalizer in object store
        feature_normalizer_state_dict_id = ray.put(
            self.feature_normalizer.state_dict())
        # Put feature normalizer q in object store
        feature_normalizer_q_state_dict_id = ray.put(
            self.feature_normalizer_q.state_dict())
        # Put state action value target residual in object store
        state_action_value_residual_state_dict_id = ray.put(
            self.state_action_value_target_residual.state_dict())

        results, count = [], 0
        for worker_id in range(num_workers):
            if worker_id == num_workers - 1:
                # last worker takes the remaining load
                num_per_worker = batch_size - count

            # Set parameters
            ray.get(self.workers[worker_id].set_worker_params.remote(
                state_value_residual_state_dict_id, kdtrees_serialized_id,
                feature_normalizer_state_dict_id,
                state_action_value_residual_state_dict_id,
                feature_normalizer_q_state_dict_id))

            # send job
            results.append(self.workers[worker_id].lookahead_batch.remote(
                observations_next[count:count + num_per_worker]))
            # Increment count
            count += num_per_worker
        # Check if all observations have been accounted for
        assert count == batch_size
        # Get all targets
        results = ray.get(results)
        target_infos = [item for sublist in results for item in sublist]

        cells = [
            transition['obs']['observation'] for transition in transitions
        ]
        goal_cells = [
            transition['obs']['desired_goal'] for transition in transitions
        ]
        actions = [transition['ac'] for transition in transitions]
        ac_idxs = np.array([self.actions_index[ac] for ac in actions],
                           dtype=np.int32)
        costs = np.array([transition['cost'] for transition in transitions],
                         dtype=np.float32)
        heuristics = np.array([
            compute_heuristic(cells[i], goal_cells[i],
                              self.args.goal_threshold)
            for i in range(len(cells))
        ],
                              dtype=np.float32)
        features = np.array([
            compute_features(cells[i], goal_cells[i], self.env.carry_cell,
                             self.env.obstacle_cell_aa,
                             self.env.obstacle_cell_bb, self.args.grid_size,
                             self.env._grid_to_continuous)
            for i in range(len(cells))
        ],
                            dtype=np.float32)
        features_norm = self.feature_normalizer_q.normalize(features)

        # Get next state value
        value_next = np.array([info['best_node_f'] for info in target_infos],
                              dtype=np.float32)
        assert value_next.shape[0] == heuristics.shape[0]

        # Compute targets
        targets = costs + value_next
        residual_targets = targets - heuristics
        # Clip the residual targets such that the residual is always positive
        residual_targets = np.maximum(residual_targets, 0)
        # Clip the residual targets so that the residual is not super big
        residual_targets = np.minimum(residual_targets, 20)

        loss = self._fit_state_action_value_residual(features_norm, ac_idxs,
                                                     residual_targets)
        # Update normalizer
        self.feature_normalizer_q.update_normalizer(features)
        return loss
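The batch is split evenly across workers, with the last worker absorbing the remainder. The splitting arithmetic in isolation, as a quick self-contained check:

batch_size, num_workers = 10, 4
num_per_worker = batch_size // num_workers   # 2
count, chunks = 0, []
for worker_id in range(num_workers):
    if worker_id == num_workers - 1:
        # last worker takes the remaining load
        num_per_worker = batch_size - count
    chunks.append((count, count + num_per_worker))
    count += num_per_worker
assert chunks == [(0, 2), (2, 4), (4, 6), (6, 10)]
assert count == batch_size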
Example No. 5
    def update_state_action_value_residual_qlearning(self):
        if len(self.transition_buffer) == 0:
            # No transitions yet
            return

        # Sample a batch of transitions
        transitions = self._sample_transition_batch()

        cells = [
            transition['obs']['observation'] for transition in transitions
        ]
        goal_cells = [
            transition['obs']['desired_goal'] for transition in transitions
        ]
        actions = [transition['ac'] for transition in transitions]
        ac_idxs = np.array([self.actions_index[ac] for ac in actions],
                           dtype=np.int32)
        costs = np.array([transition['cost'] for transition in transitions],
                         dtype=np.float32)
        cells_next = [
            transition['obs_next']['observation'] for transition in transitions
        ]
        goal_cells_next = [
            transition['obs_next']['desired_goal']
            for transition in transitions
        ]
        heuristics = np.array(compute_lookahead_heuristic_using_workers(
            self.workers, cells, goal_cells, actions),
                              dtype=np.float32)
        # heuristics = np.array(
        #     [compute_lookahead_heuristic(cells[i], goal_cells[i], actions[i],
        #                                  self.controller, self.args.goal_threshold)
        #      for i in range(len(cells))]
        # )

        # heuristics_next = np.array(
        #     [compute_heuristic(cells_next[i],
        #                        goal_cells_next[i],
        #                        self.args.goal_threshold) for i in range(len(cells))],
        #     dtype=np.float32)
        heuristics_next = []
        for ac in self.actions:
            heuristics_next.append(
                compute_lookahead_heuristic_using_workers(
                    self.workers, cells_next, goal_cells_next,
                    [ac for _ in range(len(cells_next))]))
            # heuristics_next.append(
            #     [compute_lookahead_heuristic(cells_next[i], goal_cells_next[i],
            #                                  ac, self.controller, self.args.goal_threshold)
            #      for i in range(len(cells_next))]
            # )
        heuristics_next = np.transpose(
            np.array(heuristics_next, dtype=np.float32))
        features = np.array([
            compute_features(cells[i], goal_cells[i], self.env.carry_cell,
                             self.env.obstacle_cell_aa,
                             self.env.obstacle_cell_bb, self.args.grid_size,
                             self.env._grid_to_continuous)
            for i in range(len(cells))
        ],
                            dtype=np.float32)
        features_norm = self.feature_normalizer_q.normalize(features)

        features_next = np.array([
            compute_features(cells_next[i], goal_cells_next[i],
                             self.env.carry_cell, self.env.obstacle_cell_aa,
                             self.env.obstacle_cell_bb, self.args.grid_size,
                             self.env._grid_to_continuous)
            for i in range(len(cells))
        ],
                                 dtype=np.float32)
        features_next_norm = self.feature_normalizer_q.normalize(features_next)

        # Compute next state value using the target state action value residual
        features_next_norm_tensor = torch.from_numpy(features_next_norm)
        with torch.no_grad():
            qvalues_target_residual_next = self.state_action_value_target_residual(
                features_next_norm_tensor).detach().numpy()
            # Double Q-learning update
            qvalues_residual_next = self.state_action_value_residual(
                features_next_norm_tensor).detach().numpy()
            target_ac = np.argmin(qvalues_residual_next + heuristics_next,
                                  axis=1)
            qvalues_target_residual_next_chosen = np.take_along_axis(
                qvalues_target_residual_next, target_ac.reshape(-1, 1),
                axis=1).squeeze()
            heuristics_next_chosen = np.take_along_axis(heuristics_next,
                                                        target_ac.reshape(
                                                            -1, 1),
                                                        axis=1).squeeze()
            qvalues_target_next = qvalues_target_residual_next_chosen + \
                heuristics_next_chosen

        # Compute targets
        targets = costs + qvalues_target_next
        residual_targets = targets - heuristics
        # Clip the residual targets such that the residual is always positive
        residual_targets = np.maximum(residual_targets, 0)
        # Clip the residual targets so that the residual is not super big
        residual_targets = np.minimum(residual_targets, 20)

        loss = self._fit_state_action_value_residual(features_norm, ac_idxs,
                                                     residual_targets)
        # Update normalizer
        self.feature_normalizer_q.update_normalizer(features)
        self.feature_normalizer_q.update_normalizer(features_next)

        return loss
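The block under torch.no_grad is a double Q-learning style update on the residual: the online residual selects the next action (argmin of residual plus heuristic, since these quantities are costs-to-go), and the target residual is evaluated at that action. The numpy selection step in isolation, with made-up numbers:

import numpy as np

qvalues_residual_next = np.array([[1.0, 0.5], [0.2, 0.9]], dtype=np.float32)
heuristics_next = np.array([[3.0, 4.0], [2.0, 1.0]], dtype=np.float32)
qvalues_target_residual_next = np.array([[1.1, 0.4], [0.3, 0.8]], dtype=np.float32)

# Online network picks the lowest estimated cost-to-go per row
target_ac = np.argmin(qvalues_residual_next + heuristics_next, axis=1)   # [0, 1]
# Target network is evaluated at the selected actions
chosen = np.take_along_axis(qvalues_target_residual_next,
                            target_ac.reshape(-1, 1), axis=1).squeeze()  # [1.1, 0.8]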
Example No. 6
    def update_state_action_value_residual(self):
        if len(self.transition_buffer) == 0:
            # No transitions yet
            return
        # Sample a batch of transitions
        transitions = self._sample_transition_batch()

        cells = [
            transition['obs']['observation'] for transition in transitions
        ]
        goal_cells = [
            transition['obs']['desired_goal'] for transition in transitions
        ]
        actions = [transition['ac'] for transition in transitions]
        ac_idxs = np.array([self.actions_index[ac] for ac in actions],
                           dtype=np.int32)
        costs = np.array([transition['cost'] for transition in transitions],
                         dtype=np.float32)
        cells_next = [
            transition['obs_next']['observation'] for transition in transitions
        ]
        goal_cells_next = [
            transition['obs_next']['desired_goal']
            for transition in transitions
        ]
        heuristics = np.array([
            compute_heuristic(cells[i], goal_cells[i],
                              self.args.goal_threshold)
            for i in range(len(cells))
        ],
                              dtype=np.float32)
        heuristics_next = np.array([
            compute_heuristic(cells_next[i], goal_cells_next[i],
                              self.args.goal_threshold)
            for i in range(len(cells))
        ],
                                   dtype=np.float32)
        features = np.array([
            compute_features(cells[i], goal_cells[i], self.env.carry_cell,
                             self.env.obstacle_cell_aa,
                             self.env.obstacle_cell_bb, self.args.grid_size,
                             self.env._grid_to_continuous)
            for i in range(len(cells))
        ],
                            dtype=np.float32)
        features_norm = self.feature_normalizer_q.normalize(features)

        features_next = np.array([
            compute_features(cells_next[i], goal_cells_next[i],
                             self.env.carry_cell, self.env.obstacle_cell_aa,
                             self.env.obstacle_cell_bb, self.args.grid_size,
                             self.env._grid_to_continuous)
            for i in range(len(cells))
        ],
                                 dtype=np.float32)
        features_next_norm = self.feature_normalizer.normalize(features_next)

        # Compute next state value
        features_next_norm_tensor = torch.from_numpy(features_next_norm)
        with torch.no_grad():
            residual_next_tensor = self.state_value_target_residual(
                features_next_norm_tensor)
            residual_next = residual_next_tensor.detach().numpy().squeeze()
        value_next = residual_next + heuristics_next

        # Compute targets
        targets = costs + value_next
        residual_targets = targets - heuristics
        # Clip the residual targets such that the residual is always positive
        residual_targets = np.maximum(residual_targets, 0)
        # Clip the residual targets so that the residual is not super big
        residual_targets = np.minimum(residual_targets, 20)

        loss = self._fit_state_action_value_residual(features_norm, ac_idxs,
                                                     residual_targets)
        # Update normalizer
        self.feature_normalizer_q.update_normalizer(features)
        self.feature_normalizer.update_normalizer(features_next)

        return loss
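As in the other update methods, the residual target is clamped to be non-negative and capped at 20; the two element-wise operations are equivalent to a single np.clip. A tiny self-contained check:

import numpy as np

costs = np.array([1.0, 1.0], dtype=np.float32)
value_next = np.array([5.0, 30.0], dtype=np.float32)
heuristics = np.array([8.0, 4.0], dtype=np.float32)

clipped_twice = np.minimum(np.maximum(costs + value_next - heuristics, 0), 20)
clipped_once = np.clip(costs + value_next - heuristics, 0, 20)
assert np.array_equal(clipped_twice, clipped_once)   # both give [0., 20.]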
Example No. 7
    def update_state_value_residual(self, inflated=False):
        # Sample batch of states
        observations = self._sample_batch(inflated)
        batch_size = len(observations)

        num_workers = self.args.n_workers
        if batch_size < num_workers:
            num_workers = batch_size
        num_per_worker = batch_size // num_workers
        # Put state value target residual in object store
        state_value_residual_state_dict_id = ray.put(
            self.state_value_target_residual.state_dict())
        # Put kdtrees in object store
        kdtrees_serialized_id = ray.put(pickle.dumps(self.kdtrees))
        # Put feature normalizer in object store
        feature_normalizer_state_dict_id = ray.put(
            self.feature_normalizer.state_dict())

        if self.args.agent in ['cmaxpp', 'adaptive_cmaxpp']:
            # Put feature normalizer q in object store
            feature_normalizer_q_state_dict_id = ray.put(
                self.feature_normalizer_q.state_dict())
            # Put state action value target residual in object store
            state_action_value_residual_state_dict_id = ray.put(
                self.state_action_value_target_residual.state_dict())
        else:
            feature_normalizer_q_state_dict_id = None
            state_action_value_residual_state_dict_id = None

        if self.args.agent == 'adaptive_cmaxpp':
            # Put inflated state value target residual in object store
            inflated_state_value_residual_state_dict_id = ray.put(
                self.inflated_state_value_target_residual.state_dict())
        else:
            inflated_state_value_residual_state_dict_id = None

        if self.args.agent == 'model':
            dynamics_residual_state_dict_id = ray.put(
                self.dynamics_residual.state_dict())
            representation_normalizer_dyn_state_dict_id = ray.put(
                self.representation_normalizer_dyn.state_dict())
        else:
            dynamics_residual_state_dict_id = None
            representation_normalizer_dyn_state_dict_id = None

        if self.args.agent == 'knn':
            knn_dynamics_residuals_serialized_id = ray.put(
                pickle.dumps(self.knn_dynamics_residuals))
        else:
            knn_dynamics_residuals_serialized_id = None

        results, count = [], 0
        for worker_id in range(num_workers):
            if worker_id == num_workers - 1:
                # last worker takes the remaining load
                num_per_worker = batch_size - count

            # Set parameters
            ray.get(self.workers[worker_id].set_worker_params.remote(
                state_value_residual_state_dict_id, kdtrees_serialized_id,
                feature_normalizer_state_dict_id,
                state_action_value_residual_state_dict_id,
                feature_normalizer_q_state_dict_id,
                inflated_state_value_residual_state_dict_id,
                dynamics_residual_state_dict_id,
                knn_dynamics_residuals_serialized_id,
                representation_normalizer_dyn_state_dict_id))

            # send job
            results.append(self.workers[worker_id].lookahead_batch.remote(
                observations[count:count + num_per_worker], inflated))
            # Increment count
            count += num_per_worker
        # Check if all observations have been accounted for
        assert count == batch_size
        # Get all targets
        results = ray.get(results)
        target_infos = [item for sublist in results for item in sublist]

        cells = [
            k.obs['observation'].copy() for info in target_infos
            for k in info['closed']
        ]
        intended_goals = [
            k.obs['desired_goal'].copy() for info in target_infos
            for k in info['closed']
        ]
        assert len(cells) == len(intended_goals)
        heuristics = np.array([
            compute_heuristic(cells[i], intended_goals[i],
                              self.args.goal_threshold)
            for i in range(len(cells))
        ],
                              dtype=np.float32)
        targets = np.array([
            info['best_node_f'] - k._g for info in target_infos
            for k in info['closed']
        ],
                           dtype=np.float32)
        residual_targets = targets - heuristics
        # Clip the residual targets such that the residual is always positive
        residual_targets = np.maximum(residual_targets, 0)
        # Clip the residual targets so that the residual is not super big
        residual_targets = np.minimum(residual_targets, 20)

        # Compute features of the cell
        features = np.array([
            compute_features(cells[i], intended_goals[i], self.env.carry_cell,
                             self.env.obstacle_cell_aa,
                             self.env.obstacle_cell_bb, self.args.grid_size,
                             self.env._grid_to_continuous)
            for i in range(len(cells))
        ],
                            dtype=np.float32)
        features_norm = self.feature_normalizer.normalize(features)

        loss = self._fit_state_value_residual(features_norm, residual_targets,
                                              inflated)
        # Update target network
        # if not inflated:
        #     self._update_target_network(self.state_value_target_residual,
        #                                 self.state_value_residual)
        # else:
        #     self._update_target_network(self.inflated_state_value_target_residual,
        #                                 self.inflated_state_value_residual)
        # Update normalizer
        self.feature_normalizer.update_normalizer(features)
        return loss
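ray.put is used so that each large object (network state dicts, pickled kd-trees, normalizer statistics) is serialized into the object store once and only a reference is shipped to every worker. A minimal, self-contained illustration of that pattern, not the repository's worker API:

import ray

ray.init(ignore_reinit_error=True)

@ray.remote
def lookahead_batch(shared_params, batch):
    # Ray resolves the object-store reference to its value before the call
    return [len(shared_params) + x for x in batch]

shared_params_id = ray.put(list(range(1000)))
results = [lookahead_batch.remote(shared_params_id, chunk)
           for chunk in ([0, 1], [2, 3])]
print(ray.get(results))   # [[1000, 1001], [1002, 1003]]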
Example No. 8
    def get_num_features(self):
        # Feature dimensionality is inferred from one call to compute_features
        start_cell_features = compute_features(
            self.start_cell, self.goal_cells[0], self.env.carry_cell,
            self.env.obstacle_cell_aa, self.env.obstacle_cell_bb,
            self.args.grid_size, self.env._grid_to_continuous)
        return start_cell_features.shape[0]