def get_qvalue(self, obs, ac):
    """Q-value estimate: heuristic plus the learned state-action residual."""
    if self.env.check_goal(obs):
        return 0
    cell = obs['observation'].copy()
    goal_cell = obs['desired_goal'].copy()
    value = compute_heuristic(cell, goal_cell, self.args.goal_threshold)
    features = compute_features(cell, goal_cell, self.env.carry_cell,
                                self.env.obstacle_cell_aa,
                                self.env.obstacle_cell_bb,
                                self.args.grid_size,
                                self.env._grid_to_continuous)
    features_norm = self.feature_normalizer_q.normalize(features)
    ac_idx = self.actions_index[ac]
    residual_state_action_value = get_state_action_value_residual(
        features_norm, ac_idx, self.state_action_value_residual)
    return value + residual_state_action_value
def get_state_value(self, obs, inflated=False):
    """State value estimate: heuristic plus the learned state value
    residual (optionally the inflated one)."""
    if self.env.check_goal(obs):
        return 0
    cell = obs['observation'].copy()
    goal_cell = obs['desired_goal'].copy()
    value = compute_heuristic(cell, goal_cell, self.args.goal_threshold)
    features = compute_features(cell, goal_cell, self.env.carry_cell,
                                self.env.obstacle_cell_aa,
                                self.env.obstacle_cell_bb,
                                self.args.grid_size,
                                self.env._grid_to_continuous)
    features_norm = self.feature_normalizer.normalize(features)
    # Use the inflated residual if requested
    if inflated:
        state_value_residual = self.inflated_state_value_residual
    else:
        state_value_residual = self.state_value_residual
    residual_value = get_state_value_residual(features_norm,
                                              state_value_residual)
    return value + residual_value
def __init__(self, args):
    self.args = args
    self.env = pr2_7d_xyzrpy_env(args, mass=0.01, use_gui=False,
                                 no_dynamics=True)
    self.start_cell = self.env.start_cell
    self.goal_cells = self.env.goal_cells
    self.num_features = compute_features(
        self.start_cell, self.goal_cells[0], self.env.carry_cell,
        self.env.obstacle_cell_aa, self.env.obstacle_cell_bb,
        self.args.grid_size, self.env._grid_to_continuous).shape[0]
    self.representation_size = compute_representation(
        self.start_cell, self.args.grid_size,
        self.env._grid_to_continuous).shape[0]

    # Pick the controller based on the agent type
    if self.args.agent == 'cmax':
        self.controller = pr2_7d_controller(
            self.env, num_expansions=self.args.num_expansions)
    elif self.args.agent == 'cmaxpp':
        self.controller = pr2_7d_q_controller(
            self.env, num_expansions=self.args.num_expansions)
    elif self.args.agent == 'adaptive_cmaxpp':
        self.controller = pr2_7d_q_controller(
            self.env, num_expansions=self.args.num_expansions)
        self.controller_inflated = pr2_7d_controller(
            self.env, num_expansions=self.args.num_expansions)
    elif self.args.agent in ['model', 'knn']:
        self.controller = pr2_7d_model_controller(
            self.env, num_expansions=self.args.num_expansions)
    elif self.args.agent == 'qlearning':
        self.controller = pr2_7d_qlearning_controller(self.env)

    self.actions = self.controller.actions
    self.actions_index = {}
    for ac_idx in range(len(self.actions)):
        self.actions_index[self.actions[ac_idx]] = ac_idx

    # Residual networks on top of the heuristic
    self.state_value_residual = StateValueResidual(
        in_dim=self.num_features, out_dim=1)
    self.state_action_value_residual = StateActionValueResidual(
        in_dim=self.num_features, out_dim=len(self.actions))
    self.inflated_state_value_residual = StateValueResidual(
        in_dim=self.num_features, out_dim=1)

    if self.args.agent == 'model':
        # Global function approximator for dynamics residual
        self.dynamics_residual = DynamicsResidual(
            in_dim=self.representation_size,
            num_actions=len(self.actions),
            out_dim=7)
    if self.args.agent == 'knn':
        # Local function approximators for dynamics residual
        self.knn_dynamics_residuals = [
            KNNDynamicsResidual(in_dim=7, radius=self.args.knn_radius,
                                out_dim=7)
            for _ in range(len(self.actions))
        ]

    # One kd-tree of discovered discrepancies per action
    self.kdtrees = {}
    for ac in self.actions:
        self.kdtrees[ac] = None
    self.delta = self.args.delta

    self.feature_normalizer = FeatureNormalizer(self.num_features)
    self.feature_normalizer_q = FeatureNormalizer(self.num_features)
    self.representation_normalizer_dyn = FeatureNormalizer(
        self.representation_size)

    # Configure heuristic and discrepancy for controller
    def get_state_value(obs):
        return self.get_state_value(obs, inflated=False)
    self.controller.reconfigure_heuristic(get_state_value)
    self.controller.reconfigure_discrepancy(self.get_discrepancy)
    if self.args.agent in ['cmaxpp', 'adaptive_cmaxpp']:
        self.controller.reconfigure_qvalue_fn(self.get_qvalue)
    if self.args.agent == 'model':
        self.controller.reconfigure_residual_dynamics(
            self.get_dynamics_residual)
    if self.args.agent == 'knn':
        self.controller.reconfigure_residual_dynamics(
            self.get_knn_dynamics_residual)

    # Configure heuristic and discrepancy for controller_inflated
    if self.args.agent == 'adaptive_cmaxpp':
        def get_state_value_inflated(obs):
            return self.get_state_value(obs, inflated=True)
        self.controller_inflated.reconfigure_heuristic(
            get_state_value_inflated)
        self.controller_inflated.reconfigure_discrepancy(
            self.get_discrepancy)
def update_state_action_value_residual_workers(self):
    if len(self.transition_buffer) == 0:
        # No incorrect transitions yet
        return
    # Sample a batch of transitions
    transitions = self._sample_transition_batch()
    # Get all the next observations, as we need to query the controller
    # for their best estimate of cost-to-go
    observations_next = [
        transition['obs_next'] for transition in transitions
    ]
    batch_size = len(observations_next)
    # Split jobs among workers
    num_workers = self.args.n_workers
    if batch_size < num_workers:
        num_workers = batch_size
    num_per_worker = batch_size // num_workers
    # Put state value target residual in object store
    state_value_residual_state_dict_id = ray.put(
        self.state_value_target_residual.state_dict())
    # Put kdtrees in object store
    kdtrees_serialized_id = ray.put(pickle.dumps(self.kdtrees))
    # Put feature normalizer in object store
    feature_normalizer_state_dict_id = ray.put(
        self.feature_normalizer.state_dict())
    # Put feature normalizer q in object store
    feature_normalizer_q_state_dict_id = ray.put(
        self.feature_normalizer_q.state_dict())
    # Put state action value target residual in object store
    state_action_value_residual_state_dict_id = ray.put(
        self.state_action_value_target_residual.state_dict())

    results, count = [], 0
    for worker_id in range(num_workers):
        if worker_id == num_workers - 1:
            # Last worker takes the remaining load
            num_per_worker = batch_size - count
        # Set parameters
        ray.get(self.workers[worker_id].set_worker_params.remote(
            state_value_residual_state_dict_id, kdtrees_serialized_id,
            feature_normalizer_state_dict_id,
            state_action_value_residual_state_dict_id,
            feature_normalizer_q_state_dict_id))
        # Send job
        results.append(self.workers[worker_id].lookahead_batch.remote(
            observations_next[count:count + num_per_worker]))
        # Increment count
        count += num_per_worker
    # Check that all observations have been accounted for
    assert count == batch_size
    # Get all targets
    results = ray.get(results)
    target_infos = [item for sublist in results for item in sublist]

    cells = [
        transition['obs']['observation'] for transition in transitions
    ]
    goal_cells = [
        transition['obs']['desired_goal'] for transition in transitions
    ]
    actions = [transition['ac'] for transition in transitions]
    ac_idxs = np.array([self.actions_index[ac] for ac in actions],
                       dtype=np.int32)
    costs = np.array([transition['cost'] for transition in transitions],
                     dtype=np.float32)
    heuristics = np.array([
        compute_heuristic(cells[i], goal_cells[i],
                          self.args.goal_threshold)
        for i in range(len(cells))
    ], dtype=np.float32)
    features = np.array([
        compute_features(cells[i], goal_cells[i], self.env.carry_cell,
                         self.env.obstacle_cell_aa,
                         self.env.obstacle_cell_bb, self.args.grid_size,
                         self.env._grid_to_continuous)
        for i in range(len(cells))
    ], dtype=np.float32)
    features_norm = self.feature_normalizer_q.normalize(features)
    # Get next state value
    value_next = np.array([info['best_node_f'] for info in target_infos],
                          dtype=np.float32)
    assert value_next.shape[0] == heuristics.shape[0]
    # Compute targets
    targets = costs + value_next
    residual_targets = targets - heuristics
    # Clip the residual targets from below so the residual stays
    # non-negative
    residual_targets = np.maximum(residual_targets, 0)
    # Clip the residual targets from above so the residual does not
    # blow up
    residual_targets = np.minimum(residual_targets, 20)
    loss = self._fit_state_action_value_residual(features_norm, ac_idxs,
                                                 residual_targets)
    # Update normalizer
    self.feature_normalizer_q.update_normalizer(features)
    return loss
def update_state_action_value_residual_qlearning(self):
    if len(self.transition_buffer) == 0:
        # No transitions yet
        return
    # Sample a batch of transitions
    transitions = self._sample_transition_batch()
    cells = [
        transition['obs']['observation'] for transition in transitions
    ]
    goal_cells = [
        transition['obs']['desired_goal'] for transition in transitions
    ]
    actions = [transition['ac'] for transition in transitions]
    ac_idxs = np.array([self.actions_index[ac] for ac in actions],
                       dtype=np.int32)
    costs = np.array([transition['cost'] for transition in transitions],
                     dtype=np.float32)
    cells_next = [
        transition['obs_next']['observation']
        for transition in transitions
    ]
    goal_cells_next = [
        transition['obs_next']['desired_goal']
        for transition in transitions
    ]
    heuristics = np.array(compute_lookahead_heuristic_using_workers(
        self.workers, cells, goal_cells, actions), dtype=np.float32)
    # heuristics = np.array(
    #     [compute_lookahead_heuristic(cells[i], goal_cells[i],
    #                                  actions[i], self.controller,
    #                                  self.args.goal_threshold)
    #      for i in range(len(cells))])
    # heuristics_next = np.array(
    #     [compute_heuristic(cells_next[i], goal_cells_next[i],
    #                        self.args.goal_threshold)
    #      for i in range(len(cells))], dtype=np.float32)
    heuristics_next = []
    for ac in self.actions:
        heuristics_next.append(
            compute_lookahead_heuristic_using_workers(
                self.workers, cells_next, goal_cells_next,
                [ac for _ in range(len(cells_next))]))
        # heuristics_next.append(
        #     [compute_lookahead_heuristic(cells_next[i],
        #                                  goal_cells_next[i], ac,
        #                                  self.controller,
        #                                  self.args.goal_threshold)
        #      for i in range(len(cells_next))])
    heuristics_next = np.transpose(
        np.array(heuristics_next, dtype=np.float32))
    features = np.array([
        compute_features(cells[i], goal_cells[i], self.env.carry_cell,
                         self.env.obstacle_cell_aa,
                         self.env.obstacle_cell_bb, self.args.grid_size,
                         self.env._grid_to_continuous)
        for i in range(len(cells))
    ], dtype=np.float32)
    features_norm = self.feature_normalizer_q.normalize(features)
    features_next = np.array([
        compute_features(cells_next[i], goal_cells_next[i],
                         self.env.carry_cell, self.env.obstacle_cell_aa,
                         self.env.obstacle_cell_bb, self.args.grid_size,
                         self.env._grid_to_continuous)
        for i in range(len(cells_next))
    ], dtype=np.float32)
    features_next_norm = self.feature_normalizer_q.normalize(
        features_next)
    # Compute next state value using the target state action value
    # residual
    features_next_norm_tensor = torch.from_numpy(features_next_norm)
    with torch.no_grad():
        qvalues_target_residual_next = \
            self.state_action_value_target_residual(
                features_next_norm_tensor).detach().numpy()
        # Double Q-learning update: choose the next action with the
        # online network, evaluate it with the target network
        qvalues_residual_next = self.state_action_value_residual(
            features_next_norm_tensor).detach().numpy()
    target_ac = np.argmin(qvalues_residual_next + heuristics_next,
                          axis=1)
    qvalues_target_residual_next_chosen = np.take_along_axis(
        qvalues_target_residual_next, target_ac.reshape(-1, 1),
        axis=1).squeeze()
    heuristics_next_chosen = np.take_along_axis(
        heuristics_next, target_ac.reshape(-1, 1), axis=1).squeeze()
    qvalues_target_next = (qvalues_target_residual_next_chosen +
                           heuristics_next_chosen)
    # Compute targets
    targets = costs + qvalues_target_next
    residual_targets = targets - heuristics
    # Clip the residual targets from below so the residual stays
    # non-negative
    residual_targets = np.maximum(residual_targets, 0)
    # Clip the residual targets from above so the residual does not
    # blow up
    residual_targets = np.minimum(residual_targets, 20)
    loss = self._fit_state_action_value_residual(features_norm, ac_idxs,
                                                 residual_targets)
    # Update normalizers
    self.feature_normalizer_q.update_normalizer(features)
    self.feature_normalizer_q.update_normalizer(features_next)
    return loss
def update_state_action_value_residual(self):
    if len(self.transition_buffer) == 0:
        # No transitions yet
        return
    # Sample a batch of transitions
    transitions = self._sample_transition_batch()
    cells = [
        transition['obs']['observation'] for transition in transitions
    ]
    goal_cells = [
        transition['obs']['desired_goal'] for transition in transitions
    ]
    actions = [transition['ac'] for transition in transitions]
    ac_idxs = np.array([self.actions_index[ac] for ac in actions],
                       dtype=np.int32)
    costs = np.array([transition['cost'] for transition in transitions],
                     dtype=np.float32)
    cells_next = [
        transition['obs_next']['observation']
        for transition in transitions
    ]
    goal_cells_next = [
        transition['obs_next']['desired_goal']
        for transition in transitions
    ]
    heuristics = np.array([
        compute_heuristic(cells[i], goal_cells[i],
                          self.args.goal_threshold)
        for i in range(len(cells))
    ], dtype=np.float32)
    heuristics_next = np.array([
        compute_heuristic(cells_next[i], goal_cells_next[i],
                          self.args.goal_threshold)
        for i in range(len(cells_next))
    ], dtype=np.float32)
    features = np.array([
        compute_features(cells[i], goal_cells[i], self.env.carry_cell,
                         self.env.obstacle_cell_aa,
                         self.env.obstacle_cell_bb, self.args.grid_size,
                         self.env._grid_to_continuous)
        for i in range(len(cells))
    ], dtype=np.float32)
    features_norm = self.feature_normalizer_q.normalize(features)
    features_next = np.array([
        compute_features(cells_next[i], goal_cells_next[i],
                         self.env.carry_cell, self.env.obstacle_cell_aa,
                         self.env.obstacle_cell_bb, self.args.grid_size,
                         self.env._grid_to_continuous)
        for i in range(len(cells_next))
    ], dtype=np.float32)
    # The next state value comes from the state value residual, so use
    # its normalizer
    features_next_norm = self.feature_normalizer.normalize(features_next)
    # Compute next state value
    features_next_norm_tensor = torch.from_numpy(features_next_norm)
    with torch.no_grad():
        residual_next_tensor = self.state_value_target_residual(
            features_next_norm_tensor)
    residual_next = residual_next_tensor.detach().numpy().squeeze()
    value_next = residual_next + heuristics_next
    # Compute targets
    targets = costs + value_next
    residual_targets = targets - heuristics
    # Clip the residual targets from below so the residual stays
    # non-negative
    residual_targets = np.maximum(residual_targets, 0)
    # Clip the residual targets from above so the residual does not
    # blow up
    residual_targets = np.minimum(residual_targets, 20)
    loss = self._fit_state_action_value_residual(features_norm, ac_idxs,
                                                 residual_targets)
    # Update normalizers
    self.feature_normalizer_q.update_normalizer(features)
    self.feature_normalizer.update_normalizer(features_next)
    return loss
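# `_fit_state_action_value_residual`, used by all three update functions
# above, is not part of this excerpt. Below is a minimal sketch of one
# plausible implementation: an MSE regression of the predicted residual for
# the taken action onto the clipped targets. The optimizer attribute name
# `state_action_value_residual_optimizer` and every other detail here are
# assumptions for illustration, not the confirmed implementation.
def _fit_state_action_value_residual(self, features_norm, ac_idxs,
                                     residual_targets):
    features_tensor = torch.from_numpy(features_norm)
    targets_tensor = torch.from_numpy(residual_targets).float()
    # Indices of the taken actions, shaped (batch_size, 1) for gather
    ac_idxs_tensor = torch.from_numpy(ac_idxs).long().reshape(-1, 1)
    # Predicted residuals for all actions; select the taken action's column
    predicted = self.state_action_value_residual(features_tensor)
    predicted_chosen = predicted.gather(1, ac_idxs_tensor).squeeze(1)
    loss = torch.nn.functional.mse_loss(predicted_chosen, targets_tensor)
    self.state_action_value_residual_optimizer.zero_grad()
    loss.backward()
    self.state_action_value_residual_optimizer.step()
    return loss.item()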
def update_state_value_residual(self, inflated=False):
    # Sample batch of states
    observations = self._sample_batch(inflated)
    batch_size = len(observations)
    # Split jobs among workers
    num_workers = self.args.n_workers
    if batch_size < num_workers:
        num_workers = batch_size
    num_per_worker = batch_size // num_workers
    # Put state value target residual in object store
    state_value_residual_state_dict_id = ray.put(
        self.state_value_target_residual.state_dict())
    # Put kdtrees in object store
    kdtrees_serialized_id = ray.put(pickle.dumps(self.kdtrees))
    # Put feature normalizer in object store
    feature_normalizer_state_dict_id = ray.put(
        self.feature_normalizer.state_dict())
    if self.args.agent in ['cmaxpp', 'adaptive_cmaxpp']:
        # Put feature normalizer q in object store
        feature_normalizer_q_state_dict_id = ray.put(
            self.feature_normalizer_q.state_dict())
        # Put state action value target residual in object store
        state_action_value_residual_state_dict_id = ray.put(
            self.state_action_value_target_residual.state_dict())
    else:
        feature_normalizer_q_state_dict_id = None
        state_action_value_residual_state_dict_id = None
    if self.args.agent == 'adaptive_cmaxpp':
        # Put inflated state value target residual in object store
        inflated_state_value_residual_state_dict_id = ray.put(
            self.inflated_state_value_target_residual.state_dict())
    else:
        inflated_state_value_residual_state_dict_id = None
    if self.args.agent == 'model':
        dynamics_residual_state_dict_id = ray.put(
            self.dynamics_residual.state_dict())
        representation_normalizer_dyn_state_dict_id = ray.put(
            self.representation_normalizer_dyn.state_dict())
    else:
        dynamics_residual_state_dict_id = None
        representation_normalizer_dyn_state_dict_id = None
    if self.args.agent == 'knn':
        knn_dynamics_residuals_serialized_id = ray.put(
            pickle.dumps(self.knn_dynamics_residuals))
    else:
        knn_dynamics_residuals_serialized_id = None

    results, count = [], 0
    for worker_id in range(num_workers):
        if worker_id == num_workers - 1:
            # Last worker takes the remaining load
            num_per_worker = batch_size - count
        # Set parameters
        ray.get(self.workers[worker_id].set_worker_params.remote(
            state_value_residual_state_dict_id, kdtrees_serialized_id,
            feature_normalizer_state_dict_id,
            state_action_value_residual_state_dict_id,
            feature_normalizer_q_state_dict_id,
            inflated_state_value_residual_state_dict_id,
            dynamics_residual_state_dict_id,
            knn_dynamics_residuals_serialized_id,
            representation_normalizer_dyn_state_dict_id))
        # Send job
        results.append(self.workers[worker_id].lookahead_batch.remote(
            observations[count:count + num_per_worker], inflated))
        # Increment count
        count += num_per_worker
    # Check that all observations have been accounted for
    assert count == batch_size
    # Get all targets
    results = ray.get(results)
    target_infos = [item for sublist in results for item in sublist]

    cells = [
        k.obs['observation'].copy()
        for info in target_infos for k in info['closed']
    ]
    intended_goals = [
        k.obs['desired_goal'].copy()
        for info in target_infos for k in info['closed']
    ]
    assert len(cells) == len(intended_goals)
    heuristics = np.array([
        compute_heuristic(cells[i], intended_goals[i],
                          self.args.goal_threshold)
        for i in range(len(cells))
    ], dtype=np.float32)
    targets = np.array([
        info['best_node_f'] - k._g
        for info in target_infos for k in info['closed']
    ], dtype=np.float32)
    residual_targets = targets - heuristics
    # Clip the residual targets from below so the residual stays
    # non-negative
    residual_targets = np.maximum(residual_targets, 0)
    # Clip the residual targets from above so the residual does not
    # blow up
    residual_targets = np.minimum(residual_targets, 20)
    # Compute features of the cells
    features = np.array([
        compute_features(cells[i], intended_goals[i],
                         self.env.carry_cell, self.env.obstacle_cell_aa,
                         self.env.obstacle_cell_bb, self.args.grid_size,
                         self.env._grid_to_continuous)
        for i in range(len(cells))
    ], dtype=np.float32)
    features_norm = self.feature_normalizer.normalize(features)
    loss = self._fit_state_value_residual(features_norm, residual_targets,
                                          inflated)
    # Update target network
    # if not inflated:
    #     self._update_target_network(self.state_value_target_residual,
    #                                 self.state_value_residual)
    # else:
    #     self._update_target_network(
    #         self.inflated_state_value_target_residual,
    #         self.inflated_state_value_residual)
    # Update normalizer
    self.feature_normalizer.update_normalizer(features)
    return loss
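# The target networks referenced throughout (`state_value_target_residual`,
# `state_action_value_target_residual`,
# `inflated_state_value_target_residual`) are synced by
# `_update_target_network`, which the commented-out block above calls but
# this excerpt does not define. A minimal sketch, assuming a standard Polyak
# (soft) update with an assumed coefficient `self.args.polyak`:
def _update_target_network(self, target, source):
    # target <- polyak * target + (1 - polyak) * source, parameter-wise
    with torch.no_grad():
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.mul_(self.args.polyak)
            target_param.data.add_((1 - self.args.polyak) * param.data)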
def get_num_features(self):
    """Return the feature dimensionality, computed once on the start
    cell."""
    start_cell_features = compute_features(
        self.start_cell, self.goal_cells[0], self.env.carry_cell,
        self.env.obstacle_cell_aa, self.env.obstacle_cell_bb,
        self.args.grid_size, self.env._grid_to_continuous)
    return start_cell_features.shape[0]
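# Example usage (a sketch for illustration only; `parse_args`, the enclosing
# class name `Agent`, and `env.reset()` are assumptions not present in this
# excerpt):
#
#     args = parse_args()   # must define agent, num_expansions, grid_size,
#                           # goal_threshold, delta, n_workers, knn_radius, ...
#     agent = Agent(args)
#     obs = agent.env.reset()
#     v = agent.get_state_value(obs)               # heuristic + residual
#     q = agent.get_qvalue(obs, agent.actions[0])  # Q-value for one action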