def step(self, action_dict):
    """Advance the environment by one time step.

    Moves every target (and propagates its belief filter), then moves each
    agent according to `action_dict`, observes all targets, updates target
    beliefs on detection, and builds a per-agent observation matrix from
    which the greedily-assigned target row is selected.

    Args:
        action_dict: mapping of agent_id -> discrete action index into
            ``self.action_map``.

    Returns:
        Tuple ``(obs_dict, reward_dict, done_dict, info_dict)`` in the
        RLlib multi-agent convention (shared '__all__' reward/done).
    """
    obs_dict = {}
    reward_dict = {}
    done_dict = {'__all__': False}
    info_dict = {}

    # Targets move (t -> t+1); belief states are propagated in lockstep.
    for n in range(self.nb_targets):
        self.targets[n].update()
        self.belief_targets[n].predict()  # Belief state at t+1

    # FIX: the map-based obstacle lookup below is disabled, but
    # `obstacles_pt` is still consumed by the observation vector and by
    # get_reward(); without this fallback it would be unbound (NameError).
    # (self.sensor_r, np.pi) is the documented "no obstacle" sentinel.
    obstacles_pt = (self.sensor_r, np.pi)

    # Agents move (t -> t+1) and observe the targets.
    for ii, agent_id in enumerate(action_dict):
        # NOTE(review): assumes iteration order of action_dict lines up with
        # self.agents and that self.agents[ii].agent_id == agent_id — confirm
        # against the caller's dict construction.
        obs_dict[self.agents[ii].agent_id] = []
        reward_dict[self.agents[ii].agent_id] = []
        done_dict[self.agents[ii].agent_id] = []
        action_vw = self.action_map[action_dict[agent_id]]

        # Positions of all targets and of the *other* agents, used by the
        # agent dynamics to maintain a safety margin.
        margin_pos = [t.state[:2] for t in self.targets[:self.nb_targets]]
        for p, ids in enumerate(action_dict):
            if agent_id != ids:
                margin_pos.append(np.array(self.agents[p].state[:2]))
        _ = self.agents[ii].update(action_vw, margin_pos)

        # Target and map observations.
        observed = np.zeros(self.nb_targets, dtype=bool)
        # Disabled map-based obstacle query (kept for reference):
        # obstacles_pt = map_utils.get_closest_obstacle(self.MAP, self.agents[ii].state)
        # if obstacles_pt is None:
        #     obstacles_pt = (self.sensor_r, np.pi)

        # Update beliefs of targets.
        for jj in range(self.nb_targets):
            # Observe: obs is the detection flag, z_t the measurement.
            obs, z_t = self.observation(self.targets[jj], self.agents[ii])
            observed[jj] = obs
            if obs:  # if observed, update the target belief
                self.belief_targets[jj].update(z_t, self.agents[ii].state)

            # Belief-relative polar features (computed for every target,
            # observed or not, so the obs matrix has one row per target).
            r_b, alpha_b = util.relative_distance_polar(
                self.belief_targets[jj].state[:2],
                xy_base=self.agents[ii].state[:2],
                theta_base=self.agents[ii].state[-1])
            r_dot_b, alpha_dot_b = util.relative_velocity_polar(
                self.belief_targets[jj].state[:2],
                self.belief_targets[jj].state[2:],
                self.agents[ii].state[:2], self.agents[ii].state[-1],
                action_vw[0], action_vw[1])
            obs_dict[agent_id].append(
                [r_b, alpha_b, r_dot_b, alpha_dot_b,
                 np.log(LA.det(self.belief_targets[jj].cov)),  # belief uncertainty
                 float(obs), obstacles_pt[0], obstacles_pt[1]])

    # Assign target: keep only each agent's greedily-assigned target row.
    for agent_id in obs_dict:
        obs_dict[agent_id] = np.asarray(obs_dict[agent_id])
        obs_dict[agent_id] = obs_dict[agent_id][None, self.greedy_dict[agent_id]]

    # Get all rewards after all agents and targets move (t -> t+1).
    reward, done, mean_nlogdetcov = self.get_reward(
        obstacles_pt, observed, self.is_training)
    reward_dict['__all__'] = reward
    done_dict['__all__'] = done
    info_dict['mean_nlogdetcov'] = mean_nlogdetcov
    return obs_dict, reward_dict, done_dict, info_dict
def step(self, action_dict):
    """Run one environment transition for every agent in `action_dict`.

    Sequence per step: all targets advance, then each agent in turn
    executes its action, observes every target (updating the matching
    belief filter on detection), queries the nearest map obstacle, and
    assembles a flat observation vector of per-target polar features
    followed by the obstacle reading.

    Args:
        action_dict: agent_id -> discrete action index.

    Returns:
        ``(obs_dict, reward_dict, done_dict, info_dict)`` with shared
        '__all__' entries for reward/done and mean_nlogdetcov in info.
    """
    obs_dict, reward_dict, info_dict = {}, {}, {}
    done_dict = {'__all__': False}

    # Targets move first (t -> t+1).
    for tgt in range(self.num_targets):
        self.targets[tgt].update()

    # Each agent moves, then senses.
    for idx, agent_id in enumerate(action_dict):
        agent = self.agents[idx]
        obs_dict[agent.agent_id] = []
        reward_dict[agent.agent_id] = []
        done_dict[agent.agent_id] = []

        action_vw = self.action_map[action_dict[agent_id]]
        target_xy = [t.state[:2] for t in self.targets]
        _ = agent.update(action_vw, target_xy)

        # Detection pass: propagate each belief, then correct it if seen.
        observed = []
        for tgt in range(self.num_targets):
            detected, measurement = self.observation(self.targets[tgt], agent)
            observed.append(detected)
            self.belief_targets[tgt].predict()  # Belief state at t+1
            if detected:
                self.belief_targets[tgt].update(measurement, agent.state)

        # Closest obstacle; sentinel reading when none is in range.
        obstacles_pt = map_utils.get_closest_obstacle(self.MAP, agent.state)
        if obstacles_pt is None:
            obstacles_pt = (self.sensor_r, np.pi)

        # Feature pass: belief-relative polar coordinates per target.
        for tgt in range(self.num_targets):
            belief = self.belief_targets[tgt]
            r_b, alpha_b = util.relative_distance_polar(
                belief.state[:2],
                xy_base=agent.state[:2],
                theta_base=agent.state[-1])
            r_dot_b, alpha_dot_b = util.relative_velocity_polar(
                belief.state[:2], belief.state[2:],
                agent.state[:2], agent.state[-1],
                action_vw[0], action_vw[1])
            obs_dict[agent_id] += [
                r_b, alpha_b, r_dot_b, alpha_dot_b,
                np.log(LA.det(belief.cov)), float(observed[tgt]),
            ]
        obs_dict[agent_id] += [obstacles_pt[0], obstacles_pt[1]]

    # Rewards are computed once everything has moved (t -> t+1).
    reward, done, mean_nlogdetcov = self.get_reward(
        obstacles_pt, observed, self.is_training)
    reward_dict['__all__'] = reward
    done_dict['__all__'] = done
    info_dict['mean_nlogdetcov'] = mean_nlogdetcov
    return obs_dict, reward_dict, done_dict, info_dict
def step(self, action_dict):
    """Advance the environment one step with shuffled per-target observations.

    Targets (and their belief filters) advance first; each agent then moves
    with a safety margin around targets and other agents, observes every
    target, and builds a (nb_targets x 5) observation matrix of
    true-state-relative polar features. The unshuffled matrix is cached in
    ``self._obs_dict``; the returned copy is row-shuffled to encourage
    permutation invariance in the policy.

    Args:
        action_dict: mapping agent_id -> discrete action index into
            ``self.action_map``.

    Returns:
        Tuple ``(obs_dict, reward_dict, done_dict, info_dict)`` in the
        RLlib multi-agent convention (shared '__all__' reward/done).
    """
    obs_dict = {}
    reward_dict = {}
    done_dict = {'__all__': False}
    info_dict = {}

    # Targets move (t -> t+1); belief states are propagated in lockstep.
    for n in range(self.nb_targets):
        self.targets[n].update()
        self.belief_targets[n].predict()  # Belief state at t+1

    # FIX: `obstacles_pt` is consumed by get_reward() below, but the
    # map-based lookup that used to set it is commented out — without this
    # fallback the name would be unbound (NameError). (self.sensor_r, np.pi)
    # is the "no obstacle" sentinel used elsewhere in this file.
    obstacles_pt = (self.sensor_r, np.pi)

    # Agents move (t -> t+1) and observe the targets.
    for ii, agent_id in enumerate(action_dict):
        obs_dict[self.agents[ii].agent_id] = []
        reward_dict[self.agents[ii].agent_id] = []
        done_dict[self.agents[ii].agent_id] = []
        action_vw = self.action_map[action_dict[agent_id]]

        # Positions of all targets and of the *other* agents, used by the
        # agent dynamics to maintain a safety margin.
        margin_pos = [t.state[:2] for t in self.targets[:self.nb_targets]]
        for p, ids in enumerate(action_dict):
            if agent_id != ids:
                margin_pos.append(np.array(self.agents[p].state[:2]))
        _ = self.agents[ii].update(action_vw, margin_pos)

        # Target and map observations.
        observed = np.zeros(self.nb_targets, dtype=bool)

        # Update beliefs of targets.
        for jj in range(self.nb_targets):
            # Observe: obs is the detection flag, z_t the measurement.
            obs, z_t = self.observation(self.targets[jj], self.agents[ii])
            observed[jj] = obs
            if obs:
                # If observed, update the belief and emit features relative
                # to the *true* target state.
                self.belief_targets[jj].update(z_t, self.agents[ii].state)
                r_b, alpha_b = util.relative_distance_polar(
                    self.targets[jj].state[:2],
                    xy_base=self.agents[ii].state[:2],
                    theta_base=self.agents[ii].state[-1])
                r_dot_b, alpha_dot_b = util.relative_velocity_polar(
                    self.targets[jj].state[:2],
                    self.targets[jj].state[2:],
                    self.agents[ii].state[:2], self.agents[ii].state[-1],
                    action_vw[0], action_vw[1])
                obs_dict[agent_id].append(
                    [r_b, alpha_b, r_dot_b, alpha_dot_b, float(obs)])
            else:
                # Not observed: emit a zero row. (A Kalman-like noisy
                # carry-forward of the previous observation was considered
                # here and removed; see version control history.)
                obs_dict[agent_id].append([0.0, 0.0, 0.0, 0.0, float(obs)])

        obs_dict[agent_id] = np.asarray(obs_dict[agent_id])
        # Cache the observation matrix before shuffling.
        self._obs_dict[agent_id] = obs_dict[agent_id]
        # Shuffle target rows to promote permutation invariance.
        self.rng.shuffle(obs_dict[agent_id])

    # Get all rewards after all agents and targets move (t -> t+1).
    reward, done, mean_nlogdetcov = self.get_reward(
        obstacles_pt, observed, self.is_training)
    reward_dict['__all__'] = reward
    done_dict['__all__'] = done
    info_dict['mean_nlogdetcov'] = mean_nlogdetcov
    return obs_dict, reward_dict, done_dict, info_dict