def reset(self, **kwargs):
    """Reset the episode and build the initial full state.

    Agents are given random positions in the map; targets are given random
    positions near a random agent.

    Returns:
        dict keyed by agent id, each value a dict with:
          'obs'   - flat observation array for that agent (six belief
                    features per target, a default obstacle reading, then
                    the relative polar position of every other agent),
          'state' - the observation concatenated with the global state of
                    all target and agent locations relative to the map
                    origin.
    """
    obs_dict = {}
    locations = []
    global_state = {}
    full_state = {}
    init_pose = self.get_init_pose(**kwargs)
    # Initialize agents
    for ii in range(self.num_agents):
        self.agents[ii].reset(init_pose['agents'][ii])
        obs_dict[self.agents[ii].agent_id] = []
    # Initialize targets and beliefs
    for jj in range(self.num_targets):
        # Belief starts at the target's initial xy with zero velocity.
        self.belief_targets[jj].reset(
            init_state=np.concatenate(
                (init_pose['belief_targets'][jj][:2], np.zeros(2))),
            init_cov=self.target_init_cov)
        self.targets[jj].reset(
            np.concatenate(
                (init_pose['targets'][jj][:2], self.target_init_vel)))
        locations.append(self.targets[jj].state[:2])
        # For each agent calculate belief of all targets.
        # Per-target features: [r, alpha, r_dot, alpha_dot, logdetcov,
        # observed]; velocity and observed entries start at zero.
        for kk in range(self.num_agents):
            r, alpha = util.relative_distance_polar(
                self.belief_targets[jj].state[:2],
                xy_base=self.agents[kk].state[:2],
                theta_base=self.agents[kk].state[2])
            logdetcov = np.log(LA.det(self.belief_targets[jj].cov))
            obs_dict[self.agents[kk].agent_id].extend(
                [r, alpha, 0.0, 0.0, logdetcov, 0.0])
    # All targets and agents locations relative to map origin (targets then agents)
    for n in range(self.num_agents):
        locations.append(self.agents[n].state[:2])
    global_state = util.global_relative_measure(np.array(locations),
                                                self.MAP.origin)
    # Full state dict
    for m, agent_id in enumerate(obs_dict):
        # Default obstacle reading: nothing closer than the sensor range.
        obs_dict[agent_id].extend([self.sensor_r, np.pi])
        # Relative location and past action of all other agents
        for p, ids in enumerate(obs_dict):
            if agent_id != ids:
                r, alpha = util.relative_distance_polar(
                    np.array(self.agents[p].state[:2]),
                    xy_base=self.agents[m].state[:2],
                    theta_base=self.agents[m].state[2])
                obs_dict[agent_id].extend([r, alpha])
        full_state[agent_id] = {
            'obs': np.asarray(obs_dict[agent_id]),
            'state': np.concatenate((obs_dict[agent_id], global_state))
        }
    return full_state
def reset(self, **kwargs):
    """Reset agents and targets for a new episode.

    Agents get random positions in the map; targets get random positions
    near a random agent. Returns a dict mapping each agent id to its flat
    observation list: six belief features per target followed by the
    default obstacle reading (sensor_r, pi).
    """
    init_pose = self.get_init_pose(**kwargs)
    obs_dict = {}
    # Reset every agent and open an empty observation buffer for it.
    for a_idx in range(self.num_agents):
        agent = self.agents[a_idx]
        agent.reset(init_pose['agents'][a_idx])
        obs_dict[agent.agent_id] = []
    # Reset every target and its belief filter, then append each agent's
    # belief features for that target to the agent's observation.
    for t_idx in range(self.num_targets):
        belief = self.belief_targets[t_idx]
        belief.reset(
            init_state=np.concatenate(
                (init_pose['belief_targets'][t_idx][:2], np.zeros(2))),
            init_cov=self.target_init_cov)
        self.targets[t_idx].reset(
            np.concatenate(
                (init_pose['targets'][t_idx][:2], self.target_init_vel)))
        for a_idx in range(self.num_agents):
            agent = self.agents[a_idx]
            r, alpha = util.relative_distance_polar(
                belief.state[:2],
                xy_base=agent.state[:2],
                theta_base=agent.state[2])
            logdetcov = np.log(LA.det(belief.cov))
            # [r, alpha, r_dot, alpha_dot, logdetcov, observed] — velocity
            # and observed entries start at zero.
            obs_dict[agent.agent_id].extend(
                [r, alpha, 0.0, 0.0, logdetcov, 0.0])
    # Default obstacle observation: nothing within sensor range.
    for agent_id in obs_dict:
        obs_dict[agent_id].extend([self.sensor_r, np.pi])
    return obs_dict
def reset(self, **kwargs):
    """Randomly initialize the number of targets at the start of an episode.

    Agents are given random positions in the map; targets are given random
    positions near a random agent. The number of active targets for this
    episode is taken from kwargs['nb_targets'] when provided, otherwise
    drawn uniformly from [1, self.num_targets].

    Returns:
        dict mapping each agent id to an (nb_targets, 8) array of
        per-target rows [r, alpha, r_dot, alpha_dot, logdetcov, observed,
        sensor_r, pi].
    """
    try:
        self.nb_targets = kwargs['nb_targets']
    except KeyError:
        # Fix: `np.random.random_integers` is deprecated (removed in modern
        # NumPy); `randint` with an exclusive upper bound draws from the
        # same inclusive range [1, num_targets]. Also narrow the bare
        # `except:` so unrelated errors are not silently swallowed.
        self.nb_targets = np.random.randint(1, self.num_targets + 1)
    obs_dict = {}
    init_pose = self.get_init_pose(**kwargs)
    # Initialize agents
    for ii in range(self.num_agents):
        self.agents[ii].reset(init_pose['agents'][ii])
        obs_dict[self.agents[ii].agent_id] = []
    # Initialize all targets and beliefs (even beyond nb_targets, so
    # inactive ones hold a valid state).
    for nn in range(self.num_targets):
        self.belief_targets[nn].reset(
            init_state=np.concatenate(
                (init_pose['belief_targets'][nn][:2], np.zeros(2))),
            init_cov=self.target_init_cov)
        self.targets[nn].reset(
            np.concatenate(
                (init_pose['targets'][nn][:2], self.target_init_vel)))
    # For each agent calculate belief of the targets assigned this episode
    for jj in range(self.nb_targets):
        for kk in range(self.num_agents):
            r, alpha = util.relative_distance_polar(
                self.belief_targets[jj].state[:2],
                xy_base=self.agents[kk].state[:2],
                theta_base=self.agents[kk].state[2])
            logdetcov = np.log(LA.det(self.belief_targets[jj].cov))
            # Zero initial velocity/observed entries; default obstacle
            # reading is (sensor_r, pi).
            obs_dict[self.agents[kk].agent_id].append(
                [r, alpha, 0.0, 0.0, logdetcov, 0.0, self.sensor_r, np.pi])
    for agent_id in obs_dict:
        obs_dict[agent_id] = np.asarray(obs_dict[agent_id])
    return obs_dict
def step(self, action_dict):
    """Advance targets, agents and target beliefs by one timestep.

    Args:
        action_dict: mapping of agent id to a discrete action index into
            self.action_map.

    Returns:
        (obs_dict, reward_dict, done_dict, info_dict). obs_dict maps each
        agent id to an (nb_targets, 8) array of per-target rows
        [r, alpha, r_dot, alpha_dot, logdetcov, observed, obstacle_r,
        obstacle_alpha]; reward_dict/done_dict carry a shared '__all__'
        entry.
    """
    obs_dict = {}
    reward_dict = {}
    done_dict = {'__all__': False}
    info_dict = {}
    # Targets move (t -> t+1)
    for n in range(self.nb_targets):
        self.targets[n].update()
    # Agents move (t -> t+1) and observe the targets
    for ii, agent_id in enumerate(action_dict):
        obs_dict[self.agents[ii].agent_id] = []
        reward_dict[self.agents[ii].agent_id] = []
        done_dict[self.agents[ii].agent_id] = []
        action_vw = self.action_map[action_dict[agent_id]]
        # Active-target positions are passed to the kinematic update —
        # presumably to keep a collision margin; confirm against the
        # agent's update() implementation.
        _ = self.agents[ii].update(
            action_vw, [t.state[:2] for t in self.targets[:self.nb_targets]])
        observed = []
        # Update beliefs of all targets
        for jj in range(self.num_targets):
            # Observe
            obs = self.observation(self.targets[jj], self.agents[ii])
            observed.append(obs[0])
            self.belief_targets[jj].predict()  # Belief state at t+1
            if obs[0]:  # if observed, update the target belief.
                self.belief_targets[jj].update(obs[1], self.agents[ii].state)
        obstacles_pt = map_utils.get_closest_obstacle(
            self.MAP, self.agents[ii].state)
        if obstacles_pt is None:
            # No obstacle in range: report the sensor limit.
            obstacles_pt = (self.sensor_r, np.pi)
        # Calculate beliefs on only assigned targets
        for kk in range(self.nb_targets):
            r_b, alpha_b = util.relative_distance_polar(
                self.belief_targets[kk].state[:2],
                xy_base=self.agents[ii].state[:2],
                theta_base=self.agents[ii].state[-1])
            r_dot_b, alpha_dot_b = util.relative_velocity_polar(
                self.belief_targets[kk].state[:2],
                self.belief_targets[kk].state[2:],
                self.agents[ii].state[:2], self.agents[ii].state[-1],
                action_vw[0], action_vw[1])
            obs_dict[agent_id].append([
                r_b, alpha_b, r_dot_b, alpha_dot_b,
                np.log(LA.det(self.belief_targets[kk].cov)),
                float(observed[kk]), obstacles_pt[0], obstacles_pt[1]
            ])
        obs_dict[agent_id] = np.asarray(obs_dict[agent_id])
    # Get all rewards after all agents and targets move (t -> t+1)
    # NOTE(review): obstacles_pt and observed here hold the values from the
    # LAST agent in action_dict — confirm that the shared reward is meant
    # to be computed from that agent only.
    reward, done, mean_nlogdetcov = self.get_reward(
        obstacles_pt, observed, self.is_training)
    reward_dict['__all__'], done_dict['__all__'], info_dict[
        'mean_nlogdetcov'] = reward, done, mean_nlogdetcov
    return obs_dict, reward_dict, done_dict, info_dict
def reset(self, **kwargs):
    """Randomly initialize the numbers of agents and targets for an episode.

    Agents are given random positions in the map; targets are given random
    positions near a random agent. The active counts come from
    kwargs['nb_agents'] / kwargs['nb_targets'] when provided, otherwise
    they are drawn uniformly from [1, num_agents] / [1, num_targets].
    Agents are then greedily assigned to their closest still-unassigned
    target (in dict order); all targets stay visible when there are more
    targets than agents.

    Returns:
        dict mapping each active agent id to an array of per-target rows
        [r, alpha, r_dot, alpha_dot, logdetcov, observed, sensor_r, pi]
        (reduced to a single row once the agent is assigned one target).
    """
    try:
        self.nb_agents = kwargs['nb_agents']
        self.nb_targets = kwargs['nb_targets']
    except KeyError:
        # Fix: `np.random.random_integers` is deprecated (removed in modern
        # NumPy); `randint` with an exclusive upper bound draws from the
        # same inclusive range. Also narrow the bare `except:` so unrelated
        # errors are not silently swallowed.
        self.nb_agents = np.random.randint(1, self.num_agents + 1)
        self.nb_targets = np.random.randint(1, self.num_targets + 1)
    obs_dict = {}
    init_pose = self.get_init_pose(**kwargs)
    # Initialize all agents
    for ii in range(self.num_agents):
        self.agents[ii].reset(init_pose['agents'][ii])
        # Only the first nb_agents are active this episode
        if ii < self.nb_agents:
            obs_dict[self.agents[ii].agent_id] = []
    # Initialize all targets and beliefs
    for nn in range(self.num_targets):
        self.belief_targets[nn].reset(
            init_state=np.concatenate(
                (init_pose['belief_targets'][nn][:2], np.zeros(2))),
            init_cov=self.target_init_cov)
        self.targets[nn].reset(
            np.concatenate(
                (init_pose['targets'][nn][:2], self.target_init_vel)))
    # For nb agents calculate belief of targets assigned
    for jj in range(self.nb_targets):
        for kk in range(self.nb_agents):
            r, alpha = util.relative_distance_polar(
                self.belief_targets[jj].state[:2],
                xy_base=self.agents[kk].state[:2],
                theta_base=self.agents[kk].state[2])
            logdetcov = np.log(LA.det(self.belief_targets[jj].cov))
            obs_dict[self.agents[kk].agent_id].append(
                [r, alpha, 0.0, 0.0, logdetcov, 0.0, self.sensor_r, np.pi])
    # Greedily assign agents to closest target in order; all targets stay
    # assigned if there are more targets than agents.
    mask = np.ones(self.nb_targets, bool)
    oracle = 1 if self.nb_targets > self.nb_agents else 0
    for agent_id in obs_dict:
        obs_dict[agent_id] = np.asarray(obs_dict[agent_id])
        # Keep assigning until only the surplus targets remain unclaimed.
        if np.sum(mask) != np.maximum(
                0, self.nb_targets - self.nb_agents + oracle):
            idx = np.flatnonzero(mask)
            # Closest unassigned target by belief range (column 0 is r).
            close = idx[np.argmin(obs_dict[agent_id][:, 0][mask])]
            obs_dict[agent_id] = obs_dict[agent_id][None, close]
            mask[close] = False
    return obs_dict
def observation(self, target, agent):
    """Attempt a noisy range/bearing measurement of `target` from `agent`.

    Returns:
        (observed, z): `observed` is True when the target lies within
        sensor range and field of view and the line of sight is not
        blocked by the map; `z` is the noisy [r, alpha] measurement, or
        None when the target is not observed.
    """
    r, alpha = util.relative_distance_polar(target.state[:2],
                                            xy_base=agent.state[:2],
                                            theta_base=agent.state[2])
    in_range = r <= self.sensor_r
    in_fov = abs(alpha) <= self.fov / 2 / 180 * np.pi
    unblocked = not map_utils.is_blocked(self.MAP, agent.state, target.state)
    observed = in_range & in_fov & unblocked
    z = None
    if observed:
        # Perturb the true measurement with zero-mean Gaussian sensor noise.
        # NOTE: self.np_random (rather than np.random) must be used here for
        # the environment RNG seed to take effect; using it in the
        # gen_rand_pose functions does NOT make the seed work.
        z = np.array([r, alpha])
        z += self.np_random.multivariate_normal(np.zeros(2, ),
                                                self.observation_noise(z))
    return observed, z
def step(self, action_dict):
    """Advance one timestep, then greedily assign agents to targets.

    Args:
        action_dict: mapping of agent id to a discrete action index into
            self.action_map.

    Returns:
        (obs_dict, reward_dict, done_dict, info_dict). obs_dict maps each
        agent id to an array of per-target rows [r, alpha, r_dot,
        alpha_dot, logdetcov, observed, obstacle_r, obstacle_alpha],
        reduced to a single row once a target is assigned to the agent.
    """
    obs_dict = {}
    reward_dict = {}
    done_dict = {'__all__': False}
    info_dict = {}
    # Targets move (t -> t+1)
    for n in range(self.nb_targets):
        self.targets[n].update()
    # Agents move (t -> t+1) and observe the targets
    for ii, agent_id in enumerate(action_dict):
        obs_dict[self.agents[ii].agent_id] = []
        reward_dict[self.agents[ii].agent_id] = []
        done_dict[self.agents[ii].agent_id] = []
        action_vw = self.action_map[action_dict[agent_id]]
        # Locations of all targets and agents in order to maintain a margin between them
        margin_pos = [t.state[:2] for t in self.targets[:self.nb_targets]]
        for p, ids in enumerate(action_dict):
            if agent_id != ids:
                margin_pos.append(np.array(self.agents[p].state[:2]))
        _ = self.agents[ii].update(action_vw, margin_pos)
        # _ = self.agents[ii].update(action_vw, [t.state[:2] for t in self.targets[:self.nb_targets]])
        observed = []
        # Update beliefs of all targets
        for jj in range(self.num_targets):
            # Observe
            obs = self.observation(self.targets[jj], self.agents[ii])
            observed.append(obs[0])
            self.belief_targets[jj].predict()  # Belief state at t+1
            if obs[0]:  # if observed, update the target belief.
                self.belief_targets[jj].update(obs[1], self.agents[ii].state)
        # Obstacle sensing is disabled here; always report the sensor
        # limit instead of the closest map obstacle.
        # obstacles_pt = map_utils.get_closest_obstacle(self.MAP, self.agents[ii].state)
        # if obstacles_pt is None:
        obstacles_pt = (self.sensor_r, np.pi)
        # Calculate beliefs on only assigned targets
        for kk in range(self.nb_targets):
            r_b, alpha_b = util.relative_distance_polar(
                self.belief_targets[kk].state[:2],
                xy_base=self.agents[ii].state[:2],
                theta_base=self.agents[ii].state[-1])
            r_dot_b, alpha_dot_b = util.relative_velocity_polar(
                self.belief_targets[kk].state[:2],
                self.belief_targets[kk].state[2:],
                self.agents[ii].state[:2], self.agents[ii].state[-1],
                action_vw[0], action_vw[1])
            obs_dict[agent_id].append([
                r_b, alpha_b, r_dot_b, alpha_dot_b,
                np.log(LA.det(self.belief_targets[kk].cov)),
                float(observed[kk]), obstacles_pt[0], obstacles_pt[1]
            ])
    # Greedily assign agents to closest target in order, all targets assigned if agents > targets
    mask = np.ones(self.nb_targets, bool)
    if self.nb_targets > self.nb_agents:
        oracle = 1
    else:
        oracle = 0
    for agent_id in obs_dict:
        obs_dict[agent_id] = np.asarray(obs_dict[agent_id])
        # Keep assigning until only the surplus targets remain unclaimed.
        if np.sum(mask) != np.maximum(
                0, self.nb_targets - self.nb_agents + oracle):
            idx = np.flatnonzero(mask)
            # Closest unassigned target by belief range (column 0 is r).
            close = idx[np.argmin(obs_dict[agent_id][:, 0][mask])]
            obs_dict[agent_id] = obs_dict[agent_id][None, close]
            mask[close] = False
    # Get all rewards after all agents and targets move (t -> t+1)
    # NOTE(review): observed here holds the values from the LAST agent in
    # action_dict — confirm the shared reward is meant to use that agent.
    reward, done, mean_nlogdetcov = self.get_reward(
        obstacles_pt, observed, self.is_training)
    reward_dict['__all__'], done_dict['__all__'], info_dict[
        'mean_nlogdetcov'] = reward, done, mean_nlogdetcov
    return obs_dict, reward_dict, done_dict, info_dict
def step(self, action_dict):
    """Advance one timestep and build the full (obs + global) state.

    Args:
        action_dict: mapping of agent id to a discrete action index into
            self.action_map.

    Returns:
        (full_state, reward_dict, done_dict, info_dict). full_state maps
        each agent id to {'obs': flat per-agent observation array,
        'state': that observation concatenated with the global measure of
        all target and agent locations relative to the map origin}.
    """
    obs_dict = {}
    locations = []
    full_state = {}
    reward_dict = {}
    done_dict = {'__all__': False}
    info_dict = {}
    # Targets move (t -> t+1)
    for n in range(self.num_targets):
        self.targets[n].update()
        locations.append(self.targets[n].state[:2])
    # Agents move (t -> t+1) and observe the targets
    for ii, agent_id in enumerate(action_dict):
        obs_dict[self.agents[ii].agent_id] = []
        reward_dict[self.agents[ii].agent_id] = []
        done_dict[self.agents[ii].agent_id] = []
        action_vw = self.action_map[action_dict[agent_id]]
        _ = self.agents[ii].update(action_vw,
                                   [t.state[:2] for t in self.targets])
        # Locations list: targets first, then agents in action_dict order.
        locations.append(self.agents[ii].state[:2])
        observed = []
        for jj in range(self.num_targets):
            # Observe
            obs = self.observation(self.targets[jj], self.agents[ii])
            observed.append(obs[0])
            self.belief_targets[jj].predict()  # Belief state at t+1
            if obs[0]:  # if observed, update the target belief.
                self.belief_targets[jj].update(obs[1], self.agents[ii].state)
        obstacles_pt = map_utils.get_closest_obstacle(
            self.MAP, self.agents[ii].state)
        if obstacles_pt is None:
            # No obstacle in range: report the sensor limit.
            obstacles_pt = (self.sensor_r, np.pi)
        for kk in range(self.num_targets):
            r_b, alpha_b = util.relative_distance_polar(
                self.belief_targets[kk].state[:2],
                xy_base=self.agents[ii].state[:2],
                theta_base=self.agents[ii].state[-1])
            r_dot_b, alpha_dot_b = util.relative_velocity_polar(
                self.belief_targets[kk].state[:2],
                self.belief_targets[kk].state[2:],
                self.agents[ii].state[:2], self.agents[ii].state[-1],
                action_vw[0], action_vw[1])
            obs_dict[agent_id].extend([
                r_b, alpha_b, r_dot_b, alpha_dot_b,
                np.log(LA.det(self.belief_targets[kk].cov)),
                float(observed[kk])
            ])
        obs_dict[agent_id].extend([obstacles_pt[0], obstacles_pt[1]])
    # Global state for each agent (ref is origin)
    global_state = util.global_relative_measure(np.array(locations),
                                                self.MAP.origin)
    # Full state dict
    for m, agent_id in enumerate(obs_dict):
        for p, ids in enumerate(obs_dict):
            if agent_id != ids:
                # Relative location and recent action of all other agents
                r, alpha = util.relative_distance_polar(
                    np.array(self.agents[p].state[:2]),
                    xy_base=self.agents[m].state[:2],
                    theta_base=self.agents[m].state[2])
                obs_dict[agent_id].extend([r, alpha])
        full_state[agent_id] = {
            'obs': np.asarray(obs_dict[agent_id]),
            'state': np.concatenate((obs_dict[agent_id], global_state))
        }
    # Get all rewards after all agents and targets move (t -> t+1)
    # NOTE(review): obstacles_pt/observed here come from the LAST agent in
    # action_dict — confirm the shared reward is meant to use that agent.
    reward, done, mean_nlogdetcov = self.get_reward(
        obstacles_pt, observed, self.is_training)
    reward_dict['__all__'], done_dict['__all__'], info_dict[
        'mean_nlogdetcov'] = reward, done, mean_nlogdetcov
    return full_state, reward_dict, done_dict, info_dict
def step(self, action_dict):
    """Advance one timestep over only the active (nb_) agents and targets.

    Unlike the sibling step() variants, belief prediction happens once per
    target alongside the target motion update, and observation + belief
    feature extraction share a single loop per agent.

    Args:
        action_dict: mapping of agent id to a discrete action index into
            self.action_map.

    Returns:
        (obs_dict, reward_dict, done_dict, info_dict). obs_dict maps each
        agent id to an (nb_targets, 8) array of per-target rows
        [r, alpha, r_dot, alpha_dot, logdetcov, observed, obstacle_r,
        obstacle_alpha].
    """
    obs_dict = {}
    reward_dict = {}
    done_dict = {'__all__': False}
    info_dict = {}
    # Targets move (t -> t+1)
    for n in range(self.nb_targets):
        self.targets[n].update()
        self.belief_targets[n].predict()  # Belief state at t+1
    # Agents move (t -> t+1) and observe the targets
    for ii, agent_id in enumerate(action_dict):
        obs_dict[self.agents[ii].agent_id] = []
        reward_dict[self.agents[ii].agent_id] = []
        done_dict[self.agents[ii].agent_id] = []
        action_vw = self.action_map[action_dict[agent_id]]
        # Locations of all targets and agents in order to maintain a margin between them
        margin_pos = [t.state[:2] for t in self.targets[:self.nb_targets]]
        for p, ids in enumerate(action_dict):
            if agent_id != ids:
                margin_pos.append(np.array(self.agents[p].state[:2]))
        _ = self.agents[ii].update(action_vw, margin_pos)
        # _ = self.agents[ii].update(action_vw, [t.state[:2] for t in self.targets[:self.nb_targets]])
        observed = np.zeros(self.nb_targets, dtype=bool)
        # Obstacle sensing is disabled here (see commented code below);
        # always report the sensor limit.
        obstacles_pt = (self.sensor_r, np.pi)
        # Update beliefs of all targets
        for jj in range(self.nb_targets):
            # Observe
            obs, z_t = self.observation(self.targets[jj], self.agents[ii])
            observed[jj] = obs
            if obs:  # if observed, update the target belief.
                self.belief_targets[jj].update(z_t, self.agents[ii].state)
            # obstacles_pt = map_utils.get_closest_obstacle(self.MAP, self.agents[ii].state)
            # if obstacles_pt is None:
            # Calculate beliefs on only assigned targets
            # for kk in range(self.nb_targets):
            r_b, alpha_b = util.relative_distance_polar(
                self.belief_targets[jj].state[:2],
                xy_base=self.agents[ii].state[:2],
                theta_base=self.agents[ii].state[-1])
            r_dot_b, alpha_dot_b = util.relative_velocity_polar(
                self.belief_targets[jj].state[:2],
                self.belief_targets[jj].state[2:],
                self.agents[ii].state[:2], self.agents[ii].state[-1],
                action_vw[0], action_vw[1])
            obs_dict[agent_id].append([
                r_b, alpha_b, r_dot_b, alpha_dot_b,
                np.log(LA.det(self.belief_targets[jj].cov)), float(obs),
                obstacles_pt[0], obstacles_pt[1]
            ])
        obs_dict[agent_id] = np.asarray(obs_dict[agent_id])
    # Get all rewards after all agents and targets move (t -> t+1)
    # NOTE(review): obstacles_pt/observed here come from the LAST agent in
    # action_dict — confirm the shared reward is meant to use that agent.
    reward, done, mean_nlogdetcov = self.get_reward(
        obstacles_pt, observed, self.is_training)
    reward_dict['__all__'], done_dict['__all__'], info_dict[
        'mean_nlogdetcov'] = reward, done, mean_nlogdetcov
    return obs_dict, reward_dict, done_dict, info_dict