import numpy as np

from environment.PursuitState import PursuitState


def load_dataset(file):
    """Loads a pursuit dataset from a .npy file.

    Each row has 32 columns: world size (2), state features (10),
    joint action as four (dx, dy) direction pairs (8), reward (1),
    next-state features (10), and a terminal flag (1).
    """
    D = np.load(file)
    assert D.shape[1] == 32
    world_size = (int(D[0][0]), int(D[0][1]))
    dataset = []
    for d in D:
        s = d[2:12]
        a = d[12:20]
        R = d[20]
        s_ = d[21:31]
        T = bool(d[31] == 1)  # use a real boolean instead of mixing True/0
        S = PursuitState.from_features(s, world_size)
        S_ = PursuitState.from_features(s_, world_size)
        A = []
        for i in range(4):
            direction = (int(a[i * 2]), int(a[i * 2 + 1]))
            action = agent_directions().index(direction)
            A.append(action)
        datapoint = S, tuple(A), R, S_, T
        dataset.append(datapoint)
    return dataset
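# `agent_directions` is a project helper not shown in this section. A minimal
# sketch, assuming it returns the four cardinal (dx, dy) unit vectors in a
# fixed order matching the environment's Discrete(4) action ids (the exact
# ordering here is an assumption):
def agent_directions():
    # down, up, right, left -- ordering assumed for illustration
    return [(0, 1), (0, -1), (1, 0), (-1, 0)]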
def extract_features(self, state: PursuitState):
    if self._feature_extraction_mode == "default":
        return state.features()
    elif self._feature_extraction_mode == "relative agent":
        return state.features_relative_agent(agent_id=0)
    elif self._feature_extraction_mode == "relative prey":
        return state.features_relative_prey()
    else:
        raise ValueError(
            f"Invalid feature extraction mode {self._feature_extraction_mode}")
def pursuit_datapoint(timestep, world_size):
    """Converts a single environment timestep into a
    (state, joint_actions, reward, next_state, terminal) tuple."""
    # Imported locally, likely to avoid a circular import.
    from environment.PursuitState import PursuitState
    obs, action, reward, next_obs, terminal, info = timestep
    state = PursuitState.from_features(obs, world_size)
    next_state = PursuitState.from_features(next_obs, world_size)
    joint_actions = [action] + [
        teammate_action
        for teammate_action in info["teammates actions"].values()
    ]
    return state, tuple(joint_actions), reward, next_state, terminal
def transition(pursuit_state, joint_action, deterministic=False):
    action_space = agent_directions()
    world_size = pursuit_state.world_size
    num_agents = len(pursuit_state.agents_positions)
    num_preys = len(pursuit_state.prey_positions)
    occupied_positions = set(pursuit_state.prey_positions) | set(
        pursuit_state.agents_positions)
    directions = [action_space[a] for a in joint_action]

    agents_positions = [None] * num_agents
    prey_positions = [None] * num_preys
    # Entities move one at a time; shuffling the move order makes collision
    # resolution stochastic.
    entity_indices = [(i, True) for i in range(num_agents)] + \
                     [(i, False) for i in range(num_preys)]
    if not deterministic:
        np.random.shuffle(entity_indices)

    for i, is_agent in entity_indices:
        if is_agent:
            position = pursuit_state.agents_positions[i]
            direction = directions[i]
        else:
            position = pursuit_state.prey_positions[i]
            direction = PursuitState.move_prey_randomly()
        new_position = move(position, direction, world_size)
        # On collision, the entity stays at its original position.
        if new_position in occupied_positions:
            new_position = position
        occupied_positions.remove(position)
        occupied_positions.add(new_position)
        if is_agent:
            agents_positions[i] = new_position
        else:
            prey_positions[i] = new_position

    next_pursuit_state = PursuitState(tuple(agents_positions),
                                      tuple(prey_positions), world_size)
    reward = 100.0 if next_pursuit_state.is_terminal else -1.0
    return next_pursuit_state, reward
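# `move` is not defined in this section. A plausible sketch, assuming the
# pursuit grid is toroidal so positions wrap around the world boundaries
# (common in pursuit domains, but an assumption here):
def move(position, direction, world_size):
    return ((position[0] + direction[0]) % world_size[0],
            (position[1] + direction[1]) % world_size[1])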
def predict_teammate_policy(self, teammate_id, state):
    x = PursuitState.features_relative_agent(state, teammate_id)
    x = x.reshape(1, -1)  # reshape is not in-place; the result must be rebound
    model = self.models[teammate_id - 1]
    scores = model.predict(x)
    # The dim keyword and .numpy() call indicate a torch-style softmax;
    # flatten the (1, num_actions) scores so the softmax runs over actions.
    policy = softmax(scores.flatten(), dim=0).numpy()
    # Renormalize to guard against floating-point drift.
    policy /= policy.sum()
    return policy
def build_mdp_features(state, joint_actions):
    """Features used for environment models: state coordinates
    concatenated with the one-hot encoded joint action (St + At)."""
    coordinates = PursuitState.features(state)
    actions_one_hot = actions_one_hot_encoding(joint_actions)
    mdp_features = np.concatenate((coordinates, actions_one_hot))
    return mdp_features
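# `actions_one_hot_encoding` is assumed to turn each agent's action id into a
# one-hot vector and concatenate them; a minimal sketch (the 4-action default
# matches the rest of this code, but the helper's real signature is not shown):
def actions_one_hot_encoding(joint_actions, num_actions=4):
    encoding = np.zeros(len(joint_actions) * num_actions)
    for agent, action in enumerate(joint_actions):
        encoding[agent * num_actions + action] = 1.0
    return encoding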
def __init__(self,
             teammates="greedy",
             num_teammates=3,
             world_size=(5, 5),
             features="default",
             deterministic=False,
             initial_state=None):
    super(Pursuit, self).__init__()
    self.action_space = Discrete(4)
    self.reward_range = (-np.inf, np.inf)
    self.metadata = {}
    self._num_agents = num_teammates + 1
    self._action_descriptions = action_meanings()
    self._feature_extraction_mode = features
    self.num_actions = 4
    self.observation_space = Box(low=-np.inf,
                                 high=np.inf,
                                 shape=(self.num_features,),
                                 dtype=np.float64)
    self.name = "Pursuit"
    self._world_size = world_size
    self._team_name = teammates
    self._teammates = self._initialize_teammates(teammates, num_teammates)
    self._pursuit_state = None
    self._first_render = True
    # _initial_state must always be a zero-argument callable.
    if deterministic and initial_state is not None:
        self._initial_state = lambda: initial_state
    elif deterministic:
        # Sample one random state up front and reuse it on every reset.
        fixed_state = PursuitState.random_state(self._num_agents, world_size)
        self._initial_state = lambda: fixed_state
    else:
        self._initial_state = lambda: PursuitState.random_state(
            self._num_agents, self._world_size)
    self._deterministic = deterministic
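# Illustrative construction, assuming Pursuit subclasses gym.Env (the
# Discrete/Box spaces above come from gym.spaces) and that these argument
# values are examples rather than recommended settings:
#
#     env = Pursuit(teammates="greedy", num_teammates=3,
#                   world_size=(5, 5), features="default")
#     observation = env.reset()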
def prepare_individual_batches(self, batch):
    m = len(batch)
    F = self.features
    N = len(self.models)
    # One (X, Y) training pair per teammate model.
    X = [np.zeros((m, F)) for _ in range(N)]
    Y = [np.zeros(m) for _ in range(N)]
    for i, datapoint in enumerate(batch):
        state, joint_actions, reward, next_state, terminal = datapoint
        for t in range(N):
            # Teammate ids start at 1; index 0 of the joint action is the
            # controlled agent's own action.
            teammate = t + 1
            teammate_action = joint_actions[teammate]
            x = PursuitState.features_relative_agent(state, teammate)
            X[t][i] = x
            Y[t][i] = teammate_action
    return X, Y
import random


def remove_collisions(state, next_state):
    """Rolls back colliding entities in state s_{t+1} to their original
    positions in state s_t, processing entities in random order."""
    # Sentinel coordinate outside the grid (presumably used by roll_back
    # to mark vacated cells).
    vacancy_constant = state.world_size[0] + 99
    old_positions = state.features()
    positions = next_state.features()
    num_agents = 4
    # 4 agents + 1 prey, visited in random order.
    indices = list(range(num_agents + 1))
    random.shuffle(indices)
    for target in indices:
        x1 = positions[target * 2 + 0]
        y1 = positions[target * 2 + 1]
        collision = False
        for other in range(num_agents + 1):
            if target == other:
                continue
            x2 = positions[other * 2 + 0]
            y2 = positions[other * 2 + 1]
            collision = x1 == x2 and y1 == y2
            if collision:
                break
        if collision:
            positions = roll_back(target, positions, old_positions,
                                  vacancy_constant)
    corrected_next_state = PursuitState.from_features(positions,
                                                      state.world_size)
    return corrected_next_state
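# `roll_back` is not defined in this section. A sketch of one plausible
# reading: restore the target entity's coordinates from `old_positions`, and
# use `vacancy_constant` as an off-grid marker when the old cell has since
# been taken by another entity. This use of vacancy_constant is an assumption:
def roll_back(target, positions, old_positions, vacancy_constant):
    positions = positions.copy()
    old_x = old_positions[target * 2 + 0]
    old_y = old_positions[target * 2 + 1]
    # Check whether another entity now occupies the target's old cell.
    occupied = any(
        positions[i * 2 + 0] == old_x and positions[i * 2 + 1] == old_y
        for i in range(len(positions) // 2) if i != target)
    if occupied:
        old_x = old_y = vacancy_constant  # park the entity off-grid
    positions[target * 2 + 0] = old_x
    positions[target * 2 + 1] = old_y
    return positions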
def policy(self, observation):
    pursuit_state = PursuitState.from_features(observation, self.world_size)
    action = self.select_action_according_to_model(
        pursuit_state, self.most_likely_model())
    return deterministic_policy(action, num_actions=4)