def test_step_await_combines_brain_info(self):
    all_brain_info_env0 = {
        "MockBrain": BrainInfo(
            [], [[1.0, 2.0], [1.0, 2.0]], [], agents=[1, 2],
            memory=np.zeros((0, 0)))
    }
    all_brain_info_env1 = {
        "MockBrain": BrainInfo(
            [], [[3.0, 4.0]], [], agents=[3], memory=np.zeros((0, 0)))
    }
    env_worker_0 = MockEnvWorker(0)
    env_worker_0.recv.return_value = EnvironmentResponse(
        "step", 0, all_brain_info_env0)
    env_worker_1 = MockEnvWorker(1)
    env_worker_1.recv.return_value = EnvironmentResponse(
        "step", 1, all_brain_info_env1)
    env = SubprocessUnityEnvironment(mock_env_factory, 0)
    env.envs = [env_worker_0, env_worker_1]
    env.waiting = True
    combined_braininfo = env.step_await()["MockBrain"]
    self.assertEqual(
        combined_braininfo.vector_observations.tolist(),
        [[1.0, 2.0], [1.0, 2.0], [3.0, 4.0]],
    )
    self.assertEqual(combined_braininfo.agents, ["0-1", "0-2", "1-3"])
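# A minimal sketch of the fixtures the test above relies on; the real
# definitions live next to the test, so treat these as assumptions.
# MockEnvWorker stands in for a subprocess worker with stubbed send/recv,
# and mock_env_factory plays the role of the environment factory that
# SubprocessUnityEnvironment would normally use to spawn workers.
from unittest.mock import MagicMock, Mock


class MockEnvWorker:
    def __init__(self, worker_id):
        self.worker_id = worker_id
        self.process = None
        self.conn = None
        self.send = Mock()
        self.recv = Mock()


mock_env_factory = MagicMock()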
def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo:
    """
    Constructs a BrainInfo which contains the most recent previous
    experiences for all agents that appear in a provided next_info.
    :param next_info: A t+1 BrainInfo.
    :return: curr_info: Reconstructed BrainInfo to match agents of next_info.
    """
    # One accumulator list per visual observation stream.
    visual_observations = [[] for _ in next_info.visual_observations]
    vector_observations = []
    text_observations = []
    memories = []
    rewards = []
    local_dones = []
    max_reacheds = []
    agents = []
    prev_vector_actions = []
    prev_text_actions = []
    action_masks = []
    for agent_id in next_info.agents:
        agent_brain_info = self.training_buffer[agent_id].last_brain_info
        if agent_brain_info is None:
            agent_brain_info = next_info
        agent_index = agent_brain_info.agents.index(agent_id)
        for i in range(len(next_info.visual_observations)):
            visual_observations[i].append(
                agent_brain_info.visual_observations[i][agent_index])
        vector_observations.append(
            agent_brain_info.vector_observations[agent_index])
        text_observations.append(
            agent_brain_info.text_observations[agent_index])
        if self.policy.use_recurrent:
            if len(agent_brain_info.memories) > 0:
                memories.append(agent_brain_info.memories[agent_index])
            else:
                memories.append(self.policy.make_empty_memory(1))
        rewards.append(agent_brain_info.rewards[agent_index])
        local_dones.append(agent_brain_info.local_done[agent_index])
        max_reacheds.append(agent_brain_info.max_reached[agent_index])
        agents.append(agent_brain_info.agents[agent_index])
        prev_vector_actions.append(
            agent_brain_info.previous_vector_actions[agent_index])
        prev_text_actions.append(
            agent_brain_info.previous_text_actions[agent_index])
        action_masks.append(agent_brain_info.action_masks[agent_index])
    if self.policy.use_recurrent:
        memories = np.vstack(memories)
    curr_info = BrainInfo(
        visual_observations,
        vector_observations,
        text_observations,
        memories,
        rewards,
        agents,
        local_dones,
        prev_vector_actions,
        prev_text_actions,
        max_reacheds,
        action_masks,
    )
    return curr_info
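# A standalone sketch of the re-indexing idea used by construct_curr_info,
# with toy data (all names below are illustrative, not trainer API): for
# each agent id in the t+1 info, look up that agent's last stored
# BrainInfo and pull the row at that agent's own index, so every
# reconstructed field lines up with next_info.agents.
import numpy as np

last_infos = {
    1: {"agents": [1, 2], "vector_observations": np.array([[0.1, 0.2], [0.3, 0.4]])},
    2: {"agents": [1, 2], "vector_observations": np.array([[0.1, 0.2], [0.3, 0.4]])},
}
next_agents = [2, 1]

rows = []
for agent_id in next_agents:
    info = last_infos[agent_id]
    rows.append(info["vector_observations"][info["agents"].index(agent_id)])
curr_vector_observations = np.stack(rows)  # row order matches next_agents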
def _revert_actions(self, brain_info: BrainInfo):
    # Convert a MineRL-style dict action back into a flat discrete
    # vector action, one entry per branch of _mlagent_action_space.
    action_in = brain_info.previous_vector_actions
    action = np.zeros(len(self._mlagent_action_space), dtype=int)
    VIEW_STEP = 1
    i = 0
    for k, v in self._mlagent_action_space.items():
        if k == 'camera_left_right':
            # Discretize the horizontal camera velocity stochastically:
            # the larger the velocity, the likelier a turn action.
            velocity = action_in['camera'][1]
            rand = np.random.random_sample() * VIEW_STEP
            if velocity > 0 and velocity > rand:
                action[i] = 2
            elif velocity < 0 and velocity < -rand:
                action[i] = 1
        elif k == 'camera_up_down':
            velocity = action_in['camera'][0]
            rand = np.random.random_sample() * VIEW_STEP
            if velocity > 0 and velocity > rand:
                action[i] = 2
            elif velocity < 0 and velocity < -rand:
                action[i] = 1
        elif k in ['craft', 'equip', 'nearbyCraft', 'nearbySmelt', 'place']:
            # These branches already carry a discrete index.
            action[i] = action_in[k]
        else:
            # Binary moves: set the branch to an active move's index
            # (a later active move overwrites an earlier one).
            for move_i, move in enumerate(v):
                if move in action_in and action_in[move] == 1:
                    action[i] = move_i
        i += 1
    brain_info.previous_vector_actions = [action]
    # TODO action masks
    # vector_action_space_size = [len(v) for k, v in self._mlagent_action_space.items()]
    return brain_info
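# _revert_actions assumes _mlagent_action_space maps each discrete branch
# name to its list of move names. A sketch with illustrative entries (the
# real mapping is built elsewhere in the wrapper), plus a standalone
# version of the camera discretization: a continuous camera velocity
# becomes a discrete turn with probability proportional to
# |velocity| / VIEW_STEP, so small velocities usually map to a no-op.
from collections import OrderedDict

import numpy as np

example_action_space = OrderedDict([
    ('move', ['none', 'forward', 'back']),
    ('strafe', ['none', 'left', 'right']),
    ('camera_left_right', ['none', 'left', 'right']),
    ('camera_up_down', ['none', 'down', 'up']),
    ('jump', ['none', 'jump']),
])


def discretize_camera_velocity(velocity, view_step=1.0):
    """Map a continuous camera velocity to a branch index:
    0 is a no-op, 1 the negative direction, 2 the positive direction."""
    rand = np.random.random_sample() * view_step
    if velocity > 0 and velocity > rand:
        return 2
    if velocity < 0 and velocity < -rand:
        return 1
    return 0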
def step(
    self,
    vector_action: Dict[str, Any] = None,
    memory: Dict[str, Any] = None,
    text_action: Dict[str, Any] = None,
    value: Dict[str, Any] = None,
) -> AllBrainInfo:
    assert vector_action is not None

    delta = vector_action[BRAIN_NAME][0][0]
    delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
    self.position += delta
    self.position = clamp(self.position, -1, 1)
    self.step_count += 1
    done = self.position >= 1.0 or self.position <= -1.0
    if done:
        reward = SUCCESS_REWARD * self.position
    else:
        reward = -TIME_PENALTY

    agent_info = AgentInfoProto(
        stacked_vector_observation=[self.position] * OBS_SIZE,
        reward=reward,
        done=done,
    )

    if done:
        self._reset_agent()

    return {
        BRAIN_NAME: BrainInfo.from_agent_proto(0, [agent_info],
                                               self._brains[BRAIN_NAME])
    }
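# step() leans on a clamp helper and a few module-level constants. A
# minimal sketch with illustrative values (the real constants are set at
# the top of the environment module, so treat the numbers as assumptions):
OBS_SIZE = 1
STEP_SIZE = 0.1
TIME_PENALTY = 0.01
SUCCESS_REWARD = 1.0


def clamp(x, min_val, max_val):
    # Restrict x to the closed interval [min_val, max_val].
    return max(min_val, min(x, max_val))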
def reset(
    self,
    config: Dict[str, float] = None,
    train_mode: bool = True,
    custom_reset_parameters: Any = None,
) -> AllBrainInfo:  # type: ignore
    self._reset_agent()

    agent_info = AgentInfoProto(
        stacked_vector_observation=[self.position] * OBS_SIZE,
        done=False,
        max_step_reached=False,
    )
    return {
        BRAIN_NAME: BrainInfo.from_agent_proto(0, [agent_info],
                                               self._brains[BRAIN_NAME])
    }
def create_brain_info(ob,
                      agent_id,
                      brain_params,
                      reward=None,
                      done=None,
                      info=None,
                      action=None,
                      max_reached=False) -> BrainInfo:
    vector_obs = []
    vis_obs = []
    for k, v in ob.items():
        if k == 'pov':
            # Raw pixels: add a leading batch dimension.
            v = ob['pov']
            vis_obs = v.reshape(1, v.shape[0], v.shape[1], v.shape[2])
        elif k == 'equipped_items':
            # Normalized damage of the main-hand item.
            v = float(ob['equipped_items']['mainhand']['damage'])
            max_damage = int(ob['equipped_items']['mainhand']['maxDamage'])
            v = v if max_damage == 0 else v / max_damage
            vector_obs.append(v)
            # One-hot encoding of the main-hand item type.
            num_items = 9
            types = [
                'none', 'air', 'wooden_axe', 'wooden_pickaxe', 'stone_axe',
                'stone_pickaxe', 'iron_axe', 'iron_pickaxe', 'other'
            ]
            item_type = ob['equipped_items']['mainhand']['type']
            if type(item_type) is str:
                item_type = types.index(item_type)
            item_type = np.array(int(item_type))
            item_type_onehot = np.squeeze(
                np.eye(num_items)[item_type.reshape(-1)])
            vector_obs.extend(item_type_onehot.tolist())
        elif k == 'inventory':
            # Example inventory:
            # OrderedDict([('coal', 0), ('cobblestone', 4), ('crafting_table', 1),
            #              ('dirt', 10), ('furnace', 1), ('iron_axe', 0),
            #              ('iron_ingot', 0), ('iron_ore', 8), ('iron_pickaxe', 0),
            #              ('log', 0), ('planks', 15), ('stick', 0), ('stone', 0),
            #              ('stone_axe', 0), ...])
            inventory = np.array([b for a, b in ob['inventory'].items()])
            has_one_item = np.clip(inventory, 0, 1)
            num_items = np.clip(inventory, 0, 10) / 10
            vector_obs.extend(has_one_item.tolist())
            vector_obs.extend(num_items.tolist())
            # Craftability flags:
            # 2 planks for a stick
            v = 1 if ob['inventory']['planks'] >= 2 else 0
            vector_obs.append(v)
            # 4 planks for a crafting table
            v = 1 if ob['inventory']['planks'] >= 4 else 0
            vector_obs.append(v)
            # 3 planks and 2 sticks for a wooden sword
            v = 1 if ob['inventory']['planks'] >= 3 and ob['inventory'][
                'stick'] >= 2 else 0
            vector_obs.append(v)
            # 8 cobblestones for a furnace
            v = 1 if ob['inventory']['cobblestone'] >= 8 else 0
            vector_obs.append(v)
            # 3 cobblestones and 2 sticks for a stone pickaxe
            v = 1 if ob['inventory']['cobblestone'] >= 3 and ob['inventory'][
                'stick'] >= 2 else 0
            vector_obs.append(v)
            # 3 iron ingots and 2 sticks for an iron pickaxe
            v = 1 if ob['inventory']['iron_ingot'] >= 3 and ob['inventory'][
                'stick'] >= 2 else 0
            vector_obs.append(v)
        elif type(v) is dict or type(v) is OrderedDict:
            # Any other dict observation: flatten its values.
            for a, b in v.items():
                vector_obs.append(float(b))
        else:
            vector_obs.append(float(v))

    vector_obs = np.array(vector_obs)
    vector_obs = vector_obs.reshape(1, vector_obs.shape[0])
    vis_obs = [vis_obs]
    text_obs = []
    memory = np.zeros((0, 0))
    rew = reward if reward is not None else 0.0
    rew = rew if not np.isnan(rew) else 0.0
    rew = [rew]
    local_done = [done] if done is not None else [False]
    text_action = []
    max_reached = [max_reached]
    agents = [agent_id]
    total_num_actions = sum(brain_params.vector_action_space_size)
    mask_actions = np.ones((len(agents), total_num_actions))
    vector_action = action if action is not None else np.zeros(
        (len(agents), len(brain_params.vector_action_space_size)))
    custom_observations = []
    brain_info = BrainInfo(visual_observation=vis_obs,
                           vector_observation=vector_obs,
                           text_observations=text_obs,
                           memory=memory,
                           reward=rew,
                           agents=agents,
                           local_done=local_done,
                           vector_action=vector_action,
                           text_action=text_action,
                           max_reached=max_reached,
                           action_mask=mask_actions,
                           custom_observations=custom_observations)
    return brain_info
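# A usage sketch for create_brain_info with a toy MineRL-style
# observation. The field values are illustrative, and brain_params is
# assumed to be an ml-agents BrainParameters whose
# vector_action_space_size matches the wrapper's action branches.
toy_ob = OrderedDict([
    ('pov', np.zeros((64, 64, 3), dtype=np.uint8)),
    ('equipped_items', {'mainhand': {'damage': 0, 'maxDamage': 0, 'type': 'none'}}),
    ('inventory', OrderedDict([
        ('cobblestone', 4), ('iron_ingot', 0), ('planks', 15), ('stick', 2),
    ])),
])
brain_info = create_brain_info(toy_ob, agent_id='agent-0',
                               brain_params=brain_params,
                               reward=0.5, done=False)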