def test_initialization():
    with mock.patch('subprocess.Popen'):
        with mock.patch('socket.socket') as mock_socket:
            with mock.patch('glob.glob') as mock_glob:
                mock_glob.return_value = ['FakeLaunchPath']
                mock_socket.return_value.accept.return_value = (mock_socket, 0)
                mock_socket.recv.return_value.decode.return_value = dummy_start
                env = UnityEnvironment(' ')
                with pytest.raises(UnityActionException):
                    env.step([0])
                assert env.brain_names[0] == 'RealFakeBrain'
                env.close()
class BananaEnvironment:
    def __init__(self, file_name=None, **kwargs):
        # create the underlying Unity environment
        self.__env = UnityEnvironment(file_name=file_name, seed=1234)
        self.__brain_name = self.__env.brain_names[0]
        self.__env.reset()
        self.__state_dim = self.__env.brains[self.__brain_name].vector_observation_space_size
        self.__action_dim = self.__env.brains[self.__brain_name].vector_action_space_size

    def step(self, action):
        env_info = self.__env.step(action)[self.__brain_name]     # step the environment
        next_state = env_info.vector_observations[0][np.newaxis]  # get the next state
        reward = env_info.rewards[0]                              # get the reward
        done = env_info.local_done[0]                             # see if the episode has finished
        return next_state, reward, done

    def reset(self, train_mode=True):
        env_info = self.__env.reset(train_mode=train_mode)[self.__brain_name]
        state = env_info.vector_observations[0][np.newaxis]
        return state

    def get_state_dim(self):
        return self.__state_dim

    def get_action_dim(self):
        return self.__action_dim

    def close(self):
        self.__env.close()
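# A minimal, hypothetical smoke test for a wrapper like BananaEnvironment above.
# The executable path and the random-action policy are assumptions for illustration,
# not part of the original code.
import numpy as np

env = BananaEnvironment(file_name='Banana.x86_64')  # placeholder path to the Unity build
state = env.reset(train_mode=False)
total_reward, done = 0.0, False
while not done:
    action = np.random.randint(env.get_action_dim())  # stand-in for a trained policy
    state, reward, done = env.step(action)
    total_reward += reward
print('episode reward:', total_reward)
env.close()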
def test(unity_environment: str, checkpoint: str, seed: int = 42):
    env = UnityEnvironment(file_name=unity_environment, worker_id=42)
    print("start testing")
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=False)[brain_name]
    agent = Agent(state_size=len(env_info.vector_observations[0]),
                  action_size=brain.vector_action_space_size,
                  seed=seed)  # was hard-coded to 42, ignoring the seed parameter
    agent.qnetwork_local.load_state_dict(torch.load(checkpoint))
    score = 0
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations[0]
    for j in range(200):
        action = agent.act(state)
        env_info = env.step(action)[brain_name]
        state = env_info.vector_observations[0]
        score += env_info.rewards[0]
        done = env_info.local_done[0]
        if done:
            break
    print(f"Score: {score}")
    env.close()
    return score
class UnityEnv():
    def __init__(self, env_file='data/Tennis_Windows_x86_64/Tennis.exe', no_graphics=True):
        self.env = UnityEnvironment(file_name=env_file, no_graphics=no_graphics)
        self.brain_name = self.env.brain_names[0]
        brain = self.env.brains[self.brain_name]
        self.action_size = brain.vector_action_space_size
        if not isinstance(self.action_size, int):
            self.action_size = self.action_size[0]
        env_info = self.env.reset(train_mode=True)[self.brain_name]
        self.state_size = env_info.vector_observations.shape[1]
        self.num_agents = len(env_info.agents)

    def reset(self, train=True):
        env_info = self.env.reset(train_mode=train)[self.brain_name]
        # combine both agents' states
        return env_info.vector_observations

    def close(self):
        self.env.close()

    def step(self, actions):
        actions = np.clip(actions, -1, 1)
        env_info = self.env.step(actions)[self.brain_name]
        # combine both agents' states
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = np.array(env_info.local_done).astype(np.float32)  # np.float is removed in recent NumPy
        return next_states, np.array(rewards), dones

    @property
    def action_shape(self):
        return (self.num_agents, self.action_size)
def test(self, params):
    model_path = "navigation_{}.pth"
    env = UnityEnvironment(file_name="Reacher.x86_64")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=False)[brain_name]
    num_states = len(env_info.vector_observations[0])
    num_actions = brain.vector_action_space_size
    agent = Agent(params, num_states, num_actions, None)
    agent.load_model(model_path)
    state = env_info.vector_observations[0]
    score = 0
    while True:
        action = agent.get_action(state)
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        score += reward
        state = next_state
        if done:
            break
    print("test score: {}".format(score))
    env.close()
class TennisEnv:
    def __init__(self):
        # self.env = UnityEnvironment(file_name="Tennis_Windows_x86_64/Tennis.exe")
        self.env = UnityEnvironment(file_name="Tennis_Linux/Tennis.x86_64")
        # get the default brain
        self.brain_name = self.env.brain_names[0]
        self.brain = self.env.brains[self.brain_name]
        states, env_info = self.reset(True)
        # number of agents
        self.num_agents = len(env_info.agents)
        print('Number of agents:', self.num_agents)
        # size of each action
        self.action_size = self.brain.vector_action_space_size
        print('Size of each action:', self.action_size)
        # examine the state space
        self.state_size = states.shape[-1]
        print('There are {} agents. Each observes a state with length: {}'.format(
            self.num_agents, self.state_size))
        print('The state for the first agent looks like:', states[0, :])
        print('The state for the second agent looks like:', states[1, :])

    def reset(self, train_mode=True):
        env_info = self.env.reset(train_mode=train_mode)[self.brain_name]
        states = env_info.vector_observations
        return states, env_info

    def step(self, actions):
        env_info = self.env.step(actions)[self.brain_name]  # send all actions to the environment
        next_states = env_info.vector_observations
        rewards = env_info.rewards                          # get reward (for each agent)
        dones = env_info.local_done
        return next_states, rewards, dones, env_info

    def close(self):
        self.env.close()
class UnityEnvWrapper:
    """This class provides a gym-like wrapper around the Unity environment."""

    def __init__(self, env_file: str = 'Banana_Linux_NoVis/Banana.x86_64'):
        self._env = UnityEnvironment(file_name=env_file)
        self._brain_name = self._env.brain_names[0]
        self._brain = self._env.brains[self._brain_name]
        env_info = self._env.reset(train_mode=True)[self._brain_name]
        state = env_info.vector_observations[0]
        self.state_space_dim = len(state)
        self.action_space_size = self._brain.vector_action_space_size

    def reset(self, train_mode: bool = False):
        env_info = self._env.reset(train_mode)[self._brain_name]
        state = env_info.vector_observations[0]
        return state

    def step(self, action):
        env_info = self._env.step(action)[self._brain_name]  # send the action to the environment
        next_state = env_info.vector_observations[0]          # get the next state
        reward = env_info.rewards[0]                          # get the reward
        done = env_info.local_done[0]                         # see if the episode has finished
        return next_state, reward, done, None

    def close(self):
        self._env.close()
class ProjectEnv():
    def __init__(self, env_file_name):
        self.env = UnityEnvironment(file_name=env_file_name)
        self.brain_name = self.env.brain_names[0]
        env_info = self.env.reset(train_mode=True)[self.brain_name]
        self.action_size = self.env.brains[self.brain_name].vector_action_space_size
        self.state_size = len(env_info.vector_observations[0])

    def reset(self, train_mode=True):
        env_info = self.env.reset(train_mode)[self.brain_name]
        next_state = env_info.vector_observations[0]
        return next_state

    def step(self, action):
        env_info = self.env.step(action)[self.brain_name]
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        return next_state, reward, done, env_info

    def close(self):
        self.env.close()
class EnvWrapper:
    """A wrapper for the Unity environment which implements functionality
    similar to OpenAI Gym.

    Params
    ======
        path (string): relative/absolute path to the env executable
    """

    def __init__(self, path, no_graphics=True):
        self.env = UnityEnvironment(file_name=path, no_graphics=no_graphics)
        self.brain_name = self.env.brain_names[0]
        self.brain = self.env.brains[self.brain_name]
        self.env_info = None
        self.reset()
        self.action_space = self.brain.vector_action_space_size
        self.observation_space = self.env_info.vector_observations.shape[1]

    def step(self, actions):
        self.env_info = self.env.step(actions)[self.brain_name]
        next_state = self.env_info.vector_observations
        reward = self.env_info.rewards
        done = self.env_info.local_done
        return next_state, reward, done, None

    def reset(self):
        self.env_info = self.env.reset(train_mode=True)[self.brain_name]
        return self.env_info.vector_observations
class BananaMazeEnv(EnvInterface):
    def __init__(self, env_binary='../bin/unity_banana_maze/Banana.x86_64', train_mode=True):
        self.env = UnityEnvironment(file_name=env_binary)
        self.brain_name = self.env.brain_names[0]
        self.brain = self.env.brains[self.brain_name]
        self.train_mode = train_mode
        self.info = self.env.reset(train_mode=self.train_mode)[self.brain_name]

    def reset(self):
        env_info = self.env.reset(train_mode=self.train_mode)[self.brain_name]
        state = env_info.vector_observations[0]
        return state

    def step(self, action):
        env_info = self.env.step(action)[self.brain_name]
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        return (next_state, reward, int(done))

    def close(self):
        self.env.close()

    @property
    def state_size(self):
        state = self.info.vector_observations[0]
        return len(state)

    @property
    def action_size(self):
        action_size = self.brain.vector_action_space_size
        return action_size
class UnityMLVectorMultiAgent():
    """Multi-agent UnityML environment with vector observations."""

    def __init__(self, evaluation_only=False, seed=0):
        """Load the platform-specific file and initialize the environment."""
        system = platform.system()  # avoid shadowing the os module
        if system == 'Darwin':
            file_name = 'Tennis.app'
        elif system == 'Linux':
            file_name = 'Tennis_Linux/Tennis.x86_64'
        else:
            # guard against unsupported platforms (file_name would otherwise be unbound)
            raise NotImplementedError('Only macOS and Linux builds are available.')
        self.env = UnityEnvironment(file_name='unity_envs/' + file_name, seed=seed)
        self.brain_name = self.env.brain_names[0]
        self.evaluation_only = evaluation_only

    def reset(self):
        """Reset the environment."""
        info = self.env.reset(train_mode=not self.evaluation_only)[self.brain_name]
        state = info.vector_observations
        return state

    def step(self, action):
        """Take a step in the environment."""
        info = self.env.step(action)[self.brain_name]
        state = info.vector_observations
        reward = info.rewards
        done = info.local_done
        return state, reward, done
class UnityMultiAgentEnvWrapper:
    """This class provides a gym-like wrapper around a Unity environment with multiple agents."""

    def __init__(self, env_file: str, train_mode: bool = False):
        self._env = UnityEnvironment(file_name=env_file)
        self._train_mode = train_mode
        self._brain_name = self._env.brain_names[0]
        self._brain = self._env.brains[self._brain_name]
        env_info = self._env.reset(train_mode=True)[self._brain_name]
        self.num_agents = len(env_info.agents)
        self.state_space_dim = env_info.vector_observations.shape[1]
        self.action_space_dim = self._brain.vector_action_space_size

    def reset(self):
        env_info = self._env.reset(self._train_mode)[self._brain_name]
        state = env_info.vector_observations
        return state

    def step(self, action):
        env_info = self._env.step(action)[self._brain_name]  # send the action to the environment
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        return next_states, rewards, dones, None

    def close(self):
        self._env.close()
class Reacher:
    def __init__(self):
        self.env = UnityEnvironment(file_name="files/Reacher.app")
        log.info("Reacher environment set up")

    def reset(self, train_mode=True) -> np.ndarray:
        """Reset the environment.

        :param train_mode: boolean indicating whether to start the environment in train mode
        :return: the initial state
        """
        env_info = self.env.reset(train_mode=train_mode)["ReacherBrain"]
        log.info("Reacher environment reset with train_mode=%s", train_mode)
        return env_info.vector_observations

    def step(self, action: np.ndarray) -> StepResult:
        """Take the action in the environment and collect the relevant information to return."""
        log.debug("taking step")
        assert action.shape == (20, 4)
        action_clipped = np.clip(action, -1, 1)
        env_info = self.env.step(action_clipped)["ReacherBrain"]
        # Check the assumption that the end of an interaction is determined by a fixed number
        # of steps, hence all agents are done at the same time.
        if np.any(env_info.local_done):
            assert np.all(env_info.local_done)
        return StepResult(done=env_info.local_done,
                          rewards=env_info.rewards,
                          next_state=env_info.vector_observations)
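# The Reacher class above references `StepResult` and `log` without defining them.
# A minimal sketch of what they might look like; the field names follow the keyword
# arguments used in step(), but the actual definitions are assumptions.
import logging
from typing import List, NamedTuple

import numpy as np

log = logging.getLogger(__name__)

class StepResult(NamedTuple):
    done: List[bool]        # per-agent episode-termination flags
    rewards: List[float]    # per-agent rewards for this step
    next_state: np.ndarray  # (num_agents, state_size) observations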
class BananaEnv(gym.Env):
    def __init__(self, filename):
        super(BananaEnv, self).__init__()
        self.unity_env = UnityEnvironment(file_name=filename)
        self.brain_name = self.unity_env.brain_names[0]
        self.brain = self.unity_env.brains[self.brain_name]
        action_size = int(self.brain.vector_action_space_size)
        self.action_space = gym.spaces.Discrete(action_size)
        self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(37,))

    def reset(self, train_mode=True):
        # was hard-coded to train_mode=True, ignoring the parameter
        env_info = self.unity_env.reset(train_mode=train_mode)[self.brain_name]
        state = env_info.vector_observations[0]
        return state

    def step(self, action):
        env_info = self.unity_env.step(action)[self.brain_name]
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        return next_state, reward, done, env_info

    def close(self):
        return self.unity_env.close()

    def render(self, mode='human'):
        pass
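# Hypothetical usage of BananaEnv through the standard gym interface; the executable
# path is a placeholder. Sampling from action_space exercises the Discrete space above.
env = BananaEnv('Banana_Linux/Banana.x86_64')
state = env.reset()
done = False
while not done:
    action = env.action_space.sample()  # random action from the Discrete space
    state, reward, done, _ = env.step(action)
env.close()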
class CollabCompeteEnv:
    def __init__(self, mode='train'):
        if mode != 'train':
            print('[Mode] Setting to Test Mode')
            self.train = False
        else:
            print('[Mode] Setting to Train Mode')
            self.train = True
        self.base_env = UnityEnvironment(file_name='Tennis.app')
        self.brain_name = self.base_env.brain_names[0]
        self.brain = self.base_env.brains[self.brain_name]
        self.action_size = self.brain.vector_action_space_size

    def reset(self):
        self.env_info = self.base_env.reset(train_mode=self.train)[self.brain_name]
        return self.get_state()

    def get_state(self):
        return self.env_info.vector_observations

    def step(self, action):
        self.env_info = self.base_env.step(action)[self.brain_name]  # send the action to the environment
        next_states = self.get_state()
        rewards = self.env_info.rewards
        dones = self.env_info.local_done
        return next_states, rewards, dones, None

    def close(self):
        self.base_env.close()
class Environment():
    """Learning Environment."""

    def __init__(self, file_name="environments/Tennis.app", no_graphics=True):
        """Initialize parameters and build model.

        Params
        ======
            file_name (string): unity environment file
            no_graphics (boolean): start the environment without graphics
        """
        self.env = UnityEnvironment(file_name=file_name, no_graphics=no_graphics)
        self.brain_name = self.env.brain_names[0]
        self.brain = self.env.brains[self.brain_name]
        self.reset()
        self.action_space_size = self.brain.vector_action_space_size
        self.state_size = len(self.info.vector_observations[0])
        self.num_agents = len(self.info.agents)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.env.close()

    def reset(self, train_mode=False):
        self.info = self.env.reset(train_mode=train_mode)[self.brain_name]
        return self.info

    def step(self, action):
        self.info = self.env.step(action)[self.brain_name]
        return self.info
class TennisEnv:
    def __init__(self, env_path):
        self.env = UnityEnvironment(file_name=env_path)
        self.brain_name = self.env.brain_names[0]
        brain = self.env.brains[self.brain_name]
        self.action_size = brain.vector_action_space_size
        self.state_size = brain.vector_observation_space_size

    def reset(self, train_mode):
        env_info = self.env.reset(train_mode)[self.brain_name]
        return env_info.vector_observations

    def step(self, action):
        env_info = self.env.step(action)[self.brain_name]
        return env_info.vector_observations, env_info.rewards, env_info.local_done

    def close(self):
        self.env.close()

    def run_episode(self, agent1, agent2, fast=False):
        states = self.reset(train_mode=fast)
        score1 = 0
        score2 = 0
        done = False
        while not done:
            action1 = agent1.compute_action(states[0], epsilon=0)
            action2 = agent2.compute_action(states[1], epsilon=0)
            states, rewards, dones = self.step([action1, action2])
            score1 += rewards[0]
            score2 += rewards[1]
            done = any(dones)  # was never updated, leaving the loop infinite
            if (rewards[0] != 0 or rewards[1] != 0) and not fast:
                print(f"Scores: {score1:.2f}, {score2:.2f}")
        return max(score1, score2)
def run_agent(num_episodes=1):
    env = UnityEnvironment(file_name="env/Reacher20.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    state_size = env_info.vector_observations.shape[1]
    num_agents = len(env_info.agents)
    agent = Agent(state_size=state_size, action_size=action_size, random_seed=2)
    agent.actor_local.load_state_dict(torch.load("model/checkpoint_actor.pth", map_location='cpu'))
    agent.critic_local.load_state_dict(torch.load("model/checkpoint_critic.pth", map_location='cpu'))
    for i in range(num_episodes):
        scores = np.zeros(num_agents)
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        while True:
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            scores += env_info.rewards
            states = next_states
            if np.any(env_info.local_done):
                break
        print(f"{i + 1} episode, averaged score: {np.mean(scores)}")
def run(env_file, model_file, num_episodes=5):
    env = UnityEnvironment(file_name=env_file)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations[0]
    state_shape = state.shape
    agent = Agent(state_shape=state_shape, action_size=action_size, seed=0)
    agent.qnetwork_local.load_state_dict(torch.load(model_file))
    for i in range(num_episodes):
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        while True:
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            score += reward
            state = next_state
            if done:
                break
        print("Score: {}".format(score))
    env.close()
class UnityEnvironmentWrapper(EnvInterface):
    def __init__(self, env_binary='../bin/tennis/Tennis.x86_64', train_mode=True):
        self.env = UnityEnvironment(file_name=env_binary)
        self.brain_name = self.env.brain_names[0]
        self.brain = self.env.brains[self.brain_name]
        self.train_mode = train_mode
        self.info = self.env.reset(train_mode=self.train_mode)[self.brain_name]
        self.num_agents = len(self.info.agents)

    def reset(self):
        env_info = self.env.reset(train_mode=self.train_mode)[self.brain_name]
        states = env_info.vector_observations
        return states

    def step(self, actions):
        env_info = self.env.step(actions)[self.brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        return (next_states, rewards, np.array(dones) * 1)

    def close(self):
        self.env.close()

    @property
    def state_size(self):
        state = self.info.vector_observations[0]
        return len(state)

    @property
    def action_size(self):
        action_size = self.brain.vector_action_space_size
        return action_size
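# Hypothetical smoke test for a multi-agent wrapper like the one above: sample random
# continuous actions in [-1, 1] for every agent. The binary path is a placeholder.
import numpy as np

env = UnityEnvironmentWrapper(env_binary='../bin/tennis/Tennis.x86_64', train_mode=False)
states = env.reset()
scores = np.zeros(env.num_agents)
while True:
    actions = np.random.uniform(-1, 1, size=(env.num_agents, env.action_size))
    states, rewards, dones = env.step(actions)
    scores += rewards
    if dones.any():
        break
print('per-agent scores:', scores)
env.close()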
class EnvUnityMLAgents:
    def __init__(self, file_name, train_mode=True):
        self.env = UnityEnvironment(file_name=file_name)
        self.brain_name = self.env.brain_names[0]
        self.train_mode = train_mode
        env_info = self.env.reset(train_mode=self.train_mode)[self.brain_name]
        self.num_agents = len(env_info.agents)
        brain = self.env.brains[self.brain_name]
        self.action_size = brain.vector_action_space_size
        states = env_info.vector_observations
        self.state_size = states.shape[1]
        print('Number of agents:', self.num_agents)
        print('Size of each action:', self.action_size)
        print('There are {} agents. Each observes a state with length: {}'.format(
            states.shape[0], self.state_size))
        print('The state for the first agent looks like:', states[0])

    def reset(self):
        env_info = self.env.reset(train_mode=self.train_mode)[self.brain_name]
        rewards = env_info.rewards
        next_states = env_info.vector_observations
        dones = env_info.local_done
        return rewards, next_states, dones

    def step(self, actions):
        env_info = self.env.step(actions)[self.brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        return rewards, next_states, dones

    def close(self):
        self.env.close()
class VisualEnvironment():
    def __init__(self, env_file, state_stack=4, train=True):
        self.state_stack = state_stack
        self.env_file = env_file
        self.env = UnityEnvironment(file_name=env_file)
        self.brain_name = self.env.brain_names[0]
        brain = self.env.brains[self.brain_name]
        self.action_size = brain.vector_action_space_size

    def step(self, action):
        env_info = self.env.step(action)[self.brain_name]
        self.states_history.append(env_info.visual_observations[0].transpose([0, 3, 1, 2]))
        next_state = np.array(self.states_history).transpose([1, 2, 0, 3, 4])
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        return (next_state, reward, done)

    def sample(self):
        return np.random.randint(0, self.action_size)

    def reset(self, train=True):
        # stacked state layout: batch_size, channels (RGB), depth (state_stack), height, width
        self.states_history = deque(maxlen=self.state_stack)
        for _ in range(self.state_stack):
            self.states_history.append(np.zeros((1, 3, 84, 84)))
        # Reset environment
        env_info = self.env.reset(train_mode=train)[self.brain_name]
        self.states_history.append(env_info.visual_observations[0].transpose([0, 3, 1, 2]))
        return np.array(self.states_history).transpose([1, 2, 0, 3, 4])

    def close(self):
        self.env.close()
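# Sketch of the tensor bookkeeping in VisualEnvironment, using only NumPy.
# Unity returns visual observations as (batch, height, width, channels) = (1, 84, 84, 3);
# the wrapper transposes each frame to (1, 3, 84, 84), stacks `state_stack` frames, and
# transposes the stack to (batch, channels, depth, height, width) for a 3D-conv network.
from collections import deque

import numpy as np

frames = deque(maxlen=4)
for _ in range(4):
    frame = np.zeros((1, 84, 84, 3))              # one raw Unity frame
    frames.append(frame.transpose([0, 3, 1, 2]))  # -> (1, 3, 84, 84)
stacked = np.array(frames).transpose([1, 2, 0, 3, 4])
assert stacked.shape == (1, 3, 4, 84, 84)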
class UnityEnvironmentTask:
    def __init__(self, file_name, train_mode=True):
        self.train_mode = train_mode
        self.env = UnityEnvironment(file_name=file_name)
        self.brain_name = self.env.brain_names[0]
        self.brain = self.env.brains[self.brain_name]
        self.env_info = self.env.reset(train_mode=self.train_mode)[self.brain_name]
        self.num_agents = len(self.env_info.agents)
        self.action_dim = self.brain.vector_action_space_size
        print('Brain name:', self.brain_name)
        print('Number of agents:', self.num_agents)
        print('Size of each action:', self.action_dim)

    def reset(self):
        env_info = self.env.reset(train_mode=self.train_mode)[self.brain_name]
        return env_info.vector_observations

    def step(self, actions):
        env_info = self.env.step(np.clip(actions, -1, 1))[self.brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        terminals = env_info.local_done
        if np.any(terminals):
            next_states = self.reset()
        return (next_states, np.asarray(rewards, dtype=np.float32),
                np.asarray(terminals, dtype=np.float32), env_info)

    def close(self):
        self.env.close()
class Environment():
    def __init__(self, path, seed=0):
        self.env = UnityEnvironment(file_name=path, seed=seed)
        self.brain_name = self.env.brain_names[0]
        self.brain = self.env.brains[self.brain_name]
        # self.state_size = self.brain.vector_observation_space_size  # bug: returns 8 :.(
        self.action_size = self.brain.vector_action_space_size
        info = self.env.reset(train_mode=True)[self.brain_name]
        self.state_size = len(info.vector_observations[0])
        self.num_agents = len(info.agents)

    def close(self):
        self.env.close()

    def reset(self, train=True):
        info = self.env.reset(train_mode=train)[self.brain_name]
        return info.vector_observations

    def step(self, action):
        info = self.env.step(action)[self.brain_name]
        state = info.vector_observations
        reward = info.rewards
        done = info.local_done
        return state, reward, done, info
def main():
    env_name = "Environments/Banana_Linux/Banana.x86_64"
    train_mode = True  # whether to run the environment in training or inference mode
    env = UnityEnvironment(file_name=env_name, no_graphics=False)
    # env = UnityEnvironment(file_name="/data/Banana_Linux_NoVis/Banana.x86_64")
    # Set the default brain to work with
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=train_mode)[brain_name]
    # Action and observation spaces
    nA = brain.vector_action_space_size
    nS = env_info.vector_observations.shape[1]
    print('Observation Space {}, Action Space {}'.format(nS, nA))
    seed = 7
    agent = Priority_DQN(nS, nA, seed, UPDATE_EVERY, BATCH_SIZE, BUFFER_SIZE,
                         MIN_BUFFER_SIZE, LR, GAMMA, TAU, CLIP_NORM, ALPHA)
    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))
    # scores = train(agent, env, brain_name)
    for i in range(1):
        # UnityEnvironment has no gym-style reset()/render(); the original notebook code
        # (plt.imshow(env.render(mode='rgb_array')), display.display(plt.gcf()),
        # plt.savefig('test' + str(j) + '.png', ...)) only works for gym environments,
        # so the rollout below uses the Unity API instead.
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]
        for j in range(500):
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            state = env_info.vector_observations[0]
            done = env_info.local_done[0]
            if done:
                break
    # plot the scores (requires the train(...) call above)
    # plot(scores)
def process(args):
    env = UnityEnvironment(file_name="Banana.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]             # get the current state
    score = 0                                           # initialize the score
    action_size = brain.vector_action_space_size
    state_size = len(state)
    agent = Agent(state_size, action_size, 1, args.model_path)
    while True:
        action = agent.act(state, 0.0)                  # select an action
        env_info = env.step(action)[brain_name]         # send the action to the environment
        next_state = env_info.vector_observations[0]    # get the next state
        reward = env_info.rewards[0]                    # get the reward
        done = env_info.local_done[0]                   # see if episode has finished
        score += reward                                 # update the score
        state = next_state                              # roll over the state to next time step
        if done:                                        # exit loop if episode finished
            break
    print("Score: {}".format(score))
class BananaWrapper:
    """Banana Unity environment wrapper."""

    def __init__(self, file_name: Path):
        # older unityagents builds expect a string path, so convert the Path explicitly
        self.env = UnityEnvironment(str(file_name))
        # Get the default brain
        self.brain_name = self.env.brain_names[0]
        self.brain = self.env.brains[self.brain_name]

    def reset(self):
        env_info = self.env.reset(train_mode=True)[self.brain_name]
        state = env_info.vector_observations[0]
        return state

    def step(self, action):
        env_info = self.env.step(action)[self.brain_name]
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        return next_state, reward, done

    def close(self):
        self.env.close()

    @property
    def action_size(self):
        return self.brain.vector_action_space_size

    @property
    def observation_size(self):
        return self.brain.vector_observation_space_size
class UnityMultiAgent():
    """Multi-agent UnityML environment."""

    def __init__(self, evaluation_only=False, seed=0, file_name='Tennis_Linux_NoVis/Tennis.x86_64'):
        """Load the env file (platform specific, see README) and initialize the environment."""
        self.env = UnityEnvironment(file_name=file_name, seed=seed)
        self.brain_name = self.env.brain_names[0]
        self.evaluation_only = evaluation_only

    def reset(self):
        """Reset the environment."""
        info = self.env.reset(train_mode=not self.evaluation_only)[self.brain_name]
        state = info.vector_observations
        return state

    def step(self, action):
        """Take a step in the environment."""
        info = self.env.step(action)[self.brain_name]
        state = info.vector_observations
        reward = info.rewards
        done = info.local_done
        return state, reward, done
class BananaEnvWrapper(object):
    """Wraps the Udacity environment into an object behaving like an Atari env."""

    blank_state = torch.zeros(1, 37, dtype=torch.uint8)

    def __init__(self, train_mode=True, device='cuda'):
        self.train_mode = train_mode
        self.device = device
        self.unity_env = UnityEnvironment(
            file_name="/home/philipp/udacity/deep-reinforcement-learning/p1_navigation/Banana_Linux/Banana.x86_64")
        # get the default brain
        self.brain_name = self.unity_env.brain_names[0]
        brain = self.unity_env.brains[self.brain_name]
        # reset the environment
        env_info = self.unity_env.reset(train_mode=self.train_mode)[self.brain_name]
        # number of agents in the environment
        print('Number of agents:', len(env_info.agents))
        # number of actions
        self.action_space = brain.vector_action_space_size
        print('Number of actions:', self.action_space)
        # examine the state space
        state = env_info.vector_observations[0]
        print('States look like:', state)
        self.state_size = len(state)
        print('States have length:', self.state_size)
        self.score = 0
        self.episode = 0

    def eval(self):
        self.train_mode = False

    def train(self):
        self.train_mode = True

    def reset(self):
        # print("Score: %d, episode: %d" % (self.score, self.episode))
        self.episode += 1
        self.score = 0
        env_info = self.unity_env.reset(train_mode=self.train_mode)[self.brain_name]
        return env_info.vector_observations[0]  # return the current state

    def step(self, action):
        env_info = self.unity_env.step(action)[self.brain_name]
        state = env_info.vector_observations[0]  # get the current state
        reward = env_info.rewards[0]             # get the reward
        done = env_info.local_done[0]            # see if episode has finished
        self.score += reward                     # update the score
        return state, reward, done

    def close(self):
        self.unity_env.close()
def play():
    env = UnityEnvironment(file_name='./Tennis.app')
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]
    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)
    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))
    # create agent
    maddpg_agent = MADDPG(state_size=state_size, action_size=action_size, seed=0)
    # load weights
    for i, agent in enumerate(maddpg_agent.maddpg_agent):
        agent.policy_local.load_state_dict(
            torch.load('models/checkpoint_actor_{}.pth'.format(i)))
    # reverse weights so agent 1 is on the left instead
    # for i, agent in enumerate(reversed(maddpg_agent.maddpg_agent)):
    #     agent.policy_local.load_state_dict(torch.load('models/checkpoint_actor_{}.pth'.format(i)))
    env_info = env.reset(train_mode=False)[brain_name]      # reset the environment
    states = env_info.vector_observations                   # get the current state (for each agent)
    scores = np.zeros(num_agents)                           # initialize the score (for each agent)
    while True:
        actions = maddpg_agent.act(states, add_noise=False)  # select an action (for each agent)
        env_info = env.step(actions)[brain_name]             # send all actions to the environment
        next_states = env_info.vector_observations           # get next state (for each agent)
        rewards = env_info.rewards                           # get reward (for each agent)
        dones = env_info.local_done                          # see if episode finished
        scores += rewards                                    # update the score (for each agent)
        states = next_states                                 # roll over states to next time step
        if np.any(dones):                                    # exit loop if episode finished
            break
    print('Agent 0 score this episode: {}'.format(scores[0]))
    print('Agent 1 score this episode: {}'.format(scores[1]))
    env.close()
def train_agent(env: UnityEnvironment, brain_name: str, agent: Agent,
                n_episodes: int, max_steps: int = 1500) -> []:
    """Trains the agent for n episodes.

    :param env:
    :param brain_name:
    :param agent:
    :param n_episodes: number of episodes to train
    :param max_steps: maximum number of steps per episode
    :return: an array containing the score of every episode
    """
    scores: [int] = []
    # store the last 100 scores in a queue to check if the agent reached the goal
    scores_window = deque(maxlen=100)
    for i_episode in range(1, n_episodes + 1):
        # reset the environment
        env_info = env.reset(train_mode=True)[brain_name]
        agent.reset()
        state = env_info.vector_observations[0]
        score = 0
        # the environment ends the episode after n steps, so no manual termination is needed
        for a in range(max_steps):
            action: int = agent.act(state, add_noise=False)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            score += reward
            agent.step((state, action, reward, next_state, done))
            state = next_state
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)         # save most recent score
        # print('\rEpisode {}\tavg Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 10 == 0:
            print(f"Episode {i_episode}: Average Score: {np.mean(scores_window):.2f}")
        if np.mean(scores_window) >= 30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode, np.mean(scores_window)))
            torch.save(agent.actor_local.state_dict(), 'checkpoint-actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint-critic.pth')
            break
    return scores
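# Hypothetical driver for train_agent(); the Reacher build path and the Agent
# constructor arguments are placeholders for whatever the surrounding project defines.
env = UnityEnvironment(file_name='Reacher.x86_64')
brain_name = env.brain_names[0]
env_info = env.reset(train_mode=True)[brain_name]
agent = Agent(state_size=env_info.vector_observations.shape[1],
              action_size=env.brains[brain_name].vector_action_space_size,
              random_seed=0)  # assumed signature
scores = train_agent(env, brain_name, agent, n_episodes=300)
env.close()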
def test_step():
    with mock.patch('subprocess.Popen'):
        with mock.patch('socket.socket') as mock_socket:
            with mock.patch('glob.glob') as mock_glob:
                mock_glob.return_value = ['FakeLaunchPath']
                mock_socket.return_value.accept.return_value = (mock_socket, 0)
                mock_socket.recv.return_value.decode.return_value = dummy_start
                env = UnityEnvironment(' ')
                brain = env.brains['RealFakeBrain']
                mock_socket.recv.side_effect = dummy_reset
                brain_info = env.reset()
                mock_socket.recv.side_effect = dummy_step
                brain_info = env.step([0] * brain.vector_action_space_size *
                                      len(brain_info['RealFakeBrain'].agents))
                with pytest.raises(UnityActionException):
                    env.step([0])
                brain_info = env.step([0] * brain.vector_action_space_size *
                                      len(brain_info['RealFakeBrain'].agents))
                with pytest.raises(UnityActionException):
                    env.step([0] * brain.vector_action_space_size *
                             len(brain_info['RealFakeBrain'].agents))
                env.close()
                assert env.global_done
                assert isinstance(brain_info, dict)
                assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
                assert isinstance(brain_info['RealFakeBrain'].visual_observations, list)
                assert isinstance(brain_info['RealFakeBrain'].vector_observations, np.ndarray)
                assert len(brain_info['RealFakeBrain'].visual_observations) == \
                    brain.number_visual_observations
                assert brain_info['RealFakeBrain'].vector_observations.shape[0] == \
                    len(brain_info['RealFakeBrain'].agents)
                assert brain_info['RealFakeBrain'].vector_observations.shape[1] == \
                    brain.vector_observation_space_size * brain.num_stacked_vector_observations
                assert not brain_info['RealFakeBrain'].local_done[0]
                assert brain_info['RealFakeBrain'].local_done[2]
class TrainerController(object):
    def __init__(self, env_path, run_id, save_freq, curriculum_file, fast_simulation, load, train,
                 worker_id, keep_checkpoints, lesson, seed, docker_target_name,
                 trainer_config_path, no_graphics):
        """
        :param env_path: Location of the environment executable to be loaded.
        :param run_id: The sub-directory name for model and summary statistics.
        :param save_freq: Frequency at which to save the model.
        :param curriculum_file: Curriculum json file for the environment.
        :param fast_simulation: Whether to run the game at training speed.
        :param load: Whether to load the model or randomly initialize it.
        :param train: Whether to train the model, or only run inference.
        :param worker_id: Number to add to the communication port (5005). Used for multi-environment training.
        :param keep_checkpoints: How many model checkpoints to keep.
        :param lesson: Start learning from this lesson.
        :param seed: Random seed used for training.
        :param docker_target_name: Name of the docker volume that will contain all data.
        :param trainer_config_path: Fully qualified path to the trainer configuration file.
        :param no_graphics: Whether to run the Unity simulator in no-graphics mode.
        """
        self.trainer_config_path = trainer_config_path
        if env_path is not None:
            env_path = (env_path.strip()
                        .replace('.app', '')
                        .replace('.exe', '')
                        .replace('.x86_64', '')
                        .replace('.x86', ''))  # Strip out executable extensions if passed
        # Recognize and use docker volume if one is passed as an argument
        if docker_target_name == '':
            self.docker_training = False
            self.model_path = './models/{run_id}'.format(run_id=run_id)
            self.curriculum_file = curriculum_file
            self.summaries_dir = './summaries'
        else:
            self.docker_training = True
            self.model_path = '/{docker_target_name}/models/{run_id}'.format(
                docker_target_name=docker_target_name, run_id=run_id)
            if env_path is not None:
                env_path = '/{docker_target_name}/{env_name}'.format(
                    docker_target_name=docker_target_name, env_name=env_path)
            if curriculum_file is None:
                self.curriculum_file = None
            else:
                self.curriculum_file = '/{docker_target_name}/{curriculum_file}'.format(
                    docker_target_name=docker_target_name, curriculum_file=curriculum_file)
            self.summaries_dir = '/{docker_target_name}/summaries'.format(
                docker_target_name=docker_target_name)
        self.logger = logging.getLogger("unityagents")
        self.run_id = run_id
        self.save_freq = save_freq
        self.lesson = lesson
        self.fast_simulation = fast_simulation
        self.load_model = load
        self.train_model = train
        self.worker_id = worker_id
        self.keep_checkpoints = keep_checkpoints
        self.trainers = {}
        if seed == -1:
            seed = np.random.randint(0, 999999)
        self.seed = seed
        np.random.seed(self.seed)
        tf.set_random_seed(self.seed)
        self.env = UnityEnvironment(file_name=env_path, worker_id=self.worker_id,
                                    curriculum=self.curriculum_file, seed=self.seed,
                                    docker_training=self.docker_training,
                                    no_graphics=no_graphics)
        if env_path is None:
            self.env_name = 'editor_' + self.env.academy_name
        else:
            self.env_name = os.path.basename(os.path.normpath(env_path))  # Extract name of environment

    def _get_progress(self):
        if self.curriculum_file is not None:
            progress = 0
            if self.env.curriculum.measure_type == "progress":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[brain_name].get_step / self.trainers[brain_name].get_max_steps
                return progress / len(self.env.external_brain_names)
            elif self.env.curriculum.measure_type == "reward":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[brain_name].get_last_reward
                return progress
            else:
                return None
        else:
            return None

    def _process_graph(self):
        nodes = []
        scopes = []
        for brain_name in self.trainers.keys():
            if self.trainers[brain_name].graph_scope is not None:
                scope = self.trainers[brain_name].graph_scope + '/'
                if scope == '/':
                    scope = ''
                scopes += [scope]
            if self.trainers[brain_name].parameters["trainer"] == "imitation":
                nodes += [scope + x for x in ["action"]]
            else:
                nodes += [scope + x for x in ["action", "value_estimate", "action_probs"]]
            if self.trainers[brain_name].parameters["use_recurrent"]:
                nodes += [scope + x for x in ["recurrent_out", "memory_size"]]
        if len(scopes) > 1:
            self.logger.info("List of available scopes :")
            for scope in scopes:
                self.logger.info("\t" + scope)
        self.logger.info("List of nodes to export :")
        for n in nodes:
            self.logger.info("\t" + n)
        return nodes

    def _save_model(self, sess, saver, steps=0):
        """
        Saves the current model to the checkpoint folder.
        :param sess: Current Tensorflow session.
        :param steps: Current number of steps in the training process.
        :param saver: Tensorflow saver for the session.
        """
        last_checkpoint = self.model_path + '/model-' + str(steps) + '.cptk'
        saver.save(sess, last_checkpoint)
        tf.train.write_graph(sess.graph_def, self.model_path, 'raw_graph_def.pb', as_text=False)
        self.logger.info("Saved Model")

    def _export_graph(self):
        """Exports the latest saved model to .bytes format for Unity embedding."""
        target_nodes = ','.join(self._process_graph())
        ckpt = tf.train.get_checkpoint_state(self.model_path)
        freeze_graph.freeze_graph(input_graph=self.model_path + '/raw_graph_def.pb',
                                  input_binary=True,
                                  input_checkpoint=ckpt.model_checkpoint_path,
                                  output_node_names=target_nodes,
                                  output_graph=(self.model_path + '/' + self.env_name + '_'
                                                + self.run_id + '.bytes'),
                                  clear_devices=True, initializer_nodes='', input_saver='',
                                  restore_op_name='save/restore_all',
                                  filename_tensor_name='save/Const:0')

    def _initialize_trainers(self, trainer_config, sess):
        trainer_parameters_dict = {}
        self.trainers = {}
        for brain_name in self.env.external_brain_names:
            trainer_parameters = trainer_config['default'].copy()
            if len(self.env.external_brain_names) > 1:
                graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name)
                trainer_parameters['graph_scope'] = graph_scope
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir, name=str(self.run_id) + '_' + graph_scope)
            else:
                trainer_parameters['graph_scope'] = ''
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir, name=str(self.run_id))
            if brain_name in trainer_config:
                _brain_key = brain_name
                while not isinstance(trainer_config[_brain_key], dict):
                    _brain_key = trainer_config[_brain_key]
                for k in trainer_config[_brain_key]:
                    trainer_parameters[k] = trainer_config[_brain_key][k]
            trainer_parameters_dict[brain_name] = trainer_parameters.copy()
        for brain_name in self.env.external_brain_names:
            if trainer_parameters_dict[brain_name]['trainer'] == "imitation":
                self.trainers[brain_name] = BehavioralCloningTrainer(
                    sess, self.env, brain_name, trainer_parameters_dict[brain_name],
                    self.train_model, self.seed)
            elif trainer_parameters_dict[brain_name]['trainer'] == "ppo":
                self.trainers[brain_name] = PPOTrainer(
                    sess, self.env, brain_name, trainer_parameters_dict[brain_name],
                    self.train_model, self.seed)
            else:
                raise UnityEnvironmentException(
                    "The trainer config contains an unknown trainer type for brain {}"
                    .format(brain_name))

    def _load_config(self):
        try:
            with open(self.trainer_config_path) as data_file:
                trainer_config = yaml.load(data_file)
                return trainer_config
        except IOError:
            raise UnityEnvironmentException(
                "Parameter file could not be found here {}. "
                "Will use default Hyper parameters".format(self.trainer_config_path))
        except UnicodeDecodeError:
            raise UnityEnvironmentException(
                "There was an error decoding Trainer Config from this path : {}"
                .format(self.trainer_config_path))

    @staticmethod
    def _create_model_path(model_path):
        try:
            if not os.path.exists(model_path):
                os.makedirs(model_path)
        except Exception:
            raise UnityEnvironmentException(
                "The folder {} containing the generated model could not be accessed."
                " Please make sure the permissions are set correctly.".format(model_path))

    def start_learning(self):
        self.env.curriculum.set_lesson_number(self.lesson)
        trainer_config = self._load_config()
        self._create_model_path(self.model_path)
        tf.reset_default_graph()
        with tf.Session() as sess:
            self._initialize_trainers(trainer_config, sess)
            for k, t in self.trainers.items():
                self.logger.info(t)
            init = tf.global_variables_initializer()
            saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
            # Instantiate model parameters
            if self.load_model:
                self.logger.info('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(self.model_path)
                if ckpt is None:
                    self.logger.info('The model {0} could not be found. Make sure you specified '
                                     'the right --run-id'.format(self.model_path))
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(init)
            global_step = 0  # This is only for saving the model
            self.env.curriculum.increment_lesson(self._get_progress())
            curr_info = self.env.reset(train_mode=self.fast_simulation)
            if self.train_model:
                for brain_name, trainer in self.trainers.items():
                    trainer.write_tensorboard_text('Hyperparameters', trainer.parameters)
            try:
                while any([t.get_step <= t.get_max_steps for k, t in self.trainers.items()]) \
                        or not self.train_model:
                    if self.env.global_done:
                        self.env.curriculum.increment_lesson(self._get_progress())
                        curr_info = self.env.reset(train_mode=self.fast_simulation)
                        for brain_name, trainer in self.trainers.items():
                            trainer.end_episode()
                    # Decide and take an action
                    take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {}
                    for brain_name, trainer in self.trainers.items():
                        (take_action_vector[brain_name],
                         take_action_memories[brain_name],
                         take_action_text[brain_name],
                         take_action_outputs[brain_name]) = trainer.take_action(curr_info)
                    new_info = self.env.step(vector_action=take_action_vector,
                                             memory=take_action_memories,
                                             text_action=take_action_text)
                    for brain_name, trainer in self.trainers.items():
                        trainer.add_experiences(curr_info, new_info, take_action_outputs[brain_name])
                        trainer.process_experiences(curr_info, new_info)
                        if trainer.is_ready_update() and self.train_model \
                                and trainer.get_step <= trainer.get_max_steps:
                            # Perform gradient descent with the experience buffer
                            trainer.update_model()
                        # Write training statistics to Tensorboard
                        trainer.write_summary(self.env.curriculum.lesson_number)
                        if self.train_model and trainer.get_step <= trainer.get_max_steps:
                            trainer.increment_step_and_update_last_reward()
                    if self.train_model:
                        global_step += 1
                    if global_step % self.save_freq == 0 and global_step != 0 and self.train_model:
                        # Save Tensorflow model
                        self._save_model(sess, steps=global_step, saver=saver)
                    curr_info = new_info
                # Final save of the Tensorflow model
                if global_step != 0 and self.train_model:
                    self._save_model(sess, steps=global_step, saver=saver)
            except KeyboardInterrupt:
                print('--------------------------Now saving model-------------------------')
                if self.train_model:
                    self.logger.info("Learning was interrupted. Please wait while the graph is generated.")
                    self._save_model(sess, steps=global_step, saver=saver)
                pass
        self.env.close()
        if self.train_model:
            self._export_graph()
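# Hypothetical invocation of TrainerController; the argument values below are
# placeholders in the spirit of the ML-Agents 0.4-era learn.py defaults, not
# values taken from the original file.
tc = TrainerController(env_path='envs/3DBall', run_id='run-0', save_freq=50000,
                       curriculum_file=None, fast_simulation=True, load=False,
                       train=True, worker_id=0, keep_checkpoints=5, lesson=0,
                       seed=-1, docker_target_name='',
                       trainer_config_path='trainer_config.yaml', no_graphics=False)
tc.start_learning()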
    ... per agent, to be retrieved at the next step.
    - value is an optional input that can be used to send a single float per agent,
      to be displayed if an AgentMonitor.cs component is attached to the agent.
    If you have more than one brain, use a dict with one action per brain:
        action = {'brain1': [1.0, 2.0], 'brain2': [3.0, 4.0]}
    '''
    for epi in range(10):
        # env.global_done could be used to check all agents at once
        env_info = env.reset(train_mode=train_mode)[default_brain]
        state = env_info.states[0]
        done = False
        epi_rewards = 0
        while not done:
            if brain.action_space_type == 'discrete':
                action = np.random.randint(0, brain.action_space_size,
                                           size=(len(env_info.agents)))
            else:
                action = np.random.randn(len(env_info.agents), brain.action_space_size)
            env_info = env.step(action)[default_brain]
            state = env_info.states[0]
            epi_rewards += env_info.rewards[0]
            done = env_info.local_done[0]
        print('Total reward for this episode: {}'.format(epi_rewards))
    env.close()
    print('Environment is closed')
class UnityEnv:
    '''
    Class for all Envs.
    Standardizes the UnityEnv design to work in Lab.
    Access Agents properties by: Agents - AgentSpace - AEBSpace - EnvSpace - Envs
    '''

    def __init__(self, env_spec, env_space, e=0):
        self.env_spec = env_spec
        self.env_space = env_space
        self.info_space = env_space.info_space
        self.e = e
        util.set_attr(self, self.env_spec)
        self.name = self.env_spec['name']
        self.body_e = None
        self.nanflat_body_e = None  # nanflattened version of bodies
        self.body_num = None
        worker_id = int(f'{os.getpid()}{self.e+int(ps.unique_id())}'[-4:])
        self.u_env = UnityEnvironment(file_name=util.get_env_path(self.name), worker_id=worker_id)
        # spaces for NN auto input/output inference
        logger.warn('Unity environment observation_space and action_space are constructed '
                    'with invalid range. Use only their shapes.')
        self.observation_spaces = []
        self.action_spaces = []
        for a in range(len(self.u_env.brain_names)):
            observation_shape = (self.get_observable_dim(a)['state'],)
            if self.get_brain(a).state_space_type == 'discrete':
                observation_space = gym.spaces.Box(low=0, high=1, shape=observation_shape, dtype=np.int32)
            else:
                observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=observation_shape, dtype=np.float32)
            self.observation_spaces.append(observation_space)
            if self.is_discrete(a):
                action_space = gym.spaces.Discrete(self.get_action_dim(a))
            else:
                action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
            self.action_spaces.append(action_space)
        for observation_space, action_space in zip(self.observation_spaces, self.action_spaces):
            set_gym_space_attr(observation_space)
            set_gym_space_attr(action_space)
        # TODO experiment to find out optimal benchmarking max_timestep, set
        # TODO ensure clock_speed from env_spec
        self.clock_speed = 1
        self.clock = Clock(self.clock_speed)
        self.done = False

    def check_u_brain_to_agent(self):
        '''Check the size match between unity brain and agent'''
        u_brain_num = self.u_env.number_brains
        agent_num = len(self.body_e)
        assert u_brain_num == agent_num, \
            f'There must be a Unity brain for each agent. e:{self.e}, brain: {u_brain_num} != agent: {agent_num}.'

    def check_u_agent_to_body(self, env_info_a, a):
        '''Check the size match between unity agent and body'''
        u_agent_num = len(env_info_a.agents)
        body_num = util.count_nonan(self.body_e[a])
        assert u_agent_num == body_num, \
            f'There must be a Unity agent for each body; a:{a}, e:{self.e}, agent_num: {u_agent_num} != body_num: {body_num}.'

    def get_brain(self, a):
        '''Get the unity-equivalent of an agent, i.e. a brain, to access its info'''
        name_a = self.u_env.brain_names[a]
        brain_a = self.u_env.brains[name_a]
        return brain_a

    def get_env_info(self, env_info_dict, a):
        name_a = self.u_env.brain_names[a]
        env_info_a = env_info_dict[name_a]
        return env_info_a

    @lab_api
    def post_body_init(self):
        '''Run init for components that need bodies to exist first, e.g. memory or architecture.'''
        self.nanflat_body_e = util.nanflatten(self.body_e)
        for idx, body in enumerate(self.nanflat_body_e):
            body.nanflat_e_idx = idx
        self.body_num = len(self.nanflat_body_e)
        self.check_u_brain_to_agent()
        logger.info(util.self_desc(self))

    def is_discrete(self, a):
        '''Check if an agent (brain) is subject to discrete actions'''
        return self.get_brain(a).is_discrete()

    def get_action_dim(self, a):
        '''Get the action dim for an agent (brain) in env'''
        return self.get_brain(a).get_action_dim()

    def get_action_space(self, a):
        return self.action_spaces[a]

    def get_observable_dim(self, a):
        '''Get the observable dim for an agent (brain) in env'''
        return self.get_brain(a).get_observable_dim()

    def get_observable_types(self, a):
        '''Get the observables for an agent (brain) in env'''
        return self.get_brain(a).get_observable_types()

    def get_observation_space(self, a):
        return self.observation_spaces[a]

    @lab_api
    def reset(self):
        self.done = False
        env_info_dict = self.u_env.reset(train_mode=(util.get_lab_mode() != 'dev'),
                                         config=self.env_spec.get('unity'))
        _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
        for (a, b), body in util.ndenumerate_nonan(self.body_e):
            env_info_a = self.get_env_info(env_info_dict, a)
            self.check_u_agent_to_body(env_info_a, a)
            state = env_info_a.states[b]
            state_e[(a, b)] = state
            done_e[(a, b)] = self.done
        return _reward_e, state_e, done_e

    @lab_api
    def step(self, action_e):
        # TODO implement clock_speed: step only if self.clock.to_step()
        if self.done:
            return self.reset()
        action_e = util.nanflatten(action_e)
        env_info_dict = self.u_env.step(action_e)
        reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
        for (a, b), body in util.ndenumerate_nonan(self.body_e):
            env_info_a = self.get_env_info(env_info_dict, a)
            reward_e[(a, b)] = env_info_a.rewards[b]
            state_e[(a, b)] = env_info_a.states[b]
            done_e[(a, b)] = env_info_a.local_done[b]
        self.done = (util.nonan_all(done_e) or self.clock.get('t') > self.max_timestep)
        return reward_e, state_e, done_e

    @lab_api
    def close(self):
        self.u_env.close()