def test_step():
    with mock.patch('subprocess.Popen'):
        with mock.patch('socket.socket') as mock_socket:
            with mock.patch('glob.glob') as mock_glob:
                mock_glob.return_value = ['FakeLaunchPath']
                mock_socket.return_value.accept.return_value = (mock_socket, 0)
                mock_socket.recv.return_value.decode.return_value = dummy_start
                env = UnityEnvironment(' ')
                brain = env.brains['RealFakeBrain']
                mock_socket.recv.side_effect = dummy_reset
                brain_info = env.reset()
                mock_socket.recv.side_effect = dummy_step
                brain_info = env.step([0] * brain.vector_action_space_size *
                                      len(brain_info['RealFakeBrain'].agents))
                with pytest.raises(UnityActionException):
                    env.step([0])
                brain_info = env.step([0] * brain.vector_action_space_size *
                                      len(brain_info['RealFakeBrain'].agents))
                with pytest.raises(UnityActionException):
                    env.step([0] * brain.vector_action_space_size *
                             len(brain_info['RealFakeBrain'].agents))
                env.close()
                assert env.global_done
                assert isinstance(brain_info, dict)
                assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
                assert isinstance(brain_info['RealFakeBrain'].visual_observations, list)
                assert isinstance(brain_info['RealFakeBrain'].vector_observations, np.ndarray)
                assert len(brain_info['RealFakeBrain'].visual_observations) == \
                    brain.number_visual_observations
                assert brain_info['RealFakeBrain'].vector_observations.shape[0] == \
                    len(brain_info['RealFakeBrain'].agents)
                assert brain_info['RealFakeBrain'].vector_observations.shape[1] == \
                    brain.vector_observation_space_size * brain.num_stacked_vector_observations
                assert not brain_info['RealFakeBrain'].local_done[0]
                assert brain_info['RealFakeBrain'].local_done[2]
def test_ppo_model_cc_visual_curio(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=2)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"], use_curiosity=True)
            init = tf.global_variables_initializer()
            sess.run(init)
            run_list = [model.output, model.all_probs, model.value, model.entropy,
                        model.learning_rate, model.intrinsic_reward]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]]),
                         model.next_vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                         [3, 4, 5, 3, 4, 5]]),
                         model.output: [[0.0, 0.0], [0.0, 0.0]],
                         model.visual_in[0]: np.ones([2, 40, 30, 3]),
                         model.visual_in[1]: np.ones([2, 40, 30, 3]),
                         model.next_visual_in[0]: np.ones([2, 40, 30, 3]),
                         model.next_visual_in[1]: np.ones([2, 40, 30, 3])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
def test_initialization():
    with mock.patch('subprocess.Popen'):
        with mock.patch('socket.socket') as mock_socket:
            with mock.patch('glob.glob') as mock_glob:
                mock_glob.return_value = ['FakeLaunchPath']
                mock_socket.return_value.accept.return_value = (mock_socket, 0)
                mock_socket.recv.return_value.decode.return_value = dummy_start
                env = UnityEnvironment(' ')
                with pytest.raises(UnityActionException):
                    env.step([0])
                assert env.brain_names[0] == 'RealFakeBrain'
                env.close()
def test_close():
    with mock.patch('subprocess.Popen'):
        with mock.patch('socket.socket') as mock_socket:
            with mock.patch('glob.glob') as mock_glob:
                mock_glob.return_value = ['FakeLaunchPath']
                mock_socket.return_value.accept.return_value = (mock_socket, 0)
                mock_socket.recv.return_value.decode.return_value = dummy_start
                env = UnityEnvironment(' ')
                assert env._loaded
                env.close()
                assert not env._loaded
                mock_socket.close.assert_called_once()
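# The socket-mocked tests above rely on module-level fixtures `dummy_start`,
# `dummy_reset`, and `dummy_step` that are not shown in this section. As a
# rough sketch (an assumption, not the verbatim fixture), `dummy_start` would
# be an encoded academy/brain JSON payload in the same shape as the inline
# strings used in test_ppo_model_discrete and test_cc_bc_model below:
dummy_start_example = '''{
  "AcademyName": "RealFakeAcademy",
  "resetParameters": {},
  "brainNames": ["RealFakeBrain"],
  "externalBrainNames": ["RealFakeBrain"],
  "logPath": "RealFakePath",
  "apiNumber": "API-3",
  "brainParameters": [{
    "vectorObservationSize": 3,
    "numStackedVectorObservations": 2,
    "vectorActionSize": 2,
    "memorySize": 0,
    "cameraResolutions": [],
    "vectorActionDescriptions": ["", ""],
    "vectorActionSpaceType": 1,
    "vectorObservationSpaceType": 1
  }]
}'''.encode()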
def test_ppo_model_discrete(): d_action_c_state_start = '''{ "AcademyName": "RealFakeAcademy", "resetParameters": {}, "brainNames": ["RealFakeBrain"], "externalBrainNames": ["RealFakeBrain"], "logPath":"RealFakePath", "apiNumber":"API-3", "brainParameters": [{ "vectorObservationSize": 3, "numStackedVectorObservations": 2, "vectorActionSize": 2, "memorySize": 0, "cameraResolutions": [{"width":30,"height":40,"blackAndWhite":false}], "vectorActionDescriptions": ["",""], "vectorActionSpaceType": 0, "vectorObservationSpaceType": 1 }] }'''.encode() tf.reset_default_graph() with mock.patch('subprocess.Popen'): with mock.patch('socket.socket') as mock_socket: with mock.patch('glob.glob') as mock_glob: # End of mock with tf.Session() as sess: with tf.variable_scope("FakeGraphScope"): mock_glob.return_value = ['FakeLaunchPath'] mock_socket.return_value.accept.return_value = (mock_socket, 0) mock_socket.recv.return_value.decode.return_value = d_action_c_state_start env = UnityEnvironment(' ') model = PPOModel(env.brains["RealFakeBrain"]) init = tf.global_variables_initializer() sess.run(init) run_list = [model.output, model.all_probs, model.value, model.entropy, model.learning_rate] feed_dict = {model.batch_size: 2, model.sequence_length: 1, model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]), model.visual_in[0]: np.ones([2, 40, 30, 3]) } sess.run(run_list, feed_dict=feed_dict) env.close()
def test_cc_bc_model(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = BehavioralCloningModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)
            run_list = [model.sample_action, model.policy]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
def test_cc_bc_model(): c_action_c_state_start = '''{ "AcademyName": "RealFakeAcademy", "resetParameters": {}, "brainNames": ["RealFakeBrain"], "externalBrainNames": ["RealFakeBrain"], "logPath":"RealFakePath", "apiNumber":"API-3", "brainParameters": [{ "vectorObservationSize": 3, "numStackedVectorObservations": 2, "vectorActionSize": 2, "memorySize": 0, "cameraResolutions": [], "vectorActionDescriptions": ["",""], "vectorActionSpaceType": 1, "vectorObservationSpaceType": 1 }] }'''.encode() tf.reset_default_graph() with mock.patch('subprocess.Popen'): with mock.patch('socket.socket') as mock_socket: with mock.patch('glob.glob') as mock_glob: # End of mock with tf.Session() as sess: with tf.variable_scope("FakeGraphScope"): mock_glob.return_value = ['FakeLaunchPath'] mock_socket.return_value.accept.return_value = (mock_socket, 0) mock_socket.recv.return_value.decode.return_value = c_action_c_state_start env = UnityEnvironment(' ') model = BehavioralCloningModel(env.brains["RealFakeBrain"]) init = tf.global_variables_initializer() sess.run(init) run_list = [model.sample_action, model.policy] feed_dict = {model.batch_size: 2, model.sequence_length: 1, model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]])} sess.run(run_list, feed_dict=feed_dict) env.close()
def test_ppo_model_dc_vector(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=True, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)
            run_list = [model.output, model.all_probs, model.value, model.entropy,
                        model.learning_rate]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
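# The test functions that accept (mock_communicator, mock_launcher) imply
# @mock.patch decorators that are not shown in this section. A hedged sketch of
# the expected wiring follows; the exact patch targets depend on the package
# layout and are an assumption here, not taken from this source:
#
# @mock.patch('unityagents.UnityEnvironment.executable_launcher')
# @mock.patch('unityagents.UnityEnvironment.get_communicator')
# def test_ppo_model_dc_vector(mock_communicator, mock_launcher):
#     ...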
class UnityEnv(gym.Env): """ Provides Gym wrapper for Unity Learning Environments. Multi-agent environments use lists for object types, as done here: https://github.com/openai/multiagent-particle-envs """ def __init__(self, params): environment_filename = params['path'] worker_id = params['worker_id'] seed = params['seed'] use_visual = params['visual_mode'] multiagent = params['multiagent_mode'] self._env = UnityEnvironment(environment_filename, seed=seed) self.name = self._env.academy_name self.visual_obs = None self._action_space_size = None self._current_state = None self._n_agents = None self._multiagent = multiagent # Check brain configuration if len(self._env.brains) != 1: raise UnityGymException( "There can only be one brain in a UnityEnvironment " "if it is wrapped in a gym.") self.brain_name = self._env.external_brain_names[0] brain = self._env.brains[self.brain_name] if use_visual and brain.number_visual_observations == 0: raise UnityGymException( "`use_visual` was set to True, however there are no" " visual observations as part of this environment.") self.use_visual = brain.number_visual_observations >= 1 and use_visual if brain.number_visual_observations > 1: logger.warning( "The environment contains more than one visual observation. " "Please note that only the first will be provided in the observation." ) if brain.num_stacked_vector_observations != 1: raise UnityGymException( "There can only be one stacked vector observation in a UnityEnvironment " "if it is wrapped in a gym.") # Check for number of agents in scene. initial_info = self._env.reset()[self.brain_name] self._check_agents(len(initial_info.agents)) # Set observation and action spaces if brain.vector_action_space_type == "discrete": if len(brain.vector_action_space_size) == 1: self._action_space = spaces.Discrete( brain.vector_action_space_size[0]) else: self._action_space = spaces.MultiDiscrete( brain.vector_action_space_size) else: self._action_space_size = brain.vector_action_space_size high = np.array([1] * brain.vector_action_space_size) self._action_space = spaces.Box(-high, high, dtype=np.float32) high = np.array([np.inf] * brain.vector_observation_space_size) self.action_meanings = brain.vector_action_descriptions if self.use_visual: if brain.camera_resolutions[0]["blackAndWhite"]: depth = 1 else: depth = 3 self._observation_space = spaces.Box( 0, 1, dtype=np.float32, shape=(brain.camera_resolutions[0]["height"], brain.camera_resolutions[0]["width"], depth)) else: self._observation_space = spaces.Box(-high, high, dtype=np.float32) def reset(self, train_mode=True): """Resets the state of the environment and returns an initial observation. In the case of multi-agent environments, this is a list. Returns: observation (object/list): the initial observation of the space. """ info = self._env.reset(train_mode)[self.brain_name] n_agents = len(info.agents) self._check_agents(n_agents) if not self._multiagent: obs, reward, done, info = self._single_step(info) else: obs, reward, done, info = self._multi_step(info) return obs def step(self, action): """Run one timestep of the environment's dynamics. When end of episode is reached, you are responsible for calling `reset()` to reset this environment's state. Accepts an action and returns a tuple (observation, reward, done, info). In the case of multi-agent environments, these are lists. 
Args: action (object/list): an action provided by the environment Returns: observation (object/list): agent's observation of the current environment reward (float/list) : amount of reward returned after previous action done (boolean/list): whether the episode has ended. info (dict): contains auxiliary diagnostic information, including BrainInfo. """ # Use random actions for all other agents in environment. if self._multiagent: if not isinstance(action, list): raise UnityGymException( "The environment was expecting `action` to be a list.") if len(action) != self._n_agents: raise UnityGymException( "The environment was expecting a list of {} actions.". format(self._n_agents)) else: action = np.array(action) info = self._env.step(action)[self.brain_name] n_agents = len(info.agents) self._check_agents(n_agents) self._current_state = info if not self._multiagent: obs, reward, done, info = self._single_step(info) else: obs, reward, done, info = self._multi_step(info) return obs, reward, done, info def _single_step(self, info): if self.use_visual: self.visual_obs = info.visual_observations[0][0, :, :, :] default_observation = self.visual_obs else: default_observation = info.vector_observations[0, :] return default_observation, info.rewards[0], info.local_done[0], { "text_observation": info.text_observations[0], "brain_info": info } def _multi_step(self, info): if self.use_visual: self.visual_obs = info.visual_observations default_observation = self.visual_obs else: default_observation = info.vector_observations return list(default_observation), info.rewards, info.local_done, { "text_observation": info.text_observations, "brain_info": info } def render(self, mode='rgb_array'): return self.visual_obs def close(self): """Override _close in your subclass to perform any necessary cleanup. Environments will automatically close() themselves when garbage collected or when the program exits. """ self._env.close() def get_action_meanings(self): return self.action_meanings def seed(self, seed=None): """Sets the seed for this env's random number generator(s). Currently not implemented. """ logger.warning("Could not seed environment %s", self.name) return def _check_agents(self, n_agents): if not self._multiagent and n_agents > 1: raise UnityGymException( "The environment was launched as a single-agent environment, however" "there is more than one agent in the scene.") elif self._multiagent and n_agents <= 1: raise UnityGymException( "The environment was launched as a mutli-agent environment, however" "there is only one agent in the scene.") if self._n_agents is None: self._n_agents = n_agents logger.info("{} agents within environment.".format(n_agents)) elif self._n_agents != n_agents: raise UnityGymException( "The number of agents in the environment has changed since " "initialization. This is not supported.") @property def metadata(self): return {'render.modes': ['rgb_array']} @property def reward_range(self): return -float('inf'), float('inf') @property def spec(self): return None @property def action_space_size(self): return self._action_space_size @property def action_space(self): return self._action_space @property def observation_space(self): return self._observation_space @property def number_agents(self): return self._n_agents
info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
trainer.reset_buffers(info, total=True)
# Decide and take an action
new_info = trainer.take_action(info, env, brain_name, steps, normalize)
info = new_info
trainer.process_experiences(info, time_horizon, gamma, lambd)
if len(trainer.training_buffer['actions']) > buffer_size and train_model:
    # Perform gradient descent with experience buffer
    trainer.update_model(batch_size, num_epoch)
if steps % summary_freq == 0 and steps != 0 and train_model:
    # Write training statistics to TensorBoard
    trainer.write_summary(summary_writer, steps, env._curriculum.lesson_number)
if steps % save_freq == 0 and steps != 0 and train_model:
    # Save Tensorflow model
    save_model(sess, model_path=model_path, steps=steps, saver=saver)
if train_model:
    steps += 1
    sess.run(ppo_model.increment_step)
    if len(trainer.stats['cumulative_reward']) > 0:
        mean_reward = np.mean(trainer.stats['cumulative_reward'])
        sess.run(ppo_model.update_reward,
                 feed_dict={ppo_model.new_reward: mean_reward})
        last_reward = sess.run(ppo_model.last_reward)
# Final save Tensorflow model
if steps != 0 and train_model:
    save_model(sess, model_path=model_path, steps=steps, saver=saver)
env.close()
graph_name = (env_name.strip()
              .replace('.app', '').replace('.exe', '')
              .replace('.x86_64', '').replace('.x86', ''))
graph_name = os.path.basename(os.path.normpath(graph_name))
export_graph(model_path, graph_name)
def experiment(hidden_size=64, lr=3e-4, num_steps=2048, mini_batch_size=32, ppo_epochs=10, threshold_reward=10, max_episodes=15, nrmlz_adv=True, clip_gradients=True): use_cuda = torch.cuda.is_available() # device = torch.device("cuda" if use_cuda else "cpu") device = torch.device("cpu") print(device) scores_window = deque(maxlen=100) test_rewards = [] # env = UnityEnvironment(file_name='p2_continuous-control/reacher20/reacher', base_port=64739) env = UnityEnvironment(file_name='reacher20/reacher', base_port=64739) # get the default brain brain_name = env.brain_names[0] brain = env.brains[brain_name] # reset the environment env_info = env.reset(train_mode=True)[brain_name] action_size = brain.vector_action_space_size num_agents = len(env_info.agents) states = env_info.vector_observations state_size = states.shape[1] num_inputs = state_size num_outputs = action_size model = ActorCriticPolicy(num_inputs, num_outputs, hidden_size).to(device) optimizer = optim.Adam(model.parameters(), lr=lr, eps=1e-5) # while episode < max_episodes and not early_stop: for episode in tqdm(range(max_episodes)): log_probs = [] values = [] states_list = [] actions_list = [] rewards = [] masks = [] env_info = env.reset(train_mode=True)[brain_name] state = env_info.vector_observations for duration in range(num_steps): state = torch.FloatTensor(state).to(device) dist, value = model(state) action_t = dist.sample() action_np = action_t.cpu().data.numpy() env_info = env.step(action_np)[ brain_name] # send all actions to the environment next_state = env_info.vector_observations # get next state (for each agent) reward = env_info.rewards # get reward (for each agent) dones = np.array(env_info.local_done) # see if episode finished if reward == None: pass log_prob = dist.log_prob(action_t) log_prob = torch.sum(log_prob, dim=1, keepdim=True) log_probs.append(log_prob) values.append(value) reward_t = torch.FloatTensor(reward).unsqueeze(1).to(device) masks_t = torch.FloatTensor(1 - dones) rewards.append(reward_t) masks.append(masks_t) states_list.append(state) actions_list.append(action_t) state = next_state if np.any(dones): break next_state = torch.FloatTensor(state).to(device) _, next_value = model(next_state) # returns = compute_gae(next_value, rewards, masks, values) mean1 = torch.mean(torch.stack(rewards)) print("Rewards: ", mean1) returns = compute_gaes(next_value, rewards, masks, values) # return2 = compute_gae_rollout(rollout) returns = torch.cat(returns).detach() mean2 = torch.mean(returns) #print("Returns: ", mean2) log_probs = torch.cat(log_probs).detach() values = torch.cat(values).detach() states = torch.cat(states_list) actions = torch.cat(actions_list) advantages = returns - values if nrmlz_adv: advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) losses = [] clip_param = 0.2 print("return: ", returns.mean(), "advantage:", advantages.mean()) for _ in range(ppo_epochs): for state, action, old_log_probs, return_, advantage in ppo_iter( mini_batch_size, states, actions, log_probs, returns, advantages): # print("return: ", return_.mean(), "advantage:", advantage.mean()) dist, value = model(state) entropy = dist.entropy().mean() new_log_probs = dist.log_prob(action) new_log_probs = torch.sum(new_log_probs, dim=1, keepdim=True) ratio = (new_log_probs - old_log_probs).exp() # surrogate objective surr1 = ratio * advantage # Clipped Surrogate Objectiv surr2 = ratio.clamp(1.0 - clip_param, 1.0 + clip_param) * advantage policy_loss = -torch.min(surr1, surr2).mean() - 0.01 * entropy value_loss = 
(return_ - value).pow(2).mean() loss = 0.5 * value_loss + policy_loss losses.append(loss) optimizer.zero_grad() loss.backward() if clip_gradients: nn.utils.clip_grad_norm_(model.parameters(), 5) optimizer.step() test_mean_reward = test_agent(env, brain_name, model, device) test_rewards.append(test_mean_reward) scores_window.append(test_mean_reward) # mean_score = np.mean(scores_window) # print("Mean Score: ", mean_score, "Frame: ", episode) print('Episode {}, Total score this episode: {}, Last {} average: {}'. format(episode, test_mean_reward, min(episode, 100), np.mean(scores_window))) if np.mean(scores_window) > threshold_reward: torch.save( model.state_dict(), f"ppo_checkpoint_{test_mean_reward}_e{episode}_hs{hidden_size}_lr{lr}_st{num_steps}_b{mini_batch_size}_ppo{ppo_epochs}_r{threshold_reward}_e{episode}_adv{nrmlz_adv}_{test_mean_reward}.pth" ) print( '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}' .format(episode, test_mean_reward)) break episode += 1 # %% #torch.save(model.state_dict(), # f"ppo_checkpoint_{test_mean_reward}_e{episode}_hs{hidden_size}_lr{lr}_st{num_steps}_b{mini_batch_size}_ppo{ppo_epochs}_r{threshold_reward}_e{episode}_adv{nrmlz_adv}.pth") env.close() return scores_window, test_rewards
def train( env_location, curve_path, n_episodes=1000, batch_size=512, buffer_size=int(1e6), ): env = UnityEnvironment(file_name=env_location) # get the default brain brain_name = env.brain_names[0] brain = env.brains[brain_name] # reset the environment env_info = env.reset(train_mode=True)[brain_name] # number of agents num_agents = len(env_info.agents) logger.info(f'Number of agents: {num_agents}') # size of each action action_size = brain.vector_action_space_size logger.info(f'Size of each action: {action_size}') # examine the state space states = env_info.vector_observations state_size = states.shape[1] logger.info( 'There are {} agents. Each observes a state with length: {}'.format( states.shape[0], state_size)) logger.info(f'The state for the first agent looks like: {states[0]}') # reset the environment # Replay memory random_seed = 2 memory0 = ReplayBuffer(action_size, buffer_size, batch_size, random_seed) memory1 = memory0 def create_agent(memory): return Agent(state_size=states.shape[1], action_size=brain.vector_action_space_size, random_seed=random_seed, memory=memory, batch_size=batch_size) agent0 = create_agent(memory0) agent1 = create_agent(memory1) def ddpg(n_episodes, average_window=100, plot_every=4): scores_deque = deque(maxlen=average_window) scores_all = [] average_scores_all = [] for i_episode in range(1, n_episodes + 1): env_info = env.reset(train_mode=True)[brain_name] states = np.array( env_info.vector_observations, copy=True) # get the current state (for each agent) agent0.reset() agent1.reset() scores = np.zeros( num_agents) # initialize the score (for each agent) while True: action0 = agent0.act(states[0]) action1 = agent1.act(states[1]) actions = np.concatenate((action0, action1)) env_info = env.step(actions)[ brain_name] # send all actions to tne environment next_states = env_info.vector_observations # get next state (for each agent) rewards = env_info.rewards # get reward (for each agent) dones = env_info.local_done # see if episode finished memory0.add(states[0], action0, rewards[0], next_states[0], dones[0]) memory1.add(states[1], action1, rewards[1], next_states[1], dones[1]) agent0.step() agent1.step() scores += env_info.rewards # update the score (for each agent) states = next_states # roll over states to next time step any_done = np.any(dones) assert any_done == np.all(dones) if any_done: # exit loop if episode finished break score_episode = np.max(scores) best_agent = np.argmax(scores) scores_deque.append(score_episode) scores_all.append(score_episode) average_score_queue = np.mean(scores_deque) average_scores_all.append(average_score_queue) logger.info( '\rEpisode {}\tScore: {:.4f}\tBest Agent: {}\tAverage Score: {:.4f}' .format(i_episode, score_episode, best_agent, average_score_queue)) torch.save(agent0.actor_local.state_dict(), 'checkpoint_actor0.pth') torch.save(agent0.critic_local.state_dict(), 'checkpoint_critic0.pth') torch.save(agent1.actor_local.state_dict(), 'checkpoint_actor1.pth') torch.save(agent1.critic_local.state_dict(), 'checkpoint_critic1.pth') if i_episode > average_window and average_score_queue > 1.0: break if i_episode % plot_every == 0: plot_curve(scores_all, average_scores_all) return scores_all, average_scores_all scores, average_scores = ddpg(n_episodes=n_episodes) plot_curve(scores, average_scores) env.close() return np.max(average_scores)
class UnityEnvV0(Env, Serializable): def __init__(self, app_name, time_state=False, idx=0, is_render=False, no_graphics=False, recording=True): Serializable.quick_init(self, locals()) # Unity scene self._env = UnityEnvironment(file_name=app_name, worker_id=idx, no_graphics=no_graphics) self.id = 0 self.name = app_name self.idx = idx self.is_render = is_render self.time_state = time_state self.time_step = 0 # Check brain configuration assert len(self._env.brains) == 1 self.brain_name = self._env.external_brain_names[0] brain = self._env.brains[self.brain_name] # Check for number of agents in scene initial_info = self._env.reset()[self.brain_name] self.use_visual = (brain.number_visual_observations == 1) and False self.recording = brain.number_visual_observations == 1 and recording # Set observation and action spaces if brain.vector_action_space_type == "discrete": self._action_space = Discrete(1) else: high = np.array([np.inf] * (brain.vector_action_space_size)) self._action_space = Box(-high, high) # ---------------------------------- if self.use_visual and False and no_graphic: high = np.array([np.inf] * brain.camera_resolutions[0]["height"] * brain.camera_resolutions[0]["width"] * 3) self._observation_space = Box(-high, high) else: if self.time_state: high = np.array([np.inf] * (brain.vector_observation_space_size + 1)) else: high = np.array([np.inf] * (brain.vector_observation_space_size)) self._observation_space = Box(-high, high) # video buffer self.frames = [] def reset(self): self.frames = [] info = self._env.reset()[self.brain_name] if self.is_render: self.observation = info.visual_observations[0] state = info.vector_observations[0][:] self._pos = info.vector_observations[0][:2] if self.time_state: state = np.hstack((state, [self.time_step])) self.time_step += 1 self._collect_frames(info.visual_observations[0][0]) return state.flatten() def step(self, action): info = self._env.step([action])[self.brain_name] if self.is_render: self.observation = info.visual_observations[0] state = info.vector_observations[0][:] self._pos = info.vector_observations[0][:2] reward = info.rewards[0] done = info.local_done[0] if self.time_state: state = np.hstack((state, [self.time_step])) self.time_step += 1 if done: self.time_step = 0 self._collect_frames(info.visual_observations[0][0]) return Step(observation=state.flatten(), reward=reward, done=done) def terminate(self): self._env.close() def render(self, mode=None): if self.is_render: x = self.observation[0] * 255 return np.array(x).astype('uint8') else: return np.zeros((480, 360, 3)) def _collect_frames(self, frame): if self.recording: self.frames.append(np.uint8(frame * 255)) @property def action_space(self): return self._action_space @property def observation_space(self): return self._observation_space @property def position(self): return self._pos
class UnityEnv(IEnvironment):
    def __init__(self, name):
        drl_logger.info("Initializing environment.",
                        extra={"params": {
                            "name": name,
                        }})
        self.env = UnityEnvironment(file_name=name)
        self.brain_name = self.env.brain_names[0]
        self.termination_reward = 0

    def action_offset(self):
        return 0

    def close(self):
        self.env.close()

    def get_action_space(self):
        # isDiscrete = isinstance(self.__env.action_space, Discrete)
        #
        # if isDiscrete:
        #     num_action_space = self.__env.action_space.n
        #     logging.debug("Env action space is discrete")
        #     logging.debug("Env action space: {}".format(num_action_space))
        #
        # logging.debug("Env observation space: {}".format(self.__env.observation_space))
        pass

    def render(self, mode):
        pass

    def reset(self):
        brain_name = self.env.brain_names[0]
        # brain = self.__env.brains[brain_name]
        env_info = self.env.reset(train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]  # get the current state
        # state = env_info.vector_observations  # get the current state
        new_life = True
        return state, new_life

    def start_game_action(self):
        return None

    def step(self, action):
        env_info = self.env.step(action)[self.brain_name]  # send the action to the environment
        next_state = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]  # get the reward
        done = env_info.local_done[0]  # see if episode has finished
        if done:
            reward += self.termination_reward
        new_life = False
        return next_state, reward, done, new_life
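# A minimal usage sketch for the wrapper above (an illustration, not part of
# the original source). The environment path and the random policy are
# hypothetical placeholders; step() returns (next_state, reward, done, new_life).
def run_one_episode(env, policy, max_steps=1000):
    state, _ = env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        action = policy(state)
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward

# env = UnityEnv(name="Banana.app")  # hypothetical path
# score = run_one_episode(env, lambda s: np.random.randint(4))
# env.close()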
def main(): env = UnityEnvironment(file_name='Reacher.app') brain_name = env.brain_names[0] brain = env.brains[brain_name] env_info = env.reset(train_mode=True)[brain_name] num_agents = len(env_info.agents) action_size = brain.vector_action_space_size states = env_info.vector_observations state_size = states.shape[1] agent = Agent(state_size=state_size, action_size=action_size, random_seed=3) scores_deque = deque(maxlen=100) scores = [] for i_episode in range(1, 1000): begin = time.time() curr_scores = np.zeros( num_agents) # initialize the score (for each agent) env_info = env.reset( train_mode=True)[brain_name] # reset the environment states = env_info.vector_observations # get the current state (for each agent) agent.reset() for t in range(1000): actions = agent.act(states) env_info = env.step(actions)[ brain_name] # send all actions to the environment next_states = env_info.vector_observations # get next state (for each agent) rewards = env_info.rewards # get reward (for each agent) dones = env_info.local_done # see if episode finished agent.step(states, actions, rewards, next_states, dones, t) states = next_states curr_scores += rewards if np.any(dones): break curr_score = np.mean(curr_scores) scores_deque.append(curr_score) average_score = np.mean(scores_deque) scores.append(curr_score) print( '\rEpisode {}\tTime: {:.2f}\tAvg: {:.2f}\tScore: {:.2f}\tMin {:.2f}\tMax {:.2f}' .format(i_episode, time.time() - begin, average_score, curr_score, min(curr_scores), max(curr_scores))) if i_episode % 10 == 0: torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth') torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth') if average_score >= 30.0: print( '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}' .format(i_episode - 100, average_score)) torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth') torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth') break env.close() return
def train_unity_ddpg(PATH, env_name, platform, env_path, policy, score_threshold, timestamp, start, n_episodes, max_t, num_agents): """ Trains unity environments with DDPG policy """ total_scores = [] from unityagents import UnityEnvironment env_path = PATH + f"data/{env_path}" env = UnityEnvironment(file_name=env_path) brain_name = env.brain_names[0] brain = env.brains[brain_name] env_info = env.reset(train_mode=True)[brain_name] num_agents = len(env_info.agents) print(f"Number of agents: {num_agents}") states = env_info.vector_observations state_size = states.shape[1] print( f"There are {states.shape[0]} agents. Each observes a state with length {state_size}" ) print(f"The state for the first agent looks like:\n{states[0]}") action_size = brain.vector_action_space_size print(f"Size of each action: {action_size}") policy = policy(state_size, action_size, num_agents) for i_episode in range(1, n_episodes + 1): env_info = env.reset(train_mode=True)[brain_name] states = env_info.vector_observations scores = np.zeros(num_agents) policy.reset() for t in range(max_t): actions = policy.act(states) env_info = env.step(actions)[ brain_name] # send the action to the environment next_states = env_info.vector_observations rewards = env_info.rewards # get the reward dones = env_info.local_done policy.step(states, actions, rewards, next_states, dones, t) states = next_states scores += env_info.rewards if np.any(dones): break score_length = len(total_scores) if len(total_scores) < 100 else 100 mean_score = np.mean(scores) min_score = np.min(scores) max_score = np.max(scores) total_scores.append(mean_score) total_average_score = np.mean(total_scores[-score_length:]) end = time.time() print( f'\rEpisode {i_episode}\tScore TAS/Mean/Max/Min: {total_average_score:.2f}/{mean_score:.2f}/{max_score:.2f}/{min_score:.2f}\t{calc_runtime(end-start)}', end=" ") if i_episode % 20 == 0 or total_average_score >= score_threshold: fap = PATH + f'results/{env_name}_{timestamp}_checkpoint_actor.pth' torch.save(policy.actor.state_dict(), fap) fcp = PATH + f'results/{env_name}_{timestamp}_checkpoint_critic.pth' torch.save(policy.critic.state_dict(), fcp) print( f'\rEpisode {i_episode}\tScore TAS/Mean/Max/Min: {total_average_score:.2f}/{mean_score:.2f}/{max_score:.2f}/{min_score:.2f}\t{calc_runtime(end-start)}' ) if total_average_score > score_threshold: print(f"Solved in {i_episode} and {calc_runtime(end-start)}") break env.close() return total_scores
def main(): # --------------------------------------------------------------------------------------------------- # Logger # --------------------------------------------------------------------------------------------------- save_path = f"./results/Tennis_DDPG_{pd.Timestamp.utcnow().value}" os.makedirs(save_path, exist_ok=True) logger = logging.getLogger() formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s : %(message)s') handler = logging.FileHandler( f"{save_path}/logs_p3_{pd.Timestamp.utcnow().value}.log") handler.setLevel(logging.DEBUG) handler.setFormatter(formatter) logger.addHandler(handler) # --------------------------------------------------------------------------------------------------- # Inputs # --------------------------------------------------------------------------------------------------- import json with open(f"./assets/best_agent/config.json", "r") as f: config = json.load(f) config["mode"] = "test" config["n_episodes"] = 10 config["warmup"] = 0 logger.warning("+=" * 90) logger.warning(f" RUNNING SIMULATION WITH PARAMETERS config={config}") logger.warning("+=" * 90) # ------------------------------------------------------------ # 1. Initialization # ------------------------------------------------------------ # 1. Start the Environment env = UnityEnvironment(file_name=f'./{config["env_name"]}') # mac OS # get the default brain brain_name = env.brain_names[0] brain = env.brains[brain_name] # reset the environment env_info = env.reset(train_mode=True)[brain_name] # number of agents num_agents = len(env_info.agents) print('Number of agents:', num_agents) config["n_agents"] = num_agents # size of each action action_size = brain.vector_action_space_size print('Size of each action:', action_size) # examine the state space states = env_info.vector_observations state_size = states.shape[1] print('There are {} agents. Each observes a state with length: {}'.format( states.shape[0], state_size)) print('The state for the first agent looks like:', states[0]) config.update(dict(action_size=action_size, state_size=state_size)) # ------------------------------------------------------------ # 2. Training # ------------------------------------------------------------ # Unity Monitor monitor = UnityMonitor(env=env, config=config) # Actor model seed = 0 actor = SimpleNeuralNetHead(action_size, SimpleNeuralNetBody( state_size, config["hidden_layers_actor"], seed=seed), func=torch.tanh, seed=seed) # Critic model critic = DeepNeuralNetHeadCritic( action_size * num_agents, SimpleNeuralNetBody(state_size * num_agents, config["hidden_layers_critic_body"], func=eval(config["func_critic_body"]), seed=seed), hidden_layers_sizes=config["hidden_layers_critic_head"], func=eval(config["func_critic_head"]), end_func=None, seed=seed) # MADDPG Agent agent = MADDPGAgent( state_size=state_size, action_size=action_size, model_actor=actor, model_critic=critic, action_space_low=-1, action_space_high=1, config=config, ) # ------------------------------------------------------------ # 3. 
Testing # ------------------------------------------------------------ logger.warning("Entering Test Mode!") monitor.n_episodes = 100 env.reset(train_mode=False) env.warmup = 0 agent.warmup = 0 for a in agent.agents: a.warmup = 0 agent.load(filepath="./assets/best_agent", mode="test") scores = monitor.run(agent) logger.info(f"Test Score over {len(scores)} episodes: {np.mean(scores)}") config["test_scores"] = scores config["best_test_score"] = max(scores) config["avg_test_score"] = np.mean(scores) # When finished, you can close the environment. logger.info("Closing...") env.close()
class Env: '''A convinience function for generating episodes and memories This convinience class generates a context manager that can be used for generating a Unity environment. The Unity environment and the OpenAI Gym environment operates slightly differently and hence it will be difficult to create a uniform algorithm that is able to solve everything at the sametime. This environment tries to solve that problem. ''' def __init__(self, fileName, showEnv=False, trainMode=True): '''Initialize the environment This sets up the requirements that will later be used for generating the Unity Environment. This assumes that you will provide a binary file for generating the environment. There are different ways in which the environment can be generated. It can be generated either in a *headless* mode by using showEnv as False, in which case the environment will not show a window at startup. This is good for training, as well as situations when you are running the environment without the presence of an X server, especially when you are running this environment remotely. The other thing that you can do is to specify that this is being run in `trainMode`. In this case, the environment will be primed for training. That is, each frame will finish as soon as possible. This is not good for observing what is happening. However, this significantly increases the speed of training. Arguments: fileName {str} -- Path to the binary file. This file must be the same as the one for which the `unityagents` package has been generated. Keyword Arguments: showEnv {bool} -- Set this to ``True`` if you want to view the environment (default: {False}) trainMode {bool} -- Set this to ``True`` if you want the environment tobe in training mode (i.e. fast execution) (default: {True}) ''' try: self.no_graphics = not showEnv self.trainMode = trainMode self.fileName = fileName self.states = None except Exception as e: raise type(e)('lib.envs.envUnity.Env.__init__ - ERROR - ' + str(e)).with_traceback(sys.exc_info()[2]) return def __enter__(self): '''generate a context manager This will actually generate the context manager and allow you use this within a ``with`` statement. This is the function that actually initialized the environment and maintains it, until it is needed. Returns: ``this`` -- Returns an instance of the same class ''' try: self.env = UnityEnvironment(file_name=self.fileName, no_graphics=self.no_graphics) # get the default brain self.brain_name = self.env.brain_names[0] self.brain = self.env.brains[self.brain_name] self.env_info = self.env.reset( train_mode=self.trainMode)[self.brain_name] self.num_agents = len(self.env_info.agents) self.action_size = self.brain.vector_action_space_size except Exception as e: raise type(e)('lib.envs.envUnity.Env.__enter__ - ERROR - ' + str(e)).with_traceback(sys.exc_info()[2]) return self def reset(self): '''reset the environment before starting an episode Returns: status -- The current status after the reset ''' try: self.env.reset(train_mode=self.trainMode) self.states = self.env_info.vector_observations except Exception as e: raise type(e)('lib.envs.envUnity.Env.reset - ERROR - ' + str(e)).with_traceback(sys.exc_info()[2]) return self.states def step(self, policy): '''advance one step by taking an action This function takes a policy function and generates an action according to that particular policy. This results in the advancement of the episode into a one step with the return of the reward, and the next state along with any done information. 
Arguments: policy {function} -- This function takes a state vector and returns an action vector. It is assumed that the policy is the correct type of policy, and is capable if taking the right returning the right type of vector corresponding the the policy for the current environment. It does not check for the validity of the policy function Returns: list -- This returns a list of tuples containing the tuple ``(s_t, a_t, r_{t+1}, s_{t+1}, d)``. One tuple for each agent. Even for the case of a single agent, this is going to return a list of states ''' try: states = self.states.copy() actions = policy(states) env_info = self.env.step(actions)[self.brain_name] next_states = env_info.vector_observations rewards = env_info.rewards dones = env_info.local_done self.states = next_states results = [] for i in range(self.num_agents): state = states[i] action = actions[i] reward = rewards[i] next_state = next_states[i] done = dones[i] results.append((state, action, reward, next_state, done)) except Exception as e: raise type(e)('lib.envs.envUnity.Env.step - ERROR - ' + str(e)).with_traceback(sys.exc_info()[2]) return results def episode(self, policy, maxSteps=None): '''generate data for an entire episode This function generates an entire episde. It plays the environment by first resetting it too the beginning, and then playing the game for a given number of steps (or unless the game is terminated). It generates a set of list of tuplees, again one for each agent. Rememebr that even when the number of agents is 1, it will still return a list oof states. Arguments: policy {function} -- The function that takes the current state and returns the action vector. Keyword Arguments: maxSteps {int or None} -- The maximum number of steps that the agent is going to play the episode before the episode is terminated. (default: {None} in which case the episode will continue until it actually finishes) Returns: list -- This returns the list of tuples for the entire episode. Again, this is a lsit of lists, one for each agent. ''' try: self.reset() stepCount = 0 allResults = [[] for _ in range(self.num_agents)] while True: stepCount += 1 finished = False results = self.step(policy) for agent in range(self.num_agents): state, action, reward, next_state, done = results[agent] allResults[agent].append(results[agent]) finished = finished or done if finished: break if (maxSteps is not None) and (stepCount >= maxSteps): break except Exception as e: raise type(e)('lib.envs.envUnity.Env.episode - ERROR - ' + str(e)).with_traceback(sys.exc_info()[2]) return allResults def __exit__(self, exc, value, traceback): '''Exit the context manager The exit funciton that will result in exiting the context manager. Typically one is supposed to check the error if any at this point. This will be handled at a higher level Arguments: *args {[type]} -- [description] ''' if not exec: self.env.close() return True
class UnityEnv: ''' Class for all Envs. Standardizes the UnityEnv design to work in Lab. Access Agents properties by: Agents - AgentSpace - AEBSpace - EnvSpace - Envs ''' def __init__(self, env_spec, env_space, e=0): self.env_spec = env_spec self.env_space = env_space self.info_space = env_space.info_space self.e = e util.set_attr(self, self.env_spec) self.name = self.env_spec['name'] self.body_e = None self.nanflat_body_e = None # nanflatten version of bodies self.body_num = None worker_id = int(f'{os.getpid()}{self.e+int(ps.unique_id())}'[-4:]) self.u_env = UnityEnvironment(file_name=util.get_env_path(self.name), worker_id=worker_id) # spaces for NN auto input/output inference logger.warn('Unity environment observation_space and action_space are constructed with invalid range. Use only their shapes.') self.observation_spaces = [] self.action_spaces = [] for a in range(len(self.u_env.brain_names)): observation_shape = (self.get_observable_dim(a)['state'],) if self.get_brain(a).state_space_type == 'discrete': observation_space = gym.spaces.Box(low=0, high=1, shape=observation_shape, dtype=np.int32) else: observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=observation_shape, dtype=np.float32) self.observation_spaces.append(observation_space) if self.is_discrete(a): action_space = gym.spaces.Discrete(self.get_action_dim(a)) else: action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32) self.action_spaces.append(action_space) for observation_space, action_space in zip(self.observation_spaces, self.action_spaces): set_gym_space_attr(observation_space) set_gym_space_attr(action_space) # TODO experiment to find out optimal benchmarking max_timestep, set # TODO ensure clock_speed from env_spec self.clock_speed = 1 self.clock = Clock(self.clock_speed) self.done = False def check_u_brain_to_agent(self): '''Check the size match between unity brain and agent''' u_brain_num = self.u_env.number_brains agent_num = len(self.body_e) assert u_brain_num == agent_num, f'There must be a Unity brain for each agent. e:{self.e}, brain: {u_brain_num} != agent: {agent_num}.' def check_u_agent_to_body(self, env_info_a, a): '''Check the size match between unity agent and body''' u_agent_num = len(env_info_a.agents) body_num = util.count_nonan(self.body_e[a]) assert u_agent_num == body_num, f'There must be a Unity agent for each body; a:{a}, e:{self.e}, agent_num: {u_agent_num} != body_num: {body_num}.' def get_brain(self, a): '''Get the unity-equivalent of agent, i.e. brain, to access its info''' name_a = self.u_env.brain_names[a] brain_a = self.u_env.brains[name_a] return brain_a def get_env_info(self, env_info_dict, a): name_a = self.u_env.brain_names[a] env_info_a = env_info_dict[name_a] return env_info_a @lab_api def post_body_init(self): '''Run init for components that need bodies to exist first, e.g. 
memory or architecture.''' self.nanflat_body_e = util.nanflatten(self.body_e) for idx, body in enumerate(self.nanflat_body_e): body.nanflat_e_idx = idx self.body_num = len(self.nanflat_body_e) self.check_u_brain_to_agent() logger.info(util.self_desc(self)) def is_discrete(self, a): '''Check if an agent (brain) is subject to discrete actions''' return self.get_brain(a).is_discrete() def get_action_dim(self, a): '''Get the action dim for an agent (brain) in env''' return self.get_brain(a).get_action_dim() def get_action_space(self, a): return self.action_spaces[a] def get_observable_dim(self, a): '''Get the observable dim for an agent (brain) in env''' return self.get_brain(a).get_observable_dim() def get_observable_types(self, a): '''Get the observable for an agent (brain) in env''' return self.get_brain(a).get_observable_types() def get_observation_space(self, a): return self.observation_spaces[a] @lab_api def reset(self): self.done = False env_info_dict = self.u_env.reset(train_mode=(util.get_lab_mode() != 'dev'), config=self.env_spec.get('unity')) _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e) for (a, b), body in util.ndenumerate_nonan(self.body_e): env_info_a = self.get_env_info(env_info_dict, a) self.check_u_agent_to_body(env_info_a, a) state = env_info_a.states[b] state_e[(a, b)] = state done_e[(a, b)] = self.done return _reward_e, state_e, done_e @lab_api def step(self, action_e): # TODO implement clock_speed: step only if self.clock.to_step() if self.done: return self.reset() action_e = util.nanflatten(action_e) env_info_dict = self.u_env.step(action_e) reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e) for (a, b), body in util.ndenumerate_nonan(self.body_e): env_info_a = self.get_env_info(env_info_dict, a) reward_e[(a, b)] = env_info_a.rewards[b] state_e[(a, b)] = env_info_a.states[b] done_e[(a, b)] = env_info_a.local_done[b] self.done = (util.nonan_all(done_e) or self.clock.get('t') > self.max_timestep) return reward_e, state_e, done_e @lab_api def close(self): self.u_env.close()
def make_banana_env():
    env = UnityEnvironment(file_name=BANANA_APP)
    yield env
    env.close()
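# make_banana_env is a generator that yields the environment and closes it
# afterwards, which matches the shape of a pytest fixture. A hedged sketch of
# how it would typically be registered and consumed (the @pytest.fixture
# decorator and the test body are assumptions, not shown in the source):
#
# import pytest
#
# @pytest.fixture
# def banana_env():
#     env = UnityEnvironment(file_name=BANANA_APP)
#     yield env
#     env.close()
#
# def test_reset_returns_brain_info(banana_env):
#     brain_name = banana_env.brain_names[0]
#     env_info = banana_env.reset(train_mode=True)[brain_name]
#     assert len(env_info.agents) >= 1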
def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    # Get environment instance
    env = UnityEnvironment(file_name=BANANA_FILE)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # Reset environment
    env_info = env.reset(train_mode=True)[brain_name]

    # Get initial state, state size and action size
    action_size = brain.vector_action_space_size
    state = env_info.vector_observations[0]
    state_size = len(state)

    # Setup agent
    agent = Agent(state_size=state_size, action_size=action_size, seed=0)

    # Train!
    max_avg_score = -100000             # max avg score over 100 episodes
    scores = []                         # list containing scores from each episode
    scores_window = deque(maxlen=100)   # last 100 scores
    eps = eps_start                     # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        state = env.reset(train_mode=True)[brain_name].vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break
        scores_window.append(score)          # save most recent score
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 13.0 and np.mean(scores_window) > max_avg_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            # break
            max_avg_score = np.mean(scores_window)

    # Close environment
    env.close()
    return scores
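# A hedged usage sketch (not part of the original source): run the training
# loop above and plot the per-episode scores. BANANA_FILE and the Agent class
# are assumed to be defined elsewhere in the project.
#
# import matplotlib.pyplot as plt
#
# scores = dqn(n_episodes=1800)
# plt.plot(range(len(scores)), scores)
# plt.xlabel('Episode')
# plt.ylabel('Score')
# plt.title('DQN training on the Banana environment')
# plt.savefig('scores.png')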
class Environment():
    """ This is a wrapper class for a Unity environment

    The Unity environment is wrapped such that the API is similar to a Gym
    environment. Using this class, DQN algorithms written for Gym environments
    can be re-used with minimal changes.
    """

    def __init__(self, filename_path, worker_id=0, train_mode=True,
                 no_graphics=False, seed=0):
        # Create new Unity environment
        self._env = UnityEnvironment(file_name=filename_path,
                                     worker_id=worker_id,
                                     no_graphics=no_graphics,
                                     seed=seed)

        # get the default brain
        self._brain_name = self._env.brain_names[0]
        self._brain = self._env.brains[self._brain_name]

        # set the initial state
        self.train_mode = train_mode
        self._env_info = self._env.reset(train_mode=train_mode)[self._brain_name]
        self._state = self._env_info.vector_observations[0]

        # define state_size and action_size
        self.state_size = len(self._state)
        self.action_size = self._brain.vector_action_space_size

    def reset(self):
        # reset the environment
        self._env_info = self._env.reset(train_mode=self.train_mode)[self._brain_name]
        self._state = self._env_info.vector_observations[0]
        # return the state vector
        return self._state

    def step(self, action):
        # send the action to the environment
        self._env_info = self._env.step(action)[self._brain_name]
        # get the next state
        next_state = self._env_info.vector_observations[0]
        # get the reward
        reward = self._env_info.rewards[0]
        # check if terminal state is reached
        done = self._env_info.local_done[0]
        # create dummy value to keep API compatible
        dummy = 0
        # return the next_state vector, the reward,
        # and whether the terminal state was reached
        return next_state, reward, done, dummy

    def close(self):
        self._env.close()
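# A minimal usage sketch for the Gym-like wrapper above (illustrative only;
# 'Banana.x86_64' is a hypothetical path and the random action stands in for
# a trained agent's act()):
#
# env = Environment('Banana.x86_64', no_graphics=True)
# state = env.reset()
# score, done = 0, False
# while not done:
#     action = np.random.randint(env.action_size)   # replace with agent.act(state)
#     state, reward, done, _ = env.step(action)
#     score += reward
# env.close()
# print('Episode score:', score)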
_action_size: int = 4
_state_size: int = 33

_agent = Agent(_state_size,
               _action_size,
               gamma=0.99,
               lr_actor=0.0002,
               lr_critic=0.0003,
               tau=0.002,
               weight_decay=0.0001,
               buffer_size=1000000,
               batch_size=128)

# with this boolean you can decide if you just want to watch an agent or train the agent yourself
watch_only = True

if watch_only:
    watch_agent_from_pth_file(_env, _brain_name, _agent,
                              './checkpoint-actor.pth',
                              './checkpoint-critic.pth')
else:
    scores = train_agent(_env, _brain_name, _agent, n_episodes=500, max_steps=1500)
    watch_agent(_env, _brain_name, _agent)
    plot_scores(scores=scores, sma_window=10)

_env.close()
class UnityEnv(BaseEnv): ''' Wrapper for Unity ML-Agents env to work with the Lab. e.g. env_spec "env": [{ "name": "gridworld", "max_t": 20, "max_frame": 3, "unity": { "gridSize": 6, "numObstacles": 2, "numGoals": 1 } }], ''' def __init__(self, spec): super().__init__(spec) util.set_attr(self, self.env_spec, ['unity']) worker_id = int(f'{os.getpid()}{self.e+int(ps.unique_id())}'[-4:]) seed = ps.get(spec, 'meta.random_seed') # TODO update Unity ml-agents to use seed=seed below self.u_env = UnityEnvironment(file_name=get_env_path(self.name), worker_id=worker_id) self.patch_gym_spaces(self.u_env) self._set_attr_from_u_env(self.u_env) assert self.max_t is not None self.tracked_reward = 0 self.total_reward = 0 logger.info(util.self_desc(self)) def patch_gym_spaces(self, u_env): ''' For standardization, use gym spaces to represent observation and action spaces for Unity. This method iterates through the multiple brains (multiagent) then constructs and returns lists of observation_spaces and action_spaces ''' observation_spaces = [] action_spaces = [] for a in range(len(u_env.brain_names)): brain = self._get_brain(u_env, a) observation_shape = (brain.get_observable_dim()['state'],) if brain.is_discrete(): dtype = np.int32 action_space = spaces.Discrete(brain.get_action_dim()) else: dtype = np.float32 action_space = spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=dtype) observation_space = spaces.Box(low=0, high=1, shape=observation_shape, dtype=dtype) set_gym_space_attr(observation_space) set_gym_space_attr(action_space) observation_spaces.append(observation_space) action_spaces.append(action_space) # set for singleton u_env.observation_space = observation_spaces[0] u_env.action_space = action_spaces[0] return observation_spaces, action_spaces def _get_brain(self, u_env, a): '''Get the unity-equivalent of agent, i.e. brain, to access its info''' name_a = u_env.brain_names[a] brain_a = u_env.brains[name_a] return brain_a def _check_u_brain_to_agent(self): '''Check the size match between unity brain and agent''' u_brain_num = self.u_env.number_brains agent_num = 1 # TODO rework unity outdated assert u_brain_num == agent_num, f'There must be a Unity brain for each agent. e:{self.e}, brain: {u_brain_num} != agent: {agent_num}.' def _check_u_agent_to_body(self, env_info_a, a): '''Check the size match between unity agent and body''' u_agent_num = len(env_info_a.agents) body_num = 1 # rework unity assert u_agent_num == body_num, f'There must be a Unity agent for each body; a:{a}, e:{self.e}, agent_num: {u_agent_num} != body_num: {body_num}.' def _get_env_info(self, env_info_dict, a): '''Unity API returns a env_info_dict. 
Use this method to pull brain(env)-specific usable for lab API''' name_a = self.u_env.brain_names[a] env_info_a = env_info_dict[name_a] return env_info_a def seed(self, seed): self.u_env.seed(seed) @lab_api def reset(self): self.done = False env_info_dict = self.u_env.reset(train_mode=(util.get_lab_mode() != 'dev'), config=self.env_spec.get('unity')) a, b = 0, 0 # default singleton agent and body env_info_a = self._get_env_info(env_info_dict, a) state = env_info_a.states[b] return state @lab_api def step(self, action): env_info_dict = self.u_env.step(action) a, b = 0, 0 # default singleton agent and body env_info_a = self._get_env_info(env_info_dict, a) state = env_info_a.states[b] reward = env_info_a.rewards[b] reward = try_scale_reward(self, reward) done = env_info_a.local_done[b] if not self.is_venv and self.clock.t > self.max_t: done = True self.done = done info = {'env_info': env_info_a} # track total_reward self.tracked_reward += reward if done: self.total_reward = self.tracked_reward self.tracked_reward = 0 # reset info.update({'total_reward': self.total_reward}) return state, reward, done, info @lab_api def close(self): self.u_env.close()
def main(seed=seed): # --------------------------------------------------------------------------------------------------- # Logger # --------------------------------------------------------------------------------------------------- save_path = f"./results/Reacher_DDPG_{pd.Timestamp.utcnow().value}" os.makedirs(save_path, exist_ok=True) logger = logging.getLogger() formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s : %(message)s') handler = logging.FileHandler( f"{save_path}/logs_navigation_{pd.Timestamp.utcnow().value}.log") handler.setLevel(logging.DEBUG) handler.setFormatter(formatter) logger.addHandler(handler) # --------------------------------------------------------------------------------------------------- # Inputs # --------------------------------------------------------------------------------------------------- n_episodes = 300 config = dict( # Environment parameters env_name="Reacher", n_episodes=n_episodes, length_episode=1500, save_every=100, save_path=save_path, mode="train", # "train" or "test" evaluate_every= 5000, # Number of training episodes before 1 evaluation episode eps_decay=1, # Epsilon decay rate # Agent Parameters agent="DDPG", hidden_layers_actor=(200, 150), # (50, 50, 15), # (200, 150), # hidden_layers_critic_body=(400, ), # (50, 50,), # hidden_layers_critic_head=(300, ), # (50,), # (300,) func_critic_body="F.leaky_relu", # func_critic_head="F.leaky_relu", # func_actor_body="F.leaky_relu", # lr_scheduler= None, #{'scheduler_type': "multistep", # "step", "exp" or "decay", "multistep" # 'gamma': 0.5, # 0.99999, # 'step_size': 1, # 'milestones': [15*1000 * i for i in range(1, 6)], # 'max_epochs': n_episodes}, TAU=1e-3, # for soft update of target parameters BUFFER_SIZE=int(1e6), # replay buffer size BATCH_SIZE=128, # minibatch size GAMMA=0.99, # discount factor LR_ACTOR=1e-3, # learning rate of the actor LR_CRITIC=1e-3, # learning rate of the critic WEIGHT_DECAY=0, # L2 weight decay UPDATE_EVERY=1, # Number of actions before making a learning step action_noise="OU", # action_noise_scale=1, weights_noise=None, # state_normalizer="BatchNorm", # "RunningMeanStd" or "BatchNorm" warmup=0, # Number of random actions to start with as a warm-up start_time=str(pd.Timestamp.utcnow()), random_seed=seed, threshold=30) logger.warning("+=" * 90) logger.warning(f" RUNNING SIMULATION WITH PARAMETERS config={config}") logger.warning("+=" * 90) # ------------------------------------------------------------ # 1. Initialization # ------------------------------------------------------------ # 1. Start the Environment # env = UnityEnvironment(file_name=f'./Reacher_Linux_2/Reacher.x86_64') # Linux env = UnityEnvironment(file_name=f'./{config["env_name"]}') # mac OS # get the default brain brain_name = env.brain_names[0] brain = env.brains[brain_name] # reset the environment env_info = env.reset(train_mode=True)[brain_name] # number of agents num_agents = len(env_info.agents) print('Number of agents:', num_agents) config["n_agents"] = num_agents # size of each action action_size = brain.vector_action_space_size print('Size of each action:', action_size) # examine the state space states = env_info.vector_observations state_size = states.shape[1] print('There are {} agents. Each observes a state with length: {}'.format( states.shape[0], state_size)) print('The state for the first agent looks like:', states[0]) config.update(dict(action_size=action_size, state_size=state_size)) # ------------------------------------------------------------ # 2. 
Training # ------------------------------------------------------------ # Unity Monitor monitor = UnityMonitor(env=env, config=config) if config["mode"] == "train": # Actor model seed = 0 actor = SimpleNeuralNetHead(action_size, SimpleNeuralNetBody( state_size, config["hidden_layers_actor"], seed=seed), func=F.tanh, seed=seed) actor_target = SimpleNeuralNetHead(action_size, SimpleNeuralNetBody( state_size, config["hidden_layers_actor"], seed=seed), func=F.tanh, seed=seed) # Critic model critic = DeepNeuralNetHeadCritic( action_size, SimpleNeuralNetBody(state_size, config["hidden_layers_critic_body"], func=eval(config["func_critic_body"]), seed=seed), hidden_layers_sizes=config["hidden_layers_critic_head"], func=eval(config["func_critic_head"]), end_func=None, seed=seed) critic_target = DeepNeuralNetHeadCritic( action_size, SimpleNeuralNetBody(state_size, config["hidden_layers_critic_body"], func=eval(config["func_critic_body"]), seed=seed), hidden_layers_sizes=config["hidden_layers_critic_head"], func=eval(config["func_critic_head"]), end_func=None, seed=seed) # DDPG Agent agent = DDPGAgent( state_size=state_size, action_size=action_size, model_actor=actor, model_critic=critic, # actor_target=actor_target, critic_target=critic_target, action_space_low=-1, action_space_high=1, config=config, ) # Training start = pd.Timestamp.utcnow() scores = monitor.run(agent) logger.info("Average Score last 100 episodes: {}".format( np.mean(scores[-100:]))) elapsed_time = pd.Timedelta(pd.Timestamp.utcnow() - start).total_seconds() logger.info(f"Elapsed Time: {elapsed_time} seconds") # ------------------------------------------------------------ # 3. Testing # ------------------------------------------------------------ else: agent = DDPGAgent.load(filepath=config['save_path'], mode="test") scores = monitor.run(agent) logger.info( f"Test Score over {len(scores)} episodes: {np.mean(scores)}") config["test_scores"] = scores config["best_test_score"] = max(scores) config["avg_test_score"] = np.mean(scores) # When finished, you can close the environment. logger.info("Closing...") env.close()
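The TAU entry in the config above drives the standard DDPG soft update of the target networks; a minimal, self-contained sketch of that rule follows (the function and parameter names are illustrative, not taken from the repository's agent code).

import torch

def soft_update(target_net: torch.nn.Module, local_net: torch.nn.Module, tau: float) -> None:
    """Blend target parameters toward the local ones: theta_target <- tau*theta_local + (1-tau)*theta_target."""
    with torch.no_grad():
        for target_param, local_param in zip(target_net.parameters(), local_net.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)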
class CollectBananaENV: def __init__(self, env_type='vector', mode='train'): """ This is a wrapper on top of the brain environment that provides useful function to render the environment call very similar to like calling the open AI gym environement. Wrapper Code referred from : https://github.com/yingweiy/drlnd_project1_navigation :param env_type: """ self.env_type = env_type if env_type == 'vector': self.base_env = UnityEnvironment('Banana.app') elif env_type == 'visual': self.base_env = UnityEnvironment('VisualBanana.app') else: raise ValueError('Env Name not understood ....') # get the default brain self.brain_name = self.base_env.brain_names[0] self.brain = self.base_env.brains[self.brain_name] self.action_size = self.brain.vector_action_space_size if mode == 'train': self.train = True else: self.train = False self.frame1 = None self.frame2 = None self.frame3 = None self.reset() if env_type == 'vector': self.state_size = len(self.state) elif env_type == 'visual': self.state_size = self.state.shape else: raise ValueError('Environment type not understood ....') print(self.state_size) def get_state(self): if self.env_type == 'visual': # The DQN paper says to stack 4 frames while running the image through the neural network # state size is 1,84,84,3 # Rearrange from NHWC to NCHW (Pytorch uses 3d covolution in NCHW format, cross corellation across channels) frame = np.transpose(self.env_info.visual_observations[0], (0, 3, 1, 2))[:, :, :, :] frame_size = frame.shape # 1,3,84,84 # print(frame_size) self.state = np.zeros( (1, frame_size[1], 4, frame_size[2], frame_size[3])) self.state[0, :, 0, :, :] = frame if self.frame1 is not None: self.state[0, :, 1, :, :] = self.frame1 if self.frame2 is not None: self.state[0, :, 2, :, :] = self.frame2 if self.frame3 is not None: self.state[0, :, 3, :, :] = self.frame3 # Keep the last 3 frames in the memory to be accessed or stacked with the new input frame to supply as # input to the convolution network self.frame3 = self.frame2 self.frame2 = self.frame1 self.frame1 = frame # self.state = np.squeeze(self.state) # We squeeze it becasue the code implemented in buffer will # unsqueeze the array elif self.env_type == 'vector': self.state = self.env_info.vector_observations[0] else: raise ValueError('Environment name %s not understood.' % str(self.env_type)) def reset(self): self.env_info = self.base_env.reset( train_mode=self.train)[self.brain_name] self.get_state() return self.state def step(self, action): """ This function returns the value in the format of Open AI gym :param action: :return: """ self.env_info = self.base_env.step(action)[ self.brain_name] # send the action to the environment self.get_state() reward = self.env_info.rewards[0] done = self.env_info.local_done[0] return self.state, reward, done, None def close(self): self.base_env.close()
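The visual branch above keeps the last four frames and rearranges them from NHWC to NCHW before feeding a 3D convolution. A small standalone sketch of that bookkeeping, using a deque instead of the three frame attributes (an assumed simplification), looks like this.

from collections import deque
import numpy as np

class FrameStacker:
    """Keep the most recent `k` frames in NCHW order, zero-padded until the buffer fills."""
    def __init__(self, k=4):
        self.k = k
        self.frames = deque(maxlen=k)

    def push(self, obs_nhwc):
        # obs_nhwc has shape (1, H, W, C); rearrange to (1, C, H, W) for PyTorch-style convolutions
        frame = np.transpose(obs_nhwc, (0, 3, 1, 2))
        self.frames.appendleft(frame)                 # newest frame at depth 0, as in the wrapper above
        _, c, h, w = frame.shape
        state = np.zeros((1, c, self.k, h, w), dtype=frame.dtype)
        for i, f in enumerate(self.frames):
            state[0, :, i, :, :] = f
        return state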
class UnityEnv(BaseEnv): r""" Basic Unity ML Agent environment. config example: "env": { "name": "Reacher", "type": "unity", "seed": 0, "to_render": True, "frame_sleep": 0.001, "max_steps": 1000, "one_hot": None, "action_bins": None, "reward_scale": None, "num_envs": None, } """ def __init__(self, config): super(UnityEnv, self).__init__(config) self._env = UnityEnvironment(file_name=get_env_path(self.name), seed=self.seed) self.patch_gym_spaces(self._env) self._set_attr_from_u_env(self._env) # TODO: Logging print(utils.describe(self)) def reset(self): self.done = False info_dict = self._env.reset(train_mode=self.to_render) env_info = self._get_env_info(info_dict, 0) state = env_info.vector_observations[0] return state def step(self, action): info_dict = self._env.step(action) env_info = self._get_env_info(info_dict, 0) state = env_info.vector_observations[0] reward = env_info.rewards[0] done = env_info.local_done[0] return state, reward, done, env_info def render(self): pass def close(self): self._env.close() def _get_brain(self, env, brain_index): r""" Get the unity-equivalent of agent, i.e. brain, to access its info :param env: :param brain_index: :return: """ brain_name = env.brain_names[brain_index] brain = env.brains[brain_name] return brain def patch_gym_spaces(self, env): r""" For standardization, use gym spaces to represent observation and action spaces for Unity. This method iterates through the multiple brains (multiagent) then constructs and returns lists of observation_spaces and action_spaces :param env: :return: """ observation_spaces = [] action_spaces = [] for brain_index in range(len(env.brain_names)): brain = self._get_brain(env, brain_index) # TODO: Logging utils.describe(brain) observation_shape = (brain.get_observable_dim()['state'],) action_dim = (brain.get_action_dim(),) if brain.is_discrete(): dtype = np.int32 action_space = spaces.Discrete(brain.get_action_dim()) else: dtype = np.float32 action_space = spaces.Box(low=0.0, high=1.0, shape=action_dim, dtype=dtype) observation_space = spaces.Box(low=0, high=1, shape=observation_shape, dtype=dtype) utils.set_gym_space_attr(observation_space) utils.set_gym_space_attr(action_space) observation_spaces.append(observation_space) action_spaces.append(action_space) # set for singleton env.observation_space = observation_spaces[0] env.action_space = action_spaces[0] return observation_spaces, action_spaces def _get_env_info(self, env_info_dict, index): r""" Unity API returns a env_info_dict. Use this method to pull brain(env)-specific :param env_info_dict: :param index: :return: """ brain_name = self._env.brain_names[index] env_info = env_info_dict[brain_name] return env_info
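For reference, patch_gym_spaces essentially maps a brain's dimensions onto gym.spaces objects; a reduced sketch with explicit dimensions is shown below (the helper works from plain integers rather than a brain object, and the example dimensions are illustrative only).

import numpy as np
from gym import spaces

def build_spaces(state_dim: int, action_dim: int, discrete: bool):
    """Construct (observation_space, action_space) in the spirit of patch_gym_spaces for one brain."""
    observation_space = spaces.Box(low=0, high=1, shape=(state_dim,), dtype=np.float32)
    if discrete:
        action_space = spaces.Discrete(action_dim)
    else:
        action_space = spaces.Box(low=0.0, high=1.0, shape=(action_dim,), dtype=np.float32)
    return observation_space, action_space

# e.g. a Reacher-like brain: 33-dim observation, 4-dim continuous action (dimensions are illustrative)
obs_space, act_space = build_spaces(33, 4, discrete=False)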
def loaded_unity_env(file_name): env = UnityEnvironment(str(file_name), no_graphics=True) try: yield env finally: env.close()
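If the generator above is meant to be used as a context manager (or a pytest fixture), it would typically carry a contextlib.contextmanager decorator; a hedged sketch of that usage, with a placeholder build path, follows.

from contextlib import contextmanager
from unityagents import UnityEnvironment  # package name assumed from the rest of this document

@contextmanager
def loaded_unity_env(file_name):
    """Yield a UnityEnvironment and make sure it is closed even if the caller raises."""
    env = UnityEnvironment(str(file_name), no_graphics=True)
    try:
        yield env
    finally:
        env.close()

# Usage (the build path is a placeholder):
# with loaded_unity_env("3DBall.x86_64") as env:
#     print(env.brain_names)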
class BananaEnvWrapper(object): blank_state = torch.zeros(1, 37, dtype=torch.uint8) """ Wraps the udacity enviroment into an object behaving like an atari env """ def __init__(self, train_mode=True, device='cuda'): self.train_mode = train_mode self.device = device self.unity_env = UnityEnvironment( file_name="/home/philipp/udacity/deep-reinforcement-learning/p1_navigation/Banana_Linux/Banana.x86_64") # get the default brain self.brain_name = self.unity_env.brain_names[0] brain = self.unity_env.brains[self.brain_name] # reset the environment env_info = self.unity_env.reset(train_mode=self.train_mode)[self.brain_name] # number of agents in the environment print('Number of agents:', len(env_info.agents)) # number of actions self._action_space = brain.vector_action_space_size print('Number of actions:', self._action_space) # examine the state space state = env_info.vector_observations[0] print('States look like:', state) state_size = len(state) print('States have length:', state_size) self.score = 0 def eval(self): self.train_mode = False def train(self): self.train_mode = True def reset(self): print("Score: %d" % self.score) self.score = 0 env_info = self.unity_env.reset(train_mode=self.train_mode)[self.brain_name] return self._wrap_state(env_info.vector_observations[0]) # Return current state def step(self, action): env_info = self.unity_env.step(action)[self.brain_name] state = env_info.vector_observations[0] # get the current state reward = env_info.rewards[0] # get the reward done = env_info.local_done[0] # see if episode has finished self.score += reward # update the score return self._wrap_state(state), reward, done def close(self): self.unity_env.close() def action_space(self): return self._action_space def _wrap_state(self, state): state = state[np.newaxis, np.newaxis, :] # todo: Normalization return torch.tensor(state, dtype=torch.float32, device=self.device)
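A short random-policy loop showing the expected call pattern of the wrapper's reset/step API; this assumes, as for the Banana brain, that action_space() returns the integer number of discrete actions.

import random

def random_episode(env_wrapper):
    """Run one episode with uniformly random actions and return the accumulated reward."""
    state = env_wrapper.reset()
    total_reward, done = 0.0, False
    while not done:
        action = random.randrange(env_wrapper.action_space())   # discrete Banana actions
        state, reward, done = env_wrapper.step(action)
        total_reward += reward
    return total_reward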
class TrainerController(object): def __init__(self, env_path, run_id, save_freq, curriculum_file, fast_simulation, load, train, worker_id, keep_checkpoints, lesson, seed, docker_target_name, trainer_config_path, use_data_gatherer): """ :param env_path: Location to the environment executable to be loaded. :param run_id: The sub-directory name for model and summary statistics :param save_freq: Frequency at which to save model :param curriculum_file: Curriculum json file for environment :param fast_simulation: Whether to run the game at training speed :param load: Whether to load the model or randomly initialize :param train: Whether to train model, or only run inference :param worker_id: Number to add to communication port (5005). Used for multi-environment :param keep_checkpoints: How many model checkpoints to keep :param lesson: Start learning from this lesson :param seed: Random seed used for training. :param docker_target_name: Name of docker volume that will contain all data. :param trainer_config_path: Fully qualified path to location of trainer configuration file """ ''' Here's a small change (this only happens if code is launched with the '--data-gatherer' flag) ''' self.use_data_gatherer = use_data_gatherer self.trainer_config_path = trainer_config_path env_path = (env_path.strip().replace('.app', '').replace( '.exe', '').replace('.x86_64', '').replace('.x86', '') ) # Strip out executable extensions if passed # Recognize and use docker volume if one is passed as an argument if docker_target_name == '': self.docker_training = False self.model_path = './models/{run_id}'.format(run_id=run_id) self.curriculum_file = curriculum_file self.summaries_dir = './summaries' else: self.docker_training = True self.model_path = '/{docker_target_name}/models/{run_id}'.format( docker_target_name=docker_target_name, run_id=run_id) env_path = '/{docker_target_name}/{env_name}'.format( docker_target_name=docker_target_name, env_name=env_path) if curriculum_file is None: self.curriculum_file = None else: self.curriculum_file = '/{docker_target_name}/{curriculum_file}'.format( docker_target_name=docker_target_name, curriculum_file=curriculum_file) self.summaries_dir = '/{docker_target_name}/summaries'.format( docker_target_name=docker_target_name) self.logger = logging.getLogger("unityagents") self.run_id = run_id self.save_freq = save_freq self.lesson = lesson self.fast_simulation = fast_simulation self.load_model = load self.train_model = train self.worker_id = worker_id self.keep_checkpoints = keep_checkpoints self.trainers = {} if seed == -1: seed = np.random.randint(0, 999999) self.seed = seed np.random.seed(self.seed) tf.set_random_seed(self.seed) self.env = UnityEnvironment(file_name=env_path, worker_id=self.worker_id, curriculum=self.curriculum_file, seed=self.seed, docker_training=self.docker_training) self.env_name = os.path.basename( os.path.normpath(env_path)) # Extract out name of environment def _get_progress(self): if self.curriculum_file is not None: progress = 0 if self.env.curriculum.measure_type == "progress": for brain_name in self.env.external_brain_names: progress += self.trainers[ brain_name].get_step / self.trainers[ brain_name].get_max_steps return progress / len(self.env.external_brain_names) elif self.env.curriculum.measure_type == "reward": for brain_name in self.env.external_brain_names: progress += self.trainers[brain_name].get_last_reward return progress else: return None else: return None def _process_graph(self): nodes = [] scopes = [] for brain_name in 
self.trainers.keys(): if self.trainers[brain_name].graph_scope is not None: scope = self.trainers[brain_name].graph_scope + '/' if scope == '/': scope = '' scopes += [scope] if self.trainers[brain_name].parameters[ "trainer"] == "imitation": nodes += [scope + x for x in ["action"]] elif not self.trainers[brain_name].parameters["use_recurrent"]: nodes += [ scope + x for x in ["action", "value_estimate", "action_probs"] ] else: node_list = [ "action", "value_estimate", "action_probs", "recurrent_out", "memory_size" ] nodes += [scope + x for x in node_list] if len(scopes) > 1: self.logger.info("List of available scopes :") for scope in scopes: self.logger.info("\t" + scope) self.logger.info("List of nodes to export :") for n in nodes: self.logger.info("\t" + n) return nodes def _save_model(self, sess, saver, steps=0): """ Saves current model to checkpoint folder. :param sess: Current Tensorflow session. :param steps: Current number of steps in training process. :param saver: Tensorflow saver for session. """ last_checkpoint = self.model_path + '/model-' + str(steps) + '.cptk' saver.save(sess, last_checkpoint) tf.train.write_graph(sess.graph_def, self.model_path, 'raw_graph_def.pb', as_text=False) self.logger.info("Saved Model") def _export_graph(self): """ Exports latest saved model to .bytes format for Unity embedding. """ target_nodes = ','.join(self._process_graph()) ckpt = tf.train.get_checkpoint_state(self.model_path) freeze_graph.freeze_graph( input_graph=self.model_path + '/raw_graph_def.pb', input_binary=True, input_checkpoint=ckpt.model_checkpoint_path, output_node_names=target_nodes, ######## FOLOWING LINE UGLY FIX: only return first 20 characters of run_id ###### output_graph=self.model_path + '/' + self.env_name + "_" + self.run_id[:20] + '.bytes', clear_devices=True, initializer_nodes="", input_saver="", restore_op_name="save/restore_all", filename_tensor_name="save/Const:0") def _initialize_trainers(self, trainer_config, sess): trainer_parameters_dict = {} self.trainers = {} for brain_name in self.env.external_brain_names: trainer_parameters = trainer_config['default'].copy() if len(self.env.external_brain_names) > 1: graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name) trainer_parameters['graph_scope'] = graph_scope trainer_parameters['summary_path'] = '{basedir}/{name}'.format( basedir=self.summaries_dir, name=str(self.run_id) + '_' + graph_scope) else: trainer_parameters['graph_scope'] = '' trainer_parameters['summary_path'] = '{basedir}/{name}'.format( basedir=self.summaries_dir, name=str(self.run_id)) if brain_name in trainer_config: _brain_key = brain_name while not isinstance(trainer_config[_brain_key], dict): _brain_key = trainer_config[_brain_key] for k in trainer_config[_brain_key]: trainer_parameters[k] = trainer_config[_brain_key][k] trainer_parameters_dict[brain_name] = trainer_parameters.copy() for brain_name in self.env.external_brain_names: if trainer_parameters_dict[brain_name]['trainer'] == "imitation": self.trainers[brain_name] = BehavioralCloningTrainer( sess, self.env, brain_name, trainer_parameters_dict[brain_name], self.train_model, self.seed) elif trainer_parameters_dict[brain_name]['trainer'] == "ppo": self.trainers[brain_name] = PPOTrainer( sess, self.env, brain_name, trainer_parameters_dict[brain_name], self.train_model, self.seed, self.use_data_gatherer) else: raise UnityEnvironmentException( "The trainer config contains an unknown trainer type for brain {}" .format(brain_name)) def _load_config(self): try: with open(self.trainer_config_path) as 
data_file: trainer_config = yaml.load(data_file) return trainer_config except IOError: raise UnityEnvironmentException( """Parameter file could not be found here {}. Will use default Hyper parameters""" .format(self.trainer_config_path)) except UnicodeDecodeError: raise UnityEnvironmentException( "There was an error decoding Trainer Config from this path : {}" .format(self.trainer_config_path)) @staticmethod def _create_model_path(model_path): try: if not os.path.exists(model_path): os.makedirs(model_path) except Exception: raise UnityEnvironmentException( "The folder {} containing the generated model could not be accessed." " Please make sure the permissions are set correctly.".format( model_path)) def start_learning(self): self.env.curriculum.set_lesson_number(self.lesson) trainer_config = self._load_config() self._create_model_path(self.model_path) tf.reset_default_graph() with tf.Session() as sess: self._initialize_trainers(trainer_config, sess) for k, t in self.trainers.items(): self.logger.info(t) init = tf.global_variables_initializer() saver = tf.train.Saver(max_to_keep=self.keep_checkpoints) # Instantiate model parameters if self.load_model: self.logger.info('Loading Model...') ckpt = tf.train.get_checkpoint_state(self.model_path) if ckpt is None: self.logger.info( 'The model {0} could not be found. Make sure you specified the right ' '--run-id'.format(self.model_path)) saver.restore(sess, ckpt.model_checkpoint_path) else: sess.run(init) global_step = 0 # This is only for saving the model self.env.curriculum.increment_lesson(self._get_progress()) curr_info = self.env.reset(train_mode=self.fast_simulation) if self.train_model: for brain_name, trainer in self.trainers.items(): trainer.write_tensorboard_text('Hyperparameters', trainer.parameters) try: while any([ t.get_step <= t.get_max_steps for k, t in self.trainers.items() ]) or not self.train_model: if debug_print: print("|", end='', flush=True) if self.env.global_done: self.env.curriculum.increment_lesson( self._get_progress()) curr_info = self.env.reset( train_mode=self.fast_simulation) for brain_name, trainer in self.trainers.items(): trainer.end_episode() if data_gatherer['reset_after_each_frame']: curr_info = self.env.reset( train_mode=self.fast_simulation) # Decide and take an action take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {} for brain_name, trainer in self.trainers.items(): (take_action_vector[brain_name], take_action_memories[brain_name], take_action_text[brain_name], take_action_outputs[brain_name] ) = trainer.take_action(curr_info) new_info = self.env.step(vector_action=take_action_vector, memory=take_action_memories, text_action=take_action_text) ''' ----- ''' ''' Enabling data gathering disables the normal functionality.... ''' if self.use_data_gatherer: if data_gatherer['firstRun']: print("---") print("NORMAL FUNCTIONALITY DISABLED!") print( "Now we just sample stats from the initial distribution and save them:" ) print("Save dir: {}".format(data_gatherer['dir'])) print("---") print( "If you did not expect to see this, NOW is the time to [ctrl-C]! 
(otherwise: [enter] to continue...)" ) ''' Create the folder-structure if it is needed: ''' paths = [ settings['dir_base'], settings['dir_base'] + settings['project'], data_gatherer['dir'] ] for p in paths: if not os.path.isdir(p): os.makedirs(p) print("Created path: {}".format(p)) else: print("Reusing existing: {}".format(p)) ape = input() data_gatherer['firstRun'] = False #if data_gatherer['reset_after_each_frame']: # curr_info = self.env.reset(train_mode=self.fast_simulation) # take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {} # for brain_name, trainer in self.trainers.items(): # (take_action_vector[brain_name], # take_action_memories[brain_name], # take_action_text[brain_name], # take_action_outputs[brain_name]) = trainer.take_action(curr_info) # new_info = self.env.step(vector_action=take_action_vector, memory=take_action_memories, # text_action=take_action_text) is_done = False for x in new_info: for l in range(len(new_info[x].agents)): is_done = is_done or new_info[x].local_done[l] if data_gatherer['idx'] == data_gatherer[ 'n'] or is_done: #WRITE_TO_FILE.... print("Saving chunk {}... ({} samples)".format( data_gatherer['n_chunks'], data_gatherer['idx'])) with open( data_gatherer['dir'] + data_gatherer['file_base'] + "chunk{}.pkl".format( str(data_gatherer['n_chunks']).zfill( 5)), 'wb') as outfile: pickle.dump( data_gatherer['data'] [:data_gatherer['idx'], :, :, :].reshape( (-1, ) + data_gatherer['obs_size']), outfile, pickle.HIGHEST_PROTOCOL) #Prep next: data_gatherer['n_chunks'] += 1 data_gatherer['data'] = np.empty( data_gatherer['size'], dtype=np.uint8) data_gatherer['idx'] = 0 if data_gatherer['n_chunks'] == 1500: print("Total samples gathered: {}".format( (data_gatherer['n_chunks'] - 1000) * 1000)) exit() data_gatherer['data'][ data_gatherer['idx'], :, :, :] = ( 255 * new_info["PepperBrain"].visual_observations[0] ).astype(np.uint8) data_gatherer['idx'] += 1 if data_gatherer['reset_after_each_frame']: continue ''' ----- ''' if settings['store_as_int']: for key in new_info: for x in range( len(new_info[key].visual_observations)): new_info[key].visual_observations[x] = ( 255 * new_info[key].visual_observations[x] ).astype(np.uint8) for brain_name, trainer in self.trainers.items(): if debug_print: print(".", end='', flush=True) trainer.add_experiences( curr_info, new_info, take_action_outputs[brain_name]) trainer.process_experiences(curr_info, new_info) if trainer.is_ready_update( ) and self.train_model and trainer.get_step <= trainer.get_max_steps: if debug_print: print("!", end='', flush=True) # Perform gradient descent with experience buffer print("Updating model... ", end='', flush=True) t = time.time() trainer.update_model() print("[x] Done in {} seconds.".format( time.time())) # Write training statistics to Tensorboard. 
if debug_print: print(",", end='', flush=True) trainer.write_summary( self.env.curriculum.lesson_number) if self.train_model and trainer.get_step <= trainer.get_max_steps: if debug_print: print("?", end='', flush=True) trainer.increment_step() trainer.update_last_reward() if self.train_model and trainer.get_step <= trainer.get_max_steps: global_step += 1 if global_step % self.save_freq == 0 and global_step != 0 and self.train_model: if debug_print: print("x", end='', flush=True) # Save Tensorflow model self._save_model(sess, steps=global_step, saver=saver) curr_info = new_info # Final save Tensorflow model if global_step != 0 and self.train_model: self._save_model(sess, steps=global_step, saver=saver) except KeyboardInterrupt: if self.train_model: self.logger.info( "Learning was interrupted. Please wait while the graph is generated." ) self._save_model(sess, steps=global_step, saver=saver) pass self.env.close() if self.train_model: self._export_graph()
def main(): seeding() number_of_episodes = 20000 episode_length = 1000 batchsize = 256 save_interval = 1000 rewards_deque = deque(maxlen=100) rewards_all = [] noise = 1.0 noise_reduction = 1.0 log_path = os.getcwd() + "/log" model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) """ Info about the UnityEnvironment brain_name: 'TennisBrain' brain: ['brain_name', 'camera_resolutions', 'num_stacked_vector_observations', 'number_visual_observations', 'vector_action_descriptions', 'vector_action_space_size', 'vector_action_space_type', 'vector_observation_space_size', 'vector_observation_space_type']] """ env = UnityEnvironment(file_name="Tennis.app") brain_name = env.brain_names[0] brain = env.brains[brain_name] buffer = ReplayBuffer(int(1e5)) # initialize policy and critic maddpg = MADDPG() logger = SummaryWriter(log_dir=log_path) # ------------------------------ training ------------------------------ # # show progressbar import progressbar as pb widget = [ 'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ', pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ] timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start() for episode in range(1, number_of_episodes + 1): timer.update(episode) rewards_this_episode = np.zeros((2, )) """ Info about the UnityEnvironment env_info: ['agents', 'local_done', 'max_reached', 'memories', 'previous_text_actions', 'previous_vector_actions', 'rewards', 'text_observations', 'vector_observations', 'visual_observations'] actions: List(num_agents=2, action_size=2) states: List((24,), (24,)) rewards: List(2,) dones: List(2,) """ env_info = env.reset(train_mode=True)[brain_name] states = env_info.vector_observations for episode_t in range(episode_length): # reset the OUNoise for each agent. 
for i in range(2): maddpg.maddpg_agent[i].noise.reset() actions = maddpg.act(states, noise=noise) env_info = env.step(actions)[brain_name] noise *= noise_reduction next_states = env_info.vector_observations rewards = env_info.rewards dones = env_info.local_done # add data to buffer transition = (states, actions, rewards, next_states, dones) buffer.push(transition) rewards_this_episode += rewards states = next_states if any(dones): break # update the local and target network if len(buffer) > batchsize: # update the local network for _ in range(5): for a_i in range(2): samples = buffer.sample(batchsize) maddpg.update(samples, a_i, logger) # soft update the target network maddpg.update_targets() rewards_all.append(rewards_this_episode) rewards_deque.append(np.max(rewards_this_episode)) average_score = np.mean(rewards_deque) # --------------------- Logging for TensorBoard --------------------- # logger.add_scalars('rewards', { 'agent0': rewards_this_episode[0], 'agent1': rewards_this_episode[1] }, episode) logger.add_scalars('global', { 'score': np.max(rewards_this_episode), 'average_score': average_score }, episode) # -------------------------- Save the model -------------------------- # save_dict_list = [] if episode % save_interval == 0 or average_score >= 0.5: for i in range(2): save_dict = \ {'actor_params' : maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params' : maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params' : maddpg.maddpg_agent[i].critic_optimizer.state_dict()} save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) if average_score >= 3.0: print('\nEnvironment solved in {} episodes!'.format(episode - 100)) print('\nAverage Score: {:.2f}'.format(average_score)) break env.close() logger.close() timer.finish()
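The MADDPG loop above only relies on push(transition), sample(batchsize) and len(); a minimal deque-backed buffer with that interface is sketched below purely to illustrate the contract (the project's own ReplayBuffer may differ, e.g. by pre-transposing the sampled batch).

import random
from collections import deque

class ReplayBuffer:
    """Uniform experience replay exposing push / sample / __len__, as used in the training loop above."""
    def __init__(self, capacity: int):
        self.memory = deque(maxlen=capacity)

    def push(self, transition):
        # transition = (states, actions, rewards, next_states, dones) for all agents at one step
        self.memory.append(transition)

    def sample(self, batch_size: int):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)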
class UnityEnvHelper: # constructor - give file_name of agent environment def __init__(self, file_name, no_graphics=True, seed=8888): self.seed = seed self.uenv = UnityEnvironment(file_name=file_name, seed=self.seed, no_graphics=no_graphics) # pick the first brain as the default brain self.brain_name = self.uenv.brain_names[0] self.brain = self.uenv.brains[self.brain_name] # get the action space size self.action_size = self.brain.vector_action_space_size # reset the environment, in training mode self.reset(True) # get the state space size self.state_size = len(self.ue_info.vector_observations[0]) def __del__(self): # make sure we close the environment try: self.uenv.close() del self.uenv except: pass def reset(self, train_mode=True): # tell the unity agent to restart an episode # training mode simply seems to run the simulation at full speed self.ue_info = self.uenv.reset(train_mode=train_mode)[self.brain_name] # we pass in current state for convenience def step(self, state, action): # perform action on environment and get observation self.ue_info = self.uenv.step(action)[self.brain_name] # return state, action, next state, reward and done flag as a dict (slightly different from the usual gym tuple) return { 'state': state, 'action': action, 'reward': self.reward(), 'next_state': self.state(), 'done': self.done() } def state(self): # just last observation state return self.ue_info.vector_observations[0] def reward(self): # return reward from last observation return self.ue_info.rewards[0] def done(self): # return done flag return self.ue_info.local_done[0]
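A short driver showing how the helper's dict-returning step() is meant to be consumed; it assumes a continuous-action brain (e.g. Reacher) and the random policy is only for illustration.

import numpy as np

def run_random_episode(helper):
    """Run one episode with random continuous actions through UnityEnvHelper."""
    helper.reset(train_mode=True)
    state, total_reward, done = helper.state(), 0.0, False
    while not done:
        action = np.clip(np.random.randn(helper.action_size), -1, 1)
        experience = helper.step(state, action)      # dict: state, action, reward, next_state, done
        total_reward += experience['reward']
        state, done = experience['next_state'], experience['done']
    return total_reward

# helper = UnityEnvHelper('Reacher.x86_64', no_graphics=True)  # build path is a placeholder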
state = next_state score += reward scores.append(score) print('\rStep {}\tScore: {}'.format(i, score), end="") time.sleep(0.25) if done: break # plot the scores of a trained agent fig = plt.figure() ax = fig.add_subplot(111) plt.plot(np.arange(len(scores)), scores) plt.ylabel('Score') plt.xlabel('Episode #') plt.show() return scores if __name__ == "__main__": # set up environment curr_path = os.getcwd() my_env = UnityEnvironment( file_name=os.path.join(curr_path, "Banana_Windows_x86_64/Banana.exe")) my_agent_gamma = Agent(gamma=0.99, tau=1e-3) my_scores = ddqnper(agent=my_agent_gamma, env=my_env, n_episodes=2000) # scores = demonstrate_agent(env=my_env, model_path=os.path.join(curr_path, "checkpoint.pth")) my_env.close()
per agent to be retrieved at the next step. - value is an optional input that can be used to send a single float per agent to be displayed if an AgentMonitor.cs component is attached to the agent. If you have more than one brain, use a dict with one action per brain: action = {'brain1': [1.0, 2.0], 'brain2': [3.0, 4.0]} ''' for epi in range(10): # env.global_done could be used to check whether all agents are done env_info = env.reset(train_mode=train_mode)[default_brain] state = env_info.states[0] done = False epi_rewards = 0 while not done: if brain.action_space_type == 'discrete': action = np.random.randint( 0, brain.action_space_size, size=(len(env_info.agents))) else: action = np.random.randn( len(env_info.agents), brain.action_space_size) env_info = env.step(action)[default_brain] state = env_info.states[0] epi_rewards += env_info.rewards[0] done = env_info.local_done[0] print('Total reward for this episode: {}'.format(epi_rewards)) env.close() print('Environment is closed')
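As the docstring notes, a multi-brain scene takes a dict keyed by brain name. A hedged sketch of sampling one random action per brain follows; it uses the newer vector_* attribute names seen elsewhere in this document (the snippet above uses the older action_space_type / action_space_size and env_info.states names), and the brain names it produces depend on the scene.

import numpy as np

def random_actions_per_brain(env):
    """Build a {brain_name: actions} dict suitable for env.step() in a multi-brain scene."""
    actions = {}
    all_info = env.reset(train_mode=True)
    for brain_name in env.external_brain_names:
        brain = env.brains[brain_name]
        n_agents = len(all_info[brain_name].agents)
        if brain.vector_action_space_type == 'discrete':
            actions[brain_name] = np.random.randint(0, brain.vector_action_space_size, size=n_agents)
        else:
            actions[brain_name] = np.random.randn(n_agents, brain.vector_action_space_size)
    return actions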
class TrainerController(object): def __init__(self, env_path, run_id, save_freq, curriculum_file, fast_simulation, load, train, worker_id, keep_checkpoints, lesson, seed, docker_target_name, trainer_config_path): """ :param env_path: Location to the environment executable to be loaded. :param run_id: The sub-directory name for model and summary statistics :param save_freq: Frequency at which to save model :param curriculum_file: Curriculum json file for environment :param fast_simulation: Whether to run the game at training speed :param load: Whether to load the model or randomly initialize :param train: Whether to train model, or only run inference :param worker_id: Number to add to communication port (5005). Used for multi-environment :param keep_checkpoints: How many model checkpoints to keep :param lesson: Start learning from this lesson :param seed: Random seed used for training. :param docker_target_name: Name of docker volume that will contain all data. :param trainer_config_path: Fully qualified path to location of trainer configuration file """ self.trainer_config_path = trainer_config_path env_path = (env_path.strip() .replace('.app', '') .replace('.exe', '') .replace('.x86_64', '') .replace('.x86', '')) # Strip out executable extensions if passed # Recognize and use docker volume if one is passed as an argument if docker_target_name == '': self.model_path = './models/{run_id}'.format(run_id=run_id) self.curriculum_file = curriculum_file self.summaries_dir = './summaries' else: self.model_path = '/{docker_target_name}/models/{run_id}'.format( docker_target_name=docker_target_name, run_id=run_id) env_path = '/{docker_target_name}/{env_name}'.format(docker_target_name=docker_target_name, env_name=env_path) if curriculum_file is None: self.curriculum_file = None else: self.curriculum_file = '/{docker_target_name}/{curriculum_file}'.format( docker_target_name=docker_target_name, curriculum_file=curriculum_file) self.summaries_dir = '/{docker_target_name}/summaries'.format(docker_target_name=docker_target_name) self.logger = logging.getLogger("unityagents") self.run_id = run_id self.save_freq = save_freq self.lesson = lesson self.fast_simulation = fast_simulation self.load_model = load self.train_model = train self.worker_id = worker_id self.keep_checkpoints = keep_checkpoints self.trainers = {} if seed == -1: seed = np.random.randint(0, 999999) self.seed = seed np.random.seed(self.seed) tf.set_random_seed(self.seed) self.env = UnityEnvironment(file_name=env_path, worker_id=self.worker_id, curriculum=self.curriculum_file, seed=self.seed) self.env_name = os.path.basename(os.path.normpath(env_path)) # Extract out name of environment def _get_progress(self): if self.curriculum_file is not None: progress = 0 if self.env.curriculum.measure_type == "progress": for brain_name in self.env.external_brain_names: progress += self.trainers[brain_name].get_step / self.trainers[brain_name].get_max_steps return progress / len(self.env.external_brain_names) elif self.env.curriculum.measure_type == "reward": for brain_name in self.env.external_brain_names: progress += self.trainers[brain_name].get_last_reward return progress else: return None else: return None def _process_graph(self): nodes = [] scopes = [] for brain_name in self.trainers.keys(): if self.trainers[brain_name].graph_scope is not None: scope = self.trainers[brain_name].graph_scope + '/' if scope == '/': scope = '' scopes += [scope] if self.trainers[brain_name].parameters["trainer"] == "imitation": nodes += [scope + x for x in 
["action"]] elif not self.trainers[brain_name].parameters["use_recurrent"]: nodes += [scope + x for x in ["action", "value_estimate", "action_probs"]] else: node_list = ["action", "value_estimate", "action_probs", "recurrent_out", "memory_size"] nodes += [scope + x for x in node_list] if len(scopes) > 1: self.logger.info("List of available scopes :") for scope in scopes: self.logger.info("\t" + scope) self.logger.info("List of nodes to export :") for n in nodes: self.logger.info("\t" + n) return nodes def _save_model(self, sess, saver, steps=0): """ Saves current model to checkpoint folder. :param sess: Current Tensorflow session. :param steps: Current number of steps in training process. :param saver: Tensorflow saver for session. """ last_checkpoint = self.model_path + '/model-' + str(steps) + '.cptk' saver.save(sess, last_checkpoint) tf.train.write_graph(sess.graph_def, self.model_path, 'raw_graph_def.pb', as_text=False) self.logger.info("Saved Model") def _export_graph(self): """ Exports latest saved model to .bytes format for Unity embedding. """ target_nodes = ','.join(self._process_graph()) ckpt = tf.train.get_checkpoint_state(self.model_path) freeze_graph.freeze_graph(input_graph=self.model_path + '/raw_graph_def.pb', input_binary=True, input_checkpoint=ckpt.model_checkpoint_path, output_node_names=target_nodes, output_graph=self.model_path + '/' + self.env_name + "_" + self.run_id + '.bytes', clear_devices=True, initializer_nodes="", input_saver="", restore_op_name="save/restore_all", filename_tensor_name="save/Const:0") def _initialize_trainers(self, trainer_config, sess): self.trainer_parameters_dict = {} self.trainers = {} for brain_name in self.env.external_brain_names: trainer_parameters = trainer_config['default'].copy() if len(self.env.external_brain_names) > 1: graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name) trainer_parameters['graph_scope'] = graph_scope trainer_parameters['summary_path'] = '{basedir}/{name}'.format( basedir=self.summaries_dir, name=str(self.run_id) + '_' + graph_scope) else: trainer_parameters['graph_scope'] = '' trainer_parameters['summary_path'] = '{basedir}/{name}'.format( basedir=self.summaries_dir, name=str(self.run_id)) if brain_name in trainer_config: _brain_key = brain_name while not isinstance(trainer_config[_brain_key], dict): _brain_key = trainer_config[_brain_key] for k in trainer_config[_brain_key]: trainer_parameters[k] = trainer_config[_brain_key][k] self.trainer_parameters_dict[brain_name] = trainer_parameters.copy() for brain_name in self.env.external_brain_names: if self.trainer_parameters_dict[brain_name]['trainer'] == "imitation": self.trainers[brain_name] = BehavioralCloningTrainer(sess, self.env, brain_name, self.trainer_parameters_dict[brain_name], self.train_model, self.seed) elif self.trainer_parameters_dict[brain_name]['trainer'] == "ppo": self.trainers[brain_name] = PPOTrainer(sess, self.env, brain_name, self.trainer_parameters_dict[brain_name], self.train_model, self.seed) elif self.trainer_parameters_dict[brain_name]['trainer'] == "dqn": self.trainers[brain_name] = DQNTrainer(sess, self.env, brain_name, self.trainer_parameters_dict[brain_name], self.train_model, self.seed) elif self.trainer_parameters_dict[brain_name]['trainer'] == "madqn": self.trainers[brain_name] = MADQNTrainer(sess, self.env, brain_name, self.trainer_parameters_dict[brain_name], self.train_model, self.seed) elif self.trainer_parameters_dict[brain_name]['trainer'] == "mappo": self.trainers[brain_name] = MAPPOTrainer(sess, self.env, brain_name, 
self.trainer_parameters_dict[brain_name], self.train_model, self.seed) elif self.trainer_parameters_dict[brain_name]['trainer'] == "coma": self.trainers[brain_name] = COMATrainer(sess, self.env, brain_name, self.trainer_parameters_dict[brain_name], self.train_model, self.seed) else: raise UnityEnvironmentException("The trainer config contains an unknown trainer type for brain {}" .format(brain_name)) all_vars = tf.trainable_variables() self.brain_vars = {} total_vars = len(all_vars) idx1 = 0 idx2 = int(total_vars/len(self.env.external_brain_names)) for brain_name in self.env.external_brain_names: self.brain_vars[brain_name] = all_vars[idx1:idx2] idx1 = idx2 idx2 = idx2*total_vars if (self.trainer_parameters_dict[brain_name]['trainer'] == "dqn" or self.trainer_parameters_dict[brain_name]['trainer'] == "madqn"): self.trainers[brain_name].update_target_graph(self.brain_vars[brain_name]) if self.trainer_parameters_dict[brain_name]['trainer'] == "madqn": if not self.trainers[brain_name].parameters['frozen']: self.free_brain_vars = self.brain_vars[brain_name] for brain_name in self.env.external_brain_names: if self.trainer_parameters_dict[brain_name]['trainer'] == "madqn": if self.trainers[brain_name].parameters['frozen']: self.trainers[brain_name].update_frozen_brain_graph(self.brain_vars[brain_name], self.free_brain_vars) def _load_config(self): try: with open(self.trainer_config_path) as data_file: trainer_config = yaml.load(data_file) return trainer_config except IOError: raise UnityEnvironmentException("""Parameter file could not be found here {}. Will use default Hyper parameters""" .format(self.trainer_config_path)) except UnicodeDecodeError: raise UnityEnvironmentException("There was an error decoding Trainer Config from this path : {}" .format(self.trainer_config_path)) @staticmethod def _create_model_path(model_path): try: if not os.path.exists(model_path): os.makedirs(model_path) except Exception: raise UnityEnvironmentException("The folder {} containing the generated model could not be accessed." " Please make sure the permissions are set correctly." .format(model_path)) def start_learning(self): self.env.curriculum.set_lesson_number(self.lesson) trainer_config = self._load_config() self._create_model_path(self.model_path) tf.reset_default_graph() with tf.Session() as sess: self._initialize_trainers(trainer_config, sess) for k, t in self.trainers.items(): self.logger.info(t) init = tf.global_variables_initializer() saver = tf.train.Saver(max_to_keep=self.keep_checkpoints) # Instantiate model parameters if self.load_model: self.logger.info('Loading Model...') ckpt = tf.train.get_checkpoint_state(self.model_path) if ckpt is None: self.logger.info('The model {0} could not be found. 
Make sure you specified the right ' '--run-id'.format(self.model_path)) saver.restore(sess, ckpt.model_checkpoint_path) else: sess.run(init) global_step = 0 # This is only for saving the model self.env.curriculum.increment_lesson(self._get_progress()) curr_info = self.env.reset(train_mode=self.fast_simulation) if self.train_model: for brain_name, trainer in self.trainers.items(): trainer.write_tensorboard_text('Hyperparameters', trainer.parameters) try: while any([t.get_step <= t.get_max_steps for k, t in self.trainers.items()]) or not self.train_model: if self.env.global_done: #self.env.curriculum.increment_lesson(self._get_progress()) curr_info = self.env.reset(train_mode=self.fast_simulation) for brain_name, trainer in self.trainers.items(): trainer.end_episode() # Decide and take an action take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {} for brain_name, trainer in self.trainers.items(): (take_action_vector[brain_name], take_action_memories[brain_name], take_action_text[brain_name], take_action_outputs[brain_name]) = trainer.take_action(curr_info) new_info = self.env.step(vector_action=take_action_vector, memory=take_action_memories, text_action=take_action_text) for brain_name, trainer in self.trainers.items(): if self.trainer_parameters_dict[brain_name]['trainer'] == "mappo": trainer.add_experiences(curr_info, new_info, take_action_outputs[brain_name], take_action_vector) else: trainer.add_experiences(curr_info, new_info, take_action_outputs[brain_name]) curr_info = new_info for brain_name, trainer in self.trainers.items(): if self.trainer_parameters_dict[brain_name]['trainer'] == "mappo": take_action_vector[brain_name] = trainer.simulate_action(curr_info) for brain_name, trainer in self.trainers.items(): if self.trainer_parameters_dict[brain_name]['trainer'] == "mappo": trainer.process_experiences(curr_info, take_action_vector) else: trainer.process_experiences(curr_info) step = trainer.get_step max_steps = trainer.get_max_steps if trainer.is_ready_update() and self.train_model and step <= max_steps: # Perform gradient descent with experience buffer trainer.update_model() # Write training statistics to tensorboard. trainer.write_summary(self.env.curriculum.lesson_number) if self.train_model and step <= max_steps: trainer.increment_step() trainer.update_last_reward() if self.train_model and step <= max_steps: global_step += 1 if global_step % self.save_freq == 0 and global_step != 0 and self.train_model: # Save Tensorflow model self._save_model(sess, steps=global_step, saver=saver) # Final save Tensorflow model if global_step != 0 and self.train_model: self._save_model(sess, steps=global_step, saver=saver) except KeyboardInterrupt: if self.train_model: self.logger.info("Learning was interrupted. Please wait while the graph is generated.") self._save_model(sess, steps=global_step, saver=saver) pass self.env.close() if self.train_model: self._export_graph()
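The per-brain variable bookkeeping in _initialize_trainers partitions tf.trainable_variables() across the external brains; a small, generic sketch of splitting a flat variable list into consecutive, evenly sized per-brain slices is shown here for reference (written over plain Python lists, not tied to the original index arithmetic).

def split_vars_per_brain(all_vars, brain_names):
    """Assign consecutive, equally sized slices of `all_vars` to each brain, in order."""
    chunk = len(all_vars) // len(brain_names)
    brain_vars = {}
    for i, name in enumerate(brain_names):
        start = i * chunk
        # give the last brain any remainder so no variable is dropped
        end = (i + 1) * chunk if i < len(brain_names) - 1 else len(all_vars)
        brain_vars[name] = all_vars[start:end]
    return brain_vars

# split_vars_per_brain(list(range(10)), ['BrainA', 'BrainB'])
# -> {'BrainA': [0, 1, 2, 3, 4], 'BrainB': [5, 6, 7, 8, 9]}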
def main(): env = UnityEnvironment(file_name="./Tennis_Linux/Tennis.x86_64") print_env_info(env) random_play(env) env.close()
def main(args): if args.deterministic: set_seed(42) env = UnityEnvironment(file_name=args.env_path, no_graphics=args.no_graphics) # get the default brain brain_name = env.brain_names[0] brain = env.brains[brain_name] env_info = env.reset(train_mode=True)[brain_name] # number of agents in the environment print('Number of agents:', len(env_info.agents)) # number of actions action_size = brain.vector_action_space_size print('Number of actions:', action_size) # examine the state space state = env_info.vector_observations[0] print('States look like:', state) state_size = len(state) print('States have length:', state_size) agent = Agent(state_size, action_size, train=True, device=args.device, buffer_size=args.buffer_size, batch_size=args.batch_size, lr=args.lr, gamma=args.gamma, tau=args.tau, update_freq=args.update_freq, nb_updates=args.nb_updates, noise_mean=args.noise_mean, noise_theta=args.noise_theta, noise_sigma=args.noise_sigma, eps=args.eps, eps_decay=args.eps_decay, grad_clip=args.grad_clip) scores = train_agent(agent, env, n_episodes=args.episodes) output_folder = Path(args.output) if not output_folder.is_dir(): output_folder.mkdir(parents=True) # Save model torch.save(agent.actor_local.state_dict(), output_folder.joinpath('actor_model.pt')) torch.save(agent.critic_local.state_dict(), output_folder.joinpath('critic_model.pt')) env.close() # Plot results fig = plt.figure() plot_scores(scores, running_window_size=100, success_thresh=args.success_threshold) fig.savefig(output_folder.joinpath('training_scores.png'), transparent=True)
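main(args) reads a number of attributes off an argparse-style namespace; a partial, hedged sketch of a matching parser follows (only the attribute names are taken from the function, the defaults and help texts are assumptions, and several noise/update flags are omitted).

import argparse

def build_parser() -> argparse.ArgumentParser:
    """Declare some of the flags consumed by main(args); default values here are illustrative only."""
    parser = argparse.ArgumentParser(description="Train an agent in a Unity environment")
    parser.add_argument("--env-path", dest="env_path", required=True, help="path to the Unity build")
    parser.add_argument("--no-graphics", dest="no_graphics", action="store_true")
    parser.add_argument("--deterministic", action="store_true", help="fix the random seed")
    parser.add_argument("--device", default="cpu")
    parser.add_argument("--episodes", type=int, default=500)
    parser.add_argument("--buffer-size", dest="buffer_size", type=int, default=int(1e6))
    parser.add_argument("--batch-size", dest="batch_size", type=int, default=128)
    parser.add_argument("--lr", type=float, default=1e-3)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--tau", type=float, default=1e-3)
    parser.add_argument("--output", default="./output")
    parser.add_argument("--success-threshold", dest="success_threshold", type=float, default=30.0)
    return parser

# args = build_parser().parse_args(); main(args)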
class UnityEnv(gym.Env): def __init__(self, app_name=None, idx=0): # parameter app_path = os.path.join(os.path.dirname(__file__), 'assets', app_name) idx = idx no_graphics = False self.num_envs = 1 # create environment self._env = UnityEnvironment(file_name=app_path, worker_id=idx, no_graphics=no_graphics) self.name = app_name # Only Accept Environment with Only One Brain assert len(self._env.brains) == 1 self.brain_name = self._env.external_brain_names[0] self.brain = self._env.brains[self.brain_name] # visualization self.use_visual = (self.brain.number_visual_observations == 1) # action space dimension if self.brain.vector_action_space_type == "discrete": self._a_dim = Discrete(1) else: high = np.array([np.inf] * (self.brain.vector_action_space_size)) self._a_dim = Box(-high, high) # observation space dimension if self.use_visual and False and no_graphics: high = np.array([np.inf] * self.brain.camera_resolutions[0]["height"] * self.brain.camera_resolutions[0]["width"] * 3) self._ob_dim = Box(-high, high) else: high = np.array([np.inf] * self.brain.vector_observation_space_size) self._ob_dim = Box(-high, high) # video buffer self.frames = [] def reset(self): self.frames = [] info = self._env.reset()[self.brain_name] state = info.vector_observations[0] return np.array([state]) def step(self, action): info = self._env.step([action])[self.brain_name] state = info.vector_observations[0] reward = info.rewards[0] done = info.local_done[0] self._collect_frames(info.visual_observations[0]) return np.array([state]), np.array([reward]), np.array([done]), np.array([None]) def close(self): self._env.close() def _collect_frames(self, frame): if self.use_visual: self.frames.append(frame) @property def action_space(self): return self._a_dim @property def observation_space(self): return self._ob_dim
class TrainerController(object): def __init__(self, env_path, run_id, save_freq, curriculum_file, fast_simulation, load, train, worker_id, keep_checkpoints, lesson, seed, docker_target_name, trainer_config_path, no_graphics): """ :param env_path: Location to the environment executable to be loaded. :param run_id: The sub-directory name for model and summary statistics :param save_freq: Frequency at which to save model :param curriculum_file: Curriculum json file for environment :param fast_simulation: Whether to run the game at training speed :param load: Whether to load the model or randomly initialize :param train: Whether to train model, or only run inference :param worker_id: Number to add to communication port (5005). Used for multi-environment :param keep_checkpoints: How many model checkpoints to keep :param lesson: Start learning from this lesson :param seed: Random seed used for training. :param docker_target_name: Name of docker volume that will contain all data. :param trainer_config_path: Fully qualified path to location of trainer configuration file :param no_graphics: Whether to run the Unity simulator in no-graphics mode """ self.trainer_config_path = trainer_config_path if env_path is not None: env_path = (env_path.strip() .replace('.app', '') .replace('.exe', '') .replace('.x86_64', '') .replace('.x86', '')) # Strip out executable extensions if passed # Recognize and use docker volume if one is passed as an argument if docker_target_name == '': self.docker_training = False self.model_path = './models/{run_id}'.format(run_id=run_id) self.curriculum_file = curriculum_file self.summaries_dir = './summaries' else: self.docker_training = True self.model_path = '/{docker_target_name}/models/{run_id}'.format( docker_target_name=docker_target_name, run_id=run_id) if env_path is not None: env_path = '/{docker_target_name}/{env_name}'.format(docker_target_name=docker_target_name, env_name=env_path) if curriculum_file is None: self.curriculum_file = None else: self.curriculum_file = '/{docker_target_name}/{curriculum_file}'.format( docker_target_name=docker_target_name, curriculum_file=curriculum_file) self.summaries_dir = '/{docker_target_name}/summaries'.format(docker_target_name=docker_target_name) self.logger = logging.getLogger("unityagents") self.run_id = run_id self.save_freq = save_freq self.lesson = lesson self.fast_simulation = fast_simulation self.load_model = load self.train_model = train self.worker_id = worker_id self.keep_checkpoints = keep_checkpoints self.trainers = {} if seed == -1: seed = np.random.randint(0, 999999) self.seed = seed np.random.seed(self.seed) tf.set_random_seed(self.seed) self.env = UnityEnvironment(file_name=env_path, worker_id=self.worker_id, curriculum=self.curriculum_file, seed=self.seed, docker_training=self.docker_training, no_graphics=no_graphics) if env_path is None: self.env_name = 'editor_'+self.env.academy_name else: self.env_name = os.path.basename(os.path.normpath(env_path)) # Extract out name of environment def _get_progress(self): if self.curriculum_file is not None: progress = 0 if self.env.curriculum.measure_type == "progress": for brain_name in self.env.external_brain_names: progress += self.trainers[brain_name].get_step / self.trainers[brain_name].get_max_steps return progress / len(self.env.external_brain_names) elif self.env.curriculum.measure_type == "reward": for brain_name in self.env.external_brain_names: progress += self.trainers[brain_name].get_last_reward return progress else: return None else: return None def 
_process_graph(self): nodes = [] scopes = [] for brain_name in self.trainers.keys(): if self.trainers[brain_name].graph_scope is not None: scope = self.trainers[brain_name].graph_scope + '/' if scope == '/': scope = '' scopes += [scope] if self.trainers[brain_name].parameters["trainer"] == "imitation": nodes += [scope + x for x in ["action"]] else: nodes += [scope + x for x in ["action", "value_estimate", "action_probs"]] if self.trainers[brain_name].parameters["use_recurrent"]: nodes += [scope + x for x in ["recurrent_out", "memory_size"]] if len(scopes) > 1: self.logger.info("List of available scopes :") for scope in scopes: self.logger.info("\t" + scope) self.logger.info("List of nodes to export :") for n in nodes: self.logger.info("\t" + n) return nodes def _save_model(self, sess, saver, steps=0): """ Saves current model to checkpoint folder. :param sess: Current Tensorflow session. :param steps: Current number of steps in training process. :param saver: Tensorflow saver for session. """ last_checkpoint = self.model_path + '/model-' + str(steps) + '.cptk' saver.save(sess, last_checkpoint) tf.train.write_graph(sess.graph_def, self.model_path, 'raw_graph_def.pb', as_text=False) self.logger.info("Saved Model") def _export_graph(self): """ Exports latest saved model to .bytes format for Unity embedding. """ target_nodes = ','.join(self._process_graph()) ckpt = tf.train.get_checkpoint_state(self.model_path) freeze_graph.freeze_graph(input_graph=self.model_path + '/raw_graph_def.pb', input_binary=True, input_checkpoint=ckpt.model_checkpoint_path, output_node_names=target_nodes, output_graph=self.model_path + '/' + self.env_name + "_" + self.run_id + '.bytes', clear_devices=True, initializer_nodes="", input_saver="", restore_op_name="save/restore_all", filename_tensor_name="save/Const:0") def _initialize_trainers(self, trainer_config, sess): trainer_parameters_dict = {} self.trainers = {} for brain_name in self.env.external_brain_names: trainer_parameters = trainer_config['default'].copy() if len(self.env.external_brain_names) > 1: graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name) trainer_parameters['graph_scope'] = graph_scope trainer_parameters['summary_path'] = '{basedir}/{name}'.format( basedir=self.summaries_dir, name=str(self.run_id) + '_' + graph_scope) else: trainer_parameters['graph_scope'] = '' trainer_parameters['summary_path'] = '{basedir}/{name}'.format( basedir=self.summaries_dir, name=str(self.run_id)) if brain_name in trainer_config: _brain_key = brain_name while not isinstance(trainer_config[_brain_key], dict): _brain_key = trainer_config[_brain_key] for k in trainer_config[_brain_key]: trainer_parameters[k] = trainer_config[_brain_key][k] trainer_parameters_dict[brain_name] = trainer_parameters.copy() for brain_name in self.env.external_brain_names: if trainer_parameters_dict[brain_name]['trainer'] == "imitation": self.trainers[brain_name] = BehavioralCloningTrainer(sess, self.env, brain_name, trainer_parameters_dict[brain_name], self.train_model, self.seed) elif trainer_parameters_dict[brain_name]['trainer'] == "ppo": self.trainers[brain_name] = PPOTrainer(sess, self.env, brain_name, trainer_parameters_dict[brain_name], self.train_model, self.seed) else: raise UnityEnvironmentException("The trainer config contains an unknown trainer type for brain {}" .format(brain_name)) def _load_config(self): try: with open(self.trainer_config_path) as data_file: trainer_config = yaml.load(data_file) return trainer_config except IOError: raise UnityEnvironmentException("""Parameter 
file could not be found here {}. Will use default Hyper parameters""" .format(self.trainer_config_path)) except UnicodeDecodeError: raise UnityEnvironmentException("There was an error decoding Trainer Config from this path : {}" .format(self.trainer_config_path)) @staticmethod def _create_model_path(model_path): try: if not os.path.exists(model_path): os.makedirs(model_path) except Exception: raise UnityEnvironmentException("The folder {} containing the generated model could not be accessed." " Please make sure the permissions are set correctly." .format(model_path)) def start_learning(self): self.env.curriculum.set_lesson_number(self.lesson) trainer_config = self._load_config() self._create_model_path(self.model_path) tf.reset_default_graph() with tf.Session() as sess: self._initialize_trainers(trainer_config, sess) for k, t in self.trainers.items(): self.logger.info(t) init = tf.global_variables_initializer() saver = tf.train.Saver(max_to_keep=self.keep_checkpoints) # Instantiate model parameters if self.load_model: self.logger.info('Loading Model...') ckpt = tf.train.get_checkpoint_state(self.model_path) if ckpt is None: self.logger.info('The model {0} could not be found. Make sure you specified the right ' '--run-id'.format(self.model_path)) saver.restore(sess, ckpt.model_checkpoint_path) else: sess.run(init) global_step = 0 # This is only for saving the model self.env.curriculum.increment_lesson(self._get_progress()) curr_info = self.env.reset(train_mode=self.fast_simulation) if self.train_model: for brain_name, trainer in self.trainers.items(): trainer.write_tensorboard_text('Hyperparameters', trainer.parameters) try: while any([t.get_step <= t.get_max_steps for k, t in self.trainers.items()]) or not self.train_model: if self.env.global_done: self.env.curriculum.increment_lesson(self._get_progress()) curr_info = self.env.reset(train_mode=self.fast_simulation) for brain_name, trainer in self.trainers.items(): trainer.end_episode() # Decide and take an action take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {} for brain_name, trainer in self.trainers.items(): (take_action_vector[brain_name], take_action_memories[brain_name], take_action_text[brain_name], take_action_outputs[brain_name]) = trainer.take_action(curr_info) new_info = self.env.step(vector_action=take_action_vector, memory=take_action_memories, text_action=take_action_text) for brain_name, trainer in self.trainers.items(): trainer.add_experiences(curr_info, new_info, take_action_outputs[brain_name]) trainer.process_experiences(curr_info, new_info) if trainer.is_ready_update() and self.train_model and trainer.get_step <= trainer.get_max_steps: # Perform gradient descent with experience buffer trainer.update_model() # Write training statistics to Tensorboard. trainer.write_summary(self.env.curriculum.lesson_number) if self.train_model and trainer.get_step <= trainer.get_max_steps: trainer.increment_step_and_update_last_reward() if self.train_model: global_step += 1 if global_step % self.save_freq == 0 and global_step != 0 and self.train_model: # Save Tensorflow model self._save_model(sess, steps=global_step, saver=saver) curr_info = new_info # Final save Tensorflow model if global_step != 0 and self.train_model: self._save_model(sess, steps=global_step, saver=saver) except KeyboardInterrupt: print('--------------------------Now saving model-------------------------') if self.train_model: self.logger.info("Learning was interrupted. 
Please wait while the graph is generated.") self._save_model(sess, steps=global_step, saver=saver) pass self.env.close() if self.train_model: self._export_graph()
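_get_progress in these controllers averages each brain's fraction of completed steps for a progress-based curriculum, or sums the last rewards for a reward-based one. A standalone sketch of that computation, detached from the trainer objects (argument names are illustrative), is given below.

def curriculum_progress(measure_type, steps, max_steps, last_rewards):
    """Progress measure: mean step fraction, or summed last reward, across external brains."""
    if measure_type == "progress":
        fractions = [s / m for s, m in zip(steps, max_steps)]
        return sum(fractions) / len(fractions)
    if measure_type == "reward":
        return sum(last_rewards)
    return None

# e.g. two brains halfway and a quarter through training:
# curriculum_progress("progress", steps=[5000, 2500], max_steps=[10000, 10000], last_rewards=[])
# -> 0.375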
time.sleep(0.01) if np.any(dones): # exit loop if episode finished break scores_window.append(score) # save most recent score in the rolling window scores.append(np.mean(score)) # save the mean score for this episode # plot the scores fig = plt.figure() ax = fig.add_subplot(111) plt.plot(np.arange(len(scores)), scores) plt.ylabel('Score') plt.xlabel('Episode #') #plt.show() plt.savefig('testRes.png') print('Final Score: ==> {}'.format(np.mean(scores))) env.close() ''' # random for i in range(5): # play game for 5 episodes env_info = env.reset(train_mode=False)[brain_name] # reset the environment states = env_info.vector_observations # get the current state (for each agent) scores = np.zeros(num_agents) # initialize the score (for each agent) while True: actions = np.random.randn(num_agents, action_size) # select an action (for each agent) actions = np.clip(actions, -1, 1) # all actions between -1 and 1 env_info = env.step(actions)[brain_name] # send all actions to the environment next_states = env_info.vector_observations # get next state (for each agent) rewards = env_info.rewards # get reward (for each agent) dones = env_info.local_done # see if episode finished scores += env_info.rewards # update the score (for each agent) states = next_states # roll over states to next time step if np.any(dones): # exit loop if episode finished