Example #1
def test_step():
    with mock.patch('subprocess.Popen'):
        with mock.patch('socket.socket') as mock_socket:
            with mock.patch('glob.glob') as mock_glob:
                mock_glob.return_value = ['FakeLaunchPath']
                mock_socket.return_value.accept.return_value = (mock_socket, 0)
                mock_socket.recv.return_value.decode.return_value = dummy_start
                env = UnityEnvironment(' ')
                brain = env.brains['RealFakeBrain']
                mock_socket.recv.side_effect = dummy_reset
                brain_info = env.reset()
                mock_socket.recv.side_effect = dummy_step
                brain_info = env.step([0] * brain.vector_action_space_size * len(brain_info['RealFakeBrain'].agents))
                with pytest.raises(UnityActionException):
                    env.step([0])
                brain_info = env.step([0] * brain.vector_action_space_size * len(brain_info['RealFakeBrain'].agents))
                with pytest.raises(UnityActionException):
                    env.step([0] * brain.vector_action_space_size * len(brain_info['RealFakeBrain'].agents))
                env.close()
                assert env.global_done
                assert isinstance(brain_info, dict)
                assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
                assert isinstance(brain_info['RealFakeBrain'].visual_observations, list)
                assert isinstance(brain_info['RealFakeBrain'].vector_observations, np.ndarray)
                assert len(brain_info['RealFakeBrain'].visual_observations) == brain.number_visual_observations
                assert brain_info['RealFakeBrain'].vector_observations.shape[0] == \
                       len(brain_info['RealFakeBrain'].agents)
                assert brain_info['RealFakeBrain'].vector_observations.shape[1] == \
                       brain.vector_observation_space_size * brain.num_stacked_vector_observations
                assert not brain_info['RealFakeBrain'].local_done[0]
                assert brain_info['RealFakeBrain'].local_done[2]
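
The dummy_start, dummy_reset, and dummy_step payloads are defined elsewhere in the test module and are not shown in this snippet. A rough sketch of what dummy_start might look like, modeled on the JSON start strings used in Example #5 and Example #7 (the exact field values are assumptions):

dummy_start = '''{
  "AcademyName": "RealFakeAcademy",
  "resetParameters": {},
  "brainNames": ["RealFakeBrain"],
  "externalBrainNames": ["RealFakeBrain"],
  "logPath": "RealFakePath",
  "apiNumber": "API-3",
  "brainParameters": [{
      "vectorObservationSize": 3,
      "numStackedVectorObservations": 2,
      "vectorActionSize": 2,
      "memorySize": 0,
      "cameraResolutions": [],
      "vectorActionDescriptions": ["", ""],
      "vectorActionSpaceType": 1,
      "vectorObservationSpaceType": 1
      }]
}'''.encode()

Example #2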
def test_ppo_model_cc_visual_curio(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=2)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"], use_curiosity=True)
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [model.output, model.all_probs, model.value, model.entropy,
                        model.learning_rate, model.intrinsic_reward]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]]),
                         model.next_vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                         [3, 4, 5, 3, 4, 5]]),
                         model.output: [[0.0, 0.0], [0.0, 0.0]],
                         model.visual_in[0]: np.ones([2, 40, 30, 3]),
                         model.visual_in[1]: np.ones([2, 40, 30, 3]),
                         model.next_visual_in[0]: np.ones([2, 40, 30, 3]),
                         model.next_visual_in[1]: np.ones([2, 40, 30, 3])
                         }
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
Example #3
def test_initialization():
    with mock.patch('subprocess.Popen'):
        with mock.patch('socket.socket') as mock_socket:
            with mock.patch('glob.glob') as mock_glob:
                mock_glob.return_value = ['FakeLaunchPath']
                mock_socket.return_value.accept.return_value = (mock_socket, 0)
                mock_socket.recv.return_value.decode.return_value = dummy_start
                env = UnityEnvironment(' ')
                with pytest.raises(UnityActionException):
                    env.step([0])
                assert env.brain_names[0] == 'RealFakeBrain'
                env.close()
Example #4
def test_close():
    with mock.patch('subprocess.Popen'):
        with mock.patch('socket.socket') as mock_socket:
            with mock.patch('glob.glob') as mock_glob:
                mock_glob.return_value = ['FakeLaunchPath']
                mock_socket.return_value.accept.return_value = (mock_socket, 0)
                mock_socket.recv.return_value.decode.return_value = dummy_start
                env = UnityEnvironment(' ')
                assert env._loaded
                env.close()
                assert not env._loaded
                mock_socket.close.assert_called_once()
Example #5
def test_ppo_model_discrete():
    d_action_c_state_start = '''{
      "AcademyName": "RealFakeAcademy",
      "resetParameters": {},
      "brainNames": ["RealFakeBrain"],
      "externalBrainNames": ["RealFakeBrain"],
      "logPath":"RealFakePath",
      "apiNumber":"API-3",
      "brainParameters": [{
          "vectorObservationSize": 3,
          "numStackedVectorObservations": 2,
          "vectorActionSize": 2,
          "memorySize": 0,
          "cameraResolutions": [{"width":30,"height":40,"blackAndWhite":false}],
          "vectorActionDescriptions": ["",""],
          "vectorActionSpaceType": 0,
          "vectorObservationSpaceType": 1
          }]
    }'''.encode()

    tf.reset_default_graph()
    with mock.patch('subprocess.Popen'):
        with mock.patch('socket.socket') as mock_socket:
            with mock.patch('glob.glob') as mock_glob:
                # End of mock
                with tf.Session() as sess:
                    with tf.variable_scope("FakeGraphScope"):
                        mock_glob.return_value = ['FakeLaunchPath']
                        mock_socket.return_value.accept.return_value = (mock_socket, 0)
                        mock_socket.recv.return_value.decode.return_value = d_action_c_state_start
                        env = UnityEnvironment(' ')
                        model = PPOModel(env.brains["RealFakeBrain"])
                        init = tf.global_variables_initializer()
                        sess.run(init)

                        run_list = [model.output, model.all_probs, model.value, model.entropy,
                                    model.learning_rate]
                        feed_dict = {model.batch_size: 2,
                                     model.sequence_length: 1,
                                     model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                               [3, 4, 5, 3, 4, 5]]),
                                     model.visual_in[0]: np.ones([2, 40, 30, 3])
                                     }
                        sess.run(run_list, feed_dict=feed_dict)
                        env.close()
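Example #6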
def test_cc_bc_model(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = BehavioralCloningModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [model.sample_action, model.policy]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                   [3, 4, 5, 3, 4, 5]])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
Example #7
def test_cc_bc_model():
    c_action_c_state_start = '''{
      "AcademyName": "RealFakeAcademy",
      "resetParameters": {},
      "brainNames": ["RealFakeBrain"],
      "externalBrainNames": ["RealFakeBrain"],
      "logPath":"RealFakePath",
      "apiNumber":"API-3",
      "brainParameters": [{
          "vectorObservationSize": 3,
          "numStackedVectorObservations": 2,
          "vectorActionSize": 2,
          "memorySize": 0,
          "cameraResolutions": [],
          "vectorActionDescriptions": ["",""],
          "vectorActionSpaceType": 1,
          "vectorObservationSpaceType": 1
          }]
    }'''.encode()

    tf.reset_default_graph()
    with mock.patch('subprocess.Popen'):
        with mock.patch('socket.socket') as mock_socket:
            with mock.patch('glob.glob') as mock_glob:
                # End of mock
                with tf.Session() as sess:
                    with tf.variable_scope("FakeGraphScope"):
                        mock_glob.return_value = ['FakeLaunchPath']
                        mock_socket.return_value.accept.return_value = (mock_socket, 0)
                        mock_socket.recv.return_value.decode.return_value = c_action_c_state_start
                        env = UnityEnvironment(' ')

                        model = BehavioralCloningModel(env.brains["RealFakeBrain"])
                        init = tf.global_variables_initializer()
                        sess.run(init)

                        run_list = [model.sample_action, model.policy]
                        feed_dict = {model.batch_size: 2,
                                     model.sequence_length: 1,
                                     model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                               [3, 4, 5, 3, 4, 5]])}
                        sess.run(run_list, feed_dict=feed_dict)
                        env.close()
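Example #8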
def test_ppo_model_dc_vector(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=True, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [model.output, model.all_probs, model.value, model.entropy,
                        model.learning_rate]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
Example #9
class UnityEnv(gym.Env):
    """
    Provides Gym wrapper for Unity Learning Environments.
    Multi-agent environments use lists for object types, as done here:
    https://github.com/openai/multiagent-particle-envs
    """
    def __init__(self, params):

        environment_filename = params['path']
        worker_id = params['worker_id']
        seed = params['seed']
        use_visual = params['visual_mode']
        multiagent = params['multiagent_mode']

        self._env = UnityEnvironment(environment_filename, seed=seed)
        self.name = self._env.academy_name
        self.visual_obs = None
        self._action_space_size = None
        self._current_state = None
        self._n_agents = None
        self._multiagent = multiagent

        # Check brain configuration
        if len(self._env.brains) != 1:
            raise UnityGymException(
                "There can only be one brain in a UnityEnvironment "
                "if it is wrapped in a gym.")
        self.brain_name = self._env.external_brain_names[0]
        brain = self._env.brains[self.brain_name]

        if use_visual and brain.number_visual_observations == 0:
            raise UnityGymException(
                "`use_visual` was set to True, however there are no"
                " visual observations as part of this environment.")
        self.use_visual = brain.number_visual_observations >= 1 and use_visual

        if brain.number_visual_observations > 1:
            logger.warning(
                "The environment contains more than one visual observation. "
                "Please note that only the first will be provided in the observation."
            )

        if brain.num_stacked_vector_observations != 1:
            raise UnityGymException(
                "There can only be one stacked vector observation in a UnityEnvironment "
                "if it is wrapped in a gym.")

        # Check for number of agents in scene.
        initial_info = self._env.reset()[self.brain_name]
        self._check_agents(len(initial_info.agents))

        # Set observation and action spaces
        if brain.vector_action_space_type == "discrete":
            if len(brain.vector_action_space_size) == 1:
                self._action_space = spaces.Discrete(
                    brain.vector_action_space_size[0])
            else:
                self._action_space = spaces.MultiDiscrete(
                    brain.vector_action_space_size)
        else:
            self._action_space_size = brain.vector_action_space_size
            high = np.array([1] * brain.vector_action_space_size)
            self._action_space = spaces.Box(-high, high, dtype=np.float32)

        high = np.array([np.inf] * brain.vector_observation_space_size)
        self.action_meanings = brain.vector_action_descriptions
        if self.use_visual:
            if brain.camera_resolutions[0]["blackAndWhite"]:
                depth = 1
            else:
                depth = 3
            self._observation_space = spaces.Box(
                0,
                1,
                dtype=np.float32,
                shape=(brain.camera_resolutions[0]["height"],
                       brain.camera_resolutions[0]["width"], depth))
        else:
            self._observation_space = spaces.Box(-high, high, dtype=np.float32)

    def reset(self, train_mode=True):
        """Resets the state of the environment and returns an initial observation.
        In the case of multi-agent environments, this is a list.
        Returns: observation (object/list): the initial observation of the
            space.
        """
        info = self._env.reset(train_mode)[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)

        if not self._multiagent:
            obs, reward, done, info = self._single_step(info)
        else:
            obs, reward, done, info = self._multi_step(info)
        return obs

    def step(self, action):
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, done, info).
        In the case of multi-agent environments, these are lists.
        Args:
            action (object/list): an action provided by the agent
        Returns:
            observation (object/list): agent's observation of the current environment
            reward (float/list) : amount of reward returned after previous action
            done (boolean/list): whether the episode has ended.
            info (dict): contains auxiliary diagnostic information, including BrainInfo.
        """

        # Validate the action format when running in multi-agent mode.
        if self._multiagent:
            if not isinstance(action, list):
                raise UnityGymException(
                    "The environment was expecting `action` to be a list.")
            if len(action) != self._n_agents:
                raise UnityGymException(
                    "The environment was expecting a list of {} actions.".
                    format(self._n_agents))
            else:
                action = np.array(action)

        info = self._env.step(action)[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)
        self._current_state = info

        if not self._multiagent:
            obs, reward, done, info = self._single_step(info)
        else:
            obs, reward, done, info = self._multi_step(info)
        return obs, reward, done, info

    def _single_step(self, info):
        if self.use_visual:
            self.visual_obs = info.visual_observations[0][0, :, :, :]
            default_observation = self.visual_obs
        else:
            default_observation = info.vector_observations[0, :]

        return default_observation, info.rewards[0], info.local_done[0], {
            "text_observation": info.text_observations[0],
            "brain_info": info
        }

    def _multi_step(self, info):
        if self.use_visual:
            self.visual_obs = info.visual_observations
            default_observation = self.visual_obs
        else:
            default_observation = info.vector_observations
        return list(default_observation), info.rewards, info.local_done, {
            "text_observation": info.text_observations,
            "brain_info": info
        }

    def render(self, mode='rgb_array'):
        return self.visual_obs

    def close(self):
        """Override _close in your subclass to perform any necessary cleanup.
        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        self._env.close()

    def get_action_meanings(self):
        return self.action_meanings

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        Currently not implemented.
        """
        logger.warning("Could not seed environment %s", self.name)
        return

    def _check_agents(self, n_agents):
        if not self._multiagent and n_agents > 1:
            raise UnityGymException(
                "The environment was launched as a single-agent environment, however"
                "there is more than one agent in the scene.")
        elif self._multiagent and n_agents <= 1:
            raise UnityGymException(
                "The environment was launched as a mutli-agent environment, however"
                "there is only one agent in the scene.")
        if self._n_agents is None:
            self._n_agents = n_agents
            logger.info("{} agents within environment.".format(n_agents))
        elif self._n_agents != n_agents:
            raise UnityGymException(
                "The number of agents in the environment has changed since "
                "initialization. This is not supported.")

    @property
    def metadata(self):
        return {'render.modes': ['rgb_array']}

    @property
    def reward_range(self):
        return -float('inf'), float('inf')

    @property
    def spec(self):
        return None

    @property
    def action_space_size(self):
        return self._action_space_size

    @property
    def action_space(self):
        return self._action_space

    @property
    def observation_space(self):
        return self._observation_space

    @property
    def number_agents(self):
        return self._n_agents
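
A minimal usage sketch for this gym wrapper, assuming a single-agent Unity binary; the path and the params values below are placeholders, not taken from the original code:

params = {
    'path': 'MyUnityEnv.x86_64',   # placeholder path to the Unity binary
    'worker_id': 0,
    'seed': 0,
    'visual_mode': False,
    'multiagent_mode': False,
}
env = UnityEnv(params)

obs = env.reset()
done = False
total_reward = 0.0
while not done:
    action = env.action_space.sample()        # random action from the gym space
    obs, reward, done, info = env.step(action)
    total_reward += reward
env.close()
print('episode reward:', total_reward)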
Example #10
            info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
            trainer.reset_buffers(info, total=True)
        # Decide and take an action
        new_info = trainer.take_action(info, env, brain_name, steps, normalize)
        info = new_info
        trainer.process_experiences(info, time_horizon, gamma, lambd)
        if len(trainer.training_buffer['actions']) > buffer_size and train_model:
            # Perform gradient descent with experience buffer
            trainer.update_model(batch_size, num_epoch)
        if steps % summary_freq == 0 and steps != 0 and train_model:
            # Write training statistics to tensorboard.
            trainer.write_summary(summary_writer, steps, env._curriculum.lesson_number)
        if steps % save_freq == 0 and steps != 0 and train_model:
            # Save Tensorflow model
            save_model(sess, model_path=model_path, steps=steps, saver=saver)
        if train_model:
            steps += 1
            sess.run(ppo_model.increment_step)
            if len(trainer.stats['cumulative_reward']) > 0:
                mean_reward = np.mean(trainer.stats['cumulative_reward'])
                sess.run(ppo_model.update_reward, feed_dict={ppo_model.new_reward: mean_reward})
                last_reward = sess.run(ppo_model.last_reward)
    # Final save Tensorflow model
    if steps != 0 and train_model:
        save_model(sess, model_path=model_path, steps=steps, saver=saver)
env.close()
graph_name = (env_name.strip()
      .replace('.app', '').replace('.exe', '').replace('.x86_64', '').replace('.x86', ''))
graph_name = os.path.basename(os.path.normpath(graph_name))
export_graph(model_path, graph_name)
Example #11
def experiment(hidden_size=64,
               lr=3e-4,
               num_steps=2048,
               mini_batch_size=32,
               ppo_epochs=10,
               threshold_reward=10,
               max_episodes=15,
               nrmlz_adv=True,
               clip_gradients=True):
    use_cuda = torch.cuda.is_available()
    #    device   = torch.device("cuda" if use_cuda else "cpu")
    device = torch.device("cpu")
    print(device)
    scores_window = deque(maxlen=100)

    test_rewards = []

    #    env = UnityEnvironment(file_name='p2_continuous-control/reacher20/reacher', base_port=64739)
    env = UnityEnvironment(file_name='reacher20/reacher', base_port=64739)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]

    num_inputs = state_size
    num_outputs = action_size

    model = ActorCriticPolicy(num_inputs, num_outputs, hidden_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, eps=1e-5)

    #    while episode < max_episodes and not early_stop:
    for episode in tqdm(range(max_episodes)):
        log_probs = []
        values = []
        states_list = []
        actions_list = []
        rewards = []
        masks = []
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        for duration in range(num_steps):

            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)
            action_t = dist.sample()
            action_np = action_t.cpu().data.numpy()
            env_info = env.step(action_np)[
                brain_name]  # send all actions to the environment

            next_state = env_info.vector_observations  # get next state (for each agent)
            reward = env_info.rewards  # get reward (for each agent)
            dones = np.array(env_info.local_done)  # see if episode finished
            if reward is None:
                pass
            log_prob = dist.log_prob(action_t)
            log_prob = torch.sum(log_prob, dim=1, keepdim=True)
            log_probs.append(log_prob)
            values.append(value)
            reward_t = torch.FloatTensor(reward).unsqueeze(1).to(device)
            masks_t = torch.FloatTensor(1 - dones)
            rewards.append(reward_t)
            masks.append(masks_t)
            states_list.append(state)
            actions_list.append(action_t)

            state = next_state

            if np.any(dones):
                break

        next_state = torch.FloatTensor(state).to(device)
        _, next_value = model(next_state)

        #        returns = compute_gae(next_value, rewards, masks, values)
        mean1 = torch.mean(torch.stack(rewards))
        print("Rewards: ", mean1)
        returns = compute_gaes(next_value, rewards, masks, values)
        #        return2 = compute_gae_rollout(rollout)

        returns = torch.cat(returns).detach()
        mean2 = torch.mean(returns)
        #print("Returns: ", mean2)
        log_probs = torch.cat(log_probs).detach()
        values = torch.cat(values).detach()
        states = torch.cat(states_list)
        actions = torch.cat(actions_list)
        advantages = returns - values
        if nrmlz_adv:
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-8)

        losses = []

        clip_param = 0.2
        print("return: ", returns.mean(), "advantage:", advantages.mean())
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, return_, advantage in ppo_iter(
                    mini_batch_size, states, actions, log_probs, returns,
                    advantages):
                # print("return: ", return_.mean(), "advantage:", advantage.mean())
                dist, value = model(state)
                entropy = dist.entropy().mean()

                new_log_probs = dist.log_prob(action)
                new_log_probs = torch.sum(new_log_probs, dim=1, keepdim=True)

                ratio = (new_log_probs - old_log_probs).exp()
                # surrogate objective
                surr1 = ratio * advantage
                # Clipped Surrogate Objective
                surr2 = ratio.clamp(1.0 - clip_param,
                                    1.0 + clip_param) * advantage

                policy_loss = -torch.min(surr1, surr2).mean() - 0.01 * entropy
                value_loss = (return_ - value).pow(2).mean()

                loss = 0.5 * value_loss + policy_loss
                losses.append(loss)
                optimizer.zero_grad()
                loss.backward()
                if clip_gradients:
                    nn.utils.clip_grad_norm_(model.parameters(), 5)
                optimizer.step()

        test_mean_reward = test_agent(env, brain_name, model, device)
        test_rewards.append(test_mean_reward)
        scores_window.append(test_mean_reward)
        # mean_score = np.mean(scores_window)
        # print("Mean Score: ", mean_score, "Frame: ", episode)
        print('Episode {}, Total score this episode: {}, Last {} average: {}'.
              format(episode, test_mean_reward, min(episode, 100),
                     np.mean(scores_window)))
        if np.mean(scores_window) > threshold_reward:
            torch.save(
                model.state_dict(),
                f"ppo_checkpoint_{test_mean_reward}_e{episode}_hs{hidden_size}_lr{lr}_st{num_steps}_b{mini_batch_size}_ppo{ppo_epochs}_r{threshold_reward}_e{episode}_adv{nrmlz_adv}_{test_mean_reward}.pth"
            )
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(episode, test_mean_reward))
            break

        episode += 1

    # %%
    #torch.save(model.state_dict(),
    #          f"ppo_checkpoint_{test_mean_reward}_e{episode}_hs{hidden_size}_lr{lr}_st{num_steps}_b{mini_batch_size}_ppo{ppo_epochs}_r{threshold_reward}_e{episode}_adv{nrmlz_adv}.pth")

    env.close()
    return scores_window, test_rewards
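
compute_gaes, ppo_iter, ActorCriticPolicy, and test_agent are not defined in this snippet. A minimal sketch of the two PPO helpers under standard assumptions (the gamma/tau values and the random-minibatch sampling scheme are assumptions, not taken from the original code):

import torch

def compute_gaes(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    # Generalized Advantage Estimation: walk the rollout backwards,
    # accumulate discounted TD residuals, and add the value baseline back
    # so the function returns full returns (advantage + value).
    # rewards, masks, and values are per-step tensors of matching shape.
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns

def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages):
    # Yield random minibatches over the flattened rollout tensors.
    batch_size = states.size(0)
    for _ in range(batch_size // mini_batch_size):
        idx = torch.randint(0, batch_size, (mini_batch_size,))
        yield (states[idx], actions[idx], log_probs[idx],
               returns[idx], advantages[idx])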
Example #12
def train(
        env_location,
        curve_path,
        n_episodes=1000,
        batch_size=512,
        buffer_size=int(1e6),
):

    env = UnityEnvironment(file_name=env_location)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    logger.info(f'Number of agents: {num_agents}')

    # size of each action
    action_size = brain.vector_action_space_size
    logger.info(f'Size of each action: {action_size}')

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    logger.info(
        'There are {} agents. Each observes a state with length: {}'.format(
            states.shape[0], state_size))
    logger.info(f'The state for the first agent looks like: {states[0]}')

    # reset the environment

    # Replay memory
    random_seed = 2
    memory0 = ReplayBuffer(action_size, buffer_size, batch_size, random_seed)
    memory1 = memory0

    def create_agent(memory):
        return Agent(state_size=states.shape[1],
                     action_size=brain.vector_action_space_size,
                     random_seed=random_seed,
                     memory=memory,
                     batch_size=batch_size)

    agent0 = create_agent(memory0)
    agent1 = create_agent(memory1)

    def ddpg(n_episodes, average_window=100, plot_every=4):
        scores_deque = deque(maxlen=average_window)
        scores_all = []
        average_scores_all = []

        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            states = np.array(
                env_info.vector_observations,
                copy=True)  # get the current state (for each agent)
            agent0.reset()
            agent1.reset()
            scores = np.zeros(
                num_agents)  # initialize the score (for each agent)

            while True:
                action0 = agent0.act(states[0])
                action1 = agent1.act(states[1])
                actions = np.concatenate((action0, action1))

                env_info = env.step(actions)[
                    brain_name]  # send all actions to the environment
                next_states = env_info.vector_observations  # get next state (for each agent)
                rewards = env_info.rewards  # get reward (for each agent)
                dones = env_info.local_done  # see if episode finished

                memory0.add(states[0], action0, rewards[0], next_states[0],
                            dones[0])
                memory1.add(states[1], action1, rewards[1], next_states[1],
                            dones[1])

                agent0.step()
                agent1.step()

                scores += env_info.rewards  # update the score (for each agent)
                states = next_states  # roll over states to next time step
                any_done = np.any(dones)
                assert any_done == np.all(dones)
                if any_done:  # exit loop if episode finished
                    break

            score_episode = np.max(scores)
            best_agent = np.argmax(scores)
            scores_deque.append(score_episode)
            scores_all.append(score_episode)
            average_score_queue = np.mean(scores_deque)
            average_scores_all.append(average_score_queue)

            logger.info(
                '\rEpisode {}\tScore: {:.4f}\tBest Agent: {}\tAverage Score: {:.4f}'
                .format(i_episode, score_episode, best_agent,
                        average_score_queue))
            torch.save(agent0.actor_local.state_dict(),
                       'checkpoint_actor0.pth')
            torch.save(agent0.critic_local.state_dict(),
                       'checkpoint_critic0.pth')
            torch.save(agent1.actor_local.state_dict(),
                       'checkpoint_actor1.pth')
            torch.save(agent1.critic_local.state_dict(),
                       'checkpoint_critic1.pth')
            if i_episode > average_window and average_score_queue > 1.0:
                break

            if i_episode % plot_every == 0:
                plot_curve(scores_all, average_scores_all)

        return scores_all, average_scores_all

    scores, average_scores = ddpg(n_episodes=n_episodes)
    plot_curve(scores, average_scores)

    env.close()

    return np.max(average_scores)
Example #13
class UnityEnvV0(Env, Serializable):
    def __init__(self,
                 app_name,
                 time_state=False,
                 idx=0,
                 is_render=False,
                 no_graphics=False,
                 recording=True):
        Serializable.quick_init(self, locals())

        # Unity scene
        self._env = UnityEnvironment(file_name=app_name,
                                     worker_id=idx,
                                     no_graphics=no_graphics)
        self.id = 0

        self.name = app_name
        self.idx = idx
        self.is_render = is_render

        self.time_state = time_state
        self.time_step = 0

        # Check brain configuration
        assert len(self._env.brains) == 1
        self.brain_name = self._env.external_brain_names[0]
        brain = self._env.brains[self.brain_name]

        # Check for number of agents in scene
        initial_info = self._env.reset()[self.brain_name]
        self.use_visual = (brain.number_visual_observations == 1) and False
        self.recording = brain.number_visual_observations == 1 and recording

        # Set observation and action spaces
        if brain.vector_action_space_type == "discrete":
            self._action_space = Discrete(1)
        else:
            high = np.array([np.inf] * (brain.vector_action_space_size))
            self._action_space = Box(-high, high)
        # ----------------------------------
        if self.use_visual and False and no_graphics:
            high = np.array([np.inf] * brain.camera_resolutions[0]["height"] *
                            brain.camera_resolutions[0]["width"] * 3)
            self._observation_space = Box(-high, high)
        else:
            if self.time_state:
                high = np.array([np.inf] *
                                (brain.vector_observation_space_size + 1))
            else:
                high = np.array([np.inf] *
                                (brain.vector_observation_space_size))
            self._observation_space = Box(-high, high)

        # video buffer
        self.frames = []

    def reset(self):
        self.frames = []
        info = self._env.reset()[self.brain_name]
        if self.is_render: self.observation = info.visual_observations[0]
        state = info.vector_observations[0][:]
        self._pos = info.vector_observations[0][:2]
        if self.time_state:
            state = np.hstack((state, [self.time_step]))
            self.time_step += 1
        self._collect_frames(info.visual_observations[0][0])
        return state.flatten()

    def step(self, action):
        info = self._env.step([action])[self.brain_name]
        if self.is_render: self.observation = info.visual_observations[0]
        state = info.vector_observations[0][:]
        self._pos = info.vector_observations[0][:2]
        reward = info.rewards[0]
        done = info.local_done[0]
        if self.time_state:
            state = np.hstack((state, [self.time_step]))
            self.time_step += 1
            if done: self.time_step = 0
        self._collect_frames(info.visual_observations[0][0])
        return Step(observation=state.flatten(), reward=reward, done=done)

    def terminate(self):
        self._env.close()

    def render(self, mode=None):
        if self.is_render:
            x = self.observation[0] * 255
            return np.array(x).astype('uint8')
        else:
            return np.zeros((480, 360, 3))

    def _collect_frames(self, frame):
        if self.recording:
            self.frames.append(np.uint8(frame * 255))

    @property
    def action_space(self):
        return self._action_space

    @property
    def observation_space(self):
        return self._observation_space

    @property
    def position(self):
        return self._pos
Example #14
class UnityEnv(IEnvironment):
    def __init__(self, name):

        drl_logger.info("Initializing environment.'",
                        extra={"params": {
                            "name": name,
                        }})

        self.env = UnityEnvironment(file_name=name)
        self.brain_name = self.env.brain_names[0]
        self.termination_reward = 0

    def action_offset(self):
        return 0

    def close(self):
        self.env.close()

    def get_action_space(self):
        # isDiscrete = isinstance(self.__env.action_space, Discrete)
        #
        # if isDiscrete:
        #     num_action_space = self.__env.action_space.n
        #     logging.debug("Env action space is discrete")
        #     logging.debug("Env action space: {}".format(num_action_space))
        #
        # logging.debug("Env observation space: {}".format(self.__env.observation_space))
        pass

    def render(self, mode):
        pass

    def reset(self):
        brain_name = self.env.brain_names[0]
        # brain = self.__env.brains[brain_name]

        env_info = self.env.reset(
            train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]  # get the current state
        # state = env_info.vector_observations  # get the current state

        new_life = True

        return state, new_life

    def start_game_action(self):
        return None

    def step(self, action):
        env_info = self.env.step(action)[
            self.brain_name]  # send the action to the environment

        next_state = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]  # get the reward
        done = env_info.local_done[0]  # see if episode has finished

        if done:
            reward += self.termination_reward

        new_life = False

        return next_state, reward, done, new_life
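Example #15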
def main():
    env = UnityEnvironment(file_name='Reacher.app')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
    action_size = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  random_seed=3)

    scores_deque = deque(maxlen=100)
    scores = []

    for i_episode in range(1, 1000):
        begin = time.time()
        curr_scores = np.zeros(
            num_agents)  # initialize the score (for each agent)
        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)

        agent.reset()

        for t in range(1000):
            actions = agent.act(states)
            env_info = env.step(actions)[
                brain_name]  # send all actions to the environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished

            agent.step(states, actions, rewards, next_states, dones, t)

            states = next_states
            curr_scores += rewards

            if np.any(dones):
                break

        curr_score = np.mean(curr_scores)
        scores_deque.append(curr_score)
        average_score = np.mean(scores_deque)
        scores.append(curr_score)

        print(
            '\rEpisode {}\tTime: {:.2f}\tAvg: {:.2f}\tScore: {:.2f}\tMin {:.2f}\tMax {:.2f}'
            .format(i_episode,
                    time.time() - begin, average_score, curr_score,
                    min(curr_scores), max(curr_scores)))
        if i_episode % 10 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic.pth')
        if average_score >= 30.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, average_score))
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic.pth')
            break

    env.close()

    return
Example #16
def train_unity_ddpg(PATH, env_name, platform, env_path, policy,
                     score_threshold, timestamp, start, n_episodes, max_t,
                     num_agents):
    """ Trains unity environments with DDPG policy """
    total_scores = []
    from unityagents import UnityEnvironment
    env_path = PATH + f"data/{env_path}"
    env = UnityEnvironment(file_name=env_path)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
    print(f"Number of agents: {num_agents}")
    states = env_info.vector_observations
    state_size = states.shape[1]
    print(
        f"There are {states.shape[0]} agents.  Each observes a state with length {state_size}"
    )
    print(f"The state for the first agent looks like:\n{states[0]}")
    action_size = brain.vector_action_space_size
    print(f"Size of each action: {action_size}")
    policy = policy(state_size, action_size, num_agents)
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        policy.reset()
        for t in range(max_t):
            actions = policy.act(states)
            env_info = env.step(actions)[
                brain_name]  # send the action to the environment
            next_states = env_info.vector_observations
            rewards = env_info.rewards  # get the reward
            dones = env_info.local_done
            policy.step(states, actions, rewards, next_states, dones, t)
            states = next_states
            scores += env_info.rewards
            if np.any(dones):
                break
        score_length = len(total_scores) if len(total_scores) < 100 else 100
        mean_score = np.mean(scores)
        min_score = np.min(scores)
        max_score = np.max(scores)
        total_scores.append(mean_score)
        total_average_score = np.mean(total_scores[-score_length:])
        end = time.time()
        print(
            f'\rEpisode {i_episode}\tScore TAS/Mean/Max/Min: {total_average_score:.2f}/{mean_score:.2f}/{max_score:.2f}/{min_score:.2f}\t{calc_runtime(end-start)}',
            end=" ")
        if i_episode % 20 == 0 or total_average_score >= score_threshold:
            fap = PATH + f'results/{env_name}_{timestamp}_checkpoint_actor.pth'
            torch.save(policy.actor.state_dict(), fap)
            fcp = PATH + f'results/{env_name}_{timestamp}_checkpoint_critic.pth'
            torch.save(policy.critic.state_dict(), fcp)
            print(
                f'\rEpisode {i_episode}\tScore TAS/Mean/Max/Min: {total_average_score:.2f}/{mean_score:.2f}/{max_score:.2f}/{min_score:.2f}\t{calc_runtime(end-start)}'
            )
        if total_average_score > score_threshold:
            print(f"Solved in {i_episode} and {calc_runtime(end-start)}")
            break
    env.close()
    return total_scores
Example #17
def main():
    # ---------------------------------------------------------------------------------------------------
    #  Logger
    # ---------------------------------------------------------------------------------------------------
    save_path = f"./results/Tennis_DDPG_{pd.Timestamp.utcnow().value}"
    os.makedirs(save_path, exist_ok=True)

    logger = logging.getLogger()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s : %(message)s')

    handler = logging.FileHandler(
        f"{save_path}/logs_p3_{pd.Timestamp.utcnow().value}.log")
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # ---------------------------------------------------------------------------------------------------
    #  Inputs
    # ---------------------------------------------------------------------------------------------------
    import json
    with open(f"./assets/best_agent/config.json", "r") as f:
        config = json.load(f)
    config["mode"] = "test"
    config["n_episodes"] = 10
    config["warmup"] = 0

    logger.warning("+=" * 90)
    logger.warning(f"  RUNNING SIMULATION WITH PARAMETERS config={config}")
    logger.warning("+=" * 90)

    # ------------------------------------------------------------
    #  1. Initialization
    # ------------------------------------------------------------
    # 1. Start the Environment
    env = UnityEnvironment(file_name=f'./{config["env_name"]}')  # mac OS

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    config["n_agents"] = num_agents

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))
    print('The state for the first agent looks like:', states[0])
    config.update(dict(action_size=action_size, state_size=state_size))

    # ------------------------------------------------------------
    #  2. Training
    # ------------------------------------------------------------
    # Unity Monitor
    monitor = UnityMonitor(env=env, config=config)

    # Actor model
    seed = 0
    actor = SimpleNeuralNetHead(action_size,
                                SimpleNeuralNetBody(
                                    state_size,
                                    config["hidden_layers_actor"],
                                    seed=seed),
                                func=torch.tanh,
                                seed=seed)
    # Critic model
    critic = DeepNeuralNetHeadCritic(
        action_size * num_agents,
        SimpleNeuralNetBody(state_size * num_agents,
                            config["hidden_layers_critic_body"],
                            func=eval(config["func_critic_body"]),
                            seed=seed),
        hidden_layers_sizes=config["hidden_layers_critic_head"],
        func=eval(config["func_critic_head"]),
        end_func=None,
        seed=seed)

    # MADDPG Agent
    agent = MADDPGAgent(
        state_size=state_size,
        action_size=action_size,
        model_actor=actor,
        model_critic=critic,
        action_space_low=-1,
        action_space_high=1,
        config=config,
    )

    # ------------------------------------------------------------
    #  3. Testing
    # ------------------------------------------------------------
    logger.warning("Entering Test Mode!")
    monitor.n_episodes = 100
    env.reset(train_mode=False)
    env.warmup = 0
    agent.warmup = 0
    for a in agent.agents:
        a.warmup = 0
    agent.load(filepath="./assets/best_agent", mode="test")
    scores = monitor.run(agent)
    logger.info(f"Test Score over {len(scores)} episodes: {np.mean(scores)}")
    config["test_scores"] = scores
    config["best_test_score"] = max(scores)
    config["avg_test_score"] = np.mean(scores)

    # When finished, you can close the environment.
    logger.info("Closing...")
    env.close()
Example #18
class Env:
    '''A convenience class for generating episodes and memories

    This convenience class generates a context manager that can be
    used for creating a Unity environment. The Unity environment
    and the OpenAI Gym environment operate slightly differently,
    and hence it is difficult to create a uniform algorithm that
    is able to solve everything at the same time. This class
    tries to solve that problem.
    '''
    def __init__(self, fileName, showEnv=False, trainMode=True):
        '''Initialize the environment
        
        This sets up the requirements that will later be used for generating
        the Unity Environment. This assumes that you will provide a binary
        file for generating the environment. The environment can be generated
        in different ways: it can be run in *headless* mode by setting
        showEnv to False, in which case the
        environment will not show a window at startup. This is good for
        training, as well as for situations when you are running the environment
        without an X server, for example when you are running
        this environment remotely. You can also specify that the environment
        is being run in `trainMode`. In this case, the
        environment will be primed for training. That is, each frame will
        finish as soon as possible. This is not good for observing what is
        happening. However, this significantly increases the speed of 
        training. 
        
        Arguments:
            fileName {str} -- Path to the binary file. This file must be
                the same as the one for which the `unityagents` package 
                has been generated. 
        
        Keyword Arguments:
            showEnv {bool} -- Set this to ``True`` if you want to view the 
                environment (default: {False})
            trainMode {bool} -- Set this to ``True`` if you want the environment
                to be in training mode (i.e. fast execution) (default: {True})
        '''

        try:
            self.no_graphics = not showEnv
            self.trainMode = trainMode
            self.fileName = fileName
            self.states = None
        except Exception as e:
            raise type(e)('lib.envs.envUnity.Env.__init__ - ERROR - ' +
                          str(e)).with_traceback(sys.exc_info()[2])
        return

    def __enter__(self):
        '''generate a context manager
        
        This will actually generate the context manager and allow you to use
        this within a ``with`` statement. This is the function that actually
        initializes the environment and maintains it for as long as it is needed.
        
        Returns:
            ``this`` -- Returns an instance of the same class
        '''

        try:
            self.env = UnityEnvironment(file_name=self.fileName,
                                        no_graphics=self.no_graphics)

            # get the default brain
            self.brain_name = self.env.brain_names[0]
            self.brain = self.env.brains[self.brain_name]
            self.env_info = self.env.reset(
                train_mode=self.trainMode)[self.brain_name]

            self.num_agents = len(self.env_info.agents)
            self.action_size = self.brain.vector_action_space_size
        except Exception as e:
            raise type(e)('lib.envs.envUnity.Env.__enter__ - ERROR - ' +
                          str(e)).with_traceback(sys.exc_info()[2])

        return self

    def reset(self):
        '''reset the environment before starting an episode
        
        Returns:
            status -- The current status after the reset
        '''
        try:
            # Refresh env_info so the returned states come from this reset,
            # not from the initial reset done in __enter__.
            self.env_info = self.env.reset(
                train_mode=self.trainMode)[self.brain_name]
            self.states = self.env_info.vector_observations
        except Exception as e:
            raise type(e)('lib.envs.envUnity.Env.reset - ERROR - ' +
                          str(e)).with_traceback(sys.exc_info()[2])
        return self.states

    def step(self, policy):
        '''advance one step by taking an action
        
        This function takes a policy function and generates an action
        according to that policy. This advances the episode by one step,
        returning the reward and the next state along with any done
        information.

        Arguments:
            policy {function} -- This function takes a state vector and
                returns an action vector. It is assumed that the policy
                is of the correct type and returns the right type of
                action vector for the current environment. The validity
                of the policy function is not checked.
        
        Returns:
            list -- This returns a list of tuples of the form
                ``(s_t, a_t, r_{t+1}, s_{t+1}, d)``, one tuple for each
                agent. Even in the case of a single agent, a list is
                returned.
        '''

        try:
            states = self.states.copy()
            actions = policy(states)
            env_info = self.env.step(actions)[self.brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            self.states = next_states

            results = []
            for i in range(self.num_agents):
                state = states[i]
                action = actions[i]
                reward = rewards[i]
                next_state = next_states[i]
                done = dones[i]

                results.append((state, action, reward, next_state, done))

        except Exception as e:
            raise type(e)('lib.envs.envUnity.Env.step - ERROR - ' +
                          str(e)).with_traceback(sys.exc_info()[2])

        return results

    def episode(self, policy, maxSteps=None):
        '''generate data for an entire episode
        
        This function generates an entire episode. It plays the environment
        by first resetting it to the beginning, and then playing the game for
        a given number of steps (or until the game is terminated). It generates
        a list of tuples for each agent. Remember that even
        when the number of agents is 1, it will still return a list of lists.

        Arguments:
            policy {function} -- The function that takes the current state and 
                returns the action vector. 
        
        Keyword Arguments:
            maxSteps {int or None} -- The maximum number of steps that the agent is
                going to play the episode before the episode is terminated. (default: 
                {None} in which case the episode will continue until it actually 
                finishes)
        
        Returns:
            list -- This returns the list of tuples for the entire episode. Again, this
                is a list of lists, one for each agent.
        '''

        try:
            self.reset()
            stepCount = 0
            allResults = [[] for _ in range(self.num_agents)]

            while True:

                stepCount += 1
                finished = False
                results = self.step(policy)
                for agent in range(self.num_agents):
                    state, action, reward, next_state, done = results[agent]
                    allResults[agent].append(results[agent])
                    finished = finished or done

                if finished:
                    break

                if (maxSteps is not None) and (stepCount >= maxSteps):
                    break
        except Exception as e:
            raise type(e)('lib.envs.envUnity.Env.episode - ERROR - ' +
                          str(e)).with_traceback(sys.exc_info()[2])

        return allResults

    def __exit__(self, exc, value, traceback):
        '''Exit the context manager
        
        The exit function that will result in exiting the
        context manager. Typically one is supposed to check 
        the error if any at this point. This will be handled 
        at a higher level
        
        Arguments:
            exc, value, traceback -- The standard context manager exit
                arguments describing any exception raised inside the
                ``with`` block
        '''

        if not exc:  # no exception was raised; close the environment cleanly
            self.env.close()
            return True
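
A minimal usage sketch for this context manager, assuming a Unity binary path and a random continuous policy; the file name and the action size below are placeholders, not taken from the original code:

import numpy as np

def random_policy(states):
    # one random action vector per agent; the action size (4) is a placeholder
    return np.clip(np.random.randn(len(states), 4), -1, 1)

with Env('Reacher.x86_64', showEnv=False, trainMode=True) as env:
    allResults = env.episode(random_policy, maxSteps=1000)
    episodeRewards = [sum(r for _, _, r, _, _ in agentResults)
                      for agentResults in allResults]
    print('per-agent episode rewards:', episodeRewards)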
Example #19
class UnityEnv:
    '''
    Class for all Envs.
    Standardizes the UnityEnv design to work in Lab.
    Access Agents properties by: Agents - AgentSpace - AEBSpace - EnvSpace - Envs
    '''

    def __init__(self, env_spec, env_space, e=0):
        self.env_spec = env_spec
        self.env_space = env_space
        self.info_space = env_space.info_space
        self.e = e
        util.set_attr(self, self.env_spec)
        self.name = self.env_spec['name']
        self.body_e = None
        self.nanflat_body_e = None  # nanflatten version of bodies
        self.body_num = None

        worker_id = int(f'{os.getpid()}{self.e+int(ps.unique_id())}'[-4:])
        self.u_env = UnityEnvironment(file_name=util.get_env_path(self.name), worker_id=worker_id)
        # spaces for NN auto input/output inference
        logger.warn('Unity environment observation_space and action_space are constructed with invalid range. Use only their shapes.')
        self.observation_spaces = []
        self.action_spaces = []
        for a in range(len(self.u_env.brain_names)):
            observation_shape = (self.get_observable_dim(a)['state'],)
            if self.get_brain(a).state_space_type == 'discrete':
                observation_space = gym.spaces.Box(low=0, high=1, shape=observation_shape, dtype=np.int32)
            else:
                observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=observation_shape, dtype=np.float32)
            self.observation_spaces.append(observation_space)
            if self.is_discrete(a):
                action_space = gym.spaces.Discrete(self.get_action_dim(a))
            else:
                action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
            self.action_spaces.append(action_space)
        for observation_space, action_space in zip(self.observation_spaces, self.action_spaces):
            set_gym_space_attr(observation_space)
            set_gym_space_attr(action_space)

        # TODO experiment to find out optimal benchmarking max_timestep, set
        # TODO ensure clock_speed from env_spec
        self.clock_speed = 1
        self.clock = Clock(self.clock_speed)
        self.done = False

    def check_u_brain_to_agent(self):
        '''Check the size match between unity brain and agent'''
        u_brain_num = self.u_env.number_brains
        agent_num = len(self.body_e)
        assert u_brain_num == agent_num, f'There must be a Unity brain for each agent. e:{self.e}, brain: {u_brain_num} != agent: {agent_num}.'

    def check_u_agent_to_body(self, env_info_a, a):
        '''Check the size match between unity agent and body'''
        u_agent_num = len(env_info_a.agents)
        body_num = util.count_nonan(self.body_e[a])
        assert u_agent_num == body_num, f'There must be a Unity agent for each body; a:{a}, e:{self.e}, agent_num: {u_agent_num} != body_num: {body_num}.'

    def get_brain(self, a):
        '''Get the unity-equivalent of agent, i.e. brain, to access its info'''
        name_a = self.u_env.brain_names[a]
        brain_a = self.u_env.brains[name_a]
        return brain_a

    def get_env_info(self, env_info_dict, a):
        name_a = self.u_env.brain_names[a]
        env_info_a = env_info_dict[name_a]
        return env_info_a

    @lab_api
    def post_body_init(self):
        '''Run init for components that need bodies to exist first, e.g. memory or architecture.'''
        self.nanflat_body_e = util.nanflatten(self.body_e)
        for idx, body in enumerate(self.nanflat_body_e):
            body.nanflat_e_idx = idx
        self.body_num = len(self.nanflat_body_e)
        self.check_u_brain_to_agent()
        logger.info(util.self_desc(self))

    def is_discrete(self, a):
        '''Check if an agent (brain) is subject to discrete actions'''
        return self.get_brain(a).is_discrete()

    def get_action_dim(self, a):
        '''Get the action dim for an agent (brain) in env'''
        return self.get_brain(a).get_action_dim()

    def get_action_space(self, a):
        return self.action_spaces[a]

    def get_observable_dim(self, a):
        '''Get the observable dim for an agent (brain) in env'''
        return self.get_brain(a).get_observable_dim()

    def get_observable_types(self, a):
        '''Get the observable for an agent (brain) in env'''
        return self.get_brain(a).get_observable_types()

    def get_observation_space(self, a):
        return self.observation_spaces[a]

    @lab_api
    def reset(self):
        self.done = False
        env_info_dict = self.u_env.reset(train_mode=(util.get_lab_mode() != 'dev'), config=self.env_spec.get('unity'))
        _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
        for (a, b), body in util.ndenumerate_nonan(self.body_e):
            env_info_a = self.get_env_info(env_info_dict, a)
            self.check_u_agent_to_body(env_info_a, a)
            state = env_info_a.states[b]
            state_e[(a, b)] = state
            done_e[(a, b)] = self.done
        return _reward_e, state_e, done_e

    @lab_api
    def step(self, action_e):
        # TODO implement clock_speed: step only if self.clock.to_step()
        if self.done:
            return self.reset()
        action_e = util.nanflatten(action_e)
        env_info_dict = self.u_env.step(action_e)
        reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
        for (a, b), body in util.ndenumerate_nonan(self.body_e):
            env_info_a = self.get_env_info(env_info_dict, a)
            reward_e[(a, b)] = env_info_a.rewards[b]
            state_e[(a, b)] = env_info_a.states[b]
            done_e[(a, b)] = env_info_a.local_done[b]
        self.done = (util.nonan_all(done_e) or self.clock.get('t') > self.max_timestep)
        return reward_e, state_e, done_e

    @lab_api
    def close(self):
        self.u_env.close()
Example #20
def make_banana_env():
    env = UnityEnvironment(file_name=BANANA_APP)
    yield env
    env.close()
def dqn(n_episodes=2000,
        max_t=1000,
        eps_start=1.0,
        eps_end=0.01,
        eps_decay=0.995):
    """Deep Q-Learning.
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    # Get environment instance
    env = UnityEnvironment(file_name=BANANA_FILE)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # Reset environment
    env_info = env.reset(train_mode=True)[brain_name]
    # Get initial state, state size and action size
    action_size = brain.vector_action_space_size
    state = env_info.vector_observations[0]
    state_size = len(state)
    # Setup agent
    agent = Agent(state_size=state_size, action_size=action_size, seed=0)

    # Train!
    max_avg_score = -100000  # max avg score over 100 episodes
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        state = env.reset(train_mode=True)[brain_name].vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)),
              end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 13.0 and np.mean(
                scores_window) > max_avg_score:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            # break
            max_avg_score = np.mean(scores_window)

    # Close environment
    env.close()
    return scores
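A short usage sketch for dqn() above: run training and plot the per-episode scores. The plotting code is illustrative and not part of the original snippet; dqn() creates and closes its own UnityEnvironment.

import matplotlib.pyplot as plt
import numpy as np

scores = dqn(n_episodes=1000, eps_decay=0.995)

# raw per-episode scores plus a 100-episode moving average
fig, ax = plt.subplots()
ax.plot(scores, alpha=0.4, label='score')
if len(scores) >= 100:
    sma = np.convolve(scores, np.ones(100) / 100, mode='valid')
    ax.plot(range(99, len(scores)), sma, label='100-episode average')
ax.set_xlabel('Episode')
ax.set_ylabel('Score')
ax.legend()
plt.show()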
class Environment():
    """
    This is a wrapper class for a Unity environment

    The Unity environment is wrapped such that the API
    is similar to a Gym environment.

    Using this class, DQN algorithms written for Gym environments
    can be re-used with minimal changes.
    """
    def __init__(self,
                 filename_path,
                 worker_id=0,
                 train_mode=True,
                 no_graphics=False,
                 seed=0):
        # Create new environment

        # Create Unity environment
        self._env = UnityEnvironment(file_name=filename_path, \
                                    worker_id=worker_id,\
                                    no_graphics=no_graphics, \
                                    seed=seed)

        # get the default brain
        self._brain_name = self._env.brain_names[0]
        self._brain = self._env.brains[self._brain_name]

        # set the initial state
        self.train_mode = train_mode
        self._env_info = self._env.reset(
            train_mode=train_mode)[self._brain_name]
        self._state = self._env_info.vector_observations[0]

        # define state_size and action_size
        self.state_size = len(self._state)
        self.action_size = self._brain.vector_action_space_size

    def reset(self):
        # reset the environment
        self._env_info = self._env.reset(
            train_mode=self.train_mode)[self._brain_name]
        self._state = self._env_info.vector_observations[0]

        # return the state vector
        return self._state

    def step(self, action):
        # send the action to the environment
        self._env_info = self._env.step(action)[self._brain_name]
        # get the next state
        next_state = self._env_info.vector_observations[0]
        # get the reward
        reward = self._env_info.rewards[0]
        # check if terminal state is reached
        done = self._env_info.local_done[0]
        # create dummy value to keep API compatible
        dummy = 0

        # return the next_state vector, the reward,
        # and whether the terminal state was reached
        return next_state, reward, done, dummy

    def close(self):
        self._env.close()
        pass
    _action_size: int = 4
    _state_size: int = 33

    _agent = Agent(_state_size,
                   _action_size,
                   gamma=0.99,
                   lr_actor=0.0002,
                   lr_critic=0.0003,
                   tau=0.002,
                   weight_decay=0.0001,
                   buffer_size=1000000,
                   batch_size=128)

    # with this boolean you can decide if you just want to watch an agent or train the agent yourself
    watch_only = True
    if watch_only:
        watch_agent_from_pth_file(_env, _brain_name, _agent,
                                  './checkpoint-actor.pth',
                                  './checkpoint-critic.pth')
    else:
        scores = train_agent(_env,
                             _brain_name,
                             _agent,
                             n_episodes=500,
                             max_steps=1500)
        watch_agent(_env, _brain_name, _agent)
        plot_scores(scores=scores, sma_window=10)

    _env.close()
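Since the Environment wrapper above mirrors the Gym API (reset, step returning (state, reward, done, info), close), a standard Gym-style loop can drive it unchanged. A minimal sketch, assuming a Banana build path and using random actions in place of a trained agent.

import numpy as np

# The file name below is an assumption; point it at the actual Banana build.
env = Environment('Banana.app', train_mode=False)

state = env.reset()
score = 0.0
while True:
    action = np.random.randint(env.action_size)   # placeholder for a trained agent
    next_state, reward, done, _ = env.step(action)
    score += reward
    state = next_state
    if done:
        break
print('episode score:', score)
env.close()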
Example #24
class UnityEnv(BaseEnv):
    '''
    Wrapper for Unity ML-Agents env to work with the Lab.

    e.g. env_spec
    "env": [{
      "name": "gridworld",
      "max_t": 20,
      "max_frame": 3,
      "unity": {
        "gridSize": 6,
        "numObstacles": 2,
        "numGoals": 1
      }
    }],
    '''

    def __init__(self, spec):
        super().__init__(spec)
        util.set_attr(self, self.env_spec, ['unity'])
        worker_id = int(f'{os.getpid()}{self.e+int(ps.unique_id())}'[-4:])
        seed = ps.get(spec, 'meta.random_seed')
        # TODO update Unity ml-agents to use seed=seed below
        self.u_env = UnityEnvironment(file_name=get_env_path(self.name), worker_id=worker_id)
        self.patch_gym_spaces(self.u_env)
        self._set_attr_from_u_env(self.u_env)
        assert self.max_t is not None
        self.tracked_reward = 0
        self.total_reward = 0
        logger.info(util.self_desc(self))

    def patch_gym_spaces(self, u_env):
        '''
        For standardization, use gym spaces to represent observation and action spaces for Unity.
        This method iterates through the multiple brains (multiagent) then constructs and returns lists of observation_spaces and action_spaces
        '''
        observation_spaces = []
        action_spaces = []
        for a in range(len(u_env.brain_names)):
            brain = self._get_brain(u_env, a)
            observation_shape = (brain.get_observable_dim()['state'],)
            if brain.is_discrete():
                dtype = np.int32
                action_space = spaces.Discrete(brain.get_action_dim())
            else:
                dtype = np.float32
                action_space = spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=dtype)
            observation_space = spaces.Box(low=0, high=1, shape=observation_shape, dtype=dtype)
            set_gym_space_attr(observation_space)
            set_gym_space_attr(action_space)
            observation_spaces.append(observation_space)
            action_spaces.append(action_space)
        # set for singleton
        u_env.observation_space = observation_spaces[0]
        u_env.action_space = action_spaces[0]
        return observation_spaces, action_spaces

    def _get_brain(self, u_env, a):
        '''Get the unity-equivalent of agent, i.e. brain, to access its info'''
        name_a = u_env.brain_names[a]
        brain_a = u_env.brains[name_a]
        return brain_a

    def _check_u_brain_to_agent(self):
        '''Check the size match between unity brain and agent'''
        u_brain_num = self.u_env.number_brains
        agent_num = 1  # TODO rework unity outdated
        assert u_brain_num == agent_num, f'There must be a Unity brain for each agent. e:{self.e}, brain: {u_brain_num} != agent: {agent_num}.'

    def _check_u_agent_to_body(self, env_info_a, a):
        '''Check the size match between unity agent and body'''
        u_agent_num = len(env_info_a.agents)
        body_num = 1  # rework unity
        assert u_agent_num == body_num, f'There must be a Unity agent for each body; a:{a}, e:{self.e}, agent_num: {u_agent_num} != body_num: {body_num}.'

    def _get_env_info(self, env_info_dict, a):
        '''Unity API returns a env_info_dict. Use this method to pull brain(env)-specific usable for lab API'''
        name_a = self.u_env.brain_names[a]
        env_info_a = env_info_dict[name_a]
        return env_info_a

    def seed(self, seed):
        self.u_env.seed(seed)

    @lab_api
    def reset(self):
        self.done = False
        env_info_dict = self.u_env.reset(train_mode=(util.get_lab_mode() != 'dev'), config=self.env_spec.get('unity'))
        a, b = 0, 0  # default singleton agent and body
        env_info_a = self._get_env_info(env_info_dict, a)
        state = env_info_a.states[b]
        return state

    @lab_api
    def step(self, action):
        env_info_dict = self.u_env.step(action)
        a, b = 0, 0  # default singleton agent and body
        env_info_a = self._get_env_info(env_info_dict, a)
        state = env_info_a.states[b]
        reward = env_info_a.rewards[b]
        reward = try_scale_reward(self, reward)
        done = env_info_a.local_done[b]
        if not self.is_venv and self.clock.t > self.max_t:
            done = True
        self.done = done
        info = {'env_info': env_info_a}
        # track total_reward
        self.tracked_reward += reward
        if done:
            self.total_reward = self.tracked_reward
            self.tracked_reward = 0  # reset
        info.update({'total_reward': self.total_reward})
        return state, reward, done, info

    @lab_api
    def close(self):
        self.u_env.close()
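For reference, patch_gym_spaces above boils down to mapping a known state dimension and action dimension onto gym.spaces objects, one pair per brain. A standalone sketch of that mapping without the lab helpers (get_observable_dim, set_gym_space_attr); the 37/4 dimensions are only illustrative, Banana-like values.

import numpy as np
from gym import spaces

def build_spaces(state_dim, action_dim, discrete):
    # mirrors the mapping in patch_gym_spaces above; state_dim/action_dim are assumed known
    if discrete:
        dtype = np.int32
        action_space = spaces.Discrete(action_dim)
    else:
        dtype = np.float32
        action_space = spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=dtype)
    observation_space = spaces.Box(low=0, high=1, shape=(state_dim,), dtype=dtype)
    return observation_space, action_space

obs_space, act_space = build_spaces(state_dim=37, action_dim=4, discrete=True)
print(obs_space, act_space)   # e.g. a Box over 37 dims and Discrete(4)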
Example #25
def main(seed=seed):
    # ---------------------------------------------------------------------------------------------------
    #  Logger
    # ---------------------------------------------------------------------------------------------------
    save_path = f"./results/Reacher_DDPG_{pd.Timestamp.utcnow().value}"
    os.makedirs(save_path, exist_ok=True)

    logger = logging.getLogger()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s : %(message)s')

    handler = logging.FileHandler(
        f"{save_path}/logs_navigation_{pd.Timestamp.utcnow().value}.log")
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # ---------------------------------------------------------------------------------------------------
    #  Inputs
    # ---------------------------------------------------------------------------------------------------
    n_episodes = 300
    config = dict(
        # Environment parameters
        env_name="Reacher",
        n_episodes=n_episodes,
        length_episode=1500,
        save_every=100,
        save_path=save_path,
        mode="train",  # "train" or "test"
        evaluate_every=5000,  # Number of training episodes before 1 evaluation episode
        eps_decay=1,  # Epsilon decay rate

        # Agent Parameters
        agent="DDPG",
        hidden_layers_actor=(200, 150),  # (50, 50, 15),  # (200, 150),  #
        hidden_layers_critic_body=(400, ),  # (50, 50,),  #
        hidden_layers_critic_head=(300, ),  # (50,),   # (300,)
        func_critic_body="F.leaky_relu",  #
        func_critic_head="F.leaky_relu",  #
        func_actor_body="F.leaky_relu",  #
        lr_scheduler=None,  # {'scheduler_type': "multistep",  # "step", "exp" or "decay", "multistep"
        #               'gamma': 0.5,  # 0.99999,
        #               'step_size': 1,
        #               'milestones': [15*1000 * i for i in range(1, 6)],
        #               'max_epochs': n_episodes},
        TAU=1e-3,  # for soft update of target parameters
        BUFFER_SIZE=int(1e6),  # replay buffer size
        BATCH_SIZE=128,  # minibatch size
        GAMMA=0.99,  # discount factor
        LR_ACTOR=1e-3,  # learning rate of the actor
        LR_CRITIC=1e-3,  # learning rate of the critic
        WEIGHT_DECAY=0,  # L2 weight decay
        UPDATE_EVERY=1,  # Number of actions before making a learning step
        action_noise="OU",  #
        action_noise_scale=1,
        weights_noise=None,  #
        state_normalizer="BatchNorm",  # "RunningMeanStd" or "BatchNorm"
        warmup=0,  # Number of random actions to start with as a warm-up
        start_time=str(pd.Timestamp.utcnow()),
        random_seed=seed,
        threshold=30)
    logger.warning("+=" * 90)
    logger.warning(f"  RUNNING SIMULATION WITH PARAMETERS config={config}")
    logger.warning("+=" * 90)

    # ------------------------------------------------------------
    #  1. Initialization
    # ------------------------------------------------------------
    # 1. Start the Environment

    # env = UnityEnvironment(file_name=f'./Reacher_Linux_2/Reacher.x86_64')  # Linux
    env = UnityEnvironment(file_name=f'./{config["env_name"]}')  # mac OS

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    config["n_agents"] = num_agents

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))
    print('The state for the first agent looks like:', states[0])
    config.update(dict(action_size=action_size, state_size=state_size))

    # ------------------------------------------------------------
    #  2. Training
    # ------------------------------------------------------------
    # Unity Monitor
    monitor = UnityMonitor(env=env, config=config)

    if config["mode"] == "train":
        # Actor model
        seed = 0
        actor = SimpleNeuralNetHead(action_size,
                                    SimpleNeuralNetBody(
                                        state_size,
                                        config["hidden_layers_actor"],
                                        seed=seed),
                                    func=F.tanh,
                                    seed=seed)
        actor_target = SimpleNeuralNetHead(action_size,
                                           SimpleNeuralNetBody(
                                               state_size,
                                               config["hidden_layers_actor"],
                                               seed=seed),
                                           func=F.tanh,
                                           seed=seed)
        # Critic model
        critic = DeepNeuralNetHeadCritic(
            action_size,
            SimpleNeuralNetBody(state_size,
                                config["hidden_layers_critic_body"],
                                func=eval(config["func_critic_body"]),
                                seed=seed),
            hidden_layers_sizes=config["hidden_layers_critic_head"],
            func=eval(config["func_critic_head"]),
            end_func=None,
            seed=seed)

        critic_target = DeepNeuralNetHeadCritic(
            action_size,
            SimpleNeuralNetBody(state_size,
                                config["hidden_layers_critic_body"],
                                func=eval(config["func_critic_body"]),
                                seed=seed),
            hidden_layers_sizes=config["hidden_layers_critic_head"],
            func=eval(config["func_critic_head"]),
            end_func=None,
            seed=seed)

        # DDPG Agent
        agent = DDPGAgent(
            state_size=state_size,
            action_size=action_size,
            model_actor=actor,
            model_critic=critic,
            # actor_target=actor_target, critic_target=critic_target,
            action_space_low=-1,
            action_space_high=1,
            config=config,
        )

        # Training
        start = pd.Timestamp.utcnow()
        scores = monitor.run(agent)
        logger.info("Average Score last 100 episodes: {}".format(
            np.mean(scores[-100:])))
        elapsed_time = pd.Timedelta(pd.Timestamp.utcnow() -
                                    start).total_seconds()
        logger.info(f"Elapsed Time: {elapsed_time} seconds")

    # ------------------------------------------------------------
    #  3. Testing
    # ------------------------------------------------------------
    else:
        agent = DDPGAgent.load(filepath=config['save_path'], mode="test")
        scores = monitor.run(agent)
        logger.info(
            f"Test Score over {len(scores)} episodes: {np.mean(scores)}")
        config["test_scores"] = scores
        config["best_test_score"] = max(scores)
        config["avg_test_score"] = np.mean(scores)

    # When finished, you can close the environment.
    logger.info("Closing...")
    env.close()
Example #26
class CollectBananaENV:
    def __init__(self, env_type='vector', mode='train'):
        """
        This is a wrapper on top of the brain environment that provides useful function to render the environment
        call very similar to like calling the open AI gym environement.

        Wrapper Code referred from : https://github.com/yingweiy/drlnd_project1_navigation

        :param env_type:
        """
        self.env_type = env_type
        if env_type == 'vector':
            self.base_env = UnityEnvironment('Banana.app')
        elif env_type == 'visual':
            self.base_env = UnityEnvironment('VisualBanana.app')
        else:
            raise ValueError('Env Name not understood ....')
        # get the default brain
        self.brain_name = self.base_env.brain_names[0]
        self.brain = self.base_env.brains[self.brain_name]
        self.action_size = self.brain.vector_action_space_size

        if mode == 'train':
            self.train = True
        else:
            self.train = False

        self.frame1 = None
        self.frame2 = None
        self.frame3 = None
        self.reset()

        if env_type == 'vector':
            self.state_size = len(self.state)
        elif env_type == 'visual':
            self.state_size = self.state.shape
        else:
            raise ValueError('Environment type not understood ....')

        print(self.state_size)

    def get_state(self):
        if self.env_type == 'visual':

            # The DQN paper says to stack 4 frames while running the image through the neural network
            # state size is 1,84,84,3
            # Rearrange from NHWC to NCHW (PyTorch uses 3d convolution in NCHW format, cross correlation across channels)
            frame = np.transpose(self.env_info.visual_observations[0],
                                 (0, 3, 1, 2))[:, :, :, :]
            frame_size = frame.shape  # 1,3,84,84
            # print(frame_size)
            self.state = np.zeros(
                (1, frame_size[1], 4, frame_size[2], frame_size[3]))
            self.state[0, :, 0, :, :] = frame

            if self.frame1 is not None:
                self.state[0, :, 1, :, :] = self.frame1
            if self.frame2 is not None:
                self.state[0, :, 2, :, :] = self.frame2
            if self.frame3 is not None:
                self.state[0, :, 3, :, :] = self.frame3

            # Keep the last 3 frames in the memory to be accessed or stacked with the new input frame to supply as
            # input to the convolution network
            self.frame3 = self.frame2
            self.frame2 = self.frame1
            self.frame1 = frame

            # self.state = np.squeeze(self.state)  # We squeeze it because the code implemented in buffer will
            # unsqueeze the array
        elif self.env_type == 'vector':
            self.state = self.env_info.vector_observations[0]

        else:
            raise ValueError('Environment name %s not understood.' %
                             str(self.env_type))

    def reset(self):
        self.env_info = self.base_env.reset(
            train_mode=self.train)[self.brain_name]
        self.get_state()
        return self.state

    def step(self, action):
        """
        This function returns the value in the format of Open AI gym
        :param action:
        :return:
        """
        self.env_info = self.base_env.step(action)[
            self.brain_name]  # send the action to the environment
        self.get_state()
        reward = self.env_info.rewards[0]
        done = self.env_info.local_done[0]
        return self.state, reward, done, None

    def close(self):
        self.base_env.close()
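A brief usage sketch for CollectBananaENV above in visual mode, taking random actions to show the stacked-frame state produced by get_state. The expected shape and the assumption that action_size is a plain int follow from the code above, not from a tested run, and VisualBanana.app must exist at the path hard-coded in __init__.

import numpy as np

env = CollectBananaENV(env_type='visual', mode='test')
state = env.reset()
print('stacked state shape:', state.shape)     # expected (1, 3, 4, 84, 84)

for _ in range(10):
    action = np.random.randint(env.action_size)   # action_size assumed to be an int here
    state, reward, done, _ = env.step(action)
    if done:
        state = env.reset()
env.close()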
Example #27
class UnityEnv(BaseEnv):
    r"""
    Basic Unity ML Agent environment.

    config example:
    "env": {
        "name": "Reacher",
        "type": "unity",
        "seed": 0,
        "to_render": True,
        "frame_sleep": 0.001,
        "max_steps": 1000,
        "one_hot": None,
        "action_bins": None,
        "reward_scale": None,
        "num_envs": None,
    }
    """

    def __init__(self, config):
        super(UnityEnv, self).__init__(config)

        self._env = UnityEnvironment(file_name=get_env_path(self.name), seed=self.seed)
        self.patch_gym_spaces(self._env)
        self._set_attr_from_u_env(self._env)

        # TODO: Logging
        print(utils.describe(self))

    def reset(self):
        self.done = False
        info_dict = self._env.reset(train_mode=self.to_render)
        env_info = self._get_env_info(info_dict, 0)
        state = env_info.vector_observations[0]
        return state

    def step(self, action):
        info_dict = self._env.step(action)
        env_info = self._get_env_info(info_dict, 0)
        state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        return state, reward, done, env_info

    def render(self):
        pass

    def close(self):
        self._env.close()

    def _get_brain(self, env, brain_index):
        r"""
        Get the unity-equivalent of agent, i.e. brain, to access its info
        :param env:
        :param brain_index:
        :return:
        """
        brain_name = env.brain_names[brain_index]
        brain = env.brains[brain_name]
        return brain

    def patch_gym_spaces(self, env):
        r"""
        For standardization, use gym spaces to represent observation and action spaces for Unity.
        This method iterates through the multiple brains (multiagent) then constructs and returns lists of observation_spaces and action_spaces
        :param env:
        :return:
        """

        observation_spaces = []
        action_spaces = []
        for brain_index in range(len(env.brain_names)):
            brain = self._get_brain(env, brain_index)

            # TODO: Logging
            utils.describe(brain)

            observation_shape = (brain.get_observable_dim()['state'],)
            action_dim = (brain.get_action_dim(),)

            if brain.is_discrete():
                dtype = np.int32
                action_space = spaces.Discrete(brain.get_action_dim())
            else:
                dtype = np.float32
                action_space = spaces.Box(low=0.0, high=1.0, shape=action_dim, dtype=dtype)

            observation_space = spaces.Box(low=0, high=1, shape=observation_shape, dtype=dtype)
            utils.set_gym_space_attr(observation_space)
            utils.set_gym_space_attr(action_space)
            observation_spaces.append(observation_space)
            action_spaces.append(action_space)

        # set for singleton
        env.observation_space = observation_spaces[0]
        env.action_space = action_spaces[0]

        return observation_spaces, action_spaces

    def _get_env_info(self, env_info_dict, index):
        r"""
        Unity API returns a env_info_dict. Use this method to pull brain(env)-specific
        :param env_info_dict:
        :param index:
        :return:
        """
        brain_name = self._env.brain_names[index]
        env_info = env_info_dict[brain_name]
        return env_info
def loaded_unity_env(file_name):
    env = UnityEnvironment(str(file_name), no_graphics=True)
    try:
        yield env
    finally:
        env.close()
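Note that loaded_unity_env above (like make_banana_env earlier) is a plain generator function: it yields the environment and closes it afterwards. To use it in a with statement or as a pytest fixture it needs a decorator; a minimal sketch assuming contextlib.contextmanager, with the hypothetical name managed_unity_env and an illustrative build name.

from contextlib import contextmanager

managed_unity_env = contextmanager(loaded_unity_env)   # hypothetical wrapped name

# the environment is closed even if the body raises:
# with managed_unity_env('Banana.x86_64') as env:      # file name is illustrative
#     env.reset(train_mode=True)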
Example #29
class BananaEnvWrapper(object):

    """Wraps the Udacity environment into an object behaving like an Atari env."""

    blank_state = torch.zeros(1, 37, dtype=torch.uint8)

    def __init__(self, train_mode=True, device='cuda'):
        self.train_mode = train_mode
        self.device = device
        self.unity_env = UnityEnvironment(
            file_name="/home/philipp/udacity/deep-reinforcement-learning/p1_navigation/Banana_Linux/Banana.x86_64")

        # get the default brain
        self.brain_name = self.unity_env.brain_names[0]
        brain = self.unity_env.brains[self.brain_name]

        # reset the environment
        env_info = self.unity_env.reset(train_mode=self.train_mode)[self.brain_name]

        # number of agents in the environment
        print('Number of agents:', len(env_info.agents))

        # number of actions
        self._action_space = brain.vector_action_space_size
        print('Number of actions:', self._action_space)

        # examine the state space
        state = env_info.vector_observations[0]
        print('States look like:', state)
        state_size = len(state)
        print('States have length:', state_size)

        self.score = 0

    def eval(self):
        self.train_mode = False

    def train(self):
        self.train_mode = True

    def reset(self):
        print("Score: %d" % self.score)
        self.score = 0
        env_info = self.unity_env.reset(train_mode=self.train_mode)[self.brain_name]
        return self._wrap_state(env_info.vector_observations[0])  # Return current state

    def step(self, action):
        env_info = self.unity_env.step(action)[self.brain_name]
        state = env_info.vector_observations[0]  # get the current state
        reward = env_info.rewards[0]  # get the reward
        done = env_info.local_done[0]  # see if episode has finished
        self.score += reward  # update the score
        return self._wrap_state(state), reward, done

    def close(self):
        self.unity_env.close()

    def action_space(self):
        return self._action_space

    def _wrap_state(self, state):
        state = state[np.newaxis, np.newaxis, :]
        # todo: Normalization
        return torch.tensor(state, dtype=torch.float32, device=self.device)
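The "# todo: Normalization" above is left open. One common choice is a running mean/variance normalizer over the 37-dimensional state; the sketch below is an assumption, not part of the original wrapper (the class name RunningNormalizer and the attribute self.normalizer mentioned in the comment are hypothetical).

import numpy as np

class RunningNormalizer:
    """Running mean/variance tracker; one possible way to fill the normalization todo above."""
    def __init__(self, size, eps=1e-8):
        self.mean = np.zeros(size)
        self.var = np.ones(size)
        self.count = eps

    def update(self, x):
        # incremental (Welford-style) update from a single observation
        self.count += 1.0
        delta = x - self.mean
        self.mean = self.mean + delta / self.count
        self.var = self.var + (delta * (x - self.mean) - self.var) / self.count

    def normalize(self, x):
        return (x - self.mean) / np.sqrt(self.var + 1e-8)

# inside _wrap_state one could then call self.normalizer.update(state) and
# replace the raw state with self.normalizer.normalize(state) (sketch only).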
Example #30
class TrainerController(object):
    def __init__(self, env_path, run_id, save_freq, curriculum_file,
                 fast_simulation, load, train, worker_id, keep_checkpoints,
                 lesson, seed, docker_target_name, trainer_config_path,
                 use_data_gatherer):
        """

        :param env_path: Location to the environment executable to be loaded.
        :param run_id: The sub-directory name for model and summary statistics
        :param save_freq: Frequency at which to save model
        :param curriculum_file: Curriculum json file for environment
        :param fast_simulation: Whether to run the game at training speed
        :param load: Whether to load the model or randomly initialize
        :param train: Whether to train model, or only run inference
        :param worker_id: Number to add to communication port (5005). Used for multi-environment
        :param keep_checkpoints: How many model checkpoints to keep
        :param lesson: Start learning from this lesson
        :param seed: Random seed used for training.
        :param docker_target_name: Name of docker volume that will contain all data.
        :param trainer_config_path: Fully qualified path to location of trainer configuration file
        :param use_data_gatherer: Whether to run in data-gathering mode instead of normal training (enabled with the '--data-gatherer' flag)
        """
        ''' Here's a small change (this only happens if code is launched with the '--data-gatherer' flag) '''
        self.use_data_gatherer = use_data_gatherer

        self.trainer_config_path = trainer_config_path
        env_path = (env_path.strip().replace('.app', '').replace(
            '.exe', '').replace('.x86_64', '').replace('.x86', '')
                    )  # Strip out executable extensions if passed
        # Recognize and use docker volume if one is passed as an argument
        if docker_target_name == '':
            self.docker_training = False
            self.model_path = './models/{run_id}'.format(run_id=run_id)
            self.curriculum_file = curriculum_file
            self.summaries_dir = './summaries'
        else:
            self.docker_training = True
            self.model_path = '/{docker_target_name}/models/{run_id}'.format(
                docker_target_name=docker_target_name, run_id=run_id)
            env_path = '/{docker_target_name}/{env_name}'.format(
                docker_target_name=docker_target_name, env_name=env_path)
            if curriculum_file is None:
                self.curriculum_file = None
            else:
                self.curriculum_file = '/{docker_target_name}/{curriculum_file}'.format(
                    docker_target_name=docker_target_name,
                    curriculum_file=curriculum_file)
            self.summaries_dir = '/{docker_target_name}/summaries'.format(
                docker_target_name=docker_target_name)
        self.logger = logging.getLogger("unityagents")
        self.run_id = run_id
        self.save_freq = save_freq
        self.lesson = lesson
        self.fast_simulation = fast_simulation
        self.load_model = load
        self.train_model = train
        self.worker_id = worker_id
        self.keep_checkpoints = keep_checkpoints
        self.trainers = {}
        if seed == -1:
            seed = np.random.randint(0, 999999)
        self.seed = seed
        np.random.seed(self.seed)
        tf.set_random_seed(self.seed)
        self.env = UnityEnvironment(file_name=env_path,
                                    worker_id=self.worker_id,
                                    curriculum=self.curriculum_file,
                                    seed=self.seed,
                                    docker_training=self.docker_training)
        self.env_name = os.path.basename(
            os.path.normpath(env_path))  # Extract out name of environment

    def _get_progress(self):
        if self.curriculum_file is not None:
            progress = 0
            if self.env.curriculum.measure_type == "progress":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[
                        brain_name].get_step / self.trainers[
                            brain_name].get_max_steps
                return progress / len(self.env.external_brain_names)
            elif self.env.curriculum.measure_type == "reward":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[brain_name].get_last_reward
                return progress
            else:
                return None
        else:
            return None

    def _process_graph(self):
        nodes = []
        scopes = []
        for brain_name in self.trainers.keys():
            if self.trainers[brain_name].graph_scope is not None:
                scope = self.trainers[brain_name].graph_scope + '/'
                if scope == '/':
                    scope = ''
                scopes += [scope]
                if self.trainers[brain_name].parameters[
                        "trainer"] == "imitation":
                    nodes += [scope + x for x in ["action"]]
                elif not self.trainers[brain_name].parameters["use_recurrent"]:
                    nodes += [
                        scope + x
                        for x in ["action", "value_estimate", "action_probs"]
                    ]
                else:
                    node_list = [
                        "action", "value_estimate", "action_probs",
                        "recurrent_out", "memory_size"
                    ]
                    nodes += [scope + x for x in node_list]
        if len(scopes) > 1:
            self.logger.info("List of available scopes :")
            for scope in scopes:
                self.logger.info("\t" + scope)
        self.logger.info("List of nodes to export :")
        for n in nodes:
            self.logger.info("\t" + n)
        return nodes

    def _save_model(self, sess, saver, steps=0):
        """
        Saves current model to checkpoint folder.
        :param sess: Current Tensorflow session.
        :param steps: Current number of steps in training process.
        :param saver: Tensorflow saver for session.
        """
        last_checkpoint = self.model_path + '/model-' + str(steps) + '.cptk'
        saver.save(sess, last_checkpoint)
        tf.train.write_graph(sess.graph_def,
                             self.model_path,
                             'raw_graph_def.pb',
                             as_text=False)
        self.logger.info("Saved Model")

    def _export_graph(self):
        """
        Exports latest saved model to .bytes format for Unity embedding.
        """
        target_nodes = ','.join(self._process_graph())
        ckpt = tf.train.get_checkpoint_state(self.model_path)

        freeze_graph.freeze_graph(
            input_graph=self.model_path + '/raw_graph_def.pb',
            input_binary=True,
            input_checkpoint=ckpt.model_checkpoint_path,
            output_node_names=target_nodes,
            ######## FOLLOWING LINE IS AN UGLY FIX: only use the first 20 characters of run_id ########
            output_graph=self.model_path + '/' + self.env_name + "_" +
            self.run_id[:20] + '.bytes',
            clear_devices=True,
            initializer_nodes="",
            input_saver="",
            restore_op_name="save/restore_all",
            filename_tensor_name="save/Const:0")

    def _initialize_trainers(self, trainer_config, sess):
        trainer_parameters_dict = {}
        self.trainers = {}
        for brain_name in self.env.external_brain_names:
            trainer_parameters = trainer_config['default'].copy()
            if len(self.env.external_brain_names) > 1:
                graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name)
                trainer_parameters['graph_scope'] = graph_scope
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir,
                    name=str(self.run_id) + '_' + graph_scope)
            else:
                trainer_parameters['graph_scope'] = ''
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir, name=str(self.run_id))
            if brain_name in trainer_config:
                _brain_key = brain_name
                while not isinstance(trainer_config[_brain_key], dict):
                    _brain_key = trainer_config[_brain_key]
                for k in trainer_config[_brain_key]:
                    trainer_parameters[k] = trainer_config[_brain_key][k]
            trainer_parameters_dict[brain_name] = trainer_parameters.copy()
        for brain_name in self.env.external_brain_names:
            if trainer_parameters_dict[brain_name]['trainer'] == "imitation":
                self.trainers[brain_name] = BehavioralCloningTrainer(
                    sess, self.env, brain_name,
                    trainer_parameters_dict[brain_name], self.train_model,
                    self.seed)
            elif trainer_parameters_dict[brain_name]['trainer'] == "ppo":
                self.trainers[brain_name] = PPOTrainer(
                    sess, self.env, brain_name,
                    trainer_parameters_dict[brain_name], self.train_model,
                    self.seed, self.use_data_gatherer)
            else:
                raise UnityEnvironmentException(
                    "The trainer config contains an unknown trainer type for brain {}"
                    .format(brain_name))

    def _load_config(self):
        try:
            with open(self.trainer_config_path) as data_file:
                trainer_config = yaml.load(data_file)
                return trainer_config
        except IOError:
            raise UnityEnvironmentException(
                """Parameter file could not be found here {}.
                                            Will use default Hyper parameters"""
                .format(self.trainer_config_path))
        except UnicodeDecodeError:
            raise UnityEnvironmentException(
                "There was an error decoding Trainer Config from this path : {}"
                .format(self.trainer_config_path))

    @staticmethod
    def _create_model_path(model_path):
        try:
            if not os.path.exists(model_path):
                os.makedirs(model_path)
        except Exception:
            raise UnityEnvironmentException(
                "The folder {} containing the generated model could not be accessed."
                " Please make sure the permissions are set correctly.".format(
                    model_path))

    def start_learning(self):
        self.env.curriculum.set_lesson_number(self.lesson)
        trainer_config = self._load_config()
        self._create_model_path(self.model_path)

        tf.reset_default_graph()
        with tf.Session() as sess:
            self._initialize_trainers(trainer_config, sess)
            for k, t in self.trainers.items():
                self.logger.info(t)
            init = tf.global_variables_initializer()
            saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
            # Instantiate model parameters
            if self.load_model:
                self.logger.info('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(self.model_path)
                if ckpt is None:
                    self.logger.info(
                        'The model {0} could not be found. Make sure you specified the right '
                        '--run-id'.format(self.model_path))
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(init)
            global_step = 0  # This is only for saving the model
            self.env.curriculum.increment_lesson(self._get_progress())
            curr_info = self.env.reset(train_mode=self.fast_simulation)
            if self.train_model:
                for brain_name, trainer in self.trainers.items():
                    trainer.write_tensorboard_text('Hyperparameters',
                                                   trainer.parameters)
            try:
                while any([
                        t.get_step <= t.get_max_steps
                        for k, t in self.trainers.items()
                ]) or not self.train_model:
                    if debug_print:
                        print("|", end='', flush=True)
                    if self.env.global_done:
                        self.env.curriculum.increment_lesson(
                            self._get_progress())
                        curr_info = self.env.reset(
                            train_mode=self.fast_simulation)
                        for brain_name, trainer in self.trainers.items():
                            trainer.end_episode()
                    if data_gatherer['reset_after_each_frame']:
                        curr_info = self.env.reset(
                            train_mode=self.fast_simulation)

                    # Decide and take an action
                    take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {}
                    for brain_name, trainer in self.trainers.items():
                        (take_action_vector[brain_name],
                         take_action_memories[brain_name],
                         take_action_text[brain_name],
                         take_action_outputs[brain_name]
                         ) = trainer.take_action(curr_info)
                    new_info = self.env.step(vector_action=take_action_vector,
                                             memory=take_action_memories,
                                             text_action=take_action_text)
                    ''' ----- '''
                    ''' Enabling data gathering disables the normal functionality.... '''
                    if self.use_data_gatherer:
                        if data_gatherer['firstRun']:
                            print("---")
                            print("NORMAL FUNCTIONALITY DISABLED!")
                            print(
                                "Now we just sample stats from the initial distribution and save them:"
                            )
                            print("Save dir: {}".format(data_gatherer['dir']))
                            print("---")
                            print(
                                "If you did not expect to see this, NOW is the time to [ctrl-C]! (otherwise: [enter] to continue...)"
                            )
                            ''' Create the folder-structure if it is needed: '''
                            paths = [
                                settings['dir_base'],
                                settings['dir_base'] + settings['project'],
                                data_gatherer['dir']
                            ]
                            for p in paths:
                                if not os.path.isdir(p):
                                    os.makedirs(p)
                                    print("Created path: {}".format(p))
                                else:
                                    print("Reusing existing: {}".format(p))
                            ape = input()
                            data_gatherer['firstRun'] = False

                        #if data_gatherer['reset_after_each_frame']:
                        #    curr_info = self.env.reset(train_mode=self.fast_simulation)
                        #    take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {}
                        #    for brain_name, trainer in self.trainers.items():
                        #        (take_action_vector[brain_name],
                        #        take_action_memories[brain_name],
                        #        take_action_text[brain_name],
                        #        take_action_outputs[brain_name]) = trainer.take_action(curr_info)
                        #    new_info = self.env.step(vector_action=take_action_vector, memory=take_action_memories,
                        #                         text_action=take_action_text)

                        is_done = False
                        for x in new_info:
                            for l in range(len(new_info[x].agents)):
                                is_done = is_done or new_info[x].local_done[l]

                        if data_gatherer['idx'] == data_gatherer[
                                'n'] or is_done:
                            #WRITE_TO_FILE....
                            print("Saving chunk {}... ({} samples)".format(
                                data_gatherer['n_chunks'],
                                data_gatherer['idx']))
                            with open(
                                    data_gatherer['dir'] +
                                    data_gatherer['file_base'] +
                                    "chunk{}.pkl".format(
                                        str(data_gatherer['n_chunks']).zfill(
                                            5)), 'wb') as outfile:
                                pickle.dump(
                                    data_gatherer['data']
                                    [:data_gatherer['idx'], :, :, :].reshape(
                                        (-1, ) + data_gatherer['obs_size']),
                                    outfile, pickle.HIGHEST_PROTOCOL)
                            #Prep next:
                            data_gatherer['n_chunks'] += 1
                            data_gatherer['data'] = np.empty(
                                data_gatherer['size'], dtype=np.uint8)
                            data_gatherer['idx'] = 0

                            if data_gatherer['n_chunks'] == 1500:
                                print("Total samples gathered: {}".format(
                                    (data_gatherer['n_chunks'] - 1000) * 1000))
                                exit()
                        data_gatherer['data'][
                            data_gatherer['idx'], :, :, :] = (
                                255 *
                                new_info["PepperBrain"].visual_observations[0]
                            ).astype(np.uint8)
                        data_gatherer['idx'] += 1

                        if data_gatherer['reset_after_each_frame']:
                            continue
                    ''' ----- '''

                    if settings['store_as_int']:
                        for key in new_info:
                            for x in range(
                                    len(new_info[key].visual_observations)):
                                new_info[key].visual_observations[x] = (
                                    255 * new_info[key].visual_observations[x]
                                ).astype(np.uint8)

                    for brain_name, trainer in self.trainers.items():
                        if debug_print:
                            print(".", end='', flush=True)
                        trainer.add_experiences(
                            curr_info, new_info,
                            take_action_outputs[brain_name])
                        trainer.process_experiences(curr_info, new_info)
                        if trainer.is_ready_update(
                        ) and self.train_model and trainer.get_step <= trainer.get_max_steps:
                            if debug_print:
                                print("!", end='', flush=True)
                            # Perform gradient descent with experience buffer
                            print("Updating model... ", end='', flush=True)
                            t = time.time()
                            trainer.update_model()
                            print("[x] Done in {} seconds.".format(
                                time.time() - t))
                        # Write training statistics to Tensorboard.
                        if debug_print:
                            print(",", end='', flush=True)
                        trainer.write_summary(
                            self.env.curriculum.lesson_number)
                        if self.train_model and trainer.get_step <= trainer.get_max_steps:
                            if debug_print:
                                print("?", end='', flush=True)
                            trainer.increment_step()
                            trainer.update_last_reward()
                    if self.train_model and trainer.get_step <= trainer.get_max_steps:
                        global_step += 1
                    if global_step % self.save_freq == 0 and global_step != 0 and self.train_model:
                        if debug_print:
                            print("x", end='', flush=True)
                        # Save Tensorflow model
                        self._save_model(sess, steps=global_step, saver=saver)
                    curr_info = new_info
                # Final save Tensorflow model
                if global_step != 0 and self.train_model:
                    self._save_model(sess, steps=global_step, saver=saver)
            except KeyboardInterrupt:
                if self.train_model:
                    self.logger.info(
                        "Learning was interrupted. Please wait while the graph is generated."
                    )
                    self._save_model(sess, steps=global_step, saver=saver)
                pass
        self.env.close()
        if self.train_model:
            self._export_graph()
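For context, the controller above is normally constructed from command-line arguments. A hedged construction sketch with illustrative values; the env_path '3DBall' and the trainer config file name are assumptions, and the remaining arguments simply mirror the __init__ signature shown above.

controller = TrainerController(
    env_path='3DBall',                 # illustrative Unity build name
    run_id='run-0',
    save_freq=50000,
    curriculum_file=None,
    fast_simulation=True,
    load=False,
    train=True,
    worker_id=0,
    keep_checkpoints=5,
    lesson=0,
    seed=-1,                           # -1 -> a random seed is drawn in __init__
    docker_target_name='',
    trainer_config_path='trainer_config.yaml',
    use_data_gatherer=False)
controller.start_learning()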
def main():

    seeding()

    number_of_episodes = 20000
    episode_length = 1000
    batchsize = 256
    save_interval = 1000
    rewards_deque = deque(maxlen=100)
    rewards_all = []
    noise = 1.0
    noise_reduction = 1.0

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)
    """ Info about the UnityEnvironment
    brain_name: 'TennisBrain'
    brain: ['brain_name', 'camera_resolutions',
           'num_stacked_vector_observations', 'number_visual_observations',
           'vector_action_descriptions', 'vector_action_space_size',
           'vector_action_space_type', 'vector_observation_space_size',
           'vector_observation_space_type']]
    """

    env = UnityEnvironment(file_name="Tennis.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    buffer = ReplayBuffer(int(1e5))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)

    # ------------------------------ training ------------------------------ #
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    for episode in range(1, number_of_episodes + 1):

        timer.update(episode)
        rewards_this_episode = np.zeros((2, ))
        """ Info about the UnityEnvironment
        env_info: ['agents', 'local_done', 'max_reached', 'memories',
                  'previous_text_actions', 'previous_vector_actions', 'rewards',
                  'text_observations', 'vector_observations', 'visual_observations']
        actions: List(num_agents=2, action_size=2)
        states: List((24,), (24,))
        rewards: List(2,)
        dones: List(2,)
        """
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations

        for episode_t in range(episode_length):
            # reset the OUNoise for each agent.
            for i in range(2):
                maddpg.maddpg_agent[i].noise.reset()

            actions = maddpg.act(states, noise=noise)
            env_info = env.step(actions)[brain_name]
            noise *= noise_reduction

            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            # add data to buffer
            transition = (states, actions, rewards, next_states, dones)
            buffer.push(transition)

            rewards_this_episode += rewards

            states = next_states

            if any(dones):
                break

        # update the local and target network
        if len(buffer) > batchsize:
            # update the local network
            for _ in range(5):
                for a_i in range(2):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i, logger)
            # soft update the target network
            maddpg.update_targets()

        rewards_all.append(rewards_this_episode)
        rewards_deque.append(np.max(rewards_this_episode))
        average_score = np.mean(rewards_deque)

        # --------------------- Logging for TensorBoard --------------------- #
        logger.add_scalars('rewards', {
            'agent0': rewards_this_episode[0],
            'agent1': rewards_this_episode[1]
        }, episode)
        logger.add_scalars('global', {
            'score': np.max(rewards_this_episode),
            'average_score': average_score
        }, episode)
        # -------------------------- Save the model -------------------------- #
        save_dict_list = []

        if episode % save_interval == 0 or average_score >= 0.5:
            for i in range(2):
                save_dict = \
                    {'actor_params' : maddpg.maddpg_agent[i].actor.state_dict(),
                     'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                     'critic_params' : maddpg.maddpg_agent[i].critic.state_dict(),
                     'critic_optim_params' : maddpg.maddpg_agent[i].critic_optimizer.state_dict()}
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            if average_score >= 3.0:
                print('\nEnvironment solved in {} episodes!'.format(episode -
                                                                    100))
                print('\nAverage Score: {:.2f}'.format(average_score))
                break

    env.close()
    logger.close()
    timer.finish()
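
The MADDPG example above assumes a ReplayBuffer exposing push, sample and __len__, but the
class itself is not shown. A minimal deque-based sketch matching that interface (the storage
and sampling format are assumptions, not the original implementation):

import random
from collections import deque


class ReplayBuffer:
    """Fixed-size buffer of (states, actions, rewards, next_states, dones) transitions."""

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, transition):
        # store one full multi-agent transition tuple
        self.memory.append(transition)

    def sample(self, batchsize):
        # return a list of `batchsize` uniformly sampled transitions
        return random.sample(self.memory, batchsize)

    def __len__(self):
        return len(self.memory)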
Example #32
class UnityEnvHelper:

    # constructor - give file_name of agent environment

    def __init__(self, file_name, no_graphics=True, seed=8888):

        self.seed = seed
        self.uenv = UnityEnvironment(file_name=file_name,
                                     seed=self.seed,
                                     no_graphics=no_graphics)

        # use the first brain as the default brain

        self.brain_name = self.uenv.brain_names[0]
        self.brain = self.uenv.brains[self.brain_name]

        # get the action space size

        self.action_size = self.brain.vector_action_space_size

        # reset the environment, in training mode

        self.reset(True)

        # get the state space size
        self.state_size = len(self.ue_info.vector_observations[0])

    def __del__(self):

        # make sure we close the environment
        try:
            self.uenv.close()
            del self.uenv
        except:
            pass

    def reset(self, train_mode=True):

        # tell the unity agent to restart an episode
        # training mode simply seems to run the simulation at full speed
        self.ue_info = self.uenv.reset(train_mode=train_mode)[self.brain_name]

    # we pass in current state for convenience
    def step(self, state, action):

        # perform action on environment  and get observation
        self.ue_info = self.uenv.step(action)[self.brain_name]
        # return the state, action, next state, reward and done flag as a dict
        return {
            'state': state,
            'action': action,
            'reward': self.reward(),
            'next_state': self.state(),
            'done': self.done()
        }

    def state(self):
        # just last observation state
        return self.ue_info.vector_observations[0]

    def reward(self):
        # return reward from last observation
        return self.ue_info.rewards[0]

    def done(self):
        # return done flag
        return self.ue_info.local_done[0]
        state = next_state
        score += reward
        scores.append(score)
        print('\rStep {}\tScore: {}'.format(i, score), end="")
        time.sleep(0.25)
        if done:
            break
    # plot the scores of a trained agent
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()

    return scores


if __name__ == "__main__":
    # set up environment
    curr_path = os.getcwd()
    my_env = UnityEnvironment(
        file_name=os.path.join(curr_path, "Banana_Windows_x86_64/Banana.exe"))

    my_agent_gamma = Agent(gamma=0.99, tau=1e-3)

    my_scores = ddqnper(agent=my_agent_gamma, env=my_env, n_episodes=2000)
    # scores = demonstrate_agent(env=my_env, model_path=os.path.join(curr_path, "checkpoint.pth"))

    my_env.close()
Example #34
  per agent to be retrieved at the next step.
- value is an optional input that can be used to send a single float per agent
  to be displayed if an AgentMonitor.cs component is attached to the agent.
If you have more than one brain, use a dict with one action entry per brain:
action = {'brain1': [1.0, 2.0], 'brain2': [3.0, 4.0]}
'''
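
As a hedged illustration of the multi-brain case described above (the brain names 'brain1'
and 'brain2' are hypothetical):

# env.step accepts a dict of actions keyed by brain name and returns a dict of BrainInfo
all_info = env.step({'brain1': [1.0, 2.0], 'brain2': [3.0, 4.0]})
print(all_info['brain1'].rewards[0], all_info['brain2'].rewards[0])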


for epi in range(10):
    # env.global_done could be used to check whether all agents are done
    env_info = env.reset(train_mode=train_mode)[default_brain]
    state = env_info.states[0]
    done = False
    epi_rewards = 0
    while not done:
        if brain.action_space_type == 'discrete':
            action = np.random.randint(
                0, brain.action_space_size, size=(len(env_info.agents)))
        else:
            action = np.random.randn(
                len(env_info.agents), brain.action_space_size)
        env_info = env.step(action)[default_brain]
        state = env_info.states[0]
        epi_rewards += env_info.rewards[0]
        done = env_info.local_done[0]
    print('Total reward for this episode: {}'.format(epi_rewards))


env.close()
print('Environment is closed')
class TrainerController(object):
    def __init__(self, env_path, run_id, save_freq, curriculum_file, fast_simulation, load, train,
                 worker_id, keep_checkpoints, lesson, seed, docker_target_name, trainer_config_path):
        """

        :param env_path: Location to the environment executable to be loaded.
        :param run_id: The sub-directory name for model and summary statistics
        :param save_freq: Frequency at which to save model
        :param curriculum_file: Curriculum json file for environment
        :param fast_simulation: Whether to run the game at training speed
        :param load: Whether to load the model or randomly initialize
        :param train: Whether to train model, or only run inference
        :param worker_id: Number to add to communication port (5005). Used for multi-environment
        :param keep_checkpoints: How many model checkpoints to keep
        :param lesson: Start learning from this lesson
        :param seed: Random seed used for training.
        :param docker_target_name: Name of docker volume that will contain all data.
        :param trainer_config_path: Fully qualified path to location of trainer configuration file
        """
        self.trainer_config_path = trainer_config_path
        env_path = (env_path.strip()
                    .replace('.app', '')
                    .replace('.exe', '')
                    .replace('.x86_64', '')
                    .replace('.x86', ''))  # Strip out executable extensions if passed
        # Recognize and use docker volume if one is passed as an argument
        if docker_target_name == '':
            self.model_path = './models/{run_id}'.format(run_id=run_id)
            self.curriculum_file = curriculum_file
            self.summaries_dir = './summaries'
        else:
            self.model_path = '/{docker_target_name}/models/{run_id}'.format(
                docker_target_name=docker_target_name,
                run_id=run_id)
            env_path = '/{docker_target_name}/{env_name}'.format(docker_target_name=docker_target_name,
                                                                 env_name=env_path)
            if curriculum_file is None:
                self.curriculum_file = None
            else:
                self.curriculum_file = '/{docker_target_name}/{curriculum_file}'.format(
                    docker_target_name=docker_target_name,
                    curriculum_file=curriculum_file)
            self.summaries_dir = '/{docker_target_name}/summaries'.format(docker_target_name=docker_target_name)
        self.logger = logging.getLogger("unityagents")
        self.run_id = run_id
        self.save_freq = save_freq
        self.lesson = lesson
        self.fast_simulation = fast_simulation
        self.load_model = load
        self.train_model = train
        self.worker_id = worker_id
        self.keep_checkpoints = keep_checkpoints
        self.trainers = {}
        if seed == -1:
            seed = np.random.randint(0, 999999)
        self.seed = seed
        np.random.seed(self.seed)
        tf.set_random_seed(self.seed)
        self.env = UnityEnvironment(file_name=env_path, worker_id=self.worker_id,
                                    curriculum=self.curriculum_file, seed=self.seed)
        self.env_name = os.path.basename(os.path.normpath(env_path))  # Extract out name of environment

    def _get_progress(self):
        if self.curriculum_file is not None:
            progress = 0
            if self.env.curriculum.measure_type == "progress":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[brain_name].get_step / self.trainers[brain_name].get_max_steps
                return progress / len(self.env.external_brain_names)
            elif self.env.curriculum.measure_type == "reward":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[brain_name].get_last_reward
                return progress
            else:
                return None
        else:
            return None

    def _process_graph(self):
        nodes = []
        scopes = []
        for brain_name in self.trainers.keys():
            if self.trainers[brain_name].graph_scope is not None:
                scope = self.trainers[brain_name].graph_scope + '/'
                if scope == '/':
                    scope = ''
                scopes += [scope]
                if self.trainers[brain_name].parameters["trainer"] == "imitation":
                    nodes += [scope + x for x in ["action"]]
                elif not self.trainers[brain_name].parameters["use_recurrent"]:
                    nodes += [scope + x for x in ["action", "value_estimate", "action_probs"]]
                else:
                    node_list = ["action", "value_estimate", "action_probs", "recurrent_out", "memory_size"]
                    nodes += [scope + x for x in node_list]
        if len(scopes) > 1:
            self.logger.info("List of available scopes :")
            for scope in scopes:
                self.logger.info("\t" + scope)
        self.logger.info("List of nodes to export :")
        for n in nodes:
            self.logger.info("\t" + n)
        return nodes

    def _save_model(self, sess, saver, steps=0):
        """
        Saves current model to checkpoint folder.
        :param sess: Current Tensorflow session.
        :param steps: Current number of steps in training process.
        :param saver: Tensorflow saver for session.
        """
        last_checkpoint = self.model_path + '/model-' + str(steps) + '.cptk'
        saver.save(sess, last_checkpoint)
        tf.train.write_graph(sess.graph_def, self.model_path, 'raw_graph_def.pb', as_text=False)
        self.logger.info("Saved Model")

    def _export_graph(self):
        """
        Exports latest saved model to .bytes format for Unity embedding.
        """
        target_nodes = ','.join(self._process_graph())
        ckpt = tf.train.get_checkpoint_state(self.model_path)
        freeze_graph.freeze_graph(input_graph=self.model_path + '/raw_graph_def.pb',
                                  input_binary=True,
                                  input_checkpoint=ckpt.model_checkpoint_path,
                                  output_node_names=target_nodes,
                                  output_graph=self.model_path + '/' + self.env_name + "_" + self.run_id + '.bytes',
                                  clear_devices=True, initializer_nodes="", input_saver="",
                                  restore_op_name="save/restore_all", filename_tensor_name="save/Const:0")

    def _initialize_trainers(self, trainer_config, sess):
        self.trainer_parameters_dict = {}
        self.trainers = {}
        for brain_name in self.env.external_brain_names:
            trainer_parameters = trainer_config['default'].copy()
            if len(self.env.external_brain_names) > 1:
                graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name)
                trainer_parameters['graph_scope'] = graph_scope
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir,
                    name=str(self.run_id) + '_' + graph_scope)
            else:
                trainer_parameters['graph_scope'] = ''
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir,
                    name=str(self.run_id))
            if brain_name in trainer_config:
                _brain_key = brain_name
                while not isinstance(trainer_config[_brain_key], dict):
                    _brain_key = trainer_config[_brain_key]
                for k in trainer_config[_brain_key]:
                    trainer_parameters[k] = trainer_config[_brain_key][k]
            self.trainer_parameters_dict[brain_name] = trainer_parameters.copy()
        for brain_name in self.env.external_brain_names:
            if self.trainer_parameters_dict[brain_name]['trainer'] == "imitation":
                self.trainers[brain_name] = BehavioralCloningTrainer(sess, self.env, brain_name,
                                                                     self.trainer_parameters_dict[brain_name],
                                                                     self.train_model, self.seed)
            elif self.trainer_parameters_dict[brain_name]['trainer'] == "ppo":
                self.trainers[brain_name] = PPOTrainer(sess, self.env, brain_name, self.trainer_parameters_dict[brain_name],
                                                       self.train_model, self.seed)
            elif self.trainer_parameters_dict[brain_name]['trainer'] == "dqn":
                self.trainers[brain_name] = DQNTrainer(sess, self.env, brain_name, self.trainer_parameters_dict[brain_name],
                                                       self.train_model, self.seed)
            elif self.trainer_parameters_dict[brain_name]['trainer'] == "madqn":
                self.trainers[brain_name] = MADQNTrainer(sess, self.env, brain_name, self.trainer_parameters_dict[brain_name],
                                                       self.train_model, self.seed)
            elif self.trainer_parameters_dict[brain_name]['trainer'] == "mappo":
                self.trainers[brain_name] = MAPPOTrainer(sess, self.env, brain_name, self.trainer_parameters_dict[brain_name],
                                                       self.train_model, self.seed)
            elif self.trainer_parameters_dict[brain_name]['trainer'] == "coma":
                self.trainers[brain_name] = COMATrainer(sess, self.env, brain_name, self.trainer_parameters_dict[brain_name],
                                                       self.train_model, self.seed)
            else:
                raise UnityEnvironmentException("The trainer config contains an unknown trainer type for brain {}"
                                                .format(brain_name))

        all_vars = tf.trainable_variables()
        self.brain_vars = {}
        total_vars = len(all_vars)
        idx1 = 0
        idx2 = int(total_vars/len(self.env.external_brain_names))
        for brain_name in self.env.external_brain_names:
            self.brain_vars[brain_name] = all_vars[idx1:idx2]
            idx1 = idx2
            idx2 = idx2 + int(total_vars / len(self.env.external_brain_names))  # advance to the next brain's variable slice
            if (self.trainer_parameters_dict[brain_name]['trainer'] == "dqn" or
            self.trainer_parameters_dict[brain_name]['trainer'] == "madqn"):
                self.trainers[brain_name].update_target_graph(self.brain_vars[brain_name])
                if self.trainer_parameters_dict[brain_name]['trainer'] == "madqn":
                    if not self.trainers[brain_name].parameters['frozen']:
                        self.free_brain_vars = self.brain_vars[brain_name]
        for brain_name in self.env.external_brain_names:
            if self.trainer_parameters_dict[brain_name]['trainer'] == "madqn":
                if self.trainers[brain_name].parameters['frozen']:
                    self.trainers[brain_name].update_frozen_brain_graph(self.brain_vars[brain_name], self.free_brain_vars)

    def _load_config(self):
        try:
            with open(self.trainer_config_path) as data_file:
                trainer_config = yaml.load(data_file)
                return trainer_config
        except IOError:
            raise UnityEnvironmentException("""Parameter file could not be found here {}.
                                            Will use default Hyper parameters"""
                                            .format(self.trainer_config_path))
        except UnicodeDecodeError:
            raise UnityEnvironmentException("There was an error decoding Trainer Config from this path : {}"
                                            .format(self.trainer_config_path))

    @staticmethod
    def _create_model_path(model_path):
        try:
            if not os.path.exists(model_path):
                os.makedirs(model_path)
        except Exception:
            raise UnityEnvironmentException("The folder {} containing the generated model could not be accessed."
                                            " Please make sure the permissions are set correctly."
                                            .format(model_path))

    def start_learning(self):
        self.env.curriculum.set_lesson_number(self.lesson)
        trainer_config = self._load_config()
        self._create_model_path(self.model_path)

        tf.reset_default_graph()

        with tf.Session() as sess:
            self._initialize_trainers(trainer_config, sess)
            for k, t in self.trainers.items():
                self.logger.info(t)
            init = tf.global_variables_initializer()
            saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
            # Instantiate model parameters
            if self.load_model:
                self.logger.info('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(self.model_path)
                if ckpt is None:
                    self.logger.info('The model {0} could not be found. Make sure you specified the right '
                                     '--run-id'.format(self.model_path))
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(init)
            global_step = 0  # This is only for saving the model
            self.env.curriculum.increment_lesson(self._get_progress())
            curr_info = self.env.reset(train_mode=self.fast_simulation)
            if self.train_model:
                for brain_name, trainer in self.trainers.items():
                    trainer.write_tensorboard_text('Hyperparameters', trainer.parameters)
            try:
                while any([t.get_step <= t.get_max_steps for k, t in self.trainers.items()]) or not self.train_model:
                    if self.env.global_done:
                        #self.env.curriculum.increment_lesson(self._get_progress())
                        curr_info = self.env.reset(train_mode=self.fast_simulation)
                        for brain_name, trainer in self.trainers.items():
                            trainer.end_episode()
                    # Decide and take an action
                    take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {}

                    for brain_name, trainer in self.trainers.items():
                        (take_action_vector[brain_name],
                         take_action_memories[brain_name],
                         take_action_text[brain_name],
                         take_action_outputs[brain_name]) = trainer.take_action(curr_info)
                    new_info = self.env.step(vector_action=take_action_vector, memory=take_action_memories,
                                             text_action=take_action_text)
                    for brain_name, trainer in self.trainers.items():
                        if self.trainer_parameters_dict[brain_name]['trainer'] == "mappo":
                            trainer.add_experiences(curr_info, new_info, take_action_outputs[brain_name], take_action_vector)
                        else:
                            trainer.add_experiences(curr_info, new_info, take_action_outputs[brain_name])
                    curr_info = new_info
                    for brain_name, trainer in self.trainers.items():
                        if self.trainer_parameters_dict[brain_name]['trainer'] == "mappo":
                            take_action_vector[brain_name] = trainer.simulate_action(curr_info)
                    for brain_name, trainer in self.trainers.items():
                        if self.trainer_parameters_dict[brain_name]['trainer'] == "mappo":
                            trainer.process_experiences(curr_info, take_action_vector)
                        else:
                            trainer.process_experiences(curr_info)
                        step = trainer.get_step
                        max_steps = trainer.get_max_steps
                        if trainer.is_ready_update() and self.train_model and step <= max_steps:
                            # Perform gradient descent with experience buffer
                            trainer.update_model()
                        # Write training statistics to tensorboard.
                        trainer.write_summary(self.env.curriculum.lesson_number)
                        if self.train_model and step <= max_steps:
                            trainer.increment_step()
                            trainer.update_last_reward()
                    if self.train_model and step <= max_steps:
                        global_step += 1
                    if global_step % self.save_freq == 0 and global_step != 0 and self.train_model:
                        # Save Tensorflow model
                        self._save_model(sess, steps=global_step, saver=saver)

                # Final save Tensorflow model
                if global_step != 0 and self.train_model:
                    self._save_model(sess,  steps=global_step, saver=saver)
            except KeyboardInterrupt:
                if self.train_model:
                    self.logger.info("Learning was interrupted. Please wait while the graph is generated.")
                    self._save_model(sess, steps=global_step, saver=saver)
                pass
        self.env.close()
        if self.train_model:
            self._export_graph()
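
A hedged sketch of how this TrainerController might be driven; the paths, run id and numeric
values below are hypothetical and not taken from the original:

if __name__ == '__main__':
    # hypothetical arguments for illustration only
    tc = TrainerController(env_path='envs/3DBall', run_id='run-0', save_freq=50000,
                           curriculum_file=None, fast_simulation=True, load=False, train=True,
                           worker_id=0, keep_checkpoints=5, lesson=0, seed=-1,
                           docker_target_name='', trainer_config_path='trainer_config.yaml')
    tc.start_learning()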
Example #36
def main():
    env = UnityEnvironment(file_name="./Tennis_Linux/Tennis.x86_64")
    print_env_info(env)
    random_play(env)
    env.close()
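
print_env_info and random_play are not included in this example. A minimal sketch of what
they might look like, assuming numpy is imported as np and a continuous action space whose
size is a plain int (both are assumptions, not the original helpers):

def print_env_info(env):
    # print basic details for the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    info = env.reset(train_mode=False)[brain_name]
    print('Brain:', brain_name)
    print('Number of agents:', len(info.agents))
    print('Action size:', brain.vector_action_space_size)
    print('State size:', info.vector_observations.shape[1])


def random_play(env, episodes=3):
    # act randomly for a few episodes and report the best score over agents
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    for _ in range(episodes):
        info = env.reset(train_mode=False)[brain_name]
        scores = np.zeros(len(info.agents))
        while True:
            actions = np.random.randn(len(info.agents), brain.vector_action_space_size)
            actions = np.clip(actions, -1, 1)
            info = env.step(actions)[brain_name]
            scores += info.rewards
            if np.any(info.local_done):
                break
        print('Episode score (max over agents): {:.3f}'.format(np.max(scores)))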
Example #37
def main(args):

    if args.deterministic:
        set_seed(42)

    env = UnityEnvironment(file_name=args.env_path,
                           no_graphics=args.no_graphics)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents in the environment
    print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    state = env_info.vector_observations[0]
    print('States look like:', state)
    state_size = len(state)
    print('States have length:', state_size)

    agent = Agent(state_size,
                  action_size,
                  train=True,
                  device=args.device,
                  buffer_size=args.buffer_size,
                  batch_size=args.batch_size,
                  lr=args.lr,
                  gamma=args.gamma,
                  tau=args.tau,
                  update_freq=args.update_freq,
                  nb_updates=args.nb_updates,
                  noise_mean=args.noise_mean,
                  noise_theta=args.noise_theta,
                  noise_sigma=args.noise_sigma,
                  eps=args.eps,
                  eps_decay=args.eps_decay,
                  grad_clip=args.grad_clip)

    scores = train_agent(agent, env, n_episodes=args.episodes)

    output_folder = Path(args.output)
    if not output_folder.is_dir():
        output_folder.mkdir(parents=True)
    # Save model
    torch.save(agent.actor_local.state_dict(),
               output_folder.joinpath('actor_model.pt'))
    torch.save(agent.critic_local.state_dict(),
               output_folder.joinpath('critic_model.pt'))

    env.close()

    # Plot results
    fig = plt.figure()
    plot_scores(scores,
                running_window_size=100,
                success_thresh=args.success_threshold)
    fig.savefig(output_folder.joinpath('training_scores.png'),
                transparent=True)
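
train_agent and plot_scores are not shown in this example. A hedged sketch of a single-agent
training loop, assuming the Agent exposes reset(), act(state) and
step(state, action, reward, next_state, done) (that interface is an assumption, not the
original):

def train_agent(agent, env, n_episodes=2000, max_t=1000):
    brain_name = env.brain_names[0]
    scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        agent.reset()
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        print('\rEpisode {}\tScore: {:.2f}'.format(i_episode, score), end='')
    return scores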
Example #38
class UnityEnv(gym.Env):
    def __init__(self, app_name=None, idx=0):
        # parameter
        app_path = os.path.join(os.path.dirname(__file__), 'assets', app_name)
        idx = idx
        no_graphics = False
        self.num_envs = 1

        # create environment
        self._env = UnityEnvironment(file_name=app_path,
                                     worker_id=idx,
                                     no_graphics=no_graphics)
        self.name = app_name

        # Only Accept Environment with Only One Brain
        assert len(self._env.brains) == 1
        self.brain_name = self._env.external_brain_names[0]
        self.brain = self._env.brains[self.brain_name]

        # visualization
        self.use_visual = (self.brain.number_visual_observations == 1)

        # action space dimension
        if self.brain.vector_action_space_type == "discrete":
            self._a_dim = Discrete(1)
        else:
            high = np.array([np.inf] * (self.brain.vector_action_space_size))
            self._a_dim = Box(-high, high)

        # observation space dimension
        # note: the literal False below keeps the visual-observation branch disabled
        if self.use_visual and False and no_graphics:
            high = np.array([np.inf] *
                            self.brain.camera_resolutions[0]["height"] *
                            self.brain.camera_resolutions[0]["width"] * 3)
            self._ob_dim = Box(-high, high)
        else:
            high = np.array([np.inf] *
                            self.brain.vector_observation_space_size)
            self._ob_dim = Box(-high, high)

        # video buffer
        self.frames = []

    def reset(self):
        self.frames = []
        info = self._env.reset()[self.brain_name]
        state = info.vector_observations[0]
        return np.array([state])

    def step(self, action):
        info = self._env.step([action])[self.brain_name]

        state = info.vector_observations[0]
        reward = info.rewards[0]
        done = info.local_done[0]

        self._collect_frames(info.visual_observations[0])
        return (np.array([state]), np.array([reward]),
                np.array([done]), np.array([None]))

    def close(self):
        self._env.close()

    def _collect_frames(self, frame):
        if self.use_visual:
            self.frames.append(frame)

    @property
    def action_space(self):
        return self._a_dim

    @property
    def observation_space(self):
        return self._ob_dim
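
A short usage sketch for the UnityEnv wrapper above (the app name is hypothetical and must
exist under the wrapper's assets directory):

# hypothetical build name placed under ./assets next to this module
env = UnityEnv(app_name='3DBall.x86_64', idx=0)
state = env.reset()
for _ in range(100):
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    if done[0]:
        state = env.reset()
env.close()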
class TrainerController(object):
    def __init__(self, env_path, run_id, save_freq, curriculum_file, fast_simulation, load, train,
                 worker_id, keep_checkpoints, lesson, seed, docker_target_name, trainer_config_path,
                 no_graphics):
        """
        :param env_path: Location to the environment executable to be loaded.
        :param run_id: The sub-directory name for model and summary statistics
        :param save_freq: Frequency at which to save model
        :param curriculum_file: Curriculum json file for environment
        :param fast_simulation: Whether to run the game at training speed
        :param load: Whether to load the model or randomly initialize
        :param train: Whether to train model, or only run inference
        :param worker_id: Number to add to communication port (5005). Used for multi-environment
        :param keep_checkpoints: How many model checkpoints to keep
        :param lesson: Start learning from this lesson
        :param seed: Random seed used for training.
        :param docker_target_name: Name of docker volume that will contain all data.
        :param trainer_config_path: Fully qualified path to location of trainer configuration file
        :param no_graphics: Whether to run the Unity simulator in no-graphics mode
        """
        self.trainer_config_path = trainer_config_path
        if env_path is not None:
            env_path = (env_path.strip()
                        .replace('.app', '')
                        .replace('.exe', '')
                        .replace('.x86_64', '')
                        .replace('.x86', ''))  # Strip out executable extensions if passed
        # Recognize and use docker volume if one is passed as an argument
        if docker_target_name == '':
            self.docker_training = False
            self.model_path = './models/{run_id}'.format(run_id=run_id)
            self.curriculum_file = curriculum_file
            self.summaries_dir = './summaries'
        else:
            self.docker_training = True
            self.model_path = '/{docker_target_name}/models/{run_id}'.format(
                docker_target_name=docker_target_name,
                run_id=run_id)
            if env_path is not None:
                env_path = '/{docker_target_name}/{env_name}'.format(docker_target_name=docker_target_name,
                                                                     env_name=env_path)
            if curriculum_file is None:
                self.curriculum_file = None
            else:
                self.curriculum_file = '/{docker_target_name}/{curriculum_file}'.format(
                    docker_target_name=docker_target_name,
                    curriculum_file=curriculum_file)
            self.summaries_dir = '/{docker_target_name}/summaries'.format(docker_target_name=docker_target_name)
        self.logger = logging.getLogger("unityagents")
        self.run_id = run_id
        self.save_freq = save_freq
        self.lesson = lesson
        self.fast_simulation = fast_simulation
        self.load_model = load
        self.train_model = train
        self.worker_id = worker_id
        self.keep_checkpoints = keep_checkpoints
        self.trainers = {}
        if seed == -1:
            seed = np.random.randint(0, 999999)
        self.seed = seed
        np.random.seed(self.seed)
        tf.set_random_seed(self.seed)
        self.env = UnityEnvironment(file_name=env_path, worker_id=self.worker_id,
                                    curriculum=self.curriculum_file, seed=self.seed,
                                    docker_training=self.docker_training,
                                    no_graphics=no_graphics)
        if env_path is None:
            self.env_name = 'editor_'+self.env.academy_name
        else:
            self.env_name = os.path.basename(os.path.normpath(env_path))  # Extract out name of environment

    def _get_progress(self):
        if self.curriculum_file is not None:
            progress = 0
            if self.env.curriculum.measure_type == "progress":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[brain_name].get_step / self.trainers[brain_name].get_max_steps
                return progress / len(self.env.external_brain_names)
            elif self.env.curriculum.measure_type == "reward":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[brain_name].get_last_reward
                return progress
            else:
                return None
        else:
            return None

    def _process_graph(self):
        nodes = []
        scopes = []
        for brain_name in self.trainers.keys():
            if self.trainers[brain_name].graph_scope is not None:
                scope = self.trainers[brain_name].graph_scope + '/'
                if scope == '/':
                    scope = ''
                scopes += [scope]
                if self.trainers[brain_name].parameters["trainer"] == "imitation":
                    nodes += [scope + x for x in ["action"]]
                else:
                    nodes += [scope + x for x in ["action", "value_estimate", "action_probs"]]
                if self.trainers[brain_name].parameters["use_recurrent"]:
                    nodes += [scope + x for x in ["recurrent_out", "memory_size"]]
        if len(scopes) > 1:
            self.logger.info("List of available scopes :")
            for scope in scopes:
                self.logger.info("\t" + scope)
        self.logger.info("List of nodes to export :")
        for n in nodes:
            self.logger.info("\t" + n)
        return nodes

    def _save_model(self, sess, saver, steps=0):
        """
        Saves current model to checkpoint folder.
        :param sess: Current Tensorflow session.
        :param steps: Current number of steps in training process.
        :param saver: Tensorflow saver for session.
        """
        last_checkpoint = self.model_path + '/model-' + str(steps) + '.cptk'
        saver.save(sess, last_checkpoint)
        tf.train.write_graph(sess.graph_def, self.model_path, 'raw_graph_def.pb', as_text=False)
        self.logger.info("Saved Model")

    def _export_graph(self):
        """
        Exports latest saved model to .bytes format for Unity embedding.
        """
        target_nodes = ','.join(self._process_graph())
        ckpt = tf.train.get_checkpoint_state(self.model_path)
        freeze_graph.freeze_graph(input_graph=self.model_path + '/raw_graph_def.pb',
                                  input_binary=True,
                                  input_checkpoint=ckpt.model_checkpoint_path,
                                  output_node_names=target_nodes,
                                  output_graph=self.model_path + '/' + self.env_name + "_" + self.run_id + '.bytes',
                                  clear_devices=True, initializer_nodes="", input_saver="",
                                  restore_op_name="save/restore_all", filename_tensor_name="save/Const:0")

    def _initialize_trainers(self, trainer_config, sess):
        trainer_parameters_dict = {}
        self.trainers = {}
        for brain_name in self.env.external_brain_names:
            trainer_parameters = trainer_config['default'].copy()
            if len(self.env.external_brain_names) > 1:
                graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name)
                trainer_parameters['graph_scope'] = graph_scope
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir,
                    name=str(self.run_id) + '_' + graph_scope)
            else:
                trainer_parameters['graph_scope'] = ''
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir,
                    name=str(self.run_id))
            if brain_name in trainer_config:
                _brain_key = brain_name
                while not isinstance(trainer_config[_brain_key], dict):
                    _brain_key = trainer_config[_brain_key]
                for k in trainer_config[_brain_key]:
                    trainer_parameters[k] = trainer_config[_brain_key][k]
            trainer_parameters_dict[brain_name] = trainer_parameters.copy()
        for brain_name in self.env.external_brain_names:
            if trainer_parameters_dict[brain_name]['trainer'] == "imitation":
                self.trainers[brain_name] = BehavioralCloningTrainer(sess, self.env, brain_name,
                                                                     trainer_parameters_dict[brain_name],
                                                                     self.train_model, self.seed)
            elif trainer_parameters_dict[brain_name]['trainer'] == "ppo":
                self.trainers[brain_name] = PPOTrainer(sess, self.env, brain_name, trainer_parameters_dict[brain_name],
                                                       self.train_model, self.seed)
            else:
                raise UnityEnvironmentException("The trainer config contains an unknown trainer type for brain {}"
                                                .format(brain_name))

    def _load_config(self):
        try:
            with open(self.trainer_config_path) as data_file:
                trainer_config = yaml.load(data_file)
                return trainer_config
        except IOError:
            raise UnityEnvironmentException("""Parameter file could not be found here {}.
                                            Will use default Hyper parameters"""
                                            .format(self.trainer_config_path))
        except UnicodeDecodeError:
            raise UnityEnvironmentException("There was an error decoding Trainer Config from this path : {}"
                                            .format(self.trainer_config_path))

    @staticmethod
    def _create_model_path(model_path):
        try:
            if not os.path.exists(model_path):
                os.makedirs(model_path)
        except Exception:
            raise UnityEnvironmentException("The folder {} containing the generated model could not be accessed."
                                            " Please make sure the permissions are set correctly."
                                            .format(model_path))

    def start_learning(self):
        self.env.curriculum.set_lesson_number(self.lesson)
        trainer_config = self._load_config()
        self._create_model_path(self.model_path)

        tf.reset_default_graph()

        with tf.Session() as sess:
            self._initialize_trainers(trainer_config, sess)
            for k, t in self.trainers.items():
                self.logger.info(t)
            init = tf.global_variables_initializer()
            saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
            # Instantiate model parameters
            if self.load_model:
                self.logger.info('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(self.model_path)
                if ckpt is None:
                    self.logger.info('The model {0} could not be found. Make sure you specified the right '
                                     '--run-id'.format(self.model_path))
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(init)
            global_step = 0  # This is only for saving the model
            self.env.curriculum.increment_lesson(self._get_progress())
            curr_info = self.env.reset(train_mode=self.fast_simulation)
            if self.train_model:
                for brain_name, trainer in self.trainers.items():
                    trainer.write_tensorboard_text('Hyperparameters', trainer.parameters)
            try:
                while any([t.get_step <= t.get_max_steps for k, t in self.trainers.items()]) or not self.train_model:
                    if self.env.global_done:
                        self.env.curriculum.increment_lesson(self._get_progress())
                        curr_info = self.env.reset(train_mode=self.fast_simulation)
                        for brain_name, trainer in self.trainers.items():
                            trainer.end_episode()
                    # Decide and take an action
                    take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {}
                    for brain_name, trainer in self.trainers.items():
                        (take_action_vector[brain_name],
                         take_action_memories[brain_name],
                         take_action_text[brain_name],
                         take_action_outputs[brain_name]) = trainer.take_action(curr_info)
                    new_info = self.env.step(vector_action=take_action_vector, memory=take_action_memories,
                                             text_action=take_action_text)
                    for brain_name, trainer in self.trainers.items():
                        trainer.add_experiences(curr_info, new_info, take_action_outputs[brain_name])
                        trainer.process_experiences(curr_info, new_info)
                        if trainer.is_ready_update() and self.train_model and trainer.get_step <= trainer.get_max_steps:
                            # Perform gradient descent with experience buffer
                            trainer.update_model()
                        # Write training statistics to Tensorboard.
                        trainer.write_summary(self.env.curriculum.lesson_number)
                        if self.train_model and trainer.get_step <= trainer.get_max_steps:
                            trainer.increment_step_and_update_last_reward()
                    if self.train_model:
                        global_step += 1
                    if global_step % self.save_freq == 0 and global_step != 0 and self.train_model:
                        # Save Tensorflow model
                        self._save_model(sess, steps=global_step, saver=saver)
                    curr_info = new_info
                # Final save Tensorflow model
                if global_step != 0 and self.train_model:
                    self._save_model(sess, steps=global_step, saver=saver)
            except KeyboardInterrupt:
                print('--------------------------Now saving model-------------------------')
                if self.train_model:
                    self.logger.info("Learning was interrupted. Please wait while the graph is generated.")
                    self._save_model(sess, steps=global_step, saver=saver)
                pass
        self.env.close()
        if self.train_model:
            self._export_graph()
            time.sleep(0.01)
            if np.any(dones):  # exit loop if episode finished
                break
        scores_window.append(score)  # save the most recent score
        scores.append(np.mean(score))  # save the mean score across agents
    # plot the scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    #plt.show()
    plt.savefig('testRes.png')
    print('Final Score: ==> {:.2f}'.format(np.mean(scores)))

env.close()
''' # random 
for i in range(5):                                         # play game for 5 episodes
    env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
    states = env_info.vector_observations                  # get the current state (for each agent)
    scores = np.zeros(num_agents)                          # initialize the score (for each agent)
    while True:
        actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
        actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
        env_info = env.step(actions)[brain_name]           # send all actions to the environment
        next_states = env_info.vector_observations         # get next state (for each agent)
        rewards = env_info.rewards                         # get reward (for each agent)
        dones = env_info.local_done                        # see if episode finished
        scores += env_info.rewards                         # update the score (for each agent)
        states = next_states                               # roll over states to next time step
        if np.any(dones):                                  # exit loop if episode finished