Example #1
class UnityEnv:
    def __init__(self, env_name, **kwargs) -> None:
        super().__init__()
        filename = unity_filename(env_name)
        self.unity_env = UnityEnvironment(file_name=filename, **kwargs)
        brain_name = self.unity_env.brain_names[0]
        self.name = brain_name.replace("Brain", "")
        brain = self.unity_env.brains[brain_name]

        env_info = self.unity_env.reset(train_mode=True)[brain_name]

        self.brain_name = brain_name
        self.num_agents = len(env_info.agents)
        self.num_actions = list(brain.vector_action_space_size)[0]
        self.states = env_info.vector_observations
        self.num_states = self.states.shape[1]

    def reset(self, train_mode=False):
        env_info = self.unity_env.reset(train_mode=train_mode)[self.brain_name]
        return env_info.vector_observations

    def step(self, actions):
        env_info = self.unity_env.step(actions)
        env_info = env_info[self.brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done

        return np.asarray(next_states), np.asarray(rewards), np.asarray(
            dones), env_info
Example #2
    def test(self):
        from mlagents.envs import UnityEnvironment
        num_worker = 20
        state_size = 33
        output_size = 4
        n_step = 128
        ep = 0
        score = 0
        saver = tf.train.Saver()
        saver.restore(self.sess, 'model/model')

        env = UnityEnvironment(file_name='env/walker', worker_id=2)
        default_brain = env.brain_names[0]
        brain = env.brains[default_brain]
        initial_observation = env.reset()

        env_info = env.reset()
        states = np.zeros([num_worker, state_size])

        while True:
            inference = [self.get_action(s) for s in states]
            actions = [inf[0] for inf in inference]
            env_info = env.step(actions)[default_brain]

            states = env_info.vector_observations
Example #3
    def run(self):
        from mlagents.envs import UnityEnvironment

        writer = SummaryWriter('runs/td3')
        num_worker = 20
        state_size = 33
        output_size = 4
        epsilon = 1.0
        ep = 0
        train_size = 5

        env = UnityEnvironment(file_name='env/training', worker_id=0)
        default_brain = env.brain_names[0]
        brain = env.brains[default_brain]
        initial_observation = env.reset()

        step = 0
        score = 0

        while True:
            ep += 1
            env_info = env.reset()
            states = np.zeros([num_worker, state_size])
            terminal = False
            self.noise.reset()
            if epsilon > 0.001:
                epsilon = -ep * 0.005 + 1.0
            while not terminal:
                step += 1

                actions = [self.get_action(s, epsilon) for s in states]
                env_info = env.step(actions)[default_brain]

                next_states = env_info.vector_observations
                rewards = env_info.rewards
                dones = env_info.local_done

                terminal = dones[0]

                for s, ns, r, d, a in zip(states, next_states, rewards, dones,
                                          actions):
                    self.memory.append(s, ns, r, d, a)

                score += sum(rewards)

                states = next_states

                if step % train_size == 0:
                    self.update()

            if ep < 1000:
                print('episode :', ep, '| score : ', score, '| epsilon :',
                      epsilon)
                writer.add_scalar('data/reward', score, ep)
                writer.add_scalar('data/epsilon', epsilon, ep)
                writer.add_scalar('data/memory_size', len(self.memory.memory),
                                  ep)
                score = 0
Example #4
def walking_iterator():
    env = UnityEnvironment(file_name=env_name)

    # Set the default brain to work with
    default_brain = env.brain_names[0]
    brain = env.brains[default_brain]

    # Reset the environment
    env_info = env.reset(train_mode=train_mode)[default_brain]

    # Examine the state space for the default brain
    print("Agent vector observations look like: \n{}".format(env_info.vector_observations[0]))

    # Examine the observation space for the default brain
    print("Agent visual observations look like:")
    for i, vo in enumerate(env_info.visual_observations):
        print("Visual observation", i, ":", vo[0].shape)

    turning_sign = 1
    while True:
        # Interpret and yield sensory input
        rgb_image = env_info.visual_observations[0][0]
        depth_image = depth_rgb_to_float(env_info.visual_observations[1][0])
        pose = env_info.vector_observations[0][:4]
        forward_clear_dist = env_info.vector_observations[0][4]

        yield {
            'image': rgb_image,
            'depth': np.clip(depth_image * 1000, 0, 65535).astype(np.uint16),
            'pose': pose
        }

        # Decide on actions
        # First action dim is forward motion, second is rotation
        actions = np.zeros([len(env_info.agents), brain.vector_action_space_size[0]], np.float32)

        if forward_clear_dist > 3.0:
            turning_sign = -1 * turning_sign

        if forward_clear_dist > 1.0:
            # Forward is clear, go forward
            actions[0,0] = np.random.uniform(0.05, 0.5)
            actions[0,1] = np.random.uniform(0.0, 0.01) * turning_sign
        elif forward_clear_dist < 0.1:
            # Back up!
            actions[0,0] = np.random.uniform(-0.05, -0.5)
        else:
            # Just a little distance. Turn
            actions[0,1] = np.random.uniform(0.01, 0.05) * turning_sign

        env_info = env.step(actions)[default_brain]
        if env_info.local_done[0]:
            env_info = env.reset(train_mode=train_mode)
        if type(env_info) is dict:
            # This happens sometimes, not sure why
            env_info = env_info['SlamWalkerLearning']
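A hedged sketch of how the generator above might be consumed (not part of the original example; the frame limit is arbitrary):

# Hypothetical consumer of walking_iterator() defined above.
for i, frame in enumerate(walking_iterator()):
    rgb, depth, pose = frame['image'], frame['depth'], frame['pose']
    print('frame', i, 'pose', pose, 'depth dtype', depth.dtype)
    if i >= 100:  # stop after a fixed number of frames for this sketch
        break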
Example #5
def test_step(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(discrete_action=False,
                                                      visual_inputs=0)
    env = UnityEnvironment(' ')
    brain = env.brains['RealFakeBrain']
    brain_info = env.reset()
    brain_info = env.step([0] * brain.vector_action_space_size[0] *
                          len(brain_info['RealFakeBrain'].agents))
    with pytest.raises(UnityActionException):
        env.step([0])
    brain_info = env.step([-1] * brain.vector_action_space_size[0] *
                          len(brain_info['RealFakeBrain'].agents))
    with pytest.raises(UnityActionException):
        env.step([0] * brain.vector_action_space_size[0] *
                 len(brain_info['RealFakeBrain'].agents))
    env.close()
    assert env.global_done
    assert isinstance(brain_info, dict)
    assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
    assert isinstance(brain_info['RealFakeBrain'].visual_observations, list)
    assert isinstance(brain_info['RealFakeBrain'].vector_observations,
                      np.ndarray)
    assert len(brain_info['RealFakeBrain'].visual_observations
               ) == brain.number_visual_observations
    assert len(brain_info['RealFakeBrain'].vector_observations) == \
           len(brain_info['RealFakeBrain'].agents)
    assert len(brain_info['RealFakeBrain'].vector_observations[0]) == \
           brain.vector_observation_space_size * brain.num_stacked_vector_observations

    print("\n\n\n\n\n\n\n" + str(brain_info['RealFakeBrain'].local_done))
    assert not brain_info['RealFakeBrain'].local_done[0]
    assert brain_info['RealFakeBrain'].local_done[2]
Example #6
def test_ppo_get_value_estimates(mock_communicator, mock_launcher, dummy_config):
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0
    )
    env = UnityEnvironment(" ")
    brain_infos = env.reset()
    brain_info = brain_infos[env.brain_names[0]]

    trainer_parameters = dummy_config
    model_path = env.brain_names[0]
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    policy = PPOPolicy(
        0, env.brains[env.brain_names[0]], trainer_parameters, False, False
    )
    run_out = policy.get_value_estimates(brain_info, 0, done=False)
    for key, val in run_out.items():
        assert type(key) is str
        assert type(val) is float

    run_out = policy.get_value_estimates(brain_info, 0, done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val == 0.0

    # Check if we ignore terminal states properly
    policy.reward_signals["extrinsic"].use_terminal_states = False
    run_out = policy.get_value_estimates(brain_info, 0, done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val != 0.0

    env.close()
Example #7
    def test_unity(self):
        from mlagents.envs import UnityEnvironment
        env = UnityEnvironment(file_name='env/' + env_set['env_name'],
                               worker_id=0)
        default_brain = env.brain_names[0]
        env_info = env.reset(config={
            'Mass': 1,
            'Length': 1.5 * 3.0
        })[default_brain]
        states = env_info.vector_observations
        self.saver.restore(self.sess,
                           save_path='runs/td3_' + env_set['env_name'] +
                           '/save')
        scores = np.zeros([self.worker_size])
        score = deque(maxlen=1000)
        for i in range(10000):
            actions = self.get_action(states, 0.05)
            env_info = env.step(actions)[default_brain]
            states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            scores += rewards

            for idx, d in enumerate(dones):
                if d:
                    score.append(scores[idx])
                    scores[idx] = 0
        print('score : ', "{0:.2f}".format(np.mean(score)))
        env.close()
Example #8
def main():
    env = UnityEnvironment(file_name='../env/Pong/Pong')

    default_brain = env.brain_names[0]
    brain = env.brains[default_brain]

    env_info = env.reset(train_mode=False)[default_brain]

    obs_dim = env_info.vector_observations[0].shape[0]
    act_num = brain.vector_action_space_size[0]

    mlp = MLP(obs_dim, act_num).to(device)

    if args.load is not None:
        pretrained_model_path = os.path.join('./save_model/' + str(args.load))
        pretrained_model = torch.load(pretrained_model_path)
        mlp.load_state_dict(pretrained_model)

    sum_returns = 0.
    num_episodes = 0

    for episode in range(1, 10001):
        total_reward = 0.

        obs = env_info.vector_observations[0]
        done = False

        while not done:
            action = mlp(
                torch.Tensor(obs).to(device)).argmax().detach().cpu().numpy()
            env_info = env.step(int(action))[default_brain]

            next_obs = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]

            total_reward += reward
            obs = next_obs

        sum_returns += total_reward
        num_episodes += 1

        average_return = sum_returns / num_episodes if num_episodes > 0 else 0.0

        if episode % 10 == 0:
            print('---------------------------------------')
            print('Episodes:', num_episodes)
            print('AverageReturn:', average_return)
            print('---------------------------------------')

    env.close()
Example #9
    def evaluate_model(self, model_name):
        """
        Session for evaluating every model. Uses the concept of the Trainer in a deterministic manner
        :param model_name: Name of the model to be evaluated
        """
        print(
            "\n==================== New Evaluation {} ============================"
            .format(model_name))

        if self.use_executable:
            env = UnityEnvironment(file_name=self.env_name)
        else:
            env = UnityEnvironment(file_name=None)

        default_brain = env.brain_names[0]
        env_info = env.reset(train_mode=False)[default_brain]
        num_output = len(env_info.action_masks[0])

        # Fetching model
        model_path = self.path_to_models + model_name + ".h5"
        model_manager = ModelManager(load=True,
                                     num_views=num_output,
                                     num_output=num_output,
                                     model_name=model_path)

        # Change the model name
        if "_" in model_name and False:
            model_name = "evaluation_" + model_name.split("_", 1)[1]
        else:
            model_name = "eval_" + model_name
        model_name = "eval_coverage_progression"

        # Evaluating the model
        trainer = Trainer(model_manager, env, self.max_step)
        synopsis = SynopsisManager(trainer,
                                   model_manager,
                                   run_name=model_name,
                                   max_step=self.max_step)
        trainer.evaluate_solution(self.evaluation_size)

        # Close environment
        env.close()

        # Cleanup
        # del trainer.memory
        del trainer
        del synopsis
        del model_manager
Example #10
def test_ppo_policy_evaluate(mock_communicator, mock_launcher, dummy_config):
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(discrete_action=False,
                                                      visual_inputs=0)
    env = UnityEnvironment(' ')
    brain_infos = env.reset()
    brain_info = brain_infos[env.brain_names[0]]

    trainer_parameters = dummy_config
    model_path = env.brain_names[0]
    trainer_parameters['model_path'] = model_path
    trainer_parameters['keep_checkpoints'] = 3
    policy = PPOPolicy(0, env.brains[env.brain_names[0]], trainer_parameters,
                       False, False)
    run_out = policy.evaluate(brain_info)
    assert run_out['action'].shape == (3, 2)
    env.close()
Example #11
    def execute_session(self, model_name, alpha_acc, exp_acc, alpha_dist, exp_dist, alpha_steps, t):
        """
        Executes a single training session of a model
        :param model_name: Name of the model
        :param alpha_acc: Coverage reward
        :param exp_acc: Coverage exponential reward
        :param alpha_dist: Distance reward
        :param exp_dist: Distance Exponential Reward
        :param alpha_steps: Step Reward
        :param t: Which architecture to be used
        """
        print("\n==================== New Session {} ============================".format(model_name))
        print("acc: {} - {}, dist: {} - {}, steps {}, views: {}, LR: {}\n"
              .format(alpha_acc, exp_acc, alpha_dist, exp_dist, alpha_steps, self.alpha_views, self.learning_rate))

        if self.use_executable:
            env = UnityEnvironment(file_name=self.env_name)
        else:
            env = UnityEnvironment(file_name=None)
        default_brain = env.brain_names[0]
        env_info = env.reset(train_mode=False)[default_brain]
        num_output = len(env_info.action_masks[0])

        # Fetching model
        model_manager = ModelManager(load=self.load_model, num_views=num_output, num_output=num_output,
                                     model_name=model_name, learning_rate=self.learning_rate, variation=t)

        # Train
        trainer = Trainer(model_manager, env, self.max_step)
        trainer.set_reward_values(alpha_acc, exp_acc, alpha_dist, exp_dist, alpha_steps, self.alpha_views)
        synopsis = SynopsisManager(trainer, model_manager, run_name=model_name, max_step=self.max_step)
        trainer.train(self.num_generations, self.num_batches, self.batch_size, self.test_size)
        synopsis.print_training_summary()
        trainer.evaluate_solution(self.evaluation_size)

        # Close environment
        env.close()

        # Save model
        model_manager.save_model()

        # Cleanup
        # del trainer.memory
        del trainer
        del synopsis
        del model_manager
Example #12
class Drone:
    spec = None
    name = None
    action_space = None
    observation_space = None

    def __init__(
        self,
        env_path: str,
        env_name: str,
        cfg: dict,
        train_mode: bool = True,
        worker_id: int = 1,
    ):
        self.env = UnityEnvironment(file_name=env_path, worker_id=worker_id)
        self.default_brain = self.env.brain_names[0]
        self.cfg = cfg
        self.name = env_name
        self.action_space = spaces.Box(low=-1,
                                       high=1,
                                       shape=(3, ),
                                       dtype=np.float32)
        self.observation_space = spaces.Box(low=-np.inf,
                                            high=np.inf,
                                            shape=(9, ),
                                            dtype=np.float32)
        self.train_mode = train_mode

    def reset(self):
        env_info = self.env.reset(train_mode=self.train_mode,
                                  config=self.cfg)[self.default_brain]
        return env_info.vector_observations[0]

    def step(self, action):
        env_info = self.env.step(action.tolist())[self.default_brain]
        observation = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        info = None
        return observation, reward, done, info

    def close(self):
        self.env.close()

    def seed(self, seed):
        pass
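A possible rollout with the Drone wrapper above (not part of the original example; the path, name, and empty config are placeholders):

# Hypothetical episode with a random policy using the Drone wrapper defined above.
env = Drone(env_path='env/drone', env_name='Drone', cfg={}, train_mode=False)
obs = env.reset()
done, episode_reward = False, 0.0
while not done:
    action = env.action_space.sample()        # random 3-dimensional continuous action
    obs, reward, done, info = env.step(action)
    episode_reward += reward
print('episode reward:', episode_reward)
env.close()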
Example #13
def test_ppo_policy_evaluate(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        mock_communicator.return_value = MockCommunicator(
            discrete_action=False, visual_inputs=0)
        env = UnityEnvironment(' ')
        brain_infos = env.reset()
        brain_info = brain_infos[env.brain_names[0]]

        trainer_parameters = dummy_config()
        graph_scope = env.brain_names[0]
        trainer_parameters['graph_scope'] = graph_scope
        policy = PPOPolicy(0, env.brains[env.brain_names[0]],
                           trainer_parameters, sess, False)
        init = tf.global_variables_initializer()
        sess.run(init)
        run_out = policy.evaluate(brain_info)
        assert run_out['action'].shape == (3, 2)
        env.close()
Example #14
class Sokoban:
    spec = None
    name = None
    action_space = None
    observation_space = None

    def __init__(
        self,
        env_path: str,
        env_name: str,
        cfg: dict,
        train_mode=True,
        worker_id: int = 1,
    ):
        self.env = UnityEnvironment(file_name=env_path, worker_id=worker_id)
        self.default_brain = self.env.brain_names[0]
        self.cfg = cfg
        self.name = env_name
        self.action_space = spaces.Discrete(5)
        self.observation_space = spaces.Box(low=0,
                                            high=255,
                                            shape=(3, 84, 84),
                                            dtype=np.uint8)
        self.train_mode = train_mode

    def reset(self):
        env_info = self.env.reset(train_mode=self.train_mode,
                                  config=self.cfg)[self.default_brain]
        return env_info.visual_observations[0][0].reshape(3, 84, 84)

    def step(self, action):
        env_info = self.env.step(action.tolist())[self.default_brain]
        observation = env_info.visual_observations[0][0].reshape(3, 84, 84)
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        info = None
        return observation, reward, done, info

    def close(self):
        self.env.close()

    def seed(self, seed):
        pass
Example #15
def test_reset(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(discrete_action=False,
                                                      visual_inputs=0)
    env = UnityEnvironment(' ')
    brain = env.brains['RealFakeBrain']
    brain_info = env.reset()
    env.close()
    assert not env.global_done
    assert isinstance(brain_info, dict)
    assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
    assert isinstance(brain_info['RealFakeBrain'].visual_observations, list)
    assert isinstance(brain_info['RealFakeBrain'].vector_observations,
                      np.ndarray)
    assert len(brain_info['RealFakeBrain'].visual_observations
               ) == brain.number_visual_observations
    assert len(brain_info['RealFakeBrain'].vector_observations) == \
           len(brain_info['RealFakeBrain'].agents)
    assert len(brain_info['RealFakeBrain'].vector_observations[0]) == \
           brain.vector_observation_space_size * brain.num_stacked_vector_observations
Example #16
class Sokoban_env():
    def __init__(self, env_path, env_cfg=Sokoban_env_cfg):
        self.env = UnityEnvironment(file_name=env_path)
        self.default_brain = self.env.brain_names[0]
        self.env_cfg = env_cfg

    def reset(self):
        env_info = self.env.reset(train_mode=True, config=self.env_cfg)[self.default_brain]
        return env_info.visual_observations[0][0].reshape(1,3,84,84)

    def step(self, action):
        env_info = self.env.step(action)[self.default_brain]
        observation = env_info.visual_observations[0][0].reshape(1,3,84,84)
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        info = None
        return observation, reward, done, info

    def close(self):
        self.env.close()
Example #17
def init_env(env_name):
    #env_name = "../UnitySDK/BananaCollector"
    train_mode = True  # Whether to run the environment in training or inference mode
    env = UnityEnvironment(file_name=env_name)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # number of agents in the environment
    print('Number of agents:', len(env_info.agents))
    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)
    # examine the state space
    state = env_info.vector_observations[0]
    print('Observations', state)
    state_size = len(state)
    print('Observations have length:', state_size)
    obs_shape = env_info.vector_observations.shape  # (num_agents, observation_size)
    print('shape of observations', obs_shape)

    return env, brain, brain_name
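A short sketch of calling init_env() above (not part of the original example; the path reuses the one from the commented-out line at the top of the function):

# Hypothetical use of init_env() defined above.
env, brain, brain_name = init_env('../UnitySDK/BananaCollector')
env_info = env.reset(train_mode=False)[brain_name]
print('first observation:', env_info.vector_observations[0])
env.close()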
Example #18
class UnityEnvBase(gym.Env):
    """
    Provides Gym wrapper for Unity Learning Environments.
    Multi-agent environments use lists for object types, as done here:
    https://github.com/openai/multiagent-particle-envs
    """

    worker_id = UNIVERSAL_LOCK

    def __init__(
        self,
        environment_filename: str,
        use_visual=True,
        uint8_visual=True,
        multiagent=False,
        flatten_branched=False,
        no_graphics=False,
        allow_multiple_visual_obs=False,
    ):
        """
        Environment initialization
        :param environment_filename: The UnityEnvironment path or file to be wrapped in the gym.
        :param worker_id: Worker number for environment.
        :param use_visual: Whether to use visual observation or vector observation.
        :param uint8_visual: Return visual observations as uint8 (0-255) matrices instead of float (0.0-1.0).
        :param multiagent: Whether to run in multi-agent mode (lists of obs, reward, done).
        :param flatten_branched: If True, turn branched discrete action spaces into a Discrete space rather than MultiDiscrete.
        :param no_graphics: Whether to run the Unity simulator in no-graphics mode
        :param allow_multiple_visual_obs: If True, return a list of visual observations instead of only one.
        """
        worker_id = UnityEnvBase._generate_new_env_id()
        self.worker_id = worker_id

        self._env = UnityEnvironment(environment_filename,
                                     worker_id,
                                     no_graphics=no_graphics)

        self.name = self._env.academy_name
        self.visual_obs = None
        self._current_state = None
        self._n_agents = None
        self._multiagent = multiagent
        self._flattener = None
        self.game_over = (
            False
        )  # Hidden flag used by Atari environments to determine if the game is over
        self._allow_multiple_visual_obs = allow_multiple_visual_obs

        # Check brain configuration
        if len(self._env.brains) != 1:
            raise UnityEnvBaseException(
                "There can only be one brain in a UnityEnvironment "
                "if it is wrapped in a gym.")
        if len(self._env.external_brain_names) <= 0:
            raise UnityEnvBaseException(
                "There are not any external brain in the UnityEnvironment")

        self.brain_name = self._env.external_brain_names[0]
        brain = self._env.brains[self.brain_name]

        if use_visual and brain.number_visual_observations == 0:
            raise UnityEnvBaseException(
                "`use_visual` was set to True, however there are no"
                " visual observations as part of this environment.")
        self.use_visual = brain.number_visual_observations >= 1 and use_visual

        if not use_visual and uint8_visual:
            logger.warning(
                "`uint8_visual was set to true, but visual observations are not in use. "
                "This setting will not have any effect.")
        else:
            self.uint8_visual = uint8_visual

        if brain.number_visual_observations > 1 and not self._allow_multiple_visual_obs:
            logger.warning(
                "The environment contains more than one visual observation. "
                "You must define allow_multiple_visual_obs=True to received them all. "
                "Otherwise, please note that only the first will be provided in the observation."
            )

        if brain.num_stacked_vector_observations != 1:
            raise UnityEnvBaseException(
                "There can only be one stacked vector observation in a UnityEnvironment "
                "if it is wrapped in a gym.")

        # Check for number of agents in scene.
        initial_info = self._env.reset()[self.brain_name]
        self._check_agents(len(initial_info.agents))

        # Set observation and action spaces
        if brain.vector_action_space_type == "discrete":
            if len(brain.vector_action_space_size) == 1:
                self._action_space = spaces.Discrete(
                    brain.vector_action_space_size[0])
            else:
                if flatten_branched:
                    self._flattener = ActionFlattener(
                        brain.vector_action_space_size)
                    self._action_space = self._flattener.action_space
                else:
                    self._action_space = spaces.MultiDiscrete(
                        brain.vector_action_space_size)

        else:
            if flatten_branched:
                logger.warning(
                    "The environment has a non-discrete action space. It will "
                    "not be flattened.")
            high = np.array([np.inf] * brain.vector_action_space_size[0])
            self._action_space = spaces.Box(-high, high, dtype=np.float32)
        high = np.array([np.inf] * brain.vector_observation_space_size)
        self.action_meanings = brain.vector_action_descriptions
        if self.use_visual:
            if brain.camera_resolutions[0]["blackAndWhite"]:
                depth = 1
            else:
                depth = 3
            self._observation_space = spaces.Box(
                0,
                1,
                dtype=np.float32,
                shape=(
                    brain.camera_resolutions[0]["height"],
                    brain.camera_resolutions[0]["width"],
                    depth,
                ),
            )
        else:
            self._observation_space = spaces.Box(-high, high, dtype=np.float32)

    def reset(self):
        """Resets the state of the environment and returns an initial observation.
        In the case of multi-agent environments, this is a list.
        Returns: observation (object/list): the initial observation of the
            space.
        """
        info = self._env.reset()[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)
        self.game_over = False

        if not self._multiagent:
            obs, reward, done, info = self._single_step(info)
        else:
            obs, reward, done, info = self._multi_step(info)
        return obs

    def step(self, action):
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, done, info).
        In the case of multi-agent environments, these are lists.
        Args:
            action (object/list): an action provided by the environment
        Returns:
            observation (object/list): agent's observation of the current environment
            reward (float/list) : amount of reward returned after previous action
            done (boolean/list): whether the episode has ended.
            info (dict): contains auxiliary diagnostic information, including BrainInfo.
        """

        # Use random actions for all other agents in environment.
        if self._multiagent:
            if not isinstance(action, list):
                raise UnityEnvBaseException(
                    "The environment was expecting `action` to be a list.")
            if len(action) != self._n_agents:
                raise UnityEnvBaseException(
                    "The environment was expecting a list of {} actions.".
                    format(self._n_agents))
            else:
                if self._flattener is not None:
                    # Action space is discrete and flattened - we expect a list of scalars
                    action = [
                        self._flattener.lookup_action(_act) for _act in action
                    ]
                action = np.array(action)
        else:
            if self._flattener is not None:
                # Translate action into list
                action = self._flattener.lookup_action(action)

        info = self._env.step(action)[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)
        self._current_state = info

        if not self._multiagent:
            obs, reward, done, info = self._single_step(info)
            self.game_over = done
        else:
            obs, reward, done, info = self._multi_step(info)
            self.game_over = all(done)
        return obs, reward, done, info

    def _single_step(self, info):
        if self.use_visual:
            visual_obs = info.visual_observations
            if isinstance(visual_obs, list):
                visual_obs = np.array(visual_obs)

            if self._allow_multiple_visual_obs:
                visual_obs_list = []
                for obs in visual_obs:
                    visual_obs_list.append(
                        self._preprocess_single(obs[0, :, :, :]))
                self.visual_obs = visual_obs_list
            else:
                self.visual_obs = self._preprocess_single(
                    visual_obs[0][0, :, :, :])

            default_observation = self.visual_obs
        else:
            default_observation = info.vector_observations[0, :]

        return (
            default_observation,
            info.rewards[0],
            info.local_done[0],
            {
                "text_observation": info.text_observations[0],
                "brain_info": info,
                "vector_observations": info.vector_observations[0, :]
            },
        )

    def _preprocess_single(self, single_visual_obs):
        if self.uint8_visual:
            return (255.0 * single_visual_obs).astype(np.uint8)
        else:
            return single_visual_obs

    def _multi_step(self, info):
        if self.use_visual:
            self.visual_obs = self._preprocess_multi(info.visual_observations)
            default_observation = self.visual_obs
        else:
            default_observation = info.vector_observations
        return (
            list(default_observation),
            info.rewards,
            info.local_done,
            {
                "text_observation": info.text_observations,
                "brain_info": info
            },
        )

    def _preprocess_multi(self, multiple_visual_obs):
        if self.uint8_visual:
            return [(255.0 * _visual_obs).astype(np.uint8)
                    for _visual_obs in multiple_visual_obs]
        else:
            return multiple_visual_obs

    def render(self, mode="rgb_array"):
        return self.visual_obs

    def close(self):
        """Override _close in your subclass to perform any necessary cleanup.
        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        try:
            self._env.close()
        except:
            pass

    def __del__(self):
        try:
            self._env.close()
        except:
            pass

    def get_action_meanings(self):
        return self.action_meanings

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        Currently not implemented.
        """
        logger.warn("Could not seed environment %s", self.name)
        return

    def _check_agents(self, n_agents):
        if not self._multiagent and n_agents > 1:
            raise UnityEnvBaseException(
                "The environment was launched as a single-agent environment, however"
                "there is more than one agent in the scene.")
        elif self._multiagent and n_agents <= 1:
            raise UnityEnvBaseException(
                "The environment was launched as a mutli-agent environment, however"
                "there is only one agent in the scene.")
        if self._n_agents is None:
            self._n_agents = n_agents
            logger.info("{} agents within environment.".format(n_agents))
        elif self._n_agents != n_agents:
            raise UnityEnvBaseException(
                "The number of agents in the environment has changed since "
                "initialization. This is not supported.")

    @staticmethod
    def _generate_new_env_id():

        with UnityEnvBase.worker_id.get_lock():
            new_id = UnityEnvBase.worker_id.value
            UnityEnvBase.worker_id.value += 1

        return new_id

    @property
    def metadata(self):
        return {"render.modes": ["rgb_array"]}

    @property
    def reward_range(self):
        return -float("inf"), float("inf")

    @property
    def spec(self):
        return None

    @property
    def action_space(self):
        return self._action_space

    @property
    def observation_space(self):
        return self._observation_space

    @property
    def number_agents(self):
        return self._n_agents
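A hedged usage sketch for the gym wrapper above (not part of the original example; the binary path is a placeholder, and per the checks in __init__ the build must contain exactly one external brain and a single agent):

# Hypothetical rollout through the standard gym interface of UnityEnvBase defined above.
env = UnityEnvBase('envs/SingleAgentBuild', use_visual=False)
obs = env.reset()
for _ in range(200):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()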
Example #19
def run(options, runLog, minimumAcceptableFitness=None):
    isFunctionInValidationMode = \
            isinstance(minimumAcceptableFitness, float)
    runLog.Append("This is run.py -> script for running pretrained models!")

    locationOfPretrainedModel = options["--model"]

    resultsRepository = TrainingResultsRepository()
    bestAgent = resultsRepository.LoadBestModel(locationOfPretrainedModel)

    if bestAgent is None:
        runLog.Append("run.run() error: Cannot load model, location " \
                "'training_results/{0}' does not exist!".format(
                        locationOfPretrainedModel))
        exit()

    runLog.Append("Run model from 'training_results/{0}'!".format(
        locationOfPretrainedModel))

    pathToEnv = options["--env-path"]
    env = UnityEnvironment(file_name=pathToEnv)
    if pathToEnv is None:
        runLog.Append("Established connection with Unity Editor!")
    else:
        runLog.Append("Established connection with Unity build '{0}'!" \
                .format(pathToEnv))
    del pathToEnv

    brainName = env.brain_names[0]

    if isFunctionInValidationMode:
        fitness = 0.0

    shouldRunBeExecuted = True
    try:
        while shouldRunBeExecuted:
            envInfo = env.reset(train_mode=False)[brainName]
            inputData = envInfo.vector_observations.tolist()
            inputData = inputData[0][0:-1]
            while shouldRunBeExecuted:
                outputData = bestAgent.forward(inputData)
                envInfo = env.step([outputData])[brainName]
                inputData = envInfo.vector_observations.tolist()

                if isFunctionInValidationMode:
                    episodeReward = inputData[0][-1]
                    if episodeReward > fitness:
                        fitness = episodeReward

                inputData = inputData[0][:-1]

                if envInfo.local_done[0]:
                    if isFunctionInValidationMode:
                        shouldRunBeExecuted = False
                    break

    except KeyboardInterrupt:
        runLog.Append("\nRun interrupted because of KeyboardInterrupt!")

    runLog.Append("End of run!")
    env.close()
    runLog.Append("Closed Unity environment.")

    if isFunctionInValidationMode:
        return fitness >= minimumAcceptableFitness
    else:
        return False
Example #20
def main():
    env = UnityEnvironment()
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    print(brain.vector_observation_space_size)
    print(brain.vector_action_space_size)

    sess = tf.Session()

    actor = Actor(sess=sess,
                  s_dim=brain.vector_observation_space_size,
                  a_counts=brain.vector_action_space_size[0])
    critic = Critic(sess=sess, s_dim=brain.vector_observation_space_size)

    sess.run(tf.global_variables_initializer())

    for episode in range(MAX_EP):
        step = 0
        total_reward = 0.
        discounted_reward = 0
        s, a, r, dc_r = [], [], [], []
        obs = env.reset(train_mode=True)[brain_name]
        state = obs.vector_observations
        s.append(state[0])
        while True:
            action = actor.choose_action(state)
            a.append(action[0])
            obs = env.step(action)[brain_name]
            step += 1
            reward = obs.rewards
            r.append(reward[0])
            state = obs.vector_observations
            done = obs.local_done[0]
            if done or step >= MAX_STEP:
                if len(s) < BATCHSIZE:
                    break
                else:
                    length = len(s)
                    for index in reversed(range(length)):
                        discounted_reward = discounted_reward * GAMMA + r[index]
                        dc_r.append(discounted_reward)
                    total_reward = dc_r[-1]
                    s_ = list(
                        reversed([
                            s[index:index + BATCHSIZE]
                            for index in range(length - BATCHSIZE + 1)
                        ]))
                    a_ = list(
                        reversed([
                            a[index:index + BATCHSIZE]
                            for index in range(length - BATCHSIZE + 1)
                        ]))
                    r_ = list(
                        reversed([
                            r[index:index + BATCHSIZE]
                            for index in range(length - BATCHSIZE + 1)
                        ]))
                    dc_r_ = list(
                        reversed([
                            dc_r[index:index + BATCHSIZE]
                            for index in range(length - BATCHSIZE + 1)
                        ]))
                    for index in range(len(s_)):
                        actor.assign_params()
                        ss = np.array(s_[index])
                        aa = np.array(a_[index])
                        rr = np.array(r_[index])
                        dc_rr = np.array(dc_r_[index])[:, np.newaxis]
                        values = critic.get_state_value(ss)
                        value_ = critic.get_state_value(state)
                        sub_advantage = np.zeros_like(rr)
                        for index in reversed(range(np.shape(rr)[0])):
                            sub_advantage[index] = rr[
                                index] + GAMMA * value_ - values[index]
                            value_ = values[index]
                        tmp = 0
                        advantage = np.zeros_like(sub_advantage)
                        for index in reversed(range(
                                np.shape(sub_advantage)[0])):
                            tmp = tmp * LAMBDA * GAMMA + sub_advantage[index]
                            advantage[index] = tmp

                        [
                            actor.learn(ss, aa, advantage)
                            for _ in range(LEARN_COUNTS)
                        ]
                        [critic.learn(ss, dc_rr) for _ in range(LEARN_COUNTS)]
                    if done:
                        break

                s, a, r, dc_r = [], [], [], []
                s.append(state[0])
            else:
                s.append(state[0])
    print('episode: {0} steps: {1} reward: {2}'.format(
            episode, step, total_reward))
Example #21
class ObstacleTowerEnv(gym.Env):
    ALLOWED_VERSIONS = ['1']

    def __init__(self,
                 environment_filename=None,
                 docker_training=False,
                 worker_id=0,
                 retro=True):
        """
        Arguments:
          environment_filename: The file path to the Unity executable.  Does not require the extension.
          docker_training: Whether this is running within a docker environment and should use a virtual 
            frame buffer (xvfb).
          worker_id: The index of the worker in the case where multiple environments are running.  Each 
            environment reserves port (5005 + worker_id) for communication with the Unity executable.
          retro: Resize visual observations to 84x84 (uint8) and flatten the action space.
        """
        if self.is_grading():
            environment_filename = None
            docker_training = True

        self._env = UnityEnvironment(environment_filename,
                                     worker_id,
                                     docker_training=docker_training)

        split_name = self._env.academy_name.split('-v')
        if len(split_name) == 2 and split_name[0] == "ObstacleTower":
            self.name, self.version = split_name
        else:
            raise UnityGymException(
                "Attempting to launch non-Obstacle Tower environment")

        if self.version not in self.ALLOWED_VERSIONS:
            raise UnityGymException(
                "Invalid Obstacle Tower version.  Your build is v" + self.version + \
                " but only the following versions are compatible with this gym: " + \
                str(self.ALLOWED_VERSIONS)
            )

        self.visual_obs = None
        self._current_state = None
        self._n_agents = None
        self._done_grading = False
        self._flattener = None
        self._seed = None
        self._floor = None
        self.game_over = False  # Hidden flag used by Atari environments to determine if the game is over
        self.retro = retro

        flatten_branched = self.retro
        uint8_visual = self.retro

        # Check brain configuration
        if len(self._env.brains) != 1:
            raise UnityGymException(
                "There can only be one brain in a UnityEnvironment "
                "if it is wrapped in a gym.")
        self.brain_name = self._env.external_brain_names[0]
        brain = self._env.brains[self.brain_name]

        if brain.number_visual_observations == 0:
            raise UnityGymException(
                "Environment provides no visual observations.")

        self.uint8_visual = uint8_visual

        if brain.number_visual_observations > 1:
            logger.warning(
                "The environment contains more than one visual observation. "
                "Please note that only the first will be provided in the observation."
            )

        # Check for number of agents in scene.
        initial_info = self._env.reset()[self.brain_name]
        self._check_agents(len(initial_info.agents))

        # Set observation and action spaces
        if len(brain.vector_action_space_size) == 1:
            self._action_space = spaces.Discrete(
                brain.vector_action_space_size[0])
        else:
            if flatten_branched:
                self._flattener = ActionFlattener(
                    brain.vector_action_space_size)
                self._action_space = self._flattener.action_space
            else:
                self._action_space = spaces.MultiDiscrete(
                    brain.vector_action_space_size)

        high = np.array([np.inf] * brain.vector_observation_space_size)
        self.action_meanings = brain.vector_action_descriptions

        depth = 3
        image_space_max = 1.0
        image_space_dtype = np.float32
        camera_height = brain.camera_resolutions[0]["height"]
        camera_width = brain.camera_resolutions[0]["width"]
        if self.retro:
            image_space_max = 255
            image_space_dtype = np.uint8
            camera_height = 84
            camera_width = 84

        image_space = spaces.Box(0,
                                 image_space_max,
                                 dtype=image_space_dtype,
                                 shape=(camera_height, camera_width, depth))
        if self.retro:
            self._observation_space = image_space
        else:
            max_float = np.finfo(np.float32).max
            keys_space = spaces.Discrete(5)
            time_remaining_space = spaces.Box(low=0.0,
                                              high=max_float,
                                              shape=(1, ),
                                              dtype=np.float32)
            self._observation_space = spaces.Tuple(
                (image_space, keys_space, time_remaining_space))

    def done_grading(self):
        return self._done_grading

    def is_grading(self):
        return os.getenv('OTC_EVALUATION_ENABLED', False)

    def reset(self):
        """Resets the state of the environment and returns an initial observation.
        In the case of multi-agent environments, this is a list.
        Returns: observation (object/list): the initial observation of the
            space.
        """
        reset_params = {}
        if self._floor is not None:
            reset_params['floor-number'] = self._floor
        if self._seed is not None:
            reset_params['tower-seed'] = self._seed

        info = self._env.reset(config=reset_params)[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)
        self.game_over = False

        obs, reward, done, info = self._single_step(info)
        return obs

    def step(self, action):
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, done, info).
        In the case of multi-agent environments, these are lists.
        Args:
            action (object/list): an action provided by the environment
        Returns:
            observation (object/list): agent's observation of the current environment
            reward (float/list) : amount of reward returned after previous action
            done (boolean/list): whether the episode has ended.
            info (dict): contains auxiliary diagnostic information, including BrainInfo.
        """

        # Use random actions for all other agents in environment.
        if self._flattener is not None:
            # Translate action into list
            action = self._flattener.lookup_action(action)

        info = self._env.step(action)[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)
        self._current_state = info

        obs, reward, done, info = self._single_step(info)
        self.game_over = done

        if info.get('text_observation') == 'evaluation_complete':
            done = True
            self._done_grading = True
        return obs, reward, done, info

    def _single_step(self, info):
        self.visual_obs = self._preprocess_single(
            info.visual_observations[0][0, :, :, :])
        if self.retro:
            self.visual_obs = self._resize_observation(self.visual_obs)
            self.visual_obs = self._add_stats_to_image(
                self.visual_obs, info.vector_observations[0])
            default_observation = self.visual_obs
        else:
            default_observation = self._prepare_tuple_observation(
                self.visual_obs, info.vector_observations[0])

        return default_observation, info.rewards[0], info.local_done[0], {
            "text_observation": info.text_observations[0],
            "brain_info": info
        }

    def _preprocess_single(self, single_visual_obs):
        if self.uint8_visual:
            return (255.0 * single_visual_obs).astype(np.uint8)
        else:
            return single_visual_obs

    def render(self, mode='rgb_array'):
        return self.visual_obs

    def close(self):
        """Override _close in your subclass to perform any necessary cleanup.
        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        self._env.close()

    def get_action_meanings(self):
        return self.action_meanings

    def seed(self, seed=None):
        """Sets a fixed seed for this env's random number generator(s).
        The valid range for seeds is [0, 100). By default a random seed
        will be chosen.
        """
        if seed is None:
            self._seed = seed
            return

        seed = int(seed)
        if seed < 0 or seed >= 100:
            logger.warn("Seed outside of valid range [0, 100). A random seed "
                        "within the valid range will be used on next reset.")
        logger.warn("New seed " + str(seed) + " will apply on next reset.")
        self._seed = seed

    def floor(self, floor=None):
        """Sets the starting floor to a fixed floor number on subsequent environment
        resets."""
        if floor is None:
            self._floor = floor
            return

        floor = int(floor)
        if floor < 0 or floor >= 25:
            logger.warn(
                "Starting floor outside of valid range [0, 25). Floor 0 will be used"
                "on next reset.")
        logger.warn("New starting floor " + str(floor) +
                    " will apply on next reset.")
        self._floor = floor

    @staticmethod
    def _resize_observation(observation):
        """
        Re-sizes visual observation to 84x84
        """
        obs_image = Image.fromarray(observation)
        obs_image = obs_image.resize((84, 84), Image.NEAREST)
        return np.array(obs_image)

    @staticmethod
    def _prepare_tuple_observation(vis_obs, vector_obs):
        """
        Converts separate visual and vector observation into prepared tuple
        """
        key = vector_obs[0:6]
        time = vector_obs[6]
        key_num = np.argmax(key, axis=0)
        return vis_obs, key_num, time

    @staticmethod
    def _add_stats_to_image(vis_obs, vector_obs):
        """
        Displays time left and number of keys on visual observation
        """
        key = vector_obs[0:6]
        time = vector_obs[6]
        key_num = np.argmax(key, axis=0)
        time_num = min(time, 10000) / 10000

        vis_obs[0:10, :, :] = 0
        for i in range(key_num):
            start = int(i * 16.8) + 4
            end = start + 10
            vis_obs[1:5, start:end, 0:2] = 255
        vis_obs[6:10, 0:int(time_num * 84), 1] = 255
        return vis_obs

    def _check_agents(self, n_agents):
        if n_agents > 1:
            raise UnityGymException(
                "The environment was launched as a single-agent environment, however"
                "there is more than one agent in the scene.")
        if self._n_agents is None:
            self._n_agents = n_agents
            logger.info("{} agents within environment.".format(n_agents))
        elif self._n_agents != n_agents:
            raise UnityGymException(
                "The number of agents in the environment has changed since "
                "initialization. This is not supported.")

    @property
    def metadata(self):
        return {'render.modes': ['rgb_array']}

    @property
    def reward_range(self):
        return -float('inf'), float('inf')

    @property
    def spec(self):
        return None

    @property
    def action_space(self):
        return self._action_space

    @property
    def observation_space(self):
        return self._observation_space

    @property
    def number_agents(self):
        return self._n_agents
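A hedged usage sketch for ObstacleTowerEnv above (not part of the original example; the binary path is a placeholder):

# Hypothetical episode with a random policy using ObstacleTowerEnv defined above.
env = ObstacleTowerEnv('./ObstacleTower/obstacletower', retro=True)
env.seed(5)    # fixed tower seed, applied on the next reset
env.floor(0)   # start from the bottom floor on the next reset
obs = env.reset()
done, total_reward = False, 0.0
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
    total_reward += reward
print('episode reward:', total_reward)
env.close()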
Example #22
# action_size = len(brain.vector_action_space_size)
action_size = 4

# Set the size of state observations or state size
state_size = brain.vector_observation_space_size

agent = Agent(state_size=state_size,
              action_size=action_size,
              dqn_type='DQN',
              seed=2)

# loop from num_episodes
for i_episode in range(1, num_episodes + 1):

    # reset the unity environment at the beginning of each episode
    env_info = env.reset(train_mode=True)[brain_name]

    # get initial state of the unity environment
    state = env_info.vector_observations[0]

    # set the initial episode score to zero.
    score = 0

    # Run the episode training loop;
    # At each loop step take an epsilon-greedy action as a function of the current state observations
    # Based on the resultant environmental state (next_state) and reward received update the Agent network
    # If environment episode is done, exit loop...
    # Otherwise repeat until done == true
    converted_action_size = brain.vector_action_space_size
    converted_agent_num = len(env_info.agents)
Example #23
state_size = brain.vector_observation_space_size


#Initialize Agent
agent = Agent(state_size=state_size, action_size=action_size, seed=0)

# Load trained model weights
agent.network.load_state_dict(torch.load('dqnAgent_Trained_Model.pth'))


# loop from num_episodes
for i_episode in range(1, num_episodes+1):

    # reset the unity environment at the beginning of each episode
    # set train mode to false
    env_info = env.reset(train_mode=False)[brain_name]     

    # get initial state of the unity environment 
    state = env_info.vector_observations[0]

    # set the initial episode score to zero.
    score = 0

    # Run the episode loop;
    # At each loop step take an action as a function of the current state observations
    # If environment episode is done, exit loop...
    # Otherwise repeat until done == true 
    while True:
        # determine epsilon-greedy action from current state
        action = agent.act(state, .01)             
Example #24
    # Set up the Unity environment
    env = UnityEnvironment(file_name=env_name)
    default_brain = env.brain_names[0]

    # Declare the DDPGAgent
    agent = DDPGAgent()
    rewards = deque(maxlen=print_interval)
    success_cnt = 0
    step = 0

    # Run each episode and store transitions in the replay memory
    for episode in range(run_episode + test_episode):
        if episode == run_episode:
            train_mode = False

        env_info = env.reset(train_mode=train_mode)[default_brain]
        state = env_info.vector_observations[0]
        episode_rewards = 0
        done = False

        while not done:
            step += 1

            action = agent.get_action([state])[0]
            env_info = env.step(action)[default_brain]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]

            episode_rewards += reward
Example #25
                                                 self.summary_loss2: loss2, 
                                                 self.summary_reward2: reward2}), episode)

# Main function -> run the overall adversarial DQN algorithm
if __name__ == '__main__':
    # Set up the Unity environment
    env = UnityEnvironment(file_name=env_name)

    # Set up the Unity brains
    brain_name1 = env.brain_names[0]
    brain_name2 = env.brain_names[1]

    brain1 = env.brains[brain_name1]
    brain2 = env.brains[brain_name2]

    env_info = env.reset(train_mode=train_mode)
    
    # Instantiate the DQNAgent class as the agent
    agent = DQNAgent()

    step = 0

    rewards1 = []
    losses1 = []
    rewards2 = []
    losses2 = []

    # Main game loop
    for episode in range(run_episode + test_episode):
        if episode == run_episode:
            train_mode = False
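        # --- Hedged sketch of the two-brain interaction for this episode; the usual
        # --- pattern passes env.step() a dict keyed by brain name, and agent.get_action()
        # --- is an assumed method name on the DQNAgent.
        env_info = env.reset(train_mode=train_mode)
        state1 = env_info[brain_name1].vector_observations[0]
        state2 = env_info[brain_name2].vector_observations[0]
        episode_rewards1, episode_rewards2 = 0, 0
        done = False

        while not done:
            step += 1
            action1 = agent.get_action(state1)
            action2 = agent.get_action(state2)
            env_info = env.step({brain_name1: [action1], brain_name2: [action2]})
            episode_rewards1 += env_info[brain_name1].rewards[0]
            episode_rewards2 += env_info[brain_name2].rewards[0]
            state1 = env_info[brain_name1].vector_observations[0]
            state2 = env_info[brain_name2].vector_observations[0]
            done = env_info[brain_name1].local_done[0]

        rewards1.append(episode_rewards1)
        rewards2.append(episode_rewards2)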
예제 #26
0
def train_wrapper(env_config, wrapper_config):
    """
    Set the Training Parameters
    :param env_config: dictionary, used to pass parameters into the environment
    :param wrapper_config: dictionary of user-defined variables (an illustrative example is sketched after this function).
    """
    # num_episodes (int): maximum number of training episodes
    num_episodes = wrapper_config['num_episodes']

    # scores_average_window (int): the window size employed for calculating the average score
    scores_average_window = wrapper_config['scores_avg_window']

    # solved_score (float): the average score required for the environment to be considered solved
    solved_score = wrapper_config['solved_score']

    # load_weights (bool): whether or not to start training with loaded weights
    load_weights = wrapper_config['load_weights']

    # weights_path: path to the directory containing the weights (same directory to save them)
    weights_path = wrapper_config['weights_path']
    if load_weights and not (os.path.isdir(weights_path)):
        print('weights dir does not exist')
        raise NotADirectoryError

    # save_mem (bool): whether or not to save memory
    save_mem = wrapper_config['save_mem']
    # load_mem (bool): whether or not to continue training with loaded memory
    load_mem = wrapper_config['load_mem']
    # mem_path: path to directory containing the memory to load
    mem_path = wrapper_config['mem_path']
    if load_mem and not (os.path.isdir(mem_path)):
        print('mem dir does not exist')
        raise NotADirectoryError

    # build_path: path to the build of the unity environment.
    build_path = None if wrapper_config['build'] == 'None' else wrapper_config[
        'build']
    if (build_path is not None) and (not os.path.isfile(build_path)):
        print('--build is not a valid path')
        raise FileNotFoundError

    # no_graphics (bool): whether or not to start the environment without graphics (default = True in training)
    no_graphics_in = not wrapper_config['show_graphics']

    # agent_type (DDPG | MDDPG | MADDPG)
    agent_type = wrapper_config['agent']
    if not issubclass(agent_type, AgentABC):
        print('invalid agent type')
        raise TypeError

    # print_agent_loss (bool): whether or not to print the agent's loss (MSE for the critic) after every episode
    print_agent_loss = wrapper_config['print_agent_loss']

    # save_log (bool): whether or not to save the episodes score (csv format, default is True)
    save_log = wrapper_config['save_score_log']

    # save_best_weights (bool): save also the best weights of the session (by average score)
    save_best_weights = wrapper_config['save_best_weights']

    # episode_scores (float): list to record the scores obtained from each episode
    episode_scores = []
    """
    Start the Unity Environment
    """
    env = UnityEnvironment(file_name=build_path, no_graphics=no_graphics_in)
    """
    Get The Unity Environment Brain
    Unity ML-Agent applications or Environments contain "BRAINS" which are responsible for deciding 
    the actions an agent or set of agents should take given a current set of environment (state) 
    observations. The Race environment has a single Brain, thus, we just need to access the first brain 
    available (i.e., the default brain). We then set the default brain as the brain that will be controlled.
    """
    # Get the default brain
    brain_name = env.brain_names[0]

    # Assign the default brain as the brain to be controlled
    brain = env.brains[brain_name]
    """
    Determine the size of the Action and State Spaces and the Number of Agents.
    The observation space consists of variables corresponding to ray casts in different directions,
    plus velocity and direction.
    Each action is a vector with 2 numbers, corresponding to steer left/right and brake/drive (in this order);
    each component is a number between -1 and 1.
    num_agents corresponds to the number of agents using the same brain
    (since all cars use the same action/observation space, they all use the same brain).
    If, in the future, different cars use different observation spaces,
    they will need to be split into different brains.
    """
    # Set the number of actions or action size
    action_size = brain.vector_action_space_size

    # Set the size of state observations or state size
    state_size = brain.vector_observation_space_size

    # Get number of agents in Environment
    env_info = env.reset(train_mode=True, config=env_config)[brain_name]
    num_agents = len(env_info.agents)
    print('\nNumber of Agents: ', num_agents)
    """
    Create an Agent from the Agent Class in Agent.py
    Any agent is initialized with the following parameters.
        ======
        state_size (int): dimension of each state (required)
        action_size (int): dimension of each action (required)
        num_agents (int): number of agents in the unity environment
        seed (int): random seed for initializing training point (default = 0)
    
    Here we initialize an agent using the Unity environment's state and action sizes and the
    number of agents determined above.
    """
    agent: AgentABC = agent_type(state_size=state_size,
                                 action_size=action_size[0],
                                 num_agents=num_agents,
                                 random_seed=0)

    # Load trained model weights
    if load_weights:
        agent.load_weights(weights_path)
    if load_mem:
        agent.load_mem(mem_path)
    """
    ###################################
    STEP 6: Run the Training Sequence
    The training process involves the agent learning from repeated episodes of behaviour
    to map states to actions that maximize the reward received via environmental interaction.
    
    The agent training process involves the following:
    (1) Reset the environment at the beginning of each episode.
    (2) Obtain (observe) current state, s, of the environment at time t
    (3) Perform an action, a(t), in the environment given s(t)
    (4) Observe the result of the action in terms of the reward received and 
        the state of the environment at time t+1 (i.e., s(t+1))
    (5) Update agent memory and learn from experience (i.e., agent.step)
    (6) Update episode score (total reward received) and set s(t) -> s(t+1).
    (7) If episode is done, break and repeat from (1), otherwise repeat from (3).
    
    Below we also exit the training process early if the environment is solved. 
    That is, if the average score for the previous 100 episodes is greater than solved_score.
    """

    best_score = -np.inf  # used to determine the best average score so far (for saving best_weights)
    # loop from num_episodes
    for i_episode in range(1, num_episodes + 1):
        # reset the unity environment at the beginning of each episode
        env_info = env.reset(train_mode=True, config=env_config)[brain_name]

        # get initial state of the unity environment
        states = env_info.vector_observations

        # reset the training agent for new episode
        agent.reset()

        # set the initial episode score to zero.
        agent_scores = np.zeros(num_agents)

        # Run the episode training loop;
        # At each loop step take an action as a function of the current state observations
        # Based on the resultant environmental state (next_state) and reward received update the agent ('step' method)
        # If environment episode is done, exit loop...
        # Otherwise repeat until done == true
        steps = 0
        while True:
            steps = steps + 1
            # determine actions for the unity agents from current state
            actions = agent.act(states)

            # send the actions to the unity agents in the environment and receive resultant environment information
            env_info = env.step(actions)[brain_name]

            next_states = env_info.vector_observations  # get the next states for each unity agent in the environment
            rewards = env_info.rewards  # get the rewards for each unity agent in the environment
            dones = env_info.local_done  # see if episode has finished for each unity agent in the environment

            # Send (S, A, R, S') info to the training agent for replay buffer (memory) and network updates
            agent.step(states, actions, rewards, next_states, dones)

            # set new states to current states for determining next actions
            states = next_states

            # Update episode score for each unity agent
            agent_scores += rewards

            # If any unity agent indicates that the episode is done,
            # then exit episode loop, to begin new episode
            if np.any(dones):
                break

        # Add the episode score to the scores list and
        # calculate the mean score over the last scores_average_window episodes
        # (until i_episode > scores_average_window the mean is taken over all episodes so far)
        episode_scores.append(np.mean(agent_scores))
        average_score = np.mean(
            episode_scores[i_episode -
                           min(i_episode, scores_average_window):i_episode +
                           1])

        # Print current and average score, number of steps in episode.
        print(
            '\nEpisode {}\tEpisode Score: {:.3f}\tAverage Score: {:.3f}\tNumber Of Steps: {}'
            .format(i_episode, episode_scores[i_episode - 1], average_score,
                    steps),
            end="")
        if print_agent_loss:
            # print agent's loss (useful for babysitting the training)
            print('\t episode loss: {}'.format(agent.debug_loss))

        if save_log:
            # Save the recorded Scores data (in weights path)
            if not (os.path.isdir(weights_path)):
                os.mkdir(weights_path)
            scores_filename = "Agent_Scores.csv"
            # noinspection PyTypeChecker
            np.savetxt(os.path.join(weights_path, scores_filename),
                       episode_scores,
                       delimiter=",")

        # Save trained  Actor and Critic network weights after each episode
        agent.save_weights(weights_path)
        if save_best_weights:
            if best_score < average_score:
                best_score = average_score
                agent.save_weights(weights_path + '_best')

        if save_mem and (i_episode % 50) == 0:
            agent.save_mem(mem_path)
        # Check to see if the task is solved (i.e., average_score >= solved_score over scores_average_window episodes).
        # If yes, save the network weights and scores and end training.
        if i_episode > scores_average_window * 2 and average_score >= solved_score:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'
                .format(i_episode, average_score))
            break
    agent.save_mem(mem_path)
    """
    ###################################
    STEP 7: Everything is Finished -> Close the Environment.
    """
    env.close()
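# --- Hedged usage sketch: an illustrative wrapper_config covering the keys read by
# --- train_wrapper above. Every value below is an assumption, not taken from the
# --- original project; DDPG stands for any AgentABC subclass available in the project.
if __name__ == '__main__':
    example_wrapper_config = {
        'num_episodes': 2000,
        'scores_avg_window': 100,
        'solved_score': 30.0,
        'load_weights': False,
        'weights_path': './weights',
        'save_mem': False,
        'load_mem': False,
        'mem_path': './memory',
        'build': 'None',             # 'None' -> no build path is passed to UnityEnvironment
        'show_graphics': False,
        'agent': DDPG,               # any AgentABC subclass (DDPG | MDDPG | MADDPG)
        'print_agent_loss': False,
        'save_score_log': True,
        'save_best_weights': True,
    }
    train_wrapper(env_config={}, wrapper_config=example_wrapper_config)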
예제 #27
0
high = np.ones(39)
action_dim = spaces.Box(-high, high, dtype=np.float32)
agent = SAC(state_dim, action_dim, args)

# TensorboardX
writer = SummaryWriter(logdir='runs/{}_SAC_{}_{}_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env,
    args.policy, "autotune" if args.automatic_entropy_tuning else ""))

# Training Loop
total_numsteps = 0

agent_reward = np.zeros(num_worker)
buffer_reward = np.zeros(num_worker)
done = False
env_info = env.reset(train_mode=True)[default_brain]
states = env_info.vector_observations

while total_numsteps <= args.num_steps:

    actions = agent.act(states)  # Sample action from policy

    env_info = env.step(actions)[default_brain]  # Step
    next_states = env_info.vector_observations
    rewards = env_info.rewards
    dones = env_info.local_done
    critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.step(
        states, actions, rewards, next_states, dones)

    writer.add_scalar('loss/critic_1', critic_1_loss, total_numsteps)
    writer.add_scalar('loss/critic_2', critic_2_loss, total_numsteps)
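    # --- Hedged sketch of the rest of the loop body: log the remaining SAC losses,
    # --- accumulate per-worker rewards and advance the step counter; the reset-on-done
    # --- handling below is an assumption, not taken from the original script.
    writer.add_scalar('loss/policy', policy_loss, total_numsteps)
    writer.add_scalar('loss/entropy_loss', ent_loss, total_numsteps)
    writer.add_scalar('entropy_temperature/alpha', alpha, total_numsteps)

    agent_reward += rewards
    states = next_states
    total_numsteps += 1

    if np.any(dones):
        writer.add_scalar('reward/train', np.mean(agent_reward), total_numsteps)
        agent_reward = np.zeros(num_worker)
        env_info = env.reset(train_mode=True)[default_brain]
        states = env_info.vector_observations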
예제 #28
0
            # Print training progress and log loss and accuracy values to TensorBoard
            if epoch % print_interval == 0 and epoch != 0:
                print("epoch({}) - loss: {:.4f} / accuracy: {:.4f}".format(
                       epoch, np.mean(losses), np.mean(accuracies)))
                agent.Write_Summray(np.mean(losses), np.mean(accuracies), epoch)
                losses = []
                accuracies = []

        # Save the network model
        agent.save_model()

    else:
        env = UnityEnvironment(file_name=env_name)
        default_brain = env.brain_names[0]
        brain = env.brains[default_brain]

        env_info = env.reset(train_mode=train_mode, config=env_config)[default_brain]
        
        for episode in range(test_episode):
            done = False
            episode_rewards = 0

            while not done:
                action = agent.get_action(np.array([env_info.vector_observations[0]]))
                env_info = env.step(action)[default_brain]
                episode_rewards += env_info.rewards[0]
                done = env_info.local_done[0]

            print("Total reward this episode: {}".format(episode_rewards))

        env.close()
예제 #29
0
File: train.py  Project: seongl/rl_bootcamp
def main():
    # Initialize environment
    env = UnityEnvironment(file_name='../env/Hopper/Hopper')

    default_brain = env.brain_names[0]
    brain = env.brains[default_brain]

    env_info = env.reset(train_mode=True)[default_brain]

    obs_dim = env_info.vector_observations[0].shape[0]
    act_dim = brain.vector_action_space_size[0]
    print('State dimension:', obs_dim)
    print('Action dimension:', act_dim)

    # Set a random seed
    np.random.seed(0)
    torch.manual_seed(0)

    # Create a SummaryWriter object by TensorBoard
    dir_name = 'runs/' + 'Hopper' + '_' + time.ctime()
    writer = SummaryWriter(log_dir=dir_name)

    # Main network
    actor = GaussianPolicy(obs_dim, act_dim).to(device)
    qf1 = FlattenMLP(obs_dim + act_dim, 1).to(device)
    qf2 = FlattenMLP(obs_dim + act_dim, 1).to(device)
    # Target network
    qf1_target = FlattenMLP(obs_dim + act_dim, 1).to(device)
    qf2_target = FlattenMLP(obs_dim + act_dim, 1).to(device)

    # Initialize target parameters to match main parameters (a sketch of hard_target_update follows after main())
    hard_target_update(qf1, qf1_target)
    hard_target_update(qf2, qf2_target)

    # Create optimizers
    actor_optimizer = optim.Adam(actor.parameters(), lr=args.actor_lr)
    qf1_optimizer = optim.Adam(qf1.parameters(), lr=args.qf_lr)
    qf2_optimizer = optim.Adam(qf2.parameters(), lr=args.qf_lr)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim, act_dim, args.buffer_size)

    # If automatic entropy tuning is True, initialize a target entropy, a log alpha and an alpha optimizer
    if args.automatic_entropy_tuning:
        target_entropy = -np.prod((act_dim, )).item()
        log_alpha = torch.zeros(1, requires_grad=True, device=device)
        alpha_optimizer = optim.Adam([log_alpha], lr=args.alpha_lr)
    else:
        target_entropy = None
        log_alpha = None
        alpha_optimizer = None

    def run_one_episode(steps, eval_mode):
        total_reward = 0.

        env_info = env.reset(train_mode=True)[default_brain]
        obs = env_info.vector_observations[0]
        done = False

        # Keep interacting until agent reaches a terminal state.
        while not done:
            steps += 1

            if eval_mode:
                action, _, _ = actor(torch.Tensor(obs).to(device))
                action = action.detach().cpu().numpy()
                env_info = env.step(action)[default_brain]

                next_obs = env_info.vector_observations[0]
                reward = env_info.rewards[0]
                done = env_info.local_done[0]
            else:
                # Collect experience (s, a, r, s') using some policy
                _, action, _ = actor(torch.Tensor(obs).to(device))
                action = action.detach().cpu().numpy()
                env_info = env.step(action)[default_brain]

                next_obs = env_info.vector_observations[0]
                reward = env_info.rewards[0]
                done = env_info.local_done[0]

                # Add experience to replay buffer
                replay_buffer.add(obs, action, reward, next_obs, done)

                # Start training when the number of experience is greater than batch size
                if steps > args.batch_size:
                    batch = replay_buffer.sample(args.batch_size)
                    args.alpha = train_model(actor, qf1, qf2, qf1_target,
                                             qf2_target, actor_optimizer,
                                             qf1_optimizer, qf2_optimizer,
                                             batch, target_entropy, log_alpha,
                                             alpha_optimizer)

            total_reward += reward
            obs = next_obs
        return steps, total_reward, args.alpha

    train_sum_returns = 0.
    train_num_episodes = 0

    start_time = time.time()
    steps = 0

    for episode in range(1, args.training_eps + 1):
        # Perform the training phase, during which the agent learns
        eval_mode = False

        # Run one episode
        steps, train_episode_return, args.alpha = run_one_episode(
            steps, eval_mode)

        train_sum_returns += train_episode_return
        train_num_episodes += 1

        train_average_return = train_sum_returns / train_num_episodes if train_num_episodes > 0 else 0.0

        # Log experiment result for training episodes
        writer.add_scalar('Train/AverageReturns', train_average_return,
                          episode)
        writer.add_scalar('Train/EpisodeReturns', train_episode_return,
                          episode)
        if args.automatic_entropy_tuning:
            writer.add_scalar('Train/Alpha', args.alpha, episode)

        # Perform the evaluation phase -- no learning
        if episode > 0 and episode % args.eval_per_train == 0:
            eval_mode = True

            eval_sum_returns = 0.
            eval_num_episodes = 0

            for _ in range(args.evaluation_eps):
                # Run one episode
                steps, eval_episode_return, _ = run_one_episode(
                    steps, eval_mode)

                eval_sum_returns += eval_episode_return
                eval_num_episodes += 1

                eval_average_return = eval_sum_returns / eval_num_episodes if eval_num_episodes > 0 else 0.0

                # Log experiment result for evaluation episodes
                writer.add_scalar('Eval/AverageReturns', eval_average_return,
                                  episode)
                writer.add_scalar('Eval/EpisodeReturns', eval_episode_return,
                                  episode)

            print('---------------------------------------')
            print('Episodes:', episode)
            print('AverageReturn:', round(train_average_return, 2))
            print('EvalEpisodes:', eval_num_episodes)
            print('EvalAverageReturn:', round(eval_average_return, 2))
            print('Time:', int(time.time() - start_time))
            print('---------------------------------------')

            # Save a training model
            if eval_average_return >= args.threshold_return:
                if not os.path.exists('./save_model'):
                    os.mkdir('./save_model')

                ckpt_path = os.path.join('./save_model/' + 'Hopper' + '_ep_' + str(episode) \
                                                                              + '_rt_' + str(round(eval_average_return, 2)) \
                                                                              + '_t_' + str(int(time.time() - start_time)) + '.pt')
                torch.save(actor.state_dict(), ckpt_path)
                break

    env.close()
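# --- Hedged sketch (not from the original project): hard_target_update above is assumed
# --- to copy the main network parameters into the target network, e.g.:
def hard_target_update_sketch(main_net, target_net):
    """Hard update: copy every parameter of main_net into target_net."""
    target_net.load_state_dict(main_net.state_dict())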
예제 #30
0
class SocTwoEnv():
    def __init__(self,
                 env_path,
                 worker_id,
                 train_mode=True,
                 n_str=16,
                 n_goalie=16):
        self.env = UnityEnvironment(file_name=env_path, worker_id=worker_id)
        self.striker_brain_name, self.goalie_brain_name = self.env.brain_names
        self.striker_brain = self.env.brains[self.striker_brain_name]
        self.goalie_brain = self.env.brains[self.goalie_brain_name]
        self.done_str = [False] * 16
        self.done_goalie = [False] * 16
        self.train_mode = train_mode
        self.done_hist_str = [False] * 16
        self.done_hist_goalie = [False] * 16
        self.episode_str_rewards = 0
        self.episode_goalie_rewards = 0
        self.n_str = n_str
        self.n_goalie = n_goalie
        self.act_str_hist = [[] for x in range(n_str)]
        self.act_goalie_hist = [[] for x in range(n_goalie)]
        self.observation_str_hist = [[] for x in range(SIZE_OBSERVATION)]
        self.observation_goalie_hist = [[] for x in range(SIZE_OBSERVATION)]
        self.observation_str = None
        self.observation_goalie = None
        return

    def reset(self):
        """
            Reset all environments and agents.
        """
        self.env_info_str = self.env.reset(
            train_mode=self.train_mode)[self.striker_brain_name]
        self.env_info_goalie = self.env.reset(
            train_mode=self.train_mode)[self.goalie_brain_name]

        self.episode_rewards = 0
        self.done_str = [False] * 16
        self.done_goalie = [False] * 16
        self.done_hist_str = np.array([False] * 16)
        self.done_hist_goalie = np.array([False] * 16)
        return {'str': self.env_info_str, 'goalie': self.env_info_goalie}

    def step(self, action_str, action_goalie):
        """
            At each timestep, give each striker and each goalie an instruction
            to act. Then store the current observations in
            observation_str and observation_goalie.
        """
        self.env_info = self.env.step({
            self.striker_brain_name: action_str,
            self.goalie_brain_name: action_goalie
        })
        self.observation_str = np.array(
            self.env_info[self.striker_brain_name].vector_observations)
        self.observation_goalie = np.array(
            self.env_info[self.goalie_brain_name].vector_observations)
        return self.env_info

    def reward(self):
        self.episode_str_rewards = np.array(
            self.env_info[self.striker_brain_name].rewards)
        self.episode_goalie_rewards = np.array(
            self.env_info[self.goalie_brain_name].rewards)
        return self.episode_str_rewards, self.episode_goalie_rewards

    def close(self):
        """
            Close the simulation Unity environment.
        """
        self.env.close()
        return

    def done(self):
        self.done_str = np.array(
            self.env_info[self.striker_brain_name].local_done)
        self.done_goalie = np.array(
            self.env_info[self.goalie_brain_name].local_done)

    def reset_some_agents(self, str_arg, goalie_arg):
        """
            params:
                str_arg: indices marking which strikers' histories should be cleared.
                goalie_arg: indices marking which goalies' histories should be cleared.
            Clear the history of specific agents.

        """
        for i in str_arg:
            self.act_str_hist[i[0]] = []
            self.observation_str_hist[i[0]] = []
        for i in goalie_arg:
            self.act_goalie_hist[i[0]] = []

    def print_r(self, episode):
        print("Total reward this episode_{}: {}".format(
            episode, self.episode_rewards))
        return
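# --- Hedged usage sketch: drive the two-brain soccer environment with random actions.
# --- The build path and the discrete action ranges below are illustrative assumptions.
if __name__ == '__main__':
    soc_env = SocTwoEnv(env_path='env/SoccerTwos', worker_id=0, train_mode=True)
    soc_env.reset()
    for _ in range(10):
        action_str = np.random.randint(0, 7, size=soc_env.n_str)        # assumed striker action range
        action_goalie = np.random.randint(0, 5, size=soc_env.n_goalie)  # assumed goalie action range
        soc_env.step(action_str, action_goalie)
        striker_rewards, goalie_rewards = soc_env.reward()
        soc_env.done()
    soc_env.close()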