Example #1
class AnimalAIWrapper(gym.Env):
    def __init__(
        self,
        worker_id,
        env_path,
        config_path,
        reduced_actions=False,
        docker_training=False,
    ):
        super(AnimalAIWrapper, self).__init__()
        self.config = ArenaConfig(config_path)
        self.time_limit = self.config.arenas[0].t

        self.env = UnityEnvironment(
            file_name=env_path,
            worker_id=worker_id,
            seed=worker_id,
            n_arenas=1,
            arenas_configurations=self.config,
            docker_training=docker_training,
        )

        lookup_func = lambda a: {"Learner": np.array([a], dtype=float)}
        if reduced_actions:
            lookup = itertools.product([0, 1], [0, 1, 2])
        else:
            lookup = itertools.product([0, 1, 2], repeat=2)
        lookup = dict(enumerate(map(lookup_func, lookup)))
        self.action_map = lambda a: lookup[a]

        self.observation_space = gym.spaces.Box(0,
                                                255, [84, 84, 3],
                                                dtype=np.uint8)
        self.action_space = gym.spaces.Discrete(len(lookup))
        self.t = 0

    def _process_state(self, state):
        img = 255 * state["Learner"].visual_observations[0][0]
        vec = state["Learner"].vector_observations[0]
        r = state["Learner"].rewards[0]
        done = state["Learner"].local_done[0]
        return np.uint8(img), vec, r, done

    def reset(self):
        self.t = 0
        img, vec, r, done = self._process_state(
            self.env.reset(arenas_configurations=self.config))
        while done:
            img, vec, r, done = self._process_state(
                self.env.reset(arenas_configurations=self.config))
        return img

    def step(self, action):
        obs, vec, r, done = self._process_state(
            self.env.step(vector_action=self.action_map(action.item())))
        self.t += 1
        done = done or self.t >= self.time_limit
        return obs, r, done, {}
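
A minimal usage sketch for the wrapper above, assuming the class and its dependencies (gym, numpy, itertools, UnityEnvironment, ArenaConfig) are already imported; the binary and config paths are placeholders for your local files.

import numpy as np

# Placeholder paths; point these at your local AnimalAI binary and arena config.
wrapped = AnimalAIWrapper(worker_id=0,
                          env_path="env/AnimalAI",
                          config_path="configs/1-Food.yaml",
                          reduced_actions=True)

obs = wrapped.reset()                                 # uint8 image, 84x84x3 by default
for _ in range(10):
    action = np.array(wrapped.action_space.sample())  # step() calls action.item()
    obs, reward, done, _ = wrapped.step(action)
    if done:
        obs = wrapped.reset()
wrapped.env.close()                                   # close the underlying UnityEnvironment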
Example #2
class AnimalAIEnv(gym.Env):
    """
    Provides Gym wrapper for Unity Learning Environments.
    Multi-agent environments use lists for object types, as done here:
    https://github.com/openai/multiagent-particle-envs
    """

    def __init__(self,
                 environment_filename: str,
                 worker_id=0,
                 docker_training=False,
                 n_arenas=1,
                 seed=0,
                 arenas_configurations=None,
                 greyscale=False,
                 retro=True,
                 inference=False,
                 resolution=None):
        """
        Environment initialization
        :param environment_filename: The UnityEnvironment path or file to be wrapped in the gym.
        :param worker_id: Worker number for environment.
        :param docker_training: Whether this is running within a docker environment and should use a virtual
            frame buffer (xvfb).
        :param n_arenas: number of arenas to create in the environment (one agent per arena)
        :param arenas_configurations: an ArenaConfig to configure the items present in each arena, will spawn random
            objects randomly if not provided
        :param greyscale: whether the visual observations should be converted to greyscale
        :param retro: resizes visual observations to 84x84 (uint8) and flattens the action space
        """
        self._env = UnityEnvironment(file_name=environment_filename,
                                     worker_id=worker_id,
                                     seed=seed,
                                     docker_training=docker_training,
                                     n_arenas=n_arenas,
                                     arenas_configurations=arenas_configurations,
                                     inference=inference,
                                     resolution=resolution)
        # self.name = self._env.academy_name
        self.vector_obs = None
        self.inference = inference
        self.resolution = resolution
        self._current_state = None
        self._n_agents = None
        self._flattener = None
        self._greyscale = greyscale or retro
        # self._seed = None
        self.retro = retro
        self.game_over = False  # Hidden flag used by Atari environments to determine if the game is over
        self.arenas_configurations = arenas_configurations

        self.flatten_branched = self.retro
        self.uint8_visual = self.retro

        # Check brain configuration
        if len(self._env.brains) != 1:
            raise UnityGymException(
                "There can only be one brain in a UnityEnvironment "
                "if it is wrapped in a gym.")
        self.brain_name = self._env.external_brain_names[0]
        brain = self._env.brains[self.brain_name]

        if brain.number_visual_observations == 0:
            raise UnityGymException("Environment provides no visual observations.")

        if brain.num_stacked_vector_observations != 1:
            raise UnityGymException("Environment provides no vector observations.")

        # Check for number of agents in scene.
        initial_info = self._env.reset(arenas_configurations=arenas_configurations)[self.brain_name]
        self._check_agents(len(initial_info.agents))

        if self.retro and self._n_agents > 1:
            raise UnityGymException("Only one agent is allowed in retro mode, set n_agents to 1.")

        # Set observation and action spaces
        if len(brain.vector_action_space_size) == 1:
            self._action_space = spaces.Discrete(brain.vector_action_space_size[0])
        else:
            if self.flatten_branched:
                self._flattener = ActionFlattener(brain.vector_action_space_size)
                self._action_space = self._flattener.action_space
            else:
                self._action_space = spaces.MultiDiscrete(brain.vector_action_space_size)

        # high = np.array([np.inf] * brain.vector_observation_space_size)
        self.action_meanings = brain.vector_action_descriptions

        # if self.visual_obs:
        if self._greyscale:
            depth = 1
        else:
            depth = 3

        if self.retro:
            image_space_max = 255
            image_space_dtype = np.uint8
            camera_height = 84
            camera_width = 84

            image_space = spaces.Box(
                0, image_space_max,
                dtype=image_space_dtype,
                shape=(camera_height, camera_width, depth)
            )

            self._observation_space = image_space
        else:
            image_space_max = 1.0
            image_space_dtype = np.float32
            camera_height = brain.camera_resolutions[0]["height"]
            camera_width = brain.camera_resolutions[0]["width"]
            max_float = np.finfo(np.float32).max

            image_space = spaces.Box(
                0, image_space_max,
                dtype=image_space_dtype,
                shape=(self._n_agents, camera_height, camera_width, depth)
            )
            vector_space = spaces.Box(-max_float, max_float,
                                      shape=(self._n_agents, brain.vector_observation_space_size))
            self._observation_space = spaces.Tuple((image_space, vector_space))

    def reset(self, arenas_configurations=None):
        """Resets the state of the environment and returns an initial observation.
        In the case of multi-agent environments, this is a list.
        Returns: observation (object/list): the initial observation of the
            space.
        """
        info = self._env.reset(arenas_configurations=arenas_configurations)[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)
        self.game_over = False

        if self._n_agents == 1:
            obs, reward, done, info = self._single_step(info)
        else:
            obs, reward, done, info = self._multi_step(info)
        return obs

    def step(self, action):
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, done, info).
        In the case of multi-agent environments, these are lists.
        Args:
            action (object/list): an action provided by the agent
        Returns:
            observation (object/list): agent's observation of the current environment
            reward (float/list) : amount of reward returned after previous action
            done (boolean/list): whether the episode has ended.
            info (dict): contains auxiliary diagnostic information, including BrainInfo.
        """

        # Use random actions for all other agents in environment.
        if self._n_agents > 1:
            if not isinstance(action, list):
                raise UnityGymException("The environment was expecting `action` to be a list.")
            if len(action) != self._n_agents:
                raise UnityGymException(
                    "The environment was expecting a list of {} actions.".format(self._n_agents))
            else:
                if self._flattener is not None:
                    # Action space is discrete and flattened - we expect a list of scalars
                    action = [self._flattener.lookup_action(_act) for _act in action]
                action = np.array(action)
        else:
            if self._flattener is not None:
                # Translate action into list
                action = self._flattener.lookup_action(action)

        info = self._env.step(action)[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)
        self._current_state = info

        if self._n_agents == 1:
            obs, reward, done, info = self._single_step(info)
            self.game_over = done
        else:
            obs, reward, done, info = self._multi_step(info)
            self.game_over = all(done)
        return obs, reward, done, info

    def _single_step(self, info):

        self.visual_obs = self._preprocess_single(info.visual_observations[0][0, :, :, :])
        self.vector_obs = info.vector_observations[0]

        if self._greyscale:
            self.visual_obs = self._greyscale_obs_single(self.visual_obs)

        if self.retro:
            self.visual_obs = self._resize_observation(self.visual_obs)
            default_observation = self.visual_obs
        else:
            default_observation = self.visual_obs, self.vector_obs

        return default_observation, info.rewards[0], info.local_done[0], {
            "text_observation": info.text_observations[0],
            "brain_info": info}

    def _preprocess_single(self, single_visual_obs):
        if self.uint8_visual:
            return (255.0 * single_visual_obs).astype(np.uint8)
        else:
            return single_visual_obs

    def _multi_step(self, info):

        self.visual_obs = self._preprocess_multi(info.visual_observations)
        self.vector_obs = info.vector_observations

        if self._greyscale:
            self.visual_obs = self._greyscale_obs_multi(self.visual_obs)

        default_observation = self.visual_obs

        return list(default_observation), info.rewards, info.local_done, {
            "text_observation": info.text_observations,
            "brain_info": info}

    def _preprocess_multi(self, multiple_visual_obs):
        if self.uint8_visual:
            return [(255.0 * _visual_obs).astype(np.uint8) for _visual_obs in multiple_visual_obs]
        else:
            return multiple_visual_obs

    def render(self, mode='rgb_array'):
        return self.visual_obs

    def close(self):
        """Override _close in your subclass to perform any necessary cleanup.
        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        self._env.close()

    def get_action_meanings(self):
        return self.action_meanings

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        Currently not implemented.
        """
        logger.warning("Could not seed environment %s", self.brain_name)
        return

    @staticmethod
    def _resize_observation(observation):
        """
        Re-sizes visual observation to 84x84
        """
        obs_image = Image.fromarray(observation)
        obs_image = obs_image.resize((84, 84), Image.NEAREST)
        return np.array(obs_image)

    def _greyscale_obs_single(self, obs):
        new_obs = np.floor(np.expand_dims(np.mean(obs, axis=2), axis=2)).squeeze().astype(np.uint8)
        return new_obs

    def _greyscale_obs_multi(self, obs):
        new_obs = [np.floor(np.expand_dims(np.mean(o, axis=2), axis=2)).squeeze().astype(np.uint8) for o in obs]
        return new_obs

    def _check_agents(self, n_agents):
        # if n_agents > 1:
        #     raise UnityGymException(
        #         "The environment was launched as a single-agent environment, however"
        #         "there is more than one agent in the scene.")
        # elif self._multiagent and n_agents <= 1:
        #     raise UnityGymException(
        #         "The environment was launched as a multi-agent environment, however"
        #         "there is only one agent in the scene.")
        if self._n_agents is None:
            self._n_agents = n_agents
            logger.info("{} agents within environment.".format(n_agents))
        elif self._n_agents != n_agents:
            raise UnityGymException("The number of agents in the environment has changed since "
                                    "initialization. This is not supported.")

    @property
    def metadata(self):
        return {'render.modes': ['rgb_array']}

    @property
    def reward_range(self):
        return -float('inf'), float('inf')

    @property
    def spec(self):
        return None

    @property
    def action_space(self):
        return self._action_space

    @property
    def observation_space(self):
        return self._observation_space

    @property
    def number_agents(self):
        return self._n_agents
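
A minimal usage sketch for AnimalAIEnv in retro mode (single arena, flattened Discrete action space, 84x84 greyscale uint8 observations); the environment path below is a placeholder, and an ArenaConfig can optionally be passed via arenas_configurations.

# Placeholder path to the AnimalAI binary; without arenas_configurations, random objects are spawned.
env = AnimalAIEnv(environment_filename="env/AnimalAI",
                  worker_id=1,
                  n_arenas=1,
                  retro=True)

obs = env.reset()                        # 84x84 greyscale uint8 observation
for _ in range(5):
    action = env.action_space.sample()   # index into the flattened action space
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()
env.close()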
Example #3
    s_idx = image_buffer.get_current_index()
    input_state = image_buffer.get_state(s_idx)

    ep_reward = 0.0
    ep_count = 0

    epsilon = max(1.0 * (1 - epi_i / 100), 0.05 * (1 - epi_i / 1000))

    while True:
        a_category = q_main.epsilon_sample(
            torch.FloatTensor(input_state).to(device).view(
                1, input_channel_size, height, width), epsilon)
        a_deploy = action_dict[a_category]

        info = env.step(a_deploy)["Learner"]

        end = info.local_done[0]

        ep_count += 1
        r = info.rewards[0] * reward_scale
        s2_frame = info.visual_observations[0][0]

        image_buffer.animal_add(s2_frame)
        s2_idx = image_buffer.get_current_index()
        input_state = image_buffer.get_state(s2_idx)

        replay_buffer.store(np.array([s_idx]), np.array([a_category]),
                            np.array([r]), np.array([end]), np.array([s2_idx]))

        s_idx = s2_idx
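
The epsilon schedule above decays linearly from 1.0 during roughly the first hundred episodes, after which the second term provides a small exploration floor (about 0.05) that itself decays to zero by episode 1000. A standalone check of that schedule, independent of the rest of the example:

def epsilon_at(epi_i):
    # same formula as in the training loop above
    return max(1.0 * (1 - epi_i / 100), 0.05 * (1 - epi_i / 1000))

for epi_i in (0, 50, 100, 500, 1000):
    print(epi_i, round(epsilon_at(epi_i), 4))   # 1.0, 0.5, 0.045, 0.025, 0.0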
Example #4
class Worker(object):
    def __init__(self, name, globalAC):
        env_id = int(name[-1])
        self.env = UnityEnvironment(file_name='env/AnimalAI',
                                    worker_id=env_id,
                                    seed=0,
                                    docker_training=False,
                                    n_arenas=1,
                                    play=False,
                                    inference=True,
                                    resolution=None)

        reset = self.env.reset(train_mode=True)

        self.name = name
        self.AC = ACNet(name, globalAC)

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []

        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            #reset = self.env.reset(train_mode=True)
            reset = self.env.reset(train_mode=True,
                                   arenas_configurations=ARENA)

            brain = reset['Learner']
            s = np.array(brain.visual_observations,
                         dtype='float32').reshape(84, 84,
                                                  3).flatten()[np.newaxis, :]
            ep_r = 0

            rnn_state = self.AC.state_init

            for ep_t in range(MAX_STEPS):
                a = self.AC.choose_action(s, rnn_state)
                rnn_state = a[2]

                if a[0] == 0:
                    info = [
                        self.env.step(vector_action=[0, 1]) for i in range(30)
                    ][-1]
                else:
                    info = self.env.step(vector_action=a[1])
                brain = info['Learner']
                s_ = np.array(brain.visual_observations,
                              dtype='float32').reshape(
                                  84, 84, 3).flatten()[np.newaxis, :]
                r = brain.rewards[0]
                done = brain.local_done[0]

                end = (ep_t == MAX_STEPS - 1)
                if r == 0:
                    r = -0.0125  # small step penalty when no reward is given
                ep_r += r

                buffer_s.append(s)
                buffer_a.append(a[0])
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or end:  # update the global network
                    if end:
                        v_s_ = 0
                    else:
                        v_s_ = SESS.run(
                            self.AC.v, {
                                self.AC.s: s_,
                                self.AC.state_in[0]: rnn_state[0],
                                self.AC.state_in[1]: rnn_state[1]
                            })[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(
                        buffer_s), np.array(buffer_a), np.vstack(
                            buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                        self.AC.state_in[0]: rnn_state[0],
                        self.AC.state_in[1]: rnn_state[1]
                    }
                    self.AC.update_global(feed_dict)

                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1

                if end:
                    if len(GLOBAL_RUNNING_R) == 0:  # record episode rewards (running average)
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] +
                                                0.01 * ep_r)
                    break
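
The update block in work() converts the reward buffer into value targets by walking it backwards and bootstrapping from the critic's estimate of the state after the last stored step (or 0 at episode end), i.e. discounted returns G_t = r_t + GAMMA * G_{t+1}. A standalone illustration with assumed values:

# Standalone illustration of the backward value-target computation used above.
GAMMA = 0.9                   # discount factor assumed for this illustration
buffer_r = [0.1, 0.2, 0.3]    # example rewards, oldest first
v_s_ = 1.0                    # bootstrap value of the state after the last reward

buffer_v_target = []
for r in buffer_r[::-1]:      # newest reward first
    v_s_ = r + GAMMA * v_s_
    buffer_v_target.append(v_s_)
buffer_v_target.reverse()     # back to oldest-first order
print(buffer_v_target)        # approximately [1.252, 1.28, 1.2]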
Example #5
    model.load_state_dict(torch.load("./models/dqn/dqn.pt"))

    env = UnityEnvironment(file_name=env_path)
    # reset the environment
    action_info = env.reset(arenas_configurations_input=arena_config_in, train_mode=False)
    obs = action_info[brain_name].visual_observations[0][0]
    state = get_state(obs)

    for step in range(1000):
        time.sleep(0.05)
        # greedy action from the trained model
        action_values = model(state)
        action = np.argmax(action_values.cpu().data.numpy())
        conv_action = convert_action(action)
    
        action_info = env.step(conv_action)
        obs = action_info[brain_name].visual_observations[0][0]
        reward = action_info[brain_name].rewards[0]
        done = action_info[brain_name].local_done[0]
        max_reach = action_info[brain_name].max_reached
        next_state = get_state(obs)
        state = next_state
        # debug output
        #print('\n ===== {} step ======'.format(step))
        #print('\naction=', action)
        #print('\nstate=', state.shape)
        #print('\nreward=', reward)
        #print('\ndone=', done)
        #print('\nmax_reach=', max_reach)

    #plt.imshow(state[0][0])
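
Example #5 relies on two helpers that are not shown: get_state, which turns the raw visual observation into a batched torch tensor for the network, and convert_action, which maps the network's discrete output index onto the two-branch AnimalAI action vector. The sketch below is a hypothetical reconstruction of those helpers, not the original implementation:

import itertools

import numpy as np
import torch

def get_state(obs):
    # (84, 84, 3) float image in [0, 1] -> (1, 3, 84, 84) float tensor
    return torch.from_numpy(np.transpose(obs, (2, 0, 1))).float().unsqueeze(0)

# All combinations of the two 3-valued action branches (movement, rotation).
_ACTIONS = list(itertools.product([0, 1, 2], repeat=2))

def convert_action(action_index):
    # discrete index -> [movement, rotation] vector expected by env.step()
    return list(_ACTIONS[int(action_index)])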