Example #1
def test(seed, model_filename, vec_filename, train, test, body_info=0, render=False):
    print("Testing:")
    print(f" Seed {seed}, model {model_filename} vec {vec_filename}")
    print(f" Train on {train}, test on {test}, w/ bodyinfo {body_info}")
    eval_env = utils.make_env(render=render, robot_body=test, body_info=body_info)
    eval_env = DummyVecEnv([eval_env])
    eval_env = VecNormalize.load(vec_filename, eval_env)
    eval_env.norm_reward = False

    eval_env.seed(seed)
    model = PPO.load(model_filename)

    obs = eval_env.reset()
    if render:
        eval_env.env_method("set_view")
    distance_x = 0
    # print(obs)
    total_reward = 0
    for step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        if done:
            break
        else:  # the final observation comes after the env auto-resets, so skip it
            distance_x = eval_env.envs[0].robot.body_xyz[0]
        total_reward += reward[0]
        if render:
            time.sleep(0.01)

    eval_env.close()
    print(f"train {train}, test {test}, body_info {body_info}, step {step}, total_reward {total_reward}, distance_x {distance_x}")
    return total_reward, distance_x
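
A minimal invocation of this evaluation helper could look like the sketch below. The checkpoint path, VecNormalize statistics file, and robot body names are placeholders rather than artifacts from this project, and `utils.make_env` is assumed to be importable as in the example above.

# Hypothetical usage sketch for test(); all file names and body names are placeholders.
rewards, distances = [], []
for seed in range(3):
    r, d = test(seed=seed,
                model_filename="models/ppo_model.zip",      # assumed checkpoint path
                vec_filename="models/vecnormalize.pkl",     # assumed VecNormalize stats
                train="body_a",                             # body the model was trained on
                test="body_b",                              # body to evaluate on
                body_info=0)
    rewards.append(r)
    distances.append(d)
print(f"mean reward {sum(rewards) / len(rewards):.2f}, "
      f"mean distance {sum(distances) / len(distances):.2f}")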
Example #2
def test(test_n, seed, model_filename, vec_filename, train, test, test_as_class=0, render=False, save_file="default.yml"):

    print("Testing:")
    total_rewards = []
    distance_xs = []
    for i in range(test_n):
        print(f" Seed {seed+i}, model {model_filename} vec {vec_filename}")
        print(f" Train on {train}, test on {test}, w/ bodyinfo {test_as_class}")
        eval_env = utils.make_env(render=render, wrapper=None, robot_body=test, body_info=test_as_class)
        eval_env = DummyVecEnv([eval_env])
        eval_env = VecNormalize.load(vec_filename, eval_env)
        eval_env.norm_reward = False

        eval_env.seed(seed+i)
        model = PPO.load(model_filename)

        obs = eval_env.reset()
        if render:
            eval_env.env_method("set_view")
        distance_x = 0
        # print(obs)
        total_reward = 0
        for step in range(1000):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = eval_env.step(action)
            if done:
                break
            else:  # the final observation comes after the env auto-resets, so skip it
                distance_x = eval_env.envs[0].robot.body_xyz[0]
            total_reward += reward[0]
            if render:
                time.sleep(0.01)

        eval_env.close()
        print(f"train {train}, test {test}, test_as_class {test_as_class}, step {step}, total_reward {total_reward}, distance_x {distance_x}")

        total_rewards.append(total_reward)
        distance_xs.append(distance_x)

    # convert numpy floats to plain Python floats so yaml serializes them cleanly
    total_rewards = [float(x) for x in total_rewards]
    distance_xs = [float(x) for x in distance_xs]

    data = {
        "title": "test",
        "train": train,
        "test": test,
        "total_reward": total_rewards,
        "distance_x": distance_xs,
    }
    with open(f"{save_file}", "w") as f:
        yaml.dump(data, f)
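
The results file can be read back later with `yaml.safe_load`; a small sketch, assuming the default file name used above:

# Sketch: load and summarize the results written by test() above.
import yaml

with open("default.yml") as f:
    results = yaml.safe_load(f)

mean_reward = sum(results["total_reward"]) / len(results["total_reward"])
mean_distance = sum(results["distance_x"]) / len(results["distance_x"])
print(f"train {results['train']} -> test {results['test']}: "
      f"mean reward {mean_reward:.2f}, mean distance {mean_distance:.2f}")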
Example #3
def test(seed,
         model,
         train,
         test,
         normalize_kwargs,
         body_info=0,
         render=False):
    print("Testing:")
    print(f" Train on {train}, test on {test}, w/ bodyinfo {body_info}")
    eval_env = DummyVecEnv([
        utils.make_env(rank=0,
                       seed=utils.seed + 1,
                       render=False,
                       robot_body=test,
                       body_info=0)
    ])
    eval_env = VecNormalize(eval_env, norm_reward=False, **normalize_kwargs)
    eval_env.seed(seed)

    obs = eval_env.reset()
    if render:
        eval_env.env_method("set_view")
    distance_x = 0
    # print(obs)
    total_reward = 0
    for step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        if done:
            break
        else:  # the final observation comes after the env auto-resets, so skip it
            distance_x = eval_env.envs[0].robot.body_xyz[0]
        total_reward += reward[0]
        if render:
            time.sleep(0.01)

    eval_env.close()
    print(
        f"train {train}, test {test}, body_info {body_info}, step {step}, total_reward {total_reward}, distance_x {distance_x}"
    )
    return total_reward, distance_x
            success = True
            try:
                # generate a fixed room layout; on failure, mark unsuccessful and retry
                fix_room = room_utils.generate_room(dim_room=dim_room,
                                                    num_steps=num_gen_steps,
                                                    num_boxes=num_boxes,
                                                    second_player=False)
                _, state, _ = fix_room
            except Exception:
                success = False
        for i in range(len(version_li)):
            version = version_li[i]
            load_path = '{}/agent_v{}.zip'.format(load_dir, version)
            agent.set_parameters(load_path, exact_match=True)
            # agent = agent_li[i]
            done = False
            obs = np.expand_dims(soko_env.env_method('manual_reset', state)[0],
                                 axis=0)
            while not done:
                action, _ = agent.predict(obs, deterministic=True)
                obs, _, done, info = soko_env.step(action)

            # solved: count it and track whether this version is the only solver
            if info[0]["all_boxes_on_target"]:
                num_solved_li[i] += 1
                if unique_solver_idx == -1:
                    unique_solver_idx = i
                else:
                    unique_solver_idx = -2  # solved by more than one version

        if unique_solver_idx >= 0:
            num_unique_solved_li[unique_solver_idx] += 1
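
The loop above is an excerpt and relies on bookkeeping variables initialized outside it; a plausible setup, with hypothetical values for the names it references, might be:

# Hypothetical setup assumed by the evaluation excerpt above (not from the source).
dim_room = (10, 10)
num_boxes = 4
num_gen_steps = 30                              # assumed room-generation budget
load_dir = "checkpoints"                        # assumed checkpoint directory
version_li = [1, 2]                             # agent versions to compare
num_solved_li = [0] * len(version_li)           # rooms solved per version
num_unique_solved_li = [0] * len(version_li)    # rooms solved by exactly one version
unique_solver_idx = -1                          # reset to -1 before each generated room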
class ALGEnv(gym.Env):
    metadata = {
        'render.modes':
        ['human', 'rgb_array', 'tiny_human', 'tiny_rgb_array', 'np_array']
    }

    def __init__(self,
                 dim_room=(10, 10),
                 num_boxes=4,
                 reset=True,
                 log_interval=1000,
                 alg_version=0,
                 train_mode='cnn',
                 agent_lb_path=None,
                 agent_ub_path=None,
                 init_probs=(0.5, 0.5, 0.5)):  # tuple default avoids a shared mutable list

        assert train_mode in TRAIN_MODES
        self.train_mode = train_mode
        if log_interval > 0:
            self.log_train_info = True
        else:
            self.log_train_info = False

        # 0: basic playable map
        # 1: playable map
        # 2: hardness adjustable map
        self.alg_version = alg_version
        if alg_version == 0:
            pass
        else:
            env_li = [
                lambda: SokobanEnv(dim_room=dim_room,
                                   max_steps=50,
                                   num_boxes=num_boxes,
                                   train_mode=train_mode,
                                   log_train_info=False)
            ]
            self.soko_env = DummyVecEnv(env_li)
            self.agent_ub = PPO.load(agent_ub_path, env=self.soko_env)
            print('loaded', agent_ub_path, 'as ub')
            if alg_version == 2:
                self.agent_lb = PPO.load(agent_lb_path, env=self.soko_env)
                print('loaded', agent_lb_path, 'as lb')

        # General Configuration
        self.dim_room = dim_room
        self.num_boxes = num_boxes
        self.num_players = 1

        # Training hyperparameters
        self.max_prefer_subs = dim_room[0] * dim_room[1] // 2
        self.place_target_prob = init_probs[0]
        self.place_box_prob = init_probs[1]
        self.place_player_prob = init_probs[2]

        # Log info
        self.start_time = time.time()
        self.train_result_summary = {-1: 0, 0: 0, 1: 0, 2: 0}
        self.fail_type_summary = {-1: 0, 0: 0, 1: 0, 2: 0}
        # self.sample_map = False
        self.episode_reward = 0
        self.total_reward_per_log_interval = 0
        self.total_steps_per_log_interval = 0
        self.total_subs_per_log_interval = 0
        self.log_interval = log_interval
        self.reseted = False
        self.train_counter = 0

        # Env properties
        self.map = None

        # Penalties and Rewards
        self.penalty_sub_wrong_tile = -5
        self.penalty_exc_btp_tiles = -10
        self.penalty_bad_map_design = -50
        self.penalty_generation_fail = -50
        self.penalty_exc_subs = -10

        self.reward_neighbor_valid_tiles = 2
        self.reward_place_btp_tiles = 5
        self.reward_basic_playable = 40

        if alg_version == 1:
            # too hard or unsolvable
            self.penalty_agent_ub_thou = -30
            self.reward_agent_ub_solvable = 50
        elif alg_version == 2:
            self.penalty_agent_lb_solvable = -30
            self.penalty_agent_ub_thou = -30
            self.reward_agent_ub_solvable = 10
            self.reward_agent_lb_thou = 50

        # Generation Track
        self.placed_player = 0
        self.placed_boxes = 0
        self.placed_target = 0
        self.env_steps = 0

        # Env Settings
        self.viewer = None
        self.max_steps = dim_room[0] * dim_room[1]
        self.action_space = MultiDiscrete([dim_room[0], dim_room[1], 5])

        if train_mode == 'cnn':
            self.scale = 6
            screen_height, screen_width = (dim_room[0] * self.scale,
                                           dim_room[1] * self.scale)
            self.observation_space = Box(low=0,
                                         high=255,
                                         shape=(screen_height, screen_width,
                                                3),
                                         dtype=np.uint8)
        else:
            self.observation_space = Box(low=0,
                                         high=6,
                                         shape=(dim_room[0], dim_room[1]),
                                         dtype=np.uint8)

        if reset:
            # Initialize Room
            _ = self.reset()

    def random_init_map(self):
        room = np.zeros((self.dim_room[0], self.dim_room[1]), dtype=np.uint8)
        for _ in range(self.num_boxes):
            if np.random.rand(1) < self.place_target_prob:
                x, y = np.random.randint(1, self.dim_room[0] - 1, size=2)
                room[x, y] = 2
            if np.random.rand(1) < self.place_box_prob:
                x, y = np.random.randint(1, self.dim_room[0] - 1, size=2)
                room[x, y] = 4

        for _ in range(self.num_players):
            if np.random.rand(1) < self.place_player_prob:
                x, y = np.random.randint(1, self.dim_room[0] - 1, size=2)
                room[x, y] = 5

        self.placed_target += np.count_nonzero(room == 2)
        self.placed_boxes += np.count_nonzero(room == 4)
        self.placed_player += np.count_nonzero(room == 5)

        return room

    def reset(self):
        self.placed_player = 0
        self.placed_boxes = 0
        self.placed_target = 0
        self.map = self.random_init_map()
        self.env_steps = 0
        self.episode_subs = 0
        self.episode_reward = 0
        self.reseted = True

        if self.train_mode == 'cnn':
            starting_observation = self.render('tiny_rgb_array',
                                               scale=self.scale)
        else:
            starting_observation = self.render('np_array')
        return starting_observation

    def soko_agent_test(self):
        reward = 0

        # v1
        if self.alg_version == 1:
            done = False
            obs = self.soko_env.env_method('manual_reset', self.map)
            while not done:
                action, _ = self.agent_ub.predict(obs, deterministic=True)
                obs, _, done, info = self.soko_env.step(action)

            # agent_ub solvable
            if info[0]["all_boxes_on_target"]:
                reward += self.reward_agent_ub_solvable
                train_result = 0  # good map
            else:
                reward += self.penalty_agent_ub_thou
                train_result = 2  # thou (too hard or unsolvable) map

        # v2
        else:
            done = False
            obs = self.soko_env.env_method('manual_reset', self.map)
            while not done:
                action, _ = self.agent_ub.predict(obs, deterministic=True)
                obs, _, done, info = self.soko_env.step(action)

            # agent_ub thou
            if not info[0]["all_boxes_on_target"]:
                reward += self.penalty_agent_ub_thou
                train_result = 2  # thou (too hard or unsolvable)

            # agent_ub solvable
            else:
                reward += self.reward_agent_ub_solvable
                done = False
                obs = self.soko_env.env_method('manual_reset', self.map)
                while not done:
                    action, _ = self.agent_lb.predict(obs, deterministic=True)
                    obs, _, done, info = self.soko_env.step(action)

                # agent_lb solvable
                if info[0]["all_boxes_on_target"]:
                    reward += self.penalty_agent_lb_solvable
                    train_result = 1  # too easy
                else:
                    reward += self.reward_agent_lb_thou
                    train_result = 0  # good map

        return reward, train_result

    def step(self, action):
        '''
        Tile type:
            0: Wall
            1: Floor
            2: Target
            3: Box On Target
            4: Box
            5: Player
            6: Player On Target
        act:
            0: Finish Generation
            1: Floor
            2: Box Target
            3: Box
            4: Player
        '''
        x, y, act = action
        reward = 0
        done = False
        self.env_steps += 1
        # not finish generation
        if act != 0:
            if self.map[x][y] != 0:
                reward += self.penalty_sub_wrong_tile

            # is wall tile, can substitute
            else:
                for (_x, _y) in [(x - 1, y), (x, y - 1), (x, y + 1),
                                 (x + 1, y)]:
                    if _x in range(self.dim_room[0]) and _y in range(
                            self.dim_room[1]):
                        if self.map[_x, _y] != 0:
                            reward += self.reward_neighbor_valid_tiles

                if act == 1:
                    self.map[x][y] = 1
                    self.episode_subs += 1
                    if self.episode_subs >= self.max_prefer_subs:
                        reward += self.penalty_exc_subs
                        # print(self.episode_subs)

                # place box target
                elif act == 2:
                    if self.placed_target >= self.num_boxes:
                        reward += self.penalty_exc_btp_tiles
                    else:
                        self.placed_target += 1
                        self.map[x][y] = 2
                        self.episode_subs += 1
                        reward += self.reward_place_btp_tiles
                        if self.episode_subs >= self.max_prefer_subs:
                            reward += self.penalty_exc_subs
                            # print(self.episode_subs)

                # place box
                elif act == 3:
                    if self.placed_boxes >= self.num_boxes:
                        reward += self.penalty_exc_btp_tiles
                    else:
                        self.placed_boxes += 1
                        self.map[x][y] = 4
                        self.episode_subs += 1
                        reward += self.reward_place_btp_tiles
                        if self.episode_subs >= self.max_prefer_subs:
                            reward += self.penalty_exc_subs
                            # print(self.episode_subs)

                # place player
                elif act == 4:
                    if self.placed_player >= self.num_players:
                        reward += self.penalty_exc_btp_tiles
                    else:
                        self.placed_player += 1
                        self.map[x][y] = 5
                        self.episode_subs += 1
                        reward += self.reward_place_btp_tiles
                        if self.episode_subs >= self.max_prefer_subs:
                            reward += self.penalty_exc_subs
                            # print(self.episode_subs)

            if self.is_maxsteps():
                done = True

        # finished generation
        else:
            done = True

        if done:
            _train_result = -1  # not used for training
            _fail_type = -1  # not failed
            if (self.placed_player != self.num_players
                    or self.placed_boxes != self.num_boxes
                    or self.placed_target != self.num_boxes):
                reward += self.penalty_generation_fail
                _fail_type = 0  # wrong number btp tiles
            else:
                if not self.basic_playable(self.map):
                    reward += self.penalty_bad_map_design
                    _fail_type = 1  # not basic playable
                else:
                    reward += self.reward_basic_playable
                    if self.alg_version == 0:
                        _train_result = 0
                    else:
                        _train_reward, _train_result = self.soko_agent_test()
                        reward += _train_reward

        self.episode_reward += reward

        # Convert the observation to RGB frame
        if self.train_mode == 'cnn':
            observation = self.render(mode='tiny_rgb_array', scale=self.scale)
        else:
            observation = self.render(mode='np_array')

        info = {
            "coordinate": (x, y),
            "action": act,
            "curr_steps": self.env_steps,
        }

        if self.reseted:
            self.reseted = False
            self.train_counter += 1

        if done:
            info["total_steps"] = self.env_steps
            info["train_result"] = _train_result
            info['fail_type'] = _fail_type

            self.train_result_summary[_train_result] += 1
            self.fail_type_summary[_fail_type] += 1
            self.total_reward_per_log_interval += self.episode_reward
            self.total_steps_per_log_interval += self.env_steps
            self.total_subs_per_log_interval += self.episode_subs

            # if _fail_type == -1 and self.sample_map:
            #     print('Sample map:')
            #     print(self.map)
            #     print('*********************************************')
            # self.sample_map = False

            if self.log_train_info and self.train_counter % self.log_interval == 0:
                end_time = time.time()
                duration = end_time - self.start_time
                avg_reward = self.total_reward_per_log_interval / self.log_interval
                avg_steps = self.total_steps_per_log_interval / self.log_interval
                avg_subs = self.total_subs_per_log_interval / self.log_interval
                print('[{}] Summary'.format(self.train_counter))
                print('Duration: %.2fs' % (duration))
                print('Average reward current log interval: ', avg_reward)
                print('Average steps current log interval: ', avg_steps)
                print('Average subs current log interval: ', avg_subs)

                print('Good Map                  :',
                      self.train_result_summary[0])
                if self.alg_version == 2:
                    print('Too easy map              :',
                          self.train_result_summary[1])
                if self.alg_version != 0:
                    print('Too hard or unsolvable map:',
                          self.train_result_summary[2])
                print('Not for training map      :',
                      self.train_result_summary[-1])

                print('Generated wrong number of btp tiles:',
                      self.fail_type_summary[0])
                print('Generated not basic playable map   :',
                      self.fail_type_summary[1])
                print('Unable to finish by max step       :',
                      self.fail_type_summary[2])
                print('Successfully generated training map:',
                      self.fail_type_summary[-1])
                print('*********************************************')

                self.total_reward_per_log_interval = 0
                self.total_steps_per_log_interval = 0
                self.total_subs_per_log_interval = 0
                self.train_result_summary = {-1: 0, 0: 0, 1: 0, 2: 0}
                self.fail_type_summary = {-1: 0, 0: 0, 1: 0, 2: 0}
                self.sample_map = True
                self.start_time = time.time()

        return observation, reward, done, info

    def render(self, mode=None, close=None, scale=16):
        if mode is None:
            if self.train_mode == 'cnn':
                mode = 'human'
            else:
                mode = 'np_array'
        assert mode in RENDERING_MODES

        if 'rgb_array' in mode:
            img = self.get_image(mode, scale)
            return img

        elif 'np_array' in mode:
            return self.map

        elif 'human' in mode:
            from gym.envs.classic_control import rendering
            if self.viewer is None or not self.viewer.isopen:
                self.viewer = rendering.SimpleImageViewer()
            img = self.get_image(mode, scale)
            self.viewer.imshow(img)
            return self.viewer.isopen

        else:
            super(ALGEnv, self).render(mode=mode)  # just raise an exception

    def get_image(self, mode, scale=1):
        if mode.startswith('tiny_'):
            img = room_to_tiny_world_rgb(self.map, scale=scale)
        else:
            img = room_to_rgb(self.map)
        return img

    def basic_playable(self, room):
        # # player can reach all boxes and all targets
        # for player_coord in np.argwhere(room==5):
        #     des = np.concatenate((np.argwhere(room==2), np.argwhere(room==4)), axis=0)
        #     if not self.contaminate(room, player_coord, des):
        #         return False

        # player can reach all non-wall tiles

        if not self.contaminate_room(room):
            return False

        # no three walls around any box
        if self.box_stuck(room):
            return False
        return True

    def box_stuck(self, room):
        room = deepcopy(room)
        room = np.pad(room, 1, 'constant', constant_values=0)
        for (x, y) in np.argwhere(room == 4):
            if (room[x - 1, y] == room[x, y - 1] == 0
                    or room[x - 1, y] == room[x, y + 1] == 0
                    or room[x + 1, y] == room[x, y - 1] == 0
                    or room[x + 1, y] == room[x, y + 1] == 0):
                return True
            num_wall = 0
            for (_x, _y) in [(x - 1, y), (x, y - 1), (x, y + 1), (x + 1, y)]:
                if room[_x, _y] == 0:
                    num_wall += 1
            if num_wall >= 3:
                return True
        return False

    # player can reach any non-wall tile within the room
    def contaminate_room(self, room):
        room = deepcopy(room)
        room = np.pad(room, 1, 'constant', constant_values=0)
        (x, y) = np.argwhere(room == 5)[0]
        room[room != 0] = 1
        room[x, y] = 5
        fixpoint = False
        while not fixpoint:
            fixpoint = True
            for (x, y) in np.argwhere(room == 5):
                for (_x, _y) in [(x - 1, y), (x, y - 1), (x, y + 1),
                                 (x + 1, y)]:
                    if room[_x, _y] not in [0, 5]:
                        room[_x, _y] = 5
                        fixpoint = False
        for i in [1, 2, 4]:
            if i in room:
                return False
        return True

    def contaminate(self, room, src, des):
        room = deepcopy(room)
        (x, y) = src
        src_tile = room[x, y]
        room[room != 0] = 1
        room[x, y] = src_tile
        fixpoint = False
        while not fixpoint:
            fixpoint = True
            for (x, y) in np.argwhere(room == src_tile):
                for (_x, _y) in [(x - 1, y), (x, y - 1), (x, y + 1),
                                 (x + 1, y)]:
                    if _x in range(self.dim_room[0]) and _y in range(
                            self.dim_room[1]):
                        if room[_x, _y] not in [0, src_tile]:
                            room[_x, _y] = src_tile
                            fixpoint = False
        reachable = True
        for (x, y) in des:
            if room[x, y] != src_tile:
                reachable = False
                break
        return reachable

    def is_maxsteps(self):
        return self.env_steps >= self.max_steps

    def deconstruct_map(self, obs_map):
        state_map = copy.deepcopy(obs_map)
        fix_map = copy.deepcopy(obs_map)
        state_map[state_map == 6] = 5
        fix_map[(fix_map == 3) | (fix_map == 6)] = 2
        fix_map[(fix_map == 4) | (fix_map == 5)] = 1
        return fix_map, state_map

    def assemble_map(self, state_map, fix_map):
        obs_map = copy.deepcopy(state_map)
        obs_map[(obs_map == 5) & (fix_map == 2)] = 6
        return obs_map

    def close(self):
        if self.viewer is not None:
            self.viewer.close()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]
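
As a quick sanity check, the generator environment can be rolled out with random actions; the sketch below assumes alg_version=0 (so no pre-trained Sokoban agents need to be loaded) and that the module's own imports (gym, numpy, the room rendering helpers) are available.

# Sketch: drive ALGEnv with random actions; alg_version=0 needs no loaded agents.
env = ALGEnv(dim_room=(10, 10), num_boxes=4, alg_version=0, train_mode='cnn')
obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()          # random (x, y, act) triple
    obs, reward, done, info = env.step(action)
print("train_result:", info["train_result"], "fail_type:", info["fail_type"])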