Example #1
    def __init__(self,
                 randomness,
                 headless=True,
                 max_height=MAX_HEIGHT,
                 arena=4,
                 dir=POLICY_DIR,
                 eval=False,
                 drl="ppo"):
        super().__init__()
        self.randomness = randomness
        self.arena = arena
        self.max_height = max_height
        self.dir = dir
        self.eval = eval
        self.drl = drl
        self.sim = CubeStacking(headless=headless)

        self.action_space = Box(-1, 1, shape=(2,), dtype=np.float32)
        self.observation_space = Box(0, 255, shape=(84, 84, 3), dtype=np.uint8)

        self.seed_val = np.random.randint(0, 100000000)

        self.last_cube_xy = None
        self.opponent = None  # set later to the loaded opponent policy (PPO or TD3)

        self.subfolder = None
Example #2
    def __init__(self,
                 randomness,
                 headless=True,
                 max_height=MAX_HEIGHT,
                 arena=1,
                 fall_disabled=False,
                 top_down=False,
                 dark=False):
        super().__init__()
        self.randomness = randomness
        self.arena = arena
        self.dark = dark
        self.arena_diag = np.sqrt(self.arena**2 + self.arena**2)
        self.max_height = max_height
        self.top_down = top_down
        self.fall_disabled = fall_disabled
        self._max_episode_steps = self.max_height

        cam = CAM_POSES["9.5_block_close"]
        four_colors = False
        if self.top_down:
            cam = CAM_POSES["top_down_4_block"]
            four_colors = True

        self.sim = CubeStacking(headless=headless,
                                cam=cam,
                                four_colors=four_colors,
                                dark=dark)

        self.eval = False  # no eval parameter in this constructor

        self.action_space = Box(-1, 1, shape=(2, ), dtype=np.float32)
        self.observation_space = Box(0, 255, shape=(84, 84, 3), dtype=np.uint8)

        self.last_cube_xy = None
Example #3
    def step(self, action):
        assert len(action) == 2
        action = np.clip(action, -1, 1)
        if not self.rel_action:
            action *= self.arena
        else:
            if self.last_cube_xy is None:
                self.last_cube_xy = np.random.uniform(-1, 1, 2) * self.arena
            action = self.last_cube_xy + action

        action = CubeStacking.apply_randomization(
            action, RandomPositions[self.randomness])

        # cube_xy = np.random.uniform(-4, 4, 2)
        if self.eval:
            player = Player.Player
        else:
            player = None
        higher = self.sim.place_cube(action, player)

        self.last_cube_xy = action  # caching the action here once made PPO collapse to always placing at (-1, -1); the uniform position randomization above should prevent that

        fall_player = self.sim.last_cube_fell(TEST_STEPS)

        obs = self.sim.render()
        if fall_player:
            return obs, -1, True, {"success": False}

        done = False
        reward = 0
        if not higher:
            reward = -1

        # opponent's turn
        opponent_xy = self._play_opponent(obs)
        opponent_xy_rand = CubeStacking.apply_randomization(
            opponent_xy, RandomPositions[self.randomness])
        self.last_cube_xy += opponent_xy_rand

        if self.eval:
            player = Player.Enemy
        else:
            player = None
        self.sim.place_cube(self.last_cube_xy, player)
        fall_opponent = self.sim.last_cube_fell(TEST_STEPS)
        obs = self.sim.render()

        misc = {}
        if higher and fall_opponent:
            reward = 1
            done = True
            misc["success"] = True

        if self.sim.current_max_z >= MAX_HEIGHT * 2:
            done = True

        return obs, reward, done, misc
Example #4
    def __init__(self,
                 randomness,
                 headless=True,
                 max_height=MAX_HEIGHT,
                 arena=4,
                 dir=POLICY_DIR,
                 eval=False,
                 drl="ppo",
                 reward_scheme=0,
                 no_floor=False,
                 textured=False):
        super().__init__()
        self.randomness = randomness
        self.arena = arena
        self.arena_diag = np.sqrt(self.arena**2 + self.arena**2)
        self.max_height = max_height
        self.dir = dir
        self.eval = eval
        self.drl = drl
        self.no_floor = no_floor
        self.textured = textured
        self.rewards = REWARDS[f"v{reward_scheme}"]
        self.stats = {
            "player_correct_stacks": 0,
            "opponent_correct_stacks": 0,
            "player_floor_placements": 0,
            "opponent_floor_placements": 0,
            "avg_tower_height": deque(maxlen=100),
            "avg_win_rate": deque(maxlen=100),
            "avg_cubes_placed_total": deque(maxlen=100),
            "avg_player_dist_to_ref": deque(maxlen=100),
            "avg_opponent_dist_to_ref": deque(maxlen=100),
            "opponnet_policies": 0
        }
        self.stats_tmp = {}

        if not self.textured:
            self.sim = CubeStacking(
                headless=headless, cam=CAM_POSES["9.5_block_close"])
        else:
            self.sim = CubeStacking(headless=headless, halfbox=True, cam=CAM_POSES["physnet"], four_colors=True)

        self.action_space = Box(-1, 1, shape=(2,), dtype=np.float32)
        self.observation_space = Box(0, 255, shape=(84, 84, 3), dtype=np.uint8)

        self.seed_val = np.random.randint(0, 100000000)

        self.ref_cube = None
        self.opponent = None  # set later to the loaded opponent policy (PPO or TD3)

        self.subfolder = None
Example #5
    def __init__(self,
                 randomness,
                 headless=True,
                 max_height=MAX_HEIGHT,
                 arena=4,
                 relative_action=False,
                 eval=False):
        super().__init__()
        self.randomness = randomness
        self.arena = arena
        self.rel_action = relative_action
        self.max_height = max_height
        self.sim = CubeStacking(headless=headless)
        self.eval = eval

        self.action_space = Box(-1, 1, shape=(2,), dtype=np.float32)
        self.observation_space = Box(0, 255, shape=(84, 84, 3), dtype=np.uint8)

        self.last_cube_xy = None
Example #6
    def fall_higher_obs(self, action):
        assert len(action) == 2
        action = np.clip(action, -1, 1)
        action *= self.arena

        action = CubeStacking.apply_randomization(
            action, RandomPositions[self.randomness])

        higher = self.sim.place_cube(action)

        if not self.fall_disabled:
            fall = self.sim.last_cube_fell(TEST_STEPS)
        else:
            fall = False
        obs = self.sim.render()

        return fall, higher, obs
Example #7
    def fall_higher_obs(self, action, player):
        assert len(action) == 2
        action = np.clip(action, -1, 1)
        action *= self.arena

        action = CubeStacking.apply_randomization(
            action, RandomPositions[self.randomness])

        if self.eval:
            color = player
        else:
            color = None

        higher = self.sim.place_cube(action, color)
        fall = self.sim.last_cube_fell(TEST_STEPS)
        obs = self.sim.render()

        return fall, higher, obs
Example #8
class TowerStacc(gym.Env):
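    """Single-agent cube-stacking gym environment.

    The agent chooses an (x, y) drop position in [-1, 1]^2 (scaled by `arena`),
    is penalized by its distance to a randomly placed starter cube, and is
    rewarded for stacking on top of the growing tower; observations are
    84x84 RGB renders of the simulator.
    """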
    def __init__(self,
                 randomness,
                 headless=True,
                 max_height=MAX_HEIGHT,
                 arena=1,
                 fall_disabled=False,
                 top_down=False,
                 dark=False):
        super().__init__()
        self.randomness = randomness
        self.arena = arena
        self.dark = dark
        self.arena_diag = np.sqrt(self.arena**2 + self.arena**2)
        self.max_height = max_height
        self.top_down = top_down
        self.fall_disabled = fall_disabled
        self._max_episode_steps = self.max_height

        cam = CAM_POSES["9.5_block_close"]
        four_colors = False
        if self.top_down:
            cam = CAM_POSES["top_down_4_block"]
            four_colors = True

        self.sim = CubeStacking(headless=headless,
                                cam=cam,
                                four_colors=four_colors,
                                dark=dark)

        self.eval = False  # no eval parameter in this constructor

        self.action_space = Box(-1, 1, shape=(2, ), dtype=np.float32)
        self.observation_space = Box(0, 255, shape=(84, 84, 3), dtype=np.uint8)

        self.last_cube_xy = None

    def step(self, action):
        distance = np.linalg.norm(
            self.ref_cube - np.clip(npa(action), -1, 1),
            1)  # the 1 at the end means L1 distance, not L2
        fall, higher, obs = self.fall_higher_obs(action)

        if self.max_height == 1:
            # -Single- case
            return obs, -distance, True, {}

        # in case of self.fall_disabled, this is never true
        if fall:
            return obs, -1, True, {"success": False}

        # end the episode immediately when the block is not on the starter
        if self.top_down and not higher:
            return obs, -distance, True, {"success": False}

        if not higher or distance >= (2 / self.arena_diag):
            return obs, -distance, False, {}

        # if higher, which is the only left case

        done = False
        misc = {}
        if self.sim.current_max_z >= self.max_height * 2:
            done = True
            misc["success"] = True

        return obs, 1, done, misc

    def fall_higher_obs(self, action):
        assert len(action) == 2
        action = np.clip(action, -1, 1)
        action *= self.arena

        action = CubeStacking.apply_randomization(
            action, RandomPositions[self.randomness])

        higher = self.sim.place_cube(action)

        if not self.fall_disabled:
            fall = self.sim.last_cube_fell(TEST_STEPS)
        else:
            fall = False
        obs = self.sim.render()

        return fall, higher, obs

    def reset(self):
        self.sim.reset()

        cube_xy = np.random.uniform(-self.arena, self.arena, 2)
        self.sim.place_cube(cube_xy, Player.Starter)
        self.ref_cube = cube_xy / self.arena  # to bring into [-1,1]

        obs = self.sim.render()
        return obs

    def render(self, mode='human'):
        pass
        #TODO

    def seed(self, seed=None):
        np.random.seed(seed)
        return super().seed(seed)

    def close(self):
        self.sim.close()
        super().close()
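A minimal rollout sketch for the environment above; the "none" key for RandomPositions and the reliance on the constructor defaults are assumptions, not values confirmed by the source.

# Hypothetical smoke test for TowerStacc (requires the project's RandomPositions/CAM_POSES setup).
env = TowerStacc(randomness="none", headless=True)   # "none" is an assumed RandomPositions key
obs = env.reset()                                    # 84x84x3 uint8 render with the starter cube placed
done = False
while not done:
    action = env.action_space.sample()               # random (x, y) drop point in [-1, 1]^2
    obs, reward, done, info = env.step(action)
env.close()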
Example #9
from cube_stacking.assets import get_tex, get_urdf, TEXTURES
from cube_stacking.sim import CubeStacking
import matplotlib.pyplot as plt
import numpy as np

WID = 256

cam = {"eye": [-8, 8, 8], "lookat": [3, -3, 0]}

sim = CubeStacking(False, (WID, WID), halfbox=True, cam=cam)
# ASSETS

# texUid = sim.p0.loadTexture(get_tex("grass"))
# sim.p0.changeVisualShape(sim.floor, -1, textureUniqueId=texUid)

textures = []
for t in TEXTURES:
    texUid = sim.p0.loadTexture(get_tex(t))
    textures.append(texUid)

sim.place_cube([-4, 4])
sim.place_cube([-3.5, 4])
sim.place_cube([-3.7, 4])
i = 0
while True:
    sim.step()
    i += 1
    if i % 10000 == 0:
        # periodically swap in a random floor texture
        tid = np.random.randint(0, len(textures))
        sim.p0.changeVisualShape(sim.floor, -1, textureUniqueId=textures[tid])
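The matplotlib import above is otherwise unused; a small sketch of how a frame could be inspected, assuming sim.render() returns an HxWx3 uint8 image as the env observation spaces suggest.

# Hypothetical: dump the current camera image to disk for a quick visual check of the texture swap.
frame = sim.render()                       # assumed HxWx3 uint8 array
plt.imshow(frame)
plt.axis("off")
plt.savefig("floor_texture_preview.png", bbox_inches="tight")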
Example #10
class TwoPlayerDirectoryFullArena(gym.Env):
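    """Two-player cube-stacking environment with self-play opponents.

    Each step the agent places a cube, then an opponent policy (a PPO or TD3
    checkpoint loaded from `dir`) places one; rewards follow the scheme
    selected by `reward_scheme`, and the returned info dict carries running
    statistics about both players.
    """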

    def __init__(self,
                 randomness,
                 headless=True,
                 max_height=MAX_HEIGHT,
                 arena=4,
                 dir=POLICY_DIR,
                 eval=False,
                 drl="ppo",
                 reward_scheme=0,
                 no_floor=False,
                 textured=False):
        super().__init__()
        self.randomness = randomness
        self.arena = arena
        self.arena_diag = np.sqrt(self.arena**2 + self.arena**2)
        self.max_height = max_height
        self.dir = dir
        self.eval = eval
        self.drl = drl
        self.no_floor = no_floor
        self.textured = textured
        self.rewards = REWARDS[f"v{reward_scheme}"]
        self.stats = {
            "player_correct_stacks": 0,
            "opponent_correct_stacks": 0,
            "player_floor_placements": 0,
            "opponent_floor_placements": 0,
            "avg_tower_height": deque(maxlen=100),
            "avg_win_rate": deque(maxlen=100),
            "avg_cubes_placed_total": deque(maxlen=100),
            "avg_player_dist_to_ref": deque(maxlen=100),
            "avg_opponent_dist_to_ref": deque(maxlen=100),
            "opponnet_policies": 0
        }
        self.stats_tmp = {}

        if not self.textured:
            self.sim = CubeStacking(
                headless=headless, cam=CAM_POSES["9.5_block_close"])
        else:
            self.sim = CubeStacking(headless=headless, halfbox=True, cam=CAM_POSES["physnet"], four_colors=True)

        self.action_space = Box(-1, 1, shape=(2,), dtype=np.float32)
        self.observation_space = Box(0, 255, shape=(84, 84, 3), dtype=np.uint8)

        self.seed_val = np.random.randint(0, 100000000)

        self.ref_cube = None
        self.opponent = None  # set later to the loaded opponent policy (PPO or TD3)

        self.subfolder = None

    def fall_higher_obs(self, action, player):
        assert len(action) == 2
        action = np.clip(action, -1, 1)
        action *= self.arena

        action = CubeStacking.apply_randomization(
            action, RandomPositions[self.randomness])

        if self.eval:
            color = player
        else:
            color = None

        higher = self.sim.place_cube(action, color)
        fall = self.sim.last_cube_fell(TEST_STEPS)
        obs = self.sim.render()

        return fall, higher, obs

    def _prep_stats(self, win):
        self.stats["avg_tower_height"].append((self.sim.current_max_z + 1) / 2)
        self.stats["avg_win_rate"].append(1 if win else 0)
        self.stats["avg_cubes_placed_total"] = len(self.sim.cubes)
        self.stats["success"] = win

    def step(self, action):
        # negative normalized distance to reference cube
        reward_justin_case = -np.linalg.norm(self.ref_cube -
                                             npa(action)) / self.arena_diag
        self.stats["avg_player_dist_to_ref"].append(reward_justin_case)

        fall_player, higher, obs = self.fall_higher_obs(action, Player.Player)

        if higher:
            self.stats["player_correct_stacks"] += 1
        else:
            self.stats["player_floor_placements"] += 1
            if self.no_floor:
                self._prep_stats(False)
                return obs, self.rewards[Rewards.Floor], True, self.stats

        if fall_player:
            self._prep_stats(False)
            return obs, self.rewards[Rewards.PlayerFall], True, self.stats

        if self.max_height == 4 and len(self.sim.cubes) == 4:
            self._prep_stats(False)
            return obs, self.rewards[Rewards.Tie], True, self.stats

        done = False
        reward = reward_justin_case * self.rewards[Rewards.DistanceScale]

        # opponent's turn
        if self.eval:
            time.sleep(.5)
        opponent_xy = self._play_opponent(obs)
        opp_dist = -np.linalg.norm(self.ref_cube -
                                   opponent_xy) / self.arena_diag
        self.stats["avg_opponent_dist_to_ref"].append(opp_dist)

        fall_opponent, higher_opp, obs = self.fall_higher_obs(
            opponent_xy, Player.Enemy)

        if higher_opp:
            self.stats["opponent_correct_stacks"] += 1
        else:
            self.stats["opponent_floor_placements"] += 1

        if higher and fall_opponent:
            self._prep_stats(True)

            # "avg_tower_height": deque(100),
            #             "avg_win_rate": deque(100)
            reward = self.rewards[Rewards.EnemyFall]
            done = True

        if self.sim.current_max_z >= MAX_HEIGHT * 2 - 0.01:
            done = True

        if self.max_height == 4 and len(self.sim.cubes) == 4:
            self._prep_stats(False)
            reward = self.rewards[Rewards.Tie]
            done = True

        return obs, reward, done, self.stats

    def reset(self):
        self.stats["player_correct_stacks"] = 0
        self.stats["opponent_correct_stacks"] = 0
        self.stats["player_floor_placements"] = 0
        self.stats["opponent_floor_placements"] = 0
        if "success" in self.stats:
            del self.stats["success"]

        self.sim.reset()

        if self.textured:
            self.sim.shuffle_textures()

        cube_xy = np.random.uniform(-self.arena, self.arena, 2)
        self.sim.place_cube(cube_xy, Player.Starter)
        self.ref_cube = cube_xy / self.arena  # to bring into [-1,1]

        if self.subfolder is not None:
            self.dir = os.path.join(self.dir, self.subfolder)
            print("switched loading directory to:", self.dir)
            # in order to trigger this only once
            self.subfolder = None

        self._init_opponent()

        # coin toss if player starts or opponent
        if np.random.rand() < .5:
            obs = self.sim.render()
            opponent_xy = self._play_opponent(obs)
            _, _, obs = self.fall_higher_obs(opponent_xy, Player.Enemy)
        else:
            obs = self.sim.render()

        return obs

    def render(self, mode='human'):
        pass
        #TODO

    def seed(self, seed=None):
        self.seed_val = seed  # don't shadow the seed() method with an attribute
        np.random.seed(seed)
        return super().seed(seed)

    def close(self):
        self.sim.close()
        super().close()

    def _play_opponent(self, obs):
        if self.opponent is None:
            return np.random.uniform(-1, 1, 2)

        obs = torch.from_numpy(obs).float().to('cpu')
        # obs /= 255
        obs = obs.permute(2, 0, 1)

        if self.drl == "ppo":
            # move obs down on the stacc
            # self.stacked_obs[:, :-3] = self.stacked_obs[:, 3:]
            # add new obs on top of stacc
            # self.stacked_obs[:, -3:] = obs
            self.stacked_obs[:, :] = obs

            with torch.no_grad():
                _, action, _, _ = self.opponent.act(
                    self.stacked_obs,
                    self.opp_recurrent_hidden_states,
                    self.opp_masks,
                    deterministic=True)
            opponent_xy = action.numpy()[0]
            self.opp_masks.fill_(1.0)

        elif self.drl == "td3":
            opponent_xy = self.opponent.select_action(np.array(obs), "cpu")

        return opponent_xy

    def _init_opponent(self):
        # list the policy directory contents

        # print (f"ENV: SEARCHING '{self.dir}', filtering for '-{self.drl.upper()}-', got:",os.listdir(self.dir))
        policies = [
            x for x in os.listdir(self.dir)
            if f"-{self.drl.upper()}-" in x and ".pt" in x[-3:]
        ]
        self.stats["opponnet_policies"] = len(policies)

        if len(policies) == 0:
            print("ENV: no existing policies")
            self.opponent = None
            return

        # if there is only one policy and we've loaded it, we don't need to reload it
        # if there are 3 or fewer policies, toss a coin to decide whether to reload the policy
        if self.opponent is not None and (len(policies) == 1 or
                                          (len(policies) <= 3 and
                                           np.random.rand() < .5)):
            if self.drl == "ppo":
                self.opp_masks = torch.zeros(1, 1)
                # self.stacked_obs = torch.zeros(
                #     (1, 12, 84, 84)).to(torch.device('cpu'))
                self.stacked_obs = torch.zeros(
                    (1, 3, 84, 84)).to(torch.device('cpu'))
            elif self.drl == "td3":
                pass
            return

        shuffle(policies)
        policy_path = os.path.join(self.dir, policies[0])
        # print(f"ENV: picking opponent policy '{policy_path}'")

        # We need to use the same statistics for normalization as used in training

        if self.drl == "ppo":
            # notice the tuple
            self.opponent, _ = \
                torch.load(policy_path, map_location='cpu')
        elif self.drl == "td3":
            self.opponent = \
                torch.load(policy_path, map_location='cpu')

        # print("GE: USING POLICY:", policy_path)

        if self.drl == "ppo":
            self.opp_recurrent_hidden_states = torch.zeros(
                1, self.opponent.recurrent_hidden_state_size)
            self.opp_masks = torch.zeros(1, 1)
            # self.stacked_obs = torch.zeros(
            #     (1, 12, 84, 84)).to(torch.device('cpu'))
            self.stacked_obs = torch.zeros(
                (1, 3, 84, 84)).to(torch.device('cpu'))
        elif self.drl == "td3":
            self.opponent.actor.eval()
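A hedged evaluation sketch against the class above; the "none" randomness key and the use of a random agent in place of a trained policy are assumptions for illustration.

# Hypothetical win-rate check; stats["success"] is set by _prep_stats() on most terminal steps.
env = TwoPlayerDirectoryFullArena(randomness="none", headless=True)
wins = 0
episodes = 10
for _ in range(episodes):
    obs, done = env.reset(), False
    while not done:
        action = env.action_space.sample()           # stand-in for a trained agent
        obs, reward, done, stats = env.step(action)
    wins += int(stats.get("success", False))
print("win rate:", wins / episodes)
env.close()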
Example #11
class SinglePlayer(gym.Env):
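    """Cube-stacking environment against a scripted random opponent.

    The agent places a cube at an (x, y) in [-1, 1]^2 (optionally as an offset
    from the last cube when relative_action=True), then the opponent drops one
    near the previous cube; the episode ends with +1 when the agent stacked
    correctly and the opponent's cube falls, or -1 when the agent's cube falls.
    """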

    def __init__(self,
                 randomness,
                 headless=True,
                 max_height=MAX_HEIGHT,
                 arena=4,
                 relative_action=False,
                 eval=False):
        super().__init__()
        self.randomness = randomness
        self.arena = arena
        self.rel_action = relative_action
        self.max_height = max_height
        self.sim = CubeStacking(headless=headless)
        self.eval = eval

        self.action_space = Box(-1, 1, shape=(2,), dtype=np.float32)
        self.observation_space = Box(0, 255, shape=(84, 84, 3), dtype=np.uint8)

        self.last_cube_xy = None

    def step(self, action):
        assert len(action) == 2
        action = np.clip(action, -1, 1)
        if not self.rel_action:
            action *= self.arena
        else:
            action = self.last_cube_xy + action

        action = CubeStacking.apply_randomization(
            action, RandomPositions[self.randomness])

        # cube_xy = np.random.uniform(-4, 4, 2)
        if self.eval:
            player = Player.Player
        else:
            player = None

        higher = self.sim.place_cube(action, player)
        # self.last_cube_xy = action # <- if this is uncommented, PPO puts the cube always at -1,-1
        fall_player = self.sim.last_cube_fell(TEST_STEPS)

        if fall_player:
            obs = self.sim.render()
            return obs, -1, True, {"success": False}

        done = False
        reward = 0
        if not higher:
            reward = -1

        # opponent's turn
        opponent_xy = self.last_cube_xy + np.random.uniform(-1, 1, 2)

        if self.eval:
            player = Player.Enemy
        else:
            player = None

        self.sim.place_cube(opponent_xy, player)
        self.last_cube_xy = opponent_xy
        fall_opponent = self.sim.last_cube_fell(TEST_STEPS)
        obs = self.sim.render()

        misc = {}
        if higher and fall_opponent:
            reward = 1
            done = True
            misc["success"] = True

        if self.sim.current_max_z >= MAX_HEIGHT * 2:
            done = True

        return obs, reward, done, misc

    def reset(self):
        self.sim.reset()
        cube_xy = np.random.uniform(-self.arena, self.arena, 2)

        if self.eval:
            player = Player.Starter
        else:
            player = None

        self.sim.place_cube(cube_xy, player)
        self.last_cube_xy = cube_xy
        obs = self.sim.render()
        return obs

    def render(self, mode='human'):
        pass
        #TODO

    def seed(self, seed=None):
        np.random.seed(seed)
        return super().seed(seed)

    def close(self):
        self.sim.close()
        super().close()
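A short sketch of the relative-action mode of the class above; the "none" randomness key is an assumption.

# Hypothetical: with relative_action=True, actions are offsets from the last placed cube.
env = SinglePlayer(randomness="none", relative_action=True, eval=True)
obs = env.reset()                                    # starter cube placed, last_cube_xy recorded
obs, reward, done, info = env.step([0.0, 0.0])       # try to drop directly on top of the last cube
env.close()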
Example #12
class TwoPlayerDirectory(gym.Env):
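    """Two-player cube-stacking environment with opponents loaded from a policy directory.

    Like SinglePlayer, but the opponent's move comes from a previously saved
    PPO or TD3 policy picked at random from `dir` on reset (uniform noise is
    used while no checkpoint exists yet).
    """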

    def __init__(self,
                 randomness,
                 headless=True,
                 max_height=MAX_HEIGHT,
                 arena=4,
                 dir=POLICY_DIR,
                 eval=False,
                 drl="ppo"):
        super().__init__()
        self.randomness = randomness
        self.arena = arena
        self.max_height = max_height
        self.dir = dir
        self.eval = eval
        self.drl = drl
        self.sim = CubeStacking(headless=headless)

        self.action_space = Box(-1, 1, shape=(2,), dtype=np.float32)
        self.observation_space = Box(0, 255, shape=(84, 84, 3), dtype=np.uint8)

        self.seed_val = np.random.randint(0, 100000000)

        self.last_cube_xy = None
        self.opponent = None  # set later to the loaded opponent policy (PPO or TD3)

        self.subfolder = None

    def step(self, action):
        assert len(action) == 2
        action = np.clip(action, -1, 1)
        if not self.rel_action:
            action *= self.arena
        else:
            if self.last_cube_xy is None:
                self.last_cube_xy = np.random.uniform(-1, 1, 2) * self.arena
            action = self.last_cube_xy + action

        action = CubeStacking.apply_randomization(
            action, RandomPositions[self.randomness])

        # cube_xy = np.random.uniform(-4, 4, 2)
        if self.eval:
            player = Player.Player
        else:
            player = None
        higher = self.sim.place_cube(action, player)

        self.last_cube_xy = action  # caching the action here once made PPO collapse to always placing at (-1, -1); the uniform position randomization above should prevent that

        fall_player = self.sim.last_cube_fell(TEST_STEPS)

        obs = self.sim.render()
        if fall_player:
            return obs, -1, True, {"success": False}

        done = False
        reward = 0
        if not higher:
            reward = -1

        # opponent's turn
        opponent_xy = self._play_opponent(obs)
        opponent_xy_rand = CubeStacking.apply_randomization(
            opponent_xy, RandomPositions[self.randomness])
        self.last_cube_xy += opponent_xy_rand

        if self.eval:
            player = Player.Enemy
        else:
            player = None
        self.sim.place_cube(self.last_cube_xy, player)
        fall_opponent = self.sim.last_cube_fell(TEST_STEPS)
        obs = self.sim.render()

        misc = {}
        if higher and fall_opponent:
            reward = 1
            done = True
            misc["success"] = True

        if self.sim.current_max_z >= MAX_HEIGHT * 2:
            done = True

        return obs, reward, done, misc

    def reset(self):
        self.sim.reset()

        # cube_xy = np.random.uniform(-self.arena, self.arena, 2)
        # self.sim.place_cube(cube_xy)
        # self.last_cube_xy = cube_xy

        if self.subfolder is not None:
            self.dir = os.path.join(self.dir, self.subfolder)
            print("switched loading directory to:", self.dir)
            # in order to trigger this only once
            self.subfolder = None

        self._init_opponent()

        # coin toss if player starts or opponent
        if np.random.rand() < .5:
            obs = self.sim.render()
            opponent_xy = self._play_opponent(obs)
            self.last_cube_xy = np.random.uniform(-1, 1, 2) * \
                                self.arena + opponent_xy

            if self.eval:
                player = Player.Enemy
            else:
                player = None
            self.sim.place_cube(self.last_cube_xy, player)
        else:
            self.last_cube_xy = None

        obs = self.sim.render()

        return obs

    def render(self, mode='human'):
        pass
        #TODO

    def seed(self, seed=None):
        self.seed_val = seed  # don't shadow the seed() method with an attribute
        np.random.seed(seed)
        return super().seed(seed)

    def close(self):
        self.sim.close()
        super().close()

    def _play_opponent(self, obs):
        if self.opponent is None:
            opponent_xy = np.random.uniform(-1, 1, 2)
        else:
            obs = torch.from_numpy(obs).float().to('cpu')
            obs /= 255
            obs = obs.permute(2, 0, 1)

            if self.drl == "ppo":
                # move obs down on the stacc
                self.stacked_obs[:, :-3] = self.stacked_obs[:, 3:]
                # add new obs on top of stacc
                self.stacked_obs[:, -3:] = obs

                with torch.no_grad():
                    _, action, _, _ = self.opponent.act(
                        self.stacked_obs,
                        self.opp_recurrent_hidden_states,
                        self.opp_masks,
                        deterministic=True)
                opponent_xy = action.numpy()[0]

            elif self.drl == "td3":
                opponent_xy = self.opponent.select_action(np.array(obs), "cpu")

        return opponent_xy

    def _init_opponent(self):
        # list the policy directory contents

        policies = [x for x in os.listdir(self.dir) if f"-{self.drl}.pt" in x]

        if len(policies) == 0:
            # print("ENV: no existing policies")
            self.opponent = None
            return

        # if there is only one policy and we've loaded it, we don't need to reload it
        # if there are 3 or fewer policies, toss a coin to decide whether to reload the policy
        if self.opponent is not None and (len(policies) == 1 or
                                          (len(policies) <= 3 and
                                           np.random.rand() < .5)):
            if self.drl == "ppo":
                self.opp_masks = torch.zeros(1, 1)
                self.stacked_obs = torch.zeros(
                    (1, 12, 84, 84)).to(torch.device('cpu'))
            elif self.drl == "td3":
                pass
            return

        shuffle(policies)
        policy_path = os.path.join(self.dir, policies[0])
        # print(f"ENV: picking opponent policy '{policy_path}'")

        # We need to use the same statistics for normalization as used in training

        if self.drl == "ppo":
            # notice the tuple
            self.opponent, _ = \
                torch.load(policy_path, map_location='cpu')
        elif self.drl == "td3":
            self.opponent = \
                torch.load(policy_path, map_location='cpu')

        # print("GE: USING POLICY:", policy_path)

        if self.drl == "ppo":
            self.opp_recurrent_hidden_states = torch.zeros(
                1, self.opponent.recurrent_hidden_state_size)
            self.opp_masks = torch.zeros(1, 1)
            self.stacked_obs = torch.zeros((1, 12, 84, 84)).to(torch.device('cpu'))
        elif self.drl == "td3":
            self.opponent.actor.eval()
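For _init_opponent above to pick up a checkpoint, the file name must contain "-ppo.pt" or "-td3.pt" and, for PPO, the file must hold a 2-tuple; a hedged save-side sketch, where the names policy and extra are assumptions about what the training loop holds.

# Hypothetical helper; only the tuple shape and file-name suffix are taken from the loader above.
import os
import torch

def save_opponent_policy(policy, extra, policy_dir, step, drl="ppo"):
    # `policy` / `extra` are assumed handles from the training loop
    path = os.path.join(policy_dir, f"step{step}-{drl}.pt")
    if drl == "ppo":
        torch.save((policy, extra), path)            # loaded as `self.opponent, _ = torch.load(...)`
    else:
        torch.save(policy, path)                     # TD3 checkpoints are loaded as a single object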