Example #1
    def _thunk():
        env = ObstacleTowerEnv('../ObstacleTower/obstacletower',
                               retro=True,
                               worker_id=rank,
                               realtime_mode=show,
                               config={'total-floors': 20})
        env.seed(seed + rank % 8)
        env = bench.Monitor(env, None, allow_early_resets=True)
        env = OTWrapper(env)
        env = FrameStack(env, 4)
        return env
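The nesting suggests that `_thunk` is returned from a per-worker factory, the usual pattern for vectorized training. A minimal sketch of that wiring, assuming OpenAI baselines' SubprocVecEnv; the make_env/num_envs names are hypothetical and not from the original source:

from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(rank, seed=0, show=False):
    # Each rank gets its own worker_id so the Unity instances do not collide.
    def _thunk():
        env = ObstacleTowerEnv('../ObstacleTower/obstacletower',
                               retro=True,
                               worker_id=rank,
                               realtime_mode=show,
                               config={'total-floors': 20})
        env.seed(seed + rank % 8)
        return env
    return _thunk

num_envs = 8
vec_env = SubprocVecEnv([make_env(rank) for rank in range(num_envs)])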
Example #2
def main():
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: python record_tail.py <start_floor>\n')
        sys.exit(1)
    start_floor = int(sys.argv[1])
    viewer = EnvInteractor()
    env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'],
                           worker_id=random.randrange(11, 20))
    while True:
        seed = select_seed(floor=start_floor)
        env.seed(seed)
        env.floor(start_floor)
        obs = env.reset()
        viewer.reset()
        record_episode(seed, env, viewer, obs, max_steps=MAX_STEPS)
Example #3
def seed_hashes():
    mapping = {}
    while len(mapping) < 100:
        if os.path.exists('UnitySDK.log'):
            os.remove('UnitySDK.log')
        while True:
            try:
                env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'],
                                       worker_id=random.randrange(1000))
                break
            except KeyboardInterrupt:
                sys.exit(1)
            except Exception:
                pass
        env.seed(25)  # the seed value is arbitrary; the real seed is read from the log
        obs = env.reset()
        env.close()
        with open('UnitySDK.log') as f:
            contents = next(l for l in f if 'seed:' in l)
        seed = int(contents.split(': ')[-1])
        key = str(obs.flatten().tolist())
        mapping[key] = seed  # record the pair so the loop terminates after 100 seeds
        yield key, seed
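A small usage sketch (hypothetical, not part of the original script): the generator yields (observation hash, seed) pairs, so it can be consumed directly to build a lookup table.

# Hypothetical usage of seed_hashes(): map each stringified start observation
# to the Unity seed that produced it.
obs_to_seed = dict(seed_hashes())
print('collected %d seed hashes' % len(obs_to_seed))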
Example #4
def main():
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: record_improve.py <recording_path>\n')
        sys.exit(1)
    rec = Recording(sys.argv[1])
    env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'],
                           worker_id=random.randrange(11, 20))
    try:
        env.seed(rec.seed)
        if rec.floor:
            env.floor(rec.floor)
        env.reset()
        i = 0
        for i, (action, rew) in enumerate(zip(rec.actions, rec.rewards)):
            _, real_rew, done, _ = env.step(action)
            if not np.allclose(real_rew, rew):
                print('mismatching result at step %d' % i)
                sys.exit(1)
            if done != (i == rec.num_steps - 1):
                print('invalid done result at step %d' % i)
                sys.exit(1)
        print('match succeeded')
    finally:
        env.close()
Example #5
import os

from obstacle_tower_env import ObstacleTowerEnv

env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'], worker_id=0)

env.seed(72)
env.floor(12)
env.reset()
for action in [
        18, 18, 18, 18, 18, 18, 30, 24, 24, 21, 18, 18, 30, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 24, 18, 30, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 30, 30, 30, 30, 24, 24, 6, 6, 6, 6, 6,
        6, 6, 6, 30, 30, 30, 30, 30, 18, 24, 24, 24, 6, 6, 6, 6, 6, 6, 24, 18,
        24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 6, 6, 6, 6, 24, 24, 24, 18, 30, 18,
        18, 30, 18, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18, 30, 24, 24, 30, 30,
        24, 24, 24, 30, 30, 30, 30, 30, 18, 18, 18, 18, 30, 30, 30, 30, 30, 30,
        30, 30, 30, 30, 30, 30, 30, 30, 24, 24, 24, 24, 24, 24, 24, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 24, 18, 18, 30, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 24, 18, 30, 18, 18, 18, 18, 30, 30, 30, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 30, 18, 18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 30, 24, 24, 24, 24, 24, 24, 24, 24, 18, 30, 18, 18, 18,
        18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 30,
        30, 30, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 30, 24, 21, 18, 24, 24, 24, 24, 18, 18, 18, 24, 18, 18, 18, 18,
        30, 18, 18, 24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        24, 24, 24, 24, 24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 30, 30, 30, 18, 18, 30, 30, 30, 30, 30, 30, 12, 12, 30, 30, 30,
        30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18,
Example #6
class Worker(object):
    def __init__(self,
                 envpath,
                 wid,
                 retro,
                 realtime_mode,
                 env_seed=0,
                 env_floor=0):
        self.wid = wid
        self.env = ObstacleTowerEnv(environment_filename=envpath,
                                    worker_id=wid,
                                    retro=retro,
                                    realtime_mode=realtime_mode)
        self.kprun = GLOBAL_KPRUN
        self.tableAction = self.createActionTable()
        # Level settings (seed and floor)
        self.env_seed = env_seed
        self.env_floor = env_floor
        self.step = 0
        self.summary = tf.Summary(value=[
            tf.Summary.Value(tag="Stage_reward " + str(self.wid),
                             simple_value=0)
        ])
        self.kprun.train_writer.add_summary(self.summary, 0)

    def createActionTable(self):
        tableAction = []
        for a in range(0, 3):
            for b in range(0, 3):
                for c in range(0, 2):
                    tableAction.append([a, b, c, 0])
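        # The loops above enumerate all 3 * 3 * 2 = 18 combinations of
        # (movement, camera rotation, jump); the fourth (strafe) component stays 0.
        # For example, index 0 -> [0, 0, 0, 0] (no-op) and index 6 -> [1, 0, 0, 0] (forward).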
        # print("Action option: ", tableAction[0:17])
        return tableAction

    def reward_compute(self, done, reward_total, keys, previous_keys, reward,
                       previous_reward, time_remaining,
                       previous_time_remaining, previous_stage_time_remaining):
        # Reward shaping rules:
        # reward is the floor-clearing progress reported by the environment
        # keys is the number of keys collected
        # time_remaining is the time left
        # Clearing a floor gives at most 10
        # Each key is worth 5
        # Time orbs only give 0.5 for now, because the remaining time is settled
        # at the end of the episode and the reward would otherwise accumulate.
        # On clearing a floor, give ten times the clear reward minus
        # (time at the start of the scene - time remaining) / 1000
        # print("time_remaining ", time_remaining,
        #       " previous_time_remaining ", previous_time_remaining,
        #         " reward ", reward)
        # Passing through a green door that opens adds 0.1
        if (reward - previous_reward) > 0 and (reward - previous_reward) < 0.3:
            reward_total += 3
        elif (reward - previous_reward) > 0.9:
            # ***If the remaining time exceeds the scene time, this becomes a bonus,
            # which may greatly increase the agent's tendency to chase time orbs.
            # ***Another option is to add the remaining time divided by 1000 directly,
            # so there is no accumulation effect.
            print("Pass ", reward, " Stage!")
            # reward_total += (reward - previous_reward) * 100 - \
            #                 (previous_stage_time_remaining - time_remaining)

            reward_total += 200
            # Carry the time over to the next floor; store this round's time for
            # the next floor-clear computation
            previous_time_remaining = time_remaining
            previous_stage_time_remaining = time_remaining
            # Lesson 1 repeat
            if reward > 6.5:
                # self.total_step +=1
                # if self.total_step >=5:
                #     done = True
                #     return reward_total, previous_stage_time_remaining, done
                self.env.seed(np.random.randint(5))
                # env.reset()
                done = True
            return reward_total, previous_stage_time_remaining, done

        # Assume a key or time orb may be collected in the same step as clearing
        # a floor, so the bonuses are allowed to stack
        if keys > previous_keys:
            # print("Get Key")
            reward_total += 5

        if previous_time_remaining < time_remaining and previous_time_remaining != 0:
            # print("Get time power up")
            reward_total += 2
        else:
            reward_total -= 0.5
        if done and previous_time_remaining > 100:
            print("Agent died")
            # The more time remaining when the agent dies, the bigger the penalty
            # reward_total -= (10 + time_remaining / 100)
            reward_total -= 100
        return reward_total, previous_stage_time_remaining, done

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        # Set the level (seed and floor)
        self.env.seed(self.env_seed)
        self.env.floor(self.env_floor)
        # Loop until the target number of episodes is reached
        while not COORD.should_stop():
            # Track the step count
            self.step += 1
            # Reset the level
            obs = self.env.reset()
            # Initialize episode state
            done = False
            stage_reward = 0.0
            reward = 0
            keys = 0
            # Used to detect time pickups; there is no time_remaining before the first step, so define it up front
            time_remaining = 3000
            previous_stage_time_remaining = time_remaining
            # Preprocess the image
            # previous_preprocessed_observation_image = np.reshape(obs[0], [-1])
            previous_preprocessed_observation_image = obs[0]
            buffer_s, buffer_a, buffer_r = [], [], []
            # Loop until the episode ends
            while not done:
                # If the model is being updated, wait for the update to finish
                if not ROLLING_EVENT.is_set():
                    # Wait for the update to finish
                    ROLLING_EVENT.wait()
                    # Clear the buffers and collect data with the updated policy
                    buffer_s, buffer_a, buffer_r = [], [], []

                # Save the previous state for reward computation
                previous_keys = keys
                previous_reward = reward
                previous_time_remaining = time_remaining

                # Choose an action based on the previous observation
                action = self.kprun.choose_action(
                    previous_preprocessed_observation_image)
                action = np.clip(np.random.normal(action, 1.), *[6, 12])

                # Take the action; receive the observation, floor-clear progress, and agent info
                observation, reward, done, info = self.env.step(
                    np.array(self.tableAction[int(action)]))

                # Preprocess the data the model needs
                observation_image, keys, time_remaining = observation
                # preprocessed_observation_image = np.reshape(
                #     observation_image, [-1])
                preprocessed_observation_image = observation_image
                stage_reward, previous_stage_time_remaining, done = self.reward_compute(
                    done=done,
                    reward_total=stage_reward,
                    keys=keys,
                    previous_keys=previous_keys,
                    reward=reward,
                    previous_reward=previous_reward,
                    time_remaining=time_remaining,
                    previous_time_remaining=previous_time_remaining,
                    previous_stage_time_remaining=previous_stage_time_remaining
                )
                # Normalize the reward
                stage_reward = (stage_reward + 8) / 8

                # Store this transition in the buffers
                buffer_s.append(np.array([preprocessed_observation_image]))
                buffer_a.append(action)
                buffer_r.append(stage_reward)

                # Keep this observation for the next step
                previous_preprocessed_observation_image = preprocessed_observation_image

                # When an update is due, process the local buffer first.
                GLOBAL_UPDATE_COUNTER += 1
                # If the buffer is full enough, handle the local update now
                if len(buffer_s) == EP_LEN - 1 or \
                        GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.kprun.get_v(preprocessed_observation_image)
                    # Compute discounted rewards
                    discounted_r = []
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()
                    # Reshape into batch dimensions
                    bs, ba, br = np.vstack(buffer_s), np.vstack(
                        buffer_a), np.array(discounted_r)[:, np.newaxis]
                    # Put the data into the shared queue
                    QUEUE.put(bs)
                    QUEUE.put(ba)
                    QUEUE.put(br)
                    # print("len(buffer_s)", len(buffer_s))
                    # print("bs.shape", bs.shape)
                    # Clear the buffers
                    buffer_s, buffer_a, buffer_r = [], [], []
                    # If the global step count reaches the minimum batch size, trigger a full update
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        # Stop collecting data
                        ROLLING_EVENT.clear()
                        # Update PPO
                        UPDATE_EVENT.set()
                    # Stop training once the maximum number of episodes is reached
                    if GLOBAL_EP >= EP_MAX:
                        COORD.request_stop()
                        break
            # Log the episode reward
            self.summary = tf.Summary(value=[
                tf.Summary.Value(tag="Stage_reward " + str(self.wid),
                                 simple_value=stage_reward)
            ])
            self.kprun.train_writer.add_summary(self.summary, self.step)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % stage_reward,
            )
        self.env.close()
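The class above follows a threaded distributed-PPO layout: a shared global model (GLOBAL_KPRUN), a coordinator, rolling/update events, and a queue of experience. A minimal launch sketch, assuming those globals are already defined as in the original script; the thread wiring and N_WORKERS value here are illustrative only, not the author's code:

import threading

import tensorflow as tf

# Hypothetical launch code: one Worker per Unity instance, each running
# Worker.work() in its own thread under a tf.train.Coordinator.
N_WORKERS = 4
COORD = tf.train.Coordinator()
workers = [Worker(envpath='./ObstacleTower/obstacletower',
                  wid=i,
                  retro=False,
                  realtime_mode=False)
           for i in range(N_WORKERS)]
threads = []
for worker in workers:
    t = threading.Thread(target=worker.work)
    t.start()
    threads.append(t)
COORD.join(threads)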
Example #7
import json
import os

from anyrl.utils.ffmpeg import export_video
import numpy as np
from obstacle_tower_env import ObstacleTowerEnv

from obs_tower2.util import big_obs

with open('stuck_box.json', 'r') as in_file:
    data = json.load(in_file)

env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'], worker_id=1)
env.seed(56)
env.reset()


def f():
    for i, act in enumerate(data):
        obs, _, _, info = env.step(act)
        if i > 5275:
            yield big_obs(obs, info)


export_video('stuck_box.mp4', 168, 168, 10, f())
Example #8
            os.remove(os.path.join(dir, file))

    parser = argparse.ArgumentParser()
    parser.add_argument('environment_filename',
                        default='./ObstacleTower/obstacletower',
                        nargs='?')
    parser.add_argument('--docker_training', action='store_true')
    parser.set_defaults(docker_training=False)
    args = parser.parse_args()

    env = ObstacleTowerEnv(args.environment_filename,
                           docker_training=args.docker_training,
                           retro=False,
                           realtime_mode=False)
    logger.setLevel(logging.WARNING)
    env.seed(4)

    if env.is_grading():
        episode_reward = run_evaluation(env)
    else:
        total_frames = 0
        episode_number = 0
        while True:
            episode_number += 1
            total_frames += run_episode(env, episode_number)

            if episode_number % 200 == 0:
                print(f'Total Frames: {total_frames}')
                episode_reward = run_episode(env, episode_number, test=True)

            if episode_number >= MAX_EPISODES:
Example #9
import os
import random

from obstacle_tower_env import ObstacleTowerEnv

counter = {}
env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'], worker_id=2)
while True:
    env.seed(random.randrange(100))
    env.reset()
    for _ in range(50):
        obs, _, _, _ = env.step(0)
    key = str(obs.flatten().tolist())
    counter[key] = True
    print('got %d start states' % len(counter))
Example #10
    parser.add_argument('environment_filename', default='./ObstacleTower/obstacletower', nargs='?')
    parser.add_argument('--docker_training', action='store_true')
    parser.set_defaults(docker_training=False)
    args = parser.parse_args()
    
    env = ObstacleTowerEnv(args.environment_filename, docker_training=args.docker_training, realtime_mode=True)

    model = get_model()
    optimizer = tf.train.AdamOptimizer()
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
    checkpoint.restore(tf.train.latest_checkpoint('./tf_saves/'))

    total_count = 0
    for i in range(0, 101):
        #setup environment
        env.seed(i)
        obs = env.reset()
        reward = 0
        actions = []
        rerun_actions = False
        obs = env.reset()
        while True:
            observation = process_image(obs)
            prediction = model(tf.cast([observation], dtype=tf.float32))[0]
            print('prediction', prediction)
            selection = np.argmax(prediction)
            print('selection', selection)
            action = action_options[selection]
            print('action', action)
            obs, step_reward, done, info = env.step(action)
Example #11
#!/usr/bin/env python3

from obstacle_tower_env import ObstacleTowerEnv
from matplotlib import pyplot as plt

ENV_PATH = './obstacle-tower-challenge/ObstacleTower/obstacletower'
env = ObstacleTowerEnv(ENV_PATH, retro=False, realtime_mode=True)

# Seeds can be chosen from the range 0-100.
env.seed(5)

# Floors can be chosen from the range 0-24.
env.floor(15)

# The environment provided has a MultiDiscrete action space, where the 4 dimensions are:

# 0. Movement (No-Op/Forward/Back)
# 1. Camera Rotation (No-Op/Counter-Clockwise/Clockwise)
# 2. Jump (No-Op/Jump)
# 3. Movement (No-Op/Right/Left)

print('action space', env.action_space)

# The observation space provided includes a 168x168 image (the camera from the simulation)
# as well as the number of keys held by the agent (0-5) and the amount of time remaining.

print('observation space', env.observation_space)

# Interacting with the environment

obs = env.reset()
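With retro=False, the observation returned by reset() and step() is a tuple containing the camera image, the number of keys held, and the time remaining, and an action is a 4-element vector matching the dimensions listed above. A small illustrative step (the specific action values are only an example):

# Move forward (dimension 0 = 1) while rotating the camera clockwise (dimension 1 = 2).
obs, reward, done, info = env.step([1, 2, 0, 0])
plt.imshow(obs[0])  # obs[0] is the 168x168 camera image
plt.show()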