def testing(self):
    from keepitpossible.common import action_table
    self.table_action = action_table.create_action_table()
    self.MODEL.load()
    done = False
    reward = 0.0
    env = ObstacleTowerEnv(environment_filename=self.SCHEDULE.ENV_PATH,
                           worker_id=self.SCHEDULE.N_WORKER + 1,
                           retro=False,
                           realtime_mode=True)
    obs = env.reset()
    previous_preprocessed_observation_image = obs[0]
    while not done:
        action = self.MODEL.choose_action(
            previous_preprocessed_observation_image)
        # Take the action; get the observation, floors cleared, and agent info
        for _ in self.table_action[int(action)]:
            observation, reward, done, info = env.step(_)
            print(
                "Action_Chose: ",
                action,
                "Action: ",
                _,
                " Reward: ",
                reward)
            if done:
                break
        # Preprocess the data the model needs
        observation_image, keys, time_remaining = observation
        preprocessed_observation_image = observation_image
        previous_preprocessed_observation_image = preprocessed_observation_image
    env.close()
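With retro=False the observation is a tuple whose first elements are the camera image, the key count, and the remaining time, which is what the unpacking above relies on. A minimal standalone sketch of reading those fields (the executable path is a placeholder; some environment versions append further fields, as a later Worker example unpacks four):

from obstacle_tower_env import ObstacleTowerEnv

env = ObstacleTowerEnv('./ObstacleTower/obstacletower', worker_id=1,
                       retro=False, realtime_mode=False)
obs = env.reset()
# First three fields of the non-retro observation tuple.
image, keys, time_remaining = obs[0], obs[1], obs[2]
print(image.shape, keys, time_remaining)
env.close()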
def main():
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: record_improve.py <recording_path>\n')
        sys.exit(1)
    rec = Recording(sys.argv[1])
    env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'],
                           worker_id=random.randrange(11, 20))
    try:
        env.seed(rec.seed)
        if rec.floor:
            env.floor(rec.floor)
        env.reset()
        i = 0
        for i, (action, rew) in enumerate(zip(rec.actions, rec.rewards)):
            _, real_rew, done, _ = env.step(action)
            if not np.allclose(real_rew, rew):
                print('mismatching result at step %d' % i)
                sys.exit(1)
            if done != (i == rec.num_steps - 1):
                print('invalid done result at step %d' % i)
                sys.exit(1)
        print('match succeeded')
    finally:
        env.close()
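The script above only touches a handful of Recording attributes; the class itself is defined elsewhere in this project, so the stand-in below is purely hypothetical and just documents the fields record_improve.py expects:

# Hypothetical stand-in for the Recording interface used above.
class Recording:
    def __init__(self, path):
        self.path = path
        self.seed = 0         # env seed the episode was recorded with
        self.floor = 0        # starting floor (0 means default)
        self.actions = []     # one flattened action id per step
        self.rewards = []     # reward returned by env.step() at each step
        self.num_steps = 0    # should equal len(self.actions)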
Example #3
def main():
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: python record_tail.py <start_floor>\n')
        sys.exit(1)
    start_floor = int(sys.argv[1])
    viewer = EnvInteractor()
    env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'],
                           worker_id=random.randrange(11, 20))
    while True:
        seed = select_seed(floor=start_floor)
        env.seed(seed)
        env.floor(start_floor)
        obs = env.reset()
        viewer.reset()
        record_episode(seed, env, viewer, obs, max_steps=MAX_STEPS)
def seed_hashes():
    mapping = {}
    while len(mapping) < 100:
        if os.path.exists('UnitySDK.log'):
            os.remove('UnitySDK.log')
        while True:
            try:
                env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'],
                                       worker_id=random.randrange(1000))
                break
            except KeyboardInterrupt:
                sys.exit(1)
            except Exception:
                pass
        env.seed(25)  # random argument
        obs = env.reset()
        env.close()
        with open('UnitySDK.log') as f:
            contents = next(l for l in f.readlines() if 'seed:' in l)
        seed = int(contents.split(': ')[-1])
        mapping[str(obs.flatten().tolist())] = seed
        yield str(obs.flatten().tolist()), seed
    return mapping
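A small usage sketch for the generator above; it assumes OBS_TOWER_PATH is set and that the working directory is where the Unity player writes UnitySDK.log:

if __name__ == '__main__':
    # Build an observation-hash -> seed lookup table from the generator.
    hash_to_seed = dict(seed_hashes())
    print('collected %d seed hashes' % len(hash_to_seed))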
Example #5
# 1. Camera Rotation (No-Op/Counter-Clockwise/Clockwise)
# 2. Jump (No-Op/Jump)
# 3. Movement (No-Op/Right/Left)

print(env.action_space.nvec)
print(env.observation_space)
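# For the default (non-retro) build the action space printed above is a
# MultiDiscrete space with nvec [3 3 2 3] -- the same branch sizes passed to
# ActionFlattener([3, 3, 2, 3]) in a later example -- so a full action is a
# 4-element vector: movement forward/back, camera rotation, jump, and
# movement left/right.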


#plt.imshow(obs[0])
#plt.show()
#print(env.unwrapped.get_action_meanings())


# tower 0, floor 10 = second room holds key
config = {'tower-seed': 0, 'starting-floor': 10, 'agent-perspective': 0, 'allowed-rooms': 1, 'allowed-modules': 0, 'allowed-floors': 0}
obs = env.reset(config=config)

action = env.action_space.sample()
allowed_action = False
allowed_actions = np.array([np.array([1, 0, 0, 0]), np.array([0, 1, 0, 0]), np.array([0, 2, 0, 0]), np.array([1, 0, 1, 0])])

# Keep sampling until the action is one of the allowed actions
while not allowed_action:
    if (allowed_actions == action).all(1).any():
        allowed_action = True
    else:
        action = env.action_space.sample()

action = np.array([1, 0, 0, 0])
obs, reward, done, info = env.step(action)
obs, reward, done, info = env.step(action)
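For readability, the allowed vectors above can be labelled; the names below follow the action mapping used by the WrappedObstacleTowerEnv example later on this page:

ACTION_NAMES = {
    (1, 0, 0, 0): 'move forward',
    (0, 1, 0, 0): 'rotate camera left',
    (0, 2, 0, 0): 'rotate camera right',
    (1, 0, 1, 0): 'move forward + jump',
}
print(ACTION_NAMES[tuple(action)])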
Example #6
def main():
    # Load parsed parameters
    parser = otc_arg_parser()
    args = parser.parse_args()

    #Challenge environment
    # if args.env == 'ObtRetro-v6':
    #     env = ObstacleTowerEnv(
    #         '/home/home/Data/Carmen/py_workspace/ObstacleTower_v3/ObstacleTower-v3.1/obstacletower.x86_64',
    #         timeout_wait=6000,
    #         retro=args.retro,
    #         realtime_mode=args.test)
    #     env = RetroWrapper(env, args.sample_normal)
    #     env = OTCPreprocessing(env, args.action_reduction)
    #     # if show_obs:
    #     #     env = RenderObservations(env)
    #     #     env = KeyboardControlWrapper(env)
    # else:
    env = ObstacleTowerEnv(
        '/home/home/Data/Carmen/py_workspace/ObstacleTower_v3/ObstacleTower-v3.1/obstacletower.x86_64',
        retro=args.retro,
        realtime_mode=args.test,
        timeout_wait=6000)

    #env = ObstacleTowerEnv('OBSTACLE_TOWER_PATH', retro=args.retro, realtime_mode=args.test, timeout_wait=6000)

    #Dict of actions created by the ObstacleTowerEnv Class of obstacle_tower_env library
    #print("ACTIONS:", env._flattener.action_lookup)

    print('FEATURES :', args.features)

    #Preprocess the environment (Grey Scales and action space reduction)
    env = OTCPreprocessing(env, args.action_reduction, args.features)
    env = DummyVecEnv([lambda: env])
    #env = VecEnv(1, env.observation_space, env.action_space)

    print("ACTION SPACE  ///////////:", env.action_space)
    print("OBSERVATION SPACE ///////////////:", env.observation_space)
    #env = make_vec_env(env, n_envs=4)

    ########Training########

    #Study of the impact of different values of the PPO params
    if args.study:
        params_test(MlpPolicy, env)

    #If no Study Mode
    else:
        #If no Test Mode
        if not args.test:

            seed = 0
            random.seed(seed)

            if args.pretrained_model:

                t = 300000

                model = PPO2.load(args.pretrained_model,
                                  env=env,
                                  tensorboard_log=args.tensorboard_logdir)

            else:

                t = 0

                #If Generalized Advantage Estimator is used
                if args.use_gae:

                    model = PPO2(MlpPolicy,
                                 env,
                                 n_steps=args.num_steps,
                                 verbose=1,
                                 tensorboard_log=args.tensorboard_logdir,
                                 cliprange=args.clip_param,
                                 learning_rate=args.lr,
                                 ent_coef=args.entropy_coef,
                                 vf_coef=args.value_loss_coef,
                                 max_grad_norm=args.max_grad_norm,
                                 gamma=args.gamma,
                                 lam=args.gae_lambda,
                                 noptepochs=args.ppo_epoch,
                                 seed=seed)

                #If Generalized Advantage Estimator is not used
                else:

                    model = PPO2(MlpPolicy,
                                 env,
                                 n_steps=args.num_steps,
                                 verbose=1,
                                 tensorboard_log=args.tensorboard_logdir,
                                 cliprange=args.clip_param,
                                 learning_rate=args.lr,
                                 ent_coef=args.entropy_coef,
                                 vf_coef=args.value_loss_coef,
                                 max_grad_norm=args.max_grad_norm,
                                 gamma=args.gamma,
                                 noptepochs=args.ppo_epoch,
                                 seed=seed)
        else:

            model = PPO2.load(args.pretrained_model, env=env)

        #model.learn(total_timesteps=50000)
        #model.save("ObstacleTower_prueba")

        filename = 'argsparams.txt'
        os.makedirs(args.results_dir, exist_ok=True)
        myfile = open(args.results_dir + filename, 'a')
        myfile.write(
            'clip range: %f \n learning rate: %f \n entropy coefficient: %f \n value loss coefficient: %f \n '
            'max grad norm: %f \n gamma: %f \n ppo epoch: %f \n' %
            (args.clip_param, args.lr, args.entropy_coef, args.value_loss_coef,
             args.max_grad_norm, args.gamma, args.ppo_epoch))
        myfile.close()

        if not args.test:
            while t < args.num_env_steps:
                #TRAIN MODEL
                if t == 0:
                    model.learn(total_timesteps=args.eval_interval)

                else:
                    model.learn(total_timesteps=args.eval_interval,
                                reset_num_timesteps=False)

                os.makedirs(GLOBAL_PATH, exist_ok=True)
                print("Saving in '" + GLOBAL_PATH + "'")
                model.save(GLOBAL_PATH + args.training_name + "_" +
                           str(int(t)).zfill(10))

                avg_reward, avg_floor = test(
                    t, model, env=env, global_path=args.results_dir)  # Test
                log('T = ' + str(t) + ' / ' + str(args.num_env_steps) +
                    ' | Avg. reward: ' + str(avg_reward) + ' | Avg. floor: ' +
                    str(avg_floor))

                t += args.eval_interval
        else:
            obs = env.reset()
            t = 0
            while t < args.num_env_steps:

                action, _states = model.predict(obs)
                obs, rewards, done, info = env.step(action)
                #print('action :', info)
                env.render('rgb_array')
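# The training loop above relies on helpers that are not part of this excerpt
# and are only characterized here:
#   test(t, model, env=..., global_path=..., i=...) -> (avg_reward, avg_floor)
#       runs the current model for some evaluation episodes
#   log(message) -- simple logging helper
# GLOBAL_PATH is likewise a module-level output directory defined elsewhere.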
class Worker(threading.Thread):
    episode_count = 0
    mean_reward = 0
    best_score = 0
    global_steps = 0
    save_lock = threading.Lock()

    def __init__(self, result_queue, idx, save_dir, params):
        super(Worker, self).__init__()
        self.result_queue = result_queue
        self.worker_idx = idx
        self.save_dir = save_dir
        self.model_path = os.path.join(self.save_dir, 'model_a3c')

        self.env = ObstacleTowerEnv(params['env_path'],
                                    worker_id=self.worker_idx,
                                    retro=False,
                                    realtime_mode=False,
                                    greyscale=False,
                                    config=train_env_reset_config)

        self.action_size = params['action_size']
        self._action_lookup = params['action_lookup']
        self.input_shape = self.env.observation_space[0].shape  # (84, 84, 3)
        self._last_health = 99999.
        self._last_keys = 0

        self.global_model = params['global_model']
        # self.local_model = CNN(self.action_size, self.input_shape)
        self.local_model = CnnGru(self.action_size, self.input_shape)

        self.ac_ckpt = params['ckpt']
        self.ac_manager = params['ckpt_mgr']

        self.current_time = params['log_timestamp']
        train_log_dir = './logs/' + self.current_time + '/worker_' + str(
            self.worker_idx)
        self.worker_summary_writer = tf.summary.create_file_writer(
            train_log_dir)

        self.timesteps = params['timesteps']
        self.batch_size = params['batch_size']
        self.gamma = params['gamma']
        self.lr = params['lr']
        self.opt = params['optimizer']
        self.eps = np.finfo(np.float32).eps.item()

    def get_updated_reward(self, reward, new_health, new_keys, done):
        new_health = float(new_health)
        new_reward = 0.0
        if done:  # reset params when game is terminated
            self._last_health = 99999.
            self._last_keys = 0
        else:
            # opened a door, solved a puzzle, picked up a key
            if 0.1 <= reward < 1:
                new_reward += 0.5

            # crossing a floor - between [1, 4]
            if reward >= 1:
                new_reward += (new_health / 10000)

            # found time orb / crossed a floor
            if new_health > self._last_health:
                new_reward += 0.5

        return new_reward

    def log_worker_metrics(self, episode_reward, loss, step):
        with self.worker_summary_writer.as_default():
            with tf.name_scope('worker'):
                tf.summary.scalar('reward', episode_reward, step=step)
                tf.summary.scalar('loss', loss, step=step)
            self.worker_summary_writer.flush()

    def run(self):
        mem = Memory()
        ep_count = 0
        timestep = 0
        entropy_term = 0
        ep_reward = 0.
        ep_steps = 0
        ep_loss = 0.

        done = False
        obs = self.env.reset()
        state, self._last_keys, self._last_health, _ = obs

        while timestep <= self.timesteps:
            i = 0
            with tf.GradientTape() as tape:
                while i < self.batch_size:
                    # collect experience
                    # get action as per policy
                    state = tf.convert_to_tensor(state)
                    state = tf.expand_dims(state, axis=0)
                    action_probs, critic_value = self.local_model(
                        [state, float(self._last_health)], training=True)

                    entropy = -np.sum(action_probs * np.log(action_probs))
                    entropy_term += entropy

                    # choose most probable action
                    dist = tfp.distributions.Categorical(probs=action_probs,
                                                         dtype=tf.float32)
                    action_index = int(dist.sample().numpy())
                    action = self._action_lookup[action_index]

                    # perform action in game env
                    for _ in range(4):  # frame skipping (i keeps counting env steps for the batch)
                        obs, reward, done, _ = self.env.step(action)
                        state, new_keys, new_health, cur_floor = obs
                        reward = self.get_updated_reward(
                            reward, new_health, new_keys, done)
                        self._last_health = new_health
                        self._last_keys = new_keys
                        ep_reward += reward
                        ep_steps += 1
                        i += 1
                        timestep += 1

                    # store experience
                    mem.store(action_prob=tf.math.log(
                        action_probs[0, action_index]),
                              value=critic_value[0, 0],
                              reward=reward)

                    if done:
                        break

                # backpropagation
                total_loss = self.local_model.compute_loss(
                    mem, state, done, self.gamma, self.eps, entropy_term)
                ep_loss += total_loss
                Worker.global_steps += ep_steps

            grads = tape.gradient(total_loss,
                                  self.local_model.trainable_variables
                                  )  # calculate local gradients
            self.opt.apply_gradients(
                zip(grads, self.global_model.trainable_variables)
            )  # send local gradients to global model
            self.local_model.set_weights(self.global_model.get_weights(
            ))  # update local model with new weights
            mem.clear()

            if done:
                Worker.mean_reward = (Worker.mean_reward * Worker.episode_count
                                      + ep_reward) / (Worker.episode_count + 1)

                self.log_worker_metrics(ep_reward, ep_loss, ep_count)
                print(
                    "Episode: {} | Mean Reward: {:.3f} | Episode Reward: {:.3f} | Loss: {:.3f} | Steps: {} | Total Steps: {} | Worker: {}"
                    .format(Worker.episode_count, Worker.mean_reward,
                            ep_reward, ep_loss, ep_steps, Worker.global_steps,
                            self.worker_idx))
                self.result_queue.put((Worker.mean_reward, total_loss))
                Worker.episode_count += 1
                ep_count += 1

                obs = self.env.reset()
                state, _, _, _ = obs

                # use a lock to save local model and to print to prevent data races.
                if ep_reward > Worker.best_score:
                    with Worker.save_lock:
                        self.ac_manager.save()
                        print("Saved checkpoint for step {}".format(
                            int(self.ac_ckpt.step)))
                        self.ac_ckpt.step.assign_add(1)

                        keras.models.save_model(self.global_model,
                                                self.model_path)
                        print('\nSaved best model to: {}, episode score: {}\n'.
                              format(self.model_path, ep_reward))
                        Worker.best_score = ep_reward

                entropy_term = 0
                ep_reward = 0.
                ep_steps = 0
                ep_loss = 0.

        self.result_queue.put(None)
        self.env.close()
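A minimal launch sketch for the Worker thread above, left as comments because the params dict (env path, global model, optimizer, checkpoint manager, action lookup) is assembled elsewhere in this project:

# from queue import Queue
# result_queue = Queue()
# workers = [Worker(result_queue, idx, save_dir='./model_files/', params=params)
#            for idx in range(num_workers)]
# for w in workers:
#     w.start()
# for w in workers:
#     w.join()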
Example #8
class Worker(object):
    def __init__(self,
                 envpath,
                 wid,
                 retro,
                 realtime_mode,
                 env_seed=0,
                 env_floor=0):
        self.wid = wid
        self.env = ObstacleTowerEnv(environment_filename=envpath,
                                    worker_id=wid,
                                    retro=retro,
                                    realtime_mode=realtime_mode)
        self.kprun = GLOBAL_KPRUN
        self.tableAction = self.createActionTable()
        # Configure the level (seed and floor)
        self.env_seed = env_seed
        self.env_floor = env_floor
        self.step = 0
        self.summary = tf.Summary(value=[
            tf.Summary.Value(tag="Stage_reward " + str(self.wid),
                             simple_value=0)
        ])
        self.kprun.train_writer.add_summary(self.summary, 0)

    def createActionTable(self):
        tableAction = []
        for a in range(0, 3):
            for b in range(0, 3):
                for c in range(0, 2):
                    tableAction.append([a, b, c, 0])
        # print("Action option: ", tableAction[0:17])
        return tableAction

    def reward_compute(self, done, reward_total, keys, previous_keys, reward,
                       previous_reward, time_remaining,
                       previous_time_remaining, previous_stage_time_remaining):
        # Define the reward formula
        # reward is the number of floors cleared, as reported by the environment
        # keys is the number of keys picked up
        # time_remaining is the time left on the clock
        # The maximum reward for clearing a floor is 10
        # A key is worth 5
        # A time orb is only worth 0.5 for now, because remaining time is settled
        # at the end of a floor and the rewards would otherwise accumulate.
        # On clearing a floor, grant ten times the clear reward minus
        # (floor start time - remaining time) / 1000
        # print("time_remaining ", time_remaining,
        #       " previous_time_remaining ", previous_time_remaining,
        #         " reward ", reward)
        # Passing through a green door that opens adds 0.1
        if (reward - previous_reward) > 0 and (reward - previous_reward) < 0.3:
            reward_total += 3
        elif (reward - previous_reward) > 0.9:
            # *** If the remaining time exceeds the floor's starting time this turns into
            # a bonus, which could greatly increase the agent's appetite for time orbs.
            # *** Another option is to add remaining time / 1000 directly, which avoids
            # the accumulation effect.
            print("Pass ", reward, " Stage!")
            # reward_total += (reward - previous_reward) * 100 - \
            #                 (previous_stage_time_remaining - time_remaining)

            reward_total += 200
            # After clearing a floor, carry the time over to the next floor; store this
            # round's time for the next clear calculation
            previous_time_remaining = time_remaining
            previous_stage_time_remaining = time_remaining
            # Lesson 1 repeat
            if reward > 6.5:
                # self.total_step +=1
                # if self.total_step >=5:
                #     done = True
                #     return reward_total, previous_stage_time_remaining, done
                self.env.seed(np.random.randint(5))
                # env.reset()
                done = True
            return reward_total, previous_stage_time_remaining, done

        # A time orb or key may be collected while clearing a floor, so these bonuses can stack
        if keys > previous_keys:
            # print("Get Key")
            reward_total += 5

        if previous_time_remaining < time_remaining and previous_time_remaining != 0:
            # print("Get time power up")
            reward_total += 2
        else:
            reward_total -= 0.5
        if done and previous_time_remaining > 100:
            print("Agent died")
            # The more time remaining when the agent dies, the larger the penalty
            # reward_total -= (10 + time_remaining / 100)
            reward_total -= 100
        return reward_total, previous_stage_time_remaining, done

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        # Configure the level (seed and floor)
        self.env.seed(self.env_seed)
        self.env.floor(self.env_floor)
        # Loop until the target number of episodes is reached
        while not COORD.should_stop():
            # Track the step count
            self.step += 1
            # Reset the level
            obs = self.env.reset()
            # Initialize episode state
            done = False
            stage_reward = 0.0
            reward = 0
            keys = 0
            # Used to detect time pickups; time_remaining is unknown before the first
            # step, so define it up front
            time_remaining = 3000
            previous_stage_time_remaining = time_remaining
            # Preprocess the image
            # previous_preprocessed_observation_image = np.reshape(obs[0], [-1])
            previous_preprocessed_observation_image = obs[0]
            buffer_s, buffer_a, buffer_r = [], [], []
            # While the agent is still alive
            while not done:
                # If the model is being updated, wait for the update to finish
                if not ROLLING_EVENT.is_set():
                    # Wait for the update to finish
                    ROLLING_EVENT.wait()
                    # Clear the buffers and collect data with the updated policy
                    buffer_s, buffer_a, buffer_r = [], [], []

                # Save the previous state values for computing the reward
                previous_keys = keys
                previous_reward = reward
                previous_time_remaining = time_remaining

                # Choose an action based on the previous state
                action = self.kprun.choose_action(
                    previous_preprocessed_observation_image)
                action = np.clip(np.random.normal(action, 1.), *[6, 12])

                # Take the action; get the observation, floors cleared, and agent info
                observation, reward, done, info = self.env.step(
                    np.array(self.tableAction[int(action)]))

                # Preprocess the data the model needs
                observation_image, keys, time_remaining = observation
                # preprocessed_observation_image = np.reshape(
                #     observation_image, [-1])
                preprocessed_observation_image = observation_image
                stage_reward, previous_stage_time_remaining, done = self.reward_compute(
                    done=done,
                    reward_total=stage_reward,
                    keys=keys,
                    previous_keys=previous_keys,
                    reward=reward,
                    previous_reward=previous_reward,
                    time_remaining=time_remaining,
                    previous_time_remaining=previous_time_remaining,
                    previous_stage_time_remaining=previous_stage_time_remaining
                )
                # Normalize the reward
                stage_reward = (stage_reward + 8) / 8

                # Store this transition in the buffers
                buffer_s.append(np.array([preprocessed_observation_image]))
                buffer_a.append(action)
                buffer_r.append(stage_reward)

                # Save the image to use as the previous observation on the next step
                previous_preprocessed_observation_image = preprocessed_observation_image

                # When an update is due, do some local processing first.
                GLOBAL_UPDATE_COUNTER += 1
                # If the local buffer is full enough, prepare the update data locally
                if len(buffer_s) == EP_LEN - \
                        1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.kprun.get_v(preprocessed_observation_image)
                    # Compute the discounted rewards
                    discounted_r = []
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()
                    # Arrange the dimensions
                    bs, ba, br = np.vstack(buffer_s), np.vstack(
                        buffer_a), np.array(discounted_r)[:, np.newaxis]
                    # Put the data into the shared queue
                    QUEUE.put(bs)
                    QUEUE.put(ba)
                    QUEUE.put(br)
                    # print("len(buffer_s)", len(buffer_s))
                    # print("bs.shape", bs.shape)
                    # Clear the temporary buffers
                    buffer_s, buffer_a, buffer_r = [], [], []
                    # Once the global step count reaches the minimum batch size, run a full update
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        # Stop collecting data
                        ROLLING_EVENT.clear()
                        # Update PPO
                        UPDATE_EVENT.set()
                    # Stop training once the maximum number of episodes is reached
                    if GLOBAL_EP >= EP_MAX:
                        COORD.request_stop()
                        break
            # Log the reward
            self.summary = tf.Summary(value=[
                tf.Summary.Value(tag="Stage_reward " + str(self.wid),
                                 simple_value=stage_reward)
            ])
            self.kprun.train_writer.add_summary(self.summary, self.step)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % stage_reward,
            )
        self.env.close()
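The discounted-return bookkeeping inside work() is easy to check in isolation. A standalone sketch (GAMMA is a module-level constant in the original; 0.9 here is just an illustrative value):

# Rewards are traversed in reverse and bootstrapped from the critic value v_s_
# of the last state, exactly as in the buffer_r loop above.
def discount_rewards(buffer_r, v_s_, gamma=0.9):
    discounted_r = []
    for r in buffer_r[::-1]:
        v_s_ = r + gamma * v_s_
        discounted_r.append(v_s_)
    discounted_r.reverse()
    return discounted_r

print(discount_rewards([1.0, 0.0, 2.0], v_s_=0.5))  # approximately [2.98, 2.21, 2.45]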
Example #9
import torch
from obstacle_tower_env import ObstacleTowerEnv
from matplotlib import pyplot as plt
from collections import deque
from sac import SoftActorCriticAgent
import torchvision.transforms.functional as TF
import numpy as np

env = ObstacleTowerEnv(retro=False, realtime_mode=False)
print(env.action_space)
print(env.observation_space)

agent = SoftActorCriticAgent()

state = env.reset()
#print(state.shape)
state = state[0]
state = TF.to_tensor(state)
print(state.size())
scores = []
mean_scores_100 = deque(maxlen=100)
version = 'v3'
for episode in range(400):
    timesteps = 0
    rewards = 0
    for steps in range(10000):
        timesteps += 1
        actions, actions_env_format = agent.select_actions(state)
        next_state, reward, done, info = env.step(actions_env_format)
        next_state = next_state[0]
class Worker(threading.Thread):
    episode_count = 0
    running_reward = 0
    best_score = 0
    global_steps = 0
    save_lock = threading.Lock()

    def __init__(self, result_queue, params, save_dir):
        super(Worker, self).__init__()
        self.result_queue = result_queue
        self.save_dir = save_dir
        self.model_path = os.path.join(self.save_dir, 'model_a3c_distributed')

        self.env = ObstacleTowerEnv(params['env_path'],
                                    worker_id=1,
                                    retro=False,
                                    realtime_mode=False,
                                    greyscale=False,
                                    config=train_env_reset_config)

        self.action_size = params['action_size']
        self._action_lookup = params['action_lookup']
        self.input_shape = self.env.observation_space[0].shape  # (84, 84, 3)
        self._last_health = 99999.
        self._last_keys = 0

        self.global_model = params['global_model']
        self.mirrored_strategy = tf.distribute.MirroredStrategy()
        with self.mirrored_strategy.scope():
            # self.local_model = CNN(self.action_size, self.input_shape)
            self.local_model = CnnGru(self.action_size, self.input_shape)

        self.current_time = params['log_timestamp']
        train_log_dir = './logs/' + self.current_time + '/worker_1'
        self.worker_summary_writer = tf.summary.create_file_writer(
            train_log_dir)

        self.timesteps = params['timesteps']
        self.batch_size = params['batch_size']
        self.gamma = params['gamma']
        self.lr = params['lr']
        self.opt = params['optimizer']
        self.eps = np.finfo(np.float32).eps.item()
        self.ep_loss = 0.0

    def get_updated_reward(self, reward, new_health, new_keys, done):
        new_health = float(new_health)
        if done:  # penalize when game is terminated
            self._last_health = 99999.
            self._last_keys = 0
            reward = -1
        else:
            # crossing a floor- between [1, 4]
            if reward >= 1:
                reward += (new_health / 10000)

            # found time orb / crossed a floor
            if new_health > self._last_health:
                reward += 0.1

            # found a key
            if new_keys > self._last_keys:
                reward += 0.1

        return reward

    def log_worker_metrics(self, episode_reward, avg_reward, loss, step):
        with self.worker_summary_writer.as_default():
            tf.summary.scalar('reward', episode_reward, step=step)
            tf.summary.scalar('moving_reward', avg_reward, step=step)
            tf.summary.scalar('loss', loss, step=step)
            self.worker_summary_writer.flush()

    def run(self):
        mem = Memory()
        rewards = []
        ep_count = 1
        timestep = 0
        entropy_term = 0
        ep_reward = 0.
        ep_steps = 0
        ep_loss = 0.

        done = False
        obs = self.env.reset()
        state, _, _, _ = obs

        while timestep <= self.timesteps:
            with tf.GradientTape() as tape:
                for i in range(self.batch_size):
                    # collect experience
                    # get action as per policy
                    state = tf.convert_to_tensor(state)
                    state = tf.expand_dims(state, axis=0)
                    action_probs, critic_value = self.local_model(
                        state, training=True)

                    entropy = -np.sum(action_probs * np.log(action_probs))
                    entropy_term += entropy

                    # choose most probable action
                    action_index = np.random.choice(self.action_size,
                                                    p=np.squeeze(action_probs))
                    action = self._action_lookup[action_index]

                    # perform action in game env
                    for _ in range(4):  # frame skipping
                        obs, reward, done, _ = self.env.step(action)
                        state, new_keys, new_health, cur_floor = obs

                        reward = self.get_updated_reward(
                            reward, new_health, new_keys, done)
                        self._last_health = new_health
                        self._last_keys = new_keys

                        ep_reward += reward
                        ep_steps += 1
                        timestep += 1

                    # store experience
                    mem.store(action_prob=action_probs[0, action_index],
                              value=critic_value[0, 0],
                              reward=reward)

                    if done:
                        break

                # backpropagation
                total_loss = self.local_model.compute_loss(
                    mem, state, done, self.gamma, self.eps, entropy_term)
                ep_loss += total_loss
                Worker.global_steps += ep_steps

            grads = tape.gradient(total_loss,
                                  self.local_model.trainable_variables
                                  )  # calculate local gradients
            # self.opt.apply_gradients(zip(grads, self.global_model.trainable_variables))  # send local gradients to global model
            # self.local_model.set_weights(self.global_model.get_weights())  # update local model with new weights
            mem.clear()

            if done:
                rewards.append(ep_reward)
                Worker.running_reward = sum(rewards[-10:]) / 10

                self.log_worker_metrics(ep_reward, Worker.running_reward,
                                        ep_loss, ep_count)
                print(
                    "Episode: {} | Average Reward: {:.3f} | Episode Reward: {:.3f} | Loss: {:.3f} | Steps: {} | Total Steps: {} | Worker: {}"
                    .format(Worker.episode_count, Worker.running_reward,
                            ep_reward, ep_loss, ep_steps, Worker.global_steps,
                            1))
                self.result_queue.put((Worker.running_reward, total_loss))
                Worker.episode_count += 1
                ep_count += 1

                obs = self.env.reset()
                state, _, _, _ = obs

                # use a lock to save local model and to print to prevent data races.
                if ep_reward > Worker.best_score:
                    with Worker.save_lock:
                        print(
                            '\nSaving best model to: {}, episode score: {}\n'.
                            format(self.model_path, ep_reward))
                        keras.models.save_model(self.local_model,
                                                self.model_path)
                        Worker.best_score = ep_reward

                entropy_term = 0
                ep_reward = 0.
                ep_steps = 0
                ep_loss = 0.

        keras.models.save_model(self.local_model, self.model_path)
        self.result_queue.put(None)
        self.env.close()
class RandomAgent:
    """Random Agent that will play the specified game
      Args:
        env_name: Name of the environment to be played
        max_eps: Maximum number of episodes to run agent for.
    """
    def __init__(self,
                 env_path,
                 train=False,
                 evaluate=False,
                 eval_seeds=[],
                 max_eps=100,
                 save_dir=None,
                 plot=False):
        if train:
            self.env = ObstacleTowerEnv(env_path,
                                        worker_id=0,
                                        retro=False,
                                        realtime_mode=False,
                                        config=train_env_reset_config)
        else:
            if evaluate:
                self.env = ObstacleTowerEnv(env_path,
                                            worker_id=0,
                                            retro=False,
                                            realtime_mode=False,
                                            config=eval_env_reset_config)
                self.env = ObstacleTowerEvaluation(self.env, eval_seeds)
            else:
                self.env = ObstacleTowerEnv(env_path,
                                            worker_id=0,
                                            retro=False,
                                            realtime_mode=True,
                                            config=eval_env_reset_config)
        self.max_episodes = max_eps
        self.global_moving_average_reward = 0
        self.save_dir = save_dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        self.plot = plot
        self.res_queue = Queue()

    def train(self):
        start_time = time.time()
        reward_avg = 0
        global_steps = 0
        moving_average_rewards = []
        for episode in range(self.max_episodes):
            done = False
            self.env.reset()
            reward_sum = 0.0
            steps = 0
            while not done:
                # Sample randomly from the action space and step
                _, reward, done, _ = self.env.step(
                    self.env.action_space.sample())
                steps += 1
                global_steps += 1
                reward_sum += reward

            if self.plot:
                # Record statistics
                moving_average_rewards.append(reward_sum)

            reward_avg += reward_sum
            self.global_moving_average_reward = record(
                episode, reward_sum, 0, self.global_moving_average_reward,
                self.res_queue, 0, steps, global_steps)
        end_time = time.time()
        print("\nTraining complete. Time taken = {} secs".format(end_time -
                                                                 start_time))
        final_avg = reward_avg / float(self.max_episodes)
        print("Average score across {} episodes: {}".format(
            self.max_episodes, final_avg))

        if self.plot:
            plt.plot(moving_average_rewards)
            plt.ylabel('Moving average episode reward')
            plt.xlabel('Step')
            plt.savefig(
                os.path.join(self.save_dir, 'model_random_moving_average.png'))

        self.env.close()
        return final_avg

    def play_single_episode(self):
        action_space = ActionSpace()
        print("Playing single episode...")
        done = False
        step_counter = 0
        reward_sum = 0
        obs = self.env.reset()
        state, _, _, _ = obs

        try:
            while not done:
                action = self.env.action_space.sample()
                obs, reward, done, info = self.env.step(action)
                reward_sum += reward
                print("{}. Reward: {}, action: {}".format(
                    step_counter, reward_sum,
                    action_space.get_action_meaning(action)))
                step_counter += 1
        except KeyboardInterrupt:
            print("Received Keyboard Interrupt. Shutting down.")
        finally:
            if not self.evaluate:
                self.env.close()
            return reward_sum

    def evaluate(self):
        # run episodes until evaluation is complete
        while not self.env.evaluation_complete:
            episode_reward = self.play_single_episode()

        pprint(self.env.results)
        self.env.close()
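A usage sketch for RandomAgent; the executable path is a placeholder, and train_env_reset_config plus the record() helper are assumed to be importable from this project:

if __name__ == '__main__':
    agent = RandomAgent('./ObstacleTower/obstacletower',
                        train=True,
                        max_eps=5,
                        save_dir='./model_files/',
                        plot=True)
    print('average reward:', agent.train())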
Example #12
def params_test(policy,
                env,
                n_steps=args.n_steps_test,
                ppo_epochs=args.ppo_epoch_test,
                clip_params=args.clip_param_test,
                gammas=args.gamma_test,
                lambdas=args.gae_lambda_test,
                loss_coefs=args.value_loss_coef_test,
                entropy_coefs=args.entropy_coef_test,
                lrs=args.lr_test):

    i = 0
    for n_step in n_steps:  #[512]
        for ppo_epoch in ppo_epochs:  #[4]
            for clip_param in clip_params:  #[0.1, 0.2]
                for gamma in gammas:  #[0.99, 0.9997]
                    for lb in lambdas:  #[0.95]
                        for value_loss_coef in loss_coefs:  #[0.5]
                            for entropy_coef in entropy_coefs:  #[0.01, 0.001]
                                for lr in lrs:  #[2.5e-4, 7e-4]

                                    i += 1

                                    # if i in [1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 15, 16]: # 6 and 14 still pending a full run
                                    #     pass
                                    # else:

                                    # if i == 15:
                                    #     print('Continue Training')
                                    #     trained_model = "/home/home/Data/Carmen/py_workspace/ObstacleTower_v3/python_scripts/Obstacle_Tower_Carmen_Raposo/results/June-17-2020_01_46AM/model/__15__study_0000350000.zip"
                                    #     model = PPO2.load(trained_model, env=env, tensorboard_log="/home/home/Data/Carmen/py_workspace/ObstacleTower_v3/python_scripts/Obstacle_Tower_Carmen_Raposo/results/June-17-2020_01_46AM/tensorboard/15/")
                                    #     t = 375000
                                    #     GLOBAL_PATH = "/home/home/Data/Carmen/py_workspace/ObstacleTower_v3/python_scripts/Obstacle_Tower_Carmen_Raposo/results/June-17-2020_01_46AM/model/"
                                    #     filename = 'argsparams' + str(i) + '.txt'
                                    #     os.makedirs(args.results_dir, exist_ok=True)
                                    #
                                    #
                                    # else:
                                    print(
                                        'Start Training: \n n_step: %f \n ppo_epoch: %f \n clip_param: %f \n gamma: %f'
                                        '\n lambda: %f \n value_loss_coef: %f \n entropy_coef: %f \n learning_rate : %f'
                                        % (n_step, ppo_epoch, clip_param,
                                           gamma, lb, value_loss_coef,
                                           entropy_coef, lr))
                                    #Fixed seed
                                    seed = 0
                                    random.seed(seed)
                                    #env.seed(5)

                                    if args.use_gae_test:

                                        model = PPO2(
                                            policy,
                                            env,
                                            n_steps=n_step,
                                            verbose=1,
                                            tensorboard_log=args.
                                            tensorboard_logdir + str(i) + '/',
                                            cliprange=clip_param,
                                            learning_rate=lr,
                                            ent_coef=entropy_coef,
                                            vf_coef=value_loss_coef,
                                            max_grad_norm=args.max_grad_norm,
                                            gamma=gamma,
                                            lam=lb,
                                            noptepochs=ppo_epoch,
                                            seed=seed)
                                    else:

                                        model = PPO2(
                                            policy,
                                            env,
                                            n_steps=n_step,
                                            verbose=1,
                                            tensorboard_log=args.
                                            tensorboard_logdir + str(i) + '/',
                                            cliprange=clip_param,
                                            learning_rate=lr,
                                            ent_coef=entropy_coef,
                                            vf_coef=value_loss_coef,
                                            max_grad_norm=args.max_grad_norm,
                                            gamma=gamma,
                                            noptepochs=ppo_epoch,
                                            seed=seed)

                                    #Save the values of the configured parameters
                                    filename = 'argsparams' + str(i) + '.txt'
                                    os.makedirs(args.results_dir,
                                                exist_ok=True)
                                    myfile = open(args.results_dir + filename,
                                                  'w+')
                                    myfile.write(
                                        'n_step: %f \n ppo_epoch: %f \n clip_param: %f \n gamma: %f \n lambda: %f '
                                        '\n value_loss_coef: %f \n entropy_coef: %f \n learning_rate : %f'
                                        % (n_step, ppo_epoch, clip_param,
                                           gamma, lb, value_loss_coef,
                                           entropy_coef, lr))
                                    myfile.close()
                                    t = 0

                                    #t = 0
                                    while t < args.num_env_steps_test:
                                        # TRAIN MODEL
                                        try:
                                            if t == 0:
                                                model.learn(
                                                    total_timesteps=args.
                                                    eval_interval)

                                            else:
                                                model.learn(
                                                    total_timesteps=args.
                                                    eval_interval,
                                                    reset_num_timesteps=False)

                                            os.makedirs(GLOBAL_PATH,
                                                        exist_ok=True)
                                            print("Saving in '" + GLOBAL_PATH +
                                                  "'")
                                            model.save(GLOBAL_PATH + '__' +
                                                       str(i) + '__' +
                                                       args.training_name +
                                                       "_" +
                                                       str(int(t)).zfill(10))

                                            avg_reward, avg_floor = test(
                                                t,
                                                model,
                                                env=env,
                                                global_path=GLOBAL_PATH +
                                                '__' + str(i),
                                                i=i)  # Test
                                            log('T = ' + str(t) + ' / ' +
                                                str(args.num_env_steps_test) +
                                                ' | Avg. reward: ' +
                                                str(avg_reward) +
                                                ' | Avg. floor: ' +
                                                str(avg_floor))

                                            t += args.eval_interval
                                        except Exception as e:

                                            env.close()

                                            myfile = open(
                                                GLOBAL_PATH + filename, 'a')
                                            myfile.write(
                                                '\n An exception %s has occurred at step %f'
                                                % (e, t))
                                            myfile.close()

                                            del model

                                            from obstacle_tower_env import ObstacleTowerEnv
                                            env = ObstacleTowerEnv(
                                                '/home/home/Data/Carmen/py_workspace/ObstacleTower_v3/ObstacleTower-v3.1/obstacletower.x86_64',
                                                retro=args.retro,
                                                realtime_mode=args.test,
                                                timeout_wait=6000)

                                            break

                                    env.reset()
                                    del model
                                    break
Example #13
     t = threading.Thread(target=worker.work, args=())
     t.start()  # training
     threads.append(t)
 # Create the thread that updates the model
 threads.append(threading.Thread(target=GLOBAL_KPRUN.update, ))
 threads[-1].start()
 COORD.join(threads)
 # Save the model
 GLOBAL_KPRUN.save()
 time.sleep(5)
 # Test run
 env = ObstacleTowerEnv('./ObstacleTower/obstacletower.exe',
                        worker_id=10,
                        retro=False,
                        realtime_mode=True)
 obs = env.reset()
 print("執行測試環境,如果要離開請按Q")
 previous_preprocessed_observation_image = np.reshape(obs[0], [-1])
 while True:
     action = GLOBAL_KPRUN.choose_action(
         previous_preprocessed_observation_image)
     # Multithreading can stall, so guard against NaN actions
     if np.isnan(action):
         action = np.random.randint(6, high=12)
     # Take the action; get the observation, floors cleared, and agent info
     observation, reward, done, info = env.step(
         np.array(GLOBAL_KPRUN.tableAction[int(action)]))
     # Preprocess the data the model needs
     observation_image, keys, time_remaining = observation
     preprocessed_observation_image = np.reshape(observation_image, [-1])
     if 0xFF == ord('q'):
Example #14
class StableA2C():
    def __init__(self,
                 env_path,
                 train,
                 evaluate,
                 policy_name='CnnPolicy',
                 save_dir='./model_files/',
                 eval_seeds=[]):
        self.save_dir = save_dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        self.model_path = os.path.join(self.save_dir, 'model_stable_a2c')
        self.log_dir = './logs/stable_a2c'
        self.policy_name = policy_name
        self.evaluate = evaluate

        if train:
            self.env = ObstacleTowerEnv(env_path,
                                        worker_id=0,
                                        retro=True,
                                        realtime_mode=False,
                                        config=train_env_reset_config)
        else:
            if evaluate:
                self.env = ObstacleTowerEnv(env_path,
                                            worker_id=0,
                                            retro=True,
                                            realtime_mode=False,
                                            config=eval_env_reset_config)
                self.env = ObstacleTowerEvaluation(self.env, eval_seeds)
            else:
                self.env = ObstacleTowerEnv(env_path,
                                            worker_id=0,
                                            retro=True,
                                            realtime_mode=True,
                                            config=eval_env_reset_config)

    def load_model(self):
        print('Loading model from: {}'.format(self.model_path))
        model = A2C.load(self.model_path)
        model.set_env(self.env)
        model.tensorboard_log = self.log_dir
        return model

    def train(self, timesteps=10000, continue_training=False):
        start_time = time.time()
        if not continue_training:
            print("Initializing from scratch")
            model = A2C(self.policy_name,
                        self.env,
                        verbose=1,
                        tensorboard_log=self.log_dir)
        else:
            model = self.load_model()
            print("Restored from {}".format(self.model_path))

        model.learn(total_timesteps=timesteps)
        print('\nTraining complete. Time taken = {} secs'.format(time.time() -
                                                                 start_time))
        model.save(self.model_path)

    def play_single_episode(self):
        """ have the trained agent play a single game """
        action_space = ActionSpace()
        done = False
        reward_sum = 0
        step_counter = 0

        model = self.load_model()
        obs = self.env.reset()
        try:
            print("Playing single episode...")
            while not done:
                action, _states = model.predict(obs)
                obs, reward, done, info = self.env.step(action)
                print("{}. Reward: {}, action: {}".format(
                    step_counter, reward_sum,
                    action_space.get_full_action_meaning(action)))
                self.env.render()
                step_counter += 1
                reward_sum += reward
        except KeyboardInterrupt:
            print("Received Keyboard Interrupt. Shutting down.")
        finally:
            if not self.evaluate:
                self.env.close()
                print("Environment closed.")
            print("Game play completed.")
            return reward_sum

    def evaluate(self):
        """ run episodes until evaluation is complete """
        while not self.env.evaluation_complete:
            episode_reward = self.play_single_episode()

        pprint(self.env.results)
        self.env.close()
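A usage sketch for StableA2C; the executable path is a placeholder and the reset configs are assumed to be importable from this project:

if __name__ == '__main__':
    agent = StableA2C('./ObstacleTower/obstacletower', train=True, evaluate=False)
    agent.train(timesteps=100000)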
Example #15
class WrappedObstacleTowerEnv():
    def __init__(self,
                 environment_filename=None,
                 docker_training=False,
                 worker_id=0,
                 retro=False,
                 timeout_wait=30,
                 realtime_mode=False,
                 num_actions=3,
                 mobilenet=False,
                 gray_scale=False,
                 autoencoder=None,
                 floor=0):
        '''
        Arguments:
          environment_filename: The file path to the Unity executable.  Does not require the extension.
          docker_training: Whether this is running within a docker environment and should use a virtual
            frame buffer (xvfb).
          worker_id: The index of the worker in the case where multiple environments are running.  Each
            environment reserves port (5005 + worker_id) for communication with the Unity executable.
          retro: Resize visual observation to 84x84 (int8) and flattens action space.
          timeout_wait: Time for python interface to wait for environment to connect.
          realtime_mode: Whether to render the environment window image and run environment at realtime.
        '''

        self._obstacle_tower_env = ObstacleTowerEnv(environment_filename,
                                                    docker_training, worker_id,
                                                    retro, timeout_wait,
                                                    realtime_mode)
        if floor != 0:
            self._obstacle_tower_env.floor(floor)
        self._flattener = ActionFlattener([3, 3, 2, 3])
        self._action_space = self._flattener.action_space
        self.mobilenet = mobilenet
        self.gray_scale = gray_scale
        if mobilenet:
            self.image_module = WrappedKerasLayer(retro, self.mobilenet)
        self._done = False
        if autoencoder:
            print("Loading autoencoder from {}".format(autoencoder))
            self.autoencoder = build_autoencoder(autoencoder)
            print("Done.")
        else:
            self.autoencoder = None

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def gray_process_observation(self, observation):
        observation = (observation * 255).astype(np.uint8)
        obs_image = Image.fromarray(observation)
        obs_image = obs_image.resize((84, 84), Image.NEAREST)
        gray_observation = np.mean(np.array(obs_image), axis=-1, keepdims=True)
        gray_observation = (gray_observation / 255)

        # gray_observation = self.autoencoder.predict(gray_observation)
        return gray_observation

    def _preprocess_observation(self, observation):
        """
        Re-sizes the visual observation to 224x224 (the MobileNet input size)
        """
        observation = (observation * 255).astype(np.uint8)
        obs_image = Image.fromarray(observation)
        obs_image = obs_image.resize((224, 224), Image.NEAREST)
        return np.array(obs_image)

    def reset(self):
        observation = self._obstacle_tower_env.reset()
        observation, key, time = observation
        self._done = False
        if self.mobilenet:
            if self.autoencoder:
                observation = self.autoencoder.predict(observation[None, :])[0]
            return self.image_module(self._preprocess_observation(
                observation)), observation, key, time
        elif self.gray_scale:
            gray_observation = self.gray_process_observation(observation)
            if self.autoencoder:
                gray_observation = self.autoencoder.predict(
                    gray_observation[None, :])[0]
            return gray_observation, observation
        else:
            return self._preprocess_observation(observation), observation

    def step(self, action):
        #if self._done:
        #    return self.reset()

        if action == 0:  # forward
            action = [1, 0, 0, 0]
        elif action == 1:  # rotate camera left
            action = [0, 1, 0, 0]
        elif action == 2:  # rotate camera right
            action = [0, 2, 0, 0]
        elif action == 3:  # jump forward
            action = [1, 0, 1, 0]
        # elif action == 5:
        #     action = [2, 0, 0, 0]
        # elif action == 6:
        #     action = [0, 0, 0, 1]
        # elif action == 7:
        #     action = [0, 0, 0, 2]

        observation, reward, done, info = self._obstacle_tower_env.step(action)
        observation, key, time = observation
        self._done = done

        if self.mobilenet:
            if self.autoencoder:
                observation = self.autoencoder.predict(observation[None, :])[0]
            return (self.image_module(
                self._preprocess_observation(observation)), reward, done,
                    info), observation, key, time
        elif self.gray_scale:
            gray_observation = self.gray_process_observation(observation)
            if self.autoencoder:
                gray_observation = self.autoencoder.predict(
                    gray_observation[None, :])[0]
            return (gray_observation, reward, done, info), observation
        else:
            return (self._preprocess_observation(observation), reward, done,
                    info), observation

    def close(self):
        self._obstacle_tower_env.close()

    def floor(self, floor):
        self._obstacle_tower_env.floor(floor)
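
For context, a minimal driving loop for the wrapper above might look like the sketch below. The class name WrappedObstacleTowerEnv, the OBS_TOWER_PATH environment variable, and the keyword arguments are assumptions carried over from the other examples in this listing, not part of the original snippet.

import os
import random

# Hypothetical construction of the wrapper defined above; the class name and
# OBS_TOWER_PATH are assumptions borrowed from the other examples in this listing.
env = WrappedObstacleTowerEnv(os.environ['OBS_TOWER_PATH'],
                              worker_id=random.randrange(11, 20),
                              gray_scale=True)

# With gray_scale=True and no autoencoder, reset() returns
# (84x84x1 grayscale frame, raw frame) and step() returns
# ((frame, reward, done, info), raw frame).
gray_obs, raw_obs = env.reset()
done = False
while not done:
    (gray_obs, reward, done, info), raw_obs = env.step(0)  # 0 -> [1, 0, 0, 0] (forward)
env.close()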
Example #16
import os

from obstacle_tower_env import ObstacleTowerEnv

env = ObstacleTowerEnv(os.environ['OBS_TOWER_PATH'], worker_id=0)

env.seed(72)
env.floor(12)
env.reset()
for action in [
        18, 18, 18, 18, 18, 18, 30, 24, 24, 21, 18, 18, 30, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 24, 18, 30, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 30, 30, 30, 30, 24, 24, 6, 6, 6, 6, 6,
        6, 6, 6, 30, 30, 30, 30, 30, 18, 24, 24, 24, 6, 6, 6, 6, 6, 6, 24, 18,
        24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 6, 6, 6, 6, 24, 24, 24, 18, 30, 18,
        18, 30, 18, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18, 30, 24, 24, 30, 30,
        24, 24, 24, 30, 30, 30, 30, 30, 18, 18, 18, 18, 30, 30, 30, 30, 30, 30,
        30, 30, 30, 30, 30, 30, 30, 30, 24, 24, 24, 24, 24, 24, 24, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 24, 18, 18, 30, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 24, 18, 30, 18, 18, 18, 18, 30, 30, 30, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 30, 18, 18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 30, 24, 24, 24, 24, 24, 24, 24, 24, 18, 30, 18, 18, 18,
        18, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 30,
        30, 30, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 30, 24, 21, 18, 24, 24, 24, 24, 18, 18, 18, 24, 18, 18, 18, 18,
        30, 18, 18, 24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        24, 24, 24, 24, 24, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 30, 30, 30, 18, 18, 30, 30, 30, 30, 30, 30, 12, 12, 30, 30, 30,
        30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 18, 18, 18, 18, 18, 18, 18, 18,
Example #17
def main():
    basicConfig(level=INFO)
    env = ObstacleTowerEnv(str(PRJ_ROOT / 'obstacletower'), retro=False, worker_id=9)
    done = False
    env.floor(1)
    env.reset()

    screen = Screen()
    random_actor = RandomRepeatActor(continue_rate=0.9)
    random_actor.reset(schedules=[
        (Action.CAMERA_RIGHT, 3),
        (Action.CAMERA_LEFT, 6),
        (Action.CAMERA_RIGHT, 3),
        (Action.NOP, 5),
        (Action.FORWARD, 8),
        (Action.RIGHT, 2),
        (Action.LEFT, 4),
        (Action.RIGHT, 2),
    ])

    frame_history = FrameHistory(env)
    moving_checker = MovingChecker(frame_history)
    position_estimator = PositionEstimator(moving_checker)
    map_observation = MapObservation(position_estimator, moving_checker)
    event_handlers: List[EventHandler] = [
        frame_history,
        moving_checker,
        position_estimator,
        map_observation,
    ]

    while not done:
        for h in event_handlers:
            h.begin_loop()

        screen.show("original", frame_history.last_frame)
        cv2.waitKey(0)

        for h in event_handlers:
            h.before_step()

        action = random_actor.decide_action(moving_checker.did_move)
        obs, reward, done, info = env.step(action)
        if reward != 0:
            logger.info(f"Get Reward={reward} Keys={obs[1]}")
        # logger.info(f"Keys={obs[1]} Time Remain={obs[2]}")

        params = EventParamsAfterStep(action, obs, reward, done, info)
        for h in event_handlers:
            h.after_step(params)

        screen.show("map", map_observation.concat_images())

        if len(frame_history.small_frame_pixel_diffs) > 0:
            f1 = frame_history.small_frame_pixel_diffs[-1]
            if len(frame_history.small_frame_pixel_diffs) > 1:
                f2 = frame_history.small_frame_pixel_diffs[-2]
                f1 = np.concatenate((f2, f1), axis=1)
            screen.show("diff", f1)

        for h in event_handlers:
            h.end_loop()
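
The loop above pushes every handler through four hooks per frame: begin_loop(), before_step(), after_step(params), and end_loop(). As a rough illustration of that protocol, a minimal custom handler could be sketched as follows; the EventHandler base class and the exact hook signatures are assumed from the calls in the loop, not taken from its actual definition.

class StepCounter(EventHandler):
    """Minimal handler that only counts environment steps.

    The hook names and their order mirror the calls made in the loop above;
    everything else here is an assumption.
    """

    def __init__(self):
        self.steps = 0

    def begin_loop(self):
        pass  # called at the top of every iteration

    def before_step(self):
        pass  # called just before env.step()

    def after_step(self, params):
        # params is the EventParamsAfterStep(action, obs, reward, done, info)
        # object built in the loop above
        self.steps += 1

    def end_loop(self):
        pass  # called at the bottom of every iteration

Appending an instance of such a handler to event_handlers would have it driven alongside the others with no further changes to the loop.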
Example #18
class WrappedObstacleTowerEnv():

    def __init__(
        self,
        environment_filename=None,
        docker_training=False,
        worker_id=0,
        retro=False,
        timeout_wait=3000,
        realtime_mode=False,
        num_actions=3,
        stack_size=4,
        mobilenet=False,
        gray_scale=False,
        floor=0,
        visual_theme=0
        ):
        '''
        Arguments:
          environment_filename: The file path to the Unity executable.  Does not require the extension.
          docker_training: Whether this is running within a docker environment and should use a virtual
            frame buffer (xvfb).
          worker_id: The index of the worker in the case where multiple environments are running.  Each
            environment reserves port (5005 + worker_id) for communication with the Unity executable.
          retro: Resize the visual observation to 84x84 (int8) and flatten the action space.
          timeout_wait: Time for the python interface to wait for the environment to connect.
          realtime_mode: Whether to render the environment window and run the environment in realtime.
          stack_size: Number of consecutive preprocessed observations stacked into one state.
          mobilenet: Optional MobileNet feature extractor; when set, observations are encoded as 1280-d vectors.
          gray_scale: Whether to convert observations to 84x84 grayscale images.
          floor: The floor to start each episode on (0 starts from the bottom of the tower).
        '''

        self._obstacle_tower_env = ObstacleTowerEnv(environment_filename,
                                                    docker_training,
                                                    worker_id,
                                                    retro,
                                                    timeout_wait,
                                                    realtime_mode)
        if floor != 0:
            self._obstacle_tower_env.floor(floor)
        self.start_floor = floor
        self.current_floor = floor

        self.mobilenet = mobilenet
        self.gray_scale = gray_scale
        self.retro = retro
        if mobilenet:
            self.state_size = [1280]
        elif gray_scale:
            self.state_size = [84, 84, 1]
        elif retro:
            self.state_size = [84, 84, 3]
        else:
            self.state_size = [168, 168, 3]

        self.stack_size = stack_size
        self.stack = [np.random.random(self.state_size).astype(np.float32) for _ in range(self.stack_size)]
        self.total_reward = 0
        self.current_reward = 0
        self.max_floor = 25
        self.visual_theme = visual_theme

        self.id = worker_id

    def gray_preprocess_observation(self, observation):
        '''
        Re-sizes obs to 84x84 and compresses to grayscale
        '''
        observation = (observation * 255).astype(np.uint8)
        obs_image = Image.fromarray(observation)
        obs_image = obs_image.resize((84, 84), Image.NEAREST)
        gray_observation = np.mean(np.array(obs_image), axis=-1, keepdims=True)
        return gray_observation / 255

    def mobile_preprocess_observation(self, observation):
        """
        Re-sizes obs to 224x224 for mobilenet
        """
        observation = (observation * 255).astype(np.uint8)
        obs_image = Image.fromarray(observation)
        obs_image = obs_image.resize((224, 224), Image.NEAREST)
        return self.mobilenet(np.array(obs_image))

    def reset(self):
        # Reset env, stack and floor
        # (We save state as an attribute so child objects can access it)
        config = {"total-floors": 15}
        self.state = self._obstacle_tower_env.reset(config)
        self.state, reward, done, info = self._obstacle_tower_env.step(18)
        self.current_floor = self.start_floor
        self.stack = [np.random.random(self.state_size).astype(np.float32) for _ in range(self.stack_size)]
        self.total_reward = 0
        self.current_reward = 0

        # Preprocess current obs and add to stack
        if self.retro:
            observation = (self.state / 255).astype(np.float32)
        else:
            observation, key, time = self.state

        if self.mobilenet:
            observation = self.mobile_preprocess_observation(observation)
        elif self.gray_scale:
            observation = self.gray_preprocess_observation(observation)

        self.stack = self.stack[1:] + [observation]

        # Build our state (MUST BE A TUPLE)
        #one_hot_floor = tf.one_hot(self.current_floor, self.max_floor).numpy()
        one_hot_floor = np.zeros(self.max_floor)
        one_hot_floor[self.current_floor] += 1
        floor_data = np.append(one_hot_floor, self.current_reward).astype(np.float32)
        stacked_state = np.concatenate(self.stack, axis=-1).astype(np.float32)
        if self.retro:
            ret_state = (stacked_state, floor_data)
        else:
            # Clip time to 2000, then normalize
            time = (2000. if time > 2000 else time) / 2000.
            key_time_data = np.array([key, time]).astype(np.float32)
            #key_time_data = np.array([key]).astype(np.float32)
            ret_state = (stacked_state, floor_data, key_time_data)

        return ret_state, info

    def step(self, action):
        # Convert int action to vector required by the env
        if self.retro:
            if action == 0: # forward
                action = 18
            elif action == 1: # rotate camera left
                action = 24
            elif action == 2: # rotate camera right
                action = 30
            elif action == 3: # jump forward
                action = 21
            elif action == 4:
                action = 6
            elif action == 5:
                action = 12
        else:
            if action == 0: # forward
                action = [1, 0, 0, 0]
            elif action == 1: # rotate camera left
                action = [1, 1, 0, 0]
            elif action == 2: # rotate camera right
                action = [1, 2, 0, 0]
            elif action == 3: # jump forward
                action = [1, 0, 1, 0]

        # Take the step and record data
        # (We save state as an attribute so child objects can access it)
        self.state, reward, done, info = self._obstacle_tower_env.step(action)

        # Keep track of current floor reward and total reward
        if reward >= 0.95:
            self.current_floor += 1
            self.current_reward = 0
            done = True
        else:
            self.current_reward += reward
        self.total_reward += reward
        
        if (done and reward < 0.95) or self.current_floor == 15:
            # Save info and reset when an episode ends
            info["episode_info"] = {"floor": self.current_floor, "total_reward": self.total_reward}
            ret_state, _ = self.reset()
        else:
            # Preprocess current obs and add to stack
            if self.retro:
                observation = (self.state / 255).astype(np.float32)
            else:
                observation, key, time = self.state

            if self.mobilenet:
                observation = self.mobile_preprocess_observation(observation)
            elif self.gray_scale:
                observation = self.gray_preprocess_observation(observation)

            self.stack = self.stack[1:] + [observation]

            # Build our state (MUST BE A TUPLE)
            #one_hot_floor = tf.one_hot(self.current_floor, self.max_floor).numpy()
            one_hot_floor = np.zeros(self.max_floor)
            one_hot_floor[self.current_floor] += 1
            floor_data = np.append(one_hot_floor, self.current_reward).astype(np.float32)
            stacked_state = np.concatenate(self.stack, axis=-1).astype(np.float32)
            if self.retro:
                ret_state = (stacked_state, floor_data)
            else:
                # Clip time to 2000, then normalize
                time = (2000. if time > 2000 else time) / 2000.
                key_time_data = np.array([key, time]).astype(np.float32)
                #key_time_data = np.array([key]).astype(np.float32)
                ret_state = (stacked_state, floor_data, key_time_data)

        return ret_state, reward, done, info

    def close(self):
        self._obstacle_tower_env.close()

    def floor(self, floor):
        self._obstacle_tower_env.floor(floor)
        self.start_floor = floor
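
Both wrappers target the same MultiDiscrete([3, 3, 2, 3]) action space, either as 4-element vectors or as flattened "retro" ids (18, 24, 30, 21, 6, 12). A quick way to see how the two encodings line up is the sketch below; it assumes the flattener enumerates branch combinations in itertools.product order, which is consistent with the comments in step() above.

import itertools

# Rebuild a flattened-action lookup for a MultiDiscrete([3, 3, 2, 3]) space,
# assuming combinations are enumerated in itertools.product order.
branches = [3, 3, 2, 3]
id_to_vector = {i: list(combo)
                for i, combo in enumerate(itertools.product(*(range(b) for b in branches)))}

# Spot-check the ids used in the retro branch of step() above:
assert id_to_vector[18] == [1, 0, 0, 0]  # forward
assert id_to_vector[24] == [1, 1, 0, 0]  # forward + rotate camera left
assert id_to_vector[30] == [1, 2, 0, 0]  # forward + rotate camera right
assert id_to_vector[21] == [1, 0, 1, 0]  # jump forward
assert id_to_vector[6] == [0, 1, 0, 0]   # rotate camera left only
assert id_to_vector[12] == [0, 2, 0, 0]  # rotate camera right only

The same ids appear in the recorded action sequence of Example #16 above.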