Example #1
import os
import pickle as pkl

import torch

# NeuralAF is the neural acquisition-function policy from the MetaBO repository;
# the import path below follows its layout and is an assumption of this snippet.
from metabo.policies.policies import NeuralAF


def load_metabo_policy(logpath, load_iter, env, device, deterministic):
    """Load a trained MetaBO policy together with its training params and stats."""
    with open(os.path.join(logpath, "params_" + str(load_iter)), "rb") as f:
        train_params = pkl.load(f)

    pi = NeuralAF(observation_space=env.observation_space,
                  action_space=env.action_space,
                  deterministic=deterministic,
                  options=train_params["policy_options"]).to(device)
    with open(os.path.join(logpath, "weights_" + str(load_iter)), "rb") as f:
        pi.load_state_dict(torch.load(f))
    with open(os.path.join(logpath, "stats_" + str(load_iter)), "rb") as f:
        stats = pkl.load(f)

    return pi, train_params, stats
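
A minimal usage sketch for the loader above. The log directory, iteration number, and environment id are hypothetical placeholders; only the call signature is taken from the snippet.

import gym

# hypothetical paths and ids, shown only to illustrate the call
logpath = "log/MetaBO-branin-v0/2020-01-01-00-00-00"
env = gym.make("MetaBO-branin-v0")
pi, train_params, stats = load_metabo_policy(logpath=logpath,
                                             load_iter=1000,
                                             env=env,
                                             device="cpu",
                                             deterministic=True)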
Example #2
# register environment
register(id=env_spec["env_id"],
         entry_point="metabo.environment.metabo_gym:MetaBO",
         max_episode_steps=env_spec["T"],
         reward_threshold=None,
         kwargs=env_spec)

# log data and weights go here; use this folder for evaluation afterwards
logpath = os.path.join(rootdir, "log", env_spec["env_id"],
                       datetime.strftime(datetime.now(), "%Y-%m-%d-%H-%M-%S"))

# set up policy
policy_fn = lambda observation_space, action_space, deterministic: NeuralAF(
    observation_space=observation_space,
    action_space=action_space,
    deterministic=deterministic,
    options=ppo_spec["policy_options"])

# do training
print(
    "Training on {}.\nFind logs, weights, and learning curve at {}\n\n".format(
        env_spec["env_id"], logpath))
ppo = PPO(policy_fn=policy_fn,
          params=ppo_spec,
          logpath=logpath,
          save_interval=1)
# the learning curve is plotted online in a separate process
p = mp.Process(target=plot_learning_curve_online,
               kwargs={
                   "logpath": logpath,
                   # remaining kwargs are truncated in the original snippet
               })
p.start()
Example #3
def eval_experiment(eval_spec):
    env_id = eval_spec["env_id"]
    env_seed_offset = eval_spec["env_seed_offset"]
    policy = eval_spec["policy"]
    logpath = eval_spec["logpath"]
    policy_specs = eval_spec["policy_specs"]
    savepath = eval_spec["savepath"]
    n_workers = eval_spec["n_workers"]
    n_episodes = eval_spec["n_episodes"]
    assert n_episodes % n_workers == 0, "n_episodes must be divisible by n_workers"
    T = eval_spec["T"]
    if policy != "MetaBO":
        pi = None
        deterministic = None
        load_iter = None

    os.makedirs(savepath, exist_ok=True)

    env_seeds = env_seed_offset + np.arange(n_workers)
    dummy_env = gym.make(env_id)
    timestamp = datetime.strftime(datetime.now(), "%Y-%m-%d-%H-%M-%S")
    taf_datafile = policy_specs.get("TAF_datafile")  # None if not provided
    write_overview_logfile(savepath=savepath,
                           timestamp=timestamp,
                           env=dummy_env,
                           policy=policy,
                           env_seeds=env_seeds,
                           taf_datafile=taf_datafile,
                           policy_specs=policy_specs)
    env_specs = dummy_env.spec._kwargs  # kwargs passed to register() (old-gym private API)

    # prepare the policies
    if policy == "GP-UCB":
        feature_order = dummy_env.unwrapped.feature_order_eval_envs
        D = dummy_env.unwrapped.D
        policy_fn = lambda *_: UCB(feature_order=feature_order,
                                   kappa=policy_specs["kappa"],
                                   D=D,
                                   delta=policy_specs["delta"])
    elif policy == "EI":
        feature_order = dummy_env.unwrapped.feature_order_eval_envs
        policy_fn = lambda *_: EI(feature_order=feature_order)
    elif policy == "TAF-ME":
        policy_fn = lambda *_: TAF(datafile=policy_specs["TAF_datafile"],
                                   mode="me")
    elif policy == "TAF-RANKING":
        policy_fn = lambda *_: TAF(
            datafile=policy_specs["TAF_datafile"], mode="ranking", rho=1.0)
    elif policy == "PI":
        feature_order = dummy_env.unwrapped.feature_order_eval_envs
        policy_fn = lambda *_: PI(feature_order=feature_order,
                                  xi=policy_specs["xi"])
    elif policy == "EPS-GREEDY":
        feature_order = dummy_env.unwrapped.feature_order_eps_greedy
        policy_fn = lambda *_: EpsGreedy(datafile=policy_specs["datafile"],
                                         feature_order=feature_order,
                                         eps=policy_specs["eps"])
    elif policy == "GMM-UCB":
        feature_order = dummy_env.unwrapped.feature_order_gmm_ucb
        policy_fn = lambda *_: GMM_UCB(datafile=policy_specs["datafile"],
                                       feature_order=feature_order,
                                       ucb_kappa=policy_specs["ucb_kappa"],
                                       w=policy_specs["w"],
                                       n_components=policy_specs["n_components"])
    elif policy == "MetaBO":
        load_iter = eval_spec["load_iter"]
        deterministic = eval_spec["deterministic"]
        pi, policy_specs, _ = load_metabo_policy(logpath=logpath,
                                                 load_iter=load_iter,
                                                 env=dummy_env,
                                                 device="cpu",
                                                 deterministic=deterministic)

        policy_fn = lambda osp, asp, det: NeuralAF(observation_space=osp,
                                                   action_space=asp,
                                                   deterministic=det,
                                                   options=policy_specs["policy_options"])
    elif policy == "Random":
        pass  # will be dealt with separately below
    else:
        raise ValueError("Unknown policy!")
    dummy_env.close()

    # evaluate the experiment
    if policy != "Random":
        br = BatchRecorder(size=T * n_episodes,
                           env_id=env_id,
                           env_seeds=env_seeds,
                           policy_fn=policy_fn,
                           n_workers=n_workers,
                           deterministic=deterministic)
        if policy == "MetaBO":
            br.set_worker_weights(pi=pi)
        br.record_batch(gamma=1.0,
                        lam=1.0)  # gamma, lam do not matter for evaluation
        transitions = Transition(*zip(*br.memory.copy()))
        rewards = transitions.reward
        br.cleanup()
    else:
        env = gym.make(env_id)
        env.seed(env_seed_offset)
        rewards = []
        for _ in range(n_episodes):
            rewards += env.unwrapped.get_random_sampling_reward()
        env.close()

    # save result
    result = Result(logpath=logpath,
                    env_id=env_id,
                    env_specs=env_specs,
                    policy=policy,
                    policy_specs=policy_specs,
                    deterministic=deterministic,
                    load_iter=load_iter,
                    T=T,
                    n_episodes=n_episodes,
                    rewards=rewards)
    fn = "result_metabo_iter_{:04d}".format(
        load_iter) if policy == "MetaBO" else "result_{}".format(policy)
    with open(os.path.join(savepath, fn), "wb") as f:
        pkl.dump(result, f)
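
A sketch of an eval_spec dict that would exercise the MetaBO branch of eval_experiment. The keys match what the function reads above; the ids, paths, and numbers are placeholders.

eval_spec = {
    "env_id": "MetaBO-branin-v0",  # placeholder environment id
    "env_seed_offset": 100,
    "policy": "MetaBO",
    "logpath": "log/MetaBO-branin-v0/2020-01-01-00-00-00",  # training log dir
    "policy_specs": {},            # replaced by the loaded train_params for MetaBO
    "savepath": "eval/MetaBO-branin-v0",
    "n_workers": 5,
    "n_episodes": 10,              # must be divisible by n_workers
    "T": 30,
    "load_iter": 1000,             # only read for policy == "MetaBO"
    "deterministic": True,
}
eval_experiment(eval_spec)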