Example #1
def make_agents(env):
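    # Load two saved PPO checkpoints: the newest model and an older baseline,
    # so the two can be played against each other.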
    new_load_path = "zoo/ppo_kl/2020-12-27T16:28:42/final_model"
    new_model = PPO.load(new_load_path, env)

    old_load_path = "zoo/ppo_logging/2020-12-27T15:51:49/final_model"
    # old_load_path = "zoo/ppo_headsup/latest/best_model"
    old_model = PPO.load(old_load_path, env)

    # random1 = RandomAgent(env)
    # random2 = RandomAgent(env)

    return [new_model, old_model]
Example #2
def train(load_path):
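    # Resume from a checkpoint when load_path is given, otherwise start fresh,
    # then train against three random opponents in a 4-player game.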
    env = LoveLetterMultiAgentEnv(num_players=4)
    env.seed(SEED)

    # Use the Mujoco hyperparameters (with timesteps_per_actorbatch doubled to cover more steps):
    # model = PPO(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
    #             optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    if load_path:
        model = PPO.load(load_path, env)
    else:
        model = PPO(MlpPolicy, env)

    random_agents = [RandomAgent(env, SEED + i) for i in range(3)]
    agents = [model, *random_agents]
    env.set_agents(agents)

    eval_callback = EvalCallback(env,
                                 best_model_save_path=LOGDIR,
                                 log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ,
                                 n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    model.save(os.path.join(
        LOGDIR, "final_model"))  # we probably never reach this point.

    env.close()
Example #3
def make_agents(env):
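    # Evaluate a single trained PPO model against three random opponents.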
    load_path = "zoo/ppo_masking/final_model"
    model = PPO.load(load_path, env)
    random1 = RandomAgent(env)
    random2 = RandomAgent(env)
    random3 = RandomAgent(env)

    return [model, random1, random2, random3]
Example #4
def make_agents(env):
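    # Play the selected checkpoint heads-up against one random opponent.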
    # load_path = "zoo/ppo_masking/final_model"
    # load_path = "zoo/ppo_logging/2020-12-27T15:51:49/final_model"
    load_path = "zoo/ppo_kl/2020-12-27T16:28:42/final_model"
    model = PPO.load(load_path, env)
    random1 = RandomAgent(env)
    # random2 = RandomAgent(env)
    # random3 = RandomAgent(env)

    return [model, random1]  #, random2, random3]
Example #5
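# Rebuild the environment from pickled kwargs, then reload the saved PPO agent with it.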
def _load(self, env_cls, env_kwargs, agent_kwargs):
    with open(self.kwargs_path, 'rb') as kwargs_file:
        kwargs = pickle.load(kwargs_file)
    kwargs['env'].update(env_kwargs)
    kwargs['agent'].update(agent_kwargs)
    env = self._build_env(env_cls, kwargs['env'],
                          kwargs['agent']['n_steps'])
    agent = PPO.load(path=self.agent_path,
                     env=env,
                     tensorboard_log=self.tensorboard_path,
                     **kwargs['agent'])
    return agent, env
Example #6
def make_agents(env):
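    # Load checkpoints from three separate training runs so they can play each other.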
    # new_load_path = "zoo/ppo_recreate_best/latest/best_model"
    # new_load_path = "zoo/ppo_reward_bugfix2/latest/best_model"
    new_load_path = "zoo/ppo_reward_bugfix4/latest/best_model"
    # new_load_path = "zoo/ppo_masking_fast_elimination3/best_model"
    new_model = PPO.load(new_load_path, env)

    # new_load_path = "zoo/ppo_headsup/latest/best_model"
    # new_model2 = PPO.load(new_load_path, env)

    # old_load_path = "ppo2/final_model"
    # old_load_path = "zoo/ppo_masking_fast_elimination3/best_model"
    old_load_path = "zoo/ppo_reward_bugfix2/latest/best_model"
    old_load_path2 = "zoo/ppo_recreate_best/latest/best_model"
    # old_load_path = "zoo/ppo_headsup/latest/best_model"
    old_model = PPO.load(old_load_path, env)
    old_model2 = PPO.load(old_load_path2, env)

    # random1 = RandomAgent(env)
    # random2 = RandomAgent(env)

    return [old_model, old_model2, new_model]
Example #7
def train(output_folder, load_path):
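    # Train PPO in a freshly timestamped output folder, optionally resuming
    # from load_path, against three random opponents.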
    base_output = Path(output_folder)
    full_output = base_output / datetime.datetime.now().isoformat(
        timespec="seconds")
    # latest = base_output / "latest"
    # latest.symlink_to(full_output)

    logger.configure(folder=str(full_output))

    env = LoveLetterMultiAgentEnv(num_players=4,
                                  reward_fn=Rewards.fast_elimination_reward)
    env.seed(SEED)

    # Use the Mujoco hyperparameters (with timesteps_per_actorbatch doubled to cover more steps):
    # model = PPO(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
    #             optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    if load_path:
        model = PPO.load(load_path, env)
    else:
        # def test_fn(env):
        #     return env.valid_action_mask()
        #
        model = PPO(MlpPolicy, env, verbose=1,
                    ent_coef=0.05)  #, action_mask_fn=test_fn)

    other_agents = [RandomAgent(env, SEED + i) for i in range(3)]
    # other_agents = [
    #     PPO.load("zoo/ppo_logging/2020-12-27T15:51:49/final_model", env),
    #     PPO.load("zoo/ppo_reward_bugfix2/latest/best_model", env),
    #     PPO.load("zoo/ppo_reward_bugfix2/latest/best_model", env),
    # ]
    agents = [model, *other_agents]
    env.set_agents(agents)

    eval_callback = EvalCallback(
        env,
        best_model_save_path=str(full_output),
        log_path=str(full_output),
        eval_freq=EVAL_FREQ,
        n_eval_episodes=EVAL_EPISODES,
    )

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    model.save(str(full_output / "final_model"))

    env.close()
Example #8
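# Watch a trained PPO model play one episode of Super Mario Kart via Gym Retro,
# printing the per-step reward.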
scenario = os.path.join(code_location, "scenarios", game,
                        "custom_rewards.json")
# state = os.path.join(retro.data.DATA_PATH, "data", "contrib", game, "MarioCircuit1.GP.100cc.1P.DK.Start.state")
state = os.path.join(retro.data.DATA_PATH, "data", "contrib", game,
                     "MarioCircuit1.GP.50cc.1P.Luigi.Start.state")
# state = os.path.join(retro.data.DATA_PATH, "data", "contrib", game, "DonutPlains1.GP.50cc.1P.Koopa.Start.state")

model_name = os.path.join(
    code_location, "models",
    "ppo_SuperMarioKart-Snes_e304080a-dd37-4efa-9140-aecc0079e710_final")

env = get_env(game, state, scenario)
# Record a movie of the output
# moviepath = "testmodel.mp4"
# env = MovieRecordWrapper(env, savedir=moviepath)

env = DummyVecEnv([lambda: env])
model = PPO.load(model_name)
model.set_env(env)

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    print("Step reward: {}".format(rewards))
    # cumulative_reward = np.sum(rewards) + cumulative_reward
    env.render()
    if np.any(dones):
        # print("Cumulative reward: {}".format(cumulative_reward))
        time.sleep(1)
        break
Example #9
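# Download a mid-training checkpoint from Google Drive, load it into PPO,
# and record four human-vs-model matches.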
project_name = "miki.pacman/MK2"
google_drive_checkpoints_path = "MK2/saves"
exp_id = "MK-19"
params = get_exp_params(exp_id, project_name)

params.update({"state_versions": [16, 17, 18, 19]})

if __name__ == '__main__':
    with tempfile.TemporaryDirectory(dir="/tmp") as temp:
        checkpointer = GoogleDriveCheckpointer(
            project_experiments_path=google_drive_checkpoints_path,
            exp_id=exp_id)
        checkpoints_list = checkpointer.get_list_of_checkpoints()
        checkpoint = checkpoints_list[len(checkpoints_list) // 2]
        checkpointer.download_checkpoints([checkpoint], temp)

        env1, env2, env3 = params["env_function"](params, train=False)
        model = PPO.load(os.path.join(temp, checkpoint))
        p1 = {"policy": model, "frameskip": params["frameskip"], "env": env2}
        p2 = {"policy": "human", "frameskip": 60, "env": env3}

        for i in range(4):
            PygameInteractiveEnvRecorder(
                fps=60,
                env=env1,
                p1=p1,
                p2=p2,
                render_n_frames_after_done=250,
                record_output_path=f"/tmp/{exp_id}_video_{i}.mp4").run()
Example #10
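# Disable gradient clipping and fix the RNG seed before creating the agent.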
agent_cfg['max_grad_norm'] = float('inf')
agent_cfg['seed'] = SEED

# ====================== Run the optimization ======================

# Create a multiprocess environment
env_creator = lambda: gym.make(GYM_ENV_NAME, **GYM_ENV_KWARGS)
train_env = SubprocVecEnv([env_creator for _ in range(int(N_THREADS // 2))],
                          start_method='fork')
test_env = DummyVecEnv([env_creator])

# Create the learning agent according to the chosen algorithm
train_agent = PPO(MlpPolicy,
                  train_env,
                  **agent_cfg,
                  tensorboard_log=log_path,
                  verbose=True)
train_agent.eval_env = test_env

# Run the learning process
checkpoint_path = train(train_agent, max_timesteps=100000)

# ===================== Enjoy the trained agent ======================

# Create testing agent
test_agent = PPO.load(checkpoint_path)
test_agent.eval_env = test_env

# Run the testing process
test(test_agent, max_episodes=1)