def make_agents(env):
    new_load_path = "zoo/ppo_kl/2020-12-27T16:28:42/final_model"
    new_model = PPO.load(new_load_path, env)
    old_load_path = "zoo/ppo_logging/2020-12-27T15:51:49/final_model"
    # old_load_path = "zoo/ppo_headsup/latest/best_model"
    old_model = PPO.load(old_load_path, env)
    # random1 = RandomAgent(env)
    # random2 = RandomAgent(env)
    return [new_model, old_model]
def train(load_path):
    env = LoveLetterMultiAgentEnv(num_players=4)
    env.seed(SEED)
    # Use the MuJoCo hyperparams (but double timesteps_per_actorbatch to cover more steps):
    # model = PPO(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0,
    #             optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99,
    #             lam=0.95, schedule='linear', verbose=2)
    if load_path:
        model = PPO.load(load_path, env)
    else:
        model = PPO(MlpPolicy, env)
    random_agents = [RandomAgent(env, SEED + i) for i in range(3)]
    agents = [model, *random_agents]
    env.set_agents(agents)
    eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES)
    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)
    model.save(os.path.join(LOGDIR, "final_model"))  # probably never get to this point
    env.close()
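# The Love Letter snippets above and below assume a RandomAgent with the same
# predict() interface as a loaded PPO model, constructed as RandomAgent(env)
# or RandomAgent(env, seed). A minimal sketch of that assumed interface
# (hypothetical, not from the source; env.valid_action_mask() is borrowed
# from the masking comments in the later train() variant):
import numpy as np

class RandomAgent:
    """Picks uniformly at random among the currently valid actions."""

    def __init__(self, env, seed=None):
        self.env = env
        self.rng = np.random.default_rng(seed)

    def predict(self, obs, deterministic=False):
        # Restrict sampling to legal moves so random opponents don't stall.
        mask = self.env.valid_action_mask()
        action = self.rng.choice(np.flatnonzero(mask))
        return action, None  # mirror PPO.predict's (action, state) return value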
def make_agents(env):
    load_path = "zoo/ppo_masking/final_model"
    model = PPO.load(load_path, env)
    random1 = RandomAgent(env)
    random2 = RandomAgent(env)
    random3 = RandomAgent(env)
    return [model, random1, random2, random3]
def make_agents(env):
    # load_path = "zoo/ppo_masking/final_model"
    # load_path = "zoo/ppo_logging/2020-12-27T15:51:49/final_model"
    load_path = "zoo/ppo_kl/2020-12-27T16:28:42/final_model"
    model = PPO.load(load_path, env)
    random1 = RandomAgent(env)
    # random2 = RandomAgent(env)
    # random3 = RandomAgent(env)
    return [model, random1]  # , random2, random3]
def _load(self, env_cls, env_kwargs, agent_kwargs):
    with open(self.kwargs_path, 'rb') as kwargs_file:
        kwargs = pickle.load(kwargs_file)
    kwargs['env'].update(env_kwargs)
    kwargs['agent'].update(agent_kwargs)
    env = self._build_env(env_cls, kwargs['env'], kwargs['agent']['n_steps'])
    agent = PPO.load(path=self.agent_path, env=env,
                     tensorboard_log=self.tensorboard_path, **kwargs['agent'])
    return agent, env
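# _load expects self.kwargs_path to hold a pickled dict with 'env' and
# 'agent' sub-dicts. For context, a hypothetical counterpart saver (an
# assumption, not from the source) would look roughly like this:
import pickle

def _save(self, env_kwargs, agent_kwargs):
    # Persist the PPO weights alongside the kwargs needed to rebuild
    # both the environment and the agent on the next _load().
    self.agent.save(self.agent_path)
    with open(self.kwargs_path, 'wb') as kwargs_file:
        pickle.dump({'env': env_kwargs, 'agent': agent_kwargs}, kwargs_file)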
def make_agents(env):
    # new_load_path = "zoo/ppo_recreate_best/latest/best_model"
    # new_load_path = "zoo/ppo_reward_bugfix2/latest/best_model"
    new_load_path = "zoo/ppo_reward_bugfix4/latest/best_model"
    # new_load_path = "zoo/ppo_masking_fast_elimination3/best_model"
    new_model = PPO.load(new_load_path, env)
    # new_load_path = "zoo/ppo_headsup/latest/best_model"
    # new_model2 = PPO.load(new_load_path, env)
    # old_load_path = "ppo2/final_model"
    # old_load_path = "zoo/ppo_masking_fast_elimination3/best_model"
    old_load_path = "zoo/ppo_reward_bugfix2/latest/best_model"
    old_load_path2 = "zoo/ppo_recreate_best/latest/best_model"
    # old_load_path = "zoo/ppo_headsup/latest/best_model"
    old_model = PPO.load(old_load_path, env)
    old_model2 = PPO.load(old_load_path2, env)
    # random1 = RandomAgent(env)
    # random2 = RandomAgent(env)
    return [old_model, old_model2, new_model]
def train(output_folder, load_path):
    base_output = Path(output_folder)
    full_output = base_output / datetime.datetime.now().isoformat(timespec="seconds")
    # latest = base_output / "latest"
    # latest.symlink_to(full_output)
    logger.configure(folder=str(full_output))
    env = LoveLetterMultiAgentEnv(num_players=4, reward_fn=Rewards.fast_elimination_reward)
    env.seed(SEED)
    # Use the MuJoCo hyperparams (but double timesteps_per_actorbatch to cover more steps):
    # model = PPO(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0,
    #             optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99,
    #             lam=0.95, schedule='linear', verbose=2)
    if load_path:
        model = PPO.load(load_path, env)
    else:
        # def test_fn(env):
        #     return env.valid_action_mask()
        model = PPO(MlpPolicy, env, verbose=1, ent_coef=0.05)  # , action_mask_fn=test_fn)
    other_agents = [RandomAgent(env, SEED + i) for i in range(3)]
    # other_agents = [
    #     PPO.load("zoo/ppo_logging/2020-12-27T15:51:49/final_model", env),
    #     PPO.load("zoo/ppo_reward_bugfix2/latest/best_model", env),
    #     PPO.load("zoo/ppo_reward_bugfix2/latest/best_model", env),
    # ]
    agents = [model, *other_agents]
    env.set_agents(agents)
    eval_callback = EvalCallback(
        env,
        best_model_save_path=str(full_output),
        log_path=str(full_output),
        eval_freq=EVAL_FREQ,
        n_eval_episodes=EVAL_EPISODES,
    )
    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)
    model.save(str(full_output / "final_model"))
    env.close()
scenario = os.path.join(code_location, "scenarios", game, "custom_rewards.json")
# state = os.path.join(retro.data.DATA_PATH, "data", "contrib", game, "MarioCircuit1.GP.100cc.1P.DK.Start.state")
state = os.path.join(retro.data.DATA_PATH, "data", "contrib", game, "MarioCircuit1.GP.50cc.1P.Luigi.Start.state")
# state = os.path.join(retro.data.DATA_PATH, "data", "contrib", game, "DonutPlains1.GP.50cc.1P.Koopa.Start.state")
model_name = os.path.join(
    code_location, "models",
    "ppo_SuperMarioKart-Snes_e304080a-dd37-4efa-9140-aecc0079e710_final")
env = get_env(game, state, scenario)
# Record a movie of the output
# moviepath = "testmodel.mp4"
# env = MovieRecordWrapper(env, savedir=moviepath)
env = DummyVecEnv([lambda: env])
model = PPO.load(model_name)
model.set_env(env)

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    print("Step reward: {}".format(rewards))
    # cumulative_reward = np.sum(rewards) + cumulative_reward
    env.render()
    if np.any(dones):
        # print("Cumulative reward: {}".format(cumulative_reward))
        time.sleep(1)
        break
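# get_env is not defined in this snippet; a minimal sketch under the
# assumption that it simply wraps gym-retro, whose retro.make accepts a game
# name plus state and scenario arguments:
import retro

def get_env(game, state, scenario):
    # Build a Gym Retro environment from an integrated game, a saved
    # start state, and a custom-rewards scenario file.
    return retro.make(game=game, state=state, scenario=scenario)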
project_name = "miki.pacman/MK2"
google_drive_checkpoints_path = "MK2/saves"
exp_id = "MK-19"

params = get_exp_params(exp_id, project_name)
params.update({"state_versions": [16, 17, 18, 19]})

if __name__ == '__main__':
    with tempfile.TemporaryDirectory(dir="/tmp") as temp:
        checkpointer = GoogleDriveCheckpointer(
            project_experiments_path=google_drive_checkpoints_path, exp_id=exp_id)
        checkpoints_list = checkpointer.get_list_of_checkpoints()
        checkpoint = checkpoints_list[len(checkpoints_list) // 2]
        checkpointer.download_checkpoints([checkpoint], temp)

        env1, env2, env3 = params["env_function"](params, train=False)
        model = PPO.load(os.path.join(temp, checkpoint))

        p1 = {"policy": model, "frameskip": params["frameskip"], "env": env2}
        p2 = {"policy": "human", "frameskip": 60, "env": env3}

        for i in range(4):
            PygameInteractiveEnvRecorder(
                fps=60,
                env=env1,
                p1=p1,
                p2=p2,
                render_n_frames_after_done=250,
                record_output_path=f"/tmp/{exp_id}_video_{i}.mp4").run()
agent_cfg['max_grad_norm'] = float('inf')
agent_cfg['seed'] = SEED

# ====================== Run the optimization ======================

# Create a multiprocess environment
env_creator = lambda: gym.make(GYM_ENV_NAME, **GYM_ENV_KWARGS)
train_env = SubprocVecEnv([env_creator for _ in range(int(N_THREADS // 2))],
                          start_method='fork')
test_env = DummyVecEnv([env_creator])

# Create the learning agent according to the chosen algorithm
train_agent = PPO(MlpPolicy, train_env, **agent_cfg,
                  tensorboard_log=log_path, verbose=True)
train_agent.eval_env = test_env

# Run the learning process
checkpoint_path = train(train_agent, max_timesteps=100000)

# ===================== Enjoy the trained agent ======================

# Create testing agent
test_agent = train_agent.load(checkpoint_path)
test_agent.eval_env = test_env

# Run the testing process
test(test_agent, max_episodes=1)
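# train() and test() are defined elsewhere in that project. A rough sketch of
# what they plausibly do (names, signatures, and the log_path checkpoint
# location are assumptions, not the project's actual implementation):
import os

def train(agent, max_timesteps):
    # Learn for the requested budget, then save a checkpoint and return
    # its path so the caller can reload it for evaluation.
    agent.learn(total_timesteps=max_timesteps)
    checkpoint_path = os.path.join(log_path, "checkpoint.zip")
    agent.save(checkpoint_path)
    return checkpoint_path

def test(agent, max_episodes):
    # Roll out the greedy policy on the held-out eval env (a one-env VecEnv,
    # so step() returns batched arrays).
    for _ in range(max_episodes):
        obs = agent.eval_env.reset()
        dones = [False]
        while not dones[0]:
            action, _ = agent.predict(obs, deterministic=True)
            obs, rewards, dones, infos = agent.eval_env.step(action)
            agent.eval_env.render()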