Example #1
def make_agents(env):
    # load_path = "zoo/ppo_masking/final_model"
    # load_path = "zoo/ppo_logging/2020-12-27T15:51:49/final_model"
    load_path = "zoo/ppo_kl/2020-12-27T16:28:42/final_model"
    model = PPO.load(load_path, env)
    random1 = RandomAgent(env)
    # random2 = RandomAgent(env)
    # random3 = RandomAgent(env)

    return [model, random1]  #, random2, random3]
Example #2
def _load(self, env_cls, env_kwargs, agent_kwargs):
    # Restore the pickled env/agent kwargs, apply any overrides, rebuild the
    # environment, then reload the saved PPO agent against it.
    with open(self.kwargs_path, 'rb') as kwargs_file:
        kwargs = pickle.load(kwargs_file)
    kwargs['env'].update(env_kwargs)
    kwargs['agent'].update(agent_kwargs)
    env = self._build_env(env_cls, kwargs['env'],
                          kwargs['agent']['n_steps'])
    agent = PPO.load(path=self.agent_path,
                     env=env,
                     tensorboard_log=self.tensorboard_path,
                     **kwargs['agent'])
    return agent, env
Example #3
def run_experiment(args):
    # Again could have used the SB3 tools here, buuuut...
    vecEnv = []
    for i in range(args.n_envs):
        # Bit of trickery here to avoid every entry
        # referencing the same "i"
        vecEnv.append((lambda idx: lambda: create_env(args, idx))(i))

    vecEnv = DummyVecEnv(vecEnv)

    constraint = AVAILABLE_CONSTRAINTS[args.constraint]
    agent = None
    if constraint == "ClipPPO":
        # Create a vanilla PPO
        agent = PPO("MlpPolicy",
                    vecEnv,
                    verbose=2,
                    device="cpu",
                    n_steps=args.n_steps,
                    clip_range=args.clip_range,
                    learning_rate=args.learning_rate,
                    gamma=args.gamma,
                    ent_coef=args.ent_coef,
                    gae_lambda=1.0,
                    n_epochs=args.n_epochs)
    else:
        constraint = constraint(args)

        agent = SmallStepPPO("MlpPolicy",
                             vecEnv,
                             verbose=2,
                             device="cpu",
                             n_steps=args.n_steps,
                             step_constraint=constraint,
                             learning_rate=args.learning_rate,
                             step_constraint_max_updates=args.max_updates,
                             gamma=args.gamma,
                             ent_coef=args.ent_coef,
                             gae_lambda=1.0)

    output_log_file = None
    if args.output_log:
        output_log_file = open(args.output_log, "w")
        logger.Logger.CURRENT = logger.Logger(
            folder=None,
            output_formats=[logger.HumanOutputFormat(output_log_file)])

    agent.learn(total_timesteps=args.total_timesteps)

    if args.output is not None:
        agent.save(os.path.join(args.output, AGENT_FILE))

    vecEnv.close()
    if output_log_file:
        output_log_file.close()
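The nested lambda in Example #3 exists only to bind the current loop index by value; without it, every entry in vecEnv would close over the final value of i. A minimal sketch of the same pattern using functools.partial, with a hypothetical make_env factory and environment id:

from functools import partial

import gym
from stable_baselines3.common.vec_env import DummyVecEnv

def make_env(rank):
    # Hypothetical factory: each sub-environment gets its own rank as a seed.
    env = gym.make("CartPole-v1")
    env.seed(rank)
    return env

# partial(make_env, i) captures the value of i at loop time, so the entries
# do not all end up referring to the last index.
vec_env = DummyVecEnv([partial(make_env, i) for i in range(4)])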
Example #4
def make_agents(env):
    # new_load_path = "zoo/ppo_recreate_best/latest/best_model"
    # new_load_path = "zoo/ppo_reward_bugfix2/latest/best_model"
    new_load_path = "zoo/ppo_reward_bugfix4/latest/best_model"
    # new_load_path = "zoo/ppo_masking_fast_elimination3/best_model"
    new_model = PPO.load(new_load_path, env)

    # new_load_path = "zoo/ppo_headsup/latest/best_model"
    # new_model2 = PPO.load(new_load_path, env)

    # old_load_path = "ppo2/final_model"
    # old_load_path = "zoo/ppo_masking_fast_elimination3/best_model"
    old_load_path = "zoo/ppo_reward_bugfix2/latest/best_model"
    old_load_path2 = "zoo/ppo_recreate_best/latest/best_model"
    # old_load_path = "zoo/ppo_headsup/latest/best_model"
    old_model = PPO.load(old_load_path, env)
    old_model2 = PPO.load(old_load_path2, env)

    # random1 = RandomAgent(env)
    # random2 = RandomAgent(env)

    return [old_model, old_model2, new_model]
Example #5
def train(output_folder, load_path):
    base_output = Path(output_folder)
    full_output = base_output / datetime.datetime.now().isoformat(
        timespec="seconds")
    # latest = base_output / "latest"
    # latest.symlink_to(full_output)

    logger.configure(folder=str(full_output))

    env = LoveLetterMultiAgentEnv(num_players=4,
                                  reward_fn=Rewards.fast_elimination_reward)
    env.seed(SEED)

    # take mujoco hyperparams (but doubled timesteps_per_actorbatch to cover more steps.)
    # model = PPO(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
    #             optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    if load_path:
        model = PPO.load(load_path, env)
    else:
        # def test_fn(env):
        #     return env.valid_action_mask()
        #
        model = PPO(MlpPolicy, env, verbose=1,
                    ent_coef=0.05)  #, action_mask_fn=test_fn)

    other_agents = [RandomAgent(env, SEED + i) for i in range(3)]
    # other_agents = [
    #     PPO.load("zoo/ppo_logging/2020-12-27T15:51:49/final_model", env),
    # ]
    #     PPO.load("zoo/ppo_reward_bugfix2/latest/best_model", env),
    #     PPO.load("zoo/ppo_reward_bugfix2/latest/best_model", env),
    # ]
    agents = [model, *other_agents]
    env.set_agents(agents)

    eval_callback = EvalCallback(
        env,
        best_model_save_path=str(full_output),
        log_path=str(full_output),
        eval_freq=EVAL_FREQ,
        n_eval_episodes=EVAL_EPISODES,
    )

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    model.save(str(full_output / "final_model"))

    env.close()
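The commented-out "latest" symlink in Example #5 would raise FileExistsError on the second run, since pathlib's symlink_to does not overwrite an existing link. If it is re-enabled, removing any stale link first keeps "latest" pointing at the newest output directory; a small sketch under that assumption:

from pathlib import Path

def update_latest_symlink(base_output: Path, full_output: Path) -> None:
    # Repoint base_output/"latest" at the newest run directory.
    latest = base_output / "latest"
    if latest.is_symlink() or latest.exists():
        latest.unlink()
    latest.symlink_to(full_output, target_is_directory=True)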
Example #6
def train(load_path):
    env = LoveLetterMultiAgentEnv(num_players=4, reward_fn=Rewards.game_completion_reward)
    env.seed(SEED)

    # take mujoco hyperparams (but doubled timesteps_per_actorbatch to cover more steps.)
    # model = PPO(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
    #             optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    if load_path:
        model = PPO.load(load_path, env)
    else:
        model = PPO(MlpPolicy, env)

    random_agents = [RandomAgent(env, SEED + i) for i in range(3)]
    agents = [model, *random_agents]
    env.set_agents(agents)

    eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR, eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    model.save(os.path.join(LOGDIR, "final_model"))  # probably never get to this point.

    env.close()
Example #7
scenario = os.path.join(code_location, "scenarios", game,
                        "custom_rewards.json")
# state = os.path.join(retro.data.DATA_PATH, "data", "contrib", game, "MarioCircuit1.GP.100cc.1P.DK.Start.state")
state = os.path.join(retro.data.DATA_PATH, "data", "contrib", game,
                     "MarioCircuit1.GP.50cc.1P.Luigi.Start.state")
# state = os.path.join(retro.data.DATA_PATH, "data", "contrib", game, "DonutPlains1.GP.50cc.1P.Koopa.Start.state")

model_name = os.path.join(
    code_location, "models",
    "ppo_SuperMarioKart-Snes_e304080a-dd37-4efa-9140-aecc0079e710_final")

env = get_env(game, state, scenario)
# Record a movie of the output
# moviepath = "testmodel.mp4"
# env = MovieRecordWrapper(env, savedir=moviepath)

env = DummyVecEnv([lambda: env])
model = PPO.load(model_name)
model.set_env(env)

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    print("Step reward: {}".format(rewards))
    # cumulative_reward = np.sum(rewards) + cumulative_reward
    env.render()
    if np.any(dones):
        # print("Cumulative reward: {}".format(cumulative_reward))
        time.sleep(1)
        break
Example #8
    print("testing SB3 TD3")
    test_trainer(
        100, 100,
        SB3OffPolicyTrainer(
            continious_env_fn,
            TD3("MlpPolicy", continious_env_fn(), device="cpu")))
    print("testing SB3 SAC")
    test_trainer(
        100, 100,
        SB3OffPolicyTrainer(
            continious_env_fn,
            SAC("MlpPolicy", continious_env_fn(), device="cpu")))
    print("testing SB3 DDPG")
    test_trainer(
        100, 100,
        SB3OffPolicyTrainer(
            continious_env_fn,
            DDPG("MlpPolicy", continious_env_fn(), device="cpu")))
    print("testing SB3 PPO")
    test_trainer(
        100, 100,
        SB3OnPolicyTrainer(
            discrete_env_fn,
            PPO("MlpPolicy", discrete_env_fn(), device="cpu", n_steps=10)))
    print("testing SB3 PPO with continuous env")
    test_trainer(
        100, 100,
        SB3OnPolicyTrainer(
            continious_env_fn,
            PPO("MlpPolicy", continious_env_fn(), device="cpu", n_steps=10)))
Example #9
def _create(self, env_cls, env_kwargs, agent_kwargs):
    # Build a fresh environment and construct a new PPO agent on top of it.
    env = self._build_env(env_cls, env_kwargs, agent_kwargs['n_steps'])
    agent = PPO(env=env,
                tensorboard_log=self.tensorboard_path,
                **agent_kwargs)
    return agent, env
Example #10
project_name = "miki.pacman/MK2"
google_drive_checkpoints_path = "MK2/saves"
exp_id = "MK-19"
params = get_exp_params(exp_id, project_name)

params.update({"state_versions": [16, 17, 18, 19]})

if __name__ == '__main__':
    with tempfile.TemporaryDirectory(dir="/tmp") as temp:
        checkpointer = GoogleDriveCheckpointer(
            project_experiments_path=google_drive_checkpoints_path,
            exp_id=exp_id)
        checkpoints_list = checkpointer.get_list_of_checkpoints()
        checkpoint = checkpoints_list[len(checkpoints_list) // 2]
        checkpointer.download_checkpoints([checkpoint], temp)

        env1, env2, env3 = params["env_function"](params, train=False)
        model = PPO.load(os.path.join(temp, checkpoint))
        p1 = {"policy": model, "frameskip": params["frameskip"], "env": env2}
        p2 = {"policy": "human", "frameskip": 60, "env": env3}

        for i in range(4):
            PygameInteractiveEnvRecorder(
                fps=60,
                env=env1,
                p1=p1,
                p2=p2,
                render_n_frames_after_done=250,
                record_output_path=f"/tmp/{exp_id}_video_{i}.mp4").run()
Example #11
config = dict(
    batch_size=128,
    learning_rate=0.001,
    n_epochs=8,
    gamma=0.99,
    ent_coef=0.01,
    vf_coef=0.5,
    gae_lambda=0.95,
    clip_range=0.2,
    clip_range_vf=float('inf'),
    max_grad_norm=0.5  #float('inf')
)

# Create the learning agent according to the chosen algorithm
agent = PPO(MlpPolicy,
            env,
            **config,
            tensorboard_log=tensorboard_data_path,
            verbose=True)

# Load an agent if desired
# agent = PPO2.load("cartpole_ppo2_baseline.pkl")

# Run the learning process
agent.learn(total_timesteps=400000, log_interval=5, reset_num_timesteps=False)

# Save the agent if desired
# agent.save("cartpole_ppo2_baseline.pkl")

### Enjoy a trained agent

# duration of the simulations in seconds
Example #12
    # env = DummyVecEnv([lambda: get_env(game, state, scenario)])
    # env = VecNormalize(env, norm_obs=True, norm_reward=False)
    env = VecCheckNan(env, raise_exception=True)

    # Create a callback to save every n timesteps
    prefix = "ppo_" + game + "_" + experiment_id
    checkpoint_callback = CheckpointCallback(
        save_freq=100000,
        save_path="C:\\Projects\\OpenAI Games\\retro-ai-hacking\\models",
        name_prefix=prefix)

    savefile_name = prefix + "_final"

    savefile_name = os.path.join(
        "C:\\Projects\\OpenAI Games\\retro-ai-hacking\\models", savefile_name)

    model = PPO(
        CnnPolicy,
        env,
        verbose=1,
        n_steps=128,
        n_epochs=3,
        learning_rate=2.5e-4,
        batch_size=32,
        ent_coef=0.01,
        vf_coef=1.0,
        tensorboard_log="C:\\Projects\\OpenAI Games\\retro-ai-hacking\\tb_logs"
    )
    model.learn(total_timesteps=1000000, callback=checkpoint_callback)
    model.save(savefile_name)
Example #13
agent_cfg['clip_range_vf'] = float('inf')
agent_cfg['max_grad_norm'] = float('inf')
agent_cfg['seed'] = SEED

# ====================== Run the optimization ======================

# Create a multiprocess environment
env_creator = lambda: gym.make(GYM_ENV_NAME, **GYM_ENV_KWARGS)
train_env = SubprocVecEnv([env_creator for _ in range(int(N_THREADS // 2))],
                          start_method='fork')
test_env = DummyVecEnv([env_creator])

# Create the learning agent according to the chosen algorithm
train_agent = PPO(MlpPolicy,
                  train_env,
                  **agent_cfg,
                  tensorboard_log=log_path,
                  verbose=True)
train_agent.eval_env = test_env

# Run the learning process
checkpoint_path = train(train_agent, max_timesteps=100000)

# ===================== Enjoy the trained agent ======================

# Create testing agent
test_agent = train_agent.load(checkpoint_path)
test_agent.eval_env = test_env

# Run the testing process
test(test_agent, max_episodes=1)