Example #1
        policy_fn=policy_fn,
        vf_fn=vf_fn,
        lr_vf=1e-3,
        gamma=0.98,
        lambda_=0.96,
        delta=0.001,
        replay_buffer_size=250 * 8,
        policy_update_batch_size=512,
        vf_update_batch_size=512,
        vf_update_iterations=20,
        conjugate_gradient_iterations=20,
        conjugate_gradient_tol=1e-5,
        line_search_iterations=10,
        line_search_coefficient=0.5,
    )
    train_loop = EpisodeTrainLoop(
        agent=agent,
        n_episodes=125 * 8,
        max_episode_length=250,
        ckpt_dir=ckpt_dir,
        log_dir=log_dir,
        ckpt_every=100,
        log_every=10,
        update_every=8,
        metrics=[AverageReturn(8), AverageEpisodeLength(8)])

    if args.mode == 'train':
        train_loop.run()
    if args.mode == 'evaluate':
        evaluate_policy(agent.env, agent.policy)
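
The `delta`, `conjugate_gradient_*`, and `line_search_*` arguments above suggest a TRPO-style agent that solves for the natural-gradient direction with conjugate gradient and then backtracks along it. A minimal standalone sketch of such a solver (not this library's implementation; the `Avp` matrix-vector-product callable is a hypothetical stand-in for a Fisher-vector product):

    import numpy as np

    def conjugate_gradient(Avp, b, iterations=20, tol=1e-5):
        # Solves A x = b given only matrix-vector products Avp(x) = A @ x,
        # e.g. Fisher-vector products in a TRPO update.
        x = np.zeros_like(b)
        r = b.copy()   # residual
        p = b.copy()   # search direction
        rs_old = r @ r
        for _ in range(iterations):
            Ap = Avp(p)
            step = rs_old / (p @ Ap)
            x += step * p
            r -= step * Ap
            rs_new = r @ r
            if np.sqrt(rs_new) < tol:
                break
            p = r + (rs_new / rs_old) * p
            rs_old = rs_new
        return x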
Example #2
        env=env,
        policy_fn=policy_fn,
        qf_fn=qf_fn,
        lr_policy=1e-3,
        lr_qf=1e-3,
        gamma=0.99,
        polyak=0.995,
        alpha=0.2,
        replay_buffer_size=5000,
        update_iterations=50,
        update_batch_size=32,
    )

    train_loop = StepTrainLoop(
        agent=agent,
        n_steps=500 * 200,
        max_episode_length=200,
        initial_random_steps=20 * 200,
        ckpt_dir=ckpt_dir,
        log_dir=log_dir,
        ckpt_every=20 * 200,
        log_every=5 * 200,
        update_every=50,
        metrics=[AverageReturn(5)],
    )

    if args.mode == 'train':
        train_loop.run()
    if args.mode == 'evaluate':
        evaluate_policy(agent.env, agent.policy)
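
The `polyak=0.995` and `alpha=0.2` arguments are typical of a SAC-style agent: `alpha` is the entropy temperature and `polyak` the soft target-update coefficient. A minimal sketch of the Polyak update rule, assuming PyTorch-style parameter objects (`target_net` and `online_net` are hypothetical names):

    import torch

    @torch.no_grad()
    def polyak_update(target_net, online_net, polyak=0.995):
        # theta_target <- polyak * theta_target + (1 - polyak) * theta_online
        for target_p, online_p in zip(target_net.parameters(), online_net.parameters()):
            target_p.mul_(polyak)
            target_p.add_((1.0 - polyak) * online_p)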
Example #3
                                      action_space.n)
    vf_fn = lambda: ValueFunctionNetwork(env.observation_space.shape)
    agent = VPGGAE(
        env=env,
        policy_fn=policy_fn,
        vf_fn=vf_fn,
        lr_policy=1e-3,
        lr_vf=1e-3,
        gamma=0.98,
        lambda_=0.96,
        vf_update_iterations=20,
        replay_buffer_size=250 * 2,
        policy_update_batch_size=256,
        vf_update_batch_size=256,
    )
    train_loop = EpisodeTrainLoop(
        agent=agent,
        n_episodes=1000,
        max_episode_length=250,
        ckpt_dir=ckpt_dir,
        log_dir=log_dir,
        ckpt_every=100,
        log_every=10,
        update_every=2,
        metrics=[AverageReturn(2), AverageEpisodeLength(2)])

    if args.mode == 'train':
        train_loop.run()
    if args.mode == 'evaluate':
        evaluate_policy(agent.env, agent.policy)
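
`VPGGAE` combines a vanilla policy gradient with generalized advantage estimation; `gamma=0.98` and `lambda_=0.96` are the discount and GAE coefficients. A minimal sketch of how GAE folds them together over a single episode (illustrative, not the library's own code):

    import numpy as np

    def gae_advantages(rewards, values, last_value, gamma=0.98, lambda_=0.96):
        # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        # A_t     = sum_{l >= 0} (gamma * lambda)^l * delta_{t+l}
        values = np.append(values, last_value)
        advantages = np.zeros(len(rewards))
        gae = 0.0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + gamma * values[t + 1] - values[t]
            gae = delta + gamma * lambda_ * gae
            advantages[t] = gae
        return advantages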
Example #4
        lr_policy=1e-3,
        lr_qf=1e-3,
        gamma=0.99,
        polyak=0.995,
        replay_buffer_size=50_000,
        update_iterations=50,
        update_batch_size=32,
        update_policy_delay=2,
        transition_action_noise=0.1,
        target_action_noise=0.2,
        target_action_noise_clip=0.5,
    )

    train_loop = StepTrainLoop(
        agent=agent,
        n_steps=10_000 * 1000,
        max_episode_length=1000,
        initial_random_steps=20 * 1000,
        ckpt_dir=ckpt_dir,
        log_dir=log_dir,
        ckpt_every=10 * 1000,
        log_every=1 * 1000,
        update_every=50,
        metrics=[AverageReturn(5), AverageEpisodeLength(5)],
    )

    if args.mode == 'train':
        train_loop.run()
    if args.mode == 'evaluate':
        evaluate_policy(agent.env, agent.policy)
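
The `update_policy_delay`, `target_action_noise`, and `target_action_noise_clip` arguments above match a TD3-style agent: the policy is updated less often than the Q-functions, and target actions are smoothed with clipped Gaussian noise. A PyTorch-style sketch of that smoothing step (all names are hypothetical placeholders; `act_low`/`act_high` are scalar action bounds):

    import torch

    def smoothed_target_action(target_policy, next_obs, act_low, act_high,
                               noise_std=0.2, noise_clip=0.5):
        # Add clipped Gaussian noise to the target policy's action, then clamp
        # the result back into the valid action range.
        action = target_policy(next_obs)
        noise = (torch.randn_like(action) * noise_std).clamp(-noise_clip, noise_clip)
        return (action + noise).clamp(act_low, act_high)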
Example #5
    vf_fn = lambda: ValueFunctionNetwork(env.observation_space.shape)
    agent = PPOClip(env=env,
                    policy_fn=policy_fn,
                    vf_fn=vf_fn,
                    lr_policy=1e-3,
                    lr_vf=1e-3,
                    gamma=0.98,
                    lambda_=0.96,
                    epsilon=0.05,
                    vf_update_iterations=20,
                    policy_update_iterations=5,
                    policy_update_batch_size=64,
                    vf_update_batch_size=64,
                    replay_buffer_size=100_000)

    train_loop = EpisodeTrainLoop(
        agent=agent,
        n_episodes=10_000,
        max_episode_length=100_000,
        ckpt_dir=ckpt_dir,
        log_dir=log_dir,
        ckpt_every=10,
        log_every=1,
        update_every=1,
        metrics=[AverageReturn(1), AverageEpisodeLength(1)])

    if args.mode == 'train':
        train_loop.run()
    if args.mode == 'evaluate':
        evaluate_policy(agent.env, agent.policy)
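
`PPOClip` with `epsilon=0.05` points to the standard clipped surrogate objective, here with a fairly tight clip range. A minimal sketch of that loss, assuming PyTorch tensors of log-probabilities and advantages (names are illustrative, not the library's API):

    import torch

    def ppo_clip_loss(logp_new, logp_old, advantages, epsilon=0.05):
        # ratio = pi_new(a|s) / pi_old(a|s); the clipped term bounds the policy update.
        ratio = torch.exp(logp_new - logp_old)
        clipped = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon)
        return -torch.min(ratio * advantages, clipped * advantages).mean()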