Example #1
import gym

from d3rlpy.algos import SAC
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.iterators import train


def test_train_with_sac():
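    # quick test: train SAC online for a single epoch on Pendulum-v0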
    env = gym.make('Pendulum-v0')
    eval_env = gym.make('Pendulum-v0')

    algo = SAC(n_epochs=1)

    buffer = ReplayBuffer(1000, env)

    train(env,
          algo,
          buffer,
          eval_env=eval_env,
          logdir='test_data',
          tensorboard=False)
Example #2
import gym

from d3rlpy.algos import DQN
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.online.iterators import train


def test_train_with_dqn():
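    # quick test: train DQN online for a single epoch on CartPole-v0
    # with a linearly decaying epsilon-greedy explorer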
    env = gym.make('CartPole-v0')
    eval_env = gym.make('CartPole-v0')

    algo = DQN(n_epochs=1)

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    train(env,
          algo,
          buffer,
          explorer,
          eval_env=eval_env,
          logdir='test_data',
          tensorboard=False)
Example #3
import gym

from d3rlpy.algos import DQN
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.online.iterators import train


def test_train_atari_with_dqn():
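    # quick test: run online DQN for 100 steps on a d4rl-atari Breakout
    # environment, letting the algorithm handle frame stacking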
    import d4rl_atari
    env = gym.make('breakout-mixed-v0', stack=False)
    eval_env = gym.make('breakout-mixed-v0', stack=False)

    algo = DQN(n_frames=4)

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    train(env,
          algo,
          buffer,
          explorer,
          n_steps=100,
          eval_env=eval_env,
          logdir='test_data',
          tensorboard=False)

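    # n_frames=4 makes DQN stack frames internally, so the observation
    # shape it builds is (4, 84, 84) even though the environment returns
    # single frames (stack=False)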
    assert algo.impl.observation_shape == (4, 84, 84)
Example #4
import gym

from d3rlpy.algos import DQN
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.online.iterators import train

env = gym.make('CartPole-v0')
eval_env = gym.make('CartPole-v0')

# setup algorithm
dqn = DQN(n_epochs=30,
          batch_size=32,
          learning_rate=2.5e-4,
          target_update_interval=100,
          use_gpu=False)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=100000, env=env)

# epsilon-greedy explorer
explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                    end_epsilon=0.1,
                                    duration=10000)

# start training
train(env,
      dqn,
      buffer,
      explorer,
      eval_env=eval_env,
      n_steps_per_epoch=1000,
      n_updates_per_epoch=100)
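
After training, the greedy policy can be queried and the trained parameters saved for reuse. A minimal sketch, assuming d3rlpy's standard predict and save_model methods; the filename is illustrative:

# greedy action for a single observation (batch of one)
obs = eval_env.reset()
action = dqn.predict([obs])[0]

# persist the trained parameters
dqn.save_model('dqn_cartpole.pt')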
Example #5
import gym

from d3rlpy.algos import SAC
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.iterators import train

env = gym.make('Pendulum-v0')
eval_env = gym.make('Pendulum-v0')

# setup algorithm
sac = SAC(n_epochs=100, batch_size=100, use_gpu=False)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=100000, env=env)

# start training
# probabilistic policies do not need an explorer
train(env,
      sac,
      buffer,
      eval_env=eval_env,
      n_steps_per_epoch=1000,
      n_updates_per_epoch=100)
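
For a quick sanity check, the trained policy can also be rolled out by hand. A minimal sketch, assuming d3rlpy's standard predict method and the classic gym step API returning (obs, reward, done, info):

# roll out one evaluation episode with the trained SAC policy
obs = eval_env.reset()
episode_reward, done = 0.0, False
while not done:
    action = sac.predict([obs])[0]
    obs, reward, done, _ = eval_env.step(action)
    episode_reward += reward
print('evaluation reward:', episode_reward)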
Example #6
# imports assume the d3rlpy v0.x module layout used throughout these examples
from sklearn.model_selection import train_test_split

from d3rlpy.algos import AWAC
from d3rlpy.datasets import get_pybullet
from d3rlpy.metrics.scorer import (average_value_estimation_scorer,
                                   discounted_sum_of_advantage_scorer,
                                   evaluate_on_environment)
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.iterators import train

# prepare dataset and environment
dataset, env = get_pybullet('hopper-bullet-random-v0')
_, eval_env = get_pybullet('hopper-bullet-random-v0')

train_episodes, test_episodes = train_test_split(dataset)

# setup algorithm
awac = AWAC(n_epochs=30,
            encoder_params={'hidden_units': [256, 256, 256, 256]},
            use_gpu=True)

# pretrain
awac.fit(train_episodes[:10000],
         eval_episodes=test_episodes,
         scorers={
             'environment': evaluate_on_environment(env),
             'advantage': discounted_sum_of_advantage_scorer,
             'value_scale': average_value_estimation_scorer
         })

# fine-tuning
awac.n_epochs = 1000
buffer = ReplayBuffer(1000000, env, train_episodes[:10000])
train(env,
      awac,
      buffer,
      eval_env=eval_env,
      eval_epsilon=0.0,
      n_steps_per_epoch=1000,
      n_updates_per_epoch=1000)
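
Once fine-tuning finishes, the resulting parameters can be kept for later evaluation or deployment. A minimal sketch, assuming d3rlpy's standard save_model method; the filename is illustrative:

# persist the fine-tuned AWAC parameters
awac.save_model('awac_hopper_finetuned.pt')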