Example #1
def test_fit_batch_online_atari_with_dqn():
    import d4rl_atari

    make_env = lambda: ChannelFirst(DummyAtari())
    env = AsyncBatchEnv([make_env for _ in range(2)])
    eval_env = ChannelFirst(DummyAtari())

    algo = DQN(n_frames=4)

    buffer = BatchReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    algo.fit_batch_online(
        env,
        buffer,
        explorer,
        n_epochs=1,
        n_steps_per_epoch=500,
        n_updates_per_epoch=1,
        eval_env=eval_env,
        logdir="test_data",
    )

    assert algo.impl.observation_shape == (4, 84, 84)
Example #2
def test_fit_batch_online_atari_with_dqn():
    import d4rl_atari

    make_env = lambda: gym.make("breakout-mixed-v0", stack=False)
    env = AsyncBatchEnv([make_env for _ in range(2)])
    eval_env = gym.make("breakout-mixed-v0", stack=False)

    algo = DQN(n_frames=4)

    buffer = BatchReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    algo.fit_batch_online(
        env,
        buffer,
        explorer,
        n_epochs=1,
        n_steps_per_epoch=500,
        n_updates_per_epoch=1,
        eval_env=eval_env,
        logdir="test_data",
        tensorboard=False,
    )

    assert algo.impl.observation_shape == (4, 84, 84)
Example #3
def test_discrete_fqe(observation_shape, action_size, q_func_factory, scalers):
    scaler, reward_scaler = scalers
    algo = DQN()
    fqe = DiscreteFQE(
        algo=algo,
        scaler=scaler,
        reward_scaler=reward_scaler,
        q_func_factory=q_func_factory,
    )
    ope_tester(fqe, observation_shape)
    algo.create_impl(observation_shape, action_size)
    algo_update_tester(fqe, observation_shape, action_size, discrete=True)
Example #4
def test_collect_atari_with_dqn():
    import d4rl_atari

    env = ChannelFirst(DummyAtari())

    algo = DQN(n_frames=4)

    explorer = LinearDecayEpsilonGreedy()

    buffer = algo.collect(env, explorer=explorer, n_steps=100)

    assert algo.impl.observation_shape == (4, 84, 84)
    assert buffer.size() > 90 and buffer.size() < 100
Example #5
def test_evaluate():
    dataset, _ = get_cartpole()
    train_episodes = dataset.episodes[:10]
    test_episodes = dataset.episodes[-10:]

    algo = DQN(n_epochs=1)
    algo.fit(train_episodes, logdir='test_data')

    scores = _evaluate(algo, test_episodes, True)

    eval_keys = [
        'td_error', 'advantage', 'average_value', 'value_std', 'action_match'
    ]

    for key in eval_keys:
        assert key in scores
Example #6
def test_fit_online_cartpole_with_dqn():
    env = gym.make('CartPole-v0')
    eval_env = gym.make('CartPole-v0')

    algo = DQN()

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    algo.fit_online(env,
                    buffer,
                    explorer,
                    n_epochs=1,
                    eval_env=eval_env,
                    logdir='test_data',
                    tensorboard=False)
Example #7
def test_fit_online_cartpole_with_dqn():
    env = gym.make("CartPole-v0")
    eval_env = gym.make("CartPole-v0")

    algo = DQN()

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    algo.fit_online(
        env,
        buffer,
        explorer,
        n_steps=100,
        eval_env=eval_env,
        logdir="test_data",
    )
Example #8
def test_compare():
    dataset, _ = get_cartpole()
    train_episodes = dataset.episodes[:10]
    test_episodes = dataset.episodes[-10:]

    algo = DQN(n_epochs=1)
    algo.fit(train_episodes, logdir='test_data')

    base_algo = DQN(n_epochs=1)
    base_algo.fit(train_episodes, logdir='test_data')

    score = _compare(algo, base_algo, test_episodes, True)
Example #9
def test_fit_batch_online_cartpole_with_dqn():
    make_env = lambda: gym.make("CartPole-v0")
    env = AsyncBatchEnv([make_env for _ in range(5)])
    eval_env = gym.make("CartPole-v0")

    algo = DQN()

    buffer = BatchReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    algo.fit_batch_online(
        env,
        buffer,
        explorer,
        n_epochs=1,
        n_steps_per_epoch=500,
        n_updates_per_epoch=1,
        eval_env=eval_env,
        logdir="test_data",
    )
Example #10
def train(params):
    # setup algorithm
    dqn = DQN(batch_size=params.get("batch_size"),
              learning_rate=params.get("learning_rate"),
              target_update_interval=params.get("target_update_interval"),
              q_func_factory=QRQFunctionFactory(
                  n_quantiles=params.get("n_quantiles")),
              n_steps=params.get("train_freq"),
              gamma=params.get("gamma"),
              n_critics=1,
              target_reduction_type="min",
              use_gpu=True)

    # setup replay buffer
    buffer = ReplayBuffer(maxlen=params.get("buffer_size"), env=env)

    # setup explorer
    explorer = LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=params.get("exploration_final_eps"),
        duration=100000)

    # start training
    dqn.fit_online(
        env,
        buffer,
        n_steps=params.get("train_steps"),
        explorer=explorer,  # you don't need this with probabilistic policy algorithms
        tensorboard_dir=log_dir,
        eval_env=eval_env)

    dqn.save_model(exp_name)
Example #11
def main(args):
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    dqn = DQN(
        n_frames=4,  # frame stacking
        q_func_type=args.q_func_type,
        scaler='pixel',
        use_gpu=args.gpu)

    dqn.fit(train_episodes,
            eval_episodes=test_episodes,
            n_epochs=100,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer
            })
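
A possible follow-up to this example (not part of the original snippet): once fit() finishes, the greedy policy can be exported with d3rlpy's save_policy. A minimal sketch, assuming the dqn instance above; the file name is illustrative.

    # illustrative addition: export the learned greedy policy (TorchScript in recent d3rlpy versions)
    dqn.save_policy('dqn_policy.pt')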
Example #12
def test_fit_online_atari_with_dqn():
    import d4rl_atari

    env = ChannelFirst(DummyAtari())
    eval_env = ChannelFirst(DummyAtari())

    algo = DQN(n_frames=4)

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    algo.fit_online(
        env,
        buffer,
        explorer,
        n_steps=100,
        eval_env=eval_env,
        logdir="test_data",
    )

    assert algo.impl.observation_shape == (4, 84, 84)
Example #13
def test_fit_online_atari_with_dqn():
    import d4rl_atari

    env = gym.make("breakout-mixed-v0", stack=False)
    eval_env = gym.make("breakout-mixed-v0", stack=False)

    algo = DQN(n_frames=4)

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    algo.fit_online(
        env,
        buffer,
        explorer,
        n_steps=100,
        eval_env=eval_env,
        logdir="test_data",
        tensorboard=False,
    )

    assert algo.impl.observation_shape == (4, 84, 84)
Example #14
def main(args):
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    dqn = DQN(n_epochs=100,
              q_func_type=args.q_func_type,
              scaler='pixel',
              use_batch_norm=False,
              use_gpu=device)

    dqn.fit(train_episodes,
            eval_episodes=test_episodes,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer
            })
Example #15
def test_train_with_dqn():
    env = gym.make('CartPole-v0')
    eval_env = gym.make('CartPole-v0')

    algo = DQN(n_epochs=1)

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    train(env,
          algo,
          buffer,
          explorer,
          eval_env=eval_env,
          logdir='test_data',
          tensorboard=False)
Example #16
def test_train_atari_with_dqn():
    import d4rl_atari
    env = gym.make('breakout-mixed-v0', stack=False)
    eval_env = gym.make('breakout-mixed-v0', stack=False)

    algo = DQN(n_epochs=1, n_frames=4)

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    train(env,
          algo,
          buffer,
          explorer,
          eval_env=eval_env,
          n_steps_per_epoch=1000,
          n_updates_per_epoch=1,
          logdir='test_data',
          tensorboard=False)

    assert algo.impl.observation_shape == (4, 84, 84)
Example #17
def test_channel_first_with_2_dim_obs():
    env = DummyAtari(squeeze=True)

    width, height = env.observation_space.shape

    wrapper = ChannelFirst(env)

    # check reset
    observation = wrapper.reset()
    assert observation.shape == (1, width, height)

    # check step
    observation, _, _, _ = wrapper.step(wrapper.action_space.sample())
    assert observation.shape == (1, width, height)

    # check with algorithm
    dqn = DQN()
    dqn.build_with_env(wrapper)
    dqn.predict([observation])
Example #18
def test_channel_first_with_2_dim_obs():
    env = AtariPreprocessing(gym.make("BreakoutNoFrameskip-v4"))

    width, height = env.observation_space.shape

    wrapper = ChannelFirst(env)

    # check reset
    observation = wrapper.reset()
    assert observation.shape == (1, width, height)

    # check step
    observation, _, _, _ = wrapper.step(wrapper.action_space.sample())
    assert observation.shape == (1, width, height)

    # check with algorithm
    dqn = DQN()
    dqn.build_with_env(wrapper)
    dqn.predict([observation])
Example #19
def test_channel_first():
    env = gym.make("Breakout-v0")

    width, height, channel = env.observation_space.shape

    wrapper = ChannelFirst(env)

    # check reset
    observation = wrapper.reset()
    assert observation.shape == (channel, width, height)

    # check step
    observation, _, _, _ = wrapper.step(wrapper.action_space.sample())
    assert observation.shape == (channel, width, height)

    # check with algorithm
    dqn = DQN()
    dqn.build_with_env(wrapper)
    dqn.predict([observation])
Example #20
def test_channel_first():
    env = DummyAtari(grayscale=False)

    width, height, channel = env.observation_space.shape

    wrapper = ChannelFirst(env)

    # check reset
    observation = wrapper.reset()
    assert observation.shape == (channel, width, height)

    # check step
    observation, _, _, _ = wrapper.step(wrapper.action_space.sample())
    assert observation.shape == (channel, width, height)

    # check with algorithm
    dqn = DQN()
    dqn.build_with_env(wrapper)
    dqn.predict([observation])
Example #21
from d3rlpy.algos import DQN
from d3rlpy.datasets import get_cartpole
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.context import parallel
from sklearn.model_selection import cross_validate

# obtain dataset
dataset, env = get_cartpole()

# setup algorithm with GPU enabled
dqn = DQN(n_epochs=1, use_gpu=True)

# cross validation with multiple GPUs assigned to individual processes
with parallel():
    env_score = evaluate_on_environment(env)
    scores = cross_validate(dqn,
                            dataset,
                            fit_params={'show_progress': False},
                            scoring={'environment': env_score},
                            n_jobs=3)  # 3 parallel training processes

print(scores)
Example #22
from d3rlpy.algos import DQN
from d3rlpy.datasets import get_cartpole
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.context import parallel
from sklearn.model_selection import GridSearchCV

# obtain dataset
dataset, env = get_cartpole()

# setup algorithm with GPU enabled
dqn = DQN(use_gpu=True)

# grid search with multiple GPUs assigned to individual processes
with parallel():
    env_score = evaluate_on_environment(env)
    gscv = GridSearchCV(estimator=dqn,
                        param_grid={
                            'learning_rate': [1e-3, 3e-4, 1e-4],
                            'gamma': [0.99, 0.95, 0.9]
                        },
                        scoring={'environment': env_score},
                        refit=False,
                        n_jobs=3)
    gscv.fit(dataset.episodes, n_epochs=1, show_progress=False)

print(gscv.cv_results_)  # grid_scores_ no longer exists in scikit-learn; cv_results_ holds the scores
Example #23
import gym

from d3rlpy.algos import DQN
from d3rlpy.envs import AsyncBatchEnv
from d3rlpy.online.buffers import BatchReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy

if __name__ == '__main__':
    env = AsyncBatchEnv([lambda: gym.make('CartPole-v0') for _ in range(10)])
    eval_env = gym.make('CartPole-v0')

    # setup algorithm
    dqn = DQN(batch_size=32,
              learning_rate=1e-3,
              target_update_interval=1000,
              use_gpu=False)

    # replay buffer for experience replay
    buffer = BatchReplayBuffer(maxlen=100000, env=env)

    # epsilon-greedy explorer
    explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                        end_epsilon=0.1,
                                        duration=100000)

    # start training
    dqn.fit_batch_online(env,
                         buffer,
                         explorer,
                         n_epochs=100,
                         eval_interval=1)  # snippet truncated in the source; remaining arguments omitted
Example #24
import gym

from d3rlpy.algos import DQN
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy

env = gym.make('CartPole-v0')
eval_env = gym.make('CartPole-v0')

# setup algorithm
dqn = DQN(batch_size=32,
          learning_rate=2.5e-4,
          target_update_interval=100,
          use_gpu=False)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=100000, env=env)

# epsilon-greedy explorer
explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                    end_epsilon=0.1,
                                    duration=10000)

# start training
dqn.fit_online(env,
               buffer,
               explorer,
               n_steps=30000,
               eval_env=eval_env,
               n_steps_per_epoch=1000,
               update_start_step=1000)
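
The experience collected by fit_online can be reused for offline training. A minimal sketch (not part of the original snippet), assuming the buffer above and the same buffer.to_mdp_dataset() API used in Example #28; the file name is illustrative.

# illustrative addition: convert the filled replay buffer into an MDPDataset
dataset = buffer.to_mdp_dataset()

# persist it for later offline training (e.g. with DiscreteCQL)
dataset.dump('cartpole_dqn.h5')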
Example #25
from d3rlpy.algos import DQN
from d3rlpy.models.optimizers import RMSpropFactory
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.envs import Atari

# get wrapped atari environment
env = Atari(gym.make('BreakoutNoFrameskip-v4'))
eval_env = Atari(gym.make('BreakoutNoFrameskip-v4'), is_eval=True)

# setup algorithm
dqn = DQN(batch_size=32,
          learning_rate=2.5e-4,
          optim_factory=RMSpropFactory(),
          target_update_interval=10000 // 4,
          q_func_factory='mean',
          scaler='pixel',
          n_frames=4,
          use_gpu=True)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=1000000, env=env)

# epsilon-greedy explorer
explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                    end_epsilon=0.1,
                                    duration=1000000)

# start training
dqn.fit_online(env,
               buffer,
               explorer)  # snippet truncated in the source; remaining arguments omitted
Example #26
from d3rlpy.algos import DQN
from d3rlpy.datasets import get_cartpole
from d3rlpy.metrics.scorer import evaluate_on_environment

# obtain dataset
dataset, env = get_cartpole()

# setup algorithm
dqn = DQN(n_epochs=1)

# train
dqn.fit(dataset.episodes)

# evaluate trained algorithm
evaluate_on_environment(env, render=True)(dqn)
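
The trained algorithm can also be queried directly. A minimal sketch (not part of the original snippet), assuming the dqn and env above; predict and save_model appear elsewhere in these examples, predict_value is part of the same d3rlpy API, and the file name is illustrative.

# illustrative addition: greedy action and its estimated Q-value for one observation
observation = env.reset()
action = dqn.predict([observation])[0]
value = dqn.predict_value([observation], [action])

# save the trained model parameters
dqn.save_model('dqn_cartpole.pt')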
Example #27
import gym

from d3rlpy.algos import DQN
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.online.iterators import train

env = gym.make('CartPole-v0')
eval_env = gym.make('CartPole-v0')

# setup algorithm
dqn = DQN(n_epochs=30,
          batch_size=32,
          learning_rate=2.5e-4,
          target_update_interval=100,
          use_gpu=False)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=100000, env=env)

# epsilon-greedy explorer
explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                    end_epsilon=0.1,
                                    duration=10000)

# start training
train(env,
      dqn,
      buffer,
      explorer,
      eval_env=eval_env)  # snippet truncated in the source; remaining arguments omitted
Example #28
def train(params):
    # setup algorithm
    if pretrain:

        dqn = DQN(batch_size=params.get("batch_size"),
                  learning_rate=params.get("learning_rate"),
                  target_update_interval=params.get("target_update_interval"),
                  q_func_factory=QRQFunctionFactory(
                      n_quantiles=params.get("n_quantiles")),
                  n_steps=params.get("train_freq"),
                  gamma=params.get("gamma"),
                  n_critics=1,
                  target_reduction_type="min",
                  use_gpu=True)

        # setup replay buffer
        buffer = ReplayBuffer(maxlen=params.get("buffer_size"), env=env)

        # setup explorer
        explorer = LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=params.get("exploration_final_eps"),
            duration=100000)

        # start training
        dqn.fit_online(
            env,
            buffer,
            n_steps=params.get("train_steps"),
            explorer=explorer,  # you don't need this with probabilistic policy algorithms
            tensorboard_dir=log_dir,
            eval_env=eval_env)

        print("Saving Model")
        dqn.save_model(exp_name)

        print("convert buffer to dataset")
        dataset = buffer.to_mdp_dataset()
        # save MDPDataset
        dataset.dump('{0}.h5'.format(exp_name))

    print("Loading Dataset for Offline Training")
    dataset = d3rlpy.dataset.MDPDataset.load('{0}.h5'.format(exp_name))
    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)
    # The dataset can then be used to train a d3rlpy model

    cql = DiscreteCQL(learning_rate=6.25e-05,
                      encoder_factory='default',
                      q_func_factory='mean',
                      batch_size=32,
                      n_frames=1,
                      n_steps=1,
                      gamma=0.99,
                      n_critics=1,
                      bootstrap=False,
                      share_encoder=False,
                      target_reduction_type='min',
                      target_update_interval=8000,
                      use_gpu=True,
                      scaler=None,
                      augmentation=None,
                      generator=None,
                      impl=None)

    cql_exp = params.get("model_name") + "_offline_" + params.get("environment")
    cql_log = '../../../logs/' + cql_exp

    cql.fit(dataset.episodes,
            eval_episodes=test_episodes,
            n_epochs=1000,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer,
            },
            tensorboard_dir=cql_log)

    cql.save_model(cql_exp)
Example #29
File: fqf.py  Project: navidmdn/d3rlpy
from d3rlpy.algos import DQN
from d3rlpy.models.optimizers import AdamFactory
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.envs import Atari

# get wrapped atari environment
env = Atari(gym.make('BreakoutNoFrameskip-v4'))
eval_env = Atari(gym.make('BreakoutNoFrameskip-v4'), is_eval=True)

# setup algorithm
dqn = DQN(batch_size=32,
          learning_rate=5e-5,
          optim_factory=AdamFactory(eps=1e-2 / 32),
          target_update_interval=10000 // 4,
          q_func_factory='fqf',
          scaler='pixel',
          n_frames=4,
          use_gpu=True)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=1000000, env=env)

# epsilon-greedy explorer
explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                    end_epsilon=0.01,
                                    duration=1000000)

# start training
dqn.fit_online(env,
               buffer,