Example #1
def test_replay_buffer_with_episode(maxlen, data_size):
    env = gym.make("CartPole-v0")

    observation_shape = env.observation_space.shape
    action_size = env.action_space.n

    observations = np.random.random((data_size, *observation_shape))
    actions = np.random.randint(action_size, size=data_size, dtype=np.int32)
    rewards = np.random.random(data_size)

    episode = Episode(
        observation_shape=observation_shape,
        action_size=action_size,
        observations=observations.astype("f4"),
        actions=actions,
        rewards=rewards.astype("f4"),
    )

    buffer = ReplayBuffer(maxlen, env, episodes=[episode])

    # check episode initialization (an episode with data_size observations
    # yields data_size - 1 transitions)
    assert len(buffer) == data_size - 1

    # check append_episode
    buffer.append_episode(episode)
    assert len(buffer) == 2 * (data_size - 1)
Example #2
def train(params):
    # setup algorithm
    dqn = DQN(batch_size=params.get("batch_size"),
              learning_rate=params.get("learning_rate"),
              target_update_interval=params.get("target_update_interval"),
              q_func_factory=QRQFunctionFactory(
                  n_quantiles=params.get("n_quantiles")),
              n_steps=params.get("train_freq"),
              gamma=params.get("gamma"),
              n_critics=1,
              target_reduction_type="min",
              use_gpu=True)

    # setup replay buffer
    buffer = ReplayBuffer(maxlen=params.get("buffer_size"), env=env)

    # setup explorers
    explorer = LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=params.get("exploration_final_eps"),
        duration=100000)

    # start training
    dqn.fit_online(
        env,
        buffer,
        n_steps=params.get("train_steps"),
        explorer=explorer,  # not needed with probabilistic policy algorithms
        tensorboard_dir=log_dir,
        eval_env=eval_env)

    dqn.save_model(exp_name)
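
For context, train() above expects a flat dictionary of hyperparameters. The sketch below shows one plausible way to call it; the keys are taken from the function body, while the values and the surrounding globals (env, eval_env, log_dir, exp_name) are placeholders rather than part of the original example.

# illustrative invocation of train(); every value here is a placeholder
params = {
    "batch_size": 32,
    "learning_rate": 6.25e-5,
    "target_update_interval": 8000,
    "n_quantiles": 200,
    "train_freq": 1,
    "gamma": 0.99,
    "buffer_size": 100000,
    "exploration_final_eps": 0.01,
    "train_steps": 100000,
}
train(params)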
Example #3
def test_replay_buffer(n_episodes, batch_size, maxlen, gamma):
    env = gym.make('CartPole-v0')

    buffer = ReplayBuffer(maxlen, env, gamma)

    total_step = 0
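    # each append() stores an observation together with the reward and
    # terminal flag received upon reaching it, so the reset observation
    # carries reward 0.0 and the final append records the terminal state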
    for episode in range(n_episodes):
        observation, reward, terminal = env.reset(), 0.0, False
        while not terminal:
            action = env.action_space.sample()
            buffer.append(observation, action, reward, terminal)
            observation, reward, terminal, _ = env.step(action)
            total_step += 1
        buffer.append(observation, action, reward, terminal)
        total_step += 1

    assert len(buffer) == maxlen

    observation_shape = env.observation_space.shape
    batch = buffer.sample(batch_size)
    assert len(batch) == batch_size
    assert batch.observations.shape == (batch_size, ) + observation_shape
    assert batch.actions.shape == (batch_size, 1)
    assert batch.rewards.shape == (batch_size, 1)
    assert batch.next_observations.shape == (batch_size, ) + observation_shape
    assert batch.next_actions.shape == (batch_size, 1)
    assert batch.next_rewards.shape == (batch_size, 1)
    assert batch.terminals.shape == (batch_size, 1)
    assert len(batch.returns) == batch_size
    assert len(batch.consequent_observations) == batch_size
Example #4
def test_replay_buffer(n_episodes, batch_size, maxlen):
    env = gym.make("CartPole-v0")

    buffer = ReplayBuffer(maxlen, env)

    total_step = 0
    for episode in range(n_episodes):
        observation, reward, terminal = env.reset(), 0.0, False
        while not terminal:
            action = env.action_space.sample()
            buffer.append(observation.astype("f4"), action, reward, terminal)
            observation, reward, terminal, _ = env.step(action)
            total_step += 1
        buffer.append(observation.astype("f4"), action, reward, terminal)
        total_step += 1

    assert len(buffer) == maxlen

    observation_shape = env.observation_space.shape
    batch = buffer.sample(batch_size)
    assert len(batch) == batch_size
    assert batch.observations.shape == (batch_size,) + observation_shape
    assert batch.actions.shape == (batch_size,)
    assert batch.rewards.shape == (batch_size, 1)
    assert batch.next_observations.shape == (batch_size,) + observation_shape
    assert batch.next_actions.shape == (batch_size,)
    assert batch.next_rewards.shape == (batch_size, 1)
    assert batch.terminals.shape == (batch_size, 1)
    assert isinstance(batch.observations, np.ndarray)
    assert isinstance(batch.next_observations, np.ndarray)
Example #5
def test_replay_buffer_with_clip_episode(n_episodes, batch_size, maxlen,
                                         clip_episode_flag):
    env = gym.make("CartPole-v0")

    buffer = ReplayBuffer(maxlen, env)

    observation, reward, terminal = env.reset(), 0.0, False
    clip_episode = False
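    # with clip_episode_flag the episode is ended via clip_episode while
    # terminal stays False (the timeout-style case); otherwise the final
    # transition carries both terminal=True and clip_episode=True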
    while not clip_episode:
        action = env.action_space.sample()
        observation, reward, terminal, _ = env.step(action)
        clip_episode = terminal
        if clip_episode_flag and terminal:
            terminal = False
        buffer.append(
            observation=observation.astype("f4"),
            action=action,
            reward=reward,
            terminal=terminal,
            clip_episode=clip_episode,
        )

    # make a transition for a new episode
    for _ in range(2):
        buffer.append(
            observation=observation.astype("f4"),
            action=action,
            reward=reward,
            terminal=False,
        )

    assert buffer.transitions[-2].terminal != clip_episode_flag
    assert buffer.transitions[-2].next_transition is None
    assert buffer.transitions[-1].prev_transition is None
Example #6
def test_train_with_sac():
    env = gym.make('Pendulum-v0')
    eval_env = gym.make('Pendulum-v0')

    algo = SAC(n_epochs=1)

    buffer = ReplayBuffer(1000, env)

    train(env,
          algo,
          buffer,
          eval_env=eval_env,
          logdir='test_data',
          tensorboard=False)
Example #7
def test_fit_online_pendulum_with_sac():
    env = gym.make('Pendulum-v0')
    eval_env = gym.make('Pendulum-v0')

    algo = SAC()

    buffer = ReplayBuffer(1000, env)

    algo.fit_online(env,
                    buffer,
                    n_epochs=1,
                    eval_env=eval_env,
                    logdir='test_data',
                    tensorboard=False)
Example #8
def test_fit_online_pendulum_with_sac():
    env = gym.make("Pendulum-v0")
    eval_env = gym.make("Pendulum-v0")

    algo = SAC()

    buffer = ReplayBuffer(1000, env)

    algo.fit_online(
        env,
        buffer,
        n_steps=500,
        eval_env=eval_env,
        logdir="test_data",
    )
Example #9
def test_replay_buffer(n_episodes, batch_size, maxlen, create_mask, mask_size):
    env = gym.make("CartPole-v0")

    buffer = ReplayBuffer(maxlen,
                          env,
                          create_mask=create_mask,
                          mask_size=mask_size)

    total_step = 0
    for episode in range(n_episodes):
        observation, reward, terminal = env.reset(), 0.0, False
        while not terminal:
            action = env.action_space.sample()
            buffer.append(observation.astype("f4"), action, reward, terminal)
            observation, reward, terminal, _ = env.step(action)
            total_step += 1
        buffer.append(observation.astype("f4"), action, reward, terminal)
        total_step += 1

    assert len(buffer) == maxlen

    # check static dataset conversion
    dataset = buffer.to_mdp_dataset()
    transitions = []
    for episode in dataset:
        transitions += episode.transitions
    assert len(transitions) >= len(buffer)

    observation_shape = env.observation_space.shape
    batch = buffer.sample(batch_size)
    assert len(batch) == batch_size
    assert batch.observations.shape == (batch_size, ) + observation_shape
    assert batch.actions.shape == (batch_size, )
    assert batch.rewards.shape == (batch_size, 1)
    assert batch.next_observations.shape == (batch_size, ) + observation_shape
    assert batch.next_actions.shape == (batch_size, )
    assert batch.next_rewards.shape == (batch_size, 1)
    assert batch.terminals.shape == (batch_size, 1)
    assert isinstance(batch.observations, np.ndarray)
    assert isinstance(batch.next_observations, np.ndarray)
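    # binary masks used for bootstrapped Q-function training; one mask set
    # per ensemble member, hence the leading mask_size dimension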
    if create_mask:
        assert batch.masks.shape == (mask_size, batch_size, 1)
    else:
        assert batch.masks is None
Example #10
def test_train_with_dqn():
    env = gym.make('CartPole-v0')
    eval_env = gym.make('CartPole-v0')

    algo = DQN(n_epochs=1)

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    train(env,
          algo,
          buffer,
          explorer,
          eval_env=eval_env,
          logdir='test_data',
          tensorboard=False)
Example #11
def test_fit_online_cartpole_with_dqn():
    env = gym.make('CartPole-v0')
    eval_env = gym.make('CartPole-v0')

    algo = DQN()

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    algo.fit_online(env,
                    buffer,
                    explorer,
                    n_epochs=1,
                    eval_env=eval_env,
                    logdir='test_data',
                    tensorboard=False)
Example #12
def test_fit_online_cartpole_with_dqn():
    env = gym.make("CartPole-v0")
    eval_env = gym.make("CartPole-v0")

    algo = DQN()

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    algo.fit_online(
        env,
        buffer,
        explorer,
        n_steps=100,
        eval_env=eval_env,
        logdir="test_data",
    )
Example #13
def test_train_atari_with_dqn():
    import d4rl_atari
    env = gym.make('breakout-mixed-v0', stack=False)
    eval_env = gym.make('breakout-mixed-v0', stack=False)

    algo = DQN(n_frames=4)

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    train(env,
          algo,
          buffer,
          explorer,
          n_steps=100,
          eval_env=eval_env,
          logdir='test_data',
          tensorboard=False)

    assert algo.impl.observation_shape == (4, 84, 84)
Example #14
def test_fit_online_atari_with_dqn():
    import d4rl_atari

    env = ChannelFirst(DummyAtari())
    eval_env = ChannelFirst(DummyAtari())

    algo = DQN(n_frames=4)

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    algo.fit_online(
        env,
        buffer,
        explorer,
        n_steps=100,
        eval_env=eval_env,
        logdir="test_data",
    )

    assert algo.impl.observation_shape == (4, 84, 84)
Example #15
def test_fit_online_atari_with_dqn():
    import d4rl_atari

    env = ChannelFirst(gym.make("breakout-mixed-v0"))
    eval_env = ChannelFirst(gym.make("breakout-mixed-v0"))

    algo = DQN(n_frames=4)

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    algo.fit_online(
        env,
        buffer,
        explorer,
        n_steps=100,
        eval_env=eval_env,
        logdir="test_data",
        tensorboard=False,
    )

    assert algo.impl.observation_shape == (4, 84, 84)
Example #16
def test_timelimit_aware(timelimit_aware):
    env = gym.make("Pendulum-v0")

    algo = SAC()

    buffer = ReplayBuffer(1000, env)

    algo.fit_online(
        env,
        buffer,
        n_steps=500,
        logdir="test_data",
        timelimit_aware=timelimit_aware,
    )

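    # with timelimit_aware=True, episodes cut off by gym's TimeLimit wrapper
    # are clipped rather than marked terminal, so the buffer should contain
    # no terminal flags at all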
    terminal_count = 0
    for i in range(len(buffer)):
        terminal_count += int(buffer.transitions[i].terminal)

    if timelimit_aware:
        assert terminal_count == 0
    else:
        assert terminal_count > 0
Example #17
# environment setup (the env id 'breakout-mixed-v0' and the d4rl_atari import
# are assumptions here, following the other Atari examples)
import d4rl_atari

env = gym.make('breakout-mixed-v0',
               stack=False,
               clip_reward=False,
               terminate_on_life_loss=False)
eval_env = gym.make('breakout-mixed-v0',
                    stack=False,
                    clip_reward=False,
                    terminate_on_life_loss=False)

# setup algorithm
dqn = DoubleDQN(batch_size=32,
                learning_rate=2.5e-4,
                optim_factory=AdamFactory(eps=1e-2 / 32),
                target_update_interval=10000,
                q_func_factory='mean',
                scaler='pixel',
                n_frames=4,
                use_gpu=True)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=1000000, env=env)

# epilon-greedy explorer
explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                    end_epsilon=0.1,
                                    duration=1000000)

# start training
dqn.fit_online(env,
               buffer,
               explorer,
               eval_env=eval_env,
               eval_epsilon=0.01,
               n_steps=50000000,
               n_steps_per_epoch=100000,
               update_interval=4)
Example #18
def train(params):
    # setup algorithm
    if pretrain:

        dqn = DQN(batch_size=params.get("batch_size"),
                  learning_rate=params.get("learning_rate"),
                  target_update_interval=params.get("target_update_interval"),
                  q_func_factory=QRQFunctionFactory(
                      n_quantiles=params.get("n_quantiles")),
                  n_steps=params.get("train_freq"),
                  gamma=params.get("gamma"),
                  n_critics=1,
                  target_reduction_type="min",
                  use_gpu=True)

        # setup replay buffer
        buffer = ReplayBuffer(maxlen=params.get("buffer_size"), env=env)

        # setup explorers
        explorer = LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=params.get("exploration_final_eps"),
            duration=100000)

        # start training
        dqn.fit_online(
            env,
            buffer,
            n_steps=params.get("train_steps"),
            explorer=explorer,  # not needed with probabilistic policy algorithms
            tensorboard_dir=log_dir,
            eval_env=eval_env)

        print("Saving Model")
        dqn.save_model(exp_name)

        print("convert buffer to dataset")
        dataset = buffer.to_mdp_dataset()
        # save MDPDataset
        dataset.dump('{0}.h5'.format(exp_name))

    print("Loading Dataset for Offline Training")
    dataset = d3rlpy.dataset.MDPDataset.load('{0}.h5'.format(exp_name))
    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)
    # The dataset can then be used to train a d3rlpy model

    cql = DiscreteCQL(learning_rate=6.25e-05,
                      encoder_factory='default',
                      q_func_factory='mean',
                      batch_size=32,
                      n_frames=1,
                      n_steps=1,
                      gamma=0.99,
                      n_critics=1,
                      bootstrap=False,
                      share_encoder=False,
                      target_reduction_type='min',
                      target_update_interval=8000,
                      use_gpu=True,
                      scaler=None,
                      augmentation=None,
                      generator=None,
                      impl=None)

    cql_exp = params.get("model_name") + "_offline_" + params.get(
        "environment")
    cql_log = '../../../logs/' + cql_exp

    cql.fit(dataset.episodes,
            eval_episodes=test_episodes,
            n_epochs=1000,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer,
            },
            tensorboard_dir=cql_log)

    cql.save_model(cql_exp)
Example #19
from sklearn.model_selection import train_test_split

# prepare dataset and environment
dataset, env = get_pybullet('hopper-bullet-random-v0')
_, eval_env = get_pybullet('hopper-bullet-random-v0')

train_episodes, test_episodes = train_test_split(dataset)

# setup algorithm
awac = AWAC(encoder_params={'hidden_units': [256, 256, 256, 256]},
            use_gpu=True)

## pretrain
awac.fit(train_episodes[:10000],
         eval_episodes=test_episodes,
         n_epochs=30,
         scorers={
             'environment': evaluate_on_environment(env),
             'advantage': discounted_sum_of_advantage_scorer,
             'value_scale': average_value_estimation_scorer
         })

# fine-tuning
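# the buffer is seeded with offline episodes so online fine-tuning starts
# from the same data distribution used during pretraining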
awac.fit_online(env,
                ReplayBuffer(1000000, env, train_episodes[:10000]),
                n_epochs=1000,
                eval_env=eval_env,
                eval_epsilon=0.0,
                n_steps_per_epoch=1000,
                n_updates_per_epoch=1000)