def test_compare():
    dataset, _ = get_cartpole()
    train_episodes = dataset.episodes[:10]
    test_episodes = dataset.episodes[-10:]

    algo = DQN(n_epochs=1)
    algo.fit(train_episodes, logdir='test_data')

    base_algo = DQN(n_epochs=1)
    base_algo.fit(train_episodes, logdir='test_data')

    score = _compare(algo, base_algo, test_episodes, True)

def train(params):
    # setup algorithm
    dqn = DQN(batch_size=params.get("batch_size"),
              learning_rate=params.get("learning_rate"),
              target_update_interval=params.get("target_update_interval"),
              q_func_factory=QRQFunctionFactory(
                  n_quantiles=params.get("n_quantiles")),
              n_steps=params.get("train_freq"),
              gamma=params.get("gamma"),
              n_critics=1,
              target_reduction_type="min",
              use_gpu=True)

    # setup replay buffer
    buffer = ReplayBuffer(maxlen=params.get("buffer_size"), env=env)

    # setup explorer
    explorer = LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=params.get("exploration_final_eps"),
        duration=100000)

    # start training
    dqn.fit_online(
        env,
        buffer,
        n_steps=params.get("train_steps"),
        explorer=explorer,  # you don't need this with probabilistic policy algorithms
        tensorboard_dir=log_dir,
        eval_env=eval_env)

    dqn.save_model(exp_name)

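# A minimal sketch of how train() above might be invoked. The dictionary keys
# match the params.get() calls inside train(); the values here are illustrative
# assumptions, not taken from the original script. Note that train() also
# relies on env, eval_env, log_dir, and exp_name being defined at module scope.
params = {
    "batch_size": 32,
    "learning_rate": 6.25e-5,
    "target_update_interval": 8000,
    "n_quantiles": 200,
    "train_freq": 1,
    "gamma": 0.99,
    "buffer_size": 100000,
    "exploration_final_eps": 0.01,
    "train_steps": 1000000,
}
train(params)
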
def test_fit_batch_online_atari_with_dqn():
    import d4rl_atari

    make_env = lambda: ChannelFirst(DummyAtari())
    env = AsyncBatchEnv([make_env for _ in range(2)])
    eval_env = ChannelFirst(DummyAtari())

    algo = DQN(n_frames=4)
    buffer = BatchReplayBuffer(1000, env)
    explorer = LinearDecayEpsilonGreedy()

    algo.fit_batch_online(
        env,
        buffer,
        explorer,
        n_epochs=1,
        n_steps_per_epoch=500,
        n_updates_per_epoch=1,
        eval_env=eval_env,
        logdir="test_data",
    )

    assert algo.impl.observation_shape == (4, 84, 84)

def test_fit_batch_online_atari_with_dqn():
    import d4rl_atari

    make_env = lambda: gym.make("breakout-mixed-v0", stack=False)
    env = AsyncBatchEnv([make_env for _ in range(2)])
    eval_env = gym.make("breakout-mixed-v0", stack=False)

    algo = DQN(n_frames=4)
    buffer = BatchReplayBuffer(1000, env)
    explorer = LinearDecayEpsilonGreedy()

    algo.fit_batch_online(
        env,
        buffer,
        explorer,
        n_epochs=1,
        n_steps_per_epoch=500,
        n_updates_per_epoch=1,
        eval_env=eval_env,
        logdir="test_data",
        tensorboard=False,
    )

    assert algo.impl.observation_shape == (4, 84, 84)

def test_discrete_fqe(observation_shape, action_size, q_func_factory, scalers):
    scaler, reward_scaler = scalers
    algo = DQN()
    fqe = DiscreteFQE(
        algo=algo,
        scaler=scaler,
        reward_scaler=reward_scaler,
        q_func_factory=q_func_factory,
    )
    ope_tester(fqe, observation_shape)
    algo.create_impl(observation_shape, action_size)
    algo_update_tester(fqe, observation_shape, action_size, discrete=True)

def test_collect_atari_with_dqn():
    import d4rl_atari

    env = ChannelFirst(DummyAtari())

    algo = DQN(n_frames=4)
    explorer = LinearDecayEpsilonGreedy()

    buffer = algo.collect(env, explorer=explorer, n_steps=100)

    assert algo.impl.observation_shape == (4, 84, 84)
    assert 90 < buffer.size() < 100

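# A possible follow-up (sketch): the buffer returned by collect() can be
# exported for offline training with to_mdp_dataset() and dump(), as the
# train() pipeline elsewhere in these examples does. The filename is
# illustrative, not from the original test.
dataset = buffer.to_mdp_dataset()
dataset.dump('collected.h5')
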
def test_evaluate():
    dataset, _ = get_cartpole()
    train_episodes = dataset.episodes[:10]
    test_episodes = dataset.episodes[-10:]

    algo = DQN(n_epochs=1)
    algo.fit(train_episodes, logdir='test_data')

    scores = _evaluate(algo, test_episodes, True)

    eval_keys = [
        'td_error', 'advantage', 'average_value', 'value_std', 'action_match'
    ]
    for key in eval_keys:
        assert key in scores

def test_fit_online_cartpole_with_dqn():
    env = gym.make('CartPole-v0')
    eval_env = gym.make('CartPole-v0')

    algo = DQN()
    buffer = ReplayBuffer(1000, env)
    explorer = LinearDecayEpsilonGreedy()

    algo.fit_online(env,
                    buffer,
                    explorer,
                    n_epochs=1,
                    eval_env=eval_env,
                    logdir='test_data',
                    tensorboard=False)

def test_train_with_dqn():
    env = gym.make('CartPole-v0')
    eval_env = gym.make('CartPole-v0')

    algo = DQN(n_epochs=1)
    buffer = ReplayBuffer(1000, env)
    explorer = LinearDecayEpsilonGreedy()

    train(env,
          algo,
          buffer,
          explorer,
          eval_env=eval_env,
          logdir='test_data',
          tensorboard=False)

def test_fit_online_cartpole_with_dqn():
    env = gym.make("CartPole-v0")
    eval_env = gym.make("CartPole-v0")

    algo = DQN()
    buffer = ReplayBuffer(1000, env)
    explorer = LinearDecayEpsilonGreedy()

    algo.fit_online(
        env,
        buffer,
        explorer,
        n_steps=100,
        eval_env=eval_env,
        logdir="test_data",
    )

def test_channel_first_with_2_dim_obs():
    env = DummyAtari(squeeze=True)
    width, height = env.observation_space.shape

    wrapper = ChannelFirst(env)

    # check reset
    observation = wrapper.reset()
    assert observation.shape == (1, width, height)

    # check step
    observation, _, _, _ = wrapper.step(wrapper.action_space.sample())
    assert observation.shape == (1, width, height)

    # check with algorithm
    dqn = DQN()
    dqn.build_with_env(wrapper)
    dqn.predict([observation])

def test_channel_first():
    env = DummyAtari(grayscale=False)
    width, height, channel = env.observation_space.shape

    wrapper = ChannelFirst(env)

    # check reset
    observation = wrapper.reset()
    assert observation.shape == (channel, width, height)

    # check step
    observation, _, _, _ = wrapper.step(wrapper.action_space.sample())
    assert observation.shape == (channel, width, height)

    # check with algorithm
    dqn = DQN()
    dqn.build_with_env(wrapper)
    dqn.predict([observation])

def test_channel_first_with_2_dim_obs():
    env = AtariPreprocessing(gym.make("BreakoutNoFrameskip-v4"))
    width, height = env.observation_space.shape

    wrapper = ChannelFirst(env)

    # check reset
    observation = wrapper.reset()
    assert observation.shape == (1, width, height)

    # check step
    observation, _, _, _ = wrapper.step(wrapper.action_space.sample())
    assert observation.shape == (1, width, height)

    # check with algorithm
    dqn = DQN()
    dqn.build_with_env(wrapper)
    dqn.predict([observation])

def test_channel_first():
    env = gym.make("Breakout-v0")
    width, height, channel = env.observation_space.shape

    wrapper = ChannelFirst(env)

    # check reset
    observation = wrapper.reset()
    assert observation.shape == (channel, width, height)

    # check step
    observation, _, _, _ = wrapper.step(wrapper.action_space.sample())
    assert observation.shape == (channel, width, height)

    # check with algorithm
    dqn = DQN()
    dqn.build_with_env(wrapper)
    dqn.predict([observation])

def test_train_atari_with_dqn():
    import d4rl_atari

    env = gym.make('breakout-mixed-v0', stack=False)
    eval_env = gym.make('breakout-mixed-v0', stack=False)

    algo = DQN(n_frames=4)
    buffer = ReplayBuffer(1000, env)
    explorer = LinearDecayEpsilonGreedy()

    train(env,
          algo,
          buffer,
          explorer,
          n_steps=100,
          eval_env=eval_env,
          logdir='test_data',
          tensorboard=False)

    assert algo.impl.observation_shape == (4, 84, 84)

def test_fit_batch_online_cartpole_with_dqn():
    make_env = lambda: gym.make("CartPole-v0")
    env = AsyncBatchEnv([make_env for _ in range(5)])
    eval_env = gym.make("CartPole-v0")

    algo = DQN()
    buffer = BatchReplayBuffer(1000, env)
    explorer = LinearDecayEpsilonGreedy()

    algo.fit_batch_online(
        env,
        buffer,
        explorer,
        n_epochs=1,
        n_steps_per_epoch=500,
        n_updates_per_epoch=1,
        eval_env=eval_env,
        logdir="test_data",
    )

def test_fit_online_atari_with_dqn():
    import d4rl_atari

    env = ChannelFirst(DummyAtari())
    eval_env = ChannelFirst(DummyAtari())

    algo = DQN(n_frames=4)
    buffer = ReplayBuffer(1000, env)
    explorer = LinearDecayEpsilonGreedy()

    algo.fit_online(
        env,
        buffer,
        explorer,
        n_steps=100,
        eval_env=eval_env,
        logdir="test_data",
    )

    assert algo.impl.observation_shape == (4, 84, 84)

def main(args):
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    dqn = DQN(
        n_frames=4,  # frame stacking
        q_func_type=args.q_func_type,
        scaler='pixel',
        use_gpu=args.gpu)

    dqn.fit(train_episodes,
            eval_episodes=test_episodes,
            n_epochs=100,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer
            })

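# A sketch of the argument parsing this main() presumably expects. The flag
# names and defaults are assumptions inferred from the attributes used above
# (args.dataset, args.seed, args.q_func_type, args.gpu), not the original CLI.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='breakout-mixed-v0')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--q-func-type', type=str, default='mean')
    parser.add_argument('--gpu', action='store_true')
    main(parser.parse_args())
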
def test_fit_online_atari_with_dqn():
    import d4rl_atari

    env = ChannelFirst(gym.make("breakout-mixed-v0"))
    eval_env = ChannelFirst(gym.make("breakout-mixed-v0"))

    algo = DQN(n_frames=4)
    buffer = ReplayBuffer(1000, env)
    explorer = LinearDecayEpsilonGreedy()

    algo.fit_online(
        env,
        buffer,
        explorer,
        n_steps=100,
        eval_env=eval_env,
        logdir="test_data",
        tensorboard=False,
    )

    assert algo.impl.observation_shape == (4, 84, 84)

def main(args):
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    dqn = DQN(n_epochs=100,
              q_func_type=args.q_func_type,
              scaler='pixel',
              use_batch_norm=False,
              use_gpu=device)

    dqn.fit(train_episodes,
            eval_episodes=test_episodes,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer
            })

from d3rlpy.algos import DQN
from d3rlpy.datasets import get_cartpole
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.context import parallel
from sklearn.model_selection import cross_validate

# obtain dataset
dataset, env = get_cartpole()

# setup algorithm with GPU enabled
dqn = DQN(n_epochs=1, use_gpu=True)

# cross validation with multiple GPUs assigned to individual processes
with parallel():
    env_score = evaluate_on_environment(env)
    scores = cross_validate(dqn,
                            dataset,
                            fit_params={'show_progress': False},
                            scoring={'environment': env_score},
                            n_jobs=3)  # 3 parallel training processes

print(scores)

import gym

from d3rlpy.algos import DQN
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.online.iterators import train

env = gym.make('CartPole-v0')
eval_env = gym.make('CartPole-v0')

# setup algorithm
dqn = DQN(n_epochs=30,
          batch_size=32,
          learning_rate=2.5e-4,
          target_update_interval=100,
          use_gpu=False)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=100000, env=env)

# epsilon-greedy explorer
explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                    end_epsilon=0.1,
                                    duration=10000)

# start training
train(env, dqn, buffer, explorer, eval_env=eval_env)

from d3rlpy.algos import DQN
from d3rlpy.datasets import get_cartpole
from d3rlpy.metrics.scorer import evaluate_on_environment

# obtain dataset
dataset, env = get_cartpole()

# setup algorithm
dqn = DQN(n_epochs=1)

# train
dqn.fit(dataset.episodes)

# evaluate trained algorithm
evaluate_on_environment(env, render=True)(dqn)

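# A minimal follow-up sketch: the trained parameters can be persisted with
# save_model(), as the training scripts elsewhere in these examples do, and
# restored into a fresh instance with load_model(). The filename is
# illustrative, not from the original example.
dqn.save_model('dqn_cartpole.pt')

new_dqn = DQN(n_epochs=1)
new_dqn.build_with_env(env)  # build networks before loading weights
new_dqn.load_model('dqn_cartpole.pt')
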
import gym

from d3rlpy.algos import DQN
from d3rlpy.models.optimizers import RMSpropFactory
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.envs import Atari

# get wrapped atari environment
env = Atari(gym.make('BreakoutNoFrameskip-v4'))
eval_env = Atari(gym.make('BreakoutNoFrameskip-v4'), is_eval=True)

# setup algorithm
dqn = DQN(batch_size=32,
          learning_rate=2.5e-4,
          optim_factory=RMSpropFactory(),
          target_update_interval=10000 // 4,
          q_func_factory='mean',
          scaler='pixel',
          n_frames=4,
          use_gpu=True)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=1000000, env=env)

# epsilon-greedy explorer
explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                    end_epsilon=0.1,
                                    duration=1000000)

# start training
dqn.fit_online(env, buffer, explorer, eval_env=eval_env)

import gym

from d3rlpy.algos import DQN
from d3rlpy.models.optimizers import AdamFactory
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.envs import Atari

# get wrapped atari environment
env = Atari(gym.make('BreakoutNoFrameskip-v4'))
eval_env = Atari(gym.make('BreakoutNoFrameskip-v4'), is_eval=True)

# setup algorithm
dqn = DQN(batch_size=32,
          learning_rate=5e-5,
          optim_factory=AdamFactory(eps=1e-2 / 32),
          target_update_interval=10000 // 4,
          q_func_factory='fqf',
          scaler='pixel',
          n_frames=4,
          use_gpu=True)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=1000000, env=env)

# epsilon-greedy explorer
explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                    end_epsilon=0.01,
                                    duration=1000000)

# start training
dqn.fit_online(env, buffer, explorer, eval_env=eval_env)

from d3rlpy.algos import DQN
from d3rlpy.datasets import get_cartpole
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.context import parallel
from sklearn.model_selection import GridSearchCV

# obtain dataset
dataset, env = get_cartpole()

# setup algorithm with GPU enabled
dqn = DQN(use_gpu=True)

# grid search with multiple GPUs assigned to individual processes
with parallel():
    env_score = evaluate_on_environment(env)
    gscv = GridSearchCV(estimator=dqn,
                        param_grid={
                            'learning_rate': [1e-3, 3e-4, 1e-4],
                            'gamma': [0.99, 0.95, 0.9]
                        },
                        scoring={'environment': env_score},
                        refit=False,
                        n_jobs=3)
    gscv.fit(dataset.episodes, n_epochs=1, show_progress=False)

print(gscv.cv_results_)

import gym

from d3rlpy.algos import DQN
from d3rlpy.envs import AsyncBatchEnv
from d3rlpy.online.buffers import BatchReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy

if __name__ == '__main__':
    env = AsyncBatchEnv([lambda: gym.make('CartPole-v0') for _ in range(10)])
    eval_env = gym.make('CartPole-v0')

    # setup algorithm
    dqn = DQN(batch_size=32,
              learning_rate=1e-3,
              target_update_interval=1000,
              use_gpu=False)

    # replay buffer for experience replay
    buffer = BatchReplayBuffer(maxlen=100000, env=env)

    # epsilon-greedy explorer
    explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                        end_epsilon=0.1,
                                        duration=100000)

    # start training
    dqn.fit_batch_online(env,
                         buffer,
                         explorer,
                         n_epochs=100,
                         eval_interval=1,
                         eval_env=eval_env)

def train(params):
    # setup algorithm and pretrain online
    if pretrain:
        dqn = DQN(batch_size=params.get("batch_size"),
                  learning_rate=params.get("learning_rate"),
                  target_update_interval=params.get("target_update_interval"),
                  q_func_factory=QRQFunctionFactory(
                      n_quantiles=params.get("n_quantiles")),
                  n_steps=params.get("train_freq"),
                  gamma=params.get("gamma"),
                  n_critics=1,
                  target_reduction_type="min",
                  use_gpu=True)

        # setup replay buffer
        buffer = ReplayBuffer(maxlen=params.get("buffer_size"), env=env)

        # setup explorer
        explorer = LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=params.get("exploration_final_eps"),
            duration=100000)

        # start training
        dqn.fit_online(
            env,
            buffer,
            n_steps=params.get("train_steps"),
            explorer=explorer,  # you don't need this with probabilistic policy algorithms
            tensorboard_dir=log_dir,
            eval_env=eval_env)

        print("Saving Model")
        dqn.save_model(exp_name)

        print("convert buffer to dataset")
        dataset = buffer.to_mdp_dataset()

        # save MDPDataset
        dataset.dump('{0}.h5'.format(exp_name))

    print("Loading Dataset for Offline Training")
    dataset = d3rlpy.dataset.MDPDataset.load('{0}.h5'.format(exp_name))
    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    # the dataset can then be used to train a d3rlpy model
    cql = DiscreteCQL(learning_rate=6.25e-05,
                      encoder_factory='default',
                      q_func_factory='mean',
                      batch_size=32,
                      n_frames=1,
                      n_steps=1,
                      gamma=0.99,
                      n_critics=1,
                      bootstrap=False,
                      share_encoder=False,
                      target_reduction_type='min',
                      target_update_interval=8000,
                      use_gpu=True,
                      scaler=None,
                      augmentation=None,
                      generator=None,
                      impl=None)

    cql_exp = params.get("model_name") + "_offline_" + params.get("environment")
    cql_log = '../../../logs/' + cql_exp

    cql.fit(dataset.episodes,
            eval_episodes=test_episodes,
            n_epochs=1000,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer,
            },
            tensorboard_dir=cql_log)

    cql.save_model(cql_exp)

from d3rlpy.algos import DQN
from d3rlpy.datasets import get_cartpole
from d3rlpy.metrics.scorer import evaluate_on_environment

# obtain dataset
dataset, env = get_cartpole()

# setup algorithm
dqn = DQN()

# train
dqn.fit(dataset.episodes, n_epochs=1)

# evaluate trained algorithm
evaluate_on_environment(env, render=True)(dqn)