def test_to_pickle():
    data = MDPDataBunch.from_env('CartPole-v0', render='rgb_array', bs=5, max_steps=20, add_valid=False)
    model = create_dqn_model(data, FixedTargetDQNModule, opt=torch.optim.RMSprop)
    memory = ExperienceReplay(memory_size=1000, reduce_ram=True)
    exploration_method = GreedyEpsilon(epsilon_start=1, epsilon_end=0.1, decay=0.001)
    learner = dqn_learner(data=data, model=model, memory=memory, exploration_method=exploration_method)
    learner.fit(2)

    assert len(data.x.info) == 2
    assert 0 in data.x.info
    assert 1 in data.x.info

    data.to_pickle('./data/test_to_pickle')
    assert os.path.exists('./data/test_to_pickle_CartPole-v0')
def test_resolution_wrapper():
    data = MDPDataBunch.from_env('CartPole-v0', render='rgb_array', bs=5, max_steps=10, add_valid=False,
                                 memory_management_strategy='k_top', k=1, feed_type=FEED_TYPE_IMAGE,
                                 res_wrap=partial(ResolutionWrapper, w_step=2, h_step=2))
    model = create_dqn_model(data, DQNModule, opt=torch.optim.RMSprop, lr=0.1,
                             channels=[32, 32, 32], ks=[5, 5, 5], stride=[2, 2, 2])
    memory = ExperienceReplay(memory_size=1000, reduce_ram=True)
    exploration_method = GreedyEpsilon(epsilon_start=1, epsilon_end=0.1, decay=0.001)
    learner = dqn_learner(data=data, model=model, memory=memory, exploration_method=exploration_method,
                          callback_fns=[RewardMetric, EpsilonMetric])
    learner.fit(2)

    # The wrapped env should render at half the original resolution in both dimensions.
    temp = gym.make('CartPole-v0')
    temp.reset()
    original_shape = temp.render(mode='rgb_array').shape
    assert data.env.render(mode='rgb_array').shape == (original_shape[0] // 2, original_shape[1] // 2, 3)
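# The sketch below is only an illustration of what the w_step / h_step arguments above are expected
# to do (stride-based downsampling of rendered frames). It is a hypothetical wrapper, not the
# library's ResolutionWrapper, and assumes the classic gym render(mode=...) API used in this test.
import gym


class StrideResolutionWrapper(gym.Wrapper):
    """Downsample rgb_array frames by keeping every h_step-th row and w_step-th column."""

    def __init__(self, env, w_step=2, h_step=2):
        super().__init__(env)
        self.w_step, self.h_step = w_step, h_step

    def render(self, mode='human', **kwargs):
        frame = self.env.render(mode=mode, **kwargs)
        if mode == 'rgb_array':
            # For step=2 on even-sized frames this halves each spatial dimension.
            return frame[::self.h_step, ::self.w_step]
        return frame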
def test_dataset_memory_manager(memory_strategy, k):
    data = MDPDataBunch.from_env('CartPole-v0', render='rgb_array', bs=5, max_steps=20, add_valid=False,
                                 memory_management_strategy=memory_strategy, k=k)
    model = create_dqn_model(data, DQNModule, opt=torch.optim.RMSprop, lr=0.1)
    memory = ExperienceReplay(memory_size=1000, reduce_ram=True)
    exploration_method = GreedyEpsilon(epsilon_start=1, epsilon_end=0.1, decay=0.001)
    learner = dqn_learner(data=data, model=model, memory=memory, exploration_method=exploration_method,
                          callback_fns=[RewardMetric, EpsilonMetric])
    learner.fit(10)

    data_info = {episode: data.train_ds.x.info[episode]
                 for episode in data.train_ds.x.info if episode != -1}
    full_episodes = [episode for episode in data_info if not data_info[episode][1]]

    assert sum(not _[1] for _ in data_info.values()) == k, 'There should be exactly k full episodes, but there are not.'
    if 'top' in memory_strategy and 'both' not in memory_strategy:
        assert np.argmax([_[0] for _ in data_info.values()]) in full_episodes
def __init__(self, data: MDPDataBunch, memory=ExperienceReplay(10000), tau=0.001, batch=64, discount=0.99,
             lr=0.005):
    """Implementation of a continuous control algorithm using an actor/critic architecture.

    Notes:
        Uses 4 networks: 2 actors and 2 critics. All models use batch norm for feature invariance.
        The critic predicts Q values, while the actor proposes the actions to take given a state s.

    References:
        [1] Lillicrap, Timothy P., et al. "Continuous control with deep reinforcement learning."
            arXiv preprint arXiv:1509.02971 (2015).

    Args:
        data: Primary data object to use.
        memory: How big the memory buffer will be for offline training.
        tau: Defines how "softly/hard" the primary networks are copied over to the target networks.
        batch: Size of each memory query.
        discount: Determines the amount of discounting applied to the existing Q reward.
        lr: Rate at which the optimizer will learn parameter gradients.
    """
    super().__init__(data)
    self.name = 'DDPG'
    self.lr = lr
    self.discount = discount
    self.batch = batch
    self.tau = tau
    self.memory = memory
    self.action_model = self.initialize_action_model([64, 64], data)
    self.critic_model = self.initialize_critic_model([64, 64], data)
    self.opt = OptimWrapper.create(Adam, lr=lr, layer_groups=self.action_model)
    self.critic_optimizer = OptimWrapper.create(Adam, lr=lr, layer_groups=self.critic_model)
    self.t_action_model = self.initialize_action_model([64, 64], data)
    self.t_critic_model = self.initialize_critic_model([64, 64], data)
    self.target_copy_over()
    self.callbacks = [BaseDDPGCallback]
    self.loss_func = MSELoss()
    # TODO Move to an Ornstein-Uhlenbeck process
    self.exploration_strategy = GreedyEpsilon(decay=0.001, epsilon_end=0.1, epsilon_start=1,
                                              do_exploration=True)
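# A minimal sketch (not part of the class above) of the soft target update that `tau` controls,
# assuming the online and target networks are plain torch.nn.Modules with matching parameters.
# With tau close to 0 the target weights track the online weights slowly; tau=1 is a hard copy.
import torch


def soft_update(online: torch.nn.Module, target: torch.nn.Module, tau: float) -> None:
    """Polyak-average the online parameters into the target parameters in place."""
    with torch.no_grad():
        for online_p, target_p in zip(online.parameters(), target.parameters()):
            target_p.mul_(1.0 - tau).add_(tau * online_p)


# Example usage with throwaway linear layers:
online_net, target_net = torch.nn.Linear(4, 2), torch.nn.Linear(4, 2)
target_net.load_state_dict(online_net.state_dict())  # start from a hard copy
soft_update(online_net, target_net, tau=0.001)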
def test_metrics_reward_init():
    data = MDPDataBunch.from_env('CartPole-v0', render='rgb_array', bs=5, max_steps=20)
    model = create_dqn_model(data, DQNModule, opt=torch.optim.RMSprop)
    memory = ExperienceReplay(memory_size=1000, reduce_ram=True)
    exploration_method = GreedyEpsilon(epsilon_start=1, epsilon_end=0.1, decay=0.001)
    learner = dqn_learner(data=data, model=model, memory=memory, exploration_method=exploration_method,
                          callback_fns=[RewardMetric])
    learner.fit(2)
def test_from_pickle():
    data = MDPDataBunch.from_pickle('./data/test_to_pickle_CartPole-v0')
    model = create_dqn_model(data, FixedTargetDQNModule, opt=torch.optim.RMSprop)
    memory = ExperienceReplay(memory_size=1000, reduce_ram=True)
    exploration_method = GreedyEpsilon(epsilon_start=1, epsilon_end=0.1, decay=0.001)
    learner = dqn_learner(data=data, model=model, memory=memory, exploration_method=exploration_method)
    learner.fit(2)

    # The reloaded bunch already holds 2 episodes; fitting 2 more should leave 4 in total.
    assert len(data.x.info) == 4
    assert 0 in data.x.info
    assert 3 in data.x.info
def trained_learner(model_cls, env, s_format, experience, bs, layers, memory_size=1000000, decay=0.001,
                    copy_over_frequency=300, lr=None, epochs=450, **kwargs):
    if lr is None:
        lr = [0.001, 0.00025]
    memory = experience(memory_size=memory_size, reduce_ram=True)
    explore = GreedyEpsilon(epsilon_start=1, epsilon_end=0.1, decay=decay)
    if isinstance(lr, list):
        lr = lr[0] if model_cls == DQNModule else lr[1]
    data = MDPDataBunch.from_env(env, render='human', bs=bs, add_valid=False, keep_env_open=False,
                                 feed_type=s_format, memory_management_strategy='k_partitions_top', k=3,
                                 **kwargs)
    if model_cls == DQNModule:
        model = create_dqn_model(data=data, base_arch=model_cls, lr=lr, layers=layers, opt=optim.RMSprop)
    else:
        model = create_dqn_model(data=data, base_arch=model_cls, lr=lr, layers=layers)
    learn = dqn_learner(data, model, memory=memory, exploration_method=explore,
                        copy_over_frequency=copy_over_frequency,
                        callback_fns=[RewardMetric, EpsilonMetric])
    learn.fit(epochs)
    return learn
def __init__(self, data: MDPDataBunch, memory=None, batch_size=32, lr=0.001, grad_clip=5, max_episodes=None):
    """Trains an agent using the Q-learning method on a neural net.

    Notes:
        This is not a true implementation of [1]. A true implementation uses a fixed target network.

    References:
        [1] Mnih, Volodymyr, et al. "Playing Atari with deep reinforcement learning."
            arXiv preprint arXiv:1312.5602 (2013).

    Args:
        data: Used for input / output size information.
    """
    super().__init__(data)
    # TODO add recommended cnn based on state size?
    self.name = 'DQN'
    self.batch_size = batch_size
    self.discount = 0.99
    self.lr = lr
    self.gradient_clipping_norm = grad_clip
    self.loss_func = F.smooth_l1_loss
    self.memory = ifnone(memory, ExperienceReplay(100000))
    self.action_model = self.initialize_action_model([64, 64], data)
    self.opt = OptimWrapper.create(optim.Adam, lr=self.lr, layer_groups=self.action_model)
    self.learner_callbacks += [partial(BaseDQNCallback, max_episodes=max_episodes)] + self.memory.callbacks
    self.exploration_strategy = GreedyEpsilon(epsilon_start=1, epsilon_end=0.1, decay=0.001,
                                              do_exploration=self.training)
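# A minimal sketch (not part of the class above) of the 1-step Q-learning update this agent performs,
# assuming `q_net` is any torch.nn.Module mapping states to per-action Q values. The function and
# argument names here are illustrative, not the library's API.
import torch
import torch.nn.functional as F


def q_learning_loss(q_net, states, actions, rewards, next_states, dones, discount=0.99):
    """Smooth L1 loss between Q(s, a) and the bootstrapped target r + discount * max_a' Q(s', a')."""
    q_sa = q_net(states).gather(1, actions.long().unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        # Without a separate fixed target network (see FixedTargetDQNModule), the same q_net
        # produces both the prediction and the bootstrap target, as in this DQN class.
        target = rewards + discount * q_net(next_states).max(dim=1).values * (1 - dones.float())
    return F.smooth_l1_loss(q_sa, target)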
def test_dqn_dqn_learner(model_cls, s_format, mem, env):
    data = MDPDataBunch.from_env(env, render='rgb_array', bs=32, add_valid=False, keep_env_open=False,
                                 feed_type=s_format)
    model = create_dqn_model(data, model_cls)
    memory = mem(memory_size=1000, reduce_ram=True)
    exploration_method = GreedyEpsilon(epsilon_start=1, epsilon_end=0.1, decay=0.001)
    dqn_learner(data=data, model=model, memory=memory, exploration_method=exploration_method)

    assert config_env_expectations[env]['action_shape'] == (1, data.action.n_possible_values.item())
    if s_format == FEED_TYPE_STATE:
        assert config_env_expectations[env]['state_shape'] == data.state.s.shape
def test_export_learner():
    data = MDPDataBunch.from_env('CartPole-v0', render='rgb_array', bs=5, max_steps=20, add_valid=False)
    model = create_dqn_model(data, FixedTargetDQNModule, opt=torch.optim.RMSprop)
    memory = ExperienceReplay(memory_size=1000, reduce_ram=True)
    exploration_method = GreedyEpsilon(epsilon_start=1, epsilon_end=0.1, decay=0.001)
    learner = dqn_learner(data=data, model=model, memory=memory, exploration_method=exploration_method)
    learner.fit(2)

    learner.export('test_export.pkl')  # , pickle_data=True)
    learner = load_learner(learner.path, 'test_export.pkl')
    learner.fit(2)
def test_databunch_to_pickle():
    data = MDPDataBunch.from_env('CartPole-v0', render='rgb_array', bs=5, max_steps=20, add_valid=False,
                                 memory_management_strategy='k_partitions_top', k=3)
    model = create_dqn_model(data, DQNModule, opt=torch.optim.RMSprop, lr=0.1)
    memory = ExperienceReplay(memory_size=1000, reduce_ram=True)
    exploration_method = GreedyEpsilon(epsilon_start=1, epsilon_end=0.1, decay=0.001)
    learner = dqn_learner(data=data, model=model, memory=memory, exploration_method=exploration_method,
                          callback_fns=[RewardMetric, EpsilonMetric])
    learner.fit(10)

    data.to_pickle('./data/cartpole_10_epoch')
    MDPDataBunch.from_pickle(env_name='CartPole-v0', path='./data/cartpole_10_epoch')