class TestPrioritizedReplayBuffer(unittest.TestCase):
    def setUp(self):
        random.seed(1)
        np.random.seed(1)
        torch.manual_seed(1)
        self.replay_buffer = PrioritizedReplayBuffer(5, 0.6)

    def test_run(self):
        states = StateArray(torch.arange(0, 20), (20,), reward=torch.arange(-1, 19).float())
        actions = torch.arange(0, 20).view((-1, 1))
        expected_samples = State(
            torch.tensor(
                [
                    [0, 1, 2],
                    [0, 1, 3],
                    [5, 5, 5],
                    [6, 6, 2],
                    [7, 7, 7],
                    [7, 8, 8],
                    [7, 7, 7],
                ]
            )
        )
        expected_weights = [
            [1.0000, 1.0000, 1.0000],
            [0.5659, 0.7036, 0.5124],
            [0.0631, 0.0631, 0.0631],
            [0.0631, 0.0631, 0.1231],
            [0.0631, 0.0631, 0.0631],
            [0.0776, 0.0631, 0.0631],
            [0.0866, 0.0866, 0.0866],
        ]
        actual_samples = []
        actual_weights = []
        for i in range(10):
            self.replay_buffer.store(states[i], actions[i], states[i + 1])
            if i > 2:
                sample = self.replay_buffer.sample(3)
                sample_states = sample[0].observation
                self.replay_buffer.update_priorities(torch.randn(3))
                actual_samples.append(sample_states)
                actual_weights.append(sample[-1])
        actual_samples = State(torch.cat(actual_samples).view((-1, 3)))
        self.assert_states_equal(actual_samples, expected_samples)
        np.testing.assert_array_almost_equal(
            expected_weights, np.vstack(actual_weights), decimal=3
        )

    def assert_states_equal(self, actual, expected):
        tt.assert_almost_equal(actual.observation, expected.observation)
        self.assertEqual(actual.mask, expected.mask)
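# The expected_weights above follow the standard prioritized-replay
# importance-sampling correction (Schaul et al., 2016): transition i is
# sampled with probability P(i) proportional to p_i**alpha and weighted by
# (N * P(i))**-beta, normalized by the largest weight so the maximum is 1.0
# (hence the first row of all ones, before update_priorities() spreads the
# priorities). The sketch below is a standalone illustration of that formula;
# the helper name, the uniform initial priorities, and the beta value are
# assumptions for the example, not the library's internal implementation.
import numpy as np

def importance_weights(priorities, alpha=0.6, beta=0.4):
    # P(i) proportional to p_i ** alpha; w_i = (N * P(i)) ** -beta, rescaled to max 1
    scaled = np.asarray(priorities, dtype=np.float64) ** alpha
    probs = scaled / scaled.sum()
    weights = (len(scaled) * probs) ** -beta
    return probs, weights / weights.max()

# equal priorities -> uniform sampling and weights of exactly 1.0
print(importance_weights([1.0, 1.0, 1.0, 1.0]))
# unequal priorities -> rarely sampled transitions get larger corrective weights
print(importance_weights([2.0, 1.0, 0.5, 0.25]))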
def _ddqn(env, writer=DummyWriter()):
    model = dueling_fc_relu_q(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QNetwork(
        model,
        optimizer,
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = PrioritizedReplayBuffer(
        replay_buffer_size,
        alpha=alpha,
        beta=beta,
        device=device
    )
    return DDQN(
        q,
        policy,
        replay_buffer,
        discount_factor=discount_factor,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        minibatch_size=minibatch_size
    )
def _ddqn(env, writer=DummyWriter()):
    action_repeat = 1
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    final_exploration_step = final_exploration_frame / action_repeat
    model = model_constructor(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = SharedAutonomyPolicy(
        q,
        env.action_space.n,
        epsilon=0,
        pilot_tol=pilot_tol
    )
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(
            replay_buffer_size,
            alpha=alpha,
            beta=beta,
            device=device
        )
    else:
        replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return co_DDQN(
        q,
        policy,
        replay_buffer,
        loss=weighted_smooth_l1_loss,
        discount_factor=discount_factor,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency
    )
def _rainbow(env, writer=DummyWriter()):
    model = model_constructor(env, atoms=atoms, sigma=sigma).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        v_min=v_min,
        v_max=v_max,
        writer=writer,
    )
    # replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    replay_buffer = PrioritizedReplayBuffer(
        replay_buffer_size,
        alpha=alpha,
        beta=beta,
        device=device
    )
    replay_buffer = NStepReplayBuffer(n_steps, discount_factor, replay_buffer)
    return Rainbow(
        q,
        replay_buffer,
        exploration=0.,
        discount_factor=discount_factor ** n_steps,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        writer=writer,
    )
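# Rainbow above wraps the PrioritizedReplayBuffer in an NStepReplayBuffer and
# is constructed with discount_factor ** n_steps: the wrapper accumulates the
# discounted sum of the next n rewards before handing the transition to the
# underlying buffer, so the agent bootstraps from the state n steps ahead and
# must discount that bootstrap term by gamma**n. Below is a minimal sketch of
# the return being accumulated, under the standard n-step definition;
# n_step_return is an illustrative helper, not part of the library's API.
def n_step_return(rewards, bootstrap_value, discount_factor=0.99):
    # rewards: r_t, ..., r_{t+n-1}; bootstrap_value: value estimate at s_{t+n}
    g = sum((discount_factor ** k) * r for k, r in enumerate(rewards))
    # the bootstrap is discounted by gamma**n, matching discount_factor ** n_steps
    return g + (discount_factor ** len(rewards)) * bootstrap_value

# e.g. n_steps = 3: 1.0 + 0.99**2 * 1.0 + 0.99**3 * 2.0
print(n_step_return([1.0, 0.0, 1.0], bootstrap_value=2.0))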
def _rainbow(env, writer=DummyWriter()):
    model = build_model(env, sigma_init).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QNetwork(
        model,
        optimizer,
        env.action_space.n,
        target_update_frequency=target_update_frequency,
        loss=mse_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        initial_epsilon=1,
        final_epsilon=0,
        annealing_start=replay_start_size,
        annealing_time=1
    )
    # replay_buffer = ExperienceReplayBuffer(replay_buffer_size)
    replay_buffer = PrioritizedReplayBuffer(
        replay_buffer_size,
        alpha=alpha,
        beta=beta,
        final_beta_frame=final_beta_frame,
        device=device
    )
    return DQN(
        q,
        policy,
        replay_buffer,
        discount_factor=discount_factor,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        minibatch_size=minibatch_size
    )
class TestPrioritizedReplayBuffer(unittest.TestCase):
    def setUp(self):
        random.seed(1)
        np.random.seed(1)
        torch.manual_seed(1)
        self.replay_buffer = PrioritizedReplayBuffer(5, 0.6)

    def test_run(self):
        states = State(torch.arange(0, 20))
        actions = torch.arange(0, 20)
        rewards = torch.arange(0, 20)
        expected_samples = State(
            torch.tensor([
                [0, 2, 2],
                [0, 1, 1],
                [3, 3, 5],
                [5, 3, 6],
                [3, 5, 7],
                [8, 5, 8],
                [8, 5, 5],
            ]))
        expected_weights = [
            [1., 1., 1.],
            [0.56589746, 0.5124394, 0.5124394],
            [0.5124343, 0.5124343, 0.5124343],
            [0.5090894, 0.6456939, 0.46323255],
            [0.51945686, 0.5801515, 0.45691562],
            [0.45691025, 0.5096957, 0.45691025],
            [0.5938914, 0.6220026, 0.6220026]
        ]
        actual_samples = []
        actual_weights = []
        for i in range(10):
            self.replay_buffer.store(states[i], actions[i], rewards[i], states[i + 1])
            if i > 2:
                sample = self.replay_buffer.sample(3)
                sample_states = sample[0].features
                self.replay_buffer.update_priorities(torch.randn(3))
                actual_samples.append(sample_states)
                actual_weights.append(sample[-1])
        actual_samples = State(torch.cat(actual_samples).view((-1, 3)))
        self.assert_states_equal(actual_samples, expected_samples)
        np.testing.assert_array_almost_equal(expected_weights, np.vstack(actual_weights))

    def assert_states_equal(self, actual, expected):
        tt.assert_almost_equal(actual.raw, expected.raw)
        tt.assert_equal(actual.mask, expected.mask)
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q_dist = QDist(
        self.model,
        optimizer,
        self.n_actions,
        self.hyperparameters['atoms'],
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer,
    )
    replay_buffer = NStepReplayBuffer(
        self.hyperparameters['n_steps'],
        self.hyperparameters['discount_factor'],
        PrioritizedReplayBuffer(
            self.hyperparameters['replay_buffer_size'],
            alpha=self.hyperparameters['alpha'],
            beta=self.hyperparameters['beta'],
            device=self.device
        )
    )

    # a single q_dist and replay_buffer are shared by every per-agent Rainbow below
    def agent_constructor(writer):
        return DeepmindAtariBody(
            Rainbow(
                q_dist,
                replay_buffer,
                exploration=LinearScheduler(
                    self.hyperparameters['initial_exploration'],
                    self.hyperparameters['final_exploration'],
                    0,
                    train_steps - self.hyperparameters['replay_start_size'],
                    name="exploration",
                    writer=writer
                ),
                discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters["n_steps"],
                minibatch_size=self.hyperparameters['minibatch_size'],
                replay_start_size=self.hyperparameters['replay_start_size'],
                update_frequency=self.hyperparameters['update_frequency'],
                writer=writer,
            ),
            lazy_frames=True,
            episodic_lives=True
        )

    # writers, env, and device are expected to come from the enclosing preset scope
    return MultiagentEncoder(
        IndependentMultiagent({
            agent: agent_constructor(writers[agent])
            for agent in env.agents
        }),
        env.agents,
        device
    )
def _ddqn(env, writer=DummyWriter()):
    # convert frame-based hyperparameters into agent timesteps (1 timestep = 4 frames)
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    final_exploration_step = final_exploration_frame / action_repeat
    model = nature_ddqn(env).to(device)
    optimizer = Adam(
        model.parameters(),
        lr=lr,
        eps=eps
    )
    q = QNetwork(
        model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_step - replay_start_size,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = PrioritizedReplayBuffer(
        replay_buffer_size,
        alpha=alpha,
        beta=beta,
        device=device
    )
    return DeepmindAtariBody(
        DDQN(
            q,
            policy,
            replay_buffer,
            loss=weighted_smooth_l1_loss,
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
        lazy_frames=True
    )
def _rainbow(env, writer=DummyWriter()):
    _model = model
    _optimizer = optimizer
    if _model is None:
        _model = dueling_conv_net(env, frames=agent_history_length).to(device)
    if _optimizer is None:
        _optimizer = Adam(
            _model.parameters(),
            lr=lr,
            eps=eps
        )
    q = QNetwork(
        _model,
        _optimizer,
        env.action_space.n,
        target_update_frequency=target_update_frequency,
        loss=smooth_l1_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        annealing_start=replay_start_size,
        annealing_time=final_exploration_frame - replay_start_size,
        initial_epsilon=initial_exploration,
        final_epsilon=final_exploration
    )
    replay_buffer = PrioritizedReplayBuffer(
        replay_buffer_size,
        alpha=alpha,
        beta=beta,
        final_beta_frame=final_beta_frame,
        device=device
    )
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
        env,
        action_repeat=action_repeat,
        frame_stack=agent_history_length,
        noop_max=noop_max
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q_dist = QDist(
        self.model,
        optimizer,
        self.n_actions,
        self.hyperparameters['atoms'],
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer,
    )
    replay_buffer = NStepReplayBuffer(
        self.hyperparameters['n_steps'],
        self.hyperparameters['discount_factor'],
        PrioritizedReplayBuffer(
            self.hyperparameters['replay_buffer_size'],
            alpha=self.hyperparameters['alpha'],
            beta=self.hyperparameters['beta'],
            device=self.device
        )
    )
    return Rainbow(
        q_dist,
        replay_buffer,
        exploration=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            0,
            train_steps - self.hyperparameters['replay_start_size'],
            name="exploration",
            writer=writer
        ),
        discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters["n_steps"],
        minibatch_size=self.hyperparameters['minibatch_size'],
        replay_start_size=self.hyperparameters['replay_start_size'],
        update_frequency=self.hyperparameters['update_frequency'],
        writer=writer,
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q = QNetwork(
        self.model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        self.n_actions,
        epsilon=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            self.hyperparameters['replay_start_size'],
            self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
            name="exploration",
            writer=writer
        )
    )
    replay_buffer = PrioritizedReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        alpha=self.hyperparameters['alpha'],
        beta=self.hyperparameters['beta'],
        device=self.device
    )
    return DeepmindAtariBody(
        DDQN(
            q,
            policy,
            replay_buffer,
            loss=weighted_smooth_l1_loss,
            discount_factor=self.hyperparameters["discount_factor"],
            minibatch_size=self.hyperparameters["minibatch_size"],
            replay_start_size=self.hyperparameters["replay_start_size"],
            update_frequency=self.hyperparameters["update_frequency"],
        ),
        lazy_frames=True
    )
def _rainbow(env, writer=DummyWriter()):
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    model = model_constructor(env, atoms=atoms, sigma=sigma).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        v_min=v_min,
        v_max=v_max,
        target=FixedTarget(target_update_frequency),
        writer=writer,
    )
    replay_buffer = PrioritizedReplayBuffer(
        replay_buffer_size,
        alpha=alpha,
        beta=beta,
        device=device
    )
    replay_buffer = NStepReplayBuffer(n_steps, discount_factor, replay_buffer)
    agent = Rainbow(
        q,
        replay_buffer,
        exploration=LinearScheduler(
            initial_exploration,
            final_exploration,
            0,
            last_timestep,
            name='exploration',
            writer=writer
        ),
        discount_factor=discount_factor ** n_steps,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        writer=writer,
    )
    return DeepmindAtariBody(agent, lazy_frames=True, episodic_lives=True)
def setUp(self):
    random.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    self.replay_buffer = PrioritizedReplayBuffer(5, 0.6)