class TestSoftmax(unittest.TestCase):
    """Exercise SoftmaxPolicy wrapped around a single seeded linear layer.

    The expected actions and log-probabilities are pinned to the RNG seeds
    set in this class, so the order of random draws must stay unchanged.
    """

    def setUp(self):
        """Build a fresh seeded model, SGD optimizer and policy per test."""
        torch.manual_seed(2)
        linear = nn.Linear(STATE_DIM, ACTIONS)
        self.model = nn.Sequential(linear)
        sgd = torch.optim.SGD(self.model.parameters(), lr=0.1)
        self.policy = SoftmaxPolicy(self.model, sgd)

    def test_run(self):
        """Sample twice, reinforce on both log-probs, then sample again."""
        first_dist = self.policy(State(torch.randn(1, STATE_DIM)))
        first_action = first_dist.sample()
        first_log_prob = first_dist.log_prob(first_action)
        self.assertEqual(first_action.item(), 0)

        second_dist = self.policy(State(torch.randn(1, STATE_DIM)))
        second_action = second_dist.sample()
        second_log_prob = second_dist.log_prob(second_action)
        self.assertEqual(second_action.item(), 2)

        # Weight the second sample overwhelmingly so the update favors it.
        weights = torch.tensor([-1, 1000000])
        stacked_log_probs = torch.cat((first_log_prob, second_log_prob))
        loss = -(weights * stacked_log_probs).mean()
        self.policy.reinforce(loss)

        third_dist = self.policy(State(torch.randn(1, STATE_DIM)))
        third_action = third_dist.sample()
        self.assertEqual(third_action.item(), 2)

    def test_multi_action(self):
        """Sampling a batch of three states yields three pinned actions."""
        batch = State(torch.randn(3, STATE_DIM))
        sampled = self.policy(batch).sample()
        expected = torch.tensor([2, 2, 0])
        tt.assert_equal(sampled, expected)

    def test_list(self):
        """Sample and reinforce on a batch built with a second State argument."""
        torch.manual_seed(1)
        batch = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
        distribution = self.policy(batch)
        sampled = distribution.sample()
        sampled_log_probs = distribution.log_prob(sampled)
        tt.assert_equal(sampled, torch.tensor([1, 2, 1]))

        weights = torch.tensor([[1, 2, 3]])
        loss = -(weights * sampled_log_probs).mean()
        self.policy.reinforce(loss)

    def test_reinforce(self):
        """Log-probs of fixed actions rise after each reinforce step."""
        def negative_mean(values):
            return -values.mean()

        batch = State(torch.randn(3, STATE_DIM), torch.tensor([1, 1, 1]))
        chosen = self.policy.eval(batch).sample()

        # notice the values increase with each successive reinforce
        expected_per_step = (
            [-0.84, -0.62, -0.757],
            [-0.811, -0.561, -0.701],
            [-0.785, -0.51, -0.651],
        )
        log_probs = None
        for step, expected in enumerate(expected_per_step):
            if step > 0:
                # Update on the log-probs measured in the previous step.
                self.policy.reinforce(negative_mean(log_probs))
            log_probs = self.policy(batch).log_prob(chosen)
            tt.assert_almost_equal(log_probs, torch.tensor(expected), decimal=3)
class TestSoftmax(unittest.TestCase):
    """Exercise SoftmaxPolicy when calling the policy returns actions directly.

    The expected actions and probabilities are pinned to the RNG seeds set
    in this class, so the order of random draws must stay unchanged.
    """

    def setUp(self):
        """Build a fresh seeded model, SGD optimizer and policy per test."""
        torch.manual_seed(2)
        linear = nn.Linear(STATE_DIM, ACTIONS)
        self.model = nn.Sequential(linear)
        sgd = torch.optim.SGD(self.model.parameters(), lr=0.1)
        self.policy = SoftmaxPolicy(self.model, sgd, ACTIONS)

    def test_run(self):
        """Act on two states, reinforce, then act on the second state again."""
        first_state = State(torch.randn(1, STATE_DIM))
        first_action = self.policy(first_state)
        self.assertEqual(first_action.item(), 0)

        second_state = State(torch.randn(1, STATE_DIM))
        second_action = self.policy(second_state)
        self.assertEqual(second_action.item(), 2)

        returns = torch.tensor([-1, 1000000]).float()
        self.policy.reinforce(returns)

        repeat_action = self.policy(second_state)
        self.assertEqual(repeat_action.item(), 2)

    def test_multi_action(self):
        """Act on a batch of three states and reinforce with one row of values."""
        batch = State(torch.randn(3, STATE_DIM))
        chosen = self.policy(batch)
        tt.assert_equal(chosen, torch.tensor([2, 2, 0]))

        returns = torch.tensor([[1, 2, 3]]).float()
        self.policy.reinforce(returns)

    def test_multi_batch_reinforce(self):
        """Reinforce across three cached batches; a further call must raise."""
        for _ in range(3):
            self.policy(State(torch.randn(2, STATE_DIM)))

        self.policy.reinforce(torch.tensor([1, 2, 3, 4]).float())
        self.policy.reinforce(torch.tensor([1, 2]).float())
        with self.assertRaises(Exception):
            self.policy.reinforce(torch.tensor([1, 2]).float())

    def test_list(self):
        """Act and reinforce on a batch built with a second State argument."""
        torch.manual_seed(1)
        batch = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
        chosen = self.policy(batch)
        tt.assert_equal(chosen, torch.tensor([1, 2, 1]))

        returns = torch.tensor([[1, 2, 3]]).float()
        self.policy.reinforce(returns)

    def test_action_prob(self):
        """Passing ``action=`` returns the pinned per-action values."""
        torch.manual_seed(1)
        batch = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
        with torch.no_grad():
            chosen = self.policy(batch)
        action_probs = self.policy(batch, action=chosen)
        expected = torch.tensor([0.204, 0.333, 0.217])
        tt.assert_almost_equal(action_probs, expected, decimal=3)
class TestSoftmax(unittest.TestCase):
    """Exercise SoftmaxPolicy: acting, log-probs and callable losses.

    The expected actions and log-probabilities are pinned to the RNG seeds
    set in this class, so the order of random draws must stay unchanged.
    """

    def setUp(self):
        """Build a fresh seeded model, SGD optimizer and policy per test."""
        torch.manual_seed(2)
        linear = nn.Linear(STATE_DIM, ACTIONS)
        self.model = nn.Sequential(linear)
        sgd = torch.optim.SGD(self.model.parameters(), lr=0.1)
        self.policy = SoftmaxPolicy(self.model, sgd, ACTIONS)

    def test_run(self):
        """Act on two states, reinforce, then act on the second state again."""
        first_state = State(torch.randn(1, STATE_DIM))
        first_action = self.policy(first_state)
        self.assertEqual(first_action.item(), 0)

        second_state = State(torch.randn(1, STATE_DIM))
        second_action = self.policy(second_state)
        self.assertEqual(second_action.item(), 2)

        returns = torch.tensor([-1, 1000000]).float()
        self.policy.reinforce(returns)

        repeat_action = self.policy(second_state)
        self.assertEqual(repeat_action.item(), 2)

    def test_multi_action(self):
        """Act on a batch of three states and reinforce with one row of values."""
        batch = State(torch.randn(3, STATE_DIM))
        chosen = self.policy(batch)
        tt.assert_equal(chosen, torch.tensor([2, 2, 0]))

        returns = torch.tensor([[1, 2, 3]]).float()
        self.policy.reinforce(returns)

    def test_multi_batch_reinforce(self):
        """Reinforce across three cached batches; a further call must raise."""
        for _ in range(3):
            self.policy(State(torch.randn(2, STATE_DIM)))

        self.policy.reinforce(torch.tensor([1, 2, 3, 4]).float())
        self.policy.reinforce(torch.tensor([1, 2]).float())
        with self.assertRaises(Exception):
            self.policy.reinforce(torch.tensor([1, 2]).float())

    def test_list(self):
        """Act and reinforce on a batch built with a second State argument."""
        torch.manual_seed(1)
        batch = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
        chosen = self.policy(batch)
        tt.assert_equal(chosen, torch.tensor([1, 2, 1]))

        returns = torch.tensor([[1, 2, 3]]).float()
        self.policy.reinforce(returns)

    def test_action_prob(self):
        """Passing ``action=`` returns the pinned log-probabilities."""
        torch.manual_seed(1)
        batch = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1]))
        with torch.no_grad():
            chosen = self.policy(batch)
        chosen_log_probs = self.policy(batch, action=chosen)
        expected = torch.tensor([-1.59, -1.099, -1.528])
        tt.assert_almost_equal(chosen_log_probs, expected, decimal=3)

    def test_custom_loss(self):
        """Reinforcing with a loss callable raises the fixed actions' log-probs."""
        def negative_mean(values):
            return -values.mean()

        batch = State(torch.randn(3, STATE_DIM), torch.tensor([1, 1, 1]))
        chosen = self.policy.eval(batch)

        # notice the values increase with each successive reinforce
        expected_per_step = (
            [-0.84, -0.62, -0.757],
            [-0.811, -0.561, -0.701],
            [-0.785, -0.51, -0.651],
        )
        for step, expected in enumerate(expected_per_step):
            if step > 0:
                # The policy receives the loss function itself, not a tensor.
                self.policy.reinforce(negative_mean)
            log_probs = self.policy(batch, chosen)
            tt.assert_almost_equal(log_probs, torch.tensor(expected), decimal=3)
class DiversityLearner:
    """Actor-critic learner with twin Q heads, a V head and a softmax policy.

    The layout (two Q approximators, one value network with a Polyak target,
    an entropy temperature) appears to follow the soft actor-critic recipe;
    confirm against the project's design notes before relying on that.
    """

    def __init__(
        self,
        model_fn,
        model_features,
        logger,
        device,
        num_targets,
        max_learn_steps,
        num_actions,
        obs_preproc,
        discount_factor=0.99,
        entropy_target=-2,
        lr_value=1e-3,
        lr_pi=1e-4,
        # Training settings
        polyak_rate=0.005,
        # Replay Buffer settings
        replay_start_size=5000,
        replay_buffer_size=1e6,
        # Exploration settings
        temperature_initial=0.1,
        lr_temperature=1e-5,
        entropy_target_scaling=1.,
    ):
        """Build the policy, feature, Q and V approximators.

        Args:
            model_fn: Zero-argument factory; its result is moved to ``device``
                and used as the value feature model. Also passed through to
                ``DiversityPolicy``.
            model_features: Passed to the policy and to the Q/V head layers.
            logger: Stored on the instance; not used by this class itself.
            device: Device every model is moved to and tensors are created on.
            num_targets: Passed to the policy and to the Q/V head layers.
            max_learn_steps: Horizon of every cosine learning-rate schedule.
            num_actions: Passed to the policy and to the Q/V head layers.
            obs_preproc: Callable applied to observation tensors.
            discount_factor: Discount applied to the bootstrap value.
            entropy_target: Offset added to log-probs in the temperature update.
            lr_value: Adam learning rate for the feature, Q and V optimizers.
            lr_pi: Adam learning rate for the policy optimizer.
            polyak_rate: Rate handed to the V network's ``PolyakTarget``.
            replay_start_size: Accepted for interface compatibility; unused here.
            replay_buffer_size: Accepted for interface compatibility; unused here.
            temperature_initial: Starting entropy temperature.
            lr_temperature: Step size of the temperature update.
            entropy_target_scaling: Accepted for interface compatibility;
                unused here.
        """
        self.writer = writer = DummyWriter()
        eps = 1e-5
        self.discount_factor = discount_factor
        self.entropy_target = entropy_target
        self.temperature = temperature_initial
        self.lr_temperature = lr_temperature
        self.logger = logger
        self.device = device
        self.num_targets = num_targets
        self.max_learn_steps = max_learn_steps
        self.num_actions = num_actions
        final_anneal_step = max_learn_steps

        # Models are created in a fixed order (policy, features, Qs, V) so
        # that seeded parameter initialization stays reproducible.
        self.policy = DiversityPolicy(
            model_fn, model_features, num_actions, num_targets, obs_preproc,
            device)
        self.policy = self.policy.to(device)
        self.obs_preproc = obs_preproc
        policy_optimizer = Adam(self.policy.parameters(), lr=lr_pi, eps=eps)
        self.policy_learner = SoftmaxPolicy(
            self.policy,
            policy_optimizer,
            scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
            writer=writer)

        value_feature_model = model_fn().to(device)
        q_models = [
            DuelingQValueLayer(model_features, num_targets,
                               num_actions).to(device) for _ in range(2)
        ]
        v_model = ValueLayer(model_features, num_targets,
                             num_actions).to(device)

        feature_optimizer = Adam(value_feature_model.parameters(),
                                 lr=lr_value,
                                 eps=eps)
        q_optimizers = [
            Adam(q_model.parameters(), lr=lr_value, eps=eps)
            for q_model in q_models
        ]
        v_optimizer = Adam(v_model.parameters(), lr=lr_value, eps=eps)

        self.features = FeatureNetwork(
            value_feature_model,
            feature_optimizer,
            scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
            writer=writer)
        self.qs = [
            QContinuous(q_model,
                        q_optimizer,
                        scheduler=CosineAnnealingLR(q_optimizer,
                                                    final_anneal_step),
                        writer=writer,
                        name=f'q_{i}')
            for i, (q_model, q_optimizer) in enumerate(
                zip(q_models, q_optimizers))
        ]
        self.v = VNetwork(
            v_model,
            v_optimizer,
            scheduler=CosineAnnealingLR(v_optimizer, final_anneal_step),
            target=PolyakTarget(polyak_rate),
            writer=writer,
            name='v',
        )

    def learn_step(self, idxs, transition_batch, weights):
        """Run one update of the Q, V, policy and feature networks.

        Args:
            idxs: Accepted for interface compatibility; not used.
            transition_batch: Six-element sequence unpacked as
                ``(Otm1, targ_vec, old_action, env_rew, done, Ot)``: previous
                observations, target vectors (currently unused), actions
                taken, rewards, done flags and next observations.
            weights: Accepted for interface compatibility; not used, so no
                per-sample weighting is applied to the losses.
        """
        Otm1, _targ_vec, old_action, env_rew, done, Ot = transition_batch
        batch_size = len(Ot)

        obsm1 = self.obs_preproc(torch.tensor(Otm1, device=self.device))
        actions = torch.tensor(old_action, device=self.device)
        rewards = torch.tensor(env_rew, device=self.device)
        done = torch.tensor(done, device=self.device).float()
        next_obs = self.obs_preproc(torch.tensor(Ot, device=self.device))

        # NOTE(review): the done flags are attached to the *current* states and
        # the next states are built with done=0 / mask=1, so the bootstrap
        # below is never cut at episode ends by these fields — confirm that
        # this is intended.
        states = StateArray(
            {
                'observation': obsm1,
                'reward': rewards,
                'done': done,
            },
            shape=(batch_size, ))
        # Fix: the next states must carry the next observation. The original
        # reused `obsm1` here, leaving `next_obs` computed but never consumed.
        next_states = StateArray(
            {
                'observation': next_obs,
                'reward': torch.zeros(batch_size, device=self.device),
                'done': torch.zeros(batch_size, device=self.device),
                'mask': torch.ones(batch_size, device=self.device),
            },
            shape=(batch_size, ))

        # NOTE(review): the no_grad extent was reconstructed from a
        # whitespace-collapsed source; it is limited to the policy forward
        # pass because the features below must keep a graph for
        # `self.features.reinforce()` — confirm.
        with torch.no_grad():
            distribution = self.policy_learner(states)
            # NOTE(review): these are log-probs of the replayed actions, while
            # the V target evaluates Q at freshly sampled `_actions` — confirm.
            _log_probs = distribution.log_prob(actions).detach().squeeze()

        value_feature1 = self.features(states)
        value_feature2 = self.features(next_states)
        _actions = distribution.sample()

        # Targets: Q bootstraps from the target V of the next state; V tracks
        # the smaller of the two target Q estimates minus the entropy term.
        q_targets = rewards + self.discount_factor * self.v.target(
            value_feature2).detach()
        v_targets = torch.min(
            self.qs[0].target(value_feature1, _actions),
            self.qs[1].target(value_feature1, _actions),
        ) - self.temperature * _log_probs

        # update Q and V-functions
        for q in self.qs:
            q.reinforce(mse_loss(q(value_feature1, actions), q_targets))
        self.v.reinforce(mse_loss(self.v(value_feature1), v_targets))

        # update policy
        distribution = self.policy_learner(states)
        _actions2 = distribution.sample()
        _log_probs2 = distribution.log_prob(_actions2).squeeze()
        # NOTE(review): Q is detached, so only the entropy term can send
        # gradient to the policy through this loss — confirm this is intended.
        loss = (-self.qs[0](value_feature1, _actions2).detach() +
                self.temperature * _log_probs2).mean()
        self.policy_learner.reinforce(loss)
        self.features.reinforce()
        self.qs[0].zero_grad()

        # adjust temperature
        temperature_grad = (_log_probs + self.entropy_target).mean()
        # .item() keeps the temperature a plain Python float instead of
        # silently turning it into a NumPy scalar after the first update.
        self.temperature += self.lr_temperature * temperature_grad.item()